|
| 1 | +#!/usr/bin/env python |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +# # H1B VISA DATA ANALYSIS |
| 5 | + |
| 6 | +# # importing libraries |
| 7 | + |
| 8 | +# In[2]: |
| 9 | + |
| 10 | + |
| 11 | +import pandas as pd |
| 12 | + |
| 13 | + |
| 14 | +# In[3]: |
| 15 | + |
| 16 | + |
| 17 | +import numpy as np |
| 18 | +import matplotlib.pyplot as plt |
| 19 | +import seaborn as sns |
| 20 | + |
| 21 | + |
| 22 | +# Reading csv file |
| 23 | + |
| 24 | +# In[4]: |
| 25 | + |
| 26 | + |
| 27 | +f = pd.read_csv("h1b_kaggle.csv") |
| 28 | +f.head() |
| 29 | + |
| 30 | + |
| 31 | +# deleting unnamed column |
| 32 | + |
| 33 | +# In[5]: |
| 34 | + |
| 35 | + |
| 36 | +del f['Unnamed: 0'] |
| 37 | +len(f) |
| 38 | + |
| 39 | + |
| 40 | +# In[6]: |
| 41 | + |
| 42 | + |
| 43 | +f = f.dropna() |
| 44 | +f.reset_index() |
| 45 | +lng = len(f) |
| 46 | +print(lng) |
| 47 | + |
| 48 | + |
| 49 | +# In[6]: |
| 50 | + |
| 51 | + |
| 52 | +f.head() |
| 53 | + |
| 54 | + |
| 55 | +# TOP EMPLOYERS |
| 56 | + |
| 57 | +# In[7]: |
| 58 | + |
| 59 | + |
| 60 | +f.EMPLOYER_NAME.value_counts().head(15) |
| 61 | + |
| 62 | + |
| 63 | +# In[8]: |
| 64 | + |
| 65 | + |
| 66 | +f['EMPLOYER_NAME'].value_counts().head(15).plot(kind = 'bar',title = "TOP 15 Hiring Company") |
| 67 | + |
| 68 | + |
| 69 | +# ANALYZING THE PREVAILING WAGE |
| 70 | + |
| 71 | +# In[21]: |
| 72 | + |
| 73 | + |
| 74 | +f.PREVAILING_WAGE.value_counts().sort_values(ascending = False ).head() |
| 75 | + |
| 76 | + |
| 77 | +# In[9]: |
| 78 | + |
| 79 | + |
| 80 | +f.PREVAILING_WAGE.mean() |
| 81 | + |
| 82 | + |
| 83 | +# In[24]: |
| 84 | + |
| 85 | + |
| 86 | +###Wages given by the employers |
| 87 | + |
| 88 | + |
| 89 | +# In[10]: |
| 90 | + |
| 91 | + |
| 92 | +a = f.groupby(['EMPLOYER_NAME']).mean()['PREVAILING_WAGE'].nlargest(15).plot(kind = 'bar') |
| 93 | + |
| 94 | + |
| 95 | +# WORKSITE |
| 96 | + |
| 97 | +# In[11]: |
| 98 | + |
| 99 | + |
| 100 | +f['WORKSITE'].value_counts().head(20) |
| 101 | + |
| 102 | + |
| 103 | +# In[12]: |
| 104 | + |
| 105 | + |
| 106 | +f.loc[:,'WORKSITE'] = f.loc[:,'WORKSITE'].apply(lambda rec:rec.split(',')[1][1:]) |
| 107 | +def change_NA(rec): |
| 108 | + if (rec=='NA') : |
| 109 | + return 'MARIANA ISLAND' |
| 110 | + return rec |
| 111 | +f.loc[:,'WORKSITE'] = f.loc[:,'WORKSITE'].apply(lambda rec: change_NA(rec)) |
| 112 | +print(len(f['WORKSITE'].unique())) |
| 113 | + |
| 114 | + |
| 115 | +# In[13]: |
| 116 | + |
| 117 | + |
| 118 | +f['WORKSITE'].value_counts().head(20).plot(kind='bar') |
| 119 | + |
| 120 | + |
| 121 | +# Changing the columns name |
| 122 | + |
| 123 | +# In[14]: |
| 124 | + |
| 125 | + |
| 126 | +f.rename(columns ={'EMPLOYER_NAME':'EMPLOYER','FULL_TIME_POSITION':'FULL_T','PREVAILING_WAGE':'PREV_WAGE','WORKSITE':'STATES','lon':'LON','lat':'LAT'},inplace = True) |
| 127 | + |
| 128 | + |
| 129 | +# In[34]: |
| 130 | + |
| 131 | + |
| 132 | +columns_to_keep = ['CASE_STATUS','YEAR','STATES','SOC_NAME','JOB_TITLE','FULL_T','PREV_WAGE','EMPLOYER','LON','LAT'] |
| 133 | +f =f[columns_to_keep] |
| 134 | +f.columns |
| 135 | + |
| 136 | + |
| 137 | +# Setting the precision value upto 2 decimal place |
| 138 | + |
| 139 | +# In[15]: |
| 140 | + |
| 141 | + |
| 142 | +f['LON'] = f['LON'].apply(lambda lon : float("% .2f" %lon)) |
| 143 | + |
| 144 | + |
| 145 | +# In[16]: |
| 146 | + |
| 147 | + |
| 148 | +f['LAT'] = f['LAT'].apply(lambda lat : float("% .2f" %lat)) |
| 149 | + |
| 150 | + |
| 151 | +# In[17]: |
| 152 | + |
| 153 | + |
| 154 | +f.head() |
| 155 | + |
| 156 | + |
| 157 | +# In[18]: |
| 158 | + |
| 159 | + |
| 160 | +f['CASE_STATUS'].unique() |
| 161 | + |
| 162 | + |
| 163 | +# calculating the petition ditribution by case status |
| 164 | + |
| 165 | +# In[19]: |
| 166 | + |
| 167 | + |
| 168 | +s_f = [0]*7 |
| 169 | +states = ['CERTIFIED-WITHDRAWN', 'WITHDRAWN', 'CERTIFIED', 'DENIED', |
| 170 | + 'REJECTED', 'INVALIDATED', |
| 171 | + 'PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED'] |
| 172 | +for i in range(0,7): |
| 173 | + s_f[i]= f[f.CASE_STATUS == states[i]]['CASE_STATUS'].count() |
| 174 | +s_f |
| 175 | + |
| 176 | + |
| 177 | +# In[20]: |
| 178 | + |
| 179 | + |
| 180 | +from matplotlib.pyplot import pie , axis , show |
| 181 | +plt.figure(figsize=(4.5,4.5)) |
| 182 | +plt.title("PETITION BY CASE STATUS") |
| 183 | +axis('equal') |
| 184 | +pie(s_f[:4],labels = states[:4]) |
| 185 | +show() |
| 186 | + |
| 187 | + |
| 188 | +# calculating petitions distribution per year |
| 189 | + |
| 190 | +# In[21]: |
| 191 | + |
| 192 | + |
| 193 | +f['YEAR'].unique() |
| 194 | + |
| 195 | + |
| 196 | +# In[22]: |
| 197 | + |
| 198 | + |
| 199 | +y = [2016., 2015., 2014., 2013., 2012., 2011.] |
| 200 | +y_c = [0]*6 |
| 201 | +for j in range(0,6): |
| 202 | + y_c[j] = f[f.YEAR==y[j]]['YEAR'].count() |
| 203 | +y_c |
| 204 | + |
| 205 | + |
| 206 | +# In[23]: |
| 207 | + |
| 208 | + |
| 209 | +sns.set_context("notebook",font_scale = 1.0) |
| 210 | +plt.figure(figsize = (13,3)) |
| 211 | +plt.title("PETITIONS DISTRIBUTION PER YEAR") |
| 212 | +sns.countplot(f['YEAR']) |
| 213 | + |
| 214 | + |
| 215 | +# ANALYZING DENIED PETITIONS PER YEAR |
| 216 | + |
| 217 | +# In[24]: |
| 218 | + |
| 219 | + |
| 220 | +den = f[f.CASE_STATUS=='DENIED'] |
| 221 | +len(den) |
| 222 | + |
| 223 | + |
| 224 | +# In[25]: |
| 225 | + |
| 226 | + |
| 227 | +del den['CASE_STATUS'] |
| 228 | +den = den.reset_index() |
| 229 | +den.head(2) |
| 230 | + |
| 231 | + |
| 232 | +# Calculating denied petition distribution per year |
| 233 | + |
| 234 | +# In[26]: |
| 235 | + |
| 236 | + |
| 237 | +d_y = [0]*6 |
| 238 | +for i in range(0,6): |
| 239 | + d_y[i]=den[den.YEAR==y[i]]['YEAR'].count() |
| 240 | +d_y |
| 241 | + |
| 242 | + |
| 243 | +# In[27]: |
| 244 | + |
| 245 | + |
| 246 | +sns.set_context("notebook",font_scale = 1.0) |
| 247 | +plt.figure(figsize = (13,3)) |
| 248 | +plt.title(" DENIED PETITIONS DISTRIBUTION PER YEAR") |
| 249 | +sns.countplot(den['YEAR']) |
| 250 | + |
| 251 | + |
| 252 | +# Calculating the rate at which denied petitins per year |
| 253 | +# |
| 254 | + |
| 255 | +# In[28]: |
| 256 | + |
| 257 | + |
| 258 | +d_y_r = [0]*6 |
| 259 | +for i in range(0,6): |
| 260 | + d_y_r[i] = float("% .2f" %((d_y[i]/y_c[i])*100)) |
| 261 | +ratio = pd.DataFrame() |
| 262 | +ratio['YEAR'] = y |
| 263 | +ratio['denied rate %'] = d_y_r |
| 264 | +ratio = ratio.set_index(['YEAR']) |
| 265 | +ratio |
| 266 | + |
| 267 | + |
| 268 | +# In[29]: |
| 269 | + |
| 270 | + |
| 271 | +ratio.T |
| 272 | + |
| 273 | + |
| 274 | +# In[30]: |
| 275 | + |
| 276 | + |
| 277 | +ratio = ratio.reset_index() |
| 278 | +ratio |
| 279 | + |
| 280 | + |
| 281 | +# In[31]: |
| 282 | + |
| 283 | + |
| 284 | +sns.set_context("notebook",font_scale = 1.0) |
| 285 | +plt.figure(figsize=(13,3)) |
| 286 | +plt.title("DENIED PETITIONS PER YEAR") |
| 287 | +g = sns.barplot(x ='YEAR',y ='denied rate %',data=ratio) |
| 288 | + |
| 289 | + |
| 290 | +# Calculating the number of petitions filled by the states |
| 291 | +# |
| 292 | + |
| 293 | +# In[32]: |
| 294 | + |
| 295 | + |
| 296 | +f['STATES'].unique() |
| 297 | + |
| 298 | + |
| 299 | +# In[34]: |
| 300 | + |
| 301 | + |
| 302 | +US_states = ['ALABAMA','ALASKA','ARIZONA','ARKANSAS','CAALIFORNIA','COLORADO','CONNECTICUT','DELAWARE','DISTRICT OF COLUMBIA','FLORIDA','GEORGIA','HAWII','IDAHO','ILLINOIS','INDIANA','IOWA','KANSAS','KENTUCKY','LOUISTANA','MAINE','MARIANA ISLANDS','MARYLAND','MASSACHUUSSETS','MICHIGAN','MINNESOTA','MISSISSIPPI','MISSOURI','MONTANA','NEBRASKA','NEVADA','NEW HAMPSHIRE','NEW JERSEY','NEW MEXICO','NEW YORK','NORTH CAROLINA','NORTH DAKOTA','OHIO','OKALHOMA','OREGON','PENNSYLVANIA','PUERTO RICO','RHODE ISLAND','SOUTH CAROLINA','SOUTH DAKOTA','TENNESEE','TEXAS','UTAH','VERMONT','VIRGINIA','WASHINGTON','WEST VIRGINIA','WISCONSIN','WYOMING'] |
| 303 | +p_s = [0]*53 |
| 304 | +for i in range(0,53): |
| 305 | + p_s[i]= f[f.STATES == US_states[i]]['STATES'].count() |
| 306 | +pe = pd.DataFrame() |
| 307 | +pe['STATES']= US_states |
| 308 | +pe['FIELD PETITIONS'] = p_s |
| 309 | +print(sum(p_s)) |
| 310 | + |
| 311 | + |
| 312 | +# In[56]: |
| 313 | + |
| 314 | + |
| 315 | +sns.set_context("notebook",font_scale = 1.0) |
| 316 | +plt.figure(figsize=(13,7)) |
| 317 | +plt.title("FIELD PETITION BY STATE") |
| 318 | +g = sns.barplot(x ='STATES',y ='FIELD PETITIONS',data=pe) |
| 319 | +v = g.set_xticklabels(g.get_xticklabels(),rotation = 90) |
| 320 | + |
| 321 | + |
| 322 | +# NO. OF PETITIONS DENIED BY STATE |
| 323 | + |
| 324 | +# In[49]: |
| 325 | + |
| 326 | + |
| 327 | +deni = [0]*53 |
| 328 | +for i in range(0,53): |
| 329 | + deni[i]= den[den.STATES==US_states[i]]['STATES'].count() |
| 330 | +de_s = pd.DataFrame() |
| 331 | +de_s['STATES'] = US_states |
| 332 | +de_s['DENIED PETITIONS'] = deni |
| 333 | +de_s |
| 334 | + |
| 335 | + |
| 336 | +# In[50]: |
| 337 | + |
| 338 | + |
| 339 | +print(sum(deni)) |
| 340 | + |
| 341 | + |
| 342 | +# In[59]: |
| 343 | + |
| 344 | + |
| 345 | +sns.set_context("notebook",font_scale = 1.0) |
| 346 | +plt.figure(figsize=(13,3)) |
| 347 | +plt.title("DENIED PETITIONS BY STATE") |
| 348 | +g = sns.barplot(x ='STATES',y ='DENIED PETITIONS',data=de_s) |
| 349 | +v = g.set_xticklabels(g.get_xticklabels(),rotation = 90) |
| 350 | + |
0 commit comments