Skip to content

Commit 5f5d540

Browse files
author
codehouseindia
authored
Merge pull request codehouseindia#360 from mishra-ayushknp/patch-5
Create data analysis using python
2 parents a5a38d3 + a0d60b1 commit 5f5d540

1 file changed

Lines changed: 350 additions & 0 deletions

File tree

data analysis using python

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# # H1B VISA DATA ANALYSIS
5+
6+
# # importing libraries
7+
8+
# In[2]:
9+
10+
11+
import pandas as pd
12+
13+
14+
# In[3]:
15+
16+
17+
import numpy as np
18+
import matplotlib.pyplot as plt
19+
import seaborn as sns
20+
21+
22+
# Load the raw H-1B petitions dataset.

f = pd.read_csv("h1b_kaggle.csv")
f.head()

# The CSV carries its old row index as an unnamed first column; remove it.
del f['Unnamed: 0']
len(f)
38+
39+
40+
# In[6]:
41+
42+
43+
# Drop rows with missing values, then rebuild a contiguous 0..n-1 index.
f = f.dropna()
# BUG FIX: reset_index() returns a new frame — the original call discarded
# the result, so the gap-filled index was silently kept.  drop=True avoids
# re-inserting the old index as an extra column.
f = f.reset_index(drop=True)
lng = len(f)
print(lng)

f.head()
53+
54+
55+
# TOP EMPLOYERS: the 15 companies filing the most petitions.

f['EMPLOYER_NAME'].value_counts().head(15)

f['EMPLOYER_NAME'].value_counts().head(15).plot(kind='bar', title="TOP 15 Hiring Company")
67+
68+
69+
# ANALYZING THE PREVAILING WAGE

# Most common wage values.
f.PREVAILING_WAGE.value_counts().sort_values(ascending=False).head()

# Overall mean prevailing wage.
f.PREVAILING_WAGE.mean()

# Wages given by the employers: top 15 employers by mean prevailing wage.
# BUG FIX: select the wage column BEFORE aggregating — calling .mean() on
# the whole grouped frame averages every column and raises a TypeError on
# the non-numeric columns under pandas >= 2.0.
a = f.groupby('EMPLOYER_NAME')['PREVAILING_WAGE'].mean().nlargest(15).plot(kind='bar')
93+
94+
95+
# WORKSITE

f['WORKSITE'].value_counts().head(20)


# "CITY, STATE" -> "STATE": take the piece after the comma and drop the
# single leading space.
def _state_of(worksite):
    return worksite.split(',')[1][1:]


f.loc[:, 'WORKSITE'] = f.loc[:, 'WORKSITE'].apply(_state_of)
107+
def change_NA(rec):
    """Map the placeholder state name 'NA' to its real name.

    The WORKSITE split leaves 'NA' for worksites in the Northern Mariana
    Islands.  BUG FIX: return 'MARIANA ISLANDS' (plural) so the value
    matches the 'MARIANA ISLANDS' entry in the US_states list used for
    the per-state counts below — the original singular spelling made that
    state always count zero.
    """
    if rec == 'NA':
        return 'MARIANA ISLANDS'
    return rec
111+
# Normalise the 'NA' placeholder, then count the distinct states.
f.loc[:, 'WORKSITE'] = f.loc[:, 'WORKSITE'].apply(change_NA)
print(f['WORKSITE'].nunique())

f['WORKSITE'].value_counts().head(20).plot(kind='bar')
119+
120+
121+
# Shorten the column names for convenience.

_new_names = {
    'EMPLOYER_NAME': 'EMPLOYER',
    'FULL_TIME_POSITION': 'FULL_T',
    'PREVAILING_WAGE': 'PREV_WAGE',
    'WORKSITE': 'STATES',
    'lon': 'LON',
    'lat': 'LAT',
}
f.rename(columns=_new_names, inplace=True)

# Keep only the columns the analysis below actually uses.
columns_to_keep = ['CASE_STATUS', 'YEAR', 'STATES', 'SOC_NAME', 'JOB_TITLE',
                   'FULL_T', 'PREV_WAGE', 'EMPLOYER', 'LON', 'LAT']
f = f[columns_to_keep]
f.columns
135+
136+
137+
# Setting the coordinate precision to 2 decimal places.

# Round vectorised with Series.round instead of formatting every value to
# a string with "% .2f" and parsing it back — same rounding, one C-level
# pass instead of a Python round-trip per row.
f['LON'] = f['LON'].round(2)

f['LAT'] = f['LAT'].round(2)

f.head()
155+
156+
157+
# In[18]:
158+
159+
160+
f['CASE_STATUS'].unique()

# Petition distribution by case status.
states = ['CERTIFIED-WITHDRAWN', 'WITHDRAWN', 'CERTIFIED', 'DENIED',
          'REJECTED', 'INVALIDATED',
          'PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED']
s_f = [f[f.CASE_STATUS == status]['CASE_STATUS'].count() for status in states]
s_f
175+
176+
177+
# In[20]:
178+
179+
180+
# Pie chart of the four most frequent case statuses.
# Use the module-level `plt` alias instead of a redundant mid-file
# `from matplotlib.pyplot import pie, axis, show` — keeps the file's
# import style consistent (imports belong at the top).
plt.figure(figsize=(4.5, 4.5))
plt.title("PETITION BY CASE STATUS")
plt.axis('equal')
plt.pie(s_f[:4], labels=states[:4])
plt.show()
186+
187+
188+
# Calculating the petition distribution per year.

f['YEAR'].unique()

# Count petitions for each filing year (values are floats in the data).
y = [2016., 2015., 2014., 2013., 2012., 2011.]
y_c = [f[f.YEAR == year]['YEAR'].count() for year in y]
y_c

sns.set_context("notebook", font_scale=1.0)
plt.figure(figsize=(13, 3))
plt.title("PETITIONS DISTRIBUTION PER YEAR")
# BUG FIX: pass the data by keyword — positional Series arguments to
# countplot are deprecated and rejected by recent seaborn releases.
sns.countplot(x=f['YEAR'])
213+
214+
215+
# ANALYZING DENIED PETITIONS PER YEAR

# Restrict to denied petitions only.
den = f[f.CASE_STATUS == 'DENIED']
len(den)

# CASE_STATUS is constant ('DENIED') in this subset — drop it and renumber.
den = den.drop(columns=['CASE_STATUS'])
den = den.reset_index()
den.head(2)
230+
231+
232+
# Calculating the denied petition distribution per year.

d_y = [den[den.YEAR == year]['YEAR'].count() for year in y]
d_y

sns.set_context("notebook", font_scale=1.0)
plt.figure(figsize=(13, 3))
plt.title(" DENIED PETITIONS DISTRIBUTION PER YEAR")
# BUG FIX: keyword argument — positional Series args to countplot are
# deprecated and rejected by recent seaborn releases.
sns.countplot(x=den['YEAR'])
250+
251+
252+
# Calculating the rate at which petitions were denied per year.

# Denial rate per year as a percentage, rounded to two decimals.
# round() replaces the original "% .2f" string round-trip — same
# rounding result without formatting and re-parsing each value.
d_y_r = [round(d_y[i] / y_c[i] * 100, 2) for i in range(6)]
ratio = pd.DataFrame()
ratio['YEAR'] = y
ratio['denied rate %'] = d_y_r
ratio = ratio.set_index(['YEAR'])
ratio

ratio.T

# Back to a plain column so the year can be used as a plot axis below.
ratio = ratio.reset_index()
ratio
279+
280+
281+
# In[31]:
282+
283+
284+
# Bar chart of the yearly denial rate.
sns.set_context("notebook", font_scale=1.0)
plt.figure(figsize=(13, 3))
plt.title("DENIED PETITIONS PER YEAR")
g = sns.barplot(data=ratio, x='YEAR', y='denied rate %')
288+
289+
290+
# Calculating the number of petitions filed by each state.

f['STATES'].unique()

# BUG FIX: several state names were misspelled (CAALIFORNIA, HAWII,
# LOUISTANA, MASSACHUUSSETS, OKALHOMA, TENNESEE) and therefore always
# counted zero petitions; the dataset spells them correctly.
US_states = ['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA',
             'COLORADO', 'CONNECTICUT', 'DELAWARE', 'DISTRICT OF COLUMBIA',
             'FLORIDA', 'GEORGIA', 'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA',
             'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE',
             'MARIANA ISLANDS', 'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN',
             'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA',
             'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO',
             'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO',
             'OKLAHOMA', 'OREGON', 'PENNSYLVANIA', 'PUERTO RICO',
             'RHODE ISLAND', 'SOUTH CAROLINA', 'SOUTH DAKOTA', 'TENNESSEE',
             'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON',
             'WEST VIRGINIA', 'WISCONSIN', 'WYOMING']
p_s = [f[f.STATES == state]['STATES'].count() for state in US_states]
pe = pd.DataFrame()
pe['STATES'] = US_states
pe['FIELD PETITIONS'] = p_s
print(sum(p_s))
310+
311+
312+
# In[56]:
313+
314+
315+
# Bar chart of petitions filed per state, labels rotated vertical.
sns.set_context("notebook", font_scale=1.0)
plt.figure(figsize=(13, 7))
plt.title("FIELD PETITION BY STATE")
g = sns.barplot(data=pe, x='STATES', y='FIELD PETITIONS')
v = g.set_xticklabels(g.get_xticklabels(), rotation=90)
320+
321+
322+
# NO. OF PETITIONS DENIED BY STATE

deni = [den[den.STATES == state]['STATES'].count() for state in US_states]
de_s = pd.DataFrame()
de_s['STATES'] = US_states
de_s['DENIED PETITIONS'] = deni
de_s

print(sum(deni))
340+
341+
342+
# In[59]:
343+
344+
345+
# Bar chart of denied petitions per state, labels rotated vertical.
sns.set_context("notebook", font_scale=1.0)
plt.figure(figsize=(13, 3))
plt.title("DENIED PETITIONS BY STATE")
g = sns.barplot(data=de_s, x='STATES', y='DENIED PETITIONS')
v = g.set_xticklabels(g.get_xticklabels(), rotation=90)
350+

0 commit comments

Comments
 (0)