-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathformat.py
More file actions
104 lines (91 loc) · 2.61 KB
/
format.py
File metadata and controls
104 lines (91 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime,timedelta
from pymongo import MongoClient
import time
# Load the selected fields of every document in the "lagou" collection of the
# "dataanalysis" database into a DataFrame. MongoClient() is called without
# arguments, so the driver's default connection settings apply.
mongo = MongoClient()["dataanalysis"]["lagou"]
values = mongo.find(
    {},
    {
        "_id": 0,  # exclude the Mongo document id from the result
        "positionAdvantage": 1,
        "salary": 1,
        "city": 1,
        "positionName": 1,
        "workYear": 1,
        "education": 1,
        "industryField": 1,
        "companySize": 1,
        "financeStage": 1,
        "firstType": 1,
        "secondType": 1,
        "thirdType": 1,
    },
)
# Materialize the query result before handing it to pandas.
values = list(values)
df = pd.DataFrame(values)
# Format company size
def length(data, type):
    """Parse a company-size label into a head-count bound.

    Args:
        data: Row-like object whose ``.values`` holds the size label as its
            first element (for example the one-column row handed over by
            ``DataFrame.apply(..., axis=1)``).
        type: ``1`` selects the lower bound; any other value selects the
            upper bound. (The name shadows the builtin but is kept so
            existing callers keep working.)

    Returns:
        int: ``0`` when the label is missing, empty or not a string.
        For labels containing "以上" ("or more"): ``2000`` / ``10000``.
        For "low-high人" ranges: the parsed low / high number.
        For any other label: ``0`` / ``15``.

    Raises:
        ValueError: If a range label has a non-numeric bound.
        IndexError: If a label containing "-" has no second part.
    """
    value = data.values
    # Use len() instead of truthiness: bool() of an empty NumPy array is
    # deprecated and raises in newer NumPy releases.
    if len(value) == 0:
        return 0
    value = value[0]
    # Non-string cells (e.g. NaN for a missing value) are truthy but have no
    # .find(); treat them like an empty label instead of crashing.
    if not isinstance(value, str) or not value:
        return 0
    wants_min = type == 1
    if "以上" in value:
        return 2000 if wants_min else 10000
    if "-" in value:
        bounds = value.replace("人", "").split("-")
        return int(bounds[0]) if wants_min else int(bounds[1])
    # NOTE(review): presumably the "fewer than 15 people" bucket - confirm.
    return 0 if wants_min else 15
def min_staff(data):
    """Return the lower head-count bound for a row by calling ``length`` with type 1."""
    return length(data, type=1)
def max_staff(data):
    """Return the upper head-count bound for a row by calling ``length`` with type 2."""
    return length(data, type=2)
# Derive numeric staff bounds row by row from the one-column "companySize"
# frame, then drop the raw text column.
df["min_staff"] = df[["companySize"]].apply(min_staff, axis="columns")
df["max_staff"] = df[["companySize"]].apply(max_staff, axis="columns")
df = df.drop(columns=["companySize"])
# Format salary
def salary(data, type):
    """Parse a "<low>k-<high>k" salary label into an amount.

    The numbers in the label are multiplied by 1000.

    Args:
        data: Row-like object whose ``.values`` holds the salary label as its
            first element (for example the one-column row handed over by
            ``DataFrame.apply(..., axis=1)``).
        type: ``1`` returns the lower bound, ``2`` the upper bound, and any
            other value the mean of both. (The name shadows the builtin but
            is kept so existing callers keep working.)

    Returns:
        int | float: ``0`` when the label is missing, not a string, or has
        no "-"; otherwise the selected bound as an int, or the mean as a
        float.

    Raises:
        ValueError: If a requested bound is not an integer after the
            "k"/"K" suffixes are removed.
    """
    value = data.values
    # Use len() instead of truthiness: bool() of an empty NumPy array is
    # deprecated and raises in newer NumPy releases.
    if len(value) == 0:
        return 0
    value = value[0]
    # Non-string cells (e.g. NaN for a missing value) are truthy but have no
    # .find(); treat them as "no salary" instead of crashing.
    if not isinstance(value, str) or "-" not in value:
        return 0
    bounds = value.replace("k", "").replace("K", "").split("-")
    # Parse only the bound(s) actually requested, so a malformed other half
    # does not break the call.
    if type == 1:
        return int(bounds[0]) * 1000
    if type == 2:
        return int(bounds[1]) * 1000
    low = int(bounds[0]) * 1000
    high = int(bounds[1]) * 1000
    return (low + high) / 2
def min_salary(data):
    """Return the lower salary bound for a row by calling ``salary`` with type 1."""
    return salary(data, type=1)
def max_salary(data):
    """Return the upper salary bound for a row by calling ``salary`` with type 2."""
    return salary(data, type=2)
def avg_salary(data):
    """Return the mean of both salary bounds for a row by calling ``salary`` with type 3."""
    return salary(data, type=3)
# Derive numeric salary columns row by row from the one-column "salary"
# frame; the raw "salary" column itself is kept.
df["min_salary"] = df[["salary"]].apply(min_salary, axis="columns")
df["max_salary"] = df[["salary"]].apply(max_salary, axis="columns")
df["avg_salary"] = df[["salary"]].apply(avg_salary, axis="columns")
# Format language
def language(data):
    """Classify a position title by the programming language it mentions.

    Matching is case-insensitive and checked in this order: Python, C++,
    standalone C, Java, PHP. The first hit wins.

    Args:
        data: Row-like object whose ``.values`` holds the position title as
            its first element (for example the one-column row handed over by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        str | None: ``"python"``, ``"c/c++"``, ``"java"`` or ``"php"``;
        ``None`` when the title is missing, not a string, or names none of
        these languages.
    """
    # Function-scope import: `re` is not among the file's visible imports.
    import re

    value = data.values
    # Use len() instead of truthiness: bool() of an empty NumPy array is
    # deprecated and raises in newer NumPy releases.
    if len(value) == 0:
        return None
    value = value[0]
    # Non-string cells (e.g. NaN for a missing value) are truthy but have no
    # .upper(); treat them as "no language" instead of crashing.
    if not isinstance(value, str) or not value:
        return None
    value = value.upper()
    if "PYTHON" in value:
        return "python"
    if "C++" in value:
        return "c/c++"
    # Match "C" only as a standalone token. A plain substring test also hits
    # ARCHITECT, ORACLE, SCALA, ... and pre-empts the JAVA/PHP checks below.
    # A trailing "#" is excluded so that C# is not labelled as C/C++.
    if re.search(r"(?<![A-Z0-9])C(?![A-Z0-9#])", value):
        return "c/c++"
    # NOTE(review): titles containing "JAVASCRIPT" still map to "java" via
    # this substring test - confirm whether that is intended.
    if "JAVA" in value:
        return "java"
    if "PHP" in value:
        return "php"
    return None
# Tag each row with the language detected in "positionName", then drop every
# row that has a missing value in any column (including rows where no
# language was detected).
df["language"] = df[["positionName"]].apply(language, axis="columns")
df = df.dropna(axis="index", how="any")