forked from Echo9573/DataAnalysisbyPython
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1dataExplore.py
More file actions
66 lines (40 loc) · 1.58 KB
/
1dataExplore.py
File metadata and controls
66 lines (40 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
# Source table for the whole analysis. Per the original author's note it holds
# the influencing-factor figures for the years 1994 through 2013.
inputfile1 = 'data1.csv'
data = pd.read_csv(filepath_or_buffer=inputfile1)
data  # notebook-style echo of the loaded frame
# In[2]:
# ---------------* 1_1summaryMeasure *---------------
# Descriptive summary: minimum, maximum, mean and standard deviation of
# every column of `data`.
r = pd.DataFrame(
    [data.min(), data.max(), data.mean(), data.std()],
    index=['Min', 'Max', 'Mean', 'Std'],
).T  # transpose: one row per variable, one column per statistic
result = r.round(2)  # keep two decimal places
# Original author's note: data.describe().T[['min', 'max', 'mean', 'std']]
# rounded to two decimals yields the same exploration table.
# Output tables are named "1_k<purpose>"; this is table 1 of this section and
# its purpose is summaryMeasure (descriptive statistics).
result.to_excel(excel_writer='1_1summaryMeasure.xlsx')
result
# In[3]:
# ---------------* 1_2relatedAnalysis *---------------
# Pairwise correlation between the variables. Pearson is used here; the
# original note lists 'kendall' and 'spearman' as the alternative methods.
result1 = data.corr(method='pearson').round(2)
# Output tables are named "1_k<purpose>"; this is table 2 of this section and
# its purpose is relatedAnalysis (correlation analysis).
result1.to_csv(path_or_buf="1_2relatedAnalysis.csv")
result1
# In[4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
# Suppress all warnings so library chatter does not clutter the cell output.
warnings.filterwarnings('ignore')

# Switch matplotlib to inline rendering, but only when an IPython shell exists.
# `get_ipython` is injected by IPython/Jupyter; under plain `python` the name
# is undefined, so this exported script would otherwise stop with a NameError.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None

if ipython_shell is not None:
    # `InteractiveShell.magic` is deprecated; `run_line_magic` is the
    # supported way to invoke `%matplotlib inline`.
    ipython_shell.run_line_magic('matplotlib', 'inline')
# In[6]:
# Correlation matrix of `data`, using pandas' default method (Pearson).
corrmat = data.corr()
# One 12x9 inch figure; the heatmap is drawn on the axes just created, with
# the colour scale capped at 0.8 and square cells.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)
# In[ ]:
# In[ ]: