forked from AllenDowney/ThinkStats2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhinc2.py
More file actions
61 lines (43 loc) · 1.58 KB
/
hinc2.py
File metadata and controls
61 lines (43 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
import numpy as np
import hinc
import thinkplot
import thinkstats2
def InterpolateSample(df, log_upper=6.0, log_lower=3.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    Side effect: adds (or overwrites) the columns log_upper and log_lower
    on df.

    df: DataFrame with columns income and freq; each row is one income
        range, income being the upper bound of that range
    log_upper: log10 of the assumed upper bound for the highest range
    log_lower: log10 of the assumed lower bound for the lowest range

    returns: NumPy array of log10 household income (empty if df has no rows)
    """
    # compute the log10 of the upper bound for each range
    df['log_upper'] = np.log10(df.income)

    # get the lower bounds by shifting the upper bounds down one row;
    # this leaves the first lower bound undefined
    df['log_lower'] = df.log_upper.shift(1)

    # nothing to interpolate: np.concatenate would fail on an empty list
    if len(df) == 0:
        return np.array([])

    # fill in the first lower bound and plug in a value for the unknown
    # upper bound of the highest range.  Positional .iloc assignment
    # avoids chained assignment (which may silently write to a copy) and
    # does not depend on the row labels or on the number of rows.
    df.iloc[0, df.columns.get_loc('log_lower')] = log_lower
    df.iloc[-1, df.columns.get_loc('log_upper')] = log_upper

    # use the freq column to generate the right number of values in
    # each range; np.linspace needs an integer count, so convert
    # explicitly in case freq is stored as a float
    arrays = [
        np.linspace(lower, upper, int(freq))
        for lower, upper, freq in zip(df.log_lower, df.log_upper, df.freq)
    ]

    # collect the arrays into a single sample
    return np.concatenate(arrays)
def main():
    """Plots the CDF of the interpolated log10 household income sample."""
    # read the income table and expand it into a sample of log10 incomes
    income_table = hinc.ReadData()
    sample = InterpolateSample(income_table, log_upper=6.0)

    # build the CDF of the sample and draw it
    cdf = thinkstats2.Cdf(sample)
    thinkplot.Cdf(cdf)

    axis_labels = dict(xlabel='household income', ylabel='CDF')
    thinkplot.Show(**axis_labels)
# run main only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()