forked from AllenDowney/ThinkStats2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchap13soln.py
More file actions
111 lines (78 loc) · 2.89 KB
/
chap13soln.py
File metadata and controls
111 lines (78 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
import pandas
import numpy as np
import thinkplot
import thinkstats2
import survival
def CleanData(resp):
"""Cleans respondent data.
resp: DataFrame
"""
resp.cmdivorcx.replace([9998, 9999], np.nan, inplace=True)
resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0
month0 = pandas.to_datetime('1899-12-15')
dates = [month0 + pandas.DateOffset(months=cm)
for cm in resp.cmbirth]
resp['decade'] = (pandas.DatetimeIndex(dates).year - 1900) // 10
def ResampleDivorceCurve(resps):
"""Plots divorce curves based on resampled data.
resps: list of respondent DataFrames
"""
for _ in range(41):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
PlotDivorceCurveByDecade(sample, color='#225EA8', alpha=0.1)
thinkplot.Show(xlabel='years',
axis=[0, 28, 0, 1])
def ResampleDivorceCurveByDecade(resps):
"""Plots divorce curves for each birth cohort.
resps: list of respondent DataFrames
"""
for i in range(41):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
groups = sample.groupby('decade')
if i == 0:
survival.AddLabelsByDecade(groups, alpha=0.7)
EstimateSurvivalByDecade(groups, alpha=0.1)
thinkplot.Save(root='survival7',
xlabel='years',
axis=[0, 28, 0, 1])
def EstimateSurvivalByDecade(groups, **options):
"""Groups respondents by decade and plots survival curves.
groups: GroupBy object
"""
thinkplot.PrePlot(len(groups))
for name, group in groups:
print(name, len(group))
_, sf = EstimateSurvival(group)
thinkplot.Plot(sf, **options)
def EstimateSurvival(resp):
"""Estimates the survival curve.
resp: DataFrame of respondents
returns: pair of HazardFunction, SurvivalFunction
"""
complete = resp[resp.notdivorced == 0].duration
ongoing = resp[resp.notdivorced == 1].durationsofar
hf = survival.EstimateHazardFunction(complete, ongoing)
sf = hf.MakeSurvival()
return hf, sf
def main():
resp6 = survival.ReadFemResp2002()
CleanData(resp6)
married6 = resp6[resp6.evrmarry==1]
resp7 = survival.ReadFemResp2010()
CleanData(resp7)
married7 = resp7[resp7.evrmarry==1]
ResampleDivorceCurveByDecade([married6, married7])
if __name__ == '__main__':
main()