Skip to content

Commit 70acadd

Browse files
authored
662 rework population data download (#684)
1 parent 9740ac0 commit 70acadd

27 files changed

Lines changed: 480 additions & 789 deletions

.github/workflows/epidata_main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ jobs:
116116
run: |
117117
mkdir -p data_dl
118118
getcasedata -o data_dl --no-progress-indicators
119-
getpopuldata -o data_dl --no-progress-indicators
119+
getpopuldata -o data_dl --no-progress-indicators --username=${{ secrets.REGIODBUSER }} --password=${{ secrets.REGIODBPW }}
120120
getjhdata -o data_dl --no-progress-indicators
121121
getdividata -o data_dl --no-progress-indicators
122122
getcommutermobility -o data_dl --no-progress-indicators

pycode/examples/plot/plotResultsMapGermany.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
population = gpd.get_population_data(
7676
read_data=False, file_format=file_format,
7777
out_folder='data/pydata/Germany/', no_raw=True,
78-
split_gender=False, merge_eisenach=True)
78+
merge_eisenach=True)
7979

8080
# For fitting of different age groups we need format ">X".
8181
age_group_values = list(age_groups.values())

pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import matplotlib.pyplot as plt
3030
import numpy as np
3131
import pandas as pd
32-
import requests
3332

3433
from memilio.epidata import defaultDict as dd
3534
from memilio.epidata import getCaseData as gcd
@@ -243,13 +242,10 @@ def compare_estimated_and_rki_deathsnumbers(
243242
df_jh_week.rename(
244243
columns={'deaths_daily': 'Deaths_weekly'}, inplace=True)
245244

246-
# download weekly deaths numbers from rki
247-
if not read_data:
248-
download_weekly_deaths_numbers(data_path)
245+
df_dict = download_weekly_deaths_numbers(
246+
sheet_names=['COVID_Todesfälle'], data_path=data_path)
249247

250-
df_real_deaths_per_week = pd.read_excel(
251-
data_path + "Cases_deaths_weekly.xlsx", sheet_name='COVID_Todesfälle',
252-
header=0, engine='openpyxl')
248+
df_real_deaths_per_week = df_dict['COVID_Todesfälle']
253249
df_real_deaths_per_week.rename(
254250
columns={'Sterbejahr': 'year', 'Sterbewoche': 'week',
255251
'Anzahl verstorbene COVID-19 Fälle': 'confirmed_deaths_weekly'},
@@ -321,15 +317,11 @@ def get_weekly_deaths_data_age_gender_resolved(data_path, read_data):
321317
@param read_data False or True. Defines if data is read from file or downloaded.
322318
"""
323319

324-
if not read_data:
325-
download_weekly_deaths_numbers(data_path)
326-
327-
df_real_deaths_per_week_age = pd.read_excel(
328-
data_path + 'Cases_deaths_weekly.xlsx',
329-
sheet_name='COVID_Todesfälle_KW_AG10', header=0, engine='openpyxl')
330-
df_real_deaths_per_week_gender = pd.read_excel(
331-
data_path + 'Cases_deaths_weekly.xlsx',
332-
sheet_name='COVID_Todesfälle_KW_AG20_G', header=0, engine='openpyxl')
320+
df_dict = download_weekly_deaths_numbers(sheet_names=[
321+
'COVID_Todesfälle_KW_AG10', 'COVID_Todesfälle_KW_AG20_G'], data_path=data_path)
322+
323+
df_real_deaths_per_week_age = df_dict['COVID_Todesfälle_KW_AG10']
324+
df_real_deaths_per_week_gender = df_dict['COVID_Todesfälle_KW_AG20_G']
333325
df_real_deaths_per_week_age.rename(
334326
columns={'Sterbejahr': 'year', 'Sterbewoche': 'week',
335327
'AG 0-9 Jahre': 'age 0-9 years',
@@ -381,21 +373,25 @@ def get_weekly_deaths_data_age_gender_resolved(data_path, read_data):
381373
'cases_weekly_deaths_gender_resolved', 'json')
382374

383375

384-
def download_weekly_deaths_numbers(data_path):
376+
def download_weekly_deaths_numbers(sheet_names, data_path):
385377
"""!Downloads excel file from RKI webpage
386-
378+
@param sheet_names List. Sheet names to be returned.
387379
@param data_path Path where to store the file.
380+
381+
@return dict of dataframes with sheetnames as keys.
388382
"""
389383

390-
name_file = "Cases_deaths_weekly.xlsx"
384+
name_file = "Cases_deaths_weekly"
391385
url = "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Projekte_RKI/" \
392386
"COVID-19_Todesfaelle.xlsx?__blob=publicationFile"
393387

394-
# data_path: path where to safe Excel-file
395-
r = requests.get(url)
396-
filename = os.path.join(data_path, name_file)
397-
with open(filename, 'wb') as output_file:
398-
output_file.write(r.content)
388+
# Either download excel file from url or read json file from filepath.
389+
# Since sheet_names is a list of names, get_file returns a dict
390+
# with sheet_names as keys and their corresponding dataframes as values.
391+
df_dict = gd.get_file(filepath=data_path + name_file + '.json', url=url, read_data=False,
392+
param_dict={'sheet_name': sheet_names, 'header': 0, 'engine': 'openpyxl'})
393+
394+
return df_dict
399395

400396

401397
def main():

pycode/memilio-epidata/memilio/epidata/getCommuterMobility.py

Lines changed: 38 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -197,14 +197,16 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'],
197197
gd.write_dataframe(
198198
commuter_migration_files[state_id_file], mobility_dir, filename, 'json')
199199

200-
countykey_list = geoger.get_county_ids(merge_eisenach=False, zfill=True)
200+
countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True)
201201
govkey_list = geoger.get_governing_regions()
202202

203203
# get population data for all countys (TODO: better to provide a corresponding method for the following lines in getPopulationData itself)
204204
# This is not very nice either to have the same file with either Eisenach merged or not...
205-
206-
population = gPd.get_population_data(
207-
out_folder=out_folder, merge_eisenach=False, read_data=read_data)
205+
if read_data:
206+
population = pd.read_json(directory+'county_current_population.json')
207+
else:
208+
population = gPd.get_population_data(
209+
out_folder=out_folder, merge_eisenach=True, read_data=read_data)
208210

209211
countypop_list = list(population[dd.EngEng["population"]])
210212

@@ -244,6 +246,12 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'],
244246
counties_migratedfrom.append(
245247
np.zeros(len(gov_county_table[gov_region])))
246248

249+
# merge eisenach and wartburgkreis
250+
commuter_migration_file.iloc[:, 2].replace(
251+
'16056', '16063', inplace=True)
252+
commuter_migration_file.iloc[:, 0].replace(
253+
'16056', '16063', inplace=True)
254+
247255
current_col = countykey2numlist[commuter_migration_file.iloc[i, 0]]
248256
curr_county_migratedto = commuter_migration_file.iloc[i, 1]
249257
current_key = commuter_migration_file.iloc[i, 0]
@@ -449,30 +457,33 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'],
449457

450458
# this is neither a very elegant nor a very general way to merge...
451459
# better options to be searched for!
452-
merge_id = 16063
453-
new_idx = countykey_list.index(geoger.CountyMerging[merge_id][0])
454-
old_idx = countykey_list.index(geoger.CountyMerging[merge_id][1])
455-
456-
mat_commuter_migration[new_idx, :] = mat_commuter_migration[new_idx,
457-
:] + mat_commuter_migration[old_idx, :]
458-
mat_commuter_migration[:, new_idx] = mat_commuter_migration[:,
459-
new_idx] + mat_commuter_migration[:, old_idx]
460-
mat_commuter_migration[new_idx, new_idx] = 0
461-
462-
mat_commuter_migration = np.delete(mat_commuter_migration, old_idx, axis=0)
463-
mat_commuter_migration = np.delete(mat_commuter_migration, old_idx, axis=1)
460+
if 16056 in countykey_list:
461+
merge_id = 16063
462+
new_idx = countykey_list.index(geoger.CountyMerging[merge_id][0])
463+
old_idx = countykey_list.index(geoger.CountyMerging[merge_id][1])
464+
465+
mat_commuter_migration[new_idx, :] = mat_commuter_migration[new_idx,
466+
:] + mat_commuter_migration[old_idx, :]
467+
mat_commuter_migration[:, new_idx] = mat_commuter_migration[:,
468+
new_idx] + mat_commuter_migration[:, old_idx]
469+
mat_commuter_migration[new_idx, new_idx] = 0
470+
471+
mat_commuter_migration = np.delete(
472+
mat_commuter_migration, old_idx, axis=0)
473+
mat_commuter_migration = np.delete(
474+
mat_commuter_migration, old_idx, axis=1)
464475

465476
countykey_list = geoger.get_county_ids()
466477
df_commuter_migration = pd.DataFrame(
467478
data=mat_commuter_migration, columns=countykey_list)
468479
df_commuter_migration.index = countykey_list
469480
commuter_sanity_checks(df_commuter_migration)
470-
filename = 'migration_bfa_' + \
471-
str(ref_year) + '_dim' + str(mat_commuter_migration.shape[0])
481+
filename = 'migration_bfa_' + str(ref_year)
472482
gd.write_dataframe(df_commuter_migration, directory, filename, file_format)
473-
gd.check_dir(os.path.join(directory.split('pydata')[0], 'mobility'))
483+
directory = directory.split('pydata')[0] + 'mobility/'
484+
gd.check_dir(directory)
474485
gd.write_dataframe(
475-
df_commuter_migration, directory.split('pydata')[0] + 'mobility/',
486+
df_commuter_migration, directory,
476487
'commuter_migration_scaled_' + str(ref_year),
477488
'txt', {'sep': ' ', 'index': False, 'header': False})
478489

@@ -491,7 +502,7 @@ def commuter_sanity_checks(df):
491502

492503
def get_neighbors_mobility(
493504
countyid, direction='both', abs_tol=0, rel_tol=0, tol_comb='or',
494-
merge_eisenach=True, out_folder=dd.defaultDict['out_folder'], ref_year=2022):
505+
out_folder=dd.defaultDict['out_folder'], ref_year=2022):
495506
'''! Returns the neighbors of a particular county ID depending on the
496507
commuter mobility and given absolute and relative thresholds on the number
497508
of commuters.
@@ -524,12 +535,8 @@ def get_neighbors_mobility(
524535
directory = os.path.join(out_folder, 'Germany/')
525536
gd.check_dir(directory)
526537
try:
527-
if merge_eisenach:
528-
commuter = gd.get_file(os.path.join(
529-
directory, "migration_bfa_"+str(ref_year)+"_dim400.json"), None, True)
530-
else:
531-
commuter = gd.get_file(os.path.join(
532-
directory, "migration_bfa_"+str(ref_year)+"_dim401.json"), None, True)
538+
commuter = gd.get_file(os.path.join(
539+
directory, "migration_bfa_"+str(ref_year)+"_dim400.json"), read_data=True)
533540
except FileNotFoundError:
534541
print("Commuter data was not found. Download and process it from the internet.")
535542
commuter = get_commuter_data(out_folder=out_folder, ref_year=ref_year)
@@ -556,7 +563,7 @@ def get_neighbors_mobility(
556563

557564
def get_neighbors_mobility_all(
558565
direction='both', abs_tol=0, rel_tol=0, tol_comb='or',
559-
merge_eisenach=True, out_folder=dd.defaultDict['out_folder'], ref_year=2022):
566+
out_folder=dd.defaultDict['out_folder'], ref_year=2022):
560567
'''! Returns the neighbors of all county IDs depending on the
561568
commuter mobility and given absolute and relative thresholds on the number
562569
of commuters.
@@ -580,15 +587,14 @@ def get_neighbors_mobility_all(
580587
'''
581588
directory = os.path.join(out_folder, 'Germany/')
582589
gd.check_dir(directory)
583-
countyids = geoger.get_county_ids(merge_eisenach=merge_eisenach)
590+
countyids = geoger.get_county_ids()
584591
neighbors_table = []
585-
# TODO:
592+
# TODO: performance has to be improved
586593
for id in countyids:
587594
neighbors_table.append(
588595
get_neighbors_mobility(
589596
id, direction=direction, abs_tol=abs_tol,
590597
rel_tol=rel_tol, tol_comb=tol_comb,
591-
merge_eisenach=merge_eisenach,
592598
out_folder=out_folder, ref_year=ref_year))
593599

594600
return dict(zip(countyids, neighbors_table))
@@ -612,7 +618,7 @@ def main():
612618
arg_dict_commuter = {**arg_dict, "setup_dict": setup_dict}
613619

614620
get_neighbors_mobility(
615-
1001, abs_tol=0, rel_tol=0, tol_comb='or', merge_eisenach=True,
621+
1001, abs_tol=0, rel_tol=0, tol_comb='or',
616622
out_folder=dd.defaultDict['out_folder'])
617623

618624
get_commuter_data(**arg_dict_commuter)

pycode/memilio-epidata/memilio/epidata/getDIVIData.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,10 @@ def divi_data_sanity_checks(df=pd.DataFrame()):
207207
raise gd.DataError("Error: Data categories have changed.")
208208
# check if size of dataframe is not unusual
209209
# data collection starts at 24.04.2020
210+
# TODO: Number of reporting counties get less with time.
211+
# Maybe we should look for a new method to sanitize the size of the DataFrame.
210212
num_dates = (date.today() - date(2020, 4, 24)).days
211-
min_num_data = 390*num_dates # not all 400 counties report every day
213+
min_num_data = 380*num_dates # not all 400 counties report every day
212214
max_num_data = 400*num_dates
213215
if (len(df) < min_num_data) or (len(df) > max_num_data):
214216
raise gd.DataError("Error: unexpected length of dataframe.")

pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,7 @@ def get_file(
173173

174174
if read_data:
175175
try:
176-
if filepath.endswith('xlsx'):
177-
df = pd.read_excel(filepath, **param_dict)
178-
else:
179-
df = pd.read_json(filepath)
176+
df = pd.read_json(filepath)
180177
except FileNotFoundError:
181178
if interactive and user_choice(
182179
"Warning: The file: " + filepath +
@@ -217,9 +214,12 @@ def get_file(
217214
if df.empty:
218215
raise DataError("Error: Dataframe is empty.")
219216
except AttributeError:
220-
for i in range(len(df)):
221-
if df[i].empty:
222-
raise DataError("Error: Dataframe is empty.")
217+
if isinstance(df, list) or isinstance(df, dict):
218+
for i in df:
219+
if df[i].empty:
220+
raise DataError("Error: Dataframe is empty.")
221+
else:
222+
raise DataError("Could not catch type of df: " + str(type(df)))
223223
return df
224224

225225

@@ -263,7 +263,7 @@ def cli(what):
263263
cli_dict = {"divi": ['Downloads data from DIVI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'],
264264
"cases": ['Download case data from RKI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'split_berlin', 'rep_date'],
265265
"cases_est": ['Download case data from RKI and JHU and estimate recovered and deaths', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'split_berlin', 'rep_date'],
266-
"population": ['Download population data from official sources'],
266+
"population": ['Download population data from official sources', 'username'],
267267
"commuter_official": ['Download commuter data from official sources', 'make_plot'],
268268
"vaccination": ['Download vaccination data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'sanitize_data'],
269269
"testing": ['Download testing data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'],
@@ -351,6 +351,14 @@ def cli(what):
351351
help='Disables all progress indicators (used for downloads etc.).',
352352
action='store_true')
353353

354+
if 'username' in what_list:
355+
parser.add_argument(
356+
'--username', type=str
357+
)
358+
359+
parser.add_argument(
360+
'--password', type=str
361+
)
354362
args = vars(parser.parse_args())
355363
# disable progress indicators globally, if the argument --no-progress-indicators was specified
356364
progress_indicator.ProgressIndicator.disable_indicators(

0 commit comments

Comments (0)