Skip to content

Commit a97183a

Browse files
authored
916-refactor-epidata-to-support-ingestion-into-the-LOKI-database (#1024)
Co-authored-by: Mariama Jaiteh <[email protected]> This PR refactors the memilio-epidata package to support data ingestion using Python objects rather than files. No preexisting functionality has been changed. A CLI argument was added to select the option of returning objects instead of writing files. This PR allows integration with external packages of the LOKI project.
1 parent 7518ea3 commit a97183a

8 files changed

Lines changed: 1008 additions & 439 deletions

File tree

pycode/memilio-epidata/memilio/epidata/defaultDict.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
'file_format': 'json_timeasstring',
5252
'no_raw': False,
5353
'rep_date': False,
54-
'sanitize_data': 1
54+
'sanitize_data': 1,
5555
}
5656

5757
# The following dict EngEng makes sure that for all

pycode/memilio-epidata/memilio/epidata/download_config.conf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,6 @@ no_raw = False
3232

3333
# matplotlib backend to use
3434
mpl_backend = QtAgg
35+
36+
# to_dataset: if True, do not generate JSON files but return Python objects instead
37+
to_dataset = False

pycode/memilio-epidata/memilio/epidata/getCaseData.py

Lines changed: 289 additions & 151 deletions
Large diffs are not rendered by default.

pycode/memilio-epidata/memilio/epidata/getDIVIData.py

Lines changed: 194 additions & 65 deletions
Large diffs are not rendered by default.

pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def __init__(self, out_folder, **kwargs):
8484

8585
# activate CoW for more predictable behaviour of pandas DataFrames
8686
pd.options.mode.copy_on_write = True
87-
8887
# read in config file
8988
# if no config file is given, use default values
9089
if os.path.exists(path):
@@ -105,12 +104,17 @@ def __init__(self, out_folder, **kwargs):
105104
if key not in kwargs:
106105
kwargs.update({key: parser['SETTINGS'][key]})
107106

108-
Conf.show_progr = True if kwargs['show_progress'] == 'True' else False
107+
Conf.show_progr = True if str(
108+
kwargs['show_progress']) == 'True' else False
109109
Conf.v_level = str(kwargs['verbosity_level'])
110-
self.checks = True if kwargs['run_checks'] == 'True' else False
111-
self.interactive = True if kwargs['interactive'] == 'True' else False
112-
self.plot = True if kwargs['make_plot'] == 'True' else False
113-
self.no_raw = True if kwargs['no_raw'] == 'True' else False
110+
self.checks = True if str(
111+
kwargs['run_checks']) == 'True' else False
112+
self.interactive = True if str(
113+
kwargs['interactive']) == 'True' else False
114+
self.plot = True if str(kwargs['make_plot']) == 'True' else False
115+
self.no_raw = True if str(kwargs['no_raw']) == 'True' else False
116+
self.to_dataset = True if str(
117+
kwargs['to_dataset']) == 'True' else False
114118
else:
115119
# default values:
116120
Conf.show_progr = kwargs['show_progress'] if 'show_progress' in kwargs.keys(
@@ -126,6 +130,8 @@ def __init__(self, out_folder, **kwargs):
126130
self.no_raw = kwargs['no_raw'] if 'no_raw' in kwargs.keys(
127131
) else dd.defaultDict['no_raw']
128132
self.path_to_use = out_folder
133+
self.to_dataset = kwargs['to_dataset'] if 'to_dataset' in kwargs.keys(
134+
) else False
129135

130136
# suppress FutureWarnings & DeprecationWarnings
131137
if VerbosityLevel[Conf.v_level].value <= 2:
@@ -354,6 +360,7 @@ def cli(what):
354360
- no_raw
355361
- username
356362
- password
363+
- to_dataset
357364
358365
@param what Defines what packages calls and thus what kind of command line arguments should be defined.
359366
"""
@@ -493,6 +500,13 @@ def cli(what):
493500
parser.add_argument(
494501
'--password', type=str
495502
)
503+
if '--to-dataset' in sys.argv:
504+
parser.add_argument(
505+
'--to-dataset', dest='to_dataset',
506+
help="To return saved dataframes as objects.",
507+
action='store_true'
508+
)
509+
496510
args = vars(parser.parse_args())
497511

498512
return args

pycode/memilio-epidata/memilio/epidata/getPopulationData.py

Lines changed: 125 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,15 @@
4242

4343

4444
def read_population_data(username, password):
45-
'''! Reads Population data from regionalstatistik.de
45+
"""! Reads Population data from regionalstatistik.de
4646
4747
Username and Password are required to sign in on regionalstatistik.de.
4848
A request is made to regionalstatistik.de and the StringIO is read in as a csv into the dataframe format.
4949
5050
@param username Username to sign in at regionalstatistik.de.
5151
@param password Password to sign in at regionalstatistik.de.
5252
@return DataFrame
53-
'''
53+
"""
5454

5555
download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
5656
req = requests.get(download_url, auth=(username, password))
@@ -63,14 +63,14 @@ def read_population_data(username, password):
6363

6464

6565
def path_to_credential_file():
66-
'''Returns path to .ini file where credentials are stored.
66+
"""! Returns path to .ini file where credentials are stored.
6767
The path can be changed if necessary.
68-
'''
68+
"""
6969
return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'CredentialsRegio.ini')
7070

7171

7272
def manage_credentials(interactive):
73-
'''! Manages credentials for regionalstatistik.de (needed for download).
73+
"""! Manages credentials for regionalstatistik.de (needed for download).
7474
7575
A config file inside the epidata folder is either written (if it does not exist yet)
7676
with input from user or read with following format:
@@ -79,7 +79,7 @@ def manage_credentials(interactive):
7979
Password = XXXXX
8080
8181
@return Username and password to sign in at regionalstatistik.de.
82-
'''
82+
"""
8383
# path where ini file is found
8484
path = path_to_credential_file()
8585

@@ -118,8 +118,8 @@ def manage_credentials(interactive):
118118
return username, password
119119

120120

121-
def export_population_dataframe(df_pop, directory, file_format, merge_eisenach):
122-
'''! Writes population dataframe into directory with new column names and age groups
121+
def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_format: str, merge_eisenach: bool):
122+
"""! Writes population dataframe into directory with new column names and age groups
123123
124124
@param df_pop Population data DataFrame to be exported
125125
@param directory Directory where data is written to.
@@ -128,7 +128,7 @@ def export_population_dataframe(df_pop, directory, file_format, merge_eisenach):
128128
and 'Eisenach' are listed separately or
129129
combined as one entity 'Wartburgkreis'.
130130
@return exported DataFrame
131-
'''
131+
"""
132132

133133
new_cols = [
134134
dd.EngEng['idCounty'],
@@ -194,7 +194,7 @@ def export_population_dataframe(df_pop, directory, file_format, merge_eisenach):
194194

195195

196196
def assign_population_data(df_pop_raw, counties, age_cols, idCounty_idx):
197-
'''! Assigns population data of all counties of old dataframe in new created dataframe
197+
"""! Assigns population data of all counties of old dataframe in new created dataframe
198198
199199
In df_pop_raw there might be additional information like federal states,
200200
governing regions etc. which is not necessary for the dataframe.
@@ -205,7 +205,7 @@ def assign_population_data(df_pop_raw, counties, age_cols, idCounty_idx):
205205
@param age_cols Age groups in old DataFrame
206206
@param idCounty_idx Indexes in old DataFrame where data of corresponding county starts
207207
@return new DataFrame
208-
'''
208+
"""
209209

210210
new_cols = {dd.EngEng['idCounty']: counties[:, 1],
211211
dd.EngEng['county']: counties[:, 0]}
@@ -283,45 +283,25 @@ def test_total_population(df_pop, age_cols):
283283
raise gd.DataError('Total Population does not match expectation.')
284284

285285

286-
def get_population_data(read_data=dd.defaultDict['read_data'],
287-
file_format=dd.defaultDict['file_format'],
288-
out_folder=dd.defaultDict['out_folder'],
289-
merge_eisenach=True,
290-
username='',
291-
password='',
292-
**kwargs):
293-
"""! Download age-stratified population data for the German counties.
294-
295-
The data we use is:
296-
Official 'Bevölkerungsfortschreibung' 12411-02-03-4:
297-
'Bevölkerung nach Geschlecht und Altersgruppen (17)'
298-
of regionalstatistik.de.
299-
ATTENTION: The raw file cannot be downloaded
300-
automatically by our scripts without an Genesis Online account. In order to
301-
work on this dataset, please enter your username and password or manually download it from:
302-
303-
https://www.regionalstatistik.de/genesis/online -> "1: Gebiet, Bevölkerung,
304-
Arbeitsmarkt, Wahlen" -> "12: Bevölkerung" -> "12411 Fortschreibung des
305-
Bevölkerungsstandes" -> "12411-02-03-4: Bevölkerung nach Geschlecht und
306-
Altersgruppen (17) - Stichtag 31.12. - regionale Tiefe: Kreise und
307-
krfr. Städte".
308-
309-
Download the xlsx or csv file and put it under dd.defaultDict['out_folder'],
310-
this normally is Memilio/data/pydata/Germany.
311-
The folders 'pydata/Germany' have to be created if they do not exist yet.
312-
Then this script can be run.
286+
def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
287+
out_folder: str = dd.defaultDict['out_folder'],
288+
username='',
289+
password='',
290+
**kwargs
291+
) -> pd.DataFrame:
292+
"""! Downloads or reads the population data.
293+
If it does not already exist, the folder Germany is generated in the given out_folder.
294+
If read_data == True and the file "FullData_population.json" exists, the data is read from this file
295+
and stored in a pandas dataframe. If read_data = True and the file does not exist the program is stopped.
296+
The downloaded dataframe is written to the file "FullData_population".
313297
314298
@param read_data False or True. Defines if data is read from file or
315299
downloaded. Default defined in defaultDict.
316-
@param file_format File format which is used for writing the data.
317-
Default defined in defaultDict.
318300
@param out_folder Path to folder where data is written in folder
319301
out_folder/Germany. Default defined in defaultDict.
320-
@param merge_eisenach [Default: True] or False. Defines whether the
321-
counties 'Wartburgkreis' and 'Eisenach' are listed separately or
322-
combined as one entity 'Wartburgkreis'.
323-
@param username Username to sign in at regionalstatistik.de.
302+
@param username Username to sign in at regionalstatistik.de.
324303
@param password Password to sign in at regionalstatistik.de.
304+
325305
@return DataFrame with adjusted population data for all ages to current level.
326306
"""
327307
conf = gd.Conf(out_folder, **kwargs)
@@ -341,6 +321,22 @@ def get_population_data(read_data=dd.defaultDict['read_data'],
341321

342322
df_pop_raw = read_population_data(username, password)
343323

324+
return df_pop_raw
325+
326+
327+
def preprocess_population_data(df_pop_raw: pd.DataFrame,
328+
merge_eisenach: bool = True,
329+
) -> pd.DataFrame:
330+
"""! Processing of the downloaded data
331+
* the columns are renamed to English and the state and county names are added.
332+
333+
@param df_pop_raw pd.DataFrame. A Dataframe containing input population data
334+
@param merge_eisenach [Default: True] or False. Defines whether the
335+
counties 'Wartburgkreis' and 'Eisenach' are listed separately or
336+
combined as one entity 'Wartburgkreis'.
337+
338+
@return df pd.DataFrame. Processed population data
339+
"""
344340
column_names = list(df_pop_raw.columns)
345341
# rename columns
346342
rename_columns = {
@@ -381,12 +377,96 @@ def get_population_data(read_data=dd.defaultDict['read_data'],
381377

382378
df_pop = assign_population_data(
383379
df_pop_raw, counties, age_cols, idCounty_idx)
384-
385380
test_total_population(df_pop, age_cols)
381+
return df_pop
386382

383+
384+
def write_population_data(df_pop: pd.DataFrame,
                          out_folder: str = dd.defaultDict['out_folder'],
                          file_format: str = dd.defaultDict['file_format'],
                          merge_eisenach: bool = True
                          ) -> pd.DataFrame:
    """! Export the processed population data and return the exported DataFrame.

    The data is handed to export_population_dataframe(), which writes it into
    the directory out_folder/Germany with new column names and age groups.
    The names of the written files are determined by
    export_population_dataframe().

    @param df_pop pd.DataFrame. A Dataframe containing processed population data
    @param out_folder str. Folder where data is written to (in the subfolder
        'Germany'). Default defined in defaultDict.
    @param file_format str. File format which is used for writing the data.
        Default defined in defaultDict.
    @param merge_eisenach [Default: True] or False. Defines whether the
        counties 'Wartburgkreis' and 'Eisenach' are listed separately or
        combined as one entity 'Wartburgkreis'.

    @return pd.DataFrame. The DataFrame returned by export_population_dataframe().
    """
    # all population output lives in the 'Germany' subfolder of out_folder
    directory = os.path.join(out_folder, 'Germany')
    df_pop_export = export_population_dataframe(
        df_pop, directory, file_format, merge_eisenach)
    return df_pop_export
408+
409+
410+
def get_population_data(read_data: bool = dd.defaultDict['read_data'],
                        file_format: str = dd.defaultDict['file_format'],
                        out_folder: str = dd.defaultDict['out_folder'],
                        merge_eisenach: bool = True,
                        username: str = '',
                        password: str = '',
                        **kwargs
                        ) -> pd.DataFrame:
    """! Download age-stratified population data for the German counties.

    The data we use is:
    Official 'Bevölkerungsfortschreibung' 12411-02-03-4:
    'Bevölkerung nach Geschlecht und Altersgruppen (17)'
    of regionalstatistik.de.
    ATTENTION: The raw file cannot be downloaded
    automatically by our scripts without a Genesis Online account. In order to
    work on this dataset, please enter your username and password or manually download it from:

    https://www.regionalstatistik.de/genesis/online -> "1: Gebiet, Bevölkerung,
    Arbeitsmarkt, Wahlen" -> "12: Bevölkerung" -> "12411 Fortschreibung des
    Bevölkerungsstandes" -> "12411-02-03-4: Bevölkerung nach Geschlecht und
    Altersgruppen (17) - Stichtag 31.12. - regionale Tiefe: Kreise und
    krfr. Städte".

    Download the xlsx or csv file and put it under dd.defaultDict['out_folder'],
    this normally is Memilio/data/pydata/Germany.
    The folders 'pydata/Germany' have to be created if they do not exist yet.
    Then this script can be run.

    The work is done in three steps: fetch_population_data() (download or
    read), preprocess_population_data() and write_population_data().

    @param read_data False or True. Defines if data is read from file or
        downloaded. Default defined in defaultDict.
    @param file_format File format which is used for writing the data.
        Default defined in defaultDict.
    @param out_folder Path to folder where data is written in folder
        out_folder/Germany. Default defined in defaultDict.
    @param merge_eisenach [Default: True] or False. Defines whether the
        counties 'Wartburgkreis' and 'Eisenach' are listed separately or
        combined as one entity 'Wartburgkreis'.
    @param username str. Username to sign in at regionalstatistik.de.
    @param password str. Password to sign in at regionalstatistik.de.
    @param kwargs Further keyword arguments, forwarded to fetch_population_data().
    @return DataFrame with adjusted population data for all ages to current level.
    """
    # file_format is not a named parameter of fetch_population_data; it is
    # absorbed by that function's **kwargs and passed on to gd.Conf.
    raw_df = fetch_population_data(
        read_data=read_data,
        out_folder=out_folder,
        file_format=file_format,
        username=username,
        password=password,
        **kwargs
    )
    preprocess_df = preprocess_population_data(
        df_pop_raw=raw_df,
        merge_eisenach=merge_eisenach
    )
    # Forward the caller's merge_eisenach choice to the export step as well;
    # hard-coding True here would silently ignore merge_eisenach=False.
    df_pop_export = write_population_data(
        df_pop=preprocess_df,
        file_format=file_format,
        out_folder=out_folder,
        merge_eisenach=merge_eisenach
    )
    return df_pop_export
391471

392472

0 commit comments

Comments
 (0)