956 update Epidata Readme (#957)

patricklnz · web-flow · commit 870a8b18b231 · 2024-04-02T13:38:44.000+02:00
diff --git a/pycode/memilio-epidata/README.rst b/pycode/memilio-epidata/README.rst
@@ -136,16 +136,19 @@ If a new functionality shall be added please stick to the following instructions
 When you start creating a new script:
 
 - have a look into getDataIntoPandasDataFrame.py there the main functionality which should be used is implemented.
-   - get_file is used to read in data
-   - use the dictionaries in defaultDict.py to rename the existing columns of you data
-      - add new column names to one of the existing languages; english, german and spanish translation exists at the moment.
-      - for non-english languages always use the EngEng dictionary as the key, thus we can easily change names with just changing one line.
-      - in defaultDict.py a dictionary with id and state and county name, respectivly exists. Please use it.
+    - get_file is used to read in data.
+    - the Conf class sets relevant download options.
+    - use write_dataframe to write the pandas dataframe to file.
+    - use check_dir if you want to create a new folder to write data to.
+- use the dictionaries in defaultDict.py to rename the existing columns of your data
+    - add new column names to one of the existing language dictionaries; English, German and Spanish translations exist at the moment.
+    - for non-English languages always use the EngEng dictionary as the key, so that names can easily be changed by editing just one line.
+    - in defaultDict.py, dictionaries with id and name exist for states and counties, respectively. Please use them.
 - After renaming columns, you should not use pandas dataframe.column but instead use
   dataframe[column] where column is given by the dictionaries in defaultDict.py.
   Example: ID_County = dd.GerEng['IdLandkreis'] or dd.EngEng['idCounty'].
-- use check_dir of getDataIntoPandasDataFrame.py if you want to create a new folder to write data to
-- use write_dataframe of getDataIntoPandasDataFrame.py to write the pandas dataframe to file.
+- For extensive operations use the progress indicator to give feedback to the user.
+- ALWAYS use Copy-on-Write for pandas DataFrames.
 - use doxygen like comments in code as
     - add description in the beginning of the file
         - ## Header
diff --git a/pycode/memilio-epidata/memilio/epidata/README.rst b/pycode/memilio-epidata/memilio/epidata/README.rst
@@ -134,6 +134,8 @@ optional arguments working for some are:
 +---------------------------------------------+-----------------------------------------------------------+
 | --password                                  | Password for regionalstatistik.de [population]            |
 +---------------------------------------------+-----------------------------------------------------------+
+| --files                                     | Files to write [case]                                     |
++---------------------------------------------+-----------------------------------------------------------+
 
 
 Hint:
diff --git a/pycode/memilio-epidata/memilio/epidata/getCaseData.py b/pycode/memilio-epidata/memilio/epidata/getCaseData.py
@@ -136,12 +136,12 @@ def get_case_data(read_data=dd.defaultDict['read_data'],
     no_raw = conf.no_raw
     run_checks = conf.checks
 
-    if files == 'All':
+    if (files == 'All') or (files == ['All']):
         files = ['infected', 'deaths', 'all_germany', 'infected_state',
                  'all_state', 'infected_county', 'all_county', 'all_gender',
                  'all_state_gender', 'all_county_gender', 'all_age',
                  'all_state_age', 'all_county_age']
-    if files == 'Plot':
+    if (files == 'Plot') or (files == ['Plot']):
         # only consider plotable files
         files = ['infected', 'deaths', 'all_gender', 'all_age']
     # handle error of passing a string of one file instead of a list
@@ -287,14 +287,12 @@ def get_case_data(read_data=dd.defaultDict['read_data'],
         'infected_state': [[dateToUse, IdBundesland], {AnzahlFall: "sum"}, [IdBundesland],
                            {dd.EngEng["idState"]: geoger.get_state_ids()}, ['Confirmed']],
         'all_state': [[dateToUse, IdBundesland], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"},
-                      [IdBundesland], {dd.EngEng["idState"]
-                          : geoger.get_state_ids()},
+                      [IdBundesland], {dd.EngEng["idState"]: geoger.get_state_ids()},
                       ['Confirmed', 'Deaths', 'Recovered']],
         'infected_county': [[dateToUse, IdLandkreis], {AnzahlFall: "sum"}, [IdLandkreis],
                             {dd.EngEng["idCounty"]: df[dd.EngEng["idCounty"]].unique()}, ['Confirmed']],
         'all_county': [[dateToUse, IdLandkreis], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"},
-                       [IdLandkreis], {dd.EngEng["idCounty"]
-                           : df[dd.EngEng["idCounty"]].unique()},
+                       [IdLandkreis], {dd.EngEng["idCounty"]: df[dd.EngEng["idCounty"]].unique()},
                        ['Confirmed', 'Deaths', 'Recovered']],
         'all_gender': [[dateToUse, Geschlecht], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"},
                        [Geschlecht], {dd.EngEng["gender"]: list(
@@ -313,8 +311,7 @@ def get_case_data(read_data=dd.defaultDict['read_data'],
                               ), dd.EngEng["gender"]: list(df[dd.EngEng["gender"]].unique())},
                               ['Confirmed', 'Deaths', 'Recovered']],
         'all_age': [[dateToUse, Altersgruppe], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"},
-                    [Altersgruppe], {dd.EngEng["ageRKI"]
-                        : df[dd.EngEng["ageRKI"]].unique()},
+                    [Altersgruppe], {dd.EngEng["ageRKI"]: df[dd.EngEng["ageRKI"]].unique()},
                     ['Confirmed', 'Deaths', 'Recovered']],
         'all_state_age': [[dateToUse, IdBundesland, Altersgruppe],
                           {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"}, [
diff --git a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py
@@ -357,7 +357,7 @@ def cli(what):
     #                "start_date": ['divi']                 }
 
     cli_dict = {"divi": ['Downloads data from DIVI', 'start_date', 'end_date', 'impute_dates', 'moving_average'],
-                "cases": ['Download case data from RKI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date'],
+                "cases": ['Download case data from RKI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date', 'files'],
                 "cases_est": ['Download case data from RKI and JHU and estimate recovered and deaths', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date'],
                 "population": ['Download population data from official sources', 'username'],
                 "commuter_official": ['Download commuter data from official sources'],
@@ -440,6 +440,10 @@ def cli(what):
             '-sd', '--sanitize-data', type=int, default=dd.defaultDict['sanitize_data'], dest='sanitize_data',
             help='Redistributes cases of every county either based on regions ratios or on thresholds and population'
         )
+    if 'files' in what_list:
+        parser.add_argument(
+            '--files', nargs="*", default='All'
+        )
 
     # add optional download options
     if '--no-progress-indicators' in sys.argv:
diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py
@@ -465,7 +465,7 @@ def test_call_functions(
         arg_dict_cases = {
             **arg_dict_all, **arg_dict_data_download,
             "rep_date": dd.defaultDict['rep_date'],
-            "split_berlin": dd.defaultDict['split_berlin']}
+            "split_berlin": dd.defaultDict['split_berlin'], 'files': 'All'}
 
         arg_dict_divi = {
             **arg_dict_all, **arg_dict_data_download}
@@ -477,6 +477,7 @@ def test_call_functions(
             "sanitize_data": dd.defaultDict['sanitize_data']}
 
         arg_dict_cases_est = {**arg_dict_cases}
+        arg_dict_cases_est.pop('files')
 
         arg_dict_jh = {**arg_dict_all, **arg_dict_data_download}
         # change start-date of jh to 2020-01-22