Skip to content

Commit 64a8581

Browse files
authored
921 Improve PopulationData Download (#922)
1 parent 66693d4 commit 64a8581

2 files changed

Lines changed: 14 additions & 56 deletions

File tree

pycode/memilio-epidata/memilio/epidata/getPopulationData.py

Lines changed: 13 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,6 @@
2828
import getpass
2929
import requests
3030
import os
31-
import twill
32-
import time
3331
import io
3432

3533
import numpy as np
@@ -43,56 +41,20 @@
4341
pd.options.mode.copy_on_write = True
4442

4543

46-
def read_population_data(username, password, read_data, directory):
47-
'''! Reads Population data either from regionalstatistik.de or from directory
44+
def read_population_data(username, password):
45+
'''! Reads Population data from regionalstatistik.de
4846
49-
A request is made using the twill package. Username and Password are required to
50-
sign in on regionalstatistik.de. After the sign twill navigates to the file to download.
47+
Username and Password are required to sign in on regionalstatistik.de.
48+
A request is made to regionalstatistik.de and the StringIO is read in as a csv into the dataframe format.
5149
5250
@param username Username to sign in at regionalstatistik.de.
5351
@param password Password to sign in at regionalstatistik.de.
54-
@param read_data False or True. Defines if data is read from file or downloaded.
55-
@param directory Path to folder where data is read from.
5652
@return DataFrame
5753
'''
5854

59-
filename = '12411-02-03-4'
60-
if not read_data:
61-
sign_in_url = 'https://www.regionalstatistik.de/genesis/online?Menu=Anmeldung'
62-
63-
# sign in to regionalstatistik.de with given username and password
64-
twill.browser.user_agent = requests.utils.default_headers()[
65-
'User-Agent']
66-
twill.commands.go(sign_in_url)
67-
twill.commands.fv('3', 'KENNUNG', username)
68-
twill.commands.fv('3', 'PASSWORT', password)
69-
twill.commands.submit('login', '3')
70-
# navigate to file as in documentation
71-
twill.commands.follow('Themen')
72-
twill.commands.follow(filename[:2])
73-
# wait 2 seconds to prevent error
74-
# page needs some time to load
75-
time.sleep(2)
76-
twill.commands.follow(filename.split('-')[0])
77-
twill.commands.follow(filename)
78-
# start 'Werteabruf'
79-
twill.commands.submit('45', '3')
80-
# read csv file (1,4 for xlsx)
81-
twill.commands.submit('1', '5')
82-
83-
df_pop_raw = pd.read_csv(io.StringIO(
84-
twill.browser.html), sep=';', header=6)
85-
86-
else:
87-
data_file = os.path.join(directory, filename)
88-
if os.path.isfile(data_file+'.xlsx'):
89-
df_pop_raw = pd.read_excel(
90-
data_file+'.xlsx', engine='openpyxl', sheet_name=filename, header=4)
91-
elif os.path.isfile(data_file+'.csv'):
92-
df_pop_raw = pd.read_excel(data_file+'.csv', sep=';', header=6)
93-
else:
94-
raise FileNotFoundError(
95-
'Data file '+filename+' was not found in out_folder/Germany')
55+
download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
56+
req = requests.get(download_url, auth=(username, password))
57+
df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
9658

9759
return df_pop_raw
9860

@@ -365,14 +327,19 @@ def get_population_data(read_data=dd.defaultDict['read_data'],
365327
conf = gd.Conf(out_folder, **kwargs)
366328
out_folder = conf.path_to_use
367329

330+
if read_data == True:
331+
gd.default_print(
332+
'Warning', 'Read_data is not supportet for getPopulationData.py. Setting read_data = False')
333+
read_data = False
334+
368335
# If no username or password is provided, the credentials are either read from an .ini file or,
369336
# if the file does not exist they have to be given as user input.
370337
if (username is None) or (password is None):
371338
username, password = manage_credentials(conf.interactive)
372339
directory = os.path.join(out_folder, 'Germany')
373340
gd.check_dir(directory)
374341

375-
df_pop_raw = read_population_data(username, password, read_data, directory)
342+
df_pop_raw = read_population_data(username, password)
376343

377344
column_names = list(df_pop_raw.columns)
378345
# rename columns

pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -67,15 +67,6 @@ def test_export_population_data(self):
6767
'18-24 years', '25-29 years', '30-39 years', '40-49 years',
6868
'50-64 years', '65-74 years', '>74 years'])
6969

70-
def test_read_population_data(self):
71-
72-
directory = os.path.join(self.path, 'Germany/')
73-
74-
# test file not found
75-
with self.assertRaises(FileNotFoundError) as error:
76-
df = gpd.read_population_data(
77-
username='', password='', read_data=True, directory=directory)
78-
7970
@patch('memilio.epidata.getPopulationData.read_population_data',
8071
return_value=df_pop_raw)
8172
@patch('memilio.epidata.getPopulationData.assign_population_data', return_value=df_pop)
@@ -125,7 +116,7 @@ def test_config_read(self, mock_test, mock_export, mock_read, mock_path):
125116
username=None, password=None, read_data=False, out_folder=self.path, interactive=False)
126117
# The file exist in the directory (mocked) and the credentials should be read.
127118
mock_read.assert_called_with(
128-
self.test_username, self.test_password, False, os.path.join(self.path, 'Germany'))
119+
self.test_username, self.test_password)
129120

130121

131122
if __name__ == '__main__':

0 commit comments

Comments
 (0)