|
28 | 28 | import getpass |
29 | 29 | import requests |
30 | 30 | import os |
31 | | -import twill |
32 | | -import time |
33 | 31 | import io |
34 | 32 |
|
35 | 33 | import numpy as np |
|
43 | 41 | pd.options.mode.copy_on_write = True |
44 | 42 |
|
45 | 43 |
|
46 | | -def read_population_data(username, password, read_data, directory): |
47 | | - '''! Reads Population data either from regionalstatistik.de or from directory |
| 44 | +def read_population_data(username, password): |
| 45 | + '''! Reads Population data from regionalstatistik.de |
48 | 46 |
|
49 | | - A request is made using the twill package. Username and Password are required to |
50 | | - sign in on regionalstatistik.de. After the sign twill navigates to the file to download. |
| 47 | + Username and Password are required to sign in on regionalstatistik.de. |
| 48 | + The CSV export is requested directly from regionalstatistik.de and the response text is read into a pandas DataFrame. |
51 | 49 |
|
52 | 50 | @param username Username to sign in at regionalstatistik.de. |
53 | 51 | @param password Password to sign in at regionalstatistik.de. |
54 | | - @param read_data False or True. Defines if data is read from file or downloaded. |
55 | | - @param directory Path to folder where data is read from. |
56 | 52 | @return DataFrame |
57 | 53 | ''' |
58 | 54 |
|
59 | | - filename = '12411-02-03-4' |
60 | | - if not read_data: |
61 | | - sign_in_url = 'https://www.regionalstatistik.de/genesis/online?Menu=Anmeldung' |
62 | | - |
63 | | - # sign in to regionalstatistik.de with given username and password |
64 | | - twill.browser.user_agent = requests.utils.default_headers()[ |
65 | | - 'User-Agent'] |
66 | | - twill.commands.go(sign_in_url) |
67 | | - twill.commands.fv('3', 'KENNUNG', username) |
68 | | - twill.commands.fv('3', 'PASSWORT', password) |
69 | | - twill.commands.submit('login', '3') |
70 | | - # navigate to file as in documentation |
71 | | - twill.commands.follow('Themen') |
72 | | - twill.commands.follow(filename[:2]) |
73 | | - # wait 2 seconds to prevent error |
74 | | - # page needs some time to load |
75 | | - time.sleep(2) |
76 | | - twill.commands.follow(filename.split('-')[0]) |
77 | | - twill.commands.follow(filename) |
78 | | - # start 'Werteabruf' |
79 | | - twill.commands.submit('45', '3') |
80 | | - # read csv file (1,4 for xlsx) |
81 | | - twill.commands.submit('1', '5') |
82 | | - |
83 | | - df_pop_raw = pd.read_csv(io.StringIO( |
84 | | - twill.browser.html), sep=';', header=6) |
85 | | - |
86 | | - else: |
87 | | - data_file = os.path.join(directory, filename) |
88 | | - if os.path.isfile(data_file+'.xlsx'): |
89 | | - df_pop_raw = pd.read_excel( |
90 | | - data_file+'.xlsx', engine='openpyxl', sheet_name=filename, header=4) |
91 | | - elif os.path.isfile(data_file+'.csv'): |
92 | | - df_pop_raw = pd.read_excel(data_file+'.csv', sep=';', header=6) |
93 | | - else: |
94 | | - raise FileNotFoundError( |
95 | | - 'Data file '+filename+' was not found in out_folder/Germany') |
| 55 | + download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv' |
| 56 | + req = requests.get(download_url, auth=(username, password)) |
| 57 | + df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6) |
96 | 58 |
|
97 | 59 | return df_pop_raw |
98 | 60 |
|
@@ -365,14 +327,19 @@ def get_population_data(read_data=dd.defaultDict['read_data'], |
365 | 327 | conf = gd.Conf(out_folder, **kwargs) |
366 | 328 | out_folder = conf.path_to_use |
367 | 329 |
|
| 330 | + if read_data == True: |
| 331 | + gd.default_print( |
| 332 | + 'Warning', 'Read_data is not supportet for getPopulationData.py. Setting read_data = False') |
| 333 | + read_data = False |
| 334 | + |
368 | 335 | # If no username or password is provided, the credentials are either read from an .ini file or, |
369 | 336 | # if the file does not exist, they have to be given as user input. |
370 | 337 | if (username is None) or (password is None): |
371 | 338 | username, password = manage_credentials(conf.interactive) |
372 | 339 | directory = os.path.join(out_folder, 'Germany') |
373 | 340 | gd.check_dir(directory) |
374 | 341 |
|
375 | | - df_pop_raw = read_population_data(username, password, read_data, directory) |
| 342 | + df_pop_raw = read_population_data(username, password) |
376 | 343 |
|
377 | 344 | column_names = list(df_pop_raw.columns) |
378 | 345 | # rename columns |
|
0 commit comments