Skip to content

Commit 88ad122

Browse files
Refactored code to clean it up
1 parent a60033c commit 88ad122

File tree

3 files changed

+63
-46
lines changed

3 files changed

+63
-46
lines changed

etl/extract.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,48 @@ def __repr__(self):
9595
def __str__(self):
9696
return f'The API key used in this DataLibraryData is {self.api_key}'
9797

98+
99+
def get_survey_data(client):
    """Fetch every survey (with its resources) from the Data Library.

    The survey-list endpoint is queried first only to learn how many
    surveys exist; that count is then used as the page limit so a single
    call retrieves all of them.

    Returns:
        pandas.DataFrame: one row per survey, with nested JSON fields
        flattened into dotted column names by ``pd.json_normalize``.
    """
    survey_count = len(client.get_survey_list())
    raw_surveys = client.get_surveys_with_resources(limit=survey_count)
    return pd.json_normalize(raw_surveys)
107+
108+
109+
def get_user_data(client):
    """Fetch the Data Library user accounts.

    Returns:
        pandas.DataFrame: one row per user, columns taken from the
        user records returned by the API.
    """
    return pd.DataFrame(client.get_users())
114+
115+
116+
def get_data(client):
    """Extract both survey and user data from the Data Library.

    Returns:
        tuple: ``(survey_df, user_df)`` — the DataFrames produced by
        :func:`get_survey_data` and :func:`get_user_data` respectively.
    """
    return get_survey_data(client), get_user_data(client)
121+
122+
123+
98124

125+
# def get_data(client: Client) -> Tuple[DataFrame, DataFrame]:
126+
# # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE
127+
# survey_list = client.get_survey_list()
128+
# # # get total number survey present on Data Library
129+
# total_surveys = len(survey_list)
130+
131+
# # get information on user
132+
# users = client.get_users()
133+
# # get total number of users with an account on Data Library
134+
# total_users = len(users)
135+
# users = pd.DataFrame(users)
136+
137+
# # get all information about surveys
138+
# all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys)
139+
140+
# all_surveys_with_resources = pd.json_normalize(all_surveys_with_resources)
141+
142+
# return (all_surveys_with_resources, users)

etl/transform.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,6 @@
11
import pandas as pd
22
import json
33

4-
def normalize_json(data: dict) -> dict:
    """Flatten a JSON-like dict one level deep.

    Values that are themselves dicts are merged into the top level with the
    parent key prefixed (``parent_child``); every other value is kept as-is.

    Args:
        data: mapping whose values may be scalars or (one-level) dicts.

    Returns:
        dict: a new, flat dict; the input is not modified.
    """
    flat = {}
    for key, value in data.items():
        if isinstance(value, dict):
            # Hoist nested entries, joining the keys with an underscore.
            for inner_key, inner_value in value.items():
                flat[f"{key}_{inner_key}"] = inner_value
        else:
            flat[key] = value
    return flat
154

165
def flatten_response(df, col: str, df_id: str):
176
flat_list = []
@@ -24,7 +13,7 @@ def flatten_response(df, col: str, df_id: str):
2413
df1 = pd.DataFrame(flat_list)
2514
return df1
2615

27-
def process_data(data: tuple) -> tuple:
16+
def transform(data: tuple) -> tuple:
2817
surveys, users = data
2918
surveys = surveys[['assessment_status', 'collection_method',
3019
'creator_user_id', 'data_collector', 'description', 'end_date', 'id',

main.py

Lines changed: 18 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,35 @@
11
from dotenv import load_dotenv
22
import pandas as pd
33
import os
4-
from etl.extract import DataLibrary
5-
from etl.transform import process_data
4+
from etl.extract import DataLibrary, get_data
5+
from etl.transform import transform
66
from etl.load import load_to_db, save_to_excel
77

8-
98
load_dotenv() # take environment variables from .env.
109

11-
def main():
    """
    Executes the ETL (Extract, Transform, Load) process to fetch data from
    VAM Data Library, process the data, and load it into a database and save
    it to an Excel file.

    There are three data points being loaded:
    - Survey Information
    - User Information
    - Survey Resources

    This function is the main entry point for the ETL pipeline. It performs
    the following steps:
    1. Fetches data from Data Library using the `get_data` function and the
       `DataLibrary` class, which is configured using an environment variable.
    2. Processes the fetched data using the `transform` function.
    3. Saves the processed data to an Excel file using the `save_to_excel`
       function.
    4. Loads the processed data into a database using the `load_to_db`
       function.
    """
    dl_api_data = get_data(DataLibrary(os.getenv("DATALIB_API_KEY")))
    # Load processed data to DB
    processed_data = transform(dl_api_data)
    save_to_excel(processed_data)
    load_to_db(processed_data)
    print("Done")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)