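Refactored code to clean up: moved data extraction into etl/extract.py, renamed process_data to transform, removed unused normalize_json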

AlexGherardelli · AlexGherardelli · commit 88ad1223670a · 2024-04-16T11:26:52.000+02:00
diff --git a/etl/extract.py b/etl/extract.py
@@ -95,4 +95,48 @@ def __repr__(self):
     def __str__(self):
         return f'The API key used in this DataLibraryData is {self.api_key}'
 
+
+def get_survey_data(client):
+  survey_list = client.get_survey_list()
+  total_surveys = len(survey_list)
+
+  all_surveys = client.get_surveys_with_resources(limit=total_surveys) 
+  all_surveys_df = pd.json_normalize(all_surveys)
+
+  return all_surveys_df
+
+
+def get_user_data(client):
+  users = client.get_users()
+  users_df = pd.DataFrame(users)
+
+  return users_df
+
+
+def get_data(client):
+  survey_df = get_survey_data(client)
+  user_df = get_user_data(client)
+
+  return survey_df, user_df
+
+
+
         
+# def get_data(client: Client) -> Tuple[DataFrame, DataFrame]:
+#     # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE
+#     survey_list = client.get_survey_list()
+#     # get total number of surveys present on Data Library
+#     total_surveys = len(survey_list)
+    
+#     # get information on user
+#     users = client.get_users()
+#     # get total number of users with an account on Data Library
+#     total_users = len(users)
+#     users = pd.DataFrame(users)
+
+#     # get all information about surveys
+#     all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys)
+
+#     all_surveys_with_resources =  pd.json_normalize(all_surveys_with_resources)
+
+#     return  (all_surveys_with_resources, users)
diff --git a/etl/transform.py b/etl/transform.py
@@ -1,17 +1,6 @@
 import pandas as pd
 import json
 
-def normalize_json(data: dict) -> dict:
-    """Flatten json"""
-    new_data = dict()
-    for key, value in data.items():
-        if not isinstance(value, dict):
-            new_data[key] = value
-        else:
-            for k, v in value.items():
-                new_data[key + "_" + k] = v
-  
-    return new_data
 
 def flatten_response(df, col: str, df_id: str):
     flat_list = []
@@ -24,7 +13,7 @@ def flatten_response(df, col: str, df_id: str):
     df1 = pd.DataFrame(flat_list)
     return df1
 
-def process_data(data: tuple) -> tuple:
+def transform(data: tuple) -> tuple:
     surveys, users = data
     surveys = surveys[['assessment_status', 'collection_method',
     'creator_user_id', 'data_collector', 'description', 'end_date', 'id',
diff --git a/main.py b/main.py
@@ -1,51 +1,35 @@
 from dotenv import load_dotenv
 import pandas as pd
 import os
-from etl.extract import DataLibrary
-from etl.transform import process_data
+from etl.extract import DataLibrary, get_data
+from etl.transform import transform
 from etl.load import load_to_db, save_to_excel
 
-
 load_dotenv()  # take environment variables from .env.
 
-def extract_data(client):
-    """
-    Get data from DL api and returns two dataframes.
-
-    This function initializes a DataLibrary API instance, retrieves the survey list and user information from the API, and returns two dataframes - one containing all surveys with resources, and one containing user information.
-
-    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the dataframes for surveys with resources and users.
+def main():
     """
+    Executes the ETL (Extract, Transform, Load) process: fetches data from the VAM Data Library, processes it, saves it to an Excel file, and loads it into a database.
 
-    # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE
-    survey_list = client.get_survey_list()
-    # # get total number survey present on Data Library
-    total_surveys = len(survey_list)
+    There are three data points being loaded:
+    - Survey Information
+    - User Information
+    - Survey Resources
     
-    # get information on user
-    users = client.get_users()
-    # get total number of users with an account on Data Library
-    total_users = len(users)
-    users = pd.DataFrame(users)
-
-    print(f"\n---\n There are {total_surveys + 1} surveys and {total_users + 1} active users in Data Library\n---\n ")
-
-    # get all information about surveys
-    all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys)
-
-    all_surveys_with_resources =  pd.json_normalize(all_surveys_with_resources)
-
-    return  (all_surveys_with_resources, users)
-
-def run_etl_process():
-    raw_data = extract_data(DataLibrary(os.getenv("DATALIB_API_KEY")))
+    This function is the main entry point for the ETL pipeline. It performs the following steps:
+    1. Fetches data from Data Library using the `get_data` function and the `DataLibrary` class, which is configured using an environment variable.
+    2. Processes the fetched data using the `transform` function.
+    3. Saves the processed data to an Excel file using the `save_to_excel` function.
+    4. Loads the processed data into a database using the `load_to_db` function.
+    """
+    dl_api_data = get_data(DataLibrary(os.getenv("DATALIB_API_KEY")))
     # Load processed data to DB
-    processed_data = process_data(raw_data)
+    processed_data = transform(dl_api_data)
     save_to_excel(processed_data)
     load_to_db(processed_data)
     print("Done")
 
 if __name__== "__main__":
-    run_etl_process()
+    main()
+