Skip to content

Commit 597e9c5

Browse files
Updated README and added transformations to tables
1 parent 4b0d943 commit 597e9c5

File tree

5 files changed

+105
-60
lines changed

5 files changed

+105
-60
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,5 +164,6 @@ config.py
164164
*.xlsx
165165
.env
166166
playground.py
167+
playground.R
167168
profile.html
168169
*.bak

README.md

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,32 @@
1-
# Data Library API tool
1+
# Data Library API Tool
2+
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
23

3-
This is a simple Python tool to get data from the [RAM Data Library API](https://datalib.vam.wfp.org) in CSV format.
4+
## Features
5+
Queries the Data Library API to get:
6+
- List of users
7+
- List of survey codes
8+
- Complete information on all surveys (name, code, country, etc.)
9+
- Exports the API data into CSV files
10+
- Provides helper functions to get info on using the API
11+
## Usage
12+
- Clone this repo
13+
- Get an API key from your Data Library account
14+
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15+
- Run python main.py to query the API and export CSV files
16+
- The output CSV files will be saved in the output folder.
417

5-
Currently this tool queries three API endpoints:
6-
- **```user_list```**: Get list of users registered in Data Library
7-
- **```package_list```**: Get survey codes (YYMM_ISO3_SURVEYTYPE) for all surveys available in the platform
8-
- **```current_package_list_with_resources```**: Complete information about surveys in Data Library, including name of survey, survey code, country and uploader
18+
## Requirements
19+
Python 3.x
20+
Requests library
21+
Pandas library
22+
Python Dotenv library
923

10-
For more information on the RAM Data Library API, consult the [documentation](https://docs.ckan.org/en/2.9/api/)
24+
## Documentation
25+
For more details on the Data Library API endpoints, see the [CKAN API documentation](https://docs.ckan.org/en/2.9/api/).
1126

12-
## How to use it
27+
## Contributing
28+
Contributions to add more API querying/exporting functionality are welcome!
1329

14-
1. Make sure you have Python installed on your machine.
15-
2. Get an API key from your [Data Library](https://datalib.vam.wfp.org) account
16-
3. Add your API key to the api_key.py file. Do not forget to add this file to the .gitignore!
17-
4. Run main.py
30+
## License
31+
This project is licensed under the MIT License - see the LICENSE file for details.
1832

19-
>> **Quick tip**
20-
>>
21-
>> If you're stuck, use the help() function in the DataLibraryData class for information about

etl/extract.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
ENDPOINTS = {
1111
'users': 'action/user_list',
1212
'all_surveys_information': 'action/current_package_list_with_resources',
13-
'all_surveys_code': 'action/package_list'
13+
'all_surveys_code': 'action/package_list',
14+
'member_list': 'action/member_list',
1415
}
1516

1617
class DataLibrary:
@@ -88,6 +89,16 @@ def get_surveys_with_resources(self, limit=None):
8889
response = self.get_response(url, limit=limit)
8990
data = response["result"]
9091
return data
92+
93+
def get_member_list(self, limit=None):
94+
"""Get list of members"""
95+
url = BASE_URL + ENDPOINTS['member_list']
96+
response = self.get_response(url, limit=limit)
97+
try:
98+
data = response["result"]
99+
return data
100+
except TypeError:
101+
print(response)
91102

92103
def __repr__(self):
93104
return f'DataLibraryData({self.api_key})'
@@ -112,6 +123,11 @@ def get_user_data(client):
112123

113124
return users_df
114125

126+
# def get_member_list(client):
127+
# member_list = client.get_member_list()
128+
# member_list_df = pd.DataFrame(member_list)
129+
# return member_list_df
130+
115131

116132
def get_data(client):
117133
survey_df = get_survey_data(client)
@@ -120,23 +136,12 @@ def get_data(client):
120136
return survey_df, user_df
121137

122138

139+
if __name__ == "__main__":
140+
import os
141+
from dotenv import load_dotenv
123142

124-
125-
# def get_data(client: Client) -> Tuple[DataFrame, DataFrame]:
126-
# # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE
127-
# survey_list = client.get_survey_list()
128-
# # # get total number survey present on Data Library
129-
# total_surveys = len(survey_list)
130-
131-
# # get information on user
132-
# users = client.get_users()
133-
# # get total number of users with an account on Data Library
134-
# total_users = len(users)
135-
# users = pd.DataFrame(users)
136-
137-
# # get all information about surveys
138-
# all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys)
139-
140-
# all_surveys_with_resources = pd.json_normalize(all_surveys_with_resources)
143+
load_dotenv()
141144

142-
# return (all_surveys_with_resources, users)
145+
client = DataLibrary(os.getenv("DATALIB_API_KEY"))
146+
members = client.get_member_list()
147+
print(members)

etl/load.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ class ExcelExportError(Exception):
2323
def load_data(data, table_name = 'table'):
2424
try:
2525
data.to_sql(name=table_name, con=engine, if_exists='replace')
26-
27-
print("Done")
2826
except Exception as e:
2927
logger.error(f"Error {e} when populating {table_name}")
3028

@@ -52,8 +50,9 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5250
continue
5351

5452
if __name__ == "__main__":
53+
pass
5554

56-
# test_read_sql()
57-
sample_data = {'col1': [1, 2], 'col2': [3, 4]}
58-
df = pd.DataFrame(data=sample_data)
59-
load_data(df, 'test_table', index=True)
55+
# # test_read_sql()
56+
# sample_data = {'col1': [1, 2], 'col2': [3, 4]}
57+
# data = pd.DataFrame(data=sample_data)
58+
# load_data(data, 'test_table')

etl/transform.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,25 @@
22
import json
33

44

5+
SURVEY_COLUMNS_SELECTION = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']
6+
7+
SURVEY_COLUMNS_RENAMING = {"id": "survey_id", "organization_id": "container_id",
8+
"organization_title": "container_name",
9+
"organization_description": "container_description",
10+
"organization_created": "container_created",
11+
"data_collector": "organization", "owner_org": "parent_container_id",
12+
"title": "survey_title"}
13+
14+
RESOURCES_COLUMNS_TO_DROP = ["resources", "restricted", "restricted-allowed_users", "restricted-level", "cache_last_updated", "cache_url", "revision_id", "url_type", "state", "resource_type", "mimetype_inner", "hash", "package_id"]
15+
516
def clean_column_names(df):
617
"""Rename columns to be more readable"""
718
df.columns = df.columns.str.replace('.', '_')
819
return df
920

1021
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
1122
flat_list = []
12-
for index, row in df.iterrows():
23+
for _, row in df.iterrows():
1324
for r in row[col]:
1425
flat_dict = {_id : row[_id]}
1526
flat_dict.update(r)
@@ -33,41 +44,59 @@ def normalize_restrictions(df):
3344
restricted = pd.json_normalize(df["restricted"])
3445
return restricted
3546

47+
def user_data_transform(df):
48+
# Remove unnecessary columns from users table
49+
users_columns_to_drop = ["apikey", "display_name", "about", "state", "image_url", "image_display_url"]
50+
df = df.drop(columns=users_columns_to_drop)
51+
return df
52+
53+
def survey_data_transform(df):
54+
df = clean_column_names(df)
55+
df = df[SURVEY_COLUMNS_SELECTION]
56+
df = df.rename(columns=SURVEY_COLUMNS_RENAMING)
57+
return df
58+
59+
3660

3761
def transform(data: tuple) -> tuple:
62+
"""
63+
Transforms the input data tuple into a new tuple.
64+
65+
Args:
66+
data (tuple): A tuple containing the data to be transformed.
67+
68+
Returns:
69+
tuple: The transformed data.
70+
"""
3871
surveys, users = data
3972

40-
# Clean column names
41-
surveys = clean_column_names(surveys)
42-
43-
# Select relevant columns
44-
survey_cols = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_type', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']
45-
46-
surveys = surveys[survey_cols]
47-
48-
# Rename survey columns
49-
survey_cols_renaming = {"id": "survey_id", "organization_id": "container_id", "organization_type": "container_type", "organization_title": "container_title", "organization_description": "container_description", "organization_created": "container_created", "data_collector": "organization", "owner_org": "parent_container_id"}
50-
surveys = surveys.rename(columns= survey_cols_renaming)
73+
surveys = survey_data_transform(surveys)
5174

5275
# Flatten resources
5376
resources = surveys[["resources", "container_id", "survey_id"]]
5477
flat_resources = flatten_response(resources, "resources", "survey_id")
5578

56-
# Join back to surveys
57-
surveys.drop(columns= ['resources'], inplace=True)
79+
# Remove unnecessary columns from survey table
80+
surveys.drop(columns=["resources"], inplace=True)
5881
full_resources = pd.merge(resources, flat_resources, on="survey_id")
5982

6083
# Normalize restricted datasets
6184
restricted = normalize_restrictions(full_resources)
6285
full_resources = full_resources.join(restricted)
6386

6487

65-
# Drop unnecessary columns
66-
drop_cols = ["resources", "restricted", "restricted-allowed_users",
67-
"restricted-level", "cache_last_updated", "cache_url",
68-
"revision_id"]
69-
70-
full_resources.drop(columns=drop_cols, inplace=True)
88+
# Remove unnecessary columns from resource table
89+
full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP, inplace=True)
90+
91+
# Rename resource columns
92+
resources_cols_renaming = {
93+
"level": "access_level",
94+
}
95+
full_resources = full_resources.rename(columns=resources_cols_renaming)
96+
97+
98+
# User table transformations
99+
users = user_data_transform(users)
71100

72101
return (surveys, full_resources, users)
73102

0 commit comments

Comments
 (0)