Skip to content

Commit 9a718dd

Browse files
Merge branch 'pipeline'
2 parents 53a6efa + 1c56804 commit 9a718dd

File tree

13 files changed

+375
-157
lines changed

13 files changed

+375
-157
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,6 @@ config.py
164164
*.xlsx
165165
.env
166166
playground.py
167+
playground.R
168+
profile.html
169+
*.bak

README.md

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,30 @@
1-
# Data Library API tool
1+
# Data Library API Tool
2+
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
23

3-
This is a simple Python tool to get data from the [RAM Data Library API](https://datalib.vam.wfp.org) in CSV format.
4+
## Features
5+
Queries the Data Library API to get:
6+
- List of users
7+
- List of survey codes
8+
- Complete information on all surveys (name, code, country, etc.)
9+
- Exports the API data into CSV files
10+
- Provides helper functions to get info on using the API
11+
## Usage
12+
- Clone this repo
13+
- Get an API key from your Data Library account
14+
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15+
- Run python main.py to query the API and export CSV files
16+
- The output CSV files will be saved in the output folder.
417

5-
Currently this tool queries three API endpoints:
6-
- **```user_list```**: Get list of users registered in Data Library
7-
- **```package_list```**: Get survey codes (YYMM_ISO3_SURVEYTYPE) for all surveys available in the platform
8-
- **```current_package_list_with_resources```**: Complete information about surveys in Data Library, including name of survey, survey code, country and uploader
18+
## Requirements
19+
- Python 3.x
920

10-
For more information on the RAM Data Library API, consult the [documentation](https://docs.ckan.org/en/2.9/api/)
1121

12-
## How to use it
22+
## Documentation
23+
For more details on the Data Library API endpoints, see the API documentation.
1324

14-
1. Make sure you have Python installed on your machine.
15-
2. Get an API key from your [Data Library](https://datalib.vam.wfp.org) account
16-
3. Add your API key to the api_key.py file. Do not forget to add this file to the .gitignore!
17-
4. Run main.py
25+
## Contributing
26+
Contributions to add more API querying/exporting functionality are welcome!
27+
28+
## License
29+
This project is licensed under the MIT License - see the LICENSE file for details.
1830

19-
>> **Quick tip**
20-
>>
21-
>> If you're stuck, use the help() function in the DataLibraryData class for information about usage
22-
>>
File renamed without changes.

api/client.py renamed to datalibrary/extract.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
import requests
22
import json
3+
import logging
4+
import pandas as pd
35

46
logger = logging.getLogger(__name__)
57

68
# Root URL of the WFP Data Library CKAN API (v3 action API).
BASE_URL = "https://datalib.vam.wfp.org/api/3/"
# Logical endpoint name -> CKAN action path, appended to BASE_URL per request.
ENDPOINTS = {
    'users': 'action/user_list',
    'all_surveys_information': 'action/current_package_list_with_resources',
    'all_surveys_code': 'action/package_list',
    'member_list': 'action/member_list',
}
1215

1316
class DataLibrary:
@@ -85,11 +88,47 @@ def get_surveys_with_resources(self, limit=None):
8588
response = self.get_response(url, limit=limit)
8689
data = response["result"]
8790
return data
88-
91+
8992
def __repr__(self):
    """Debug representation naming the class and its API key.

    Fix: the original hard-coded the stale name 'DataLibraryData';
    derive the name from the live class so it survives renames.
    """
    return f'{type(self).__name__}({self.api_key!r})'
9194

9295
def __str__(self):
    """Human-readable description of the key this client is configured with."""
    key = self.api_key
    return f'The API key used in this DataLibraryData is {key}'
9497

95-
98+
99+
def get_survey_data(client):
    """Fetch every survey (with its resources) and flatten into a DataFrame.

    Args:
        client: a DataLibrary API client exposing get_survey_list() and
            get_surveys_with_resources(limit=...).

    Returns:
        pd.DataFrame: one row per survey, nested fields dot-expanded.
    """
    # Ask for exactly as many packages as the survey-code list reports.
    n_surveys = len(client.get_survey_list())
    raw = client.get_surveys_with_resources(limit=n_surveys)
    return pd.json_normalize(raw)
107+
108+
109+
def get_user_data(client):
    """Fetch the registered-user list and return it as a DataFrame."""
    return pd.DataFrame(client.get_users())
114+
115+
116+
def get_data(client):
    """Run the full extract step.

    Returns:
        tuple: (survey_df, user_df) DataFrames pulled from the API.
    """
    return get_survey_data(client), get_user_data(client)
121+
122+
123+
124+
if __name__ == "__main__":
    # Manual smoke test: requires DATALIB_API_KEY in the environment or in a
    # .env file (loaded below via python-dotenv).
    import os
    from dotenv import load_dotenv

    load_dotenv()

    client = DataLibrary(os.getenv("DATALIB_API_KEY"))
    survey_df, user_df = get_data(client)
    # Preview the extracted tables.
    print(survey_df.head())
    print(user_df.head())
Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,12 @@
1717
conn_str = f'mssql+pyodbc://{USERNAME}:{PASSWORD}@{SERVER}/{DATABASE}?driver=ODBC+Driver+17+for+SQL+Server'
1818
engine = create_engine(conn_str)
1919

20-
# def test_read_sql(table_name):
21-
# sql = """
22-
# SELECT * FROM [dbo].table_name
23-
# """
24-
# sql_df = pd.read_sql( sql, con=engine)
25-
# print(sql_df.head())
26-
27-
def load_data(data, table_name = 'table', index = False):
20+
class ExcelExportError(Exception):
    """Raised to signal a failed data export.

    NOTE(review): named for Excel export but caught in load_to_db around the
    SQL load path, and nothing in view raises it — confirm intended usage.
    """
    pass
22+
23+
def load_data(data, table_name = 'table'):
    """Write a DataFrame to SQL table `table_name`, replacing any existing table.

    Best-effort: failures are logged, not raised. Note if_exists='replace'
    drops the old table first, so a failed write leaves no table behind.

    Args:
        data: pandas DataFrame to persist.
        table_name: destination table name (default 'table').
    """
    try:
        data.to_sql(name=table_name, con=engine, if_exists='replace')
    except Exception as e:
        # Fix: logger.exception keeps the full traceback, which logger.error
        # discarded — essential for diagnosing driver/connection failures.
        logger.exception(f"Error {e} when populating {table_name}")
3328

@@ -36,7 +31,7 @@ def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Use
3631
for df, table_name in zip(data, table_names):
3732
logger.info("Loading data to database")
3833
load_data(df, table_name)
39-
except Exception as e:
34+
except ExcelExportError as e:
4035
logger.error(f"Error loading data: {e}")
4136

4237

@@ -55,8 +50,9 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5550
continue
5651

5752
if __name__ == "__main__":
    # No standalone behavior: this module is imported by the pipeline.
    # (Dead commented-out smoke-test code removed.)
    pass

datalibrary/transform.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import pandas as pd
2+
import json
3+
4+
5+
# Columns kept from the raw survey payload (after json_normalize /
# clean_column_names); everything else is discarded.
SURVEY_COLUMNS_SELECTION = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']

# Map CKAN "organization" terminology onto Data Library "container" naming,
# and disambiguate generic keys (id -> survey_id, title -> survey_title).
SURVEY_COLUMNS_RENAMING = {"id": "survey_id", "organization_id": "container_id",
                           "organization_title": "container_name",
                           "organization_description": "container_description",
                           "organization_created": "container_created",
                           "data_collector": "organization", "owner_org": "parent_container_id",
                           "title": "survey_title"}

# Resource-level columns dropped after flattening/normalizing, before load.
RESOURCES_COLUMNS_TO_DROP = ["resources", "restricted", "restricted-allowed_users", "restricted-level", "cache_last_updated", "cache_url", "revision_id", "url_type", "state", "resource_type", "mimetype_inner", "hash", "package_id"]
15+
16+
def clean_column_names(df):
17+
"""Rename columns to be more readable"""
18+
df.columns = df.columns.str.replace('.', '_')
19+
return df
20+
21+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Explode `col` — a column holding lists of dicts — into one row per dict.

    Each output row carries the parent's `_id` value plus the dict's own keys
    (a dict key equal to `_id` overrides the parent's value, as before).

    Args:
        df: frame whose `col` cells are lists of dicts.
        col: name of the list-of-dicts column.
        _id: name of the identifier column copied onto every exploded row.

    Returns:
        pd.DataFrame: one row per inner dict.
    """
    rows = []
    for _, record in df.iterrows():
        rows.extend({_id: record[_id], **item} for item in record[col])
    return pd.DataFrame(rows)
31+
32+
def flatten_resources(df):
    """Return one row per resource, keyed by survey_id.

    NOTE(review): selects 'organization_id', but transform() works with the
    post-rename 'container_id' — confirm which pipeline stage this helper is
    meant to run on.
    """
    subset = df[["resources", "organization_id", "survey_id"]]
    return flatten_response(subset, "resources", "survey_id")
36+
37+
def normalize_restrictions(df):
    """Decode the 'restricted' column and expand it into a flat DataFrame.

    Mutates df['restricted'] in place when the cells are JSON strings; returns
    a new frame with one column per key of the decoded dicts.
    """
    try:
        # TypeError => the cells are already parsed dicts; leave them alone.
        df['restricted'] = df['restricted'].apply(json.loads)
    except TypeError:
        pass
    return pd.json_normalize(df["restricted"])
46+
47+
def user_data_transform(df):
    """Drop sensitive / low-value columns from the users table.

    Args:
        df: users DataFrame from the API.

    Returns:
        pd.DataFrame: a copy without the dropped columns.
    """
    # Remove unnecessary columns from users table.
    users_columns_to_drop = ["apikey", "display_name", "about", "state", "image_url", "image_display_url"]
    # Fix: errors='ignore' — an upstream schema change (a column disappearing
    # from the API payload) previously raised KeyError and killed the pipeline.
    return df.drop(columns=users_columns_to_drop, errors="ignore")
52+
53+
def survey_data_transform(df):
    """Clean column names, keep only the survey columns of interest, and
    rename them to Data Library 'container' terminology."""
    cleaned = clean_column_names(df)
    selected = cleaned[SURVEY_COLUMNS_SELECTION]
    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
58+
59+
60+
61+
def transform(data: tuple) -> tuple:
    """
    Transform raw (surveys, users) frames into load-ready tables.

    Args:
        data (tuple): (surveys_df, users_df) as produced by the extract step.

    Returns:
        tuple: (surveys, full_resources, users) — resources are split out of
        the surveys frame into their own table, one row per uploaded file.
    """
    surveys, users = data

    surveys = survey_data_transform(surveys)

    # Flatten resources: one row per resource, tagged with its survey_id
    resources = surveys[["resources", "container_id", "survey_id"]]
    flat_resources = flatten_response(resources, "resources", "survey_id")

    # Remove unnecessary columns from survey table
    surveys.drop(columns=["resources"], inplace=True)
    # Re-attach container_id to every flattened resource row via survey_id.
    full_resources = pd.merge(resources, flat_resources, on="survey_id")

    # Normalize restricted datasets (mutates full_resources' 'restricted'
    # column in place when it holds JSON strings).
    restricted = normalize_restrictions(full_resources)
    # join() aligns on the row index; presumably both frames carry the same
    # fresh RangeIndex here (merge and json_normalize each build a new one) —
    # TODO(review): confirm, a misalignment would silently scramble rows.
    full_resources = full_resources.join(restricted)


    # Remove unnecessary columns from resource table
    full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP, inplace=True)

    # Rename resource columns
    resources_cols_renaming = {
        "level": "access_level",
    }
    full_resources = full_resources.rename(columns=resources_cols_renaming)


    # User table transformations
    users = user_data_transform(users)

    return (surveys, full_resources, users)
102+

0 commit comments

Comments
 (0)