Skip to content

Commit fba3957

Browse files
Merge pull request #1 from WFP-VAM/dev
Added Members List + updated README and requirements
2 parents 9a718dd + 4223c85 commit fba3957

File tree

10 files changed

+848
-198
lines changed

10 files changed

+848
-198
lines changed

.env-example

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
DATALIB_API_KEY=
2+
3+
SERVER =
4+
DB_NAME =
5+
DB_USERNAME =
6+
DB_PASSWORD=
7+
8+

LICENSE.md

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
11
# Data Library API Tool
2-
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
2+
This is a Python CLI tool to query the [WFP Data Library API](https://datalib.vam.wfp.org/) and export the data into CSV files or an MS SQL database. It performs an ETL (Extract, Transform, Load) process to fetch data from the VAM Data Library, process the data, and load it into a database and/or save it to an Excel file.
3+
4+
## Features
5+
- Queries the Data Library API to get information about users, surveys, resources, and container members.
6+
- Exports data to CSV files.
7+
- Uploads data to an MS SQL database.
38

4-
## Features
5-
Queries the Data Library API to get:
6-
- List of users
7-
- List of survey codes
8-
- Complete information on all surveys (name, code, country, etc.)
9-
- Exports the API data into CSV files
10-
- Provides helper functions to get info on using the API
119
## Usage
12-
- Clone this repo
13-
- Get an API key from your Data Library account
14-
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15-
- Run python main.py to query the API and export CSV files
16-
- The output CSV files will be saved in the output folder.
10+
1. Clone this repository.
11+
2. Get an API key from your Data Library account.
12+
3. Rename the `.env-example` file to `.env`.
13+
4. Add the API key and database credentials to the `.env` file.
14+
5. Run `python main.py` to query the API and export data.
15+
6. Use the `--csv` flag to export data to CSV files (e.g., `python main.py --csv`).
16+
7. Use the `--db` flag to upload data to a database (e.g., `python main.py --db`).
17+
8. The output CSV files will be saved in the `output` folder.
1718

1819
## Requirements
1920
- Python 3.x
20-
21+
- Packages listed in `requirements.txt`
2122

2223
## Documentation
23-
For more details on the Data Library API endpoints, see the API documentation.
24+
For more details on the Data Library API endpoints, see the [API documentation](http://docs.ckan.org/en/2.9/api/).
2425

2526
## Contributing
2627
Contributions to add more API querying/exporting functionality are welcome!
2728

2829
## License
29-
This project is licensed under the MIT License - see the LICENSE file for details.
30-
30+
This project is licensed under the Affero GPL License - see the [LICENSE](LICENSE) file for details.

datalibrary/extract.py

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,24 @@ def __init__(self, api_key):
3434
self.api_key = api_key
3535
self.session = requests.Session()
3636

37-
def get_response(self, url, limit=None):
37+
def get_response(self, url, params=None):
3838
"""Send API request.
3939
4040
Args:
4141
url (str): API endpoint URL
42-
limit (int, optional): Max number of results
43-
42+
params (dict, optional): Query parameters
43+
4444
Returns:
45-
dict: API response
45+
dict: API response
4646
"""
47-
headers = { 'Authorization': f'{self.api_key}'}
48-
params = {'limit': limit}
47+
headers = {'Authorization': f'{self.api_key}'}
4948

50-
logger.info(f'Querying {url} with limit {limit}')
49+
if params is None:
50+
params = {}
51+
elif isinstance(params, int):
52+
params = {'limit': params}
53+
54+
logger.info(f'Querying {url} with params {params}')
5155

5256
r = self.session.get(url, headers=headers, params=params)
5357
if r.status_code == 200:
@@ -78,16 +82,34 @@ def get_users(self):
7882
def get_survey_list(self, limit=None):
7983
"""Get package list"""
8084
url = BASE_URL + ENDPOINTS['all_surveys_code']
81-
response = self.get_response(url, limit=limit)
85+
response = self.get_response(url, params=limit)
8286
data = response["result"]
8387
return data
8488

8589
def get_surveys_with_resources(self, limit=None):
8690
"""Get all surveys with country, type of survey and description"""
8791
url = BASE_URL + ENDPOINTS['all_surveys_information']
88-
response = self.get_response(url, limit=limit)
92+
response = self.get_response(url, params=limit)
8993
data = response["result"]
9094
return data
95+
96+
def get_member_list(self, id=None, object_type=None, capacity=None, limit=None):
    """Get list of members of a group.

    Args:
        id (str, optional): The ID or name of the group.
        object_type (str, optional): Restrict members to a given type (e.g., 'user' or 'package').
        capacity (str, optional): Restrict members to a given capacity (e.g., 'member', 'editor', 'admin').
        limit (int, optional): Maximum number of results to return.

    Returns:
        list: List of tuples containing (id, type, capacity) for each member.
    """
    url = BASE_URL + ENDPOINTS['member_list']
    # Fix: 'capacity' was accepted but never forwarded to the API, so the
    # capacity filter silently had no effect. It is now part of the query.
    params = {'id': id, 'object_type': object_type, 'capacity': capacity, 'limit': limit}
    response = self.get_response(url, params=params)
    # Fall back to an empty list so callers can iterate safely when the
    # response carries no 'result' key.
    data = response.get("result", [])
    return data
91113

92114
def __repr__(self):
93115
return f'DataLibraryData({self.api_key})'
@@ -112,23 +134,28 @@ def get_user_data(client):
112134

113135
return users_df
114136

137+
def get_member_data(client, id=None):
    """Fetch a container's member list and wrap it in a DataFrame.

    Args:
        client: DataLibrary client exposing ``get_member_list``.
        id (str, optional): Container/group ID or name to query.

    Returns:
        pd.DataFrame: One row per (id, type, capacity) member tuple.
    """
    return pd.DataFrame(client.get_member_list(id=id))
141+
115142

116143
def get_data(client):
    """Extract survey, user and container-member data from the Data Library.

    Queries every distinct container (organization) found in the survey
    data and collects its members into a single DataFrame, tagging each
    member row with the container it belongs to.

    Args:
        client: DataLibrary client used for all API calls.

    Returns:
        tuple: (survey_df, user_df, member_df) as pandas DataFrames.
    """
    survey_df = get_survey_data(client)
    user_df = get_user_data(client)

    frames = []
    container_ids = set(survey_df['organization.id'])
    for container_id in container_ids:
        container_members = get_member_data(client, id=container_id)
        # Skip empty frames: inserting a column at a fixed position into a
        # zero-column DataFrame raises, and empty frames add nothing anyway.
        if container_members is None or container_members.empty:
            continue
        # Tag each member row with its container so the transform step can
        # join members back to their organization.
        container_members.insert(len(container_members.columns), "container_id", container_id)
        frames.append(container_members)

    # pd.concat raises on an empty list; fall back to an empty frame with
    # the expected column so downstream code still finds 'container_id'.
    if frames:
        member_df = pd.concat(frames, ignore_index=True)
    else:
        member_df = pd.DataFrame(columns=["container_id"])

    return survey_df, user_df, member_df
122158

123159

124160
if __name__ == "__main__":
125-
import os
126-
from dotenv import load_dotenv
127-
128-
load_dotenv()
129-
130-
131-
client = DataLibrary(os.getenv("DATALIB_API_KEY"))
132-
survey_df, user_df = get_data(client)
133-
print(survey_df.head())
134-
print(user_df.head())
161+
pass

datalibrary/load.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def load_data(data, table_name = 'table'):
2626
except Exception as e:
2727
logger.error(f"Error {e} when populating {table_name}")
2828

29-
def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Users")):
29+
def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Users", "DL_Members")):
3030
try:
3131
for df, table_name in zip(data, table_names):
3232
logger.info("Loading data to database")
@@ -35,7 +35,7 @@ def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Use
3535
logger.error(f"Error loading data: {e}")
3636

3737

38-
def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
38+
def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users", "members")):
3939
# export survey list, survey information with resources and user list as csv
4040
folder = "output"
4141
today = str(date.today()).replace("-", "_")
@@ -51,8 +51,3 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5151

5252
if __name__ == "__main__":
5353
pass
54-
55-
# # test_read_sql()
56-
# sample_data = {'col1': [1, 2], 'col2': [3, 4]}
57-
# data = pd.DataFrame(data=sample_data)
58-
# load_data(data, 'test_table')

datalibrary/transform.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ def survey_data_transform(df):
5656
df = df.rename(columns=SURVEY_COLUMNS_RENAMING)
5757
return df
5858

59+
def member_data_transform(df):
    """Normalize raw member tuples into (user_id, capacity, container_id) rows.

    The API returns members as (id, type, capacity) tuples, so the raw
    DataFrame arrives with integer column labels 0-2. Rows whose type is
    not 'user' (e.g. 'package') are dropped.

    Args:
        df (pd.DataFrame): Raw member data with integer columns plus a
            'container_id' column added during extraction.

    Returns:
        pd.DataFrame: Columns ['user_id', 'capacity', 'container_id'].
    """
    renamed = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    users_only = renamed[renamed["type"].isin(["user"])]
    return users_only[["user_id", "capacity", "container_id"]]
5964

6065

6166
def transform(data: tuple) -> tuple:
@@ -68,7 +73,7 @@ def transform(data: tuple) -> tuple:
6873
Returns:
6974
tuple: The transformed data.
7075
"""
71-
surveys, users = data
76+
surveys, users, members = data
7277

7378
surveys = survey_data_transform(surveys)
7479

@@ -98,5 +103,11 @@ def transform(data: tuple) -> tuple:
98103
# User table transformations
99104
users = user_data_transform(users)
100105

101-
return (surveys, full_resources, users)
106+
# Member DF
107+
members = member_data_transform(members)
108+
109+
110+
return (surveys, full_resources, users, members)
102111

112+
if __name__ == "__main__":
113+
pass

environment.yaml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
name: datalib
2+
channels:
3+
- conda-forge
4+
- defaults
5+
dependencies:
6+
- brotli-python=1.1.0=py312h53d5487_1
7+
- bzip2=1.0.8=hcfcfb64_5
8+
- ca-certificates=2024.2.2=h56e8100_0
9+
- certifi=2024.2.2=pyhd8ed1ab_0
10+
- charset-normalizer=3.3.2=pyhd8ed1ab_0
11+
- greenlet=3.0.3=py312h53d5487_0
12+
- idna=3.7=pyhd8ed1ab_0
13+
- intel-openmp=2024.1.0=h57928b3_966
14+
- libblas=3.9.0=22_win64_mkl
15+
- libcblas=3.9.0=22_win64_mkl
16+
- libexpat=2.6.2=h63175ca_0
17+
- libffi=3.4.2=h8ffe710_5
18+
- libhwloc=2.10.0=default_h8125262_1001
19+
- libiconv=1.17=hcfcfb64_2
20+
- liblapack=3.9.0=22_win64_mkl
21+
- libsqlite=3.45.3=hcfcfb64_0
22+
- libxml2=2.12.7=h283a6d9_0
23+
- libzlib=1.2.13=h2466b09_6
24+
- mkl=2024.1.0=h66d3029_692
25+
- numpy=1.26.4=py312h8753938_0
26+
- openssl=3.3.0=h2466b09_3
27+
- pandas=2.2.2=py312h72972c8_1
28+
- pip=24.0=pyhd8ed1ab_0
29+
- pthreads-win32=2.9.1=hfa6e2cd_3
30+
- pyodbc=5.1.0=py312h53d5487_0
31+
- pysocks=1.7.1=pyh0701188_6
32+
- python=3.12.3=h2628c8c_0_cpython
33+
- python-dateutil=2.9.0=pyhd8ed1ab_0
34+
- python-tzdata=2024.1=pyhd8ed1ab_0
35+
- python_abi=3.12=4_cp312
36+
- pytz=2024.1=pyhd8ed1ab_0
37+
- requests=2.32.2=pyhd8ed1ab_0
38+
- setuptools=70.0.0=pyhd8ed1ab_0
39+
- six=1.16.0=pyh6c4a22f_0
40+
- sqlalchemy=2.0.30=py312h4389bb4_0
41+
- tbb=2021.12.0=hc790b64_1
42+
- tk=8.6.13=h5226925_1
43+
- typing-extensions=4.11.0=hd8ed1ab_0
44+
- typing_extensions=4.11.0=pyha770c72_0
45+
- tzdata=2024a=h0c530f3_0
46+
- ucrt=10.0.22621.0=h57928b3_0
47+
- urllib3=2.2.1=pyhd8ed1ab_0
48+
- vc=14.3=ha32ba9b_20
49+
- vc14_runtime=14.38.33135=h835141b_20
50+
- vs2015_runtime=14.38.33135=h22015db_20
51+
- wheel=0.43.0=pyhd8ed1ab_1
52+
- win_inet_pton=1.1.0=pyhd8ed1ab_6
53+
- xz=5.2.6=h8d14728_0
54+
- pip:
55+
- python-dotenv==1.0.1
56+
prefix: C:\Users\alessandra.gherardel\AppData\Local\miniconda3\envs\datalib

0 commit comments

Comments (0)