Skip to content

Commit fba3957

Browse files
Merge pull request #1 from WFP-VAM/dev
Added Members List + updated README and requirements
2 parents 9a718dd + 4223c85 commit fba3957

File tree

10 files changed

+848
-198
lines changed

10 files changed

+848
-198
lines changed

.env-example

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
DATALIB_API_KEY=
2+
3+
SERVER =
4+
DB_NAME =
5+
DB_USERNAME =
6+
DB_PASSWORD=
7+
8+

LICENSE.md

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
11
# Data Library API Tool
2-
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
2+
This is a Python CLI tool to query the [WFP Data Library API](https://datalib.vam.wfp.org/) and export the data into CSV files or an MS SQL database. It performs an ETL (Extract, Transform, Load) process to fetch data from the VAM Data Library, process the data, and load it into a database and/or save it to an Excel file.
3+
4+
## Features
5+
- Queries the Data Library API to get information about users, surveys, resources, and container members.
6+
- Exports data to CSV files.
7+
- Uploads data to an MS SQL database.
38

4-
## Features
5-
Queries the Data Library API to get:
6-
- List of users
7-
- List of survey codes
8-
- Complete information on all surveys (name, code, country, etc.)
9-
- Exports the API data into CSV files
10-
- Provides helper functions to get info on using the API
119
## Usage
12-
- Clone this repo
13-
- Get an API key from your Data Library account
14-
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15-
- Run python main.py to query the API and export CSV files
16-
- The output CSV files will be saved in the output folder.
10+
1. Clone this repository.
11+
2. Get an API key from your Data Library account.
12+
3. Rename the `.env-example` file to `.env`.
13+
4. Add the API key and database credentials to the `.env` file.
14+
5. Run `python main.py` to query the API and export data.
15+
6. Use the `--csv` flag to export data to CSV files (e.g., `python main.py --csv`).
16+
7. Use the `--db` flag to upload data to a database (e.g., `python main.py --db`).
17+
8. The output CSV files will be saved in the `output` folder.
1718

1819
## Requirements
1920
- Python 3.x
20-
21+
- Packages listed in `requirements.txt`
2122

2223
## Documentation
23-
For more details on the Data Library API endpoints, see the API documentation.
24+
For more details on the Data Library API endpoints, see the [API documentation](http://docs.ckan.org/en/2.9/api/).
2425

2526
## Contributing
2627
Contributions to add more API querying/exporting functionality are welcome!
2728

2829
## License
29-
This project is licensed under the MIT License - see the LICENSE file for details.
30-
30+
This project is licensed under the Affero GPL License - see the [LICENSE](LICENSE) file for details.

datalibrary/extract.py

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,24 @@ def __init__(self, api_key):
3434
self.api_key = api_key
3535
self.session = requests.Session()
3636

37-
def get_response(self, url, limit=None):
37+
def get_response(self, url, params=None):
3838
"""Send API request.
3939
4040
Args:
4141
url (str): API endpoint URL
42-
limit (int, optional): Max number of results
43-
42+
params (dict, optional): Query parameters
43+
4444
Returns:
45-
dict: API response
45+
dict: API response
4646
"""
47-
headers = { 'Authorization': f'{self.api_key}'}
48-
params = {'limit': limit}
47+
headers = {'Authorization': f'{self.api_key}'}
4948

50-
logger.info(f'Querying {url} with limit {limit}')
49+
if params is None:
50+
params = {}
51+
elif isinstance(params, int):
52+
params = {'limit': params}
53+
54+
logger.info(f'Querying {url} with params {params}')
5155

5256
r = self.session.get(url, headers=headers, params=params)
5357
if r.status_code == 200:
@@ -78,16 +82,34 @@ def get_users(self):
7882
def get_survey_list(self, limit=None):
7983
"""Get package list"""
8084
url = BASE_URL + ENDPOINTS['all_surveys_code']
81-
response = self.get_response(url, limit=limit)
85+
response = self.get_response(url, params=limit)
8286
data = response["result"]
8387
return data
8488

8589
def get_surveys_with_resources(self, limit=None):
8690
"""Get all surveys with country, type of survey and description"""
8791
url = BASE_URL + ENDPOINTS['all_surveys_information']
88-
response = self.get_response(url, limit=limit)
92+
response = self.get_response(url, params=limit)
8993
data = response["result"]
9094
return data
95+
96+
def get_member_list(self, id=None, object_type=None, capacity=None, limit=None):
    """Get list of members of a group.

    Args:
        id (str, optional): The ID or name of the group.
        object_type (str, optional): Restrict members to a given type (e.g., 'user' or 'package').
        capacity (str, optional): Restrict members to a given capacity (e.g., 'member', 'editor', 'admin').
        limit (int, optional): Maximum number of results to return.

    Returns:
        list: List of tuples containing (id, type, capacity) for each member.
    """
    url = BASE_URL + ENDPOINTS['member_list']
    # Fix: 'capacity' was accepted but never forwarded to the API, so the
    # capacity filter silently had no effect. It is now part of the query.
    params = {'id': id, 'object_type': object_type, 'capacity': capacity, 'limit': limit}
    response = self.get_response(url, params=params)
    # Fall back to an empty list so callers can iterate safely when the
    # response carries no 'result' key.
    data = response.get("result", [])
    return data
91113

92114
def __repr__(self):
93115
return f'DataLibraryData({self.api_key})'
@@ -112,23 +134,28 @@ def get_user_data(client):
112134

113135
return users_df
114136

137+
def get_member_data(client, id=None):
    """Fetch a container's member list and wrap it in a DataFrame.

    Args:
        client: DataLibrary client exposing ``get_member_list``.
        id (str, optional): Container/group ID or name to query.

    Returns:
        pd.DataFrame: One row per (id, type, capacity) member tuple.
    """
    return pd.DataFrame(client.get_member_list(id=id))
141+
115142

116143
def get_data(client):
    """Extract survey, user and container-member data from the Data Library.

    Queries every distinct container (organization) found in the survey
    data and collects its members into a single DataFrame, tagging each
    member row with the container it belongs to.

    Args:
        client: DataLibrary client used for all API calls.

    Returns:
        tuple: (survey_df, user_df, member_df) as pandas DataFrames.
    """
    survey_df = get_survey_data(client)
    user_df = get_user_data(client)

    frames = []
    container_ids = set(survey_df['organization.id'])
    for container_id in container_ids:
        container_members = get_member_data(client, id=container_id)
        # Skip empty frames: inserting a column at a fixed position into a
        # zero-column DataFrame raises, and empty frames add nothing anyway.
        if container_members is None or container_members.empty:
            continue
        # Tag each member row with its container so the transform step can
        # join members back to their organization.
        container_members.insert(len(container_members.columns), "container_id", container_id)
        frames.append(container_members)

    # pd.concat raises on an empty list; fall back to an empty frame with
    # the expected column so downstream code still finds 'container_id'.
    if frames:
        member_df = pd.concat(frames, ignore_index=True)
    else:
        member_df = pd.DataFrame(columns=["container_id"])

    return survey_df, user_df, member_df
122158

123159

124160
if __name__ == "__main__":
125-
import os
126-
from dotenv import load_dotenv
127-
128-
load_dotenv()
129-
130-
131-
client = DataLibrary(os.getenv("DATALIB_API_KEY"))
132-
survey_df, user_df = get_data(client)
133-
print(survey_df.head())
134-
print(user_df.head())
161+
pass

datalibrary/load.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def load_data(data, table_name = 'table'):
2626
except Exception as e:
2727
logger.error(f"Error {e} when populating {table_name}")
2828

29-
def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Users")):
29+
def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Users", "DL_Members")):
3030
try:
3131
for df, table_name in zip(data, table_names):
3232
logger.info("Loading data to database")
@@ -35,7 +35,7 @@ def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Use
3535
logger.error(f"Error loading data: {e}")
3636

3737

38-
def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
38+
def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users", "members")):
3939
# export survey list, survey information with resources and user list as csv
4040
folder = "output"
4141
today = str(date.today()).replace("-", "_")
@@ -51,8 +51,3 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5151

5252
if __name__ == "__main__":
5353
pass
54-
55-
# # test_read_sql()
56-
# sample_data = {'col1': [1, 2], 'col2': [3, 4]}
57-
# data = pd.DataFrame(data=sample_data)
58-
# load_data(data, 'test_table')

datalibrary/transform.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ def survey_data_transform(df):
5656
df = df.rename(columns=SURVEY_COLUMNS_RENAMING)
5757
return df
5858

59+
def member_data_transform(df):
    """Normalize raw member tuples into (user_id, capacity, container_id) rows.

    The API returns members as (id, type, capacity) tuples, so the raw
    DataFrame arrives with integer column labels 0-2. Rows whose type is
    not 'user' (e.g. 'package') are dropped.

    Args:
        df (pd.DataFrame): Raw member data with integer columns plus a
            'container_id' column added during extraction.

    Returns:
        pd.DataFrame: Columns ['user_id', 'capacity', 'container_id'].
    """
    renamed = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    users_only = renamed[renamed["type"].isin(["user"])]
    return users_only[["user_id", "capacity", "container_id"]]
5964

6065

6166
def transform(data: tuple) -> tuple:
@@ -68,7 +73,7 @@ def transform(data: tuple) -> tuple:
6873
Returns:
6974
tuple: The transformed data.
7075
"""
71-
surveys, users = data
76+
surveys, users, members = data
7277

7378
surveys = survey_data_transform(surveys)
7479

@@ -98,5 +103,11 @@ def transform(data: tuple) -> tuple:
98103
# User table transformations
99104
users = user_data_transform(users)
100105

101-
return (surveys, full_resources, users)
106+
# Member DF
107+
members = member_data_transform(members)
108+
109+
110+
return (surveys, full_resources, users, members)
102111

112+
if __name__ == "__main__":
113+
pass

environment.yaml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
name: datalib
2+
channels:
3+
- conda-forge
4+
- defaults
5+
dependencies:
6+
- brotli-python=1.1.0=py312h53d5487_1
7+
- bzip2=1.0.8=hcfcfb64_5
8+
- ca-certificates=2024.2.2=h56e8100_0
9+
- certifi=2024.2.2=pyhd8ed1ab_0
10+
- charset-normalizer=3.3.2=pyhd8ed1ab_0
11+
- greenlet=3.0.3=py312h53d5487_0
12+
- idna=3.7=pyhd8ed1ab_0
13+
- intel-openmp=2024.1.0=h57928b3_966
14+
- libblas=3.9.0=22_win64_mkl
15+
- libcblas=3.9.0=22_win64_mkl
16+
- libexpat=2.6.2=h63175ca_0
17+
- libffi=3.4.2=h8ffe710_5
18+
- libhwloc=2.10.0=default_h8125262_1001
19+
- libiconv=1.17=hcfcfb64_2
20+
- liblapack=3.9.0=22_win64_mkl
21+
- libsqlite=3.45.3=hcfcfb64_0
22+
- libxml2=2.12.7=h283a6d9_0
23+
- libzlib=1.2.13=h2466b09_6
24+
- mkl=2024.1.0=h66d3029_692
25+
- numpy=1.26.4=py312h8753938_0
26+
- openssl=3.3.0=h2466b09_3
27+
- pandas=2.2.2=py312h72972c8_1
28+
- pip=24.0=pyhd8ed1ab_0
29+
- pthreads-win32=2.9.1=hfa6e2cd_3
30+
- pyodbc=5.1.0=py312h53d5487_0
31+
- pysocks=1.7.1=pyh0701188_6
32+
- python=3.12.3=h2628c8c_0_cpython
33+
- python-dateutil=2.9.0=pyhd8ed1ab_0
34+
- python-tzdata=2024.1=pyhd8ed1ab_0
35+
- python_abi=3.12=4_cp312
36+
- pytz=2024.1=pyhd8ed1ab_0
37+
- requests=2.32.2=pyhd8ed1ab_0
38+
- setuptools=70.0.0=pyhd8ed1ab_0
39+
- six=1.16.0=pyh6c4a22f_0
40+
- sqlalchemy=2.0.30=py312h4389bb4_0
41+
- tbb=2021.12.0=hc790b64_1
42+
- tk=8.6.13=h5226925_1
43+
- typing-extensions=4.11.0=hd8ed1ab_0
44+
- typing_extensions=4.11.0=pyha770c72_0
45+
- tzdata=2024a=h0c530f3_0
46+
- ucrt=10.0.22621.0=h57928b3_0
47+
- urllib3=2.2.1=pyhd8ed1ab_0
48+
- vc=14.3=ha32ba9b_20
49+
- vc14_runtime=14.38.33135=h835141b_20
50+
- vs2015_runtime=14.38.33135=h22015db_20
51+
- wheel=0.43.0=pyhd8ed1ab_1
52+
- win_inet_pton=1.1.0=pyhd8ed1ab_6
53+
- xz=5.2.6=h8d14728_0
54+
- pip:
55+
- python-dotenv==1.0.1
56+
prefix: C:\Users\alessandra.gherardel\AppData\Local\miniconda3\envs\datalib

0 commit comments

Comments (0)