Skip to content

Commit 597e9c5

Browse files
Updated README and added transformations to tables
1 parent 4b0d943 commit 597e9c5

File tree

5 files changed

+105
-60
lines changed

5 files changed

+105
-60
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,5 +164,6 @@ config.py
164164
*.xlsx
165165
.env
166166
playground.py
167+
playground.R
167168
profile.html
168169
*.bak

README.md

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,32 @@
1-
# Data Library API tool
1+
# Data Library API Tool
2+
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
23

3-
This is a simple Python tool to get data from the [RAM Data Library API](https://datalib.vam.wfp.org) in CSV format.
4+
## Features
5+
Queries the Data Library API to get:
6+
- List of users
7+
- List of survey codes
8+
- Complete information on all surveys (name, code, country, etc.)
9+
- Exports the API data into CSV files
10+
- Provides helper functions to get info on using the API
11+
## Usage
12+
- Clone this repo
13+
- Get an API key from your Data Library account
14+
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15+
- Run python main.py to query the API and export CSV files
16+
- The output CSV files will be saved in the output folder.
417

5-
Currently this tool queries three API endpoints:
6-
- **```user_list```**: Get list of users registered in Data Library
7-
- **```package_list```**: Get survey codes (YYMM_ISO3_SURVEYTYPE) for all surveys available in the platform
8-
- **```current_package_list_with_resources```**: Complete information about surveys in Data Library, including name of survey, survey code, country and uploader
18+
## Requirements
19+
Python 3.x
20+
Requests library
21+
Pandas library
22+
Python Dotenv library
923

10-
For more information on the RAM Data Library API, consult the [documentation](https://docs.ckan.org/en/2.9/api/)
24+
## Documentation
25+
For more details on the Data Library API endpoints, see the [CKAN API documentation](https://docs.ckan.org/en/2.9/api/).
1126

12-
## How to use it
27+
## Contributing
28+
Contributions to add more API querying/exporting functionality are welcome!
1329

14-
1. Make sure you have Python installed on your machine.
15-
2. Get an API key from your [Data Library](https://datalib.vam.wfp.org) account
16-
3. Add your API key to the api_key.py file. Do not forget to add this file to the .gitignore!
17-
4. Run main.py
30+
## License
31+
This project is licensed under the MIT License - see the LICENSE file for details.
1832

19-
>> **Quick tip**
20-
>>
21-
>> If you're stuck, use the help() function in the DataLibraryData class for information about

etl/extract.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
ENDPOINTS = {
1111
'users': 'action/user_list',
1212
'all_surveys_information': 'action/current_package_list_with_resources',
13-
'all_surveys_code': 'action/package_list'
13+
'all_surveys_code': 'action/package_list',
14+
'member_list': 'action/member_list',
1415
}
1516

1617
class DataLibrary:
@@ -88,6 +89,16 @@ def get_surveys_with_resources(self, limit=None):
8889
response = self.get_response(url, limit=limit)
8990
data = response["result"]
9091
return data
92+
93+
def get_member_list(self, limit=None):
94+
"""Get list of members"""
95+
url = BASE_URL + ENDPOINTS['member_list']
96+
response = self.get_response(url, limit=limit)
97+
try:
98+
data = response["result"]
99+
return data
100+
except TypeError:
101+
print(response)
91102

92103
def __repr__(self):
93104
return f'DataLibraryData({self.api_key})'
@@ -112,6 +123,11 @@ def get_user_data(client):
112123

113124
return users_df
114125

126+
# def get_member_list(client):
127+
# member_list = client.get_member_list()
128+
# member_list_df = pd.DataFrame(member_list)
129+
# return member_list_df
130+
115131

116132
def get_data(client):
117133
survey_df = get_survey_data(client)
@@ -120,23 +136,12 @@ def get_data(client):
120136
return survey_df, user_df
121137

122138

139+
if __name__ == "__main__":
140+
import os
141+
from dotenv import load_dotenv
123142

124-
125-
# def get_data(client: Client) -> Tuple[DataFrame, DataFrame]:
126-
# # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE
127-
# survey_list = client.get_survey_list()
128-
# # # get total number survey present on Data Library
129-
# total_surveys = len(survey_list)
130-
131-
# # get information on user
132-
# users = client.get_users()
133-
# # get total number of users with an account on Data Library
134-
# total_users = len(users)
135-
# users = pd.DataFrame(users)
136-
137-
# # get all information about surveys
138-
# all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys)
139-
140-
# all_surveys_with_resources = pd.json_normalize(all_surveys_with_resources)
143+
load_dotenv()
141144

142-
# return (all_surveys_with_resources, users)
145+
client = DataLibrary(os.getenv("DATALIB_API_KEY"))
146+
members = client.get_member_list()
147+
print(members)

etl/load.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ class ExcelExportError(Exception):
2323
def load_data(data, table_name = 'table'):
2424
try:
2525
data.to_sql(name=table_name, con=engine, if_exists='replace')
26-
27-
print("Done")
2826
except Exception as e:
2927
logger.error(f"Error {e} when populating {table_name}")
3028

@@ -52,8 +50,9 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5250
continue
5351

5452
if __name__ == "__main__":
53+
pass
5554

56-
# test_read_sql()
57-
sample_data = {'col1': [1, 2], 'col2': [3, 4]}
58-
df = pd.DataFrame(data=sample_data)
59-
load_data(df, 'test_table', index=True)
55+
# # test_read_sql()
56+
# sample_data = {'col1': [1, 2], 'col2': [3, 4]}
57+
# data = pd.DataFrame(data=sample_data)
58+
# load_data(data, 'test_table')

etl/transform.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,25 @@
22
import json
33

44

5+
SURVEY_COLUMNS_SELECTION = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']
6+
7+
SURVEY_COLUMNS_RENAMING = {"id": "survey_id", "organization_id": "container_id",
8+
"organization_title": "container_name",
9+
"organization_description": "container_description",
10+
"organization_created": "container_created",
11+
"data_collector": "organization", "owner_org": "parent_container_id",
12+
"title": "survey_title"}
13+
14+
RESOURCES_COLUMNS_TO_DROP = ["resources", "restricted", "restricted-allowed_users", "restricted-level", "cache_last_updated", "cache_url", "revision_id", "url_type", "state", "resource_type", "mimetype_inner", "hash", "package_id"]
15+
516
def clean_column_names(df):
617
"""Rename columns to be more readable"""
718
df.columns = df.columns.str.replace('.', '_')
819
return df
920

1021
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
1122
flat_list = []
12-
for index, row in df.iterrows():
23+
for _, row in df.iterrows():
1324
for r in row[col]:
1425
flat_dict = {_id : row[_id]}
1526
flat_dict.update(r)
@@ -33,41 +44,59 @@ def normalize_restrictions(df):
3344
restricted = pd.json_normalize(df["restricted"])
3445
return restricted
3546

47+
def user_data_transform(df):
48+
# Remove unnecessary columns from users table
49+
users_columns_to_drop = ["apikey", "display_name", "about", "state", "image_url", "image_display_url"]
50+
df = df.drop(columns=users_columns_to_drop)
51+
return df
52+
53+
def survey_data_transform(df):
54+
df = clean_column_names(df)
55+
df = df[SURVEY_COLUMNS_SELECTION]
56+
df = df.rename(columns=SURVEY_COLUMNS_RENAMING)
57+
return df
58+
59+
3660

3761
def transform(data: tuple) -> tuple:
62+
"""
63+
Transforms the input data tuple into a new tuple.
64+
65+
Args:
66+
data (tuple): A tuple containing the data to be transformed.
67+
68+
Returns:
69+
tuple: The transformed data.
70+
"""
3871
surveys, users = data
3972

40-
# Clean column names
41-
surveys = clean_column_names(surveys)
42-
43-
# Select relevant columns
44-
survey_cols = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_type', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']
45-
46-
surveys = surveys[survey_cols]
47-
48-
# Rename survey columns
49-
survey_cols_renaming = {"id": "survey_id", "organization_id": "container_id", "organization_type": "container_type", "organization_title": "container_title", "organization_description": "container_description", "organization_created": "container_created", "data_collector": "organization", "owner_org": "parent_container_id"}
50-
surveys = surveys.rename(columns= survey_cols_renaming)
73+
surveys = survey_data_transform(surveys)
5174

5275
# Flatten resources
5376
resources = surveys[["resources", "container_id", "survey_id"]]
5477
flat_resources = flatten_response(resources, "resources", "survey_id")
5578

56-
# Join back to surveys
57-
surveys.drop(columns= ['resources'], inplace=True)
79+
# Remove unnecessary columns from survey table
80+
surveys.drop(columns=["resources"], inplace=True)
5881
full_resources = pd.merge(resources, flat_resources, on="survey_id")
5982

6083
# Normalize restricted datasets
6184
restricted = normalize_restrictions(full_resources)
6285
full_resources = full_resources.join(restricted)
6386

6487

65-
# Drop unnecessary columns
66-
drop_cols = ["resources", "restricted", "restricted-allowed_users",
67-
"restricted-level", "cache_last_updated", "cache_url",
68-
"revision_id"]
69-
70-
full_resources.drop(columns=drop_cols, inplace=True)
88+
# Remove unnecessary columns from resource table
89+
full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP, inplace=True)
90+
91+
# Rename resource columns
92+
resources_cols_renaming = {
93+
"level": "access_level",
94+
}
95+
full_resources = full_resources.rename(columns=resources_cols_renaming)
96+
97+
98+
# User table transformations
99+
users = user_data_transform(users)
71100

72101
return (surveys, full_resources, users)
73102

0 commit comments

Comments
 (0)