Skip to content

Commit 9a718dd

Browse files
Merge branch 'pipeline'
2 parents 53a6efa + 1c56804 commit 9a718dd

File tree

13 files changed

+375
-157
lines changed

13 files changed

+375
-157
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,6 @@ config.py
164164
*.xlsx
165165
.env
166166
playground.py
167+
playground.R
168+
profile.html
169+
*.bak

README.md

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,30 @@
1-
# Data Library API tool
1+
# Data Library API Tool
2+
This is a Python tool to query the WFP Data Library API and export the data into CSV files.
23

3-
This is a simple Python tool to get data from the [RAM Data Library API](https://datalib.vam.wfp.org) in CSV format.
4+
## Features
5+
Queries the Data Library API to get:
6+
- List of users
7+
- List of survey codes
8+
- Complete information on all surveys (name, code, country, etc.)
9+
- Exports the API data into CSV files
10+
- Provides helper functions to get info on using the API
11+
## Usage
12+
- Clone this repo
13+
- Get an API key from your Data Library account
14+
- Add the API key to a .env file or pass it directly when instantiating the DataLibrary class
15+
- Run python main.py to query the API and export CSV files
16+
- The output CSV files will be saved in the output folder.
417

5-
Currently this tool queries three API endpoints:
6-
- **```user_list```**: Get list of users registered in Data Library
7-
- **```package_list```**: Get survey codes (YYMM_ISO3_SURVEYTYPE) for all surveys available in the platform
8-
- **```current_package_list_with_resources```**: Complete information about surveys in Data Library, including name of survey, survey code, country and uploader
18+
## Requirements
19+
- Python 3.x
920

10-
For more information on the RAM Data Library API, consult the [documentation](https://docs.ckan.org/en/2.9/api/)
1121

12-
## How to use it
22+
## Documentation
23+
For more details on the Data Library API endpoints, see the API documentation.
1324

14-
1. Make sure you have Python installed on your machine.
15-
2. Get an API key from your [Data Library](https://datalib.vam.wfp.org) account
16-
3. Add your API key to the api_key.py file. Do not forget to add this file to the .gitignore!
17-
4. Run main.py
25+
## Contributing
26+
Contributions to add more API querying/exporting functionality are welcome!
27+
28+
## License
29+
This project is licensed under the MIT License - see the LICENSE file for details.
1830

19-
>> **Quick tip**
20-
>>
21-
>> If you're stuck, use the help() function in the DataLibraryData class for information about usage
22-
>>
File renamed without changes.

api/client.py renamed to datalibrary/extract.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
import requests
22
import json
3+
import logging
4+
import pandas as pd
35

46
logger = logging.getLogger(__name__)
57

68
# Root URL of the WFP Data Library CKAN API (v3 action API).
BASE_URL = "https://datalib.vam.wfp.org/api/3/"
# Logical endpoint name -> CKAN action path, appended to BASE_URL per request.
ENDPOINTS = {
    'users': 'action/user_list',
    'all_surveys_information': 'action/current_package_list_with_resources',
    'all_surveys_code': 'action/package_list',
    'member_list': 'action/member_list',
}
1215

1316
class DataLibrary:
@@ -85,11 +88,47 @@ def get_surveys_with_resources(self, limit=None):
8588
response = self.get_response(url, limit=limit)
8689
data = response["result"]
8790
return data
88-
91+
8992
def __repr__(self):
    """Debug representation naming the class and its API key.

    Fix: the original hard-coded the stale name 'DataLibraryData';
    derive the name from the live class so it survives renames.
    """
    return f'{type(self).__name__}({self.api_key!r})'
9194

9295
def __str__(self):
    """Human-readable description of the key this client is configured with."""
    key = self.api_key
    return f'The API key used in this DataLibraryData is {key}'
9497

95-
98+
99+
def get_survey_data(client):
    """Fetch every survey (with its resources) and flatten into a DataFrame.

    Args:
        client: a DataLibrary API client exposing get_survey_list() and
            get_surveys_with_resources(limit=...).

    Returns:
        pd.DataFrame: one row per survey, nested fields dot-expanded.
    """
    # Ask for exactly as many packages as the survey-code list reports.
    n_surveys = len(client.get_survey_list())
    raw = client.get_surveys_with_resources(limit=n_surveys)
    return pd.json_normalize(raw)
107+
108+
109+
def get_user_data(client):
    """Fetch the registered-user list and return it as a DataFrame."""
    return pd.DataFrame(client.get_users())
114+
115+
116+
def get_data(client):
    """Run the full extract step.

    Returns:
        tuple: (survey_df, user_df) DataFrames pulled from the API.
    """
    return get_survey_data(client), get_user_data(client)
121+
122+
123+
124+
if __name__ == "__main__":
    # Manual smoke test: requires DATALIB_API_KEY in the environment or in a
    # .env file (loaded below via python-dotenv).
    import os
    from dotenv import load_dotenv

    load_dotenv()

    client = DataLibrary(os.getenv("DATALIB_API_KEY"))
    survey_df, user_df = get_data(client)
    # Preview the extracted tables.
    print(survey_df.head())
    print(user_df.head())
Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,12 @@
1717
conn_str = f'mssql+pyodbc://{USERNAME}:{PASSWORD}@{SERVER}/{DATABASE}?driver=ODBC+Driver+17+for+SQL+Server'
1818
engine = create_engine(conn_str)
1919

20-
# def test_read_sql(table_name):
21-
# sql = """
22-
# SELECT * FROM [dbo].table_name
23-
# """
24-
# sql_df = pd.read_sql( sql, con=engine)
25-
# print(sql_df.head())
26-
27-
def load_data(data, table_name = 'table', index = False):
20+
class ExcelExportError(Exception):
    """Raised to signal a failed data export.

    NOTE(review): named for Excel export but caught in load_to_db around the
    SQL load path, and nothing in view raises it — confirm intended usage.
    """
    pass
22+
23+
def load_data(data, table_name = 'table'):
    """Write a DataFrame to SQL table `table_name`, replacing any existing table.

    Best-effort: failures are logged, not raised. Note if_exists='replace'
    drops the old table first, so a failed write leaves no table behind.

    Args:
        data: pandas DataFrame to persist.
        table_name: destination table name (default 'table').
    """
    try:
        data.to_sql(name=table_name, con=engine, if_exists='replace')
    except Exception as e:
        # Fix: logger.exception keeps the full traceback, which logger.error
        # discarded — essential for diagnosing driver/connection failures.
        logger.exception(f"Error {e} when populating {table_name}")
3328

@@ -36,7 +31,7 @@ def load_to_db(data: tuple, table_names = ("DL_Surveys", "DL_Resources", "DL_Use
3631
for df, table_name in zip(data, table_names):
3732
logger.info("Loading data to database")
3833
load_data(df, table_name)
39-
except Exception as e:
34+
except ExcelExportError as e:
4035
logger.error(f"Error loading data: {e}")
4136

4237

@@ -55,8 +50,9 @@ def save_to_excel(data: tuple, filenames = ("surveys", "resources", "users")):
5550
continue
5651

5752
if __name__ == "__main__":
    # No standalone behavior: this module is imported by the pipeline.
    # (Dead commented-out smoke-test code removed.)
    pass

datalibrary/transform.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import pandas as pd
2+
import json
3+
4+
5+
# Columns kept from the raw survey payload (after json_normalize /
# clean_column_names); everything else is discarded.
SURVEY_COLUMNS_SELECTION = ['assessment_status', 'collection_method', 'creator_user_id', 'data_collector', 'description', 'end_date', 'id', 'metadata_created', 'metadata_modified', 'month', 'name', 'num_resources', 'organization_id', 'organization_title', 'organization_description', 'organization_created', 'owner_org', 'private', 'progress_status', 'start_date', 'survey_attributes', 'survey_category', 'survey_type', 'title', 'year', 'resources']

# Map CKAN "organization" terminology onto Data Library "container" naming,
# and disambiguate generic keys (id -> survey_id, title -> survey_title).
SURVEY_COLUMNS_RENAMING = {"id": "survey_id", "organization_id": "container_id",
                           "organization_title": "container_name",
                           "organization_description": "container_description",
                           "organization_created": "container_created",
                           "data_collector": "organization", "owner_org": "parent_container_id",
                           "title": "survey_title"}

# Resource-level columns dropped after flattening/normalizing, before load.
RESOURCES_COLUMNS_TO_DROP = ["resources", "restricted", "restricted-allowed_users", "restricted-level", "cache_last_updated", "cache_url", "revision_id", "url_type", "state", "resource_type", "mimetype_inner", "hash", "package_id"]
15+
16+
def clean_column_names(df):
17+
"""Rename columns to be more readable"""
18+
df.columns = df.columns.str.replace('.', '_')
19+
return df
20+
21+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Explode `col` — a column holding lists of dicts — into one row per dict.

    Each output row carries the parent's `_id` value plus the dict's own keys
    (a dict key equal to `_id` overrides the parent's value, as before).

    Args:
        df: frame whose `col` cells are lists of dicts.
        col: name of the list-of-dicts column.
        _id: name of the identifier column copied onto every exploded row.

    Returns:
        pd.DataFrame: one row per inner dict.
    """
    rows = []
    for _, record in df.iterrows():
        rows.extend({_id: record[_id], **item} for item in record[col])
    return pd.DataFrame(rows)
31+
32+
def flatten_resources(df):
    """Return one row per resource, keyed by survey_id.

    NOTE(review): selects 'organization_id', but transform() works with the
    post-rename 'container_id' — confirm which pipeline stage this helper is
    meant to run on.
    """
    subset = df[["resources", "organization_id", "survey_id"]]
    return flatten_response(subset, "resources", "survey_id")
36+
37+
def normalize_restrictions(df):
    """Decode the 'restricted' column and expand it into a flat DataFrame.

    Mutates df['restricted'] in place when the cells are JSON strings; returns
    a new frame with one column per key of the decoded dicts.
    """
    try:
        # TypeError => the cells are already parsed dicts; leave them alone.
        df['restricted'] = df['restricted'].apply(json.loads)
    except TypeError:
        pass
    return pd.json_normalize(df["restricted"])
46+
47+
def user_data_transform(df):
    """Drop sensitive / low-value columns from the users table.

    Args:
        df: users DataFrame from the API.

    Returns:
        pd.DataFrame: a copy without the dropped columns.
    """
    # Remove unnecessary columns from users table.
    users_columns_to_drop = ["apikey", "display_name", "about", "state", "image_url", "image_display_url"]
    # Fix: errors='ignore' — an upstream schema change (a column disappearing
    # from the API payload) previously raised KeyError and killed the pipeline.
    return df.drop(columns=users_columns_to_drop, errors="ignore")
52+
53+
def survey_data_transform(df):
    """Clean column names, keep only the survey columns of interest, and
    rename them to Data Library 'container' terminology."""
    cleaned = clean_column_names(df)
    selected = cleaned[SURVEY_COLUMNS_SELECTION]
    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
58+
59+
60+
61+
def transform(data: tuple) -> tuple:
    """
    Transform raw (surveys, users) frames into load-ready tables.

    Args:
        data (tuple): (surveys_df, users_df) as produced by the extract step.

    Returns:
        tuple: (surveys, full_resources, users) — resources are split out of
        the surveys frame into their own table, one row per uploaded file.
    """
    surveys, users = data

    surveys = survey_data_transform(surveys)

    # Flatten resources: one row per resource, tagged with its survey_id
    resources = surveys[["resources", "container_id", "survey_id"]]
    flat_resources = flatten_response(resources, "resources", "survey_id")

    # Remove unnecessary columns from survey table
    surveys.drop(columns=["resources"], inplace=True)
    # Re-attach container_id to every flattened resource row via survey_id.
    full_resources = pd.merge(resources, flat_resources, on="survey_id")

    # Normalize restricted datasets (mutates full_resources' 'restricted'
    # column in place when it holds JSON strings).
    restricted = normalize_restrictions(full_resources)
    # join() aligns on the row index; presumably both frames carry the same
    # fresh RangeIndex here (merge and json_normalize each build a new one) —
    # TODO(review): confirm, a misalignment would silently scramble rows.
    full_resources = full_resources.join(restricted)


    # Remove unnecessary columns from resource table
    full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP, inplace=True)

    # Rename resource columns
    resources_cols_renaming = {
        "level": "access_level",
    }
    full_resources = full_resources.rename(columns=resources_cols_renaming)


    # User table transformations
    users = user_data_transform(users)

    return (surveys, full_resources, users)
102+

0 commit comments

Comments
 (0)