Skip to content

Commit d6722a2

Browse files
Merge pull request #3 from WFP-VAM/dev
Fix Data Library query
2 parents f292a7a + c2a9c39 commit d6722a2

File tree

2 files changed

+117
-1
lines changed

2 files changed

+117
-1
lines changed

datalibrary/transform.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,118 @@
1+
<<<<<<< HEAD
2+
import pandas as pd
3+
import json
4+
5+
6+
# Survey columns kept by survey_data_transform; names are the pre-renaming
# ones (after dots have been replaced with underscores).
SURVEY_COLUMNS_SELECTION = [
    "assessment_status",
    "collection_method",
    "creator_user_id",
    "data_collector",
    "description",
    "end_date",
    "id",
    "metadata_created",
    "metadata_modified",
    "month",
    "name",
    "num_resources",
    "organization_id",
    "organization_title",
    "organization_description",
    "organization_created",
    "owner_org",
    "private",
    "progress_status",
    "start_date",
    "survey_attributes",
    "survey_category",
    "survey_type",
    "title",
    "year",
    "resources",
]
7+
8+
# Old survey column name -> new name, applied by survey_data_transform
# after the column selection.
SURVEY_COLUMNS_RENAMING = {
    "id": "survey_id",
    "organization_id": "container_id",
    "organization_title": "container_name",
    "organization_description": "container_description",
    "organization_created": "container_created",
    "data_collector": "organization",
    "owner_org": "parent_container_id",
    "title": "survey_title",
}
14+
15+
# Columns removed from the merged resource table at the end of transform().
RESOURCES_COLUMNS_TO_DROP = [
    "resources",
    "restricted",
    "cache_last_updated",
    "cache_url",
    "revision_id",
    "url_type",
    "state",
    "resource_type",
    "mimetype_inner",
    "hash",
    "package_id",
]
16+
17+
def clean_column_names(df):
    """Replace every literal dot in the column labels with an underscore.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the same object is returned so the call can be chained.

    Args:
        df: DataFrame whose column labels should be cleaned.

    Returns:
        The same DataFrame object, with the renamed columns.
    """
    # regex=False is spelled out on purpose: when the pattern is treated as a
    # regular expression, "." matches *any* character and every label would
    # collapse into a run of underscores. Being explicit keeps the result
    # independent of the pandas default for this argument.
    df.columns = df.columns.str.replace(".", "_", regex=False)
    return df
21+
22+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Turn a column of record collections into one output row per record.

    For each row of ``df`` the value in ``col`` is iterated; every item is
    merged into a fresh dict that starts with the row's ``_id`` value, so each
    output row carries the identifier of the row it came from. Keys of the
    item overwrite the identifier if they share its name.

    Args:
        df: frame to read from.
        col: name of the column holding the iterable of records.
        _id: name of the identifier column copied onto every record.

    Returns:
        A new DataFrame built from the merged records. Rows whose ``col``
        value is empty contribute nothing.
    """

    def _merged_records():
        for _, parent in df.iterrows():
            for item in parent[col]:
                # Identifier first, then the record's own fields on top.
                record = {_id: parent[_id]}
                record.update(item)
                yield record

    return pd.DataFrame(list(_merged_records()))
32+
33+
def flatten_resources(df):
    """Flatten the ``resources`` column into one row per resource.

    Thin wrapper around ``flatten_response`` keyed on ``survey_id``.

    Only the two columns that ``flatten_response`` actually reads are
    selected. The earlier version also demanded an ``organization_id``
    column, which is never used in the result and which no longer exists
    once the survey columns have been renamed (it becomes ``container_id``
    in the same step that creates ``survey_id``), so the call failed with a
    ``KeyError`` on renamed survey frames.

    Args:
        df: frame with a ``resources`` column (iterable of dict-like records
            per row) and a ``survey_id`` column.

    Returns:
        DataFrame with one row per resource, each carrying its ``survey_id``.
    """
    return flatten_response(df[["resources", "survey_id"]], "resources", "survey_id")
37+
38+
def normalize_restrictions(df):
    """Expand the ``restricted`` column into one column per restriction key.

    Every cell is decoded on its own instead of all-or-nothing:

    * JSON text (``str``/``bytes``/``bytearray``) is parsed with ``json.loads``;
    * missing cells (``None`` or NaN) become an empty mapping, which yields a
      row of missing values in the result;
    * anything else (e.g. an already-decoded dict) is kept as is.

    This replaces a blanket ``except TypeError: pass`` that skipped decoding
    for the whole column as soon as a single cell was not text.

    Side effect: ``df["restricted"]`` is overwritten with the decoded values.

    Args:
        df: DataFrame holding a ``restricted`` column.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` on the decoded column.

    Raises:
        KeyError: if ``df`` has no ``restricted`` column.
        json.JSONDecodeError: if a text cell is not valid JSON.
    """

    def _decode(cell):
        if isinstance(cell, (str, bytes, bytearray)):
            return json.loads(cell)
        if cell is None or (isinstance(cell, float) and pd.isna(cell)):
            return {}
        return cell

    df["restricted"] = df["restricted"].apply(_decode)
    return pd.json_normalize(df["restricted"])
47+
48+
def user_data_transform(df):
    """Return the users table without the columns this pipeline discards.

    Args:
        df: users DataFrame; it must contain every discarded column,
            otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns:
        A new DataFrame with the listed columns removed.
    """
    discarded = (
        "apikey",
        "display_name",
        "about",
        "state",
        "image_url",
        "image_display_url",
    )
    return df.drop(columns=list(discarded))
53+
54+
def survey_data_transform(df):
    """Clean, select and rename the survey columns.

    Steps, in order: column names are cleaned with ``clean_column_names``,
    the columns listed in ``SURVEY_COLUMNS_SELECTION`` are kept, and the
    result is renamed according to ``SURVEY_COLUMNS_RENAMING``.

    Args:
        df: raw surveys DataFrame.

    Returns:
        DataFrame restricted to the selected columns, with the new names.
    """
    cleaned = clean_column_names(df)
    selected = cleaned[SURVEY_COLUMNS_SELECTION]
    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
59+
60+
def member_data_transform(df):
    """Keep only user members and the columns describing their membership.

    The positional columns ``0``, ``1`` and ``2`` are renamed to ``user_id``,
    ``type`` and ``capacity``; rows whose ``type`` is not ``"user"`` are
    discarded.

    Args:
        df: members DataFrame with columns ``0``, ``1``, ``2`` and
            ``container_id``.

    Returns:
        DataFrame with the columns ``user_id``, ``capacity`` and
        ``container_id`` for the rows of type ``"user"``.
    """
    named = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    is_user = named["type"].isin(["user"])
    return named.loc[is_user, ["user_id", "capacity", "container_id"]]
65+
66+
67+
def transform(data: tuple) -> tuple:
    """
    Build the survey, resource, user and member tables from the raw frames.

    Args:
        data (tuple): ``(surveys, users, members)`` DataFrames, in that order.

    Returns:
        tuple: ``(surveys, full_resources, users, members)`` where

        * ``surveys`` is the output of ``survey_data_transform`` without its
          ``resources`` column;
        * ``full_resources`` has one row per resource, joined with its
          normalized restriction columns, minus
          ``RESOURCES_COLUMNS_TO_DROP``;
        * ``users`` is the output of ``user_data_transform``;
        * ``members`` is the output of ``member_data_transform``.
    """
    raw_surveys, raw_users, raw_members = data

    survey_table = survey_data_transform(raw_surveys)

    # One row per resource, keyed by the survey it belongs to.
    per_survey = survey_table[["resources", "container_id", "survey_id"]]
    per_resource = flatten_response(per_survey, "resources", "survey_id")

    # The nested resources are no longer needed on the survey table itself.
    survey_table = survey_table.drop(columns=["resources"])
    resource_table = per_survey.merge(per_resource, on="survey_id")

    # Expand the restriction details into their own columns, then tidy up.
    access = normalize_restrictions(resource_table)
    resource_table = (
        resource_table.join(access)
        .drop(columns=RESOURCES_COLUMNS_TO_DROP)
        .rename(
            columns={
                "restricted-level": "access_level",
                "restricted-allowed_users": "allowed_users",
            }
        )
    )

    return (
        survey_table,
        resource_table,
        user_data_transform(raw_users),
        member_data_transform(raw_members),
    )
113+
114+
if __name__ == "__main__":
115+
=======
1116
import pandas as pd
2117
import json
3118

@@ -111,4 +226,5 @@ def transform(data: tuple) -> tuple:
111226
return (surveys, full_resources, users, members)
112227

113228
if __name__ == "__main__":
229+
>>>>>>> master
114230
pass

run.bat

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ rem - conda to be in the PATH
66
rem - cmd.exe to be initialized with conda init
77

88
rem Define here the path to your conda installation
9-
set CONDAPATH=E:\Anaconda3
9+
set CONDAPATH=E:\ProgramData\Anaconda3
1010
rem Define here the name of the environment
1111
set ENVNAME=datalib
1212

0 commit comments

Comments
 (0)