Skip to content

Commit c2a9c39

Browse files
merge master
2 parents 434fa01 + f292a7a commit c2a9c39

2 files changed

Lines changed: 149 additions & 33 deletions

File tree

datalibrary/transform.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<<<<<<< HEAD
12
import pandas as pd
23
import json
34

@@ -111,4 +112,119 @@ def transform(data: tuple) -> tuple:
111112
return (surveys, full_resources, users, members)
112113

113114
if __name__ == "__main__":
115+
=======
116+
import pandas as pd
117+
import json
118+
119+
120+
# Columns that survey_data_transform keeps from the survey frame, in this order.
SURVEY_COLUMNS_SELECTION = [
    "assessment_status",
    "collection_method",
    "creator_user_id",
    "data_collector",
    "description",
    "end_date",
    "id",
    "metadata_created",
    "metadata_modified",
    "month",
    "name",
    "num_resources",
    "organization_id",
    "organization_title",
    "organization_description",
    "organization_created",
    "owner_org",
    "private",
    "progress_status",
    "start_date",
    "survey_attributes",
    "survey_category",
    "survey_type",
    "title",
    "year",
    "resources",
]

# Old column name -> new column name, applied to the selected survey columns.
SURVEY_COLUMNS_RENAMING = {
    "id": "survey_id",
    "organization_id": "container_id",
    "organization_title": "container_name",
    "organization_description": "container_description",
    "organization_created": "container_created",
    "data_collector": "organization",
    "owner_org": "parent_container_id",
    "title": "survey_title",
}

# Columns removed from the merged resource table at the end of transform().
RESOURCES_COLUMNS_TO_DROP = [
    "resources",
    "restricted",
    "cache_last_updated",
    "cache_url",
    "revision_id",
    "url_type",
    "state",
    "resource_type",
    "mimetype_inner",
    "hash",
    "package_id",
]
130+
131+
def clean_column_names(df):
    """Replace every literal "." in the column labels with "_".

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object; its columns are reassigned in place.
    """
    # regex=False is spelled out on purpose: the default of `regex` differs
    # between pandas versions, and as a regular expression "." matches every
    # character, which would turn each label into a run of underscores.
    df.columns = df.columns.str.replace(".", "_", regex=False)
    return df
135+
136+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Expand a column of record collections into one row per record.

    Each cell of ``df[col]`` is iterated; every record found there becomes
    one output row holding the source row's ``_id`` value plus the record's
    own key/value pairs. A record key equal to ``_id`` overrides the id taken
    from the source row.

    Args:
        df: Source frame containing the ``col`` and ``_id`` columns.
        col: Name of the column whose cells hold iterables of records.
        _id: Name of the identifier column copied onto every output row.

    Returns:
        A new DataFrame with one row per record. When there are no records
        at all, an empty frame that still has the ``_id`` column, so it can
        be merged on that key.
    """
    flat_rows = []
    for row_id, records in zip(df[_id], df[col]):
        # A missing cell (None or float NaN) means "no records" and is
        # skipped; iterating it would raise TypeError.
        if records is None or isinstance(records, float):
            continue
        for record in records:
            flat_row = {_id: row_id}
            flat_row.update(record)
            flat_rows.append(flat_row)

    if not flat_rows:
        return pd.DataFrame(columns=[_id])
    return pd.DataFrame(flat_rows)
146+
147+
def flatten_resources(df):
    """Expand the "resources" column into one row per resource.

    Only the "resources" and "survey_id" columns are read. No organization
    or container column is selected, so the function works whether or not
    the frame has had "organization_id" renamed.

    Args:
        df: Frame with a "resources" column and a "survey_id" column.

    Returns:
        The frame produced by flatten_response for those two columns.
    """
    resources = df[["resources", "survey_id"]]
    return flatten_response(resources, "resources", "survey_id")
151+
152+
def normalize_restrictions(df):
    """Decode the "restricted" column and flatten it into its own frame.

    Values that are JSON text are decoded with json.loads; values of any
    other type are kept as they are. The decoded values are written back to
    ``df["restricted"]``, so the caller's frame is modified.

    Args:
        df: Frame with a "restricted" column.

    Returns:
        The result of pd.json_normalize on the decoded column.

    Raises:
        json.JSONDecodeError: If a text value is not valid JSON.
    """

    def _decode(value):
        # Decide per element: decoding the whole column in one try/except
        # would abandon every value as soon as one is already decoded.
        if isinstance(value, (str, bytes, bytearray)):
            return json.loads(value)
        return value

    df["restricted"] = df["restricted"].apply(_decode)
    return pd.json_normalize(df["restricted"])
161+
162+
def user_data_transform(df):
    """Return the users frame without the columns that are not needed.

    The columns "display_name", "about", "state", "image_url" and
    "image_display_url" are removed. The input frame is not modified.
    """
    unwanted = ["display_name", "about", "state", "image_url", "image_display_url"]
    return df.drop(labels=unwanted, axis="columns")
167+
168+
def survey_data_transform(df):
    """Clean, select and rename the survey columns.

    Runs clean_column_names on the frame, keeps the columns listed in
    SURVEY_COLUMNS_SELECTION, then renames them with SURVEY_COLUMNS_RENAMING.
    """
    cleaned = clean_column_names(df)
    selected = cleaned[SURVEY_COLUMNS_SELECTION]
    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
173+
174+
def member_data_transform(df):
    """Keep the user-type members and their capacity per container.

    Columns 0, 1 and 2 are renamed to "user_id", "type" and "capacity".
    Only rows whose type is "user" are kept, and the result holds the
    "user_id", "capacity" and "container_id" columns.
    """
    labelled = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    is_user = labelled.type.isin(["user"])
    return labelled.loc[is_user, ["user_id", "capacity", "container_id"]]
179+
180+
181+
def transform(data: tuple) -> tuple:
    """
    Build the survey, resource, user and member tables from the raw frames.

    Args:
        data (tuple): Unpacked as (surveys, users, members).

    Returns:
        tuple: (surveys, full_resources, users, members) after transformation.
    """
    raw_surveys, raw_users, raw_members = data

    surveys = survey_data_transform(raw_surveys)

    # One row per resource, keyed by survey_id
    survey_resources = surveys[["resources", "container_id", "survey_id"]]
    exploded = flatten_response(survey_resources, "resources", "survey_id")

    # The survey table no longer needs the nested resources
    surveys = surveys.drop(columns=["resources"])
    full_resources = survey_resources.merge(exploded, on="survey_id")

    # Flatten the restriction details and attach them as extra columns
    restrictions = normalize_restrictions(full_resources)
    full_resources = full_resources.join(restrictions)

    # Drop the resource columns that are not needed, then rename the rest
    full_resources = full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP)
    full_resources = full_resources.rename(
        columns={
            "restricted-level": "access_level",
            "restricted-allowed_users": "allowed_users",
        }
    )

    # User and member tables
    users = user_data_transform(raw_users)
    members = member_data_transform(raw_members)

    return (surveys, full_resources, users, members)
227+
228+
if __name__ == "__main__":
229+
>>>>>>> master
114230
pass

run.bat

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,33 @@
1-
@echo OFF
2-
rem How to run a Python script in a given conda environment from a batch file.
3-
4-
rem It doesn't require:
5-
rem - conda to be in the PATH
6-
rem - cmd.exe to be initialized with conda init
7-
8-
rem Define here the path to your conda installation
9-
set CONDAPATH=E:\ProgramData\Anaconda3
10-
rem Define here the name of the environment
11-
set ENVNAME=datalib
12-
13-
rem The following command activates the base environment.
14-
rem call C:\ProgramData\Miniconda3\Scripts\activate.bat C:\ProgramData\Miniconda3
15-
if %ENVNAME%==base (set ENVPATH=%CONDAPATH%) else (set ENVPATH=%CONDAPATH%\envs\%ENVNAME%)
16-
17-
rem Activate the conda environment
18-
rem Using call is required here, see: https://stackoverflow.com/questions/24678144/conda-environments-and-bat-files
19-
call %CONDAPATH%\Scripts\activate.bat %ENVPATH%
20-
21-
rem Run a python script in that environment
22-
python main.py --db
23-
24-
rem Deactivate the environment
25-
call conda deactivate
26-
27-
rem If conda is directly available from the command line then the following code works.
28-
rem call activate someenv
29-
rem python script.py
30-
rem conda deactivate
31-
32-
rem One could also use the conda run command
33-
rem conda run -n someenv python script.py
1+
@echo OFF
rem How to run a Python script in a given conda environment from a batch file.

rem It doesn't require:
rem - conda to be in the PATH
rem - cmd.exe to be initialized with conda init

rem Define here the path to your conda installation.
rem The set "NAME=value" form keeps spaces and parentheses in the path intact.
set "CONDAPATH=E:\ProgramData\Anaconda3"
rem Define here the name of the environment
set "ENVNAME=datalib"

rem The following command activates the base environment.
rem call C:\ProgramData\Miniconda3\Scripts\activate.bat C:\ProgramData\Miniconda3
rem The base environment is the installation folder itself; any other
rem environment is a folder under envs.
if "%ENVNAME%"=="base" (set "ENVPATH=%CONDAPATH%") else (set "ENVPATH=%CONDAPATH%\envs\%ENVNAME%")

rem Activate the conda environment
rem Using call is required here, see: https://stackoverflow.com/questions/24678144/conda-environments-and-bat-files
rem Both paths are quoted so that a path containing spaces stays one argument.
call "%CONDAPATH%\Scripts\activate.bat" "%ENVPATH%"

rem Run a python script in that environment
python main.py --db

rem Deactivate the environment
call conda deactivate

rem If conda is directly available from the command line then the following code works.
rem call activate someenv
rem python script.py
rem conda deactivate

rem One could also use the conda run command
rem conda run -n someenv python script.py

0 commit comments

Comments
 (0)