"""Transformations that turn raw survey, user and member DataFrames into cleaned tables."""
2+ import pandas as pd
3+ import json
4+
5+
# Columns kept from the survey frame, in output order. The names are the
# ones present after dots in the raw labels have been replaced by underscores.
SURVEY_COLUMNS_SELECTION = [
    "assessment_status",
    "collection_method",
    "creator_user_id",
    "data_collector",
    "description",
    "end_date",
    "id",
    "metadata_created",
    "metadata_modified",
    "month",
    "name",
    "num_resources",
    "organization_id",
    "organization_title",
    "organization_description",
    "organization_created",
    "owner_org",
    "private",
    "progress_status",
    "start_date",
    "survey_attributes",
    "survey_category",
    "survey_type",
    "title",
    "year",
    "resources",
]
7+
# Mapping from selected survey column names to the names used in the output
# survey table.
SURVEY_COLUMNS_RENAMING = {
    "id": "survey_id",
    "organization_id": "container_id",
    "organization_title": "container_name",
    "organization_description": "container_description",
    "organization_created": "container_created",
    "data_collector": "organization",
    "owner_org": "parent_container_id",
    "title": "survey_title",
}
14+
# Columns removed from the flattened resource table before it is returned.
RESOURCES_COLUMNS_TO_DROP = [
    "resources",
    "restricted",
    "cache_last_updated",
    "cache_url",
    "revision_id",
    "url_type",
    "state",
    "resource_type",
    "mimetype_inner",
    "hash",
    "package_id",
]
16+
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every literal dot in the column labels with an underscore.

    The labels are rewritten on ``df`` itself (no copy is made) and the same
    frame is returned so the result can be re-assigned or chained.

    Args:
        df: Frame whose string column labels may contain dots.

    Returns:
        The same ``df`` object, with its column labels updated.
    """
    # regex=False pins literal matching: read as a regular expression, "."
    # matches any character, and the default of ``regex`` has differed
    # between pandas major versions.
    df.columns = df.columns.str.replace(".", "_", regex=False)
    return df
21+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Explode a column of record lists into one row per record.

    Each cell of ``df[col]`` is iterated, and every item found there becomes
    one output row carrying the item's own fields plus the ``_id`` value of
    the row it came from. Rows whose cell is an empty list contribute nothing.

    Args:
        df: Frame containing both ``col`` and ``_id``.
        col: Name of the column whose cells hold iterables of dict-like
            records.
        _id: Name of the identifier column copied onto every output row.

    Returns:
        A new frame with one row per record. If a record itself has a key
        equal to ``_id``, the record's value wins over the parent row's.
    """
    flat_list = []
    # Only two columns are needed, so walk them directly instead of paying
    # for a full per-row Series with iterrows().
    for row_id, records in zip(df[_id], df[col]):
        for record in records:
            flat_dict = {_id: row_id}
            # update() runs after the id is set, so record keys take priority.
            flat_dict.update(record)
            flat_list.append(flat_dict)

    return pd.DataFrame(flat_list)
32+
def flatten_resources(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per resource, each tagged with its ``survey_id``.

    Args:
        df: Frame with a ``resources`` column (lists of resource records)
            and a ``survey_id`` column.

    Returns:
        The frame produced by ``flatten_response`` for the ``resources``
        column keyed by ``survey_id``.
    """
    # Select only the columns flatten_response actually reads. The previous
    # selection also demanded "organization_id", which is never used and is
    # renamed to "container_id" by the same mapping that creates "survey_id",
    # so the lookup raised KeyError on renamed survey frames.
    resources = df[["resources", "survey_id"]]
    return flatten_response(resources, "resources", "survey_id")
37+
def normalize_restrictions(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the ``restricted`` column into a frame with one column per key.

    Values stored as JSON text are decoded first; values that are already
    decoded (or missing) are left as they are. The decoded values are written
    back to ``df["restricted"]``, so the caller's frame is updated as well.

    Args:
        df: Frame containing a ``restricted`` column.

    Returns:
        The result of ``pd.json_normalize`` applied to the decoded column.

    Raises:
        KeyError: If ``df`` has no ``restricted`` column.
        json.JSONDecodeError: If a text value is not valid JSON.
    """
    # Decode value by value. Wrapping the whole column in a single
    # try/except TypeError left every JSON string undecoded as soon as one
    # value was already a dict or was missing.
    df["restricted"] = df["restricted"].apply(
        lambda value: json.loads(value)
        if isinstance(value, (str, bytes, bytearray))
        else value
    )

    return pd.json_normalize(df["restricted"])
47+
def user_data_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Return the users frame without the columns that are not exported.

    Args:
        df: Users frame; it must contain every column listed below.

    Returns:
        A new frame without those columns. The input frame is not modified.

    Raises:
        KeyError: If any of the listed columns is absent from ``df``.
    """
    # Remove unnecessary columns from users table
    users_columns_to_drop = [
        "apikey",
        "display_name",
        "about",
        "state",
        "image_url",
        "image_display_url",
    ]
    return df.drop(columns=users_columns_to_drop)
53+
def survey_data_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Clean, select and rename the survey columns.

    The column labels are first cleaned with ``clean_column_names``, then the
    frame is reduced to ``SURVEY_COLUMNS_SELECTION`` and finally renamed
    according to ``SURVEY_COLUMNS_RENAMING``.

    Args:
        df: Raw survey frame.

    Returns:
        A frame holding only the selected columns under their output names.

    Raises:
        KeyError: If a column in ``SURVEY_COLUMNS_SELECTION`` is missing
            after the labels have been cleaned.
    """
    df = clean_column_names(df)
    df = df[SURVEY_COLUMNS_SELECTION]
    return df.rename(columns=SURVEY_COLUMNS_RENAMING)
59+
def member_data_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only user memberships and name their columns.

    The columns labelled ``0``, ``1`` and ``2`` are renamed to ``user_id``,
    ``type`` and ``capacity``; rows whose ``type`` is not ``"user"`` are
    discarded.

    Args:
        df: Members frame with columns ``0``, ``1``, ``2`` and an existing
            ``container_id`` column.

    Returns:
        A frame with the columns ``user_id``, ``capacity`` and
        ``container_id``, restricted to rows of type ``"user"``.
    """
    df = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    # Bracket access: "type" as an attribute only works while DataFrame has
    # no attribute of that name, and reads like the builtin.
    df = df[df["type"].isin(["user"])]
    return df[["user_id", "capacity", "container_id"]]
65+
66+
def transform(data: tuple) -> tuple:
    """Turn the raw survey, user and member frames into four output tables.

    Args:
        data (tuple): A 3-tuple ``(surveys, users, members)`` of DataFrames.

    Returns:
        tuple: A 4-tuple ``(surveys, full_resources, users, members)``:
        the transformed surveys without their ``resources`` column, one row
        per survey resource with the restriction fields expanded, the users
        table and the members table.
    """
    surveys, users, members = data

    surveys = survey_data_transform(surveys)

    # Flatten resources: one row per resource, keyed by survey_id.
    resources = surveys[["resources", "container_id", "survey_id"]]
    flat_resources = flatten_response(resources, "resources", "survey_id")

    # Remove unnecessary columns from survey table
    surveys.drop(columns=["resources"], inplace=True)
    # Merging back on survey_id attaches container_id to every resource row.
    full_resources = pd.merge(resources, flat_resources, on="survey_id")

    # Normalize restricted datasets; join() aligns the expanded columns with
    # the resource rows by index.
    restricted = normalize_restrictions(full_resources)
    full_resources = full_resources.join(restricted)

    # Remove unnecessary columns from resource table
    full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP, inplace=True)

    # Rename resource columns
    resources_cols_renaming = {
        "restricted-level": "access_level",
        "restricted-allowed_users": "allowed_users",
    }
    full_resources = full_resources.rename(columns=resources_cols_renaming)

    # User table transformations
    users = user_data_transform(users)

    # Member DF
    members = member_data_transform(members)

    return (surveys, full_resources, users, members)
113+
if __name__ == "__main__":
    # Running the module directly does nothing.
    pass