1+ < << << << HEAD
12import pandas as pd
23import json
34
@@ -111,4 +112,119 @@ def transform(data: tuple) -> tuple:
111112 return (surveys , full_resources , users , members )
112113
113114if __name__ == "__main__" :
115+ == == == =
116+ import pandas as pd
117+ import json
118+
119+
# Columns kept from the survey frame, in output order.
SURVEY_COLUMNS_SELECTION = [
    "assessment_status",
    "collection_method",
    "creator_user_id",
    "data_collector",
    "description",
    "end_date",
    "id",
    "metadata_created",
    "metadata_modified",
    "month",
    "name",
    "num_resources",
    "organization_id",
    "organization_title",
    "organization_description",
    "organization_created",
    "owner_org",
    "private",
    "progress_status",
    "start_date",
    "survey_attributes",
    "survey_category",
    "survey_type",
    "title",
    "year",
    "resources",
]

# Old column name -> new column name, applied to the selected survey columns.
SURVEY_COLUMNS_RENAMING = {
    "id": "survey_id",
    "organization_id": "container_id",
    "organization_title": "container_name",
    "organization_description": "container_description",
    "organization_created": "container_created",
    "data_collector": "organization",
    "owner_org": "parent_container_id",
    "title": "survey_title",
}

# Columns removed from the flattened resource table.
RESOURCES_COLUMNS_TO_DROP = [
    "resources",
    "restricted",
    "cache_last_updated",
    "cache_url",
    "revision_id",
    "url_type",
    "state",
    "resource_type",
    "mimetype_inner",
    "hash",
    "package_id",
]
130+
def clean_column_names(df):
    """Replace every literal dot in the column labels with an underscore.

    The column index of ``df`` is reassigned, so the caller's frame is
    modified; the same object is returned for convenience.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with dots in its labels replaced by "_".
    """
    # regex=False pins "." to its literal meaning. The default of ``regex``
    # changed between pandas 1.x and 2.x, so relying on it is version-fragile
    # (and emits a FutureWarning on older releases).
    df.columns = df.columns.str.replace(".", "_", regex=False)
    return df
135+
def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
    """Explode a column of record lists into one output row per record.

    Each cell of ``df[col]`` is iterated; every element (a mapping) becomes
    one output row that also carries the parent row's ``_id`` value. If an
    element itself contains a key equal to ``_id``, the element's value wins.

    Args:
        df: Frame holding the identifier column and the nested column.
        col: Name of the column whose cells are iterables of mappings.
        _id: Name of the identifier column copied onto every output row.

    Returns:
        A new DataFrame with one row per nested record. When there is nothing
        to flatten, the result is empty but still has the ``_id`` column so
        that callers can merge on it.
    """
    records = []
    # Only two columns are read, so zip over them rather than materialising
    # a Series per row with iterrows().
    for key, items in zip(df[_id], df[col]):
        # A scalar missing value (None / NaN) means "no nested records";
        # skip it instead of failing when trying to iterate it.
        if not pd.api.types.is_list_like(items) and pd.isna(items):
            continue
        for item in items:
            record = {_id: key}
            record.update(item)
            records.append(record)

    if not records:
        return pd.DataFrame(columns=[_id])
    return pd.DataFrame(records)
146+
def flatten_resources(df):
    """Return one row per resource, each tagged with its ``survey_id``.

    Args:
        df: Frame with a ``resources`` column (iterables of mappings) and a
            ``survey_id`` column.

    Returns:
        The DataFrame produced by ``flatten_response`` for those two columns.
    """
    # Only the nested column and the id are read by flatten_response.
    # "organization_id" used to be selected here as well, but
    # SURVEY_COLUMNS_RENAMING renames it to "container_id" in the same step
    # that creates "survey_id", so that selection raised KeyError on
    # transformed survey frames.
    resources = df[["resources", "survey_id"]]
    return flatten_response(resources, "resources", "survey_id")
151+
def normalize_restrictions(df):
    """Expand the ``restricted`` column into one column per key.

    Each cell may be a JSON string or an already-decoded mapping. Cells are
    decoded individually, so a frame mixing both forms is handled; missing
    cells (None / NaN) are treated as an empty mapping and yield a row of
    missing values.

    The decoded values are written back to ``df["restricted"]``, so the
    caller's frame is modified.

    Args:
        df: Frame with a ``restricted`` column.

    Returns:
        The DataFrame built by ``pd.json_normalize`` from the decoded column.

    Raises:
        json.JSONDecodeError: If a string cell is not valid JSON.
    """

    def _decode(value):
        # Decode per cell: wrapping the whole apply() in one try/except left
        # every string undecoded as soon as a single cell was not a string.
        if isinstance(value, (str, bytes, bytearray)):
            value = json.loads(value)
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return {}
        return value

    df["restricted"] = df["restricted"].apply(_decode)
    return pd.json_normalize(df["restricted"])
161+
def user_data_transform(df):
    """Return the users frame without a fixed set of unneeded columns.

    The input frame is left untouched; ``DataFrame.drop`` returns a new
    frame. All listed columns must be present, otherwise pandas raises
    ``KeyError``.
    """
    return df.drop(
        columns=["display_name", "about", "state", "image_url", "image_display_url"]
    )
167+
def survey_data_transform(df):
    """Clean, select and rename the survey columns.

    Column labels are first passed through ``clean_column_names``, then the
    frame is narrowed to ``SURVEY_COLUMNS_SELECTION`` and relabelled using
    ``SURVEY_COLUMNS_RENAMING``.
    """
    cleaned = clean_column_names(df)
    selected = cleaned[SURVEY_COLUMNS_SELECTION]
    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
173+
def member_data_transform(df):
    """Keep only the "user" rows of the members frame.

    The positional columns 0, 1 and 2 are labelled ``user_id``, ``type`` and
    ``capacity``; rows whose ``type`` is "user" are kept, and the result is
    narrowed to ``user_id``, ``capacity`` and ``container_id``.
    """
    labelled = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
    is_user = labelled["type"].isin(["user"])
    return labelled.loc[is_user, ["user_id", "capacity", "container_id"]]
179+
180+
def transform(data: tuple) -> tuple:
    """
    Reshape the surveys, users and members frames into four output tables.

    Args:
        data (tuple): Three DataFrames, in the order (surveys, users, members).

    Returns:
        tuple: (surveys, full_resources, users, members), where surveys no
        longer carries the nested "resources" column and full_resources
        holds one row per resource keyed by "survey_id".
    """
    raw_surveys, raw_users, raw_members = data

    surveys = survey_data_transform(raw_surveys)

    # Explode the nested resource lists into one row per resource
    resource_source = surveys[["resources", "container_id", "survey_id"]]
    exploded = flatten_response(resource_source, "resources", "survey_id")

    # The nested list is no longer needed on the survey table
    surveys = surveys.drop(columns=["resources"])
    full_resources = pd.merge(resource_source, exploded, on="survey_id")

    # Expand the "restricted" field into its own columns, joined by row index
    restriction_columns = normalize_restrictions(full_resources)
    full_resources = full_resources.join(restriction_columns)

    # Trim the resource table, then give the restriction columns their final names
    full_resources = full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP)
    full_resources = full_resources.rename(
        columns={
            "restricted-level": "access_level",
            "restricted-allowed_users": "allowed_users",
        }
    )

    # Users and members each have their own dedicated transform
    users = user_data_transform(raw_users)
    members = member_data_transform(raw_members)

    return (surveys, full_resources, users, members)
227+
228+ if __name__ == "__main__" :
229+ > >> >> >> master
114230 pass
0 commit comments