merge master

AlexGherardelli · AlexGherardelli · commit c2a9c3966ae2 · 2026-02-12T14:52:35.000+01:00
diff --git a/datalibrary/transform.py b/datalibrary/transform.py
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 import pandas as pd
 import json
 
@@ -111,4 +112,119 @@ def transform(data: tuple) -> tuple:
     return (surveys, full_resources, users, members)
 
 if __name__ == "__main__":
+=======
+import pandas as pd
+import json
+
+
+# Columns selected from the cleaned survey table by survey_data_transform.
+SURVEY_COLUMNS_SELECTION = [
+    'assessment_status',
+    'collection_method',
+    'creator_user_id',
+    'data_collector',
+    'description',
+    'end_date',
+    'id',
+    'metadata_created',
+    'metadata_modified',
+    'month',
+    'name',
+    'num_resources',
+    'organization_id',
+    'organization_title',
+    'organization_description',
+    'organization_created',
+    'owner_org',
+    'private',
+    'progress_status',
+    'start_date',
+    'survey_attributes',
+    'survey_category',
+    'survey_type',
+    'title',
+    'year',
+    'resources',
+]
+
+# Old survey column name -> new column name, applied after the selection.
+SURVEY_COLUMNS_RENAMING = {
+    "id": "survey_id",
+    "organization_id": "container_id",
+    "organization_title": "container_name",
+    "organization_description": "container_description",
+    "organization_created": "container_created",
+    "data_collector": "organization",
+    "owner_org": "parent_container_id",
+    "title": "survey_title",
+}
+
+# Columns dropped from the merged resources table in transform().
+RESOURCES_COLUMNS_TO_DROP = [
+    "resources",
+    "restricted",
+    "cache_last_updated",
+    "cache_url",
+    "revision_id",
+    "url_type",
+    "state",
+    "resource_type",
+    "mimetype_inner",
+    "hash",
+    "package_id",
+]
+
+def clean_column_names(df):
+    """Replace every literal dot in the column labels with an underscore.
+
+    The labels are rewritten on ``df`` itself and the same frame is returned.
+    """
+    # Pass regex explicitly: its default differs between pandas 1.x and 2.x,
+    # and "." must always mean a literal dot, never the regex wildcard.
+    df.columns = df.columns.str.replace('.', '_', regex=False)
+    return df
+
+def flatten_response(df: pd.DataFrame, col: str, _id: str) -> pd.DataFrame:
+    """Expand a column of record lists into one output row per record.
+
+    Args:
+        df: Frame containing the ``_id`` column and the ``col`` column.
+        col: Name of the column whose cells are iterables of dicts.
+        _id: Name of the identifier column copied onto every output row.
+
+    Returns:
+        A new frame with one row per nested record: the parent's ``_id``
+        value followed by the record's own keys. Rows whose ``col`` cell is
+        ``None`` or a float (NaN) contribute nothing. When no record is
+        found, the result is empty but still carries the ``_id`` column so
+        it can be merged on that key.
+    """
+    flat_list = []
+    # Walk the two needed columns directly instead of building a Series per
+    # row with iterrows().
+    for parent_id, records in zip(df[_id], df[col]):
+        # Missing cells (None / NaN) are not iterable; they hold no records.
+        if records is None or isinstance(records, float):
+            continue
+        for record in records:
+            flat_dict = {_id: parent_id}
+            # The record is applied on top, so a record key equal to _id
+            # overrides the parent value.
+            flat_dict.update(record)
+            flat_list.append(flat_dict)
+
+    if not flat_list:
+        return pd.DataFrame(columns=[_id])
+    return pd.DataFrame(flat_list)
+
+def flatten_resources(df):
+    """Return one row per resource, each carrying its parent ``survey_id``.
+
+    Only the ``resources`` and ``survey_id`` columns of ``df`` are read.
+    """
+    # "organization_id" is deliberately not selected: flatten_response never
+    # reads it, and SURVEY_COLUMNS_RENAMING renames it to "container_id" in
+    # the same step that creates "survey_id", so requiring both always fails
+    # on frames produced by this module.
+    resources = df[["resources", "survey_id"]]
+    return flatten_response(resources, "resources", "survey_id")
+
+def normalize_restrictions(df):
+    """Expand the ``restricted`` column into one column per key.
+
+    String cells are decoded with ``json.loads``; cells of any other type
+    are kept as they are. The decoded values are written back to
+    ``df['restricted']`` before normalizing.
+
+    Returns:
+        The frame built by ``pd.json_normalize`` from the ``restricted``
+        column.
+
+    Raises:
+        json.JSONDecodeError: If a string cell is not valid JSON.
+    """
+    def _decode(cell):
+        # Decode cell by cell, so one already-parsed value no longer leaves
+        # every other JSON string in the column undecoded.
+        if isinstance(cell, (str, bytes, bytearray)):
+            return json.loads(cell)
+        return cell
+
+    df['restricted'] = df['restricted'].apply(_decode)
+    return pd.json_normalize(df["restricted"])
+
+def user_data_transform(df):
+    """Return the users table without the columns that are not kept.
+
+    The input frame is left untouched; every listed column must be present,
+    since ``drop`` raises otherwise.
+    """
+    unwanted = ["display_name", "about", "state", "image_url", "image_display_url"]
+    trimmed = df.drop(unwanted, axis="columns")
+    return trimmed
+
+def survey_data_transform(df):
+    """Clean the survey column labels, keep the selected columns, rename them.
+
+    The label cleaning is applied to ``df`` itself by clean_column_names;
+    the selection and renaming produce a new frame, which is returned.
+    """
+    cleaned = clean_column_names(df)
+    selected = cleaned[SURVEY_COLUMNS_SELECTION]
+    return selected.rename(columns=SURVEY_COLUMNS_RENAMING)
+
+def member_data_transform(df):
+    """Keep only the rows of type "user" from the members table.
+
+    The positional columns 0, 1 and 2 are named ``user_id``, ``type`` and
+    ``capacity``; ``container_id`` must already be present. The result has
+    the columns ``user_id``, ``capacity`` and ``container_id``.
+    """
+    named = df.rename(columns={0: "user_id", 1: "type", 2: "capacity"})
+    is_user = named["type"].isin(["user"])
+    return named.loc[is_user, ["user_id", "capacity", "container_id"]]
+
+
+def transform(data: tuple) -> tuple:
+    """
+    Build the surveys, resources, users and members tables from the raw data.
+
+    Args:
+    data (tuple): The three raw frames, in the order (surveys, users, members).
+
+    Returns:
+    tuple: The transformed frames, in the order
+    (surveys, full_resources, users, members).
+    """
+    raw_surveys, raw_users, raw_members = data
+
+    surveys = survey_data_transform(raw_surveys)
+
+    # One row per resource, keyed by the survey it belongs to
+    resources = surveys[["resources", "container_id", "survey_id"]]
+    flat_resources = flatten_response(resources, "resources", "survey_id")
+
+    # The nested resources column is no longer needed on the survey table
+    surveys = surveys.drop(columns=["resources"])
+    full_resources = pd.merge(resources, flat_resources, on="survey_id")
+
+    # Expand the restricted column and attach the result by row index
+    restricted = normalize_restrictions(full_resources)
+    full_resources = full_resources.join(restricted)
+
+    # Drop the resource columns that are not kept, then rename the rest
+    full_resources = full_resources.drop(columns=RESOURCES_COLUMNS_TO_DROP)
+    full_resources = full_resources.rename(
+        columns={
+            "restricted-level": "access_level",
+            'restricted-allowed_users': "allowed_users",
+        }
+    )
+
+    # Users first, then members, as separate steps
+    users = user_data_transform(raw_users)
+    members = member_data_transform(raw_members)
+
+    return (surveys, full_resources, users, members)
+
+
+if __name__ == "__main__":
+>>>>>>> master
     pass
diff --git a/run.bat b/run.bat
@@ -1,33 +1,33 @@
-@echo OFF
-rem How to run a Python script in a given conda environment from a batch file.
-
-rem It doesn't require:
-rem - conda to be in the PATH
-rem - cmd.exe to be initialized with conda init
-
-rem Define here the path to your conda installation
-set CONDAPATH=E:\ProgramData\Anaconda3
-rem Define here the name of the environment
-set ENVNAME=datalib
-
-rem The following command activates the base environment.
-rem call C:\ProgramData\Miniconda3\Scripts\activate.bat C:\ProgramData\Miniconda3
-if %ENVNAME%==base (set ENVPATH=%CONDAPATH%) else (set ENVPATH=%CONDAPATH%\envs\%ENVNAME%)
-
-rem Activate the conda environment
-rem Using call is required here, see: https://stackoverflow.com/questions/24678144/conda-environments-and-bat-files
-call %CONDAPATH%\Scripts\activate.bat %ENVPATH%
-
-rem Run a python script in that environment
-python main.py --db
-
-rem Deactivate the environment
-call conda deactivate
-
-rem If conda is directly available from the command line then the following code works.
-rem call activate someenv
-rem python script.py
-rem conda deactivate
-
-rem One could also use the conda run command
-rem conda run -n someenv python script.py
+@echo OFF
+rem Run main.py inside a given conda environment from a batch file.
+
+rem It doesn't require:
+rem - conda to be in the PATH
+rem - cmd.exe to be initialized with conda init
+
+rem Define here the path to your conda installation
+rem (the quoted form keeps trailing spaces out of the value)
+set "CONDAPATH=E:\ProgramData\Anaconda3"
+rem Define here the name of the environment
+set "ENVNAME=datalib"
+
+rem The base environment is the installation root itself; any other
+rem environment is a folder under envs\.
+rem Example for base: call C:\ProgramData\Miniconda3\Scripts\activate.bat C:\ProgramData\Miniconda3
+if "%ENVNAME%"=="base" (set "ENVPATH=%CONDAPATH%") else (set "ENVPATH=%CONDAPATH%\envs\%ENVNAME%")
+
+rem Activate the conda environment
+rem Using call is required here, see: https://stackoverflow.com/questions/24678144/conda-environments-and-bat-files
+rem Both paths are quoted so that locations containing spaces still work.
+call "%CONDAPATH%\Scripts\activate.bat" "%ENVPATH%"
+
+rem Run a python script in that environment
+python main.py --db
+
+rem Deactivate the environment
+call conda deactivate
+
+rem If conda is directly available from the command line then the following code works.
+rem call activate someenv
+rem python script.py
+rem conda deactivate
+
+rem One could also use the conda run command
+rem conda run -n someenv python script.py