|
1 | 1 | from dotenv import load_dotenv |
2 | 2 | import pandas as pd |
3 | 3 | import os |
4 | | -from etl.extract import DataLibrary |
5 | | -from etl.transform import process_data |
| 4 | +from etl.extract import DataLibrary, get_data |
| 5 | +from etl.transform import transform |
6 | 6 | from etl.load import load_to_db, save_to_excel |
7 | 7 |
|
8 | | - |
9 | 8 | load_dotenv() # take environment variables from .env. |
10 | 9 |
|
11 | | -def extract_data(client): |
12 | | - """ |
13 | | - Get data from DL api and returns two dataframes. |
14 | | -
|
15 | | - This function initializes a DataLibrary API instance, retrieves the survey list and user information from the API, and returns two dataframes - one containing all surveys with resources, and one containing user information. |
16 | | -
|
17 | | - Returns: |
18 | | - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the dataframes for surveys with resources and users. |
| 10 | +def main(): |
19 | 11 | """ |
| 12 | + Executes the ETL (Extract, Transform, Load) process: fetches data from VAM Data Library, transforms it, saves it to an Excel file, and loads it into a database. |
20 | 13 |
|
21 | | - # get survey list in format DATE_ISO3_SURVEYTYPE or DATEISO3SURVEYTYPE |
22 | | - survey_list = client.get_survey_list() |
23 | | - # # get total number survey present on Data Library |
24 | | - total_surveys = len(survey_list) |
| 14 | + There are three data points being loaded: |
| 15 | + - Survey Information |
| 16 | + - User Information |
| 17 | + - Survey Resources |
25 | 18 | |
26 | | - # get information on user |
27 | | - users = client.get_users() |
28 | | - # get total number of users with an account on Data Library |
29 | | - total_users = len(users) |
30 | | - users = pd.DataFrame(users) |
31 | | - |
32 | | - print(f"\n---\n There are {total_surveys + 1} surveys and {total_users + 1} active users in Data Library\n---\n ") |
33 | | - |
34 | | - # get all information about surveys |
35 | | - all_surveys_with_resources = client.get_surveys_with_resources(limit=total_surveys) |
36 | | - |
37 | | - all_surveys_with_resources = pd.json_normalize(all_surveys_with_resources) |
38 | | - |
39 | | - return (all_surveys_with_resources, users) |
40 | | - |
41 | | -def run_etl_process(): |
42 | | - raw_data = extract_data(DataLibrary(os.getenv("DATALIB_API_KEY"))) |
| 19 | + This function is the main entry point for the ETL pipeline. It performs the following steps: |
| 20 | + 1. Fetches data from Data Library using the `get_data` function and the `DataLibrary` class, which is configured using an environment variable. |
| 21 | + 2. Processes the fetched data using the `transform` function. |
| 22 | + 3. Saves the processed data to an Excel file using the `save_to_excel` function. |
| 23 | + 4. Loads the processed data into a database using the `load_to_db` function. |
| 24 | + """ |
| 25 | + dl_api_data = get_data(DataLibrary(os.getenv("DATALIB_API_KEY"))) |
43 | 26 | # Transform the fetched data, then save it to Excel and load it into the DB |
44 | | - processed_data = process_data(raw_data) |
| 27 | + processed_data = transform(dl_api_data) |
45 | 28 | save_to_excel(processed_data) |
46 | 29 | load_to_db(processed_data) |
47 | 30 | print("Done") |
48 | 31 |
|
49 | 32 | if __name__== "__main__": |
50 | | - run_etl_process() |
| 33 | + main() |
| 34 | + |
51 | 35 |
|
0 commit comments