diff --git a/Makefile b/Makefile index a2d8bca5e49..8894496729b 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,8 @@ test-python-integration-local: python -m pytest -n 8 --integration \ -k "not gcs_registry and \ not s3_registry and \ - not test_lambda_materialization" \ + not test_lambda_materialization and \ + not test_snowflake" \ sdk/python/tests \ ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; @@ -113,7 +114,8 @@ test-python-universal-spark: not test_push_features_to_offline_store.py and \ not gcs_registry and \ not s3_registry and \ - not test_universal_types" \ + not test_universal_types and \ + not test_snowflake" \ sdk/python/tests test-python-universal-trino: @@ -136,9 +138,27 @@ test-python-universal-trino: not test_push_features_to_offline_store.py and \ not gcs_registry and \ not s3_registry and \ - not test_universal_types" \ + not test_universal_types and \ + not test_snowflake" \ sdk/python/tests + +# Note: to use this, you'll need to have Microsoft ODBC 17 installed. +# See https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15#17 +test-python-universal-mssql: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.mssql_repo_configuration \ + PYTEST_PLUGINS=feast.infra.offline_stores.contrib.mssql_offline_store.tests \ + FEAST_USAGE=False IS_TEST=True \ + FEAST_LOCAL_ONLINE_CONTAINER=True \ + python -m pytest -n 8 --integration \ + -k "not gcs_registry and \ + not s3_registry and \ + not test_lambda_materialization and \ + not test_snowflake" \ + sdk/python/tests + + #To use Athena as an offline store, you need to create an Athena database and an S3 bucket on AWS. 
https://docs.aws.amazon.com/athena/latest/ug/getting-started.html #Modify environment variables ATHENA_DATA_SOURCE, ATHENA_DATABASE, ATHENA_S3_BUCKET_NAME if you want to change the data source, database, and bucket name of S3 to use. #If tests fail with the pytest -n 8 option, change the number to 1. @@ -161,7 +181,8 @@ test-python-universal-athena: not test_historical_features_persisting and \ not test_historical_retrieval_fails_on_validation and \ not gcs_registry and \ - not s3_registry" \ + not s3_registry and \ + not test_snowflake" \ sdk/python/tests test-python-universal-postgres-offline: @@ -203,7 +224,8 @@ test-python-universal-postgres-online: not test_push_features_to_offline_store and \ not gcs_registry and \ not s3_registry and \ - not test_universal_types" \ + not test_universal_types and \ + not test_snowflake" \ sdk/python/tests test-python-universal-cassandra: @@ -230,7 +252,8 @@ test-python-universal-cassandra-no-cloud-providers: not test_apply_data_source_integration and \ not test_nullable_online_store and \ not gcs_registry and \ - not s3_registry" \ + not s3_registry and \ + not test_snowflake" \ sdk/python/tests test-python-universal: diff --git a/README.md b/README.md index 9616c91e8c4..b663533710b 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) - * [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL source (contrib plugin)](https://docs.feast.dev/reference/data-sources/mssql) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib 
plugin)](https://docs.feast.dev/reference/data-sources/spark) @@ -161,7 +161,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) - * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL (contrib plugin)](https://docs.feast.dev/reference/offline-stores/mssql.md) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/offline-stores/postgres) * [x] [Trino (contrib plugin)](https://github.com/Shopify/feast-trino) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index bdfe9555dd9..4330bc25647 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -71,6 +71,7 @@ * [Spark (contrib)](reference/data-sources/spark.md) * [PostgreSQL (contrib)](reference/data-sources/postgres.md) * [Trino (contrib)](reference/data-sources/trino.md) + * [Azure Synapse + Azure SQL (contrib)](reference/data-sources/mssql.md) * [Offline stores](reference/offline-stores/README.md) * [Overview](reference/offline-stores/overview.md) * [File](reference/offline-stores/file.md) @@ -80,6 +81,7 @@ * [Spark (contrib)](reference/offline-stores/spark.md) * [PostgreSQL (contrib)](reference/offline-stores/postgres.md) * [Trino (contrib)](reference/offline-stores/trino.md) + * [Azure Synapse + Azure SQL (contrib)](reference/offline-stores/mssql.md) * [Online stores](reference/online-stores/README.md) * [SQLite](reference/online-stores/sqlite.md) * [Snowflake](reference/online-stores/snowflake.md) @@ -87,10 +89,12 @@ * [Datastore](reference/online-stores/datastore.md) * [DynamoDB](reference/online-stores/dynamodb.md) * [PostgreSQL (contrib)](reference/online-stores/postgres.md) + * [Cassandra + Astra DB 
(contrib)](reference/online-stores/cassandra.md) * [Providers](reference/providers/README.md) * [Local](reference/providers/local.md) * [Google Cloud Platform](reference/providers/google-cloud-platform.md) * [Amazon Web Services](reference/providers/amazon-web-services.md) + * [Azure](reference/providers/azure.md) * [Feature repository](reference/feature-repository/README.md) * [feature\_store.yaml](reference/feature-repository/feature-store-yaml.md) * [.feastignore](reference/feature-repository/feast-ignore.md) diff --git a/docs/getting-started/concepts/registry.md b/docs/getting-started/concepts/registry.md index 99d6d746d5c..ac34829008b 100644 --- a/docs/getting-started/concepts/registry.md +++ b/docs/getting-started/concepts/registry.md @@ -1,15 +1,15 @@ # Registry -Feast uses a registry to store all applied Feast objects (e.g. Feature views, entities, etc). The registry exposes +Feast uses a registry to store all applied Feast objects (e.g. Feature views, entities, etc). The registry exposes methods to apply, list, retrieve and delete these objects, and is an abstraction with multiple implementations. ### Options for registry implementations #### File-based registry -By default, Feast uses a file-based registry implementation, which stores the protobuf representation of the registry as -a serialized file. This registry file can be stored in a local file system, or in cloud storage (in, say, S3 or GCS). +By default, Feast uses a file-based registry implementation, which stores the protobuf representation of the registry as +a serialized file. This registry file can be stored in a local file system, or in cloud storage (in, say, S3 or GCS, or Azure). -The quickstart guides that use `feast init` will use a registry on a local file system. To allow Feast to configure +The quickstart guides that use `feast init` will use a registry on a local file system. 
To allow Feast to configure a remote file registry, you need to create a GCS / S3 bucket that Feast can understand: {% tabs %} {% tab title="Example S3 file registry" %} @@ -35,9 +35,9 @@ offline_store: {% endtab %} {% endtabs %} -However, there are inherent limitations with a file-based registry, since changing a single field in the registry -requires re-writing the whole registry file. With multiple concurrent writers, this presents a risk of data loss, or -bottlenecks writes to the registry since all changes have to be serialized (e.g. when running materialization for +However, there are inherent limitations with a file-based registry, since changing a single field in the registry +requires re-writing the whole registry file. With multiple concurrent writers, this presents a risk of data loss, or +bottlenecks writes to the registry since all changes have to be serialized (e.g. when running materialization for multiple feature views or time ranges concurrently). #### SQL Registry @@ -47,14 +47,14 @@ This supports any SQLAlchemy compatible database as a backend. The exact schema ### Updating the registry -We recommend users store their Feast feature definitions in a version controlled repository, which then via CI/CD -automatically stays synced with the registry. Users will often also want multiple registries to correspond to -different environments (e.g. dev vs staging vs prod), with staging and production registries with locked down write +We recommend users store their Feast feature definitions in a version controlled repository, which then via CI/CD +automatically stays synced with the registry. Users will often also want multiple registries to correspond to +different environments (e.g. dev vs staging vs prod), with staging and production registries with locked down write access since they can impact real user traffic. 
See [Running Feast in Production](../../how-to-guides/running-feast-in-production.md#1.-automatically-deploying-changes-to-your-feature-definitions) for details on how to set this up. ### Accessing the registry from clients -Users can specify the registry through a `feature_store.yaml` config file, or programmatically. We often see teams +Users can specify the registry through a `feature_store.yaml` config file, or programmatically. We often see teams preferring the programmatic approach because it makes notebook driven development very easy: #### Option 1: programmatically specifying the registry diff --git a/docs/how-to-guides/adding-or-reusing-tests.md b/docs/how-to-guides/adding-or-reusing-tests.md index 45b9aa26e00..d68e47df5c6 100644 --- a/docs/how-to-guides/adding-or-reusing-tests.md +++ b/docs/how-to-guides/adding-or-reusing-tests.md @@ -241,7 +241,8 @@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, table_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp = event_timestamp, ) # ... 
more test code ``` diff --git a/docs/reference/data-sources/README.md b/docs/reference/data-sources/README.md index 6ab2e4b083f..ae5e25dbc51 100644 --- a/docs/reference/data-sources/README.md +++ b/docs/reference/data-sources/README.md @@ -35,9 +35,13 @@ Please see [Data Source](../../getting-started/concepts/data-ingestion.md) for a {% endcontent-ref %} {% content-ref url="postgres.md" %} -[postgres.md]([postgres].md) +[postgres.md](postgres.md) {% endcontent-ref %} {% content-ref url="trino.md" %} -[trino.md]([trino].md) +[trino.md](trino.md) +{% endcontent-ref %} + +{% content-ref url="mssql.md" %} +[mssql.md](mssql.md) {% endcontent-ref %} diff --git a/docs/reference/data-sources/mssql.md b/docs/reference/data-sources/mssql.md new file mode 100644 index 00000000000..8bf1ede6aa8 --- /dev/null +++ b/docs/reference/data-sources/mssql.md @@ -0,0 +1,29 @@ +# MsSQL source (contrib) + +## Description + +MsSQL data sources are Microsoft sql table sources. +These can be specified either by a table reference or a SQL query. + +## Disclaimer + +The MsSQL data source does not achieve full test coverage. +Please do not assume complete stability. 
+ +## Examples + +Defining a MsSQL source: + +```python +from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import ( + MsSqlServerSource, +) + +driver_hourly_table = "driver_hourly" + +driver_source = MsSqlServerSource( + table_ref=driver_hourly_table, + event_timestamp_column="datetime", + created_timestamp_column="created", +) +``` diff --git a/docs/reference/offline-stores/README.md b/docs/reference/offline-stores/README.md index 08a28f9e7ee..02b873bb592 100644 --- a/docs/reference/offline-stores/README.md +++ b/docs/reference/offline-stores/README.md @@ -35,3 +35,7 @@ Please see [Offline Store](../../getting-started/architecture-and-components/off {% content-ref url="trino.md" %} [trino.md](trino.md) {% endcontent-ref %} + +{% content-ref url="mssql.md" %} +[mssql.md](mssql.md) +{% endcontent-ref %} diff --git a/docs/reference/offline-stores/mssql.md b/docs/reference/offline-stores/mssql.md new file mode 100644 index 00000000000..bec0c8deb82 --- /dev/null +++ b/docs/reference/offline-stores/mssql.md @@ -0,0 +1,59 @@ +# MsSQL/Synapse offline store (contrib) + +## Description + +The MsSQL offline store provides support for reading [MsSQL Sources](../data-sources/mssql.md). Specifically, it is developed to read from [Synapse SQL](https://docs.microsoft.com/en-us/azure/synapse-analytics/sql/overview-features) on Microsoft Azure + +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. + +## Disclaimer + +The MsSQL offline store does not achieve full test coverage. +Please do not assume complete stability. 
+ +## Example + +{% code title="feature_store.yaml" %} +```yaml +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +offline_store: + type: mssql + connection_string: ${SQL_CONN} # Environment Variable +``` +{% endcode %} + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the Spark offline store. + +| | MsSql | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | no | +| `write_logged_features` (persist logged features to offline store) | no | + +Below is a matrix indicating which functionality is supported by `MsSqlServerRetrievalJob`. + +| | MsSql | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | no | +| export to data lake (S3, GCS, etc.) | no | +| export to data warehouse | no | +| local execution of Python-based on-demand transforms | no | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). 
diff --git a/docs/reference/online-stores/README.md b/docs/reference/online-stores/README.md index b7e7d4e7ca0..6d616b46f25 100644 --- a/docs/reference/online-stores/README.md +++ b/docs/reference/online-stores/README.md @@ -29,3 +29,4 @@ Please see [Online Store](../../getting-started/architecture-and-components/onli {% content-ref url="cassandra.md" %} [cassandra.md](cassandra.md) {% endcontent-ref %} + diff --git a/docs/reference/online-stores/cassandra.md b/docs/reference/online-stores/cassandra.md index 7a83f905ede..3355c3728ce 100644 --- a/docs/reference/online-stores/cassandra.md +++ b/docs/reference/online-stores/cassandra.md @@ -1,4 +1,4 @@ -# Cassandra / Astra DB online store +# Cassandra + Astra DB online store (contrib) ## Description diff --git a/docs/reference/providers/README.md b/docs/reference/providers/README.md index dc52d927264..20686a1e140 100644 --- a/docs/reference/providers/README.md +++ b/docs/reference/providers/README.md @@ -7,3 +7,5 @@ Please see [Provider](../../getting-started/architecture-and-components/provider {% page-ref page="google-cloud-platform.md" %} {% page-ref page="amazon-web-services.md" %} + +{% page-ref page="azure.md" %} diff --git a/docs/reference/providers/azure.md b/docs/reference/providers/azure.md new file mode 100644 index 00000000000..123bf087635 --- /dev/null +++ b/docs/reference/providers/azure.md @@ -0,0 +1,26 @@ +# Azure (contrib) + +## Description + +* Offline Store: Uses the **MsSql** offline store by default. Also supports File as the offline store. +* Online Store: Uses the **Redis** online store by default. Also supports Sqlite as an online store. + +## Disclaimer + +The Azure provider does not achieve full test coverage. +Please do not assume complete stability. 
+ +## Example + +{% code title="feature_store.yaml" %} +```yaml +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +``` +{% endcode %} \ No newline at end of file diff --git a/docs/roadmap.md b/docs/roadmap.md index 4e610aa172e..dc1d9ae1ab8 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -10,7 +10,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) - * [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL source (contrib plugin)](https://docs.feast.dev/reference/data-sources/mssql) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) @@ -19,7 +19,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) - * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL (contrib plugin)](https://docs.feast.dev/reference/offline-stores/mssql.md) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/offline-stores/postgres) * [x] [Trino (contrib plugin)](https://github.com/Shopify/feast-trino) diff 
--git a/docs/tutorials/azure/README.md b/docs/tutorials/azure/README.md new file mode 100644 index 00000000000..2bfd53adf7c --- /dev/null +++ b/docs/tutorials/azure/README.md @@ -0,0 +1,88 @@ +# Getting started with Feast on Azure + +The objective of this tutorial is to build a model that predicts if a driver will complete a trip based on a number of features ingested into Feast. During this tutorial you will: + +1. Deploy the infrastructure for a feature store (using an ARM template) +2. Register features into a central feature registry hosted on Blob Storage +3. Consume features from the feature store for training and inference + +## Prerequisites + +For this tutorial you will require: + +1. An Azure subscription. +2. Working knowledge of Python and ML concepts. +3. Basic understanding of Azure Machine Learning - using notebooks, etc. + +## 1. Deploy Infrastructure + +We have created an ARM template that deploys and configures all the infrastructure required to run feast in Azure. This makes the set-up very simple - select the **Deploy to Azure** button below. + +The only 2 required parameters during the set-up are: + +- **Admin Password** for the the Dedicated SQL Pool being deployed. +- **Principal ID** this is to set the storage permissions for the feast registry store. You can find the value for this by opening **Cloud Shell** and run the following command: + +```bash +# If you are using Azure portal CLI or Azure CLI 2.37.0 or above +az ad signed-in-user show --query id -o tsv + +# If you are using Azure CLI below 2.37.0 +az ad signed-in-user show --query objectId -o tsv +``` + +> You may want to first make sure your subscription has registered `Microsoft.Synapse`, `Microsoft.SQL`, `Microsoft.Network` and `Microsoft.Compute` providers before running the template below, as some of them may require explicit registration. If you are on a Free Subscription, you will not be able to deploy the workspace part of this tutorial. 
+ +[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Ffeast-dev%2Ffeast%2Fmaster%2Fdocs%2Ftutorials%2Fazure%2Fdeployment%2Ffs_synapse_azuredeploy.json) + +![feast architecture](media/arch.png) + +The ARM template will not only deploy the infrastructure but it will also: + +- install feast with the azure provider on the compute instance +- set the Registry Blob path, Dedicated SQL Pool and Redis cache connection strings in the Azure ML default Keyvault. + +> **☕ It can take up to 20 minutes for the Redis cache to be provisioned.** + +## 2. Git clone this repo to your compute instance + +In the [Azure Machine Learning Studio](https://ml.azure.com), navigate to the left-hand menu and select **Compute**. You should see your compute instance running, select **Terminal** + +![compute instance terminal](media/ci.png) + +In the terminal you need to clone this GitHub repo: + +```bash +git clone https://github.com/feast-dev/feast +``` + +### 3. Load feature values into Feature Store + +In the Azure ML Studio, select *Notebooks* from the left-hand menu and then open the [Loading feature values into feature store notebook](./notebooks/part1-load-data.ipynb).Work through this notebook. + +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ + +![compute instance kernel](media/ci-kernel.png) + + +## 4. Register features in Feature store + +In the Azure ML Studio, select *Notebooks* from the left-hand menu and then open the [register features into your feature registry notebook](notebooks/part2-register-features.ipynb). Work through this notebook. + +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ + +## 5.Train and Deploy a model using the Feature Store + +In the Azure ML Studio, select *Notebooks* from the left-hand menu and then open the [train and deploy a model using feast notebook](notebooks/part3-train-and-deploy-with-feast.ipynb). 
Work through this notebook. + +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ +> +> If problems are encountered during model training stage, create a new cell and rexecute `!pip install scikit-learn==0.22.1`. Upon completion, restart the Kernel and start over. + +## 6. Running Feast Azure Tutorials locally without Azure workspace + +* If you are on a free tier instance, you will not be able to deploy the azure deployment because the azure workspace requires VCPUs and the free trial subscription does not have a quota. +* The workaround is to remove the `Microsoft.MachineLearningServices/workspaces/computes` resource from `fs_synapse_azure_deploy.json` and setting up the environment locally. + 1. After deployment, find your `Azure SQL Pool` secrets by going to `Subscriptions->->Resource Group->Key Vault` and giving your account admin permissions to the keyvault. Retrieve the `FEAST-REGISTRY-PATH`, `FEAST-OFFLINE-STORE-CONN`, and `FEAST-ONLINE-STORE-CONN` secrets to use in your local environment. + 2. In your local environment, you will need to install the azure cli and login to the cli using `az login`. + 3. After everything is setup, you should be able to work through the first 2 tutorial notebooks without any errors (The 3rd notebook requires Azure workspace resources). \ No newline at end of file diff --git a/docs/tutorials/azure/data/data_generator.py b/docs/tutorials/azure/data/data_generator.py new file mode 100644 index 00000000000..77fec082963 --- /dev/null +++ b/docs/tutorials/azure/data/data_generator.py @@ -0,0 +1,260 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import numpy as np +import pandas as pd +from datetime import datetime, timedelta +from pytz import FixedOffset, timezone, utc +from random import randint +from enum import Enum +from sqlalchemy import create_engine, DateTime +from datetime import datetime + +DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL = "event_timestamp" + + +class EventTimestampType(Enum): + TZ_NAIVE = 0 + TZ_AWARE_UTC = 1 + TZ_AWARE_FIXED_OFFSET = 2 + TZ_AWARE_US_PACIFIC = 3 + + +def _convert_event_timestamp(event_timestamp: pd.Timestamp, t: EventTimestampType): + if t == EventTimestampType.TZ_NAIVE: + return event_timestamp + elif t == EventTimestampType.TZ_AWARE_UTC: + return event_timestamp.replace(tzinfo=utc) + elif t == EventTimestampType.TZ_AWARE_FIXED_OFFSET: + return event_timestamp.replace(tzinfo=utc).astimezone(FixedOffset(60)) + elif t == EventTimestampType.TZ_AWARE_US_PACIFIC: + return event_timestamp.replace(tzinfo=utc).astimezone(timezone("US/Pacific")) + + +def create_orders_df( + customers, + drivers, + start_date, + end_date, + order_count, + infer_event_timestamp_col=False, +) -> pd.DataFrame: + """ + Example df generated by this function: + | order_id | driver_id | customer_id | order_is_success | event_timestamp | + +----------+-----------+-------------+------------------+---------------------+ + | 100 | 5004 | 1007 | 0 | 2021-03-10 19:31:15 | + | 101 | 5003 | 1006 | 0 | 2021-03-11 22:02:50 | + | 102 | 5010 | 1005 | 0 | 2021-03-13 00:34:24 | + | 103 | 5010 | 1001 | 1 | 2021-03-14 03:05:59 | + """ + df = pd.DataFrame() + df["order_id"] = [order_id for order_id in range(100, 100 + order_count)] + df["driver_id"] = np.random.choice(drivers, order_count) + df["customer_id"] = np.random.choice(customers, order_count) + df["order_is_success"] = np.random.randint(0, 2, size=order_count).astype(np.int32) + + if infer_event_timestamp_col: + df["e_ts"] = [ + _convert_event_timestamp( + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"), + EventTimestampType(3), + ) + for idx, dt in 
enumerate( + pd.date_range(start=start_date, end=end_date, periods=order_count) + ) + ] + df.sort_values( + by=["e_ts", "order_id", "driver_id", "customer_id"], inplace=True, + ) + else: + df[DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL] = [ + _convert_event_timestamp( + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"), + EventTimestampType(idx % 4), + ) + for idx, dt in enumerate( + pd.date_range(start=start_date, end=end_date, periods=order_count) + ) + ] + df.sort_values( + by=[ + DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, + "order_id", + "driver_id", + "customer_id", + ], + inplace=True, + ) + return df + + +def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame: + """ + Example df generated by this function: + | datetime | driver_id | conv_rate | acc_rate | avg_daily_trips | created | + |------------------+-----------+-----------+----------+-----------------+------------------| + | 2021-03-17 19:31 | 5010 | 0.229297 | 0.685843 | 861 | 2021-03-24 19:34 | + | 2021-03-17 20:31 | 5010 | 0.781655 | 0.861280 | 769 | 2021-03-24 19:34 | + | 2021-03-17 21:31 | 5010 | 0.150333 | 0.525581 | 778 | 2021-03-24 19:34 | + | 2021-03-17 22:31 | 5010 | 0.951701 | 0.228883 | 570 | 2021-03-24 19:34 | + | 2021-03-17 23:31 | 5010 | 0.819598 | 0.262503 | 473 | 2021-03-24 19:34 | + | | ... | ... | ... | ... 
| | + | 2021-03-24 16:31 | 5001 | 0.061585 | 0.658140 | 477 | 2021-03-24 19:34 | + | 2021-03-24 17:31 | 5001 | 0.088949 | 0.303897 | 618 | 2021-03-24 19:34 | + | 2021-03-24 18:31 | 5001 | 0.096652 | 0.747421 | 480 | 2021-03-24 19:34 | + | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | + | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | + """ + df_hourly = pd.DataFrame( + { + "datetime": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=start_date, end=end_date, freq="1H", closed="left" + ) + ] + # include a fixed timestamp for get_historical_features in the quickstart + # + [ + # pd.Timestamp( + # year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC" + # ) + # ] + } + ) + df_all_drivers = pd.DataFrame() + dates = df_hourly["datetime"].map(pd.Timestamp.date).unique() + + for driver in drivers: + df_hourly_copy = df_hourly.copy() + df_hourly_copy["driver_id"] = driver + for date in dates: + df_hourly_copy.loc[ + df_hourly_copy["datetime"].map(pd.Timestamp.date) == date, + "avg_daily_trips", + ] = randint(10, 30) + df_all_drivers = pd.concat([df_hourly_copy, df_all_drivers]) + + df_all_drivers.reset_index(drop=True, inplace=True) + rows = df_all_drivers["datetime"].count() + + df_all_drivers["conv_rate"] = np.random.random(size=rows).astype(np.float32) + df_all_drivers["acc_rate"] = np.random.random(size=rows).astype(np.float32) + + df_all_drivers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) + + # Create duplicate rows that should be filtered by created timestamp + # TODO: These duplicate rows area indirectly being filtered out by the point in time join already. 
We need to + # inject a bad row at a timestamp where we know it will get joined to the entity dataframe, and then test that + # we are actually filtering it with the created timestamp + late_row = df_all_drivers.iloc[int(rows / 2)] + df_all_drivers = df_all_drivers.append(late_row).append(late_row) + + return df_all_drivers + + +def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.DataFrame: + """ + Example df generated by this function: + | datetime | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created | + |------------------+-------------+-----------------+---------------------+---------------------+------------------| + | 2021-03-17 19:31 | 1010 | 0.889188 | 0.049057 | 412 | 2021-03-24 19:38 | + | 2021-03-18 19:31 | 1010 | 0.979273 | 0.212630 | 639 | 2021-03-24 19:38 | + | 2021-03-19 19:31 | 1010 | 0.976549 | 0.176881 | 70 | 2021-03-24 19:38 | + | 2021-03-20 19:31 | 1010 | 0.273697 | 0.325012 | 68 | 2021-03-24 19:38 | + | 2021-03-21 19:31 | 1010 | 0.438262 | 0.313009 | 192 | 2021-03-24 19:38 | + | | ... | ... | ... | ... 
| | + | 2021-03-19 19:31 | 1001 | 0.738860 | 0.857422 | 344 | 2021-03-24 19:38 | + | 2021-03-20 19:31 | 1001 | 0.848397 | 0.745989 | 106 | 2021-03-24 19:38 | + | 2021-03-21 19:31 | 1001 | 0.301552 | 0.185873 | 812 | 2021-03-24 19:38 | + | 2021-03-22 19:31 | 1001 | 0.943030 | 0.561219 | 322 | 2021-03-24 19:38 | + | 2021-03-23 19:31 | 1001 | 0.354919 | 0.810093 | 273 | 2021-03-24 19:38 | + """ + df_daily = pd.DataFrame( + { + "datetime": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=start_date, end=end_date, freq="1D", closed="left" + ) + ] + } + ) + df_all_customers = pd.DataFrame() + + for customer in customers: + df_daily_copy = df_daily.copy() + rows = df_daily_copy["datetime"].count() + df_daily_copy["customer_id"] = customer + df_daily_copy["current_balance"] = np.random.uniform( + low=10.0, high=50.0, size=rows + ).astype(np.float32) + df_daily_copy["lifetime_trip_count"] = np.linspace( + start=randint(10, 20), stop=randint(40, 50), num=rows + ).astype(np.int32) + df_daily_copy["avg_passenger_count"] = np.random.uniform( + low=1, high=3, size=rows + ).astype(np.float32) + df_all_customers = pd.concat([df_daily_copy, df_all_customers]) + + df_all_customers.reset_index(drop=True, inplace=True) + + rows = df_all_customers["datetime"].count() + + # TODO: Remove created timestamp in order to test whether its really optional + df_all_customers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) + return df_all_customers + + +def generate_entities(date, n_customers, n_drivers, order_count): + end_date = date + before_start_date = end_date - timedelta(days=365) + start_date = end_date - timedelta(days=7) + after_end_date = end_date + timedelta(days=365) + customer_entities = [20000 + c_id for c_id in range(n_customers)] + driver_entities = [50000 + d_id for d_id in range(n_drivers)] + orders_df = create_orders_df( + customers=customer_entities, + drivers=driver_entities, + start_date=start_date, + 
end_date=end_date, + order_count=order_count, + infer_event_timestamp_col=False, + ) + return customer_entities, driver_entities, end_date, orders_df, start_date + + +def save_df_to_csv(df, table_name, dtype): + df.to_csv(table_name+".csv", index=False) + + +if __name__ == "__main__": + start_date = datetime.now().replace(microsecond=0, second=0, minute=0) + ( + customer_entities, + driver_entities, + end_date, + orders_df, + start_date, + ) = generate_entities(start_date, 1000, 1000, 20000) + + customer_df = create_customer_daily_profile_df( + customer_entities, start_date, end_date + ) + print(customer_df.head()) + + drivers_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date) + + print(drivers_df.head()) + + + orders_table = "orders" + driver_hourly_table = "driver_hourly" + customer_profile_table = "customer_profile" + + print("uploading orders") + save_df_to_csv(orders_df, orders_table, dtype={"event_timestamp": DateTime()}) + print("uploading drivers") + save_df_to_csv(drivers_df, driver_hourly_table, dtype={"datetime": DateTime()}) + print("uploading customers") + save_df_to_csv(customer_df, customer_profile_table, dtype={"datetime": DateTime()}) \ No newline at end of file diff --git a/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json b/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json new file mode 100644 index 00000000000..2846a5341dd --- /dev/null +++ b/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json @@ -0,0 +1,340 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "administratorLoginPassword": { + "type": "securestring", + "metadata": { + "description": "The administrator password of the SQL logical server." + } + }, + "principalId": { + "type": "string", + "metadata": { + "description": "Specifies the principal ID assigned to the role. 
You can find in cloud shell using 'az ad signed-in-user show --query id -o tsv'" + } + }, + "administratorLogin": { + "type": "string", + "metadata": { + "description": "The administrator username of the SQL logical server." + }, + "defaultValue": "azureuser" + }, + "location": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "[resourceGroup().location]" + }, + "registryBlobStore": { + "type": "string", + "metadata": { + "description": "Storage account to host the feast registry db" + }, + "defaultValue": "[concat('fsregistry',uniqueString(resourceGroup().id))]" + }, + "sqlServerName": { + "type": "string", + "metadata": { + "description": "The SQL Server Name" + }, + "defaultValue": "[concat('fssqlsvr',uniqueString(resourceGroup().id))]" + }, + "sqlDbName": { + "type": "string", + "metadata": { + "description": "SQL DB Name" + }, + "defaultValue": "[concat('fsoffline',uniqueString(resourceGroup().id))]" + }, + "redisCacheName": { + "type": "string", + "metadata": { + "description": "Redis Cache Name" + }, + "defaultValue": "[concat('fsonline',uniqueString(resourceGroup().id))]" + }, + "amlWorkspaceName": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "[concat('mlws',uniqueString(resourceGroup().id))]" + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "Standard_DS3_v2" + }, + "roleDefinitionID": { + "type": "string", + "metadata": { + "description": "Specifies the role definition ID used in the role assignment." 
+ }, + "defaultValue": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" + } + }, + "functions": [], + "variables": { + "tenantId": "[subscription().tenantId]", + "storageAccountName": "[concat('st', uniqueString(resourceGroup().id))]", + "keyVaultName": "[concat('kv-', uniqueString(resourceGroup().id))]", + "applicationInsightsName": "[concat('appi-', uniqueString(resourceGroup().id))]", + "containerRegistryName": "[concat('cr', uniqueString(resourceGroup().id))]", + "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "registryAccount": "[resourceId('Microsoft.Storage/storageAccounts', parameters('registryBlobStore'))]", + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", + "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", + "redisCache": "[resourceId('Microsoft.Cache/redis', parameters('redisCacheName'))]", + "roleAssignmentName": "[guid(parameters('principalId'), parameters('roleDefinitionID'), resourceGroup().id)]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-01-01", + "name": "[variables('storageAccountName')]", + "location": "[parameters('location')]", + "sku": { + "name": "Standard_RAGRS" + }, + "kind": "StorageV2", + "properties": { + "encryption": { + "services": { + "blob": { + "enabled": true + }, + "file": { + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-04-01-preview", + "name": "[variables('keyVaultName')]", + "location": "[parameters('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" + }, + "accessPolicies": [], + "enableSoftDelete": true + }, + 
"resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-OFFLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('mssql+pyodbc://',parameters('administratorLogin'),':',parameters('administratorLoginPassword'),'@', parameters('sqlServerName'),'.database.windows.net:1433/', parameters('sqlDbName'), '?driver=ODBC+Driver+17+for+SQL+Server&autocommit=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + } + ] + }, + { + "type": "Microsoft.Insights/components", + "apiVersion": "2020-02-02", + "name": "[variables('applicationInsightsName')]", + "location": "[if(or(equals(parameters('location'),'eastus2'), equals(parameters('location'),'westcentralus')),'southcentralus',parameters('location'))]", + "kind": "web", + "properties": { + "Application_Type": "web" + } + }, + { + "type": "Microsoft.ContainerRegistry/registries", + "sku": { + "name": "Standard", + "tier": "Standard" + }, + "name": "[variables('containerRegistryName')]", + "apiVersion": "2019-12-01-preview", + "location": "[parameters('location')]", + "properties": { + "adminUserEnabled": true + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces", + "apiVersion": "2021-04-01", + "name": "[parameters('amlWorkspaceName')]", + "location": "[resourceGroup().location]", + "identity": { + "type": "SystemAssigned" + }, + "tags": { + "displayName": "Azure ML Workspace" + }, + "dependsOn": [ + "[variables('storageAccount')]", + "[variables('keyVault')]", + "[variables('applicationInsights')]", + "[variables('containerRegistry')]" + ], + "properties": { + "storageAccount": "[variables('storageAccount')]", + "keyVault": "[variables('keyVault')]", + "applicationInsights": "[variables('applicationInsights')]", + "containerRegistry": "[variables('containerRegistry')]" + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + 
"name": "[concat(parameters('amlWorkspaceName'), '/', concat('ci-',uniqueString(resourceGroup().id)))]", + "apiVersion": "2021-07-01", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', concat(parameters('amlWorkspaceName')))]" + ], + "location": "[parameters('location')]", + "properties": { + "computeType": "ComputeInstance", + "properties": { + "vmSize": "[parameters('vmSize')]", + "setupScripts": { + "scripts": { + "creationScript": { + "scriptSource": "inline", + "scriptData": "[base64('conda activate azureml_py38;pip install feast[azure];pip install pymssql')]" + } + } + } + } + } + } + ] + }, + { + "name": "[parameters('registryBlobStore')]", + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-04-01", + "tags": { + "displayName": "Feast Registry Store" + }, + "location": "[resourceGroup().location]", + "kind": "StorageV2", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "allowBlobPublicAccess": false + }, + "resources": [ + { + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "name": "[concat('default/', 'fs-reg-container')]", + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ] + }, + { + "name": "[parameters('sqlServerName')]", + "type": "Microsoft.Sql/servers", + "apiVersion": "2014-04-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Offline Store Server" + }, + "properties": { + "administratorLogin": "[parameters('administratorLogin')]", + "administratorLoginPassword": "[parameters('administratorLoginPassword')]" + }, + "resources": [ + { + "type": "firewallRules", + "apiVersion": "2014-04-01", + "dependsOn": [ + "[resourceId('Microsoft.Sql/servers', concat(parameters('sqlServerName')))]" + ], + "location": "[resourceGroup().location]", + "name": "AllowAllWindowsAzureIps", + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } + }, + { + "name": "[parameters('sqlDbName')]", + "type": 
"databases", + "apiVersion": "2021-02-01-preview", + "location": "[resourceGroup().location]", + "sku": { + "tier": "Basic", + "name": "Basic" + }, + "tags": { + "displayName": "Feast Offline Store" + }, + "dependsOn": [ + "[resourceId('Microsoft.Sql/servers', concat(parameters('sqlServerName')))]" + ], + "properties": {} + } + ] + }, + { + "type": "Microsoft.Cache/redis", + "name": "[parameters('redisCacheName')]", + "apiVersion": "2020-12-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Online Store" + }, + "properties": { + "sku": { + "name": "Basic", + "family": "C", + "capacity": 2 + } + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-ONLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat(parameters('redisCacheName'),'.redis.cache.windows.net:6380,password=',listKeys(concat('Microsoft.Cache/redis/', parameters('redisCacheName')), providers('Microsoft.Cache', 'Redis').apiVersions[0]).primaryKey, ',ssl=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]", + "[variables('redisCache')]" + ] + } + ] + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-04-01-preview", + "name": "[variables('roleAssignmentName')]", + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', parameters('roleDefinitionId'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + }, + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ], + "outputs": {} +} \ No newline at end of file diff --git a/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json b/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json new file mode 100644 index 00000000000..476d332c569 --- /dev/null +++ b/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json @@ -0,0 +1,413 @@ +{ + "$schema": 
"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "administratorLoginPassword": { + "type": "securestring", + "metadata": { + "description": "The administrator password of the SQL logical server." + } + }, + "principalId": { + "type": "string", + "metadata": { + "description": "Specifies the principal ID assigned to the role. You can find in cloud shell using 'az ad signed-in-user show --query id -o tsv'" + } + }, + "sku": { + "type": "string", + "defaultValue": "DW100c", + "allowedValues": [ + "DW100c", + "DW200c", + "DW300c", + "DW400c", + "DW500c", + "DW1000c", + "DW1500c", + "DW2000c", + "DW2500c", + "DW3000c" + ], + "metadata": { + "description": "Select the SKU of the SQL pool." + } + }, + "allowAllConnections": { + "type": "string", + "allowedValues": [ + "true", + "false" + ], + "defaultValue": "true", + "metadata": { + "description": "Specifies whether to allow client IPs to connect to Synapse" + } + }, + "administratorLogin": { + "type": "string", + "metadata": { + "description": "The administrator username of the SQL logical server." + }, + "defaultValue": "azureuser" + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "Standard_DS3_v2" + }, + "roleDefinitionID": { + "type": "string", + "metadata": { + "description": "Specifies the role definition ID used in the role assignment. Defaults to Storage Blob Data Contributor." 
+ }, + "defaultValue": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" + } + }, + "functions": [], + "variables": { + "location": "[resourceGroup().location]", + "tenantId": "[subscription().tenantId]", + "registryBlobStore": "[concat('fsregistry',uniqueString(resourceGroup().id))]", + "redisCacheName": "[concat('fsonline',uniqueString(resourceGroup().id))]", + "amlWorkspaceName": "[concat('ml',uniqueString(resourceGroup().id))]", + "synapseName": "[concat('sy',uniqueString(resourceGroup().id))]", + "storageAccountName": "[concat('st', uniqueString(resourceGroup().id))]", + "keyVaultName": "[concat('kv-', uniqueString(resourceGroup().id))]", + "applicationInsightsName": "[concat('appi-', uniqueString(resourceGroup().id))]", + "containerRegistryName": "[concat('cr', uniqueString(resourceGroup().id))]", + "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "registryAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('registryBlobStore'))]", + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", + "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", + "redisCache": "[resourceId('Microsoft.Cache/redis', variables('redisCacheName'))]", + "roleAssignmentName": "[guid(parameters('principalId'), parameters('roleDefinitionID'), resourceGroup().id)]", + "sqlPoolName": "[toLower(concat(variables('workspaceName'),'p1'))]", + "workspaceName": "[toLower(concat(variables('synapseName'),'ws1'))]", + "dlsName": "[toLower(concat('dls',variables('synapseName')))]", + "dlsFsName": "[toLower(concat(variables('dlsName'),'fs1'))]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-01-01", + "name": "[variables('storageAccountName')]", + "location": "[variables('location')]", + 
"sku": { + "name": "Standard_RAGRS" + }, + "kind": "StorageV2", + "properties": { + "encryption": { + "services": { + "blob": { + "enabled": true + }, + "file": { + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-04-01-preview", + "name": "[variables('keyVaultName')]", + "location": "[variables('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" + }, + "accessPolicies": [], + "enableSoftDelete": true + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-OFFLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('mssql+pyodbc://',parameters('administratorLogin'),':',parameters('administratorLoginPassword'),'@', variables('workspaceName'),'.database.windows.net:1433/', variables('sqlPoolName'), '?driver=ODBC+Driver+17+for+SQL+Server&autocommit=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-REGISTRY-PATH')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('https://',variables('registryBlobStore'),'.blob.core.windows.net/fs-reg-container/registry.db')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + } + ] + }, + { + "type": "Microsoft.Insights/components", + "apiVersion": "2020-02-02", + "name": "[variables('applicationInsightsName')]", + "location": "[if(or(equals(variables('location'),'eastus2'), equals(variables('location'),'westcentralus')),'southcentralus',variables('location'))]", + "kind": "web", + "properties": { + "Application_Type": "web" + } + }, + { + "type": "Microsoft.ContainerRegistry/registries", + "sku": { + "name": 
"Standard", + "tier": "Standard" + }, + "name": "[variables('containerRegistryName')]", + "apiVersion": "2019-12-01-preview", + "location": "[variables('location')]", + "properties": { + "adminUserEnabled": true + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces", + "apiVersion": "2021-04-01", + "name": "[variables('amlWorkspaceName')]", + "location": "[resourceGroup().location]", + "identity": { + "type": "SystemAssigned" + }, + "tags": { + "displayName": "Azure ML Workspace" + }, + "dependsOn": [ + "[variables('storageAccount')]", + "[variables('keyVault')]", + "[variables('applicationInsights')]", + "[variables('containerRegistry')]" + ], + "properties": { + "storageAccount": "[variables('storageAccount')]", + "keyVault": "[variables('keyVault')]", + "applicationInsights": "[variables('applicationInsights')]", + "containerRegistry": "[variables('containerRegistry')]" + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "name": "[concat(variables('amlWorkspaceName'), '/', concat('ci-',uniqueString(resourceGroup().id)))]", + "apiVersion": "2021-07-01", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', concat(variables('amlWorkspaceName')))]" + ], + "location": "[variables('location')]", + "properties": { + "computeType": "ComputeInstance", + "properties": { + "vmSize": "[parameters('vmSize')]", + "setupScripts": { + "scripts": { + "creationScript": { + "scriptSource": "inline", + "scriptData": "[base64('conda activate azureml_py38;pip install feast[azure];pip install pymssql')]" + } + } + } + } + } + } + ] + }, + { + "name": "[variables('registryBlobStore')]", + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-04-01", + "tags": { + "displayName": "Feast Registry Store" + }, + "location": "[resourceGroup().location]", + "kind": "StorageV2", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "allowBlobPublicAccess": false + }, + 
"resources": [ + { + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "name": "[concat('default/', 'fs-reg-container')]", + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ] + }, + { + "type": "Microsoft.Cache/redis", + "name": "[variables('redisCacheName')]", + "apiVersion": "2020-12-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Online Store" + }, + "properties": { + "sku": { + "name": "Basic", + "family": "C", + "capacity": 2 + } + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-ONLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat(variables('redisCacheName'),'.redis.cache.windows.net:6380,password=',listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), providers('Microsoft.Cache', 'Redis').apiVersions[0]).primaryKey, ',ssl=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]", + "[variables('redisCache')]" + ] + } + ] + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-04-01-preview", + "name": "[variables('roleAssignmentName')]", + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', parameters('roleDefinitionId'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + }, + "dependsOn": [ + "[variables('registryAccount')]" + ] + }, + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2019-06-01", + "name": "[variables('dlsName')]", + "location": "[variables('location')]", + "sku": { + "name": "Standard_LRS" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "supportsHttpsTrafficOnly": true, + "isHnsEnabled": true + }, + "resources": [ + { + "name": "[concat('default/', variables('dlsFsName'))]", + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "dependsOn": [ 
+ "[variables('dlsName')]" + ], + "properties": { + "publicAccess": "None" + } + } + ] + }, + { + "type": "Microsoft.Synapse/workspaces", + "apiVersion": "2019-06-01-preview", + "name": "[variables('workspaceName')]", + "location": "[variables('location')]", + "identity": { + "type": "SystemAssigned" + }, + "dependsOn": [ + "[variables('dlsName')]", + "[variables('dlsFsName')]" + ], + "properties": { + "defaultDataLakeStorage": { + "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]", + "filesystem": "[variables('dlsFsName')]" + }, + "sqlAdministratorLogin": "[parameters('administratorLogin')]", + "sqlAdministratorLoginPassword": "[parameters('administratorLoginPassword')]", + "managedVirtualNetwork": "default" + }, + "resources": [ + { + "condition": "[equals(parameters('allowAllConnections'),'true')]", + "type": "firewallrules", + "apiVersion": "2019-06-01-preview", + "name": "allowAll", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "255.255.255.255" + } + }, + { + "type": "firewallrules", + "apiVersion": "2019-06-01-preview", + "name": "AllowAllWindowsAzureIps", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } + }, + { + "type": "managedIdentitySqlControlSettings", + "apiVersion": "2019-06-01-preview", + "name": "default", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "grantSqlControlToManagedIdentity": { + "desiredState": "Enabled" + } + } + } + ] + }, + { + "type": "Microsoft.Synapse/workspaces/sqlPools", + "apiVersion": "2019-06-01-preview", + "name": "[concat(variables('workspaceName'), '/', variables('sqlPoolName'))]", + "location": "[variables('location')]", + "sku": { + "name": "[parameters('sku')]" + }, + "dependsOn": [ + 
"[variables('workspaceName')]" + ], + "properties": { + "createMode": "Default", + "collation": "SQL_Latin1_General_CP1_CI_AS" + } + } + ], + "outputs": {} +} \ No newline at end of file diff --git a/docs/tutorials/azure/media/arch.png b/docs/tutorials/azure/media/arch.png new file mode 100644 index 00000000000..c386c65f53f Binary files /dev/null and b/docs/tutorials/azure/media/arch.png differ diff --git a/docs/tutorials/azure/media/ci-kernel.png b/docs/tutorials/azure/media/ci-kernel.png new file mode 100644 index 00000000000..eeab1993b82 Binary files /dev/null and b/docs/tutorials/azure/media/ci-kernel.png differ diff --git a/docs/tutorials/azure/media/ci.png b/docs/tutorials/azure/media/ci.png new file mode 100644 index 00000000000..3b93391efcc Binary files /dev/null and b/docs/tutorials/azure/media/ci.png differ diff --git a/docs/tutorials/azure/media/feast-overview.png b/docs/tutorials/azure/media/feast-overview.png new file mode 100644 index 00000000000..d8eb5451430 Binary files /dev/null and b/docs/tutorials/azure/media/feast-overview.png differ diff --git a/docs/tutorials/azure/media/feast-tutorial-arch.png b/docs/tutorials/azure/media/feast-tutorial-arch.png new file mode 100644 index 00000000000..621df4dd2ed Binary files /dev/null and b/docs/tutorials/azure/media/feast-tutorial-arch.png differ diff --git a/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml b/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml new file mode 100644 index 00000000000..9cd55d2e9ac --- /dev/null +++ b/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml @@ -0,0 +1,11 @@ +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +offline_store: + type: mssql + connection_string: ${SQL_CONN} # Environment Variable \ No newline at end of file diff --git 
a/docs/tutorials/azure/notebooks/part1-load-data.ipynb b/docs/tutorials/azure/notebooks/part1-load-data.ipynb new file mode 100644 index 00000000000..a6ab34bbaac --- /dev/null +++ b/docs/tutorials/azure/notebooks/part1-load-data.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load feature values into the Feature store\n", + "\n", + "The objective of this tutorial is to build a model that predicts if a driver will complete a trip based on a number of features ingested into Feast.\n", + "\n", + "This notebook creates you will create and load data into the following 3 feature tables.\n", + "\n", + "**Customer Profile**: This contains features related to a customer entity such as current balance, lifetime trip count, average number of passengers per trip. A snippet of data:\n", + "\n", + "| datetime | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created |\n", + "|------------------|-------------|-----------------|---------------------|---------------------|------------------|\n", + "| 2021-03-17 19:31 | 1010 | 0.889188 | 0.049057 | 412 | 2021-03-24 19:38 |\n", + "| 2021-03-18 19:31 | 1010 | 0.979273 | 0.212630 | 639 | 2021-03-24 19:38 |\n", + "| 2021-03-19 19:31 | 1010 | 0.976549 | 0.176881 | 70 | 2021-03-24 19:38 |\n", + "| 2021-03-20 19:31 | 1010 | 0.273697 | 0.325012 | 68 | 2021-03-24 19:38 |\n", + "\n", + "**Driver table**: This contains features related to a driver entity such as conversion rate, average number of daily trips. 
A snippet of data:\n", + "\n", + "| datetime | driver_id | conv_rate | acc_rate | avg_daily_trips | created |\n", + "|------------------|-----------|-----------|----------|-----------------|------------------|\n", + "| 2021-03-17 19:31 | 5010 | 0.229297 | 0.685843 | 861 | 2021-03-24 19:34 |\n", + "| 2021-03-17 20:31 | 5010 | 0.781655 | 0.861280 | 769 | 2021-03-24 19:34 |\n", + "| 2021-03-17 21:31 | 5010 | 0.150333 | 0.525581 | 778 | 2021-03-24 19:34 |\n", + "| 2021-03-17 22:31 | 5010 | 0.951701 | 0.228883 | 570 | 2021-03-24 19:34 |\n", + "\n", + "\n", + "**Orders table**: This is a typical *fact table* that contains the order information such driver/customer id and whether the trip was completed. A snippet of data:\n", + "\n", + "| order_id | driver_id | customer_id | order_is_success | event_timestamp |\n", + "|----------|-----------|-------------|------------------|---------------------|\n", + "| 100 | 5004 | 1007 | 0 | 2021-03-10 19:31:15 |\n", + "| 101 | 5003 | 1006 | 0 | 2021-03-11 22:02:50 |\n", + "| 102 | 5010 | 1005 | 0 | 2021-03-13 00:34:24 |\n", + "| 103 | 5010 | 1001 | 1 | 2021-03-14 03:05:59 |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "from sqlalchemy.sql import text\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "\n", + "engine = create_engine(kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Create Customer profile table and load data\n", + "The cell below will create the customer profile table and load the data into the Synapse table. 
Loading is achieved using the ``COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.customer_profile\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/customer_profile.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " # create and load customer profile table\n", + " file = open(\"../sql/create_cx_profile_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating customer profile table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_cx_profile_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading customer profile data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.customer_profile\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create drivers table and load data\n", + "The cell below will create the drivers table and load the data into the Synapse table. 
Loading is achieved using the ``COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.driver_hourly\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/driver_hourly.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " file = open(\"../sql/create_drivers_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating drivers table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_drivers_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading drivers data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.driver_hourly\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create orders table and load data\n", + "The cell below will create the orders table and load the data into the Synapse table. 
Loading is achieved using the ``COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.orders\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/orders.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " file = open(\"../sql/create_orders_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating orders table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_orders_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading orders data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.orders\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "With the feature values loaded into the feature store, you will need to register the features in the feast central registry. [Follow the Register Features part of the tutorial](./part2-register-features.ipynb)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/part2-register-features.ipynb b/docs/tutorials/azure/notebooks/part2-register-features.ipynb new file mode 100644 index 00000000000..6ec87577cf1 --- /dev/null +++ b/docs/tutorials/azure/notebooks/part2-register-features.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "Licensed under the MIT license.\n", + "\n", + "# Feast Azure Provider Tutorial: Register Features\n", + "\n", + "In this notebook you will connect to your feature store and register features into a central repository hosted on Azure Blob Storage. It should be noted that best practice for registering features would be through a CI/CD process e.g. GitHub Actions, or Azure DevOps.\n", + "\n", + "## What is Feast?\n", + "\n", + "Feast is an operational data system for managing and serving machine learning features to models in production. 
Feast is able to serve feature data to models from a low-latency online store (for real-time prediction) or from an offline store (for scale-out batch scoring or model training).\n", + "\n", + "![feast overview](../media/feast-overview.png)\n", + "\n", + "## Configure Feature Repository\n", + "\n", + "The cell below displays the feature_store.yaml file - a file that contains infrastructural configuration, such as where the registry file is located, and connection strings to data.\n", + "\n", + "__There is no need to change the details in this file. When you connect to the feature store afterwards, the credentials are resolved from the Azure ML default keyvault.__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat feature_repo/feature_store.yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to the feature store\n", + "\n", + "Below you connect to the feature store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from feast import FeatureStore\n", + "from azureml.core import Workspace\n", + "\n", + "# access key vault to get secrets\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "os.environ['REGISTRY_PATH']=kv.get_secret(\"FEAST-REGISTRY-PATH\")\n", + "os.environ['SQL_CONN']=kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\")\n", + "os.environ['REDIS_CONN']=kv.get_secret(\"FEAST-ONLINE-STORE-CONN\")\n", + "\n", + "# connect to feature store\n", + "fs = FeatureStore(\"./feature_repo\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the data source (offline store)\n", + "\n", + "The data source refers to raw underlying data (a table in Azure SQL DB or Synapse SQL). Feast uses a time-series data model to represent data. 
This data model is used to interpret feature data in data sources in order to build training datasets or when materializing features into an online store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import MsSqlServerSource\n", + "\n", + "orders_table = \"orders\"\n", + "driver_hourly_table = \"driver_hourly\"\n", + "customer_profile_table = \"customer_profile\"\n", + "\n", + "driver_source = MsSqlServerSource(\n", + " table_ref=driver_hourly_table,\n", + " event_timestamp_column=\"datetime\",\n", + " created_timestamp_column=\"created\",\n", + ")\n", + "\n", + "customer_source = MsSqlServerSource(\n", + " table_ref=customer_profile_table,\n", + " event_timestamp_column=\"datetime\",\n", + " created_timestamp_column=\"\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Feature Views\n", + "\n", + "A feature view is an object that represents a logical group of time-series feature data as it is found in a data source. Feature views consist of one or more entities, features, and a data source. Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment.\n", + "\n", + "Feature views are used during:\n", + "\n", + "- The generation of training datasets by querying the data source of feature views in order to find historical feature values. A single training dataset may consist of features from multiple feature views. \n", + "- Loading of feature values into an online store. Feature views determine the storage schema in the online store.\n", + "- Retrieval of features from the online store. Feature views provide the schema definition to Feast in order to look up features from the online store.\n", + "\n", + "__NOTE: Feast does not generate feature values. 
It acts as the ingestion and serving system. The data sources described within feature views should reference feature values in their already computed form.__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast import Feature, FeatureView, ValueType\n", + "from datetime import timedelta\n", + "\n", + "driver_fv = FeatureView(\n", + " name=\"driver_stats\",\n", + " entities=[\"driver\"],\n", + " features=[\n", + " Feature(name=\"conv_rate\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"acc_rate\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"avg_daily_trips\", dtype=ValueType.INT32),\n", + " ],\n", + " batch_source=driver_source,\n", + " ttl=timedelta(hours=2),\n", + ")\n", + "\n", + "customer_fv = FeatureView(\n", + " name=\"customer_profile\",\n", + " entities=[\"customer_id\"],\n", + " features=[\n", + " Feature(name=\"current_balance\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"avg_passenger_count\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"lifetime_trip_count\", dtype=ValueType.INT32),\n", + " ],\n", + " batch_source=customer_source,\n", + " ttl=timedelta(days=2),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define entities\n", + "\n", + "An entity is a collection of semantically related features. Users define entities to map to the domain of their use case. For example, a ride-hailing service could have customers and drivers as their entities, which group related features that correspond to these customers and drivers.\n", + "\n", + "Entities are defined as part of feature views. Entities are used to identify the primary key on which feature values should be stored and retrieved. These keys are used during the lookup of feature values from the online store and the join process in point-in-time joins. 
It is possible to define composite entities (more than one entity object) in a feature view.\n", + "Entities should be reused across feature views.\n", + "\n", + "## Entity key\n", + "\n", + "A related concept is an entity key. These are one or more entity values that uniquely describe a feature view record. In the case of an entity (like a driver) that only has a single entity field, the entity is an entity key. However, it is also possible for an entity key to consist of multiple entity values. For example, a feature view with the composite entity of (customer, country) might have an entity key of (1001, 5).\n", + "\n", + "Entity keys act as primary keys. They are used during the lookup of features from the online store, and they are also used to match feature rows across feature views during point-in-time joins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast import Entity\n", + "driver = Entity(name=\"driver\", join_key=\"driver_id\", value_type=ValueType.INT64)\n", + "customer = Entity(name=\"customer_id\", value_type=ValueType.INT64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feast `apply()`\n", + "\n", + "Feast `apply` will:\n", + "\n", + "1. Feast will scan Python files in your feature repository and find all Feast object definitions, such as feature views, entities, and data sources.\n", + "1. Feast will validate your feature definitions\n", + "1. Feast will sync the metadata about Feast objects to the registry. If a registry does not exist, then it will be instantiated. The standard registry is a simple protobuf binary file that is stored on Azure Blob Storage.\n", + "1. Feast CLI will create all necessary feature store infrastructure. The exact infrastructure that is deployed or configured depends on the provider configuration that you have set in feature_store.yaml." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.apply([driver, driver_fv, customer, customer_fv])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What just happened?\n", + "\n", + "If you look in your feast registry storage account, you will see there is now a registry.db file that contains the metadata for your registered features. Below you can list the feature views:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from google.protobuf.json_format import MessageToDict\n", + "\n", + "for x in fs.list_feature_views():\n", + " d=MessageToDict(x.to_proto())\n", + " print(\"🪟 Feature view name:\", d['spec']['name'])\n", + " print(\"🧑 Entities:\", d['spec']['entities'])\n", + " print(\"🧪 Features:\", d['spec']['features'])\n", + " print(\"💾 Batch source type:\", d['spec']['batchSource']['dataSourceClassType'])\n", + " print(\"\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "In the [next part of this tutorial](./part3-train-and-deploy-with-feast.ipynb) you will:\n", + "\n", + "- Train a model using features stored in your feature store\n", + "- Materialize the data from the offline store to the online store\n", + "- Deploy the model to a real-time endpoint, that consumes feature vectors from the online store.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": 
"b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb b/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb new file mode 100644 index 00000000000..ff15aac60d2 --- /dev/null +++ b/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. Licensed under the MIT license.\n", + "\n", + "# Train and Deploy a model using Feast\n", + "\n", + "In this notebook we show how to:\n", + "\n", + "1. access a feature store \n", + "1. discover features in the feature store\n", + "1. train a model using the offline store (using the feast function `get_historical_features()`)\n", + "1. use the feast `materialize()` function to push features from the offline store to an online store (redis)\n", + "1. Deploy the model to an Azure ML endpoint where the features are consumed from the online store (feast function `get_online_features()`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to Feature store\n", + "\n", + "Below we create a Feast repository config, which accesses the registry.db file and also provides the credentials to the offline and online storage. These credentials are done via the Azure Keyvault." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "gather": { + "logged": 1627130565121 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import os\n", + "from feast import FeatureStore\n", + "from azureml.core import Workspace\n", + "\n", + "# access key vault to get secrets\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "os.environ['REGISTRY_PATH']=kv.get_secret(\"FEAST-REGISTRY-PATH\")\n", + "os.environ['SQL_CONN']=kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\")\n", + "os.environ['REDIS_CONN']=kv.get_secret(\"FEAST-ONLINE-STORE-CONN\")\n", + "\n", + "# connect to feature store\n", + "fs = FeatureStore(\"./feature_repo\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List the feature views\n", + "\n", + "Below lists the registered feature views." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_feature_views()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "gather": { + "logged": 1627130724228 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load features into a pandas dataframe\n", + "\n", + "Below you load the features from the feature store into a pandas data frame." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "gather": { + "logged": 1626933777036 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "sql_job = fs.get_historical_features(\n", + " entity_df=\"SELECT * FROM orders\",\n", + " features=[\n", + " \"driver_stats:conv_rate\",\n", + " \"driver_stats:acc_rate\",\n", + " \"driver_stats:avg_daily_trips\",\n", + " \"customer_profile:current_balance\",\n", + " \"customer_profile:avg_passenger_count\",\n", + " \"customer_profile:lifetime_trip_count\",\n", + " ],\n", + ")\n", + "\n", + "training_df = sql_job.to_df()\n", + "training_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Train a model and capture metrics with MLFlow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from azureml.core import Workspace\n", + "\n", + "# connect to your workspace\n", + "ws = Workspace.from_config()\n", + "\n", + "# create experiment and start logging to a new run in the experiment\n", + "experiment_name = \"order_model\"\n", + "\n", + "# set up MLflow to track the metrics\n", + "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())\n", + "mlflow.set_experiment(experiment_name)\n", + "mlflow.sklearn.autolog()\n", + "\n", + "training_df = training_df.dropna()\n", + "X = training_df[['conv_rate', 'acc_rate', 'avg_daily_trips', \n", + " 'current_balance', 'avg_passenger_count','lifetime_trip_count' ]].dropna()\n", + "y = training_df['order_is_success']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.33, random_state=42)\n", + "clf = RandomForestClassifier(n_estimators=10)\n", + "\n", + "# train the model\n", + "with mlflow.start_run() as run:\n", + " clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare for deployment\n", + "\n", + "### Register model and the feature registry " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# register the model\n", + "model_uri = \"runs:/{}/model\".format(run.info.run_id)\n", + "model = mlflow.register_model(model_uri, \"order_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `materialize()` data into the online store (redis)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "end_date = datetime.now()\n", + "start_date = end_date - timedelta(days=365)\n", + "fs.materialize(start_date=start_date, end_date=end_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up deployment configuration\n", + "\n", + "__Note: You will need to set up a service principal (SP) and add that SP to your blob storage account as a *Storage Blob Data Contributor* role to authenticate to the storage containing the feast registry file.__\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`az ad sp create-for-rbac -n $sp_name --role \"Storage Blob Data Contributor\" \\\n", + "--scopes /subscriptions/$sub_id/resourceGroups/$rg_name`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have set up the SP, populate the `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET` environment variables below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.environment import Environment\n", + "from azureml.core.webservice import AciWebservice\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "keyvault = ws.get_default_keyvault()\n", + "\n", + "# create deployment config i.e. compute resources\n", + "aciconfig = AciWebservice.deploy_configuration(\n", + " cpu_cores=1,\n", + " memory_gb=1,\n", + " description=\"orders service using feast\",\n", + ")\n", + "\n", + "# get registered environment\n", + "env = Environment(\"feast-env\")\n", + "env.docker.base_image = None\n", + "env.docker.base_dockerfile = \"./inference.dockerfile\"\n", + "env.python.user_managed_dependencies = True\n", + "env.inferencing_stack_version = 'latest'\n", + "env.python.interpreter_path = \"/azureml-envs/feast/bin/python\"\n", + "\n", + "# again ensure that the scoring environment has access to the registry file\n", + "env.environment_variables = {\n", + " \"FEAST_SQL_CONN\": fs.config.offline_store.connection_string,\n", + " \"FEAST_REDIS_CONN\": fs.config.online_store.connection_string,\n", + " \"FEAST_REGISTRY_BLOB\": fs.config.registry.path,\n", + " \"AZURE_CLIENT_ID\": \"PROVIDE YOUR SERVICE PRINCIPLE CLIENT ID HERE\",\n", + " \"AZURE_TENANT_ID\": \"PROVIDE YOUR SERVICE PRINCIPLE TENANT ID HERE\",\n", + " \"AZURE_CLIENT_SECRET\": \"PROVIDE YOUR SERVICE PRINCIPLE CLIENT SECRET HERE\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy model\n", + "\n", + "Next, you deploy the model to Azure Container Instance. Please note that this may take approximately 10 minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "from azureml.core.model import InferenceConfig\n", + "from azureml.core.environment import Environment\n", + "from azureml.core.model import Model\n", + "\n", + "# get the registered model\n", + "model = Model(ws, \"order_model\")\n", + "\n", + "# create an inference config i.e. the scoring script and environment\n", + "inference_config = InferenceConfig(\n", + " entry_script=\"./src/score.py\", \n", + " environment=env, \n", + " source_directory=\"src\"\n", + ")\n", + "\n", + "# deploy the service\n", + "service_name = \"orders-service\" + str(uuid.uuid4())[:4]\n", + "service = Model.deploy(\n", + " workspace=ws,\n", + " name=service_name,\n", + " models=[model],\n", + " inference_config=inference_config,\n", + " deployment_config=aciconfig,\n", + ")\n", + "\n", + "service.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test service\n", + "\n", + "Below you test the service. The first score takes a while as the feast registry file is downloaded from blob. Subsequent runs will be faster as feast uses a local cache for the registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "input_payload = json.dumps({\"driver\":50521, \"customer_id\":20265})\n", + "\n", + "service.run(input_data=input_payload)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "service.delete()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "newenv" + }, + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "vscode": { + "interpreter": { + "hash": "b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/src/score.py b/docs/tutorials/azure/notebooks/src/score.py new file mode 100644 index 00000000000..93b248240d3 --- /dev/null +++ b/docs/tutorials/azure/notebooks/src/score.py @@ -0,0 +1,76 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import logging +import json +import joblib +from feast import FeatureStore, RepoConfig +from feast.infra.registry.registry import RegistryConfig + +from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import MsSqlServerOfflineStoreConfig +from feast.infra.online_stores.redis import RedisOnlineStoreConfig, RedisOnlineStore + + +def init(): + sql_conn_str = os.getenv("FEAST_SQL_CONN") + redis_conn_str = os.getenv("FEAST_REDIS_CONN") + feast_registry_path = os.getenv("FEAST_REGISTRY_BLOB") + + print("connecting to registry...") + reg_config = RegistryConfig( + registry_store_type="azure", + path=feast_registry_path, + ) + + print("connecting to repo config...") + repo_cfg = RepoConfig( + project="production", + provider="azure", + registry=reg_config, + offline_store=MsSqlServerOfflineStoreConfig(connection_string=sql_conn_str), + online_store=RedisOnlineStoreConfig(connection_string=redis_conn_str), + ) + global store + print("connecting to feature store...") + store = FeatureStore(config=repo_cfg) + + global model + # AZUREML_MODEL_DIR is an environment variable created during deployment. 
+ # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) + model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "model/model.pkl") + # deserialize the model file back into a sklearn model + model = joblib.load(model_path) + print("read model, init complete") + + +def run(raw_data): + data = json.loads(raw_data) + feature_vector = store.get_online_features( + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips", + "driver_stats:acc_rate", + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + ], + entity_rows=[data], + ).to_df() + logging.info(feature_vector) + if len(feature_vector.dropna()) > 0: + data = feature_vector[ + [ + "conv_rate", + "avg_daily_trips", + "acc_rate", + "current_balance", + "avg_passenger_count", + "lifetime_trip_count", + ] + ] + + y_hat = model.predict(data) + return y_hat.tolist() + else: + return 0.0 \ No newline at end of file diff --git a/docs/tutorials/azure/sql/create_cx_profile_table.sql b/docs/tutorials/azure/sql/create_cx_profile_table.sql new file mode 100644 index 00000000000..c1cd09c9f35 --- /dev/null +++ b/docs/tutorials/azure/sql/create_cx_profile_table.sql @@ -0,0 +1,14 @@ +CREATE TABLE dbo.customer_profile +( + [datetime] DATETIME2(0), + [customer_id] bigint, + [current_balance] float, + [lifetime_trip_count] bigint, + [avg_passenger_count] float, + [created] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/create_drivers_table.sql b/docs/tutorials/azure/sql/create_drivers_table.sql new file mode 100644 index 00000000000..39b4b1371dc --- /dev/null +++ b/docs/tutorials/azure/sql/create_drivers_table.sql @@ -0,0 +1,14 @@ +CREATE TABLE dbo.driver_hourly +( + [datetime] DATETIME2(0), + [driver_id] bigint, + [avg_daily_trips] float, + [conv_rate] float, + [acc_rate] float, + [created] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + 
CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/create_orders_table.sql b/docs/tutorials/azure/sql/create_orders_table.sql new file mode 100644 index 00000000000..e2325e85f62 --- /dev/null +++ b/docs/tutorials/azure/sql/create_orders_table.sql @@ -0,0 +1,13 @@ +CREATE TABLE dbo.orders +( + [order_id] bigint, + [driver_id] bigint, + [customer_id] bigint, + [order_is_success] int, + [event_timestamp] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/load_cx_profile_data.sql b/docs/tutorials/azure/sql/load_cx_profile_data.sql new file mode 100644 index 00000000000..c3f55f4d72e --- /dev/null +++ b/docs/tutorials/azure/sql/load_cx_profile_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.customer_profile +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/customer_profile.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/docs/tutorials/azure/sql/load_drivers_data.sql b/docs/tutorials/azure/sql/load_drivers_data.sql new file mode 100644 index 00000000000..37aa357b9de --- /dev/null +++ b/docs/tutorials/azure/sql/load_drivers_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.driver_hourly +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/driver_hourly.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/docs/tutorials/azure/sql/load_orders_data.sql b/docs/tutorials/azure/sql/load_orders_data.sql new file mode 100644 index 00000000000..eaa062eac26 --- /dev/null +++ b/docs/tutorials/azure/sql/load_orders_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.orders +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/orders.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/sdk/python/docs/source/feast.protos.feast.serving.rst b/sdk/python/docs/source/feast.protos.feast.serving.rst index 792335b189d..bffb7c8a9fa 100644 --- 
a/sdk/python/docs/source/feast.protos.feast.serving.rst +++ b/sdk/python/docs/source/feast.protos.feast.serving.rst @@ -20,6 +20,22 @@ feast.protos.feast.serving.Connector\_pb2\_grpc module :undoc-members: :show-inheritance: +feast.protos.feast.serving.LoggingService\_pb2 module +----------------------------------------------------- + +.. automodule:: feast.protos.feast.serving.LoggingService_pb2 + :members: + :undoc-members: + :show-inheritance: + +feast.protos.feast.serving.LoggingService\_pb2\_grpc module +----------------------------------------------------------- + +.. automodule:: feast.protos.feast.serving.LoggingService_pb2_grpc + :members: + :undoc-members: + :show-inheritance: + feast.protos.feast.serving.ServingService\_pb2 module ----------------------------------------------------- diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 76b012e5856..19a780b32ca 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -299,7 +299,6 @@ def from_proto(data_source: DataSourceProto) -> Any: if data_source_type == DataSourceProto.SourceType.CUSTOM_SOURCE: cls = get_data_source_class_from_type(data_source.data_source_class_type) return cls.from_proto(data_source) - cls = get_data_source_class_from_type(_DATA_SOURCE_OPTIONS[data_source_type]) return cls.from_proto(data_source) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index a2178d9b28d..23600e7c64f 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -564,9 +564,7 @@ def _validate_all_feature_views( "This API is stable, but the functionality does not scale well for offline retrieval", RuntimeWarning, ) - set_usage_attribute("odfv", bool(odfvs_to_update)) - _validate_feature_views( [ *views_to_update, diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 84e7a1373f4..eefc466bf2f 100644 --- a/sdk/python/feast/inference.py +++ 
b/sdk/python/feast/inference.py @@ -7,6 +7,9 @@ from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_NAME, FeatureView from feast.field import Field, from_value_type from feast.infra.offline_stores.bigquery_source import BigQuerySource +from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import ( + MsSqlServerSource, +) from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource @@ -40,12 +43,14 @@ def update_data_sources_with_inferred_event_timestamp_col( ts_column_type_regex_pattern = "TIMESTAMP[A-Z]*" elif isinstance(data_source, SnowflakeSource): ts_column_type_regex_pattern = "TIMESTAMP_[A-Z]*" + elif isinstance(data_source, MsSqlServerSource): + ts_column_type_regex_pattern = "TIMESTAMP|DATETIME" else: raise RegistryInferenceFailure( "DataSource", f""" DataSource inferencing of timestamp_field is currently only supported - for FileSource, SparkSource, BigQuerySource, RedshiftSource, and SnowflakeSource. + for FileSource, SparkSource, BigQuerySource, RedshiftSource, SnowflakeSource, MsSqlSource. Attempting to infer from {data_source}. """, ) @@ -55,6 +60,7 @@ def update_data_sources_with_inferred_event_timestamp_col( or isinstance(data_source, BigQuerySource) or isinstance(data_source, RedshiftSource) or isinstance(data_source, SnowflakeSource) + or isinstance(data_source, MsSqlServerSource) or "SparkSource" == data_source.__class__.__name__ ) diff --git a/sdk/python/feast/infra/contrib/azure_provider.py b/sdk/python/feast/infra/contrib/azure_provider.py new file mode 100644 index 00000000000..ac56a2b33e2 --- /dev/null +++ b/sdk/python/feast/infra/contrib/azure_provider.py @@ -0,0 +1,72 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+from datetime import datetime +from typing import Callable + +from tqdm import tqdm + +from feast.feature_view import FeatureView +from feast.infra.passthrough_provider import PassthroughProvider +from feast.infra.registry.base_registry import BaseRegistry +from feast.repo_config import RepoConfig +from feast.utils import ( + _convert_arrow_to_proto, + _get_column_names, + _run_pyarrow_field_mapping, +) + +DEFAULT_BATCH_SIZE = 10_000 + + +class AzureProvider(PassthroughProvider): + def materialize_single_feature_view( + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], + ) -> None: + # TODO(kevjumba): untested + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + event_timestamp_column, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=event_timestamp_column, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + table = offline_job.to_arrow() + + if feature_view.batch_source.field_mapping is not None: + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) + + join_keys = {entity.join_key: entity.value_type for entity in entities} + + with tqdm_builder(table.num_rows) as pbar: + for batch in table.to_batches(DEFAULT_BATCH_SIZE): + rows_to_write = _convert_arrow_to_proto(batch, feature_view, join_keys) + self.online_write_batch( + self.repo_config, + feature_view, + rows_to_write, + lambda x: pbar.update(x), + ) diff --git 
a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py new file mode 100644 index 00000000000..8dc5f6c6545 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py @@ -0,0 +1,650 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +import warnings +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +import numpy as np +import pandas +import pyarrow +import pyarrow as pa +import sqlalchemy +from pydantic.types import StrictStr +from pydantic.typing import Literal +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine +from sqlalchemy.orm import sessionmaker + +from feast import FileSource, errors +from feast.data_source import DataSource +from feast.errors import InvalidEntityType +from feast.feature_logging import LoggingConfig, LoggingSource +from feast.feature_view import FeatureView +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.file_source import SavedDatasetFileStorage +from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalMetadata +from feast.infra.offline_stores.offline_utils import ( + DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, + build_point_in_time_query, + get_feature_view_query_context, +) +from feast.infra.provider import RetrievalJob +from feast.infra.registry.base_registry import BaseRegistry +from feast.on_demand_feature_view import OnDemandFeatureView +from feast.repo_config import FeastBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import pa_to_mssql_type +from 
feast.usage import log_exceptions_and_usage + +# Make sure warning doesn't raise more than once. +warnings.simplefilter("once", RuntimeWarning) + +EntitySchema = Dict[str, np.dtype] + + +class MsSqlServerOfflineStoreConfig(FeastBaseModel): + """Offline store config for SQL Server""" + + type: Literal["mssql"] = "mssql" + """ Offline store type selector""" + + connection_string: StrictStr = "mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server" + """Connection string containing the host, port, and configuration parameters for SQL Server + format: SQLAlchemy connection string, e.g. mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server""" + + +def make_engine(config: MsSqlServerOfflineStoreConfig) -> Engine: + return create_engine(config.connection_string) + + +class MsSqlServerOfflineStore(OfflineStore): + """ + Microsoft SQL Server based offline store, supporting Azure Synapse or Azure SQL. + + Note: to use this, you'll need to have Microsoft ODBC 17 installed. + See https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15#17 + """ + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + assert type(data_source).__name__ == "MsSqlServerSource" + from_expression = data_source.get_table_query_string().replace("`", "") + + partition_by_join_key_string = ", ".join(join_key_columns) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamps = [timestamp_field] + if created_timestamp_column: + timestamps.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) + + query = f""" + SELECT {field_string} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row + FROM {from_expression} inner_t + WHERE {timestamp_field} BETWEEN CONVERT(DATETIMEOFFSET, '{start_date}', 120) AND CONVERT(DATETIMEOFFSET, '{end_date}', 120) + ) outer_t + WHERE outer_t._feast_row = 1 + """ + engine = make_engine(config.offline_store) + + return MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert type(data_source).__name__ == "MsSqlServerSource" + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + from_expression = data_source.get_table_query_string().replace("`", "") + timestamps = [timestamp_field] + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) + + query = f""" + SELECT {field_string} + FROM ( + SELECT {field_string} + FROM {from_expression} + WHERE {timestamp_field} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' + ) + """ + engine = make_engine(config.offline_store) + + return MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + + expected_join_keys = _get_join_keys(project, feature_views, registry) + assert isinstance(config.offline_store, MsSqlServerOfflineStoreConfig) + engine = make_engine(config.offline_store) + if isinstance(entity_df, pandas.DataFrame): + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df( + dict(zip(list(entity_df.columns), list(entity_df.dtypes))) + ) + ) + entity_df[entity_df_event_timestamp_col] = pandas.to_datetime( + entity_df[entity_df_event_timestamp_col], utc=True + ).fillna(pandas.Timestamp.now()) + + elif isinstance(entity_df, str): + raise ValueError( + "string entities are currently not supported in the MsSQL offline store." 
+ ) + ( + table_schema, + table_name, + ) = _upload_entity_df_into_sqlserver_and_get_entity_schema( + engine, config, entity_df, full_feature_names=full_feature_names + ) + + _assert_expected_columns_in_sqlserver( + expected_join_keys, + entity_df_event_timestamp_col, + table_schema, + ) + + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, + entity_df_event_timestamp_col, + engine, + ) + + # Build a query context containing all information required to template the SQL query + query_context = get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_timestamp_range=entity_df_event_timestamp_range, + ) + + # Generate the SQL query from the query context + query = build_point_in_time_query( + query_context, + left_table_query_string=table_name, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + entity_df_columns=table_schema.keys(), + full_feature_names=full_feature_names, + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + ) + query = query.replace("`", "") + + job = MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=full_feature_names, + on_demand_feature_views=registry.list_on_demand_feature_views(project), + ) + return job + + @staticmethod + def write_logged_features( + config: RepoConfig, + data: Union[pyarrow.Table, Path], + source: LoggingSource, + logging_config: LoggingConfig, + registry: BaseRegistry, + ): + raise NotImplementedError() + + @staticmethod + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + table: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): + raise NotImplementedError() + + +def _assert_expected_columns_in_dataframe( + join_keys: Set[str], entity_df_event_timestamp_col: str, entity_df: pandas.DataFrame +): + entity_df_columns = set(entity_df.columns.values) + expected_columns = join_keys.copy() + 
def _assert_expected_columns_in_dataframe(
    join_keys: Set[str], entity_df_event_timestamp_col: str, entity_df: pandas.DataFrame
):
    """Raise FeastEntityDFMissingColumnsError if entity_df lacks a join key or the timestamp column."""
    entity_df_columns = set(entity_df.columns.values)
    expected_columns = join_keys.copy()
    expected_columns.add(entity_df_event_timestamp_col)

    missing_keys = expected_columns - entity_df_columns

    if len(missing_keys) != 0:
        raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys)


def _assert_expected_columns_in_sqlserver(
    join_keys: Set[str],
    entity_df_event_timestamp_col: str,
    table_schema: "EntitySchema",
):
    """Raise FeastEntityDFMissingColumnsError if the uploaded entity table lacks a required column."""
    entity_columns = set(table_schema.keys())
    expected_columns = join_keys.copy()
    expected_columns.add(entity_df_event_timestamp_col)

    missing_keys = expected_columns - entity_columns

    if len(missing_keys) != 0:
        raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys)


def _get_join_keys(
    project: str, feature_views: "List[FeatureView]", registry: "BaseRegistry"
) -> Set[str]:
    """Collect the join keys of every entity referenced by the given feature views."""
    join_keys = set()
    for feature_view in feature_views:
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_keys.add(entity.join_key)
    return join_keys


def _infer_event_timestamp_from_sqlserver_schema(table_schema) -> str:
    """Pick the event-timestamp column from an INFORMATION_SCHEMA-style row list.

    ``table_schema`` is an iterable of dict-like rows with "COLUMN_NAME" and
    "DATA_TYPE" keys. Prefers a column named exactly
    DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL; otherwise accepts a single
    DATETIMEOFFSET column; otherwise raises ValueError.
    """
    if any(
        schema_field["COLUMN_NAME"] == DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        for schema_field in table_schema
    ):
        return DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL

    datetime_columns = [
        schema_field
        for schema_field in table_schema
        if schema_field["DATA_TYPE"] == "DATETIMEOFFSET"
    ]
    if len(datetime_columns) == 1:
        print(
            f"Using {datetime_columns[0]['COLUMN_NAME']} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
        )
        # Fix: rows are dicts, so index by key; the original returned
        # `datetime_columns[0].name`, which raises AttributeError on a dict.
        return datetime_columns[0]["COLUMN_NAME"]
    raise ValueError(
        f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
    )
class MsSqlServerRetrievalJob(RetrievalJob):
    """RetrievalJob that runs a T-SQL query through a SQLAlchemy engine."""

    def __init__(
        self,
        query: str,
        engine: Engine,
        config: MsSqlServerOfflineStoreConfig,
        full_feature_names: bool,
        on_demand_feature_views: Optional[List[OnDemandFeatureView]],
        metadata: Optional[RetrievalMetadata] = None,
        drop_columns: Optional[List[str]] = None,
    ):
        self.query = query
        self.engine = engine
        self._config = config
        self._full_feature_names = full_feature_names
        self._on_demand_feature_views = on_demand_feature_views or []
        self._drop_columns = drop_columns
        self._metadata = metadata

    @property
    def full_feature_names(self) -> bool:
        return self._full_feature_names

    @property
    def on_demand_feature_views(self) -> List[OnDemandFeatureView]:
        return self._on_demand_feature_views

    def _to_df_internal(self) -> pandas.DataFrame:
        # fillna keeps SQL NULLs consistent with the other offline stores.
        return pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan)

    def _to_arrow_internal(self) -> pyarrow.Table:
        result = pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan)
        return pyarrow.Table.from_pandas(result)

    ## Implements persist in Feast 0.18 - This persists to filestorage
    ## ToDo: Persist to Azure Storage
    def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False):
        """Write the retrieval result to the file destination described by ``storage``."""
        assert isinstance(storage, SavedDatasetFileStorage)

        # `import pyarrow` alone does not load the `pyarrow.parquet` submodule,
        # so import it explicitly before the attribute accesses below.
        import pyarrow.parquet

        filesystem, path = FileSource.create_filesystem_and_path(
            storage.file_options.uri,
            storage.file_options.s3_endpoint_override,
        )

        if path.endswith(".parquet"):
            pyarrow.parquet.write_table(
                self.to_arrow(), where=path, filesystem=filesystem
            )
        else:
            # otherwise assume destination is directory
            pyarrow.parquet.write_to_dataset(
                self.to_arrow(), root_path=path, filesystem=filesystem
            )

    def supports_remote_storage_export(self) -> bool:
        return False

    def to_remote_storage(self) -> List[str]:
        raise NotImplementedError()

    @property
    def metadata(self) -> Optional[RetrievalMetadata]:
        return self._metadata
_upload_entity_df_into_sqlserver_and_get_entity_schema( + engine: sqlalchemy.engine.Engine, + config: RepoConfig, + entity_df: Union[pandas.DataFrame, str], + full_feature_names: bool, +) -> Tuple[Dict[Any, Any], str]: + """ + Uploads a Pandas entity dataframe into a SQL Server table and constructs the + schema from the original entity_df dataframe. + """ + table_id = offline_utils.get_temp_entity_table_name() + session = sessionmaker(bind=engine)() + + if type(entity_df) is str: + # TODO: This should be a temporary table, right? + session.execute(f"SELECT * INTO {table_id} FROM ({entity_df}) t") # type: ignore + + session.commit() + + limited_entity_df = MsSqlServerRetrievalJob( + f"SELECT TOP 1 * FROM {table_id}", + engine, + config.offline_store, + full_feature_names=full_feature_names, + on_demand_feature_views=None, + ).to_df() + + entity_schema = ( + dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)), + table_id, + ) + + elif isinstance(entity_df, pandas.DataFrame): + # Drop the index so that we don't have unnecessary columns + engine.execute(_df_to_create_table_sql(entity_df, table_id)) + entity_df.to_sql(name=table_id, con=engine, index=False, if_exists="append") + entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)), table_id + + else: + raise ValueError( + f"The entity dataframe you have provided must be a SQL Server SQL query," + f" or a Pandas dataframe. 
But we found: {type(entity_df)} " + ) + + return entity_schema + + +def _df_to_create_table_sql(df: pandas.DataFrame, table_name: str) -> str: + pa_table = pa.Table.from_pandas(df) + + columns = [f""""{f.name}" {pa_to_mssql_type(f.type)}""" for f in pa_table.schema] + + return f""" + CREATE TABLE "{table_name}" ( + {", ".join(columns)} + ); + """ + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pandas.DataFrame, str], + entity_df_event_timestamp_col: str, + engine: Engine, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pandas.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pandas.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pandas.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + df = pandas.read_sql(entity_df, con=engine).fillna(value=np.nan) + entity_df_event_timestamp = df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pandas.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pandas.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +# TODO: Optimizations +# * Use NEWID() instead of ROW_NUMBER(), or join on entity columns directly +# * Precompute ROW_NUMBER() so that it doesn't have to be recomputed for every query on entity_dataframe +# * Create temporary tables instead of keeping all tables in memory + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """ +/* + Compute a 
deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +WITH entity_dataframe AS ( + SELECT *, + {{entity_df_event_timestamp_col}} AS entity_timestamp + {% for featureview in featureviews %} + ,CONCAT( + {% for entity_key in unique_entity_keys %} + {{entity_key}}, + {% endfor %} + {{entity_df_event_timestamp_col}} + ) AS {{featureview.name}}__entity_row_unique_id + {% endfor %} + FROM {{ left_table_query_string }} +), + +{% for featureview in featureviews %} + +{{ featureview.name }}__entity_dataframe AS ( + SELECT + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id + FROM entity_dataframe + GROUP BY + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id +), + +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + + 1. We first join the current feature_view to the entity dataframe that has been passed. + This JOIN has the following logic: + - For each row of the entity dataframe, only keep the rows where the timestamp_field` + is less than the one provided in the entity dataframe + - If there a TTL for the current feature_view, also keep the rows where the `timestamp_field` + is higher the the one provided minus the TTL + - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been + computed previously + + The output of this CTE will contain all the necessary information and already filtered out most + of the data that is not relevant. 
+*/ + +{{ featureview.name }}__subquery AS ( + SELECT + {{ featureview.timestamp_field }} as event_timestamp, + {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} + {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %} + {% for feature in featureview.features %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {% endfor %} + FROM {{ featureview.table_subquery }} + WHERE {{ featureview.timestamp_field }} <= '{{ featureview.max_event_timestamp }}' + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.timestamp_field }} >= '{{ featureview.min_event_timestamp }}' + {% endif %} +), + +{{ featureview.name }}__base AS ( + SELECT + subquery.*, + entity_dataframe.{{entity_df_event_timestamp_col}} AS entity_timestamp, + entity_dataframe.{{featureview.name}}__entity_row_unique_id + FROM {{ featureview.name }}__subquery AS subquery + INNER JOIN entity_dataframe + ON 1=1 + AND subquery.event_timestamp <= entity_dataframe.{{entity_df_event_timestamp_col}} + + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.ttl }} > = DATEDIFF(SECOND, subquery.event_timestamp, entity_dataframe.{{entity_df_event_timestamp_col}}) + {% endif %} + + {% for entity in featureview.entities %} + AND subquery.{{ entity }} = entity_dataframe.{{ entity }} + {% endfor %} +), + +/* + 2. If the `created_timestamp_column` has been set, we need to + deduplicate the data first. This is done by calculating the + `MAX(created_at_timestamp)` for each event_timestamp. 
+ We then join the data on the next CTE +*/ +{% if featureview.created_timestamp_column %} +{{ featureview.name }}__dedup AS ( + SELECT + {{featureview.name}}__entity_row_unique_id, + event_timestamp, + MAX(created_timestamp) as created_timestamp + FROM {{ featureview.name }}__base + GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp +), +{% endif %} + +/* + 3. The data has been filtered during the first CTE "*__base" + Thus we only need to compute the latest timestamp of each feature. +*/ +{{ featureview.name }}__latest AS ( + SELECT + {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id, + MAX({{ featureview.name }}__base.event_timestamp) AS event_timestamp + {% if featureview.created_timestamp_column %} + ,MAX({{ featureview.name }}__base.created_timestamp) AS created_timestamp + {% endif %} + + FROM {{ featureview.name }}__base + {% if featureview.created_timestamp_column %} + INNER JOIN {{ featureview.name }}__dedup + ON {{ featureview.name }}__dedup.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id + AND {{ featureview.name }}__dedup.event_timestamp = {{ featureview.name }}__base.event_timestamp + AND {{ featureview.name }}__dedup.created_timestamp = {{ featureview.name }}__base.created_timestamp + {% endif %} + + GROUP BY {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id +), + +/* + 4. 
Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +{{ featureview.name }}__cleaned AS ( + SELECT base.* + FROM {{ featureview.name }}__base as base + INNER JOIN {{ featureview.name }}__latest + ON base.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__latest.{{ featureview.name }}__entity_row_unique_id + AND base.event_timestamp = {{ featureview.name }}__latest.event_timestamp + {% if featureview.created_timestamp_column %} + AND base.created_timestamp = {{ featureview.name }}__latest.created_timestamp + {% endif %} +){% if loop.last %}{% else %}, {% endif %} + +{% endfor %} + +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. + */ + +SELECT {{ final_output_feature_names | join(', ')}} +FROM entity_dataframe +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + {{featureview.name}}__entity_row_unique_id + {% for feature in featureview.features %} + ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %} + {% endfor %} + FROM "{{ featureview.name }}__cleaned" +) {{ featureview.name }}__cleaned +ON +{{ featureview.name }}__cleaned.{{ featureview.name }}__entity_row_unique_id = entity_dataframe.{{ featureview.name }}__entity_row_unique_id +{% endfor %} +""" diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py new file mode 100644 index 00000000000..6b126fa40c0 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py @@ -0,0 +1,252 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
import json
import warnings
from typing import Callable, Dict, Iterable, Optional, Tuple

import pandas
from sqlalchemy import create_engine

from feast import type_map
from feast.data_source import DataSource
from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import (
    MsSqlServerOfflineStoreConfig,
)
from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto
from feast.repo_config import RepoConfig
from feast.value_type import ValueType

# Make sure azure warning doesn't raise more than once.
warnings.simplefilter("once", RuntimeWarning)


class MsSqlServerOptions:
    """
    DataSource MsSQLServer options used to source features from an MsSQLServer query.
    """

    def __init__(
        self,
        connection_str: Optional[str],
        table_ref: Optional[str],
    ):
        # SQLAlchemy connection string and fully-qualified table reference.
        self._connection_str = connection_str
        self._table_ref = table_ref

    @property
    def table_ref(self):
        """Returns the table ref of this SQL Server source."""
        return self._table_ref

    @table_ref.setter
    def table_ref(self, table_ref):
        """Sets the table ref of this SQL Server source."""
        self._table_ref = table_ref

    @property
    def connection_str(self):
        """Returns the SqlServer SQL connection string referenced by this source."""
        return self._connection_str

    @connection_str.setter
    def connection_str(self, connection_str):
        """Sets the SqlServer SQL connection string referenced by this source."""
        self._connection_str = connection_str

    @classmethod
    def from_proto(
        cls, sqlserver_options_proto: "DataSourceProto.CustomSourceOptions"
    ) -> "MsSqlServerOptions":
        """
        Creates an MsSqlServerOptions from a protobuf representation of a SqlServer option.

        Args:
            sqlserver_options_proto: A protobuf representation of a DataSource
        Returns:
            Returns an MsSqlServerOptions object based on the sqlserver_options protobuf
        """
        options = json.loads(sqlserver_options_proto.configuration)

        sqlserver_options = cls(
            table_ref=options["table_ref"],
            # Fix: to_proto() below (and MsSqlServerSource.from_proto) serialize
            # this field under the key "connection_string"; the original read
            # "connection_str", which raised KeyError on every proto round-trip.
            connection_str=options["connection_string"],
        )

        return sqlserver_options

    def to_proto(self) -> "DataSourceProto.CustomSourceOptions":
        """
        Converts an MsSqlServerOptions object to its protobuf representation.

        Returns:
            CustomSourceOptions protobuf
        """
        sqlserver_options_proto = DataSourceProto.CustomSourceOptions(
            configuration=json.dumps(
                {
                    "table_ref": self._table_ref,
                    "connection_string": self._connection_str,
                }
            ).encode("utf-8")
        )

        return sqlserver_options_proto


class MsSqlServerSource(DataSource):
    def __init__(
        self,
        name: str,
        table_ref: Optional[str] = None,
        event_timestamp_column: Optional[str] = None,
        created_timestamp_column: Optional[str] = "",
        field_mapping: Optional[Dict[str, str]] = None,
        date_partition_column: Optional[str] = "",
        connection_str: Optional[str] = "",
        description: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        owner: Optional[str] = None,
    ):
        warnings.warn(
            "The Azure Synapse + Azure SQL data source is an experimental feature in alpha development. "
            "Some functionality may still be unstable so functionality can change in the future.",
            RuntimeWarning,
        )
        self._mssqlserver_options = MsSqlServerOptions(
            connection_str=connection_str, table_ref=table_ref
        )
        self._connection_str = connection_str

        super().__init__(
            created_timestamp_column=created_timestamp_column,
            field_mapping=field_mapping,
            date_partition_column=date_partition_column,
            description=description,
            tags=tags,
            owner=owner,
            name=name,
            timestamp_field=event_timestamp_column,
        )

    def __eq__(self, other):
        if not isinstance(other, MsSqlServerSource):
            raise TypeError(
                "Comparisons should only involve SqlServerSource class objects."
            )
+ ) + + return ( + self.name == other.name + and self.mssqlserver_options.connection_str + == other.mssqlserver_options.connection_str + and self.timestamp_field == other.timestamp_field + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + def __hash__(self): + return hash( + ( + self.name, + self.mssqlserver_options.connection_str, + self.timestamp_field, + self.created_timestamp_column, + ) + ) + + @property + def table_ref(self): + return self._mssqlserver_options.table_ref + + @property + def mssqlserver_options(self): + """ + Returns the SQL Server options of this data source + """ + return self._mssqlserver_options + + @mssqlserver_options.setter + def mssqlserver_options(self, sqlserver_options): + """ + Sets the SQL Server options of this data source + """ + self._mssqlserver_options = sqlserver_options + + @staticmethod + def from_proto(data_source: DataSourceProto): + options = json.loads(data_source.custom_options.configuration) + return MsSqlServerSource( + name=data_source.name, + field_mapping=dict(data_source.field_mapping), + table_ref=options["table_ref"], + connection_str=options["connection_string"], + event_timestamp_column=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source.MsSqlServerSource", + field_mapping=self.field_mapping, + custom_options=self.mssqlserver_options.to_proto(), + ) + + data_source_proto.timestamp_field = self.timestamp_field + data_source_proto.created_timestamp_column = self.created_timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + data_source_proto.name = self.name + return data_source_proto 
+ + def get_table_query_string(self) -> str: + """Returns a string that can directly be used to reference this table in SQL""" + return f"`{self.table_ref}`" + + def validate(self, config: RepoConfig): + # As long as the query gets successfully executed, or the table exists, + # the data source is validated. We don't need the results though. + self.get_table_column_names_and_types(config) + return None + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return type_map.mssql_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + assert isinstance(config.offline_store, MsSqlServerOfflineStoreConfig) + conn = create_engine(config.offline_store.connection_string) + self._mssqlserver_options.connection_str = ( + config.offline_store.connection_string + ) + name_type_pairs = [] + if len(self.table_ref.split(".")) == 2: + database, table_name = self.table_ref.split(".") + columns_query = f""" + SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_NAME = '{table_name}' and table_schema = '{database}' + """ + else: + columns_query = f""" + SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_NAME = '{self.table_ref}' + """ + + table_schema = pandas.read_sql(columns_query, conn) + name_type_pairs.extend( + list( + zip( + table_schema["COLUMN_NAME"].to_list(), + table_schema["DATA_TYPE"].to_list(), + ) + ) + ) + return name_type_pairs diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py new file mode 100644 index 00000000000..ae7affc838a --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py @@ -0,0 +1 @@ +from .data_source import mssql_container # noqa diff --git 
from typing import Dict, List

import pandas as pd
import pytest
from sqlalchemy import create_engine
from testcontainers.core.waiting_utils import wait_for_logs
from testcontainers.mssql import SqlServerContainer

from feast.data_source import DataSource
from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import (
    MsSqlServerOfflineStoreConfig,
    _df_to_create_table_sql,
)
from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import (
    MsSqlServerSource,
)
from feast.saved_dataset import SavedDatasetStorage
from tests.integration.feature_repos.universal.data_source_creator import (
    DataSourceCreator,
)

MSSQL_USER = "SA"
MSSQL_PASSWORD = "yourStrong(!)Password"


@pytest.fixture(scope="session")
def mssql_container():
    """Session-scoped azure-sql-edge container serving as the SQL Server backend."""
    container = SqlServerContainer(
        user=MSSQL_USER,
        password=MSSQL_PASSWORD,
        image="mcr.microsoft.com/azure-sql-edge:1.0.6",
    )
    container.start()
    # The server only accepts connections once the Service Broker has started.
    log_string_to_wait_for = "Service Broker manager has started"
    wait_for_logs(container=container, predicate=log_string_to_wait_for, timeout=30)

    yield container
    container.stop()


class MsSqlDataSourceCreator(DataSourceCreator):
    """Creates MsSqlServerSource-backed test data sources inside the test container."""

    def __init__(
        self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs
    ):
        super().__init__(project_name)
        # Per-instance list of tables this creator has uploaded. The original
        # kept this on the class, which is shared by every instance and leaks
        # state across tests (it also tracked an unused `tables_created`).
        self.tables: List[str] = []
        self.container = fixture_request.getfixturevalue("mssql_container")

        if not self.container:
            raise RuntimeError(
                "In order to use this data source "
                "'feast.infra.offline_stores.contrib.mssql_offline_store.tests' "
                "must be included in pytest plugins"
            )

    def create_offline_store_config(self) -> MsSqlServerOfflineStoreConfig:
        return MsSqlServerOfflineStoreConfig(
            connection_string=self.container.get_connection_url(),
        )

    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: str,
        timestamp_field="ts",
        created_timestamp_column="created_ts",
        field_mapping: Dict[str, str] = None,
        **kwargs,
    ) -> DataSource:
        """Upload `df` to the container and return a source pointing at it."""
        # Make sure the field mapping is correct and convert the datetime datasources.
        if timestamp_field in df:
            df[timestamp_field] = pd.to_datetime(df[timestamp_field], utc=True).fillna(
                pd.Timestamp.now()
            )
        if created_timestamp_column in df:
            df[created_timestamp_column] = pd.to_datetime(
                df[created_timestamp_column], utc=True
            ).fillna(pd.Timestamp.now())

        connection_string = self.create_offline_store_config().connection_string
        engine = create_engine(connection_string)
        destination_name = self.get_prefixed_table_name(destination_name)
        # Create table
        engine.execute(_df_to_create_table_sql(df, destination_name))

        # Upload dataframe to azure table
        df.to_sql(destination_name, engine, index=False, if_exists="append")

        self.tables.append(destination_name)
        return MsSqlServerSource(
            name="ci_mssql_source",
            connection_str=connection_string,
            table_ref=destination_name,
            event_timestamp_column=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            field_mapping=field_mapping or {"ts_1": "ts"},
        )

    def create_saved_dataset_destination(self) -> SavedDatasetStorage:
        # Saved-dataset persistence is not exercised for this store yet.
        pass

    def get_prefixed_table_name(self, destination_name: str) -> str:
        return f"{self.project_name}_{destination_name}"

    def teardown(self):
        """Drop the tables this creator uploaded (the original tracked them but
        never cleaned them up)."""
        engine = create_engine(self.create_offline_store_config().connection_string)
        for table in self.tables:
            # DROP TABLE IF EXISTS is supported by SQL Server 2016+/azure-sql-edge.
            engine.execute(f'DROP TABLE IF EXISTS "{table}"')
        self.tables.clear()
b/sdk/python/feast/infra/offline_stores/contrib/mssql_repo_configuration.py @@ -0,0 +1,13 @@ +from feast.infra.offline_stores.contrib.mssql_offline_store.tests.data_source import ( + MsSqlDataSourceCreator, +) +from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG +from tests.integration.feature_repos.universal.online_store.redis import ( + RedisOnlineStoreCreator, +) + +AVAILABLE_OFFLINE_STORES = [ + ("local", MsSqlDataSourceCreator), +] + +AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py index 241627ba01e..50f92164ccf 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake.py +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -19,7 +19,6 @@ import numpy as np import pandas as pd import pyarrow -import pyarrow as pa from pydantic import Field, StrictStr from pydantic.typing import Literal from pytz import utc @@ -410,7 +409,7 @@ def _to_df_internal(self) -> pd.DataFrame: return df - def _to_arrow_internal(self) -> pa.Table: + def _to_arrow_internal(self) -> pyarrow.Table: with self._query_generator() as query: pa_table = execute_snowflake_statement( @@ -423,7 +422,7 @@ def _to_arrow_internal(self) -> pa.Table: else: empty_result = execute_snowflake_statement(self.snowflake_conn, query) - return pa.Table.from_pandas( + return pyarrow.Table.from_pandas( pd.DataFrame(columns=[md.name for md in empty_result.description]) ) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index bf2a4ec7bbc..7d3c37e4c2e 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -24,6 +24,7 @@ "gcp": "feast.infra.gcp.GcpProvider", "aws": "feast.infra.aws.AwsProvider", "local": "feast.infra.local.LocalProvider", + "azure": "feast.infra.contrib.azure_provider.AzureProvider", } diff --git 
a/sdk/python/feast/infra/registry/contrib/azure/__init__.py b/sdk/python/feast/infra/registry/contrib/azure/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py b/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py new file mode 100644 index 00000000000..9c00170b0f6 --- /dev/null +++ b/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import uuid +from datetime import datetime +from pathlib import Path +from tempfile import TemporaryFile +from urllib.parse import urlparse + +from feast.infra.registry.registry import RegistryConfig +from feast.infra.registry.registry_store import RegistryStore +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto + +REGISTRY_SCHEMA_VERSION = "1" + + +class AzBlobRegistryStore(RegistryStore): + def __init__(self, registry_config: RegistryConfig, repo_path: Path): + try: + import logging + + from azure.identity import DefaultAzureCredential + from azure.storage.blob import BlobServiceClient + except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("az", str(e)) + + self._uri = urlparse(registry_config.path) + self._account_url = self._uri.scheme + "://" + self._uri.netloc + container_path = self._uri.path.lstrip("/").split("/") + self._container = container_path.pop(0) + self._path = "/".join(container_path) + + try: + # turn the verbosity of the blob client to warning and above (this reduces verbosity) + logger = logging.getLogger("azure") + logger.setLevel(logging.ERROR) + + # Attempt to use shared account key to login first + if "REGISTRY_BLOB_KEY" in os.environ: + client = BlobServiceClient( + account_url=self._account_url, + credential=os.environ["REGISTRY_BLOB_KEY"], + ) + self.blob = 
client.get_blob_client( + container=self._container, blob=self._path + ) + return + + default_credential = DefaultAzureCredential( + exclude_shared_token_cache_credential=True + ) + + client = BlobServiceClient( + account_url=self._account_url, credential=default_credential + ) + self.blob = client.get_blob_client( + container=self._container, blob=self._path + ) + except Exception as e: + print( + f"Could not connect to blob. Check the following\nIs the URL specified correctly?\nIs you IAM role set to Storage Blob Data Contributor? \n Errored out with exception {e}" + ) + + return + + def get_registry_proto(self): + file_obj = TemporaryFile() + registry_proto = RegistryProto() + + if self.blob.exists(): + download_stream = self.blob.download_blob() + file_obj.write(download_stream.readall()) + + file_obj.seek(0) + registry_proto.ParseFromString(file_obj.read()) + return registry_proto + raise FileNotFoundError( + f'Registry not found at path "{self._uri.geturl()}". Have you run "feast apply"?' 
+ ) + + def update_registry_proto(self, registry_proto: RegistryProto): + self._write_registry(registry_proto) + + def teardown(self): + self.blob.delete_blob() + + def _write_registry(self, registry_proto: RegistryProto): + registry_proto.version_id = str(uuid.uuid4()) + registry_proto.last_updated.FromDatetime(datetime.utcnow()) + + file_obj = TemporaryFile() + file_obj.write(registry_proto.SerializeToString()) + file_obj.seek(0) + self.blob.upload_blob(file_obj, overwrite=True) # type: ignore + return diff --git a/sdk/python/feast/infra/registry/registry.py b/sdk/python/feast/infra/registry/registry.py index 221b44141a7..09d22ee3766 100644 --- a/sdk/python/feast/infra/registry/registry.py +++ b/sdk/python/feast/infra/registry/registry.py @@ -61,6 +61,7 @@ "S3RegistryStore": "feast.infra.registry.s3.S3RegistryStore", "FileRegistryStore": "feast.infra.registry.file.FileRegistryStore", "PostgreSQLRegistryStore": "feast.infra.registry.contrib.postgres.postgres_registry_store.PostgreSQLRegistryStore", + "AzureRegistryStore": "feast.infra.registry.contrib.azure.azure_registry_store.AzBlobRegistryStore", } REGISTRY_STORE_CLASS_FOR_SCHEME = { diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 118c1ca872f..9ab25563677 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -61,6 +61,7 @@ "trino": "feast.infra.offline_stores.contrib.trino_offline_store.trino.TrinoOfflineStore", "postgres": "feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStore", "athena": "feast.infra.offline_stores.contrib.athena_offline_store.athena.AthenaOfflineStore", + "mssql": "feast.infra.offline_stores.contrib.mssql_offline_store.mssql.MsSqlServerOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { @@ -174,6 +175,8 @@ def __init__(self, **data: Any): self._offline_config = "bigquery" elif data["provider"] == "aws": self._offline_config = "redshift" + elif data["provider"] == "azure": + 
self._offline_config = "mssql" self._online_store = None if "online_store" in data: @@ -334,6 +337,8 @@ def _validate_offline_store_config(cls, values): values["offline_store"]["type"] = "bigquery" elif values["provider"] == "aws": values["offline_store"]["type"] = "redshift" + if values["provider"] == "azure": + values["offline_store"]["type"] = "mssql" offline_store_type = values["offline_store"]["type"] diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index f8292b9c0da..2cb1c4fefb0 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -49,7 +49,6 @@ if TYPE_CHECKING: import pyarrow - # null timestamps get converted to -9223372036854775808 NULL_TIMESTAMP_INT_VALUE = np.datetime64("NaT").astype(int) @@ -533,6 +532,73 @@ def bq_to_feast_value_type(bq_type_as_str: str) -> ValueType: return value_type +def mssql_to_feast_value_type(mssql_type_as_str: str) -> ValueType: + type_map = { + "bigint": ValueType.FLOAT, + "binary": ValueType.BYTES, + "bit": ValueType.BOOL, + "char": ValueType.STRING, + "date": ValueType.UNIX_TIMESTAMP, + "datetime": ValueType.UNIX_TIMESTAMP, + "float": ValueType.FLOAT, + "nchar": ValueType.STRING, + "nvarchar": ValueType.STRING, + "nvarchar(max)": ValueType.STRING, + "real": ValueType.FLOAT, + "smallint": ValueType.INT32, + "tinyint": ValueType.INT32, + "varbinary": ValueType.BYTES, + "varchar": ValueType.STRING, + "None": ValueType.NULL, + # skip date, geometry, hllsketch, time, timetz + } + if mssql_type_as_str.lower() not in type_map: + raise ValueError(f"Mssql type not supported by feast {mssql_type_as_str}") + return type_map[mssql_type_as_str.lower()] + + +def pa_to_mssql_type(pa_type: "pyarrow.DataType") -> str: + # PyArrow types: https://arrow.apache.org/docs/python/api/datatypes.html + # MS Sql types: https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver16 + pa_type_as_str = str(pa_type).lower() + if pa_type_as_str.startswith("timestamp"): 
+ if "tz=" in pa_type_as_str: + return "datetime2" + else: + return "datetime" + + if pa_type_as_str.startswith("date"): + return "date" + + if pa_type_as_str.startswith("decimal"): + return pa_type_as_str + + # We have to take into account how arrow types map to parquet types as well. + # For example, null type maps to int32 in parquet, so we have to use int4 in Redshift. + # Other mappings have also been adjusted accordingly. + type_map = { + "null": "None", + "bool": "bit", + "int8": "tinyint", + "int16": "smallint", + "int32": "int", + "int64": "bigint", + "uint8": "tinyint", + "uint16": "smallint", + "uint32": "int", + "uint64": "bigint", + "float": "float", + "double": "real", + "binary": "binary", + "string": "varchar", + } + + if pa_type_as_str.lower() not in type_map: + raise ValueError(f"MS SQL Server type not supported by feast {pa_type_as_str}") + + return type_map[pa_type_as_str] + + def redshift_to_feast_value_type(redshift_type_as_str: str) -> ValueType: # Type names from https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html type_map = { diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index a8c24947b09..9d10b2c3132 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -65,9 +65,13 @@ azure-core==1.25.0 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs + # via + # adlfs + # feast (setup.py) azure-storage-blob==12.13.1 - # via adlfs + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -144,6 +148,7 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 @@ -180,7 +185,7 @@ execnet==1.9.0 # via pytest-xdist executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -272,6 +277,8 @@ 
googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -338,7 +345,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.12.1 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ -548,6 +555,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -665,6 +676,7 @@ six==1.16.0 # google-auth-httplib2 # grpcio # happybase + # isodate # kubernetes # mock # msrestazure @@ -751,19 +763,19 @@ types-protobuf==3.19.22 # mypy-protobuf types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.8 +types-requests==2.28.9 # via feast (setup.py) -types-setuptools==64.0.1 +types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.22 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via diff --git a/sdk/python/requirements/py3.10-requirements.txt b/sdk/python/requirements/py3.10-requirements.txt index 43e814b6685..ac12befb87c 100644 --- a/sdk/python/requirements/py3.10-requirements.txt +++ b/sdk/python/requirements/py3.10-requirements.txt @@ -38,7 +38,7 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -57,6 +57,8 @@ googleapis-common-protos==1.56.4 # feast (setup.py) # google-api-core # tensorflow-metadata +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -73,7 +75,7 @@ idna==3.3 # requests jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.12.1 
+jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index 7b0c13f23bd..93011cfdcf4 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -65,9 +65,13 @@ azure-core==1.25.0 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs + # via + # adlfs + # feast (setup.py) azure-storage-blob==12.13.1 - # via adlfs + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -148,6 +152,7 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 @@ -184,7 +189,7 @@ execnet==1.9.0 # via pytest-xdist executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -276,6 +281,8 @@ googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -344,7 +351,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.12.1 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ -556,6 +563,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -675,6 +686,7 @@ six==1.16.0 # google-auth-httplib2 # grpcio # happybase + # isodate # kubernetes # mock # msrestazure @@ -761,19 +773,19 @@ types-protobuf==3.19.22 # mypy-protobuf types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.8 +types-requests==2.28.9 # via feast (setup.py) 
-types-setuptools==64.0.1 +types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.22 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via diff --git a/sdk/python/requirements/py3.8-requirements.txt b/sdk/python/requirements/py3.8-requirements.txt index aea3bfb3f43..c2aef636733 100644 --- a/sdk/python/requirements/py3.8-requirements.txt +++ b/sdk/python/requirements/py3.8-requirements.txt @@ -38,7 +38,7 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -57,6 +57,8 @@ googleapis-common-protos==1.56.4 # feast (setup.py) # google-api-core # tensorflow-metadata +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -75,7 +77,7 @@ importlib-resources==5.9.0 # via jsonschema jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.12.1 +jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index efeea5d08fd..e13eee056bc 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -65,9 +65,13 @@ azure-core==1.25.0 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs + # via + # adlfs + # feast (setup.py) azure-storage-blob==12.13.1 - # via adlfs + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -144,6 +148,7 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 @@ -180,7 +185,7 @@ execnet==1.9.0 # via pytest-xdist executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -272,6 +277,8 @@ googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) 
+greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -338,7 +345,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.12.1 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ -548,6 +555,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -647,10 +658,10 @@ responses==0.21.0 # via moto rsa==4.9 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml s3fs==2022.1.0 # via feast (setup.py) s3transfer==0.5.2 @@ -667,6 +678,7 @@ six==1.16.0 # google-auth-httplib2 # grpcio # happybase + # isodate # kubernetes # mock # msrestazure @@ -753,19 +765,19 @@ types-protobuf==3.19.22 # mypy-protobuf types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.8 +types-requests==2.28.9 # via feast (setup.py) -types-setuptools==64.0.1 +types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.22 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via diff --git a/sdk/python/requirements/py3.9-requirements.txt b/sdk/python/requirements/py3.9-requirements.txt index 738ad25bd1c..0d3cb22bbca 100644 --- a/sdk/python/requirements/py3.9-requirements.txt +++ b/sdk/python/requirements/py3.9-requirements.txt @@ -38,7 +38,7 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) fastavro==1.6.0 # via @@ -57,6 +57,8 @@ googleapis-common-protos==1.56.4 # feast (setup.py) # google-api-core # tensorflow-metadata 
+greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -73,7 +75,7 @@ idna==3.3 # requests jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.12.1 +jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd diff --git a/sdk/python/tests/README.md b/sdk/python/tests/README.md index 0f56e0eee28..3212f02482c 100644 --- a/sdk/python/tests/README.md +++ b/sdk/python/tests/README.md @@ -239,7 +239,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, table_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], ) # ... more test code ``` diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index c2cf286fdc4..708d9c0a142 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -349,6 +349,7 @@ class Environment: python_feature_server: bool worker_id: str online_store_creator: Optional[OnlineStoreCreator] = None + fixture_request: Optional[pytest.FixtureRequest] = None def __post_init__(self): self.end_date = datetime.utcnow().replace(microsecond=0, second=0, minute=0) @@ -457,6 +458,7 @@ def construct_test_environment( python_feature_server=test_repo_config.python_feature_server, worker_id=worker_id, online_store_creator=online_creator, + fixture_request=fixture_request, ) return environment diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 73c5152d477..0abb290563a 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -146,7 +146,9 
@@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) assert_feature_service_correctness( @@ -170,7 +172,9 @@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, table_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -329,7 +333,9 @@ def test_historical_features_with_entities_from_query( validate_dataframes( expected_df_query, actual_df_from_sql_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) table_from_sql_entities = job_from_sql.to_arrow().to_pandas() @@ -341,7 +347,9 @@ def test_historical_features_with_entities_from_query( validate_dataframes( expected_df_query, table_from_sql_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -415,13 +423,17 @@ def test_historical_features_persisting( validate_dataframes( expected_df, saved_dataset.to_df(), - keys=[event_timestamp, "driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) validate_dataframes( job.to_df(), saved_dataset.to_df(), - 
keys=[event_timestamp, "driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -493,7 +505,9 @@ def test_historical_features_with_no_ttl( validate_dataframes( expected_df, job.to_df(), - keys=[event_timestamp, "driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -590,4 +604,8 @@ def test_historical_features_from_bigquery_sources_containing_backfills(environm print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")) assert sorted(expected_df.columns) == sorted(actual_df.columns) - validate_dataframes(expected_df, actual_df, keys=["driver_id"]) + validate_dataframes( + expected_df, + actual_df, + sort_by=["driver_id"], + ) diff --git a/sdk/python/tests/integration/registration/test_universal_cli.py b/sdk/python/tests/integration/registration/test_universal_cli.py index 1fb82ce59f1..e7f7a7cb633 100644 --- a/sdk/python/tests/integration/registration/test_universal_cli.py +++ b/sdk/python/tests/integration/registration/test_universal_cli.py @@ -26,7 +26,10 @@ def test_universal_cli(environment: Environment): try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path + project, + environment.test_repo_config, + repo_path, + environment.data_source_creator, ) repo_config = repo_path / "feature_store.yaml" @@ -120,7 +123,10 @@ def test_odfv_apply(environment) -> None: try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path + project, + environment.test_repo_config, + repo_path, + environment.data_source_creator, ) repo_config = repo_path / "feature_store.yaml" @@ -151,7 +157,10 @@ def test_nullable_online_store(test_nullable_online_store) -> None: 
try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, test_nullable_online_store, repo_path + project, + test_nullable_online_store, + repo_path, + test_nullable_online_store.offline_store_creator(project), ) repo_config = repo_path / "feature_store.yaml" diff --git a/sdk/python/tests/utils/e2e_test_validation.py b/sdk/python/tests/utils/e2e_test_validation.py index c87f8fd61fc..e2b8b14eb47 100644 --- a/sdk/python/tests/utils/e2e_test_validation.py +++ b/sdk/python/tests/utils/e2e_test_validation.py @@ -164,8 +164,12 @@ def _check_offline_and_online_features( ) -def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): - offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) +def make_feature_store_yaml( + project, + test_repo_config, + repo_dir_name: Path, + offline_creator: DataSourceCreator, +): offline_store_config = offline_creator.create_offline_store_config() online_store = test_repo_config.online_store diff --git a/sdk/python/tests/utils/feature_records.py b/sdk/python/tests/utils/feature_records.py index 9aadc031683..3f210f9e1c1 100644 --- a/sdk/python/tests/utils/feature_records.py +++ b/sdk/python/tests/utils/feature_records.py @@ -1,6 +1,7 @@ from datetime import datetime, timedelta from typing import Any, Dict, List, Optional +import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal as pd_assert_frame_equal @@ -314,7 +315,9 @@ def assert_feature_service_correctness( validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -359,7 +362,7 @@ def assert_feature_service_entity_mapping_correctness( validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[ + sort_by=[ event_timestamp, 
"order_id", "driver_id", @@ -367,6 +370,8 @@ def assert_feature_service_entity_mapping_correctness( "origin_id", "destination_id", ], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) else: # using 2 of the same FeatureView without full_feature_names=True will result in collision @@ -378,18 +383,40 @@ def assert_feature_service_entity_mapping_correctness( ) -def validate_dataframes(expected_df, actual_df, keys): - expected_df: pd.DataFrame = ( - expected_df.sort_values(by=keys).drop_duplicates().reset_index(drop=True) +# Specify timestamp_precision to relax timestamp equality constraints +def validate_dataframes( + expected_df: pd.DataFrame, + actual_df: pd.DataFrame, + sort_by: List[str], + event_timestamp_column: Optional[str] = None, + timestamp_precision: timedelta = timedelta(seconds=0), +): + expected_df = ( + expected_df.sort_values(by=sort_by).drop_duplicates().reset_index(drop=True) ) actual_df = ( actual_df[expected_df.columns] - .sort_values(by=keys) + .sort_values(by=sort_by) .drop_duplicates() .reset_index(drop=True) ) - + if event_timestamp_column: + expected_timestamp_col = expected_df[event_timestamp_column].to_frame() + actual_timestamp_col = expected_df[event_timestamp_column].to_frame() + expected_df = expected_df.drop(event_timestamp_column, axis=1) + actual_df = actual_df.drop(event_timestamp_column, axis=1) + if event_timestamp_column in sort_by: + sort_by.remove(event_timestamp_column) + + diffs = expected_timestamp_col.to_numpy() - actual_timestamp_col.to_numpy() + for diff in diffs: + if isinstance(diff, np.ndarray): + diff = diff[0] + if isinstance(diff, np.timedelta64): + assert abs(diff) <= timestamp_precision.seconds + else: + assert abs(diff) <= timestamp_precision pd_assert_frame_equal( expected_df, actual_df, diff --git a/setup.py b/setup.py index 5a770b8a6a2..37ed471cfa6 100644 --- a/setup.py +++ b/setup.py @@ -129,6 +129,16 @@ "cffi==1.15.*,<2", ] +AZURE_REQUIRED = ( + [ + 
"azure-storage-blob>=0.37.0", + "azure-identity>=1.6.1", + "SQLAlchemy>=1.4.19", + "pyodbc>=4.0.30", + "pymssql", + ] +) + CI_REQUIRED = ( [ "build", @@ -185,6 +195,7 @@ + GE_REQUIRED + HBASE_REQUIRED + CASSANDRA_REQUIRED + + AZURE_REQUIRED ) @@ -515,6 +526,7 @@ def copy_extensions_to_source(self): "spark": SPARK_REQUIRED, "trino": TRINO_REQUIRED, "postgres": POSTGRES_REQUIRED, + "azure": AZURE_REQUIRED, "mysql": MYSQL_REQUIRED, "ge": GE_REQUIRED, "hbase": HBASE_REQUIRED,