diff --git a/.env.example b/.env.example index f6b2fe58..a3662d20 100644 --- a/.env.example +++ b/.env.example @@ -31,6 +31,9 @@ MODEL_PATH = '' EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py' REGISTER_SCRIPT_PATH = 'register/register_model.py' SOURCES_DIR_TRAIN = 'code' +DATASET_NAME = 'diabetes_ds' +DATASTORE_NAME = 'datablobstore' +DATAFILE_NAME = 'diabetes.csv' # Optional. Used by a training pipeline with R on Databricks DB_CLUSTER_ID = '' diff --git a/.pipelines/azdo-variables.yml b/.pipelines/azdo-variables.yml index fcf67c2b..0691e673 100644 --- a/.pipelines/azdo-variables.yml +++ b/.pipelines/azdo-variables.yml @@ -39,4 +39,6 @@ variables: - name: DB_CLUSTER_ID value: '' - name: SCORE_SCRIPT - value: score.py \ No newline at end of file + value: score.py +- name: DATASET_NAME + value: diabetes_ds diff --git a/code/training/train.py b/code/training/train.py index a04972dd..f56daa99 100644 --- a/code/training/train.py +++ b/code/training/train.py @@ -24,6 +24,7 @@ POSSIBILITY OF SUCH DAMAGE. 
""" from azureml.core.run import Run +from azureml.core import Dataset import os import argparse from sklearn.datasets import load_diabetes @@ -69,19 +70,34 @@ def main(): "must be a positive float.") ) + parser.add_argument( + "--dataset_name", + type=str, + help=("Dataset with the training data") + ) args = parser.parse_args() print("Argument [build_id]: %s" % args.build_id) print("Argument [model_name]: %s" % args.model_name) print("Argument [alpha]: %s" % args.alpha) + print("Argument [dataset_name]: %s" % args.dataset_name) model_name = args.model_name build_id = args.build_id alpha = args.alpha + dataset_name = args.dataset_name run = Run.get_context() + ws = run.experiment.workspace + + if (dataset_name): + dataset = Dataset.get_by_name(workspace=ws, name=dataset_name) + df = dataset.to_pandas_dataframe() + X = df.drop('Y', axis=1).values # exclude target column from features + y = df['Y'].values + else: + X, y = load_diabetes(return_X_y=True) - X, y = load_diabetes(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) data = {"train": {"X": X_train, "y": y_train}, diff --git a/docs/getting_started.md b/docs/getting_started.md index 6d03e5b8..a46d5304 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -122,6 +122,10 @@ Check out the newly created resources in the [Azure Portal](portal.azure.com): (Optional) To remove the resources created for this project you can use the [/environment_setup/iac-remove-environment.yml](../environment_setup/iac-remove-environment.yml) definition or you can just delete the resource group in the [Azure Portal](portal.azure.com). +**Note:** The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. 
If you want to use your own dataset, you need to [create and register a datastore](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data#azure-machine-learning-studio) in your ML workspace and upload the data file (e.g. [diabetes.csv](./data/diabetes.csv)) to the corresponding blob container. You can also define a datastore in the ML Workspace with the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/ext/azure-cli-ml/ml/datastore?view=azure-cli-latest#ext-azure-cli-ml-az-ml-datastore-attach-blob). +You'll also need to configure the `DATASTORE_NAME` and `DATAFILE_NAME` variables in the ***devopsforai-aml-vg*** variable group. + + ## Create an Azure DevOps Azure ML Workspace Service Connection Install the **Azure Machine Learning** extension to your organization from the [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml), diff --git a/ml_service/pipelines/build_train_pipeline.py b/ml_service/pipelines/build_train_pipeline.py index 2b41f12c..0b86eb50 100644 --- a/ml_service/pipelines/build_train_pipeline.py +++ b/ml_service/pipelines/build_train_pipeline.py @@ -3,6 +3,7 @@ from azureml.pipeline.core import Pipeline from azureml.core import Workspace from azureml.core.runconfig import RunConfiguration, CondaDependencies +from azureml.core import Dataset, Datastore import os import sys sys.path.append(os.path.abspath("./ml_service/util")) # NOQA: E402 @@ -35,10 +36,10 @@ def main(): 'scikit-learn', 'tensorflow', 'keras'], pip_packages=['azure', 'azureml-core', 'azure-storage', - 'azure-storage-blob']) + 'azure-storage-blob', + 'azureml-dataprep']) ) run_config.environment.docker.enabled = True - config_envvar = {} if (e.collection_uri is not None and e.teamproject_name is not None): builduri_base = e.collection_uri + e.teamproject_name @@ -53,6 +54,17 @@ def main(): hyperparameter_alpha_param = PipelineParameter( name="hyperparameter_alpha", default_value=0.5) + dataset_name = "" + if (e.datastore_name is not 
None and e.datafile_name is not None): + dataset_name = e.dataset_name + datastore = Datastore.get(aml_workspace, e.datastore_name) + data_path = [(datastore, e.datafile_name)] + dataset = Dataset.Tabular.from_delimited_files(path=data_path) + dataset.register(workspace=aml_workspace, + name=e.dataset_name, + description="dataset with training data", + create_new_version=True) + train_step = PythonScriptStep( name="Train Model", script_name=e.train_script_path, @@ -62,6 +74,7 @@ def main(): "--build_id", build_id_param, "--model_name", model_name_param, "--alpha", hyperparameter_alpha_param, + "--dataset_name", dataset_name, ], runconfig=run_config, allow_reuse=False, diff --git a/ml_service/util/env_variables.py b/ml_service/util/env_variables.py index ec13ac95..ed3be221 100644 --- a/ml_service/util/env_variables.py +++ b/ml_service/util/env_variables.py @@ -41,6 +41,9 @@ def __init__(self): self._score_script = os.environ.get("SCORE_SCRIPT") self._collection_uri = os.environ.get("SYSTEM_COLLECTIONURI") self._teamproject_name = os.environ.get("SYSTEM_TEAMPROJECT") + self._datastore_name = os.environ.get("DATASTORE_NAME") + self._datafile_name = os.environ.get("DATAFILE_NAME") + self._dataset_name = os.environ.get("DATASET_NAME") @property def workspace_name(self): @@ -145,3 +148,15 @@ def collection_uri(self): @property def teamproject_name(self): return self._teamproject_name + + @property + def datastore_name(self): + return self._datastore_name + + @property + def datafile_name(self): + return self._datafile_name + + @property + def dataset_name(self): + return self._dataset_name