Azure Machine Learning SDK Custom Training Tutorial

Custom Training with the Azure ML SDK

Learn how to build, register, and deploy a custom training script using the Azure Machine Learning Python SDK.

Prerequisites

Azure subscription (free trial works)
Azure CLI installed
Python 3.8‑3.10
Azure ML SDK v2 (pip install azure-ai-ml azure-identity)

Setup Workspace

# Log in and select the subscription to work in.
# Keep the quotes: an unquoted <...> is parsed by the shell as redirection.
az login
az account set --subscription "<SUB_ID>"

# The `az ml` command group is provided by the ml extension (CLI v2),
# which is installed separately from the core Azure CLI.
az extension add --name ml

# Create the resource group and the workspace inside it.
az group create --name ml-rg --location eastus
az ml workspace create --name ml-ws --resource-group ml-rg

# verify connection in Python
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Build a client bound to the resource group and workspace named above,
# then fetch the workspace record to confirm the connection works.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential,
    subscription_id="<SUB_ID>",
    resource_group_name="ml-rg",
    workspace_name="ml-ws",
)
workspace = ml_client.workspaces.get("ml-ws")
print(workspace)

Write a Custom Training Script

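Create train.py in your project folder. The script expects a comma-separated, all-numeric CSV file with no header row; every column except the last is a feature, and the last column is the target value.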

import argparse
import os
import joblib
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def main():
    """Train a random-forest regressor on a CSV file and save it under ``outputs/``.

    Reads ``--data-path`` from the command line, loads the file as
    comma-separated numeric data (every column but the last is a feature,
    the last column is the target), fits the model, and writes it to
    ``outputs/model.joblib``.

    Raises:
        ValueError: If the file holds no data or has fewer than two columns,
            so no feature column remains once the target is split off.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, required=True, help="Path to training data")
    args = parser.parse_args()

    # Load data. ndmin=2 keeps a single-row (or single-column) file
    # two-dimensional; without it loadtxt returns a 1-D array and the
    # column slicing below fails with an opaque IndexError.
    data = np.loadtxt(args.data_path, delimiter=",", ndmin=2)
    if data.size == 0 or data.shape[1] < 2:
        raise ValueError(
            f"{args.data_path} must contain at least one row with one or more "
            "feature columns followed by a target column"
        )
    X, y = data[:, :-1], data[:, -1]

    # Train model (fixed seed so repeated runs give the same forest)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Save model
    os.makedirs("outputs", exist_ok=True)
    joblib.dump(model, "outputs/model.joblib")
    print("Model saved.")


if __name__ == "__main__":
    main()

Submit the Training Job

from azure.ai.ml import Input, load_job
from azure.ai.ml.entities import CommandJob

# NOTE: the compute target "cpu-cluster" must already exist in the workspace;
# nothing earlier in this tutorial creates it.
job = CommandJob(
    name="custom-train-job",
    display_name="Custom Training Demo",
    command="python train.py --data-path ${{inputs.training_data}}",
    code="./",
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:5",
    compute="cpu-cluster",
    experiment_name="custom-training",
    # Declare the CSV as a file input. A bare string is treated as a literal
    # value, so the script would receive the text "./data/train.csv" instead
    # of a path to data made available on the compute node.
    inputs={"training_data": Input(type="uri_file", path="./data/train.csv")},
    # No named outputs are declared: train.py saves a joblib file to its local
    # ./outputs folder rather than to an output path passed on the command
    # line, and that file is not in MLflow model format.
)

# Submit the job, keep the returned object (it carries the server-side name
# and status), and block until the run finishes so later steps can use its
# artifacts.
returned_job = ml_client.jobs.create_or_update(job)
ml_client.jobs.stream(returned_job.name)

Register the Model

from azure.ai.ml.entities import Model

# Register the joblib file produced by the training job. The job must have
# completed before this runs. The path points into the job's default
# artifact store, where the contents of the script's ./outputs folder land.
model = Model(
    name="rf-regressor",
    path="azureml://jobs/custom-train-job/outputs/artifacts/paths/outputs/model.joblib",
    # A plain joblib file is a custom model, not an MLflow model.
    type="custom_model",
    description="RandomForest regression model trained on custom script",
)

# Rebind to the returned object: only the registered model carries the
# workspace-assigned id and version.
model = ml_client.models.create_or_update(model)

Deploy as a Real-Time Endpoint

from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment

endpoint = ManagedOnlineEndpoint(name="rf-endpoint", auth_mode="key")
ml_client.begin_create_or_update(endpoint).result()

# Look the model up in the workspace so the deployment references the
# registered asset; a locally constructed Model object has no id.
registered_model = ml_client.models.get(name=model.name, label="latest")

# NOTE(review): a custom (non-MLflow) model normally also needs a scoring
# script supplied through `code_configuration`; this tutorial does not define
# one -- confirm and add it before relying on this deployment.
deployment = ManagedOnlineDeployment(
    name="default",
    endpoint_name=endpoint.name,
    model=registered_model.id,
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:5",
    instance_type="Standard_DS3_v2",
    instance_count=1,
)

ml_client.begin_create_or_update(deployment).result()

# A new deployment receives no traffic until some is assigned to it;
# route all requests to this one.
endpoint.traffic = {deployment.name: 100}
ml_client.begin_create_or_update(endpoint).result()

# Test the endpoint
import json
import requests

# Read the key and scoring URI from the service instead of hard-coding a
# region into the URL (the workspace above was created in eastus).
key = ml_client.online_endpoints.get_keys(name=endpoint.name).primary_key
scoring_uri = ml_client.online_endpoints.get(name=endpoint.name).scoring_uri
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
sample = {"data": [[5.1, 3.5, 1.4, 0.2]]}
response = requests.post(scoring_uri, json=sample, headers=headers, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error body.
response.raise_for_status()
print(response.json())

Summary

You have created a custom training script, run it in Azure ML, registered the resulting model, and deployed a real‑time endpoint.

Next steps »