Custom Training with the Azure ML SDK
Learn how to build, register, and deploy a custom training script using the Azure Machine Learning Python SDK.
Prerequisites
- Azure subscription (free trial works)
- Azure CLI installed
- Python 3.8‑3.10
- Azure ML SDK v2 (`pip install azure-ai-ml azure-identity`)
Setup Workspace
# login & set subscription
az login
az account set --subscription <SUB_ID>
# the `az ml` command group is provided by the `ml` CLI extension, not the core CLI
az extension add --name ml
# create resource group & workspace
az group create --name ml-rg --location eastus
az ml workspace create --name ml-ws --resource-group ml-rg
# verify connection in Python
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Build a client scoped to the subscription, resource group and workspace
# named in the CLI commands.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id="<SUB_ID>",
    resource_group_name="ml-rg",
    workspace_name="ml-ws",
)

# Fetching the workspace proves that both the credentials and the names resolve.
workspace = ml_client.workspaces.get("ml-ws")
print(workspace)
Write a Custom Training Script
Create train.py in your project folder.
import argparse
import os
import joblib
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
def main():
    """Train a random-forest regressor on a CSV file and save it to outputs/.

    Parses the required ``--data-path`` command-line argument, loads a
    comma-delimited numeric file from it, uses the last column as the target
    and all preceding columns as features, fits the model, and writes it to
    ``outputs/model.joblib``.

    Raises:
        ValueError: If the data file has fewer than two columns, i.e. there is
            no feature column left once the target column is split off.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, required=True, help="Path to training data")
    args = parser.parse_args()

    # Load data. ndmin=2 keeps a single-row or single-column file
    # two-dimensional, so the column slicing below never hits a 1-D array.
    data = np.loadtxt(args.data_path, delimiter=",", ndmin=2)
    if data.shape[1] < 2:
        raise ValueError(
            f"{args.data_path} must contain at least one feature column and a "
            f"target column; found {data.shape[1]} column(s)"
        )
    X, y = data[:, :-1], data[:, -1]

    # Train model (fixed seed so repeated runs give the same forest)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Save model
    os.makedirs("outputs", exist_ok=True)
    joblib.dump(model, "outputs/model.joblib")
    print("Model saved.")


if __name__ == "__main__":
    main()
Submit the Training Job
from azure.ai.ml import Input, command

# Describe the run: upload the project folder as the code snapshot, hand the
# CSV to the script as a typed file input, and execute train.py on the cluster.
# No output is declared because train.py writes to ./outputs, which the job
# uploads with its run artifacts; a declared output would stay empty unless
# the script wrote to ${{outputs.<name>}}.
job = command(
    name="custom-train-job",
    display_name="Custom Training Demo",
    command="python train.py --data-path ${{inputs.training_data}}",
    code="./",
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:5",
    compute="cpu-cluster",
    experiment_name="custom-training",
    inputs={"training_data": Input(type="uri_file", path="./data/train.csv")},
)

# create_or_update only submits the job; stream() blocks until it finishes,
# so the model file exists before the registration step runs.
returned_job = ml_client.jobs.create_or_update(job)
ml_client.jobs.stream(returned_job.name)
Register the Model
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

model = Model(
    name="rf-regressor",
    # Points at the file train.py saved (outputs/model.joblib) inside the
    # artifacts of the job named "custom-train-job".
    path="azureml://jobs/custom-train-job/outputs/artifacts/paths/outputs/model.joblib",
    # A plain joblib file is a custom model, not an MLflow model.
    type=AssetTypes.CUSTOM_MODEL,
    description="RandomForest regression model trained on custom script",
)

# Keep the returned entity: only the registered copy carries the id and
# version assigned by the workspace.
model = ml_client.models.create_or_update(model)
Deploy as Real‑Time Endpoint
from azure.ai.ml.entities import (
    CodeConfiguration,
    ManagedOnlineDeployment,
    ManagedOnlineEndpoint,
)

endpoint = ManagedOnlineEndpoint(name="rf-endpoint", auth_mode="key")
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Resolve the model from the workspace registry: a locally constructed Model
# object has no id until it has been registered.
registered_model = ml_client.models.get(name=model.name, label="latest")

deployment = ManagedOnlineDeployment(
    name="default",
    endpoint_name=endpoint.name,
    model=registered_model.id,
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:5",
    # A custom (non-MLflow) model needs a scoring script. score.py is not
    # shown in this guide; it must define init() and run(raw_data) and live
    # in the project folder.
    code_configuration=CodeConfiguration(code="./", scoring_script="score.py"),
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
ml_client.online_deployments.begin_create_or_update(deployment).result()

# A new deployment receives no traffic until it is routed explicitly.
endpoint.traffic = {deployment.name: 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
# Test the endpoint
import requests

# Read the scoring URI and key back from the service instead of assembling the
# URL by hand: the host name depends on the region the endpoint was created in.
scoring_uri = ml_client.online_endpoints.get(name=endpoint.name).scoring_uri
key = ml_client.online_endpoints.get_keys(name=endpoint.name).primary_key

headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
sample = {"data": [[5.1, 3.5, 1.4, 0.2]]}
response = requests.post(scoring_uri, json=sample, headers=headers, timeout=30)
# Surface HTTP errors (bad key, failing deployment) instead of trying to parse
# an error page as JSON.
response.raise_for_status()
print(response.json())
Summary
You have created a custom training script, run it in Azure ML, registered the resulting model, and deployed a real‑time endpoint.
Next steps »