Effective preprocessing is a cornerstone of any machine learning pipeline. Scikit-learn provides a rich set of utilities for transforming raw data into a format suitable for modeling.
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the raw table; the steps below expect 'age' and 'salary' columns.
df = pd.read_csv('data.csv')

# Standardize the two numeric columns with StandardScaler's default settings.
# Fitting and transforming are done as separate steps on the same selection.
numeric_features = df[['age', 'salary']]
scaler = StandardScaler()
scaler.fit(numeric_features)
scaled = scaler.transform(numeric_features)

# Keep the original columns and store the standardized values next to them.
df[['age_scaled', 'salary_scaled']] = scaled

# Quick visual check of the first rows.
print(df.head())
from sklearn.preprocessing import MinMaxScaler

# Rescale 'age' and 'salary' with MinMaxScaler's default settings and store
# the result in new '*_norm' columns, leaving the source columns untouched.
columns_to_normalize = df[['age', 'salary']]
mm = MinMaxScaler().fit(columns_to_normalize)  # fit() returns the scaler itself
df[['age_norm', 'salary_norm']] = mm.transform(columns_to_normalize)
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical 'city' column into dense indicator columns.
# Categories not seen during fit are ignored at transform time rather than
# raising (handle_unknown='ignore').
try:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4, so prefer the new spelling.
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept the original keyword name.
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

categorical = df[['city']]
encoded = enc.fit_transform(categorical)

# Reuse df's index so the indicator rows line up with their source rows even
# when df does not carry a default RangeIndex (e.g. after filtering);
# otherwise concat(axis=1) would align on mismatched labels and insert NaNs.
encoded_df = pd.DataFrame(
    encoded,
    columns=enc.get_feature_names_out(['city']),
    index=df.index,
)

# Append the indicator columns and drop the original categorical column.
df = pd.concat([df, encoded_df], axis=1).drop('city', axis=1)
from sklearn.impute import SimpleImputer

# Fill missing values in 'age' and 'salary' with each column's median,
# overwriting the original columns in place.
# NOTE(review): the '*_scaled' and '*_norm' columns derived earlier in this
# script were computed before this step and are not recomputed here.
imp = SimpleImputer(strategy='median')
columns_with_gaps = df[['age', 'salary']]
imp.fit(columns_with_gaps)
df[['age', 'salary']] = imp.transform(columns_with_gaps)