Welcome to Machine Learning Housing Corp.! The goal of this project is to predict the median house value in each Californian district, given a number of features from these districts. Along the way, we will explore different regression techniques and evaluate their performance on this dataset.
The dataset used for this project is the California Housing dataset, which contains information collected during the 1990 California census. Each row describes one district, with features such as the district's longitude and latitude, the housing median age, the total numbers of rooms and bedrooms, the population, the number of households, the median income, the median house value (the target), and a categorical `ocean_proximity` attribute.
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))
housing = load_housing_data()
housing.head()
|  | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
housing["ocean_proximity"].value_counts()
ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64
housing.describe()
|  | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
import matplotlib.pyplot as plt
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
housing.hist(bins=50, figsize=(12, 8))
plt.show()
import numpy as np
def shuffle_and_split_data(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = shuffle_and_split_data(housing, 0.2)
len(train_set)
16512
len(test_set)
4128
To ensure that the outputs remain the same every time this notebook is run, we need to set the random seed:
np.random.seed(42)
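For example, re-seeding before each call reproduces exactly the same split, since shuffle_and_split_data() draws from NumPy's global random state (a quick sanity check; the variable names below are illustrative):
np.random.seed(42)
train_a, test_a = shuffle_and_split_data(housing, 0.2)
np.random.seed(42)
train_b, test_b = shuffle_and_split_data(housing, 0.2)
assert train_a.index.equals(train_b.index)  # identical splits
np.random.seed(42)  # leave the global state as the cell above set it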
from zlib import crc32

def is_id_in_test_set(identifier, test_ratio):
    return crc32(np.int64(identifier)) < test_ratio * 2**32

def split_data_with_id_hash(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: is_id_in_test_set(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
housing_with_id = housing.reset_index() # adds an `index` column
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "index")
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "id")
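The key property of this hash-based approach is that each instance's fate depends only on its own id, so instances assigned to the test set stay there even when the dataset grows. A quick sketch to check this (variable names are illustrative):
subset = housing_with_id.iloc[:1000]  # pretend we only had 1,000 districts
_, test_small = split_data_with_id_hash(subset, 0.2, "id")
_, test_full = split_data_with_id_hash(housing_with_id, 0.2, "id")
assert set(test_small["id"]).issubset(set(test_full["id"]))  # stable split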
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
test_set["total_bedrooms"].isnull().sum()
44
housing["income_cat"] = pd.cut(housing["median_income"],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
plt.show()
housing["income_cat"].value_counts() / len(housing["income_cat"])
income_cat
3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: count, dtype: float64
from sklearn.model_selection import StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
strat_splits = []
for train_index, test_index in splitter.split(housing, housing["income_cat"]):
    strat_train_set_n = housing.iloc[train_index]
    strat_test_set_n = housing.iloc[test_index]
    strat_splits.append([strat_train_set_n, strat_test_set_n])
strat_train_set, strat_test_set = strat_splits[0]
It's much shorter to get a single stratified split:
strat_train_set, strat_test_set = train_test_split(
housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64
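To see how much stratification helps, here is a quick sketch comparing the income-category proportions of the full dataset, of this stratified test set, and of a purely random test set (the props DataFrame and column names are illustrative):
overall = housing["income_cat"].value_counts(normalize=True).sort_index()
_, rnd_test = train_test_split(housing, test_size=0.2, random_state=42)
props = pd.DataFrame({
    "Overall": overall,
    "Stratified": strat_test_set["income_cat"].value_counts(normalize=True).sort_index(),
    "Random": rnd_test["income_cat"].value_counts(normalize=True).sort_index(),
})
# relative sampling error of each split, in percent
props["Strat. error (%)"] = (props["Stratified"] / props["Overall"] - 1) * 100
props["Rand. error (%)"] = (props["Random"] / props["Overall"] - 1) * 100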
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True)
plt.show()
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
s=housing["population"] / 100, label="population",
c="median_house_value", cmap="jet", colorbar=True,
legend=True, sharex=False, figsize=(10, 7))
plt.show()
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.688380
total_rooms           0.137455
housing_median_age    0.102175
households            0.071426
total_bedrooms        0.054635
population           -0.020153
longitude            -0.050859
latitude             -0.139584
Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1, grid=True)
plt.show()
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.688380
rooms_per_house       0.143663
total_rooms           0.137455
housing_median_age    0.102175
households            0.071426
total_bedrooms        0.054635
population           -0.020153
people_per_house     -0.038224
longitude            -0.050859
latitude             -0.139584
bedrooms_ratio       -0.256397
Name: median_house_value, dtype: float64
`strat_train_set.drop()` creates a copy of `strat_train_set` without the column; it doesn't actually modify `strat_train_set` itself, unless you pass `inplace=True`:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
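A quick check that drop() really returned a copy: the label column is still in strat_train_set, and absent from the new housing DataFrame:
assert "median_house_value" in strat_train_set.columns
assert "median_house_value" not in housing.columns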
null_rows_idx = housing.isnull().any(axis=1)
housing.loc[null_rows_idx].head()
|  | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|
| 14452 | -120.67 | 40.50 | 15.0 | 5343.0 | NaN | 2503.0 | 902.0 | 3.5962 | INLAND |
| 18217 | -117.96 | 34.03 | 35.0 | 2093.0 | NaN | 1755.0 | 403.0 | 3.4115 | <1H OCEAN |
| 11889 | -118.05 | 34.04 | 33.0 | 1348.0 | NaN | 1098.0 | 257.0 | 4.2917 | <1H OCEAN |
| 20325 | -118.88 | 34.17 | 15.0 | 4260.0 | NaN | 1701.0 | 669.0 | 5.1033 | <1H OCEAN |
| 14360 | -117.87 | 33.62 | 8.0 | 1266.0 | NaN | 375.0 | 183.0 | 9.8020 | <1H OCEAN |
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
Separating out the numerical attributes to use the `"median"` strategy (as it cannot be calculated on text attributes like `ocean_proximity`):
housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
SimpleImputer(strategy='median')
imputer.statistics_
array([-118.51 , 34.26 , 29. , 2125. , 434. , 1167. , 408. , 3.5385])
Check that this is the same as manually computing the median of each attribute:
housing_num.median().values
array([-118.51 , 34.26 , 29. , 2125. , 434. , 1167. , 408. , 3.5385])
Transform the training set:
X = imputer.transform(housing_num)
imputer.feature_names_in_
array(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'], dtype=object)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
index=housing_num.index)
housing_tr.loc[null_rows_idx].head()
|  | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income |
|---|---|---|---|---|---|---|---|---|
| 14452 | -120.67 | 40.50 | 15.0 | 5343.0 | 434.0 | 2503.0 | 902.0 | 3.5962 |
| 18217 | -117.96 | 34.03 | 35.0 | 2093.0 | 434.0 | 1755.0 | 403.0 | 3.4115 |
| 11889 | -118.05 | 34.04 | 33.0 | 1348.0 | 434.0 | 1098.0 | 257.0 | 4.2917 |
| 20325 | -118.88 | 34.17 | 15.0 | 4260.0 | 434.0 | 1701.0 | 669.0 | 5.1033 |
| 14360 | -117.87 | 33.62 | 8.0 | 1266.0 | 434.0 | 375.0 | 183.0 | 9.8020 |
imputer.strategy
'median'
Now let's preprocess the categorical input feature, `ocean_proximity`:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(8)
|  | ocean_proximity |
|---|---|
| 13096 | NEAR BAY |
| 14973 | <1H OCEAN |
| 3785 | INLAND |
| 14689 | INLAND |
| 20507 | NEAR OCEAN |
| 1286 | INLAND |
| 18078 | <1H OCEAN |
| 4396 | NEAR BAY |
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:8]
array([[3.],
       [0.],
       [1.],
       [1.],
       [4.],
       [1.],
       [0.],
       [3.]])
ordinal_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>' with 16512 stored elements in Compressed Sparse Row format>
By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:
housing_cat_1hot.toarray()
array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])
Alternatively, you can set `sparse_output=False` when creating the `OneHotEncoder`:
cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])
cat_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
cat_encoder.feature_names_in_
array(['ocean_proximity'], dtype=object)
cat_encoder.get_feature_names_out()
array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'], dtype=object)
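These generated names make it easy to wrap the encoded output back into a DataFrame, for example (a quick sketch; the df_cat_1hot name is illustrative):
df_cat_1hot = pd.DataFrame(housing_cat_1hot,  # dense array from above
                           columns=cat_encoder.get_feature_names_out(),
                           index=housing_cat.index)
df_cat_1hot.head(2)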
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(housing_num)
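As a quick sanity check, each standardized column should now have mean ≈ 0 and standard deviation ≈ 1. We use nan-aware statistics here, since the scalers ignore NaNs during fit and preserve them in the output:
print(np.nanmean(housing_num_std_scaled, axis=0).round(2))  # all ~0.0
print(np.nanstd(housing_num_std_scaled, axis=0).round(2))   # all ~1.0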
fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)
housing["population"].hist(ax=axs[0], bins=50)
housing["population"].apply(np.log).hist(ax=axs[1], bins=50)
axs[0].set_xlabel("Population")
axs[1].set_xlabel("Log of population")
axs[0].set_ylabel("Number of districts")
plt.show()
What if we replace each value with its percentile?
# just shows that we get a uniform distribution
percentiles = [np.percentile(housing["median_income"], p)
               for p in range(1, 100)]
flattened_median_income = pd.cut(housing["median_income"],
                                 bins=[-np.inf] + percentiles + [np.inf],
                                 labels=range(1, 100 + 1))
flattened_median_income.hist(bins=50)
plt.xlabel("Median income percentile")
plt.ylabel("Number of districts")
plt.show()
# Note: incomes below the 1st percentile are labeled 1, and incomes above the
# 99th percentile are labeled 100. This is why the distribution below ranges
# from 1 to 100 (not 0 to 100).
from sklearn.metrics.pairwise import rbf_kernel
age_simil_35 = rbf_kernel(housing[["housing_median_age"]], [[35]], gamma=0.1)
# this cell generates Figure 2–18
ages = np.linspace(housing["housing_median_age"].min(),
                   housing["housing_median_age"].max(),
                   500).reshape(-1, 1)
gamma1 = 0.1
gamma2 = 0.03
rbf1 = rbf_kernel(ages, [[35]], gamma=gamma1)
rbf2 = rbf_kernel(ages, [[35]], gamma=gamma2)
fig, ax1 = plt.subplots()
ax1.set_xlabel("Housing median age")
ax1.set_ylabel("Number of districts")
ax1.hist(housing["housing_median_age"], bins=50)
ax2 = ax1.twinx() # create a twin axis that shares the same x-axis
color = "blue"
ax2.plot(ages, rbf1, color=color, label="gamma = 0.10")
ax2.plot(ages, rbf2, color=color, label="gamma = 0.03", linestyle="--")
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylabel("Age similarity", color=color)
plt.legend(loc="upper left")
plt.show()
To create simple transformers:
from sklearn.preprocessing import FunctionTransformer
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(housing[["population"]])
rbf_transformer = FunctionTransformer(rbf_kernel,
kw_args=dict(Y=[[35.]], gamma=0.1))
age_simil_35 = rbf_transformer.transform(housing[["housing_median_age"]])
age_simil_35
array([[2.81118530e-13],
       [8.20849986e-02],
       [6.70320046e-01],
       ...,
       [9.55316054e-22],
       [6.70320046e-01],
       [3.03539138e-04]])
sf_coords = 37.7749, -122.41
sf_transformer = FunctionTransformer(rbf_kernel,
kw_args=dict(Y=[sf_coords], gamma=0.1))
sf_simil = sf_transformer.transform(housing[["latitude", "longitude"]])
sf_simil
array([[0.999927  ],
       [0.05258419],
       [0.94864161],
       ...,
       [0.00388525],
       [0.05038518],
       [0.99868067]])
ratio_transformer = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])
ratio_transformer.transform(np.array([[1., 2.], [3., 4.]]))
array([[0.5 ],
       [0.75]])
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):  # no *args or **kwargs!
        self.with_mean = with_mean

    def fit(self, X, y=None):  # y is required even though we don't use it
        X = check_array(X)  # checks that X is an array with finite float values
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()
        return self  # always return self!

    def transform(self, X):
        check_is_fitted(self)  # looks for learned attributes (with trailing _)
        X = check_array(X)
        assert self.n_features_in_ == X.shape[1]
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_
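A quick sketch (with illustrative variable names) to validate this custom transformer against Scikit-Learn's own StandardScaler, using the imputed housing_tr data since check_array() rejects NaNs:
clone_scaler = StandardScalerClone()
X_clone = clone_scaler.fit_transform(housing_tr)
X_sklearn = StandardScaler().fit_transform(housing_tr)
assert np.allclose(X_clone, X_sklearn)  # same means and scales as sklearn's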
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
similarities = cluster_simil.fit_transform(housing[["latitude", "longitude"]],
sample_weight=housing_labels)
similarities[:3].round(2)
array([[0.08, 0.  , 0.6 , 0.  , 0.  , 0.99, 0.  , 0.  , 0.  , 0.14],
       [0.  , 0.99, 0.  , 0.04, 0.  , 0.  , 0.11, 0.  , 0.63, 0.  ],
       [0.44, 0.  , 0.3 , 0.  , 0.  , 0.7 , 0.  , 0.01, 0.  , 0.29]])
housing_renamed = housing.rename(columns={
    "latitude": "Latitude", "longitude": "Longitude",
    "population": "Population",
    "median_house_value": "Median house value (ᴜsᴅ)"})
housing_renamed["Max cluster similarity"] = similarities.max(axis=1)

housing_renamed.plot(kind="scatter", x="Longitude", y="Latitude", grid=True,
                     s=housing_renamed["Population"] / 100, label="Population",
                     c="Max cluster similarity",
                     cmap="jet", colorbar=True,
                     legend=True, sharex=False, figsize=(10, 7))
plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],
         cluster_simil.kmeans_.cluster_centers_[:, 0],
         linestyle="", color="black", marker="X", markersize=20,
         label="Cluster centers")
plt.legend(loc="upper right")
plt.show()
Now let's build a pipeline to preprocess the numerical attributes:
from sklearn.pipeline import Pipeline
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])
from sklearn.pipeline import make_pipeline
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
from sklearn import set_config
set_config(display='diagram')
num_pipeline
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())])
housing_num_prepared = num_pipeline.fit_transform(housing_num)
housing_num_prepared[:2].round(2)
array([[-1.42, 1.01, 1.86, 0.31, 1.37, 0.14, 1.39, -0.94], [ 0.6 , -0.7 , 0.91, -0.31, -0.44, -0.69, -0.37, 1.17]])
def monkey_patch_get_signature_names_out():
    """Monkey patch some classes which did not handle get_feature_names_out()
       correctly in Scikit-Learn 1.0.*."""
    from inspect import Signature, signature, Parameter
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline, Pipeline
    from sklearn.preprocessing import FunctionTransformer, StandardScaler

    default_get_feature_names_out = StandardScaler.get_feature_names_out

    if not hasattr(SimpleImputer, "get_feature_names_out"):
        print("Monkey-patching SimpleImputer.get_feature_names_out()")
        SimpleImputer.get_feature_names_out = default_get_feature_names_out

    if not hasattr(FunctionTransformer, "get_feature_names_out"):
        print("Monkey-patching FunctionTransformer.get_feature_names_out()")
        orig_init = FunctionTransformer.__init__
        orig_sig = signature(orig_init)

        def __init__(*args, feature_names_out=None, **kwargs):
            orig_sig.bind(*args, **kwargs)
            orig_init(*args, **kwargs)
            args[0].feature_names_out = feature_names_out

        __init__.__signature__ = Signature(
            list(signature(orig_init).parameters.values()) + [
                Parameter("feature_names_out", Parameter.KEYWORD_ONLY)])

        def get_feature_names_out(self, names=None):
            if callable(self.feature_names_out):
                return self.feature_names_out(self, names)
            assert self.feature_names_out == "one-to-one"
            return default_get_feature_names_out(self, names)

        FunctionTransformer.__init__ = __init__
        FunctionTransformer.get_feature_names_out = get_feature_names_out

monkey_patch_get_signature_names_out()
df_housing_num_prepared = pd.DataFrame(
    housing_num_prepared, columns=num_pipeline.get_feature_names_out(),
    index=housing_num.index)
df_housing_num_prepared.head(2)
|  | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income |
|---|---|---|---|---|---|---|---|---|
| 13096 | -1.423037 | 1.013606 | 1.861119 | 0.311912 | 1.368167 | 0.137460 | 1.394812 | -0.936491 |
| 14973 | 0.596394 | -0.702103 | 0.907630 | -0.308620 | -0.435925 | -0.693771 | -0.373485 | 1.171942 |
num_pipeline.steps
[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]
num_pipeline[1]
StandardScaler()
num_pipeline[:-1]
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median'))])
num_pipeline.named_steps["simpleimputer"]
SimpleImputer(strategy='median')
num_pipeline.set_params(simpleimputer__strategy="median")
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())])
from sklearn.compose import ColumnTransformer
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]
cat_attribs = ["ocean_proximity"]

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])
from sklearn.compose import make_column_selector, make_column_transformer
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)
housing_prepared = preprocessing.fit_transform(housing)
# shows that we can get a DataFrame out if we want
housing_prepared_fr = pd.DataFrame(
    housing_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=housing.index)
housing_prepared_fr.head(2)
|  | pipeline-1__longitude | pipeline-1__latitude | pipeline-1__housing_median_age | pipeline-1__total_rooms | pipeline-1__total_bedrooms | pipeline-1__population | pipeline-1__households | pipeline-1__median_income | pipeline-2__ocean_proximity_<1H OCEAN | pipeline-2__ocean_proximity_INLAND | pipeline-2__ocean_proximity_ISLAND | pipeline-2__ocean_proximity_NEAR BAY | pipeline-2__ocean_proximity_NEAR OCEAN |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13096 | -1.423037 | 1.013606 | 1.861119 | 0.311912 | 1.368167 | 0.137460 | 1.394812 | -0.936491 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 14973 | 0.596394 | -0.702103 | 0.907630 | -0.308620 | -0.435925 | -0.693771 | -0.373485 | 1.171942 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape
(16512, 24)
preprocessing.get_feature_names_out()
array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)
from sklearn.linear_model import LinearRegression
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)
Pipeline(steps=[('columntransformer', ColumnTransformer(...)), ('linearregression', LinearRegression())])
Let's try the full preprocessing pipeline on a few training instances:
housing_predictions = lin_reg.predict(housing)
housing_predictions[:5].round(-2) # -2 = rounded to the nearest hundred
array([242800., 375900., 127500., 99400., 324600.])
Compare against the actual values:
housing_labels.iloc[:5].values
array([458300., 483800., 101700., 96100., 361800.])
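The first prediction is off by nearly 50%. Here is a quick sketch expressing each of these errors as a ratio of the true value (the error_ratios name is illustrative):
error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1
print(", ".join([f"{100 * ratio:.1f}%" for ratio in error_ratios]))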
from sklearn.metrics import mean_squared_error
lin_rmse = mean_squared_error(housing_labels, housing_predictions,
squared=False)
lin_rmse
68647.95686706704
from sklearn.tree import DecisionTreeRegressor
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(housing, housing_labels)
Pipeline(steps=[('columntransformer', ColumnTransformer(...)), ('decisiontreeregressor', DecisionTreeRegressor(random_state=42))])
housing_predictions = tree_reg.predict(housing)
tree_rmse = mean_squared_error(housing_labels, housing_predictions,
squared=False)
tree_rmse
0.0
from sklearn.model_selection import cross_val_score
tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,
scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rmses).describe()
count       10.000000
mean     67153.318273
std       1963.580924
min      63925.253106
25%      66083.277180
50%      66795.829871
75%      68074.018403
max      70664.635833
dtype: float64
# computes the error stats for the linear model
lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,
scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()
count       10.000000
mean     69847.923224
std       4078.407329
min      65659.761079
25%      68088.799156
50%      68697.591463
75%      69800.966364
max      80685.254832
dtype: float64
from sklearn.ensemble import RandomForestRegressor
forest_reg = make_pipeline(preprocessing,
RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,
scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()
Let's compare this RMSE measured using cross-validation (the "validation error") with the RMSE measured on the training set (the "training error"):
forest_reg.fit(housing, housing_labels)
housing_predictions = forest_reg.predict(housing)
forest_rmse = mean_squared_error(housing_labels, housing_predictions,
squared=False)
forest_rmse
The training error is much lower than the validation error, which usually means that the model has overfit the training set. Another possible explanation would be a mismatch between the training data and the validation data, but that's not the case here, since both came from the same dataset that we shuffled and split in two.
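One common response to overfitting is to regularize the model, e.g., by constraining the trees. A rough sketch (these hyperparameter values are illustrative, not tuned):
constrained_forest = make_pipeline(
    preprocessing,
    RandomForestRegressor(max_depth=10, min_samples_leaf=4, random_state=42))
constrained_rmses = -cross_val_score(constrained_forest, housing, housing_labels,
                                     scoring="neg_root_mean_squared_error", cv=3)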
from sklearn.model_selection import GridSearchCV
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])
param_grid = [
    {'preprocessing__geo__n_clusters': [5, 8, 10],
     'random_forest__max_features': [4, 6, 8]},
    {'preprocessing__geo__n_clusters': [10, 15],
     'random_forest__max_features': [6, 8, 10]},
]
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(housing, housing_labels)
GridSearchCV(cv=3, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(...)), ('random_forest', RandomForestRegressor(random_state=42))]), param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10], 'random_forest__max_features': [4, 6, 8]}, {'preprocessing__geo__n_clusters': [10, 15], 'random_forest__max_features': [6, 8, 10]}], scoring='neg_root_mean_squared_error')
You can get the full list of hyperparameters available for tuning by looking at `full_pipeline.get_params().keys()`:
print(str(full_pipeline.get_params().keys())[:1000] + "...")
dict_keys(['memory', 'steps', 'verbose', 'preprocessing', 'random_forest', 'preprocessing__n_jobs', 'preprocessing__remainder__memory', 'preprocessing__remainder__steps', 'preprocessing__remainder__verbose', 'preprocessing__remainder__simpleimputer', 'preprocessing__remainder__standardscaler', 'preprocessing__remainder__simpleimputer__add_indicator', 'preprocessing__remainder__simpleimputer__copy', 'preprocessing__remainder__simpleimputer__fill_value', 'preprocessing__remainder__simpleimputer__keep_empty_features', 'preprocessing__remainder__simpleimputer__missing_values', 'preprocessing__remainder__simpleimputer__strategy', 'preprocessing__remainder__simpleimputer__verbose', 'preprocessing__remainder__standardscaler__copy', 'preprocessing__remainder__standardscaler__with_mean', 'preprocessing__remainder__standardscaler__with_std', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose'...
The best hyperparameter combination found:
grid_search.best_params_
{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}
grid_search.best_estimator_
Pipeline(steps=[('preprocessing', ColumnTransformer(...)), ('random_forest', RandomForestRegressor(max_features=6, random_state=42))])
Let's look at the score of each hyperparameter combination tested during the grid search:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
# these few lines of code just make the DataFrame look nicer
cv_res = cv_res[["param_preprocessing__geo__n_clusters",
"param_random_forest__max_features", "split0_test_score",
"split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
cv_res.columns = ["n_clusters", "max_features"] + score_cols
cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)
cv_res.head()
|  | n_clusters | max_features | split0 | split1 | split2 | mean_test_rmse |
|---|---|---|---|---|---|---|
| 12 | 15 | 6 | 43460 | 43919 | 44748 | 44042 |
| 13 | 15 | 8 | 44132 | 44075 | 45010 | 44406 |
| 14 | 15 | 10 | 44374 | 44286 | 45316 | 44659 |
| 7 | 10 | 6 | 44683 | 44655 | 45657 | 44999 |
| 9 | 10 | 6 | 44683 | 44655 | 45657 | 44999 |
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
Try 10 random combinations of hyperparameters; with 3-fold cross-validation, that means 30 (`n_iter` × `cv`) training rounds in total:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),
                  'random_forest__max_features': randint(low=2, high=20)}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
    scoring='neg_root_mean_squared_error', random_state=42)
rnd_search.fit(housing, housing_labels)
RandomizedSearchCV(cv=3, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(...)), ('random_forest', RandomForestRegressor(random_state=42))]), param_distributions={...}, random_state=42, scoring='neg_root_mean_squared_error')
final_model = rnd_search.best_estimator_ # includes preprocessing
feature_importances = final_model["random_forest"].feature_importances_
feature_importances.round(2)
array([0.07, 0.05, 0.05, 0.01, 0.01, 0.01, 0.01, 0.19, 0.04, 0.01, 0.  ,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.  , 0.01, 0.01, 0.01, 0.  , 0.01,
       0.01, 0.01, 0.01, 0.01, 0.  , 0.  , 0.02, 0.01, 0.01, 0.01, 0.02,
       0.01, 0.  , 0.02, 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01,
       0.01, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.  , 0.07,
       0.  , 0.  , 0.  , 0.01])
sorted(zip(feature_importances,
           final_model["preprocessing"].get_feature_names_out()),
       reverse=True)
[(0.18694559869103852, 'log__median_income'),
 (0.0748194905715524, 'cat__ocean_proximity_INLAND'),
 (0.06926417748515576, 'bedrooms__ratio'),
 (0.05446998753775219, 'rooms_per_house__ratio'),
 (0.05262301809680712, 'people_per_house__ratio'),
 (0.03819415873915732, 'geo__Cluster 0 similarity'),
 (0.02879263999929514, 'geo__Cluster 28 similarity'),
 (0.023530192521380392, 'geo__Cluster 24 similarity'),
 (0.020544786346378206, 'geo__Cluster 27 similarity'),
 (0.019873052631077512, 'geo__Cluster 43 similarity'),
 (0.018597511022930273, 'geo__Cluster 34 similarity'),
 (0.017409085415656868, 'geo__Cluster 37 similarity'),
 (0.015546519677632162, 'geo__Cluster 20 similarity'),
 (0.014230331127504292, 'geo__Cluster 17 similarity'),
 (0.0141032216204026, 'geo__Cluster 39 similarity'),
 (0.014065768027447325, 'geo__Cluster 9 similarity'),
 (0.01354220782825315, 'geo__Cluster 4 similarity'),
 (0.01348963625822907, 'geo__Cluster 3 similarity'),
 (0.01338319626383868, 'geo__Cluster 38 similarity'),
 (0.012240533790212824, 'geo__Cluster 31 similarity'),
 (0.012089046542256785, 'geo__Cluster 7 similarity'),
 (0.01152326329703204, 'geo__Cluster 23 similarity'),
 (0.011397459905603558, 'geo__Cluster 40 similarity'),
 (0.011282340924816439, 'geo__Cluster 36 similarity'),
 (0.01104139770781063, 'remainder__housing_median_age'),
 (0.010671123191312802, 'geo__Cluster 44 similarity'),
 (0.010296376177202627, 'geo__Cluster 5 similarity'),
 (0.010184798445004483, 'geo__Cluster 42 similarity'),
 (0.010121853542225083, 'geo__Cluster 11 similarity'),
 (0.009795219101117579, 'geo__Cluster 35 similarity'),
 (0.00952581084310724, 'geo__Cluster 10 similarity'),
 (0.009433209165984823, 'geo__Cluster 13 similarity'),
 (0.00915075361116215, 'geo__Cluster 1 similarity'),
 (0.009021485619463173, 'geo__Cluster 30 similarity'),
 (0.00894936224917583, 'geo__Cluster 41 similarity'),
 (0.008901832702357514, 'geo__Cluster 25 similarity'),
 (0.008897504713401587, 'geo__Cluster 29 similarity'),
 (0.0086846298524955, 'geo__Cluster 21 similarity'),
 (0.008061104590483955, 'geo__Cluster 15 similarity'),
 (0.00786048176566994, 'geo__Cluster 16 similarity'),
 (0.007793633130749198, 'geo__Cluster 22 similarity'),
 (0.007501766442066527, 'log__total_rooms'),
 (0.0072024111938241275, 'geo__Cluster 32 similarity'),
 (0.006947156598995616, 'log__population'),
 (0.006800076770899128, 'log__households'),
 (0.006736105364684462, 'log__total_bedrooms'),
 (0.006315268213499131, 'geo__Cluster 33 similarity'),
 (0.005796398579893261, 'geo__Cluster 14 similarity'),
 (0.005234954623294958, 'geo__Cluster 6 similarity'),
 (0.0045514083468621595, 'geo__Cluster 12 similarity'),
 (0.004546042080216035, 'geo__Cluster 18 similarity'),
 (0.004314514641115755, 'geo__Cluster 2 similarity'),
 (0.003953528110719969, 'geo__Cluster 19 similarity'),
 (0.003297404747742136, 'geo__Cluster 26 similarity'),
 (0.00289453474290887, 'cat__ocean_proximity_<1H OCEAN'),
 (0.0016978863168109126, 'cat__ocean_proximity_NEAR OCEAN'),
 (0.0016391131530559377, 'geo__Cluster 8 similarity'),
 (0.00015061247730531558, 'cat__ocean_proximity_NEAR BAY'),
 (7.301686597099842e-05, 'cat__ocean_proximity_ISLAND')]
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
final_predictions = final_model.predict(X_test)
final_rmse = mean_squared_error(y_test, final_predictions, squared=False)
print(final_rmse)
41424.40026462184
We can compute a 95% confidence interval for the test RMSE:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
array([39275.40861216, 43467.27680583])
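The same interval can be computed manually, spelling out the t-score and the margin; this is equivalent to the stats.t.interval() call above, since stats.sem() is the sample standard deviation (ddof=1) divided by the square root of the sample size:
m = len(squared_errors)
mean = squared_errors.mean()
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)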