Getting started#

Import packages.

[1]:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes

from generalized_additive_models import GAM, Categorical, Spline

/home/docs/checkouts/readthedocs.org/user_builds/generalized-additive-models/envs/stable/lib/python3.11/site-packages/generalized_additive_models/__init__.py:116: UserWarning:
Thank you for using generalized-additive-models, version 0.4.3.
Until version 1.0.0 is released, the package and API should be considered unstable.
You are welcome to use the package. Report bugs and join the discussion on GitHub:
https://github.com/tommyod/generalized-additive-models
  warnings.warn(message)

Load data.

[2]:

data = load_diabetes(as_frame=True)
df = data.data
y = data.target

# Code the "sex" variable as strings
df = df.assign(sex=lambda df: np.where(df.sex < 0, "Male", "Female"))

df.sample(5)

[2]:

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6
163	0.016281	Female	0.072474	0.076958	-0.008449	0.005575	-0.006584	-0.002592	-0.023647	0.061054
95	-0.070900	Male	-0.057941	-0.081413	-0.045599	-0.028871	-0.043401	-0.002592	0.001148	-0.005220
298	0.023546	Female	-0.037463	-0.046985	-0.091006	-0.075530	-0.032356	-0.039493	-0.030748	-0.013504
192	0.056239	Female	-0.030996	0.008101	0.019070	0.021233	0.033914	-0.039493	-0.029526	-0.059067
275	-0.005515	Female	-0.011595	0.011544	-0.022208	-0.015406	-0.021311	-0.002592	0.011011	0.069338

Creating a model#

Construct and fit a model.

[3]:

# Features can be string and refer to column names if X is a pandas DataFrame
# If X was a 2D numpy array, features must instead be integers referring to the columns

terms = Spline("age") + Categorical("sex", penalty=1) + Spline("bmi") + Spline("bp")
model = GAM(terms, link="identity", verbose=2, fit_intercept=False)

# Fit the model
model.fit(df, y)

Variables set to zero for identifiability:3/62
Initial guess:      Objective: 1.5233e+06   Coef. rmse: 1.1671e+02
Iteration:  1/100   Objective: 1.5233e+06   Coef. rmse: 1.1671e+02   Step size: 1/2^19
 => SUCCESS: Solver converged (met tolerance criterion).

[3]:

GAM(fit_intercept=False,
    terms=TermList(data=[Spline(feature='age'), Categorical(feature='sex'), Spline(feature='bmi'), Spline(feature='bp')]),
    verbose=2)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

GAM

iFitted

Parameters

	terms	TermList([Spl...eature='bp')])
	distribution	'normal'
	link	'identity'
	fit_intercept	False
	solver	'pirls'
	max_iter	100
	tol	0.0001
	verbose	2

terms: TermList

Spline(feature='age') + Categorical(feature='sex') + Spline(feature='bmi') + Spline(feature='bp')

TermList

Parameters

	0__by	None
	0__constraint	None
	0__degree	3
	0__edges	None
	0__extrapolation	'linear'
	0__feature	'age'
	0__knots	'uniform'
	0__l2_penalty	0
	0__num_splines	20
	0__penalty	1
	0	Spline(feature='age')
	1__by	None
	1__feature	'sex'
	1__handle_unknown	'error'
	1__max_categories	None
	1__min_frequency	None
	1__penalty	1
	1	Categorical(feature='sex')
	2__by	None
	2__constraint	None
	2__degree	3
	2__edges	None
	2__extrapolation	'linear'
	2__feature	'bmi'
	2__knots	'uniform'
	2__l2_penalty	0
	2__num_splines	20
	2__penalty	1
	2	Spline(feature='bmi')
	3__by	None
	3__constraint	None
	3__degree	3
	3__edges	None
	3__extrapolation	'linear'
	3__feature	'bp'
	3__knots	'uniform'
	3__l2_penalty	0
	3__num_splines	20
	3__penalty	1
	3	Spline(feature='bp')

[4]:

model.summary()

| Property           | Value      |
|--------------------|------------|
| Model              | GAM        |
| Link               | Identity() |
| Distribution       | Normal     |
| Scale              | 3574.069   |
| GCV                | 3854.516   |
| Explained deviance | 0.441      |

| Term             |   Coefs |   Edof |
|------------------|---------|--------|
| Spline(age)      |      20 | 10.381 |
| Categorical(sex) |       2 |  1.99  |
| Spline(bmi)      |      20 |  9.754 |
| Spline(bp)       |      20 | 10.035 |

| Term                    |    Coef |   Coef std |
|-------------------------|---------|------------|
| Categorical(sex=Female) | 146.735 |     4.3758 |
| Categorical(sex=Male)   | 155.602 |     4.083  |

Make a prediction and score the model.

[5]:

predictions = model.predict(df)
model.score(df, y)  # Pseudo R2 score

[5]:

np.float64(0.4411312994129862)

Since this model has a normal distribution and an identity link, the score equal the R2 score.

[6]:

from sklearn.metrics import r2_score

r2_score(y_true=y, y_pred=predictions)

[6]:

0.44113129941298623

Scikit-learn compatibility#

The models interact nicely with scikit-learn.

[7]:

from sklearn.model_selection import cross_val_score

# Models can be used with sklearn's `cross_val_score`
model = GAM(Spline("bp"))
cross_val_score(model, df, y, scoring="r2")

[7]:

array([0.07107161, 0.16072369, 0.15168224, 0.18885055, 0.25517882])

[8]:

from sklearn.model_selection import GridSearchCV

model = GAM(Spline("bp"))

# A parameter grid can be used to search for e.g. the best spline penalty
param_grid = {"terms__penalty": [1, 10, 100, 1000]}
search = GridSearchCV(model, param_grid, scoring="r2").fit(df, y)
search.best_params_

[8]:

{'terms__penalty': 100}

[9]:

# To search over more than one term, use a `param_grid` like this
model = GAM(Spline("bp") + Spline("age"))

# The integers are used to index the terms, so 0 refers to the first term, etc
param_grid = {
    "terms__0__penalty": [1, 10, 100, 1000],
    "terms__1__penalty": [1, 10, 100, 1000],
}
search = GridSearchCV(model, param_grid, scoring="r2").fit(df, y)
search.best_params_

[9]:

{'terms__0__penalty': 100, 'terms__1__penalty': 1000}

Penalties#

[10]:

plt.figure(figsize=(6, 3))
plt.scatter(df["bp"], y, s=2, alpha=0.5, color="k")

X_smooth = np.linspace(df["bp"].min(), df["bp"].max(), num=2**10).reshape(-1, 1)

for penalty in [0.001, 1, 1000]:
    model = GAM(Spline("bp", penalty=penalty)).fit(df, y)

    plt.plot(X_smooth, model.predict(X_smooth), label=f"Penalty={penalty}")

plt.legend()
plt.tight_layout()
plt.show()

../_images/notebooks_getting_started_16_0.png