Introduction to gradients
Setup¶
import numpy as np
import matplotlib.pyplot as plt
# set random seed for reproducibility (all np.random draws below depend on this)
np.random.seed(42)
Representing derivatives¶
A derivative is the instantaneous rate of change of a function with respect to a variable, defined as the limit of the average rate of change over an infinitesimally small interval.
$$f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$
Below, several common functions and their derivatives are represented. Notice how the derivative can be interpreted as the "slope" of a function at a given point.
def plot_1D_gradient_arrows(x, y, grad, dx=0.3, N=5, xlabel=None, ylabel=None):
    """Overlay red slope arrows on the current plot.

    Every N-th point of (x, y) receives an arrow whose direction encodes the
    local gradient value; dx sets the arrow length in data units.
    """
    sample_x, sample_y, sample_slope = x[::N], y[::N], grad[::N]
    # arrow components in data units: horizontal run = dx, vertical rise = slope * dx
    run = dx * np.ones_like(sample_x)
    rise = sample_slope * dx
    # draw the arrows on top of whatever is already plotted
    plt.quiver(sample_x, sample_y, run, rise,
               angles='xy', scale_units='xy', scale=0.5,
               color='red', zorder=10)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
Quadratic¶
def f(x):
    """Quadratic function f(x) = x^2."""
    return x ** 2
def df(x):
    """Analytic derivative of x^2, i.e. 2x."""
    return 2 * x
# sample the quadratic and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-5, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Cosine¶
def f(x):
    """Cosine function."""
    return np.cos(x)
def df(x):
    """Analytic derivative of cos(x), i.e. -sin(x)."""
    return -np.sin(x)
# sample the cosine and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-5, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Polynomial¶
def f(x):
    """Cubic polynomial x^3 - 4x^2 + 2x - 3."""
    # expression kept in expanded form (same evaluation order as elsewhere)
    return x**3 - 4*x**2 + 2*x - 3
def df(x):
    """Analytic derivative of the cubic: 3x^2 - 8x + 2."""
    return 3*x**2 - 8*x + 2
# sample the cubic and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-2, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Linear regression¶
The dataset¶
Suppose we want to model the following dataset, consisting of a collection of X,Y tuples.
# synthetic dataset: noisy samples along the line y = 2x - 3
x = np.arange(-5, 5, 0.25)
m_optimal = 2
b_optimal = -3
# add unit-variance Gaussian noise (deterministic given the seed set above)
y_true = x * m_optimal + b_optimal + np.random.normal(size=x.shape)
plt.scatter(x, y_true)
# suppress the notebook's echo of the scatter artist
None
Model parameters¶
We can think of a linear model of the form y = mx + b, with two parameters to optimize: m (slope) and b (bias). The best choice of model parameters is the one that minimizes the error, or loss, between the predicted and the true y values. An intuitive measure of the difference between predicted and true values is the total sum of squared errors (SSE), computed as follows:
$$\mathrm{L_{SSE}} = \sum_{i=1}^{n} \left(y_i^{\text{true}} - y_i^{\text{pred}}\right)^2$$
Loss landscape¶
How does the error change, depending on the chosen parameters? We can represent this as a partial derivative, where we consider every other variable as constant, and we compute the derivative of an expression with respect to a specific variable.
# here we define the "search space" for parameters m and b
# (values from -10 up to 10.5 in steps of 0.5; the stop value 11 is exclusive)
m_range = np.arange(-10, 11, 0.5)
b_range = np.arange(-10, 11, 0.5)
For m (slope), we can compute
$$ \frac{\partial \mathrm{L}}{\partial m} = -2 \sum_{i=1}^{n} x_i \left(y_i^{\text{true}} - y_i^{\text{pred}}\right) $$
# the linear model itself
def y(x, m, b):
    """Predict y = m*x + b for input x with slope m and bias b."""
    return m * x + b
# this is the Loss function (sum of squared errors, SSE)
def L(y_true, m, b):
    """Return the SSE loss of the model y = m*x + b against y_true.

    Uses the module-level data array `x` and the sibling function `y`.
    Fix: the original cast the result with int(), truncating the continuous
    loss value; the float sum is returned instead so small loss differences
    (and the minimum region of the landscape) are not distorted.
    """
    y_pred = y(x, m, b)
    return float(np.sum((y_true - y_pred) ** 2))
# partial derivative of the SSE loss with respect to m (slope)
def dL_dm(y_true, m, b):
    """Return dL/dm = -2 * sum(x * (y_true - y_pred)); uses module-level x."""
    residuals = y_true - (m * x + b)
    return -2 * np.sum(x * residuals)
# we assume b is constant, and we choose an arbitrary value
fixed_b = 5
# compute the list of loss values for a range of m values
list_losses = [L(y_true, m, b=fixed_b) for m in m_range]
# compute the derivatives of the loss wrt m for a range of m values
gradients = np.array([dL_dm(y_true, m=m, b=fixed_b) for m in m_range])
# plot the 1-D loss curve with arrows showing its slope at sampled points
plt.plot(m_range, list_losses)
plot_1D_gradient_arrows(m_range, list_losses, gradients, 0.5, 5, 'choice of parameter "m"', 'Loss')
For b (bias), we can compute
$$ \frac{\partial \mathrm{L}}{\partial b} = -2 \sum_{i=1}^{n} \left(y_i^{\text{true}} - y_i^{\text{pred}}\right) $$
# partial derivative of the SSE loss with respect to b (bias)
def dL_db(y_true, m, b):
    """Return dL/db = -2 * sum(y_true - y_pred); uses module-level x."""
    residuals = y_true - (m * x + b)
    return -2 * np.sum(residuals)
# we assume m is constant, and we choose an arbitrary value
fixed_m = 5
# compute the list of loss values for a range of b values
list_losses = [L(y_true, m=fixed_m, b=b) for b in b_range]
# compute the derivatives of the loss wrt b for a range of b values
gradients = np.array([dL_db(y_true, m=fixed_m, b=b) for b in b_range])
# plot the 1-D loss curve with arrows showing its slope at sampled points
plt.plot(b_range, list_losses)
plot_1D_gradient_arrows(b_range, list_losses, gradients, 0.5, 5, 'choice of parameter "b"', 'Loss')
So far, to visualize the relationship between parameter choice and error, we held one parameter constant so that only the other varied. To properly represent the loss landscape as a function of both parameters, we have to vary them jointly, not independently.
from mpl_toolkits.mplot3d import Axes3D
def compute_loss_landscape(loss_function, m_range, b_range, targets=None):
    """Evaluate loss_function on every (m, b) pair of the parameter grid.

    Parameters
    ----------
    loss_function : callable(targets, m, b) -> float
        Loss to evaluate at each grid point.
    m_range, b_range : 1-D sequences
        Parameter values to sweep for slope and bias respectively.
    targets : array-like, optional
        True y values passed to loss_function. Defaults to the module-level
        `y_true` (the original hard-coded behavior), so existing callers are
        unaffected; passing it explicitly removes the hidden global dependency.

    Returns
    -------
    ndarray of shape (len(b_range), len(m_range))
        Rows index b, columns index m.
    """
    if targets is None:
        targets = y_true  # backward-compatible fallback to the notebook global
    # grid to store loss values
    L_grid = np.zeros(shape=(len(b_range), len(m_range)))
    # populate the grid, one loss evaluation per (b, m) combination
    for i, b in enumerate(b_range):
        for j, m in enumerate(m_range):
            L_grid[i, j] = loss_function(targets, m, b)
    return L_grid
def plot_3D_loss_landscape(m_range, b_range, L_grid):
    """Draw the loss surface over the (m, b) parameter grid in 3-D.

    Returns the 3-D axes so the caller can adjust the view.
    """
    grid_m, grid_b = np.meshgrid(m_range, b_range)
    fig = plt.figure(figsize=(6, 8), layout='tight')
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(grid_m, grid_b, L_grid, cmap='viridis')
    ax.set_xlabel('m (slope)')
    ax.set_ylabel('b (bias)')
    ax.set_zlabel('Loss', rotation=90)
    # slight zoom-out so the rotated z-label is not clipped
    ax.set_box_aspect(None, zoom=0.95)
    return ax
def plot_2D_loss_landscape(m_range, b_range, L_grid, ax=None):
    """Draw a filled-contour (top-down) view of the loss landscape.

    If ax is None a fresh figure/axes pair is created; otherwise the contours
    are drawn onto the provided axes. Returns the axes used.
    """
    # fix: compare to None with `is`, not `==` (PEP 8 E711; `==` can invoke
    # arbitrary __eq__ overloads and is not an identity test)
    if ax is None:
        fig, ax = plt.subplots()
    M, B = np.meshgrid(m_range, b_range)
    im = ax.contourf(M, B, L_grid, levels=25, cmap='viridis')
    ax.set_xlabel('m')
    ax.set_ylabel('b')
    plt.colorbar(mappable=im, label='Loss')
    return ax
# define the "search space" for parameters m and b (wider range than before)
m_range = np.arange(-20, 20, 0.5)
b_range = np.arange(-40, 40, 0.5)
# 3D plot in perspective
L_grid = compute_loss_landscape(L, m_range, b_range)
ax = plot_3D_loss_landscape(m_range, b_range, L_grid)
ax.view_init(elev=20, azim=35)
# top view of loss landscape
ax = plot_2D_loss_landscape(m_range, b_range, L_grid)
Optimization¶
How do we find the best combination of parameters? By following the derivative at each point! The negative of the derivative points toward the quickest way to traverse the loss landscape to the minimum. We can think of the derivatives with respect to m and b as components of a 2D vector field, which charts the path of steepest descent (which will not necessarily be the most efficient one!).
def plot_2D_loss_gradient():
    """Plot the 2-D loss landscape overlaid with streamlines of the negative gradient.

    Reads the module-level m_range, b_range, L_grid, y_true, dL_dm and dL_db.
    Returns the axes so callers can draw on top of the figure.
    """
    ##########################
    # create figure
    fig, ax = plt.subplots()
    # visualize landscape
    ax = plot_2D_loss_landscape(m_range, b_range, L_grid, ax=ax)
    ##########################
    # compute arrows representing gradient, subsampled every `step` grid points
    step = 5
    M, B = np.meshgrid(m_range, b_range, indexing='xy')
    M_arrow = M[::step, ::step]
    B_arrow = B[::step, ::step]
    dM = np.zeros_like(M_arrow)
    dB = np.zeros_like(B_arrow)
    # evaluate both partial derivatives at each subsampled (m, b) point
    for i in range(M_arrow.shape[0]):
        for j in range(B_arrow.shape[1]):
            dM[i,j] = dL_dm(y_true, M_arrow[i,j], B_arrow[i,j])
            dB[i,j] = dL_db(y_true, M_arrow[i,j], B_arrow[i,j])
    ##########################
    # visualize the negated gradient (the descent direction)
    # NOTE(review): dM is divided by 4, presumably to visually rebalance the
    # m-component against the b-component — confirm this scaling is intentional
    ax.streamplot(M_arrow, B_arrow, -dM/4, -dB, color='red', density=1, linewidth=0.25)
    plt.xlabel('m')
    plt.ylabel('b')
    return ax
ax = plot_2D_loss_gradient()
# draw the previous 2D loss landscape again (fresh figure to overlay the descent path)
ax = plot_2D_loss_gradient()
# define our "starting position"
curr_m = -15
curr_b = -30
# history of (m, b) pairs visited, including the starting point
list_ops = [(curr_m, curr_b)]
N_steps = 100 # how many optimization steps to run
lr = 0.001 # the learning rate scales changes in parameter space
for i in range(N_steps):
    # compute new m and b parameters by stepping against the gradient
    # NOTE(review): the m-gradient is divided by 4, matching the scaling used
    # in the streamplot above — confirm this per-parameter scaling is intended
    new_m = curr_m - dL_dm(y_true, curr_m, curr_b)/4 * lr
    new_b = curr_b - dL_db(y_true, curr_m, curr_b) * lr
    # update parameters
    curr_m, curr_b = new_m, new_b
    list_ops.append((curr_m, curr_b))
# visualize optimization path in the 2D loss landscape
mat_ops = np.array(list_ops)
ax.scatter(mat_ops[:, 0], mat_ops[:, 1], color='orange', marker='*')
ax.plot(mat_ops[:, 0], mat_ops[:, 1], color='orange')
None
# these are the optimized parameters that we obtain
print(f"Optimal value of b: {curr_b}")
print(f"Optimal value of m: {curr_m}")
Optimal value of b: -3.235905316463203 Optimal value of m: 1.9181494319433217
# let us visualize how well those optimized parameters fit the initial data
plt.scatter(x, y_true)
y_pred = curr_m * x + curr_b
plt.plot(x, y_pred, color='orange')
# suppress the notebook's echo of the plot artist
None