Introduction to gradients
Setup¶
import numpy as np
import matplotlib.pyplot as plt
# set random seed for reproducibility (all np.random draws below depend on this)
np.random.seed(42)
Representing derivatives¶
A derivative is the instantaneous rate of change of a function with respect to a variable, defined as the limit of the average rate of change over an infinitesimally small interval.
$$f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$
Below, several common functions and their derivatives are represented. Notice how the derivative can be interpreted as the "slope" of a function at a given point.
def plot_1D_gradient_arrows(x, y, grad, dx=0.3, N=5, xlabel=None, ylabel=None):
    """Overlay red slope arrows on the current plot.

    Every N-th point of (x, y) receives an arrow whose direction encodes the
    local gradient value; dx sets the arrow length in data units.
    """
    sample_x, sample_y, sample_slope = x[::N], y[::N], grad[::N]
    # arrow components in data units: horizontal run = dx, vertical rise = slope * dx
    run = dx * np.ones_like(sample_x)
    rise = sample_slope * dx
    # draw the arrows on top of whatever is already plotted
    plt.quiver(sample_x, sample_y, run, rise,
               angles='xy', scale_units='xy', scale=0.5,
               color='red', zorder=10)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
Quadratic¶
def f(x):
    """Quadratic function f(x) = x^2."""
    return x ** 2
def df(x):
    """Analytic derivative of x^2, i.e. 2x."""
    return 2 * x
# sample the quadratic and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-5, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Cosine¶
def f(x):
    """Cosine function."""
    return np.cos(x)
def df(x):
    """Analytic derivative of cos(x), i.e. -sin(x)."""
    return -np.sin(x)
# sample the cosine and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-5, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Polynomial¶
def f(x):
    """Cubic polynomial x^3 - 4x^2 + 2x - 3."""
    # expression kept in expanded form (same evaluation order as elsewhere)
    return x**3 - 4*x**2 + 2*x - 3
def df(x):
    """Analytic derivative of the cubic: 3x^2 - 8x + 2."""
    return 3*x**2 - 8*x + 2
# sample the cubic and its derivative on a grid, then plot with slope arrows
xs = np.linspace(-2, 5, num=100)
ys = f(xs)
slopes = df(xs)
plt.plot(xs, ys)
plot_1D_gradient_arrows(xs, ys, slopes, 0.1, 5)
Linear regression¶
The dataset¶
Suppose we want to model the following dataset, consisting of a collection of X,Y tuples.
# synthetic dataset: noisy samples along the line y = 2x - 3
x = np.arange(-5, 5, 0.25)
m_optimal = 2
b_optimal = -3
# add unit-variance Gaussian noise (deterministic given the seed set above)
y_true = x * m_optimal + b_optimal + np.random.normal(size=x.shape)
plt.scatter(x, y_true)
# suppress the notebook's echo of the scatter artist
None
Model parameters¶
We can think of a linear model of the form y = mx + b, with two parameters to optimize: m (slope) and b (bias). The best choice of model parameters is the one that minimizes the error, or loss, between the predicted and the true y values. An intuitive measure of the difference between predicted and true values is the total sum of squared errors (SSE), computed as follows:
$$\mathrm{L_{SSE}} = \sum_{i=1}^{n} \left(y_i^{\text{true}} - y_i^{\text{pred}}\right)^2$$
Loss landscape¶
How does the error change, depending on the chosen parameters? We can represent this as a partial derivative, where we consider every other variable as constant, and we compute the derivative of an expression with respect to a specific variable.
# here we define the "search space" for parameters m and b
# (values from -10 up to 10.5 in steps of 0.5; the stop value 11 is exclusive)
m_range = np.arange(-10, 11, 0.5)
b_range = np.arange(-10, 11, 0.5)
For m (slope), we can compute
$$ \frac{\partial \mathrm{L}}{\partial m} = -2 \sum_{i=1}^{n} x_i \left(y_i^{\text{true}} - y_i^{\text{pred}}\right) $$
# the linear model itself
def y(x, m, b):
    """Predict y = m*x + b for input x with slope m and bias b."""
    return m * x + b
# this is the Loss function (sum of squared errors, SSE)
def L(y_true, m, b):
    """Return the SSE loss of the model y = m*x + b against y_true.

    Uses the module-level data array `x` and the sibling function `y`.
    Fix: the original cast the result with int(), truncating the continuous
    loss value; the float sum is returned instead so small loss differences
    (and the minimum region of the landscape) are not distorted.
    """
    y_pred = y(x, m, b)
    return float(np.sum((y_true - y_pred) ** 2))
# partial derivative of the SSE loss with respect to m (slope)
def dL_dm(y_true, m, b):
    """Return dL/dm = -2 * sum(x * (y_true - y_pred)); uses module-level x."""
    residuals = y_true - (m * x + b)
    return -2 * np.sum(x * residuals)
# we assume b is constant, and we choose an arbitrary value
fixed_b = 5
# compute the list of loss values for a range of m values
list_losses = [L(y_true, m, b=fixed_b) for m in m_range]
# compute the derivatives of the loss wrt m for a range of m values
gradients = np.array([dL_dm(y_true, m=m, b=fixed_b) for m in m_range])
# plot the 1-D loss curve with arrows showing its slope at sampled points
plt.plot(m_range, list_losses)
plot_1D_gradient_arrows(m_range, list_losses, gradients, 0.5, 5, 'choice of parameter "m"', 'Loss')
For b (bias), we can compute
$$ \frac{\partial \mathrm{L}}{\partial b} = -2 \sum_{i=1}^{n} \left(y_i^{\text{true}} - y_i^{\text{pred}}\right) $$
# partial derivative of the SSE loss with respect to b (bias)
def dL_db(y_true, m, b):
    """Return dL/db = -2 * sum(y_true - y_pred); uses module-level x."""
    residuals = y_true - (m * x + b)
    return -2 * np.sum(residuals)
# we assume m is constant, and we choose an arbitrary value
fixed_m = 5
# compute the list of loss values for a range of b values
list_losses = [L(y_true, m=fixed_m, b=b) for b in b_range]
# compute the derivatives of the loss wrt b for a range of b values
gradients = np.array([dL_db(y_true, m=fixed_m, b=b) for b in b_range])
# plot the 1-D loss curve with arrows showing its slope at sampled points
plt.plot(b_range, list_losses)
plot_1D_gradient_arrows(b_range, list_losses, gradients, 0.5, 5, 'choice of parameter "b"', 'Loss')
So far, to visualize the relationship between parameter choice and error, we held one parameter constant so that only the other varied. To properly represent the loss landscape as a function of both parameters, we have to vary them jointly, not independently.
from mpl_toolkits.mplot3d import Axes3D
def compute_loss_landscape(loss_function, m_range, b_range, targets=None):
    """Evaluate loss_function on every (m, b) pair of the parameter grid.

    Parameters
    ----------
    loss_function : callable(targets, m, b) -> float
        Loss to evaluate at each grid point.
    m_range, b_range : 1-D sequences
        Parameter values to sweep for slope and bias respectively.
    targets : array-like, optional
        True y values passed to loss_function. Defaults to the module-level
        `y_true` (the original hard-coded behavior), so existing callers are
        unaffected; passing it explicitly removes the hidden global dependency.

    Returns
    -------
    ndarray of shape (len(b_range), len(m_range))
        Rows index b, columns index m.
    """
    if targets is None:
        targets = y_true  # backward-compatible fallback to the notebook global
    # grid to store loss values
    L_grid = np.zeros(shape=(len(b_range), len(m_range)))
    # populate the grid, one loss evaluation per (b, m) combination
    for i, b in enumerate(b_range):
        for j, m in enumerate(m_range):
            L_grid[i, j] = loss_function(targets, m, b)
    return L_grid
def plot_3D_loss_landscape(m_range, b_range, L_grid):
    """Draw the loss surface over the (m, b) parameter grid in 3-D.

    Returns the 3-D axes so the caller can adjust the view.
    """
    grid_m, grid_b = np.meshgrid(m_range, b_range)
    fig = plt.figure(figsize=(6, 8), layout='tight')
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(grid_m, grid_b, L_grid, cmap='viridis')
    ax.set_xlabel('m (slope)')
    ax.set_ylabel('b (bias)')
    ax.set_zlabel('Loss', rotation=90)
    # slight zoom-out so the rotated z-label is not clipped
    ax.set_box_aspect(None, zoom=0.95)
    return ax
def plot_2D_loss_landscape(m_range, b_range, L_grid, ax=None):
    """Draw a filled-contour (top-down) view of the loss landscape.

    If ax is None a fresh figure/axes pair is created; otherwise the contours
    are drawn onto the provided axes. Returns the axes used.
    """
    # fix: compare to None with `is`, not `==` (PEP 8 E711; `==` can invoke
    # arbitrary __eq__ overloads and is not an identity test)
    if ax is None:
        fig, ax = plt.subplots()
    M, B = np.meshgrid(m_range, b_range)
    im = ax.contourf(M, B, L_grid, levels=25, cmap='viridis')
    ax.set_xlabel('m')
    ax.set_ylabel('b')
    plt.colorbar(mappable=im, label='Loss')
    return ax
# define the "search space" for parameters m and b (wider range than before)
m_range = np.arange(-20, 20, 0.5)
b_range = np.arange(-40, 40, 0.5)
# 3D plot in perspective
L_grid = compute_loss_landscape(L, m_range, b_range)
ax = plot_3D_loss_landscape(m_range, b_range, L_grid)
ax.view_init(elev=20, azim=35)
# top view of loss landscape
ax = plot_2D_loss_landscape(m_range, b_range, L_grid)
Optimization¶
How do we find the best combination of parameters? By following the derivative at each point! The negative of the derivative points toward the quickest way to traverse the loss landscape to the minimum. We can think of the derivatives with respect to m and b as components of a 2D vector field, which charts the path of steepest descent (which will not necessarily be the most efficient one!).
def plot_2D_loss_gradient():
    """Plot the 2-D loss landscape overlaid with streamlines of the negative gradient.

    Reads the module-level m_range, b_range, L_grid, y_true, dL_dm and dL_db.
    Returns the axes so callers can draw on top of the figure.
    """
    ##########################
    # create figure
    fig, ax = plt.subplots()
    # visualize landscape
    ax = plot_2D_loss_landscape(m_range, b_range, L_grid, ax=ax)
    ##########################
    # compute arrows representing gradient, subsampled every `step` grid points
    step = 5
    M, B = np.meshgrid(m_range, b_range, indexing='xy')
    M_arrow = M[::step, ::step]
    B_arrow = B[::step, ::step]
    dM = np.zeros_like(M_arrow)
    dB = np.zeros_like(B_arrow)
    # evaluate both partial derivatives at each subsampled (m, b) point
    for i in range(M_arrow.shape[0]):
        for j in range(B_arrow.shape[1]):
            dM[i,j] = dL_dm(y_true, M_arrow[i,j], B_arrow[i,j])
            dB[i,j] = dL_db(y_true, M_arrow[i,j], B_arrow[i,j])
    ##########################
    # visualize the negated gradient (the descent direction)
    # NOTE(review): dM is divided by 4, presumably to visually rebalance the
    # m-component against the b-component — confirm this scaling is intentional
    ax.streamplot(M_arrow, B_arrow, -dM/4, -dB, color='red', density=1, linewidth=0.25)
    plt.xlabel('m')
    plt.ylabel('b')
    return ax
ax = plot_2D_loss_gradient()
# draw the previous 2D loss landscape again (fresh figure to overlay the descent path)
ax = plot_2D_loss_gradient()
# define our "starting position"
curr_m = -15
curr_b = -30
# history of (m, b) pairs visited, including the starting point
list_ops = [(curr_m, curr_b)]
N_steps = 100 # how many optimization steps to run
lr = 0.001 # the learning rate scales changes in parameter space
for i in range(N_steps):
    # compute new m and b parameters by stepping against the gradient
    # NOTE(review): the m-gradient is divided by 4, matching the scaling used
    # in the streamplot above — confirm this per-parameter scaling is intended
    new_m = curr_m - dL_dm(y_true, curr_m, curr_b)/4 * lr
    new_b = curr_b - dL_db(y_true, curr_m, curr_b) * lr
    # update parameters
    curr_m, curr_b = new_m, new_b
    list_ops.append((curr_m, curr_b))
# visualize optimization path in the 2D loss landscape
mat_ops = np.array(list_ops)
ax.scatter(mat_ops[:, 0], mat_ops[:, 1], color='orange', marker='*')
ax.plot(mat_ops[:, 0], mat_ops[:, 1], color='orange')
None
# these are the optimized parameters that we obtain
print(f"Optimal value of b: {curr_b}")
print(f"Optimal value of m: {curr_m}")
Optimal value of b: -3.235905316463203 Optimal value of m: 1.9181494319433217
# let us visualize how well those optimized parameters fit the initial data
plt.scatter(x, y_true)
y_pred = curr_m * x + curr_b
plt.plot(x, y_pred, color='orange')
# suppress the notebook's echo of the plot artist
None