Source code for layered.gradient

import math
import functools
import multiprocessing
import numpy as np
from layered.network import Matrices
from layered.utility import batched


class Gradient:

    def __init__(self, network, cost):
        self.network = network
        self.cost = cost

    def __call__(self, weights, example):
        raise NotImplementedError

class Backprop(Gradient):
    """
    Use the backpropagation algorithm to efficiently determine the gradient
    of the cost function with respect to each individual weight.
    """

    def __call__(self, weights, example):
        prediction = self.network.feed(weights, example.data)
        delta_output = self._delta_output(prediction, example.target)
        delta_layers = self._delta_layers(weights, delta_output)
        delta_weights = self._delta_weights(delta_layers)
        return delta_weights

    def _delta_output(self, prediction, target):
        assert len(target) == self.network.layers[-1].size
        # The derivative with respect to the output layer is computed as the
        # product of error derivative and local derivative at the layer.
        delta_cost = self.cost.delta(prediction, target)
        delta_output = self.network.layers[-1].delta(delta_cost)
        assert len(delta_cost) == len(delta_output) == len(target)
        return delta_output

    def _delta_layers(self, weights, delta_output):
        # Propagate backwards through the hidden layers but not the input
        # layer. The current weight matrix is the one to the right of the
        # current layer.
        gradient = [delta_output]
        hidden = list(zip(weights[1:], self.network.layers[1:-1]))
        assert all(x.shape[0] - 1 == len(y) for x, y in hidden)
        for weight, layer in reversed(hidden):
            delta = self._delta_layer(layer, weight, gradient[-1])
            gradient.append(delta)
        return reversed(gradient)

    def _delta_layer(self, layer, weight, above):
        # The gradient at a layer is computed as the derivative of both the
        # local activation and the weighted sum of the derivatives in the
        # deeper layer.
        backward = self.network.backward(weight, above)
        delta = layer.delta(backward)
        assert len(layer) == len(backward) == len(delta)
        return delta

    def _delta_weights(self, delta_layers):
        # The gradient with respect to the weights is computed as the gradient
        # at the target neuron multiplied by the activation of the source
        # neuron.
        gradient = Matrices(self.network.shapes)
        prev_and_delta = zip(self.network.layers[:-1], delta_layers)
        for index, (previous, delta) in enumerate(prev_and_delta):
            # We want to tweak the bias weights so we need them in the
            # gradient.
            activations = np.insert(previous.outgoing, 0, 1)
            assert activations[0] == 1
            gradient[index] = np.outer(activations, delta)
        return gradient

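The outer product in _delta_weights is the standard backpropagation identity: the gradient of a weight connecting a source neuron to a target neuron is the source activation times the delta of the target, with a constant activation of 1 prepended for the bias weight. A minimal standalone sketch with plain NumPy; the layer sizes and values below are invented for illustration and are not part of the library:

    import numpy as np

    # Hypothetical activations of a layer with 3 neurons and deltas of the
    # following layer with 2 neurons.
    activations = np.array([0.2, 0.5, 0.9])
    delta = np.array([0.1, -0.3])
    # Prepend the constant bias activation, as _delta_weights() does.
    activations = np.insert(activations, 0, 1)
    # Gradient for the (3 + 1) x 2 weight matrix between the two layers.
    gradient = np.outer(activations, delta)
    assert gradient.shape == (4, 2)
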
class NumericalGradient(Gradient):
    """
    Approximate the gradient for each weight individually by sampling the
    error function slightly above and below the current value of the weight.
    """

    def __init__(self, network, cost, distance=1e-5):
        super().__init__(network, cost)
        self.distance = distance

    def __call__(self, weights, example):
        """
        Modify each weight individually in both directions to calculate a
        numeric gradient of the weights.
        """
        # We need a copy of the weights that we can modify to evaluate the
        # cost function on.
        modified = Matrices(weights.shapes, weights.flat.copy())
        gradient = Matrices(weights.shapes)
        for i, connection in enumerate(weights):
            for j, original in np.ndenumerate(connection):
                # Sample above and below and compute costs.
                modified[i][j] = original + self.distance
                above = self._evaluate(modified, example)
                modified[i][j] = original - self.distance
                below = self._evaluate(modified, example)
                # Restore the original value so we can reuse the weight
                # matrix for the next iteration.
                modified[i][j] = original
                # Compute the numeric gradient.
                sample = (above - below) / (2 * self.distance)
                gradient[i][j] = sample
        return gradient

    def _evaluate(self, weights, example):
        prediction = self.network.feed(weights, example.data)
        cost = self.cost(prediction, example.target)
        assert cost.shape == prediction.shape
        return cost.sum()

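The central-difference formula used above, (cost(w + d) - cost(w - d)) / (2 d), can be verified on any scalar function with a known gradient. A standalone sketch independent of the library, using a toy cost with gradient 2 * w:

    import numpy as np

    def cost(w):
        # Toy cost function with known gradient 2 * w.
        return (w ** 2).sum()

    w = np.array([0.3, -0.7])
    distance = 1e-5
    numeric = np.empty_like(w)
    for i in range(len(w)):
        original = w[i]
        w[i] = original + distance
        above = cost(w)
        w[i] = original - distance
        below = cost(w)
        # Restore the weight before moving to the next one.
        w[i] = original
        numeric[i] = (above - below) / (2 * distance)

    assert np.allclose(numeric, 2 * w, atol=1e-6)
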
class CheckedBackprop(Gradient):
    """
    Compute the gradient both analytically through backpropagation and
    numerically, in order to validate the backpropagation implementation and
    the derivatives of activation and cost functions. This is inherently
    slow, so it is recommended to validate derivatives on small networks.
    """

    def __init__(self, network, cost, distance=1e-5, tolerance=1e-8):
        self.tolerance = tolerance
        super().__init__(network, cost)
        self.analytic = Backprop(network, cost)
        self.numeric = NumericalGradient(network, cost, distance)

    def __call__(self, weights, example):
        analytic = self.analytic(weights, example)
        numeric = self.numeric(weights, example)
        distances = np.absolute(analytic.flat - numeric.flat)
        worst = distances.max()
        if worst > self.tolerance:
            print('Gradient differs by {:.2f}%'.format(100 * worst))
        else:
            print('Gradient looks good')
        return analytic

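The check boils down to comparing the two flattened gradients elementwise and reporting the largest absolute deviation; the printed percentage is simply 100 times that worst difference. A minimal sketch of the comparison with made-up numbers:

    import numpy as np

    analytic = np.array([0.61, -1.40, 0.20])
    numeric = np.array([0.61, -1.40, 0.23])   # deviates in the last entry
    tolerance = 1e-8
    worst = np.absolute(analytic - numeric).max()
    if worst > tolerance:
        print('Gradient differs by {:.2f}%'.format(100 * worst))
    else:
        print('Gradient looks good')
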
class BatchBackprop:
    """
    Calculate the average gradient over a batch of examples.
    """

    def __init__(self, network, cost):
        self.backprop = Backprop(network, cost)

    def __call__(self, weights, examples):
        gradient = Matrices(weights.shapes)
        for example in examples:
            gradient += self.backprop(weights, example)
        return gradient / len(examples)

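Averaging per-example gradients in this way is equivalent to taking the gradient of the average cost over the batch. A quick NumPy sketch with invented per-example gradients for a 2 x 2 weight matrix:

    import numpy as np

    # Hypothetical gradients of three individual examples.
    gradients = [np.array([[0.1, 0.2], [0.0, -0.1]]),
                 np.array([[0.3, 0.0], [0.2, -0.3]]),
                 np.array([[-0.1, 0.4], [0.1, 0.2]])]
    batch_gradient = sum(gradients) / len(gradients)
    assert np.allclose(batch_gradient, np.mean(gradients, axis=0))
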
class ParallelBackprop:
    """
    Alternative to BatchBackprop that yields the same results but uses
    multiprocessing to take advantage of more than one processor core.
    """

    def __init__(self, network, cost, workers=4):
        self.backprop = BatchBackprop(network, cost)
        self.workers = workers
        self.pool = multiprocessing.Pool(self.workers)

    def __call__(self, weights, examples):
        batch_size = int(math.ceil(len(examples) / self.workers))
        batches = list(batched(examples, batch_size))
        sizes = [len(x) / batch_size for x in batches]
        sizes = [x / sum(sizes) for x in sizes]
        assert len(batches) <= self.workers
        assert sum(sizes) == 1
        compute = functools.partial(self.backprop, weights)
        gradients = self.pool.map(compute, batches)
        return sum(x * y for x, y in zip(gradients, sizes))
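The weighting corrects for the last batch possibly being smaller than batch_size: each per-batch average is scaled by its share of the examples so that the weighted sum equals the plain average over all examples. A standalone sketch of that bookkeeping, with scalars standing in for per-example gradients and a made-up worker count:

    import math

    examples = [1.0, 2.0, 3.0, 4.0, 5.0]   # stand-ins for per-example gradients
    workers = 2
    batch_size = int(math.ceil(len(examples) / workers))   # 3
    batches = [examples[i:i + batch_size]
               for i in range(0, len(examples), batch_size)]   # sizes 3 and 2
    averages = [sum(batch) / len(batch) for batch in batches]
    sizes = [len(batch) / batch_size for batch in batches]
    sizes = [size / sum(sizes) for size in sizes]
    combined = sum(average * size for average, size in zip(averages, sizes))
    # The weighted combination recovers the average over all examples.
    assert abs(combined - sum(examples) / len(examples)) < 1e-12
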