Source code for layered.dataset

import array
import os
import shutil
import struct
import gzip
from urllib.request import urlopen
import numpy as np
from layered.example import Example
from layered.utility import ensure_folder


class Dataset:

    urls = []
    cache = True

    def __init__(self):
        cache = type(self).cache
        if cache and self._is_cached():
            print('Load cached dataset')
            self.load()
        else:
            filenames = [self.download(x) for x in type(self).urls]
            self.training, self.testing = self.parse(*filenames)
            if cache:
                self.dump()

    @classmethod
    def folder(cls):
        name = cls.__name__.lower()
        home = os.path.expanduser('~')
        folder = os.path.join(home, '.layered/dataset', name)
        ensure_folder(folder)
        return folder

    def parse(self):
        """
        Subclass responsibility. The filenames of the downloaded files will
        be passed as individual parameters to this function. Therefore, it
        must accept as many parameters as there are class-level urls. Should
        return a tuple of training examples and testing examples.
        """
        raise NotImplementedError

    def dump(self):
        np.save(self._training_path(), self.training)
        np.save(self._testing_path(), self.testing)

    def load(self):
        self.training = np.load(self._training_path())
        self.testing = np.load(self._testing_path())

    def download(self, url):
        _, filename = os.path.split(url)
        filename = os.path.join(self.folder(), filename)
        print('Download', filename)
        with urlopen(url) as response, open(filename, 'wb') as file_:
            shutil.copyfileobj(response, file_)
        return filename

    @staticmethod
    def split(examples, ratio=0.8):
        """
        Utility function that can be used within the parse() implementation
        of subclasses to split a list of examples into two lists for
        training and testing.
        """
        split = int(ratio * len(examples))
        return examples[:split], examples[split:]

    def _is_cached(self):
        if not os.path.exists(self._training_path()):
            return False
        if not os.path.exists(self._testing_path()):
            return False
        return True

    def _training_path(self):
        return os.path.join(self.folder(), 'training.npy')

    def _testing_path(self):
        return os.path.join(self.folder(), 'testing.npy')
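A subclass only has to override parse(); the constructor then takes care of downloading and caching. Below is a minimal sketch of such a subclass. The XorDataset name and its contents are hypothetical illustrations, not part of the library, and it assumes Example is constructed from an input vector and a target vector as in the classes further down.

from layered.dataset import Dataset
from layered.example import Example


class XorDataset(Dataset):

    cache = False  # nothing to download, so skip the .npy cache

    def parse(self):
        # No class-level urls are defined, so parse() receives no filenames.
        examples = [
            Example([0, 0], [0]), Example([0, 1], [1]),
            Example([1, 0], [1]), Example([1, 1], [0]),
        ]
        return self.split(examples, ratio=0.75)


dataset = XorDataset()
print(len(dataset.training), len(dataset.testing))  # 3 1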
class Test(Dataset):

    cache = False

    def __init__(self, amount=10):
        self.amount = amount
        super().__init__()

    def parse(self):
        examples = [Example([1, 2, 3], [1, 2, 3]) for _ in range(self.amount)]
        return self.split(examples)
class Regression(Dataset):
    """
    Synthetically generated dataset for regression. The task is to predict
    the sum and product of all the input values. All values are normalized
    between zero and one.
    """

    cache = False

    def __init__(self, amount=10000, inputs=10):
        self.amount = amount
        self.inputs = inputs
        super().__init__()

    def parse(self):
        data = np.random.rand(self.amount, self.inputs)
        products = np.prod(data, axis=1)
        products = products / np.max(products)
        sums = np.sum(data, axis=1)
        sums = sums / np.max(sums)
        targets = np.column_stack([sums, products])
        examples = [Example(x, y) for x, y in zip(data, targets)]
        return self.split(examples)
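A brief usage sketch for the synthetic regression data. It assumes that Example exposes the input and target vectors as data and target attributes, matching the order in which Example is constructed throughout this module.

from layered.dataset import Regression

dataset = Regression(amount=1000, inputs=5)
example = dataset.training[0]
print(example.data)    # five input values in [0, 1)
print(example.target)  # normalized [sum, product] of those inputs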
class Modulo(Dataset):
    """
    Synthetically generated classification dataset. The task is to predict
    the modulo classes of random integers encoded as bit arrays of length
    32.
    """

    cache = False

    def __init__(self, amount=60000, inputs=32, classes=7):
        self.amount = amount
        self.inputs = inputs
        self.classes = classes
        super().__init__()

    def parse(self):
        # Draw random integers that fit into self.inputs bits.
        data = np.random.randint(0, 2 ** self.inputs - 1, self.amount)
        mods = np.mod(data, self.classes)
        targets = np.zeros((self.amount, self.classes))
        for index, mod in enumerate(mods):
            targets[index][mod] = 1
        # Unpack each integer into a boolean vector of its bits.
        data = (data[:, None] & (1 << np.arange(self.inputs))) > 0
        examples = [Example(x, y) for x, y in zip(data, targets)]
        return self.split(examples)
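The bit-unpacking idiom in Modulo.parse() is worth a small demonstration: ANDing each integer with a vector of powers of two yields one boolean per bit position, least significant bit first.

import numpy as np

data = np.array([5, 6])
bits = (data[:, None] & (1 << np.arange(4))) > 0
print(bits.astype(int))
# [[1 0 1 0]    5 = 0b0101
#  [0 1 1 0]]   6 = 0b0110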
class Mnist(Dataset):
    """
    The MNIST database of handwritten digits, available from this page, has
    a training set of 60,000 examples, and a test set of 10,000 examples.
    It is a subset of a larger set available from NIST. The digits have
    been size-normalized and centered in a fixed-size image. It is a good
    database for people who want to try learning techniques and pattern
    recognition methods on real-world data while spending minimal efforts
    on preprocessing and formatting. (from http://yann.lecun.com/exdb/mnist/)
    """

    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]

    def parse(self, train_x, train_y, test_x, test_y):
        # pylint: disable=arguments-differ
        training = list(self.read(train_x, train_y))
        testing = list(self.read(test_x, test_y))
        return training, testing
    @staticmethod
    def read(data, labels):
        # IDX image header: magic number, image count, rows and columns as
        # big-endian unsigned 32-bit integers, followed by the raw pixels.
        images = gzip.open(data, 'rb')
        _, size, rows, cols = struct.unpack('>IIII', images.read(16))
        image_bin = array.array('B', images.read())
        images.close()
        # IDX label header: magic number and label count, followed by one
        # byte per label.
        labels = gzip.open(labels, 'rb')
        _, size2 = struct.unpack('>II', labels.read(8))
        assert size == size2
        label_bin = array.array('B', labels.read())
        labels.close()
        for i in range(size):
            # Scale pixel intensities to [0, 1] and one-hot encode labels.
            data = image_bin[i * rows * cols:(i + 1) * rows * cols]
            data = np.array(data).reshape(rows * cols) / 255
            target = np.zeros(10)
            target[label_bin[i]] = 1
            yield Example(data, target)
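A hedged usage sketch: constructing Mnist downloads the four archives on first use and caches the parsed examples as .npy files under ~/.layered/dataset/mnist, as the base class above implies. The attribute access assumes Example exposes data and target, matching how read() constructs it.

from layered.dataset import Mnist

dataset = Mnist()             # downloads and caches on the first run
print(len(dataset.training))  # 60000
print(len(dataset.testing))   # 10000
example = dataset.training[0]
print(example.data.shape)     # (784,) pixel intensities scaled to [0, 1]
print(example.target)         # one-hot vector over the ten digit classes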