import array
import gzip
import os
import shutil
import struct
from urllib.request import urlopen

import numpy as np

from layered.example import Example
from layered.utility import ensure_folder


class Dataset:

    urls = []
    cache = True

    def __init__(self):
        cache = type(self).cache
        if cache and self._is_cached():
            print('Load cached dataset')
            self.load()
        else:
            filenames = [self.download(x) for x in type(self).urls]
            self.training, self.testing = self.parse(*filenames)
            if cache:
                self.dump()

    @classmethod
    def folder(cls):
        name = cls.__name__.lower()
        home = os.path.expanduser('~')
        folder = os.path.join(home, '.layered/dataset', name)
        ensure_folder(folder)
        return folder

    def parse(self):
        """
        Subclass responsibility. The filenames of the downloaded files are
        passed as individual parameters to this method, so it must accept as
        many parameters as there are class-level urls. Should return a tuple
        of training examples and testing examples. See the example subclass
        below this class for a sketch.
        """
        raise NotImplementedError

    def dump(self):
        np.save(self._training_path(), self.training)
        np.save(self._testing_path(), self.testing)

    def load(self):
        self.training = np.load(self._training_path())
        self.testing = np.load(self._testing_path())

    def download(self, url):
        _, filename = os.path.split(url)
        filename = os.path.join(self.folder(), filename)
        print('Download', filename)
        with urlopen(url) as response, open(filename, 'wb') as file_:
            shutil.copyfileobj(response, file_)
        return filename

    @staticmethod
    def split(examples, ratio=0.8):
        """
        Utility function that can be used within the parse() implementation
        of subclasses to split a list of examples into two lists for
        training and testing.
        """
        split = int(ratio * len(examples))
        return examples[:split], examples[split:]
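
    # Usage sketch for split(), assuming a list of ten parsed examples: the
    # default ratio of 0.8 yields eight training and two testing examples.
    #
    #     training, testing = Dataset.split(examples)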

    def _is_cached(self):
        if not os.path.exists(self._training_path()):
            return False
        if not os.path.exists(self._testing_path()):
            return False
        return True

    def _training_path(self):
        return os.path.join(self.folder(), 'training.npy')

    def _testing_path(self):
        return os.path.join(self.folder(), 'testing.npy')
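

# A minimal sketch of the parse() contract described above, not part of the
# library. The url and file format here are hypothetical: one comma-separated
# line per example, with the target value in the last column.
class Csv(Dataset):

    urls = ['http://example.com/data.csv']
    cache = False

    def parse(self, filename):
        # pylint: disable=arguments-differ
        # One filename is passed in per class-level url.
        examples = []
        with open(filename) as file_:
            for line in file_:
                values = [float(x) for x in line.split(',')]
                examples.append(Example(values[:-1], values[-1:]))
        return self.split(examples)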


class Test(Dataset):

    cache = False

    def __init__(self, amount=10):
        self.amount = amount
        super().__init__()

    def parse(self):
        examples = [Example([1, 2, 3], [1, 2, 3]) for _ in range(self.amount)]
        return self.split(examples)


class Regression(Dataset):
    """
    Synthetically generated dataset for regression. The task is to predict
    the sum and product of all the input values. All values are normalized
    between zero and one.
    """

    cache = False

    def __init__(self, amount=10000, inputs=10):
        self.amount = amount
        self.inputs = inputs
        super().__init__()

    def parse(self):
        data = np.random.rand(self.amount, self.inputs)
        # Scale both targets into [0, 1] by dividing by their maxima.
        products = np.prod(data, axis=1)
        products = products / np.max(products)
        sums = np.sum(data, axis=1)
        sums = sums / np.max(sums)
        targets = np.column_stack([sums, products])
        examples = [Example(x, y) for x, y in zip(data, targets)]
        return self.split(examples)
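
# Usage sketch: the dataset is generated on the fly, so constructing it is
# enough. For instance, Regression(amount=1000, inputs=4) yields examples
# with four inputs and two targets, the normalized sum and product.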


class Modulo(Dataset):
    """
    Synthetically generated classification dataset. The task is to predict
    the modulo classes of random integers encoded as bit arrays of length 32.
    """

    cache = False

    def __init__(self, amount=60000, inputs=32, classes=7):
        self.amount = amount
        self.inputs = inputs
        self.classes = classes
        super().__init__()

    def parse(self):
        # Draw random integers that fit into bit arrays of the given length.
        data = np.random.randint(0, 2 ** self.inputs, self.amount,
                                 dtype=np.int64)
        mods = np.mod(data, self.classes)
        # One-hot encode the modulo class of each integer.
        targets = np.zeros((self.amount, self.classes))
        for index, mod in enumerate(mods):
            targets[index][mod] = 1
        # Encode each integer as a boolean bit array, least significant bit
        # first, by masking against each power of two.
        bits = 1 << np.arange(self.inputs, dtype=np.int64)
        data = (data[:, None] & bits) > 0
        examples = [Example(x, y) for x, y in zip(data, targets)]
        return self.split(examples)
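
# Worked example of the bit encoding in Modulo.parse(): with inputs=8, the
# integer 6 becomes [False, True, True, False, False, False, False, False],
# since 6 & (1 << np.arange(8)) is nonzero exactly for bits 1 and 2.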


class Mnist(Dataset):
    """
    The MNIST database of handwritten digits, available from this page, has a
    training set of 60,000 examples, and a test set of 10,000 examples. It is
    a subset of a larger set available from NIST. The digits have been
    size-normalized and centered in a fixed-size image. It is a good database
    for people who want to try learning techniques and pattern recognition
    methods on real-world data while spending minimal efforts on
    preprocessing and formatting. (from http://yann.lecun.com/exdb/mnist/)
    """

    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]

    def parse(self, train_x, train_y, test_x, test_y):
        # pylint: disable=arguments-differ
        training = list(self.read(train_x, train_y))
        testing = list(self.read(test_x, test_y))
        return training, testing

    @staticmethod
    def read(data, labels):
        # The files use the IDX format: a big-endian header followed by the
        # raw pixel or label bytes.
        with gzip.open(data, 'rb') as images:
            _, size, rows, cols = struct.unpack('>IIII', images.read(16))
            image_bin = array.array('B', images.read())
        with gzip.open(labels, 'rb') as file_:
            _, size2 = struct.unpack('>II', file_.read(8))
            assert size == size2
            label_bin = array.array('B', file_.read())
        for i in range(size):
            data = image_bin[i * rows * cols:(i + 1) * rows * cols]
            # Flatten each image and scale the pixel values to [0, 1].
            data = np.array(data).reshape(rows * cols) / 255
            # One-hot encode the digit class.
            target = np.zeros(10)
            target[label_bin[i]] = 1
            yield Example(data, target)
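

if __name__ == '__main__':
    # Usage sketch: construct the synthetic Modulo dataset and inspect the
    # split sizes. The amount is reduced here only to keep the demo fast.
    dataset = Modulo(amount=1000)
    print(len(dataset.training), 'training examples')
    print(len(dataset.testing), 'testing examples')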