Module vipy.data.emnist
Expand source code Browse git
import os
import numpy as np
from vipy.util import remkdir, tocache, filetail
import gzip
import struct
from array import array
import vipy.image
import string
# Download locations and expected SHA1 digests of the four MNIST archives.
# Each URL/digest pair is passed together to vipy.downloader.download() in MNIST.__init__.
TRAIN_IMG_URL = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
TRAIN_IMG_SHA1 = '6c95f4b05d2bf285e1bfb0e7960c31bd3b3f8a7d'
TRAIN_LBL_URL = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
TRAIN_LBL_SHA1 = '2a80914081dc54586dbdf242f9805a6b8d2a15fc'
TEST_IMG_URL = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
TEST_IMG_SHA1 = 'c3a25af1f52dad7f726cce8cacb138654b760d48'
TEST_LBL_URL = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
TEST_LBL_SHA1 = '763e7fa3757d93b0cdec073cef058b2004252c17'
# Archive fetched and unpacked by EMNIST.__init__; the EMNIST accessors read files from its 'gzip/' folder.
EMNIST_URL = 'https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip'
class MNIST():
    """Reader for the MNIST handwritten-digit archives (gzip-compressed idx-ubyte files).

    Construction is currently disabled: ``__init__`` raises
    ``ValueError('moved to huggingface')`` before doing any work.  The static
    readers (``_labels``, ``_imread``, ``_dataset``) do not touch instance
    state and remain usable on their own.
    """

    def __init__(self, datadir=None, redownload=False):
        """Formerly downloaded the four MNIST archives into ``datadir``.

        Args:
            datadir: target directory, or None to use ``tocache('mnist')``.
            redownload: if True, fetch the archives even when the '.complete' marker exists.

        Raises:
            ValueError: always; this loader has been retired.
        """
        raise ValueError('moved to huggingface')

        # NOTE(review): everything below is unreachable because of the raise above.
        # It is retained unchanged as a record of the previous download procedure.
        outdir = tocache('mnist') if datadir is None else datadir
        self._datadir = remkdir(os.path.expanduser(outdir))
        if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')):
            vipy.downloader.download(TRAIN_IMG_URL, os.path.join(self._datadir, filetail(TRAIN_IMG_URL)), sha1=TRAIN_IMG_SHA1)
            vipy.downloader.download(TRAIN_LBL_URL, os.path.join(self._datadir, filetail(TRAIN_LBL_URL)), sha1=TRAIN_LBL_SHA1)
            vipy.downloader.download(TEST_IMG_URL, os.path.join(self._datadir, filetail(TEST_IMG_URL)), sha1=TEST_IMG_SHA1)
            vipy.downloader.download(TEST_LBL_URL, os.path.join(self._datadir, filetail(TEST_LBL_URL)), sha1=TEST_LBL_SHA1)
            open(os.path.join(self._datadir, '.complete'), 'a').close()

    @staticmethod
    def _labels(gzfile):
        """Read every label from a gzip-compressed idx1-ubyte label file.

        Args:
            gzfile: path to the label archive.

        Returns:
            ``array('B')`` holding all bytes that follow the 8-byte header.

        Raises:
            ValueError: if the header magic number is not 2049.
        """
        with gzip.open(gzfile, 'rb') as file:
            # Header: two big-endian unsigned 32-bit integers (magic, item count).
            magic, _count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got %d' % magic)
            return array("B", file.read())

    @staticmethod
    def _imread(dataset, index):
        """Read MNIST encoded images, adapted from: https://github.com/sorki/python-mnist/blob/master/mnist/loader.py

        Args:
            dataset: path to a gzip-compressed idx3-ubyte image file.
                NOTE(review): the previous body discarded this argument and opened
                ``None``, so it could never succeed; it is now used as the archive path.
            index: zero-based index of the image to decode.

        Returns:
            numpy array of shape (rows, cols) with the pixel values of image ``index``.

        Raises:
            ValueError: if the header magic number is not 2051.
            IndexError: if ``index`` is outside the image count stored in the header.
        """
        with gzip.open(dataset, 'rb') as file:
            # Header: four big-endian unsigned 32-bit integers (magic, image count, rows, cols).
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got %d' % magic)
            if not 0 <= index < size:
                raise IndexError('image index %d out of range for %d images' % (index, size))
            # Images are stored back to back after the 16-byte header, one byte per pixel.
            file.seek(index * rows * cols + 16)
            image = np.asarray(array("B", file.read(rows * cols)).tolist())
        return np.reshape(image, (rows, cols))

    @staticmethod
    def _dataset(img_gzfile, label_gzfile, N):
        """Decode the first ``N`` images and pair them with their labels.

        Args:
            img_gzfile: path to a gzip-compressed idx3-ubyte image file.
            label_gzfile: path to the matching gzip-compressed idx1-ubyte label file.
            N: number of images to read from the start of the image file.

        Returns:
            Tuple of ``(image, label)`` pairs, where ``image`` is a uint8 numpy array of
            shape (rows, cols) and ``label`` is an int.  ``zip`` truncates to the shorter
            of the decoded images and the label list.

        Raises:
            ValueError: if either magic number is wrong, or if the image file holds
                fewer than ``N`` complete images (the reshape of a short read fails).
        """
        labels = MNIST._labels(label_gzfile).tolist()
        with gzip.open(img_gzfile, 'rb') as gzfile:
            magic, _count, rows, cols = struct.unpack(">IIII", gzfile.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got %d' % magic)
            nbytes = rows * cols
            # frombuffer avoids building a Python list per image; copy() makes each array
            # writable and independent of the bytes object that was read.
            images = [np.frombuffer(gzfile.read(nbytes), dtype=np.uint8).reshape((rows, cols)).copy()
                      for _ in range(N)]
        return tuple(zip(images, labels))

    def trainset(self):
        """Return the 60000-example training split as a ``vipy.dataset.Dataset`` with id 'mnist'.

        The dataset loader turns each ``(image, label)`` pair into a
        ``vipy.image.ImageCategory`` with category ``str(label)`` and colorspace 'lum'.
        """
        (labelfile, imgfile, N) = (os.path.join(self._datadir, 'train-labels-idx1-ubyte.gz'),
                                   os.path.join(self._datadir, 'train-images-idx3-ubyte.gz'),
                                   60000)
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N),
                                    loader=lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum'),
                                    id='mnist')

    def testset(self):
        """Return the 10000-example test split as a ``vipy.dataset.Dataset`` with id 'mnist_test'.

        The dataset loader turns each ``(image, label)`` pair into a
        ``vipy.image.ImageCategory`` with category ``str(label)`` and colorspace 'lum'.
        """
        (labelfile, imgfile, N) = (os.path.join(self._datadir, 't10k-labels-idx1-ubyte.gz'),
                                   os.path.join(self._datadir, 't10k-images-idx3-ubyte.gz'),
                                   10000)
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N),
                                    loader=lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum'),
                                    id='mnist_test')
class EMNIST(MNIST):
    """Accessors for the EMNIST letters and digits splits.

    The archive at ``EMNIST_URL`` is unpacked into the data directory and the
    splits are read from its ``gzip/`` sub-folder using the readers inherited
    from ``MNIST``.
    """

    def __init__(self, datadir=None, redownload=False):
        """Download and unpack the EMNIST archive unless it is already present.

        Args:
            datadir: target directory, or None to use ``tocache('emnist')``.
            redownload: if True, fetch the archive even when the '.complete' marker exists.

        NOTE(review): ``MNIST.__init__`` is deliberately not invoked.  It only fetches
        the MNIST archives, which none of the accessors below read, and it currently
        raises ``ValueError`` unconditionally, which made this class impossible to
        construct.  Confirm that EMNIST is meant to stay usable.
        """
        datadir = tocache('emnist') if datadir is None else datadir
        # expanduser for consistency with how MNIST resolves its data directory
        self._datadir = vipy.util.remkdir(os.path.expanduser(datadir))
        marker = os.path.join(self._datadir, '.complete')
        if redownload or not os.path.exists(marker):
            vipy.downloader.download_and_unpack(EMNIST_URL, self._datadir)
            # Written only after a successful unpack so that a failed download is retried.
            open(marker, 'a').close()

    def _split(self, name, N, dataset_id, categories=None):
        """Build one EMNIST split as a ``vipy.dataset.Dataset``.

        Args:
            name: split name embedded in the archive file names, e.g. 'letters-train'.
            N: number of examples to read from the image file.
            dataset_id: id assigned to the returned dataset.
            categories: optional dict mapping ``str(label)`` to a category string;
                when None the category is ``str(label)`` itself.
        """
        imgfile = os.path.join(self._datadir, 'gzip/emnist-%s-images-idx3-ubyte.gz' % name)
        labelfile = os.path.join(self._datadir, 'gzip/emnist-%s-labels-idx1-ubyte.gz' % name)
        if categories is None:
            loader = lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum')
        else:
            loader = lambda z: vipy.image.ImageCategory(array=z[0], category=categories[str(z[1])], colorspace='lum')
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N), loader=loader, id=dataset_id)

    @staticmethod
    def _letter_categories():
        """Map label strings '1'..'26' to the lowercase letters 'a'..'z'."""
        return {str(k): x for (k, x) in enumerate(string.ascii_lowercase, start=1)}

    def letters_train(self):
        """Return the letters training split (124800 examples, id 'emnist_letters_train')."""
        return self._split('letters-train', 124800, 'emnist_letters_train', self._letter_categories())

    def letters_test(self):
        """Return the letters test split (145600-124800 examples, id 'emnist_letters_test')."""
        return self._split('letters-test', 145600 - 124800, 'emnist_letters_test', self._letter_categories())

    def letters(self):
        """Return the letters splits as a ``(train, test)`` tuple."""
        return (self.letters_train(), self.letters_test())

    def digits_train(self):
        """Return the digits training split (240000 examples, id 'emnist_digits_train')."""
        return self._split('digits-train', 240000, 'emnist_digits_train')

    def digits_test(self):
        """Return the digits test split (280000-240000 examples, id 'emnist_digits_test')."""
        return self._split('digits-test', 280000 - 240000, 'emnist_digits_test')

    def digits(self):
        """Return the digits splits as a ``(train, test)`` tuple."""
        return (self.digits_train(), self.digits_test())

    def trainset(self):
        """Return the union of the letters and digits training splits (id 'emnist')."""
        # Call the *_train accessors directly so the test archives are not decoded needlessly.
        return vipy.dataset.Union(self.letters_train(), self.digits_train(), id='emnist')

    def testset(self):
        """Return the union of the letters and digits test splits (id 'emnist_test').

        The previous implementation indexed ``[0]`` on ``letters()``/``digits()`` and
        therefore returned the training splits.
        """
        return vipy.dataset.Union(self.letters_test(), self.digits_test(), id='emnist_test')
Classes
class EMNIST (datadir=None, redownload=False)
-
Expand source code Browse git
class EMNIST(MNIST):
    """Accessors for the EMNIST letters and digits splits.

    The archive at ``EMNIST_URL`` is unpacked into the data directory and the
    splits are read from its ``gzip/`` sub-folder using the readers inherited
    from ``MNIST``.
    """

    def __init__(self, datadir=None, redownload=False):
        """Download and unpack the EMNIST archive unless it is already present.

        Args:
            datadir: target directory, or None to use ``tocache('emnist')``.
            redownload: if True, fetch the archive even when the '.complete' marker exists.

        NOTE(review): ``MNIST.__init__`` is deliberately not invoked.  It only fetches
        the MNIST archives, which none of the accessors below read, and it currently
        raises ``ValueError`` unconditionally, which made this class impossible to
        construct.  Confirm that EMNIST is meant to stay usable.
        """
        datadir = tocache('emnist') if datadir is None else datadir
        # expanduser for consistency with how MNIST resolves its data directory
        self._datadir = vipy.util.remkdir(os.path.expanduser(datadir))
        marker = os.path.join(self._datadir, '.complete')
        if redownload or not os.path.exists(marker):
            vipy.downloader.download_and_unpack(EMNIST_URL, self._datadir)
            # Written only after a successful unpack so that a failed download is retried.
            open(marker, 'a').close()

    def _split(self, name, N, dataset_id, categories=None):
        """Build one EMNIST split as a ``vipy.dataset.Dataset``.

        Args:
            name: split name embedded in the archive file names, e.g. 'letters-train'.
            N: number of examples to read from the image file.
            dataset_id: id assigned to the returned dataset.
            categories: optional dict mapping ``str(label)`` to a category string;
                when None the category is ``str(label)`` itself.
        """
        imgfile = os.path.join(self._datadir, 'gzip/emnist-%s-images-idx3-ubyte.gz' % name)
        labelfile = os.path.join(self._datadir, 'gzip/emnist-%s-labels-idx1-ubyte.gz' % name)
        if categories is None:
            loader = lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum')
        else:
            loader = lambda z: vipy.image.ImageCategory(array=z[0], category=categories[str(z[1])], colorspace='lum')
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N), loader=loader, id=dataset_id)

    @staticmethod
    def _letter_categories():
        """Map label strings '1'..'26' to the lowercase letters 'a'..'z'."""
        return {str(k): x for (k, x) in enumerate(string.ascii_lowercase, start=1)}

    def letters_train(self):
        """Return the letters training split (124800 examples, id 'emnist_letters_train')."""
        return self._split('letters-train', 124800, 'emnist_letters_train', self._letter_categories())

    def letters_test(self):
        """Return the letters test split (145600-124800 examples, id 'emnist_letters_test')."""
        return self._split('letters-test', 145600 - 124800, 'emnist_letters_test', self._letter_categories())

    def letters(self):
        """Return the letters splits as a ``(train, test)`` tuple."""
        return (self.letters_train(), self.letters_test())

    def digits_train(self):
        """Return the digits training split (240000 examples, id 'emnist_digits_train')."""
        return self._split('digits-train', 240000, 'emnist_digits_train')

    def digits_test(self):
        """Return the digits test split (280000-240000 examples, id 'emnist_digits_test')."""
        return self._split('digits-test', 280000 - 240000, 'emnist_digits_test')

    def digits(self):
        """Return the digits splits as a ``(train, test)`` tuple."""
        return (self.digits_train(), self.digits_test())

    def trainset(self):
        """Return the union of the letters and digits training splits (id 'emnist')."""
        # Call the *_train accessors directly so the test archives are not decoded needlessly.
        return vipy.dataset.Union(self.letters_train(), self.digits_train(), id='emnist')

    def testset(self):
        """Return the union of the letters and digits test splits (id 'emnist_test').

        The previous implementation indexed ``[0]`` on ``letters()``/``digits()`` and
        therefore returned the training splits.
        """
        return vipy.dataset.Union(self.letters_test(), self.digits_test(), id='emnist_test')
Ancestors
Methods
def digits(self)
-
Expand source code Browse git
def digits(self):
    """Return the digits splits as a ``(train, test)`` pair, built in that order."""
    train_split = self.digits_train()
    test_split = self.digits_test()
    return (train_split, test_split)
def digits_test(self)
-
Expand source code Browse git
def digits_test(self):
    """Build the EMNIST digits test split.

    Decodes the 'emnist-digits-test' image/label archives found under the
    ``gzip`` folder of the data directory and wraps the resulting
    ``(image, label)`` pairs in a ``vipy.dataset.Dataset`` with id
    'emnist_digits_test'.  The loader yields a ``vipy.image.ImageCategory``
    whose category is ``str(label)``.
    """
    images_gz = os.path.join(self._datadir, 'gzip/emnist-digits-test-images-idx3-ubyte.gz')
    labels_gz = os.path.join(self._datadir, 'gzip/emnist-digits-test-labels-idx1-ubyte.gz')
    # number of examples to read: 280000 minus the 240000 read by digits_train
    num_examples = 280000 - 240000
    pairs = self._dataset(images_gz, labels_gz, N=num_examples)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=str(pair[1]), colorspace='lum'),
        id='emnist_digits_test',
    )
def digits_train(self)
-
Expand source code Browse git
def digits_train(self):
    """Build the EMNIST digits training split.

    Decodes the first 240000 examples of the 'emnist-digits-train'
    image/label archives found under the ``gzip`` folder of the data
    directory and wraps the ``(image, label)`` pairs in a
    ``vipy.dataset.Dataset`` with id 'emnist_digits_train'.  The loader yields
    a ``vipy.image.ImageCategory`` whose category is ``str(label)``.
    """
    images_gz = os.path.join(self._datadir, 'gzip/emnist-digits-train-images-idx3-ubyte.gz')
    labels_gz = os.path.join(self._datadir, 'gzip/emnist-digits-train-labels-idx1-ubyte.gz')
    pairs = self._dataset(images_gz, labels_gz, N=240000)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=str(pair[1]), colorspace='lum'),
        id='emnist_digits_train',
    )
def letters(self)
-
Expand source code Browse git
def letters(self):
    """Return the letters splits as a ``(train, test)`` pair, built in that order."""
    train_split = self.letters_train()
    test_split = self.letters_test()
    return (train_split, test_split)
def letters_test(self)
-
Expand source code Browse git
def letters_test(self):
    """Build the EMNIST letters test split.

    Decodes the 'emnist-letters-test' image/label archives found under the
    ``gzip`` folder of the data directory and wraps the ``(image, label)``
    pairs in a ``vipy.dataset.Dataset`` with id 'emnist_letters_test'.  The
    loader yields a ``vipy.image.ImageCategory`` whose category is the
    lowercase letter for the label (1 -> 'a', ..., 26 -> 'z').
    """
    images_gz = os.path.join(self._datadir, 'gzip/emnist-letters-test-images-idx3-ubyte.gz')
    labels_gz = os.path.join(self._datadir, 'gzip/emnist-letters-test-labels-idx1-ubyte.gz')
    # label strings '1'..'26' -> 'a'..'z'
    letter_for_label = {}
    for position, letter in enumerate(string.ascii_lowercase, start=1):
        letter_for_label[str(position)] = letter
    # number of examples to read: 145600 minus the 124800 read by letters_train
    num_examples = 145600 - 124800
    pairs = self._dataset(images_gz, labels_gz, N=num_examples)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=letter_for_label[str(pair[1])], colorspace='lum'),
        id='emnist_letters_test',
    )
def letters_train(self)
-
Expand source code Browse git
def letters_train(self):
    """Build the EMNIST letters training split.

    Decodes the first 124800 examples of the 'emnist-letters-train'
    image/label archives found under the ``gzip`` folder of the data
    directory and wraps the ``(image, label)`` pairs in a
    ``vipy.dataset.Dataset`` with id 'emnist_letters_train'.  The loader
    yields a ``vipy.image.ImageCategory`` whose category is the lowercase
    letter for the label (1 -> 'a', ..., 26 -> 'z').
    """
    images_gz = os.path.join(self._datadir, 'gzip/emnist-letters-train-images-idx3-ubyte.gz')
    labels_gz = os.path.join(self._datadir, 'gzip/emnist-letters-train-labels-idx1-ubyte.gz')
    # label strings '1'..'26' -> 'a'..'z'
    letter_for_label = {}
    for position, letter in enumerate(string.ascii_lowercase, start=1):
        letter_for_label[str(position)] = letter
    pairs = self._dataset(images_gz, labels_gz, N=124800)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=letter_for_label[str(pair[1])], colorspace='lum'),
        id='emnist_letters_train',
    )
def testset(self)
-
Expand source code Browse git
def testset(self):
    """Return the union of the letters and digits test splits (id 'emnist_test').

    The previous implementation took element ``[0]`` of ``letters()`` and
    ``digits()``, which are the training splits, so the "test" set duplicated
    the training data.  The test accessors are now called directly.
    """
    return vipy.dataset.Union(self.letters_test(), self.digits_test(), id='emnist_test')
def trainset(self)
-
Expand source code Browse git
def trainset(self):
    """Return the union of the letters and digits training splits (id 'emnist').

    The training accessors are called directly instead of going through
    ``letters()``/``digits()``, which also decoded the test archives only to
    discard them.
    """
    return vipy.dataset.Union(self.letters_train(), self.digits_train(), id='emnist')
class MNIST (datadir=None, redownload=False)
-
Expand source code Browse git
class MNIST():
    """Reader for the MNIST handwritten-digit archives (gzip-compressed idx-ubyte files).

    Construction is currently disabled: ``__init__`` raises
    ``ValueError('moved to huggingface')`` before doing any work.  The static
    readers (``_labels``, ``_imread``, ``_dataset``) do not touch instance
    state and remain usable on their own.
    """

    def __init__(self, datadir=None, redownload=False):
        """Formerly downloaded the four MNIST archives into ``datadir``.

        Args:
            datadir: target directory, or None to use ``tocache('mnist')``.
            redownload: if True, fetch the archives even when the '.complete' marker exists.

        Raises:
            ValueError: always; this loader has been retired.
        """
        raise ValueError('moved to huggingface')

        # NOTE(review): everything below is unreachable because of the raise above.
        # It is retained unchanged as a record of the previous download procedure.
        outdir = tocache('mnist') if datadir is None else datadir
        self._datadir = remkdir(os.path.expanduser(outdir))
        if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')):
            vipy.downloader.download(TRAIN_IMG_URL, os.path.join(self._datadir, filetail(TRAIN_IMG_URL)), sha1=TRAIN_IMG_SHA1)
            vipy.downloader.download(TRAIN_LBL_URL, os.path.join(self._datadir, filetail(TRAIN_LBL_URL)), sha1=TRAIN_LBL_SHA1)
            vipy.downloader.download(TEST_IMG_URL, os.path.join(self._datadir, filetail(TEST_IMG_URL)), sha1=TEST_IMG_SHA1)
            vipy.downloader.download(TEST_LBL_URL, os.path.join(self._datadir, filetail(TEST_LBL_URL)), sha1=TEST_LBL_SHA1)
            open(os.path.join(self._datadir, '.complete'), 'a').close()

    @staticmethod
    def _labels(gzfile):
        """Read every label from a gzip-compressed idx1-ubyte label file.

        Args:
            gzfile: path to the label archive.

        Returns:
            ``array('B')`` holding all bytes that follow the 8-byte header.

        Raises:
            ValueError: if the header magic number is not 2049.
        """
        with gzip.open(gzfile, 'rb') as file:
            # Header: two big-endian unsigned 32-bit integers (magic, item count).
            magic, _count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got %d' % magic)
            return array("B", file.read())

    @staticmethod
    def _imread(dataset, index):
        """Read MNIST encoded images, adapted from: https://github.com/sorki/python-mnist/blob/master/mnist/loader.py

        Args:
            dataset: path to a gzip-compressed idx3-ubyte image file.
                NOTE(review): the previous body discarded this argument and opened
                ``None``, so it could never succeed; it is now used as the archive path.
            index: zero-based index of the image to decode.

        Returns:
            numpy array of shape (rows, cols) with the pixel values of image ``index``.

        Raises:
            ValueError: if the header magic number is not 2051.
            IndexError: if ``index`` is outside the image count stored in the header.
        """
        with gzip.open(dataset, 'rb') as file:
            # Header: four big-endian unsigned 32-bit integers (magic, image count, rows, cols).
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got %d' % magic)
            if not 0 <= index < size:
                raise IndexError('image index %d out of range for %d images' % (index, size))
            # Images are stored back to back after the 16-byte header, one byte per pixel.
            file.seek(index * rows * cols + 16)
            image = np.asarray(array("B", file.read(rows * cols)).tolist())
        return np.reshape(image, (rows, cols))

    @staticmethod
    def _dataset(img_gzfile, label_gzfile, N):
        """Decode the first ``N`` images and pair them with their labels.

        Args:
            img_gzfile: path to a gzip-compressed idx3-ubyte image file.
            label_gzfile: path to the matching gzip-compressed idx1-ubyte label file.
            N: number of images to read from the start of the image file.

        Returns:
            Tuple of ``(image, label)`` pairs, where ``image`` is a uint8 numpy array of
            shape (rows, cols) and ``label`` is an int.  ``zip`` truncates to the shorter
            of the decoded images and the label list.

        Raises:
            ValueError: if either magic number is wrong, or if the image file holds
                fewer than ``N`` complete images (the reshape of a short read fails).
        """
        labels = MNIST._labels(label_gzfile).tolist()
        with gzip.open(img_gzfile, 'rb') as gzfile:
            magic, _count, rows, cols = struct.unpack(">IIII", gzfile.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got %d' % magic)
            nbytes = rows * cols
            # frombuffer avoids building a Python list per image; copy() makes each array
            # writable and independent of the bytes object that was read.
            images = [np.frombuffer(gzfile.read(nbytes), dtype=np.uint8).reshape((rows, cols)).copy()
                      for _ in range(N)]
        return tuple(zip(images, labels))

    def trainset(self):
        """Return the 60000-example training split as a ``vipy.dataset.Dataset`` with id 'mnist'.

        The dataset loader turns each ``(image, label)`` pair into a
        ``vipy.image.ImageCategory`` with category ``str(label)`` and colorspace 'lum'.
        """
        (labelfile, imgfile, N) = (os.path.join(self._datadir, 'train-labels-idx1-ubyte.gz'),
                                   os.path.join(self._datadir, 'train-images-idx3-ubyte.gz'),
                                   60000)
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N),
                                    loader=lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum'),
                                    id='mnist')

    def testset(self):
        """Return the 10000-example test split as a ``vipy.dataset.Dataset`` with id 'mnist_test'.

        The dataset loader turns each ``(image, label)`` pair into a
        ``vipy.image.ImageCategory`` with category ``str(label)`` and colorspace 'lum'.
        """
        (labelfile, imgfile, N) = (os.path.join(self._datadir, 't10k-labels-idx1-ubyte.gz'),
                                   os.path.join(self._datadir, 't10k-images-idx3-ubyte.gz'),
                                   10000)
        return vipy.dataset.Dataset(self._dataset(imgfile, labelfile, N=N),
                                    loader=lambda z: vipy.image.ImageCategory(array=z[0], category=str(z[1]), colorspace='lum'),
                                    id='mnist_test')
Subclasses
Methods
def testset(self)
-
Expand source code Browse git
def testset(self):
    """Build the MNIST test split.

    Decodes the first 10000 examples of the 't10k' image/label archives in
    the data directory and wraps the ``(image, label)`` pairs in a
    ``vipy.dataset.Dataset`` with id 'mnist_test'.  The loader yields a
    ``vipy.image.ImageCategory`` whose category is ``str(label)``.
    """
    labels_gz = os.path.join(self._datadir, 't10k-labels-idx1-ubyte.gz')
    images_gz = os.path.join(self._datadir, 't10k-images-idx3-ubyte.gz')
    num_examples = 10000
    pairs = self._dataset(images_gz, labels_gz, N=num_examples)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=str(pair[1]), colorspace='lum'),
        id='mnist_test',
    )
def trainset(self)
-
Expand source code Browse git
def trainset(self):
    """Build the MNIST training split.

    Decodes the first 60000 examples of the 'train' image/label archives in
    the data directory and wraps the ``(image, label)`` pairs in a
    ``vipy.dataset.Dataset`` with id 'mnist'.  The loader yields a
    ``vipy.image.ImageCategory`` whose category is ``str(label)``.
    """
    labels_gz = os.path.join(self._datadir, 'train-labels-idx1-ubyte.gz')
    images_gz = os.path.join(self._datadir, 'train-images-idx3-ubyte.gz')
    num_examples = 60000
    pairs = self._dataset(images_gz, labels_gz, N=num_examples)
    return vipy.dataset.Dataset(
        pairs,
        loader=lambda pair: vipy.image.ImageCategory(array=pair[0], category=str(pair[1]), colorspace='lum'),
        id='mnist',
    )