Module vipy.data.lfw
Expand source code Browse git
import os

import numpy as np

import vipy.dataset
import vipy.downloader
from vipy.image import ImageCategory
from vipy.util import remkdir, filetail, dirlist, imlist, readcsv, tocache
URL = 'http://vis-www.cs.umass.edu/lfw/lfw.tgz'
URL_NAMES = 'http://vis-www.cs.umass.edu/lfw/lfw-names.txt'
URL_PAIRS_DEV_TRAIN = 'http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt'
URL_PAIRS_DEV_TEST = 'http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt'
URL_PAIRS_VIEW2 = 'http://vis-www.cs.umass.edu/lfw/pairs.txt'
class LFW(vipy.dataset.Dataset):
def __init__(self, datadir=None, redownload=False):
"""Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw"""
datadir = tocache('lfw') if datadir is None else datadir
self._datadir = remkdir(os.path.expanduser(datadir))
if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')):
self._download()
loader = lambda x: ImageCategory(category=x[0], filename=x[1])
super().__init__([(s, f) for s in self.subjects() for f in imlist(os.path.join(self._datadir, 'lfw', s))], id='lfw', loader=loader)
open(os.path.join(self._datadir, '.complete'), 'a').close()
def _download(self, verbose=True):
# Link may be dead ...
vipy.downloader.download_and_unpack(URL, self._datadir, verbose=verbose)
return self
def subjects(self):
"""List of all subject names"""
return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))]
def subject_images(self, subject):
"""List of Images of a subject"""
fnames = imlist(os.path.join(self._datadir, 'lfw', subject))
return [ImageCategory(category=subject, filename=f) for f in fnames]
def _parse_pairs(self, txtfile):
pairs = []
for x in readcsv(os.path.join(self._datadir, 'lfw', txtfile), separator='\t'):
if len(x) == 3:
pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))),
ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[2]))))))
elif len(x) == 4:
pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))),
ImageCategory(category=x[2], filename=os.path.join(self._datadir, 'lfw', x[2], '%s_%04d.jpg' % (x[2], int(x[3]))))))
else:
pass
return pairs
def _pairsDevTest(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTest.txt')):
raise ValueError("Download and save text file to $datadir/pairsDevTest.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw' 'pairsDevTest.txt')))
return self._parse_pairs('pairsDevTest.txt')
def _pairsDevTrain(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')):
raise ValueError("Download and save text file to $datadir/pairsDevTrain.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')))
return self._parse_pairs('pairsDevTrain.txt')
def _pairs(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairs.txt')):
raise ValueError("Download and save text file to $datadir/pairs.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairs.txt')))
return self._parse_pairs('pairs.txt')
Classes
class LFW (datadir=None, redownload=False)
-
vipy.dataset.Dataset() class
Common class to manipulate large sets of objects in parallel
Args
- dataset [list, tuple, set, obj]: a python built-in type that supports indexing or a generic object that supports indexing and has a length
- id [str]: an optional id of this dataset, which provides a descriptive name of the dataset
- loader [callable]: a callable loader that will construct the object from a raw data element in dataset. This is useful for custom deserialization or on-demand transformations. Datasets can be indexed, shuffled, iterated, minibatched, sorted, sampled, partitioned. Datasets constructed of vipy objects are lazy loaded, delaying loading pixels until they are needed
(trainset, valset, testset) = vipy.dataset.registry('mnist') (trainset, valset) = trainset.partition(0.9, 0.1) categories = trainset.set(lambda im: im.category()) smaller = testset.take(1024) preprocessed = smaller.map(lambda im: im.resize(32, 32).gain(1/256)) for b in preprocessed.minibatch(128): print(b) # visualize the dataset (trainset, valset, testset) = vipy.dataset.registry('pascal_voc_2007') for im in trainset: im.mindim(1024).show().print(sleep=1).close()
Datasets can be constructed from directories of json files or image files (
Dataset.from_directory()
) Datasets can be constructed from a single json file containing a list of objects (Dataset.from_json()
). Note: if a lambda function is provided as the loader then this dataset is not serializable. Use self.load() first, then serialize.
Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw
Expand source code Browse git
class LFW(vipy.dataset.Dataset): def __init__(self, datadir=None, redownload=False): """Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw""" datadir = tocache('lfw') if datadir is None else datadir self._datadir = remkdir(os.path.expanduser(datadir)) if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')): self._download() loader = lambda x: ImageCategory(category=x[0], filename=x[1]) super().__init__([(s, f) for s in self.subjects() for f in imlist(os.path.join(self._datadir, 'lfw', s))], id='lfw', loader=loader) open(os.path.join(self._datadir, '.complete'), 'a').close() def _download(self, verbose=True): # Link may be dead ... vipy.downloader.download_and_unpack(URL, self._datadir, verbose=verbose) return self def subjects(self): """List of all subject names""" return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))] def subject_images(self, subject): """List of Images of a subject""" fnames = imlist(os.path.join(self._datadir, 'lfw', subject)) return [ImageCategory(category=subject, filename=f) for f in fnames] def _parse_pairs(self, txtfile): pairs = [] for x in readcsv(os.path.join(self._datadir, 'lfw', txtfile), separator='\t'): if len(x) == 3: pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))), ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[2])))))) elif len(x) == 4: pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))), ImageCategory(category=x[2], filename=os.path.join(self._datadir, 'lfw', x[2], '%s_%04d.jpg' % (x[2], int(x[3])))))) else: pass return pairs def _pairsDevTest(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTest.txt')): raise ValueError("Download and save text file to $datadir/pairsDevTest.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, 
os.path.join(self._datadir, 'lfw' 'pairsDevTest.txt'))) return self._parse_pairs('pairsDevTest.txt') def _pairsDevTrain(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')): raise ValueError("Download and save text file to $datadir/pairsDevTrain.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt'))) return self._parse_pairs('pairsDevTrain.txt') def _pairs(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairs.txt')): raise ValueError("Download and save text file to $datadir/pairs.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairs.txt'))) return self._parse_pairs('pairs.txt')
Ancestors
Methods
def subject_images(self, subject)
-
List of Images of a subject
Expand source code Browse git
def subject_images(self, subject): """List of Images of a subject""" fnames = imlist(os.path.join(self._datadir, 'lfw', subject)) return [ImageCategory(category=subject, filename=f) for f in fnames]
def subjects(self)
-
List of all subject names
Expand source code Browse git
def subjects(self): """List of all subject names""" return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))]
Inherited members
Dataset
:balanced
batch
chunk
chunks
clone
count
even_split
filter
frequency
from_directory
from_image_urls
groupby
id
identity_shuffler
index
inverse_frequency
list
load
localmap
map
minibatch
partition
pipeline
raw
repeat
sample
set
shift
shuffle
slice
sort
split
streaming_map
streaming_shuffler
take
take_fraction
takeby
takelist
takeone
truncate
tuple
uniform_shuffler
zip