Module vipy.data.lfw
Expand source code Browse git
import os

import numpy as np

import vipy.dataset
import vipy.downloader
from vipy.image import ImageCategory
from vipy.util import remkdir, filetail, dirlist, imlist, readcsv, tocache
URL = 'http://vis-www.cs.umass.edu/lfw/lfw.tgz'
URL_NAMES = 'http://vis-www.cs.umass.edu/lfw/lfw-names.txt'
URL_PAIRS_DEV_TRAIN = 'http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt'
URL_PAIRS_DEV_TEST = 'http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt'
URL_PAIRS_VIEW2 = 'http://vis-www.cs.umass.edu/lfw/pairs.txt'
class LFW(vipy.dataset.Dataset):
def __init__(self, datadir=None, redownload=False):
"""Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw"""
datadir = tocache('lfw') if datadir is None else datadir
self._datadir = remkdir(os.path.expanduser(datadir))
if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')):
self._download()
loader = lambda x: ImageCategory(category=x[0], filename=x[1])
super().__init__([(s, f) for s in self.subjects() for f in imlist(os.path.join(self._datadir, 'lfw', s))], id='lfw', loader=loader)
open(os.path.join(self._datadir, '.complete'), 'a').close()
def _download(self, verbose=True):
# Link may be dead ...
vipy.downloader.download_and_unpack(URL, self._datadir, verbose=verbose)
return self
def subjects(self):
"""List of all subject names"""
return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))]
def subject_images(self, subject):
"""List of Images of a subject"""
fnames = imlist(os.path.join(self._datadir, 'lfw', subject))
return [ImageCategory(category=subject, filename=f) for f in fnames]
def _parse_pairs(self, txtfile):
pairs = []
for x in readcsv(os.path.join(self._datadir, 'lfw', txtfile), separator='\t'):
if len(x) == 3:
pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))),
ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[2]))))))
elif len(x) == 4:
pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))),
ImageCategory(category=x[2], filename=os.path.join(self._datadir, 'lfw', x[2], '%s_%04d.jpg' % (x[2], int(x[3]))))))
else:
pass
return pairs
def _pairsDevTest(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTest.txt')):
raise ValueError("Download and save text file to $datadir/pairsDevTest.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw' 'pairsDevTest.txt')))
return self._parse_pairs('pairsDevTest.txt')
def _pairsDevTrain(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')):
raise ValueError("Download and save text file to $datadir/pairsDevTrain.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')))
return self._parse_pairs('pairsDevTrain.txt')
def _pairs(self):
if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairs.txt')):
raise ValueError("Download and save text file to $datadir/pairs.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairs.txt')))
return self._parse_pairs('pairs.txt')
Classes
class LFW (datadir=None, redownload=False)
-
vipy.dataset.Dataset() class
Common class to manipulate large sets of objects in parallel
Args
- dataset [list, tuple, set, obj]: a python built-in type that supports indexing or a generic object that supports indexing and has a length
- id [str]: an optional id of this dataset, which provides a descriptive name of the dataset
- loader [callable]: a callable loader that will construct the object from a raw data element in dataset. This is useful for custom deserialization or on-demand transformations. Datasets can be indexed, shuffled, iterated, minibatched, sorted, sampled, partitioned. Datasets constructed of vipy objects are lazy loaded, delaying loading pixels until they are needed
(trainset, valset, testset) = vipy.dataset.registry('mnist') (trainset, valset) = trainset.partition(0.9, 0.1) categories = trainset.set(lambda im: im.category()) smaller = testset.take(1024) preprocessed = smaller.map(lambda im: im.resize(32, 32).gain(1/256)) for b in preprocessed.minibatch(128): print(b) # visualize the dataset (trainset, valset, testset) = vipy.dataset.registry('pascal_voc_2007') for im in trainset: im.mindim(1024).show().print(sleep=1).close()
Datasets can be constructed from directories of json files or image files (
Dataset.from_directory()
) Datasets can be constructed from a single json file containing a list of objects (Dataset.from_json()
). Note: if a lambda function is provided as the loader then this dataset is not serializable. Use self.load() first, then serialize.
Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw
Expand source code Browse git
class LFW(vipy.dataset.Dataset): def __init__(self, datadir=None, redownload=False): """Datadir contains the unpacked contents of LFW from $URL -> /path/to/lfw""" datadir = tocache('lfw') if datadir is None else datadir self._datadir = remkdir(os.path.expanduser(datadir)) if redownload or not os.path.exists(os.path.join(self._datadir, '.complete')): self._download() loader = lambda x: ImageCategory(category=x[0], filename=x[1]) super().__init__([(s, f) for s in self.subjects() for f in imlist(os.path.join(self._datadir, 'lfw', s))], id='lfw', loader=loader) open(os.path.join(self._datadir, '.complete'), 'a').close() def _download(self, verbose=True): # Link may be dead ... vipy.downloader.download_and_unpack(URL, self._datadir, verbose=verbose) return self def subjects(self): """List of all subject names""" return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))] def subject_images(self, subject): """List of Images of a subject""" fnames = imlist(os.path.join(self._datadir, 'lfw', subject)) return [ImageCategory(category=subject, filename=f) for f in fnames] def _parse_pairs(self, txtfile): pairs = [] for x in readcsv(os.path.join(self._datadir, 'lfw', txtfile), separator='\t'): if len(x) == 3: pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))), ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[2])))))) elif len(x) == 4: pairs.append((ImageCategory(category=x[0], filename=os.path.join(self._datadir, 'lfw', x[0], '%s_%04d.jpg' % (x[0], int(x[1])))), ImageCategory(category=x[2], filename=os.path.join(self._datadir, 'lfw', x[2], '%s_%04d.jpg' % (x[2], int(x[3])))))) else: pass return pairs def _pairsDevTest(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTest.txt')): raise ValueError("Download and save text file to $datadir/pairsDevTest.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, 
os.path.join(self._datadir, 'lfw' 'pairsDevTest.txt'))) return self._parse_pairs('pairsDevTest.txt') def _pairsDevTrain(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt')): raise ValueError("Download and save text file to $datadir/pairsDevTrain.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairsDevTrain.txt'))) return self._parse_pairs('pairsDevTrain.txt') def _pairs(self): if not os.path.isfile(os.path.join(self._datadir, 'lfw', 'pairs.txt')): raise ValueError("Download and save text file to $datadir/pairs.txt with 'wget %s -O %s'" % (URL_PAIRS_DEV_TRAIN, os.path.join(self._datadir, 'lfw', 'pairs.txt'))) return self._parse_pairs('pairs.txt')
Ancestors
Methods
def subject_images(self, subject)
-
List of Images of a subject
Expand source code Browse git
def subject_images(self, subject): """List of Images of a subject""" fnames = imlist(os.path.join(self._datadir, 'lfw', subject)) return [ImageCategory(category=subject, filename=f) for f in fnames]
def subjects(self)
-
List of all subject names
Expand source code Browse git
def subjects(self): """List of all subject names""" return [filetail(d) for d in dirlist(os.path.join(self._datadir, 'lfw'))]
Inherited members
Dataset
:balanced
batch
chunk
chunks
clone
count
even_split
filter
frequency
from_directory
from_image_urls
groupby
id
identity_shuffler
index
inverse_frequency
list
load
localmap
map
minibatch
partition
pipeline
raw
repeat
sample
set
shift
shuffle
slice
sort
split
streaming_map
streaming_shuffler
take
take_fraction
takeby
takelist
takeone
truncate
tuple
uniform_shuffler
zip