Module vipy.data.facescrub

Expand source code Browse git
import os
from vipy.util import readcsv, remkdir
from vipy.image import ImageDetection
import numpy as np


class FaceScrub(object):
    """Interface to the FaceScrub face dataset.

    Expects `facescrub_actors.txt` and `facescrub_actresses.txt` in `datadir`,
    each a tab-separated file with a header row and columns
    (name, image_id, face_id, url, bbox, sha256), where bbox is
    'xmin,ymin,xmax,ymax'.
    """

    def __init__(self, datadir):
        # Root directory containing the facescrub metadata files
        self._datadir = datadir
        self._dataset = []  # cache of parsed and validated ImageDetections

    def __repr__(self):
        return str('<vipy.data.facescrub: %s>' % self._datadir)

    def __len__(self):
        # Number of validated images; zero until validate() has been run
        return len(self._dataset)

    def _parse_csv(self, csvfile, imdir, gender):
        """Parse one metadata file into a list of ImageDetections, tagging each with the given GENDER attribute."""
        imset = []
        for (subjectname, imageid, faceid, url, bbox, sha256) in readcsv(csvfile, separator='\t')[1:]:
            categoryname = subjectname.replace(' ', '_')
            (xmin, ymin, xmax, ymax) = bbox.split(',')
            imset.append(ImageDetection(url=url,
                                        filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)),
                                        category=categoryname,
                                        xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                                        attributes={'GENDER': gender}))
        return imset

    def parse(self):
        """ Return a list of ImageDetections for all URLs in facescrub """
        imdir = remkdir(os.path.join(self._datadir, 'images'))
        # Actors are tagged male, actresses female; both files share one parser
        return (self._parse_csv(os.path.join(self._datadir, 'facescrub_actors.txt'), imdir, 'male')
                + self._parse_csv(os.path.join(self._datadir, 'facescrub_actresses.txt'), imdir, 'female'))

    def download(self):
        """ Download every URL in dataset and store in provided filename """
        return [d.download(ignoreErrors=True) for d in self.parse()]

    def validate(self):
        """Validate downloaded dataset and store cached list of valid bounding boxes and loadable images accessible with dataset()"""
        P = self.parse()
        D = []
        for (k, p) in enumerate(P):
            if k % 1000 == 0:
                print('[vipy.data.facescrub][%d/%d]: validating dataset... (successful download?, good bounding box?, loadable image?)' % (k, len(P)))
            # Keep only entries with a valid bounding box and an already-downloaded, loadable image
            if not p.invalid() and p.load(ignoreErrors=True, fetch=False) is not None:
                D.append(p.flush())  # flush pixel buffer so the cache stays small
        self._dataset = D  # cache
        return self

    def dataset(self):
        """Return the validated dataset, lazily running validate() on first access.

        FIX: the original recursed via self.validate().dataset(), which recurses
        forever when validation yields an empty dataset.
        """
        if len(self) == 0:
            self.validate()
        return self._dataset

    def stats(self):
        """Print the percentage of parsed URLs that were successfully downloaded and validated."""
        # FIX: the original printed the inverted ratio (parsed/validated) and never scaled to percent
        print('[vipy.data.facescrub]: %f percent downloaded' % (100.0 * float(len(self.dataset())) / float(max(1, len(self.parse())))))

    def subjects(self):
        """Return the unique subject (category) names in the validated dataset."""
        return list(set([im.category() for im in self.dataset()]))

    def split(self, valsize=128):
        """Split the validated dataset into (trainset, valset).

        Two images per subject (for subjects with more than three images) are
        held out, then up to valsize of these are sampled without replacement.
        """
        D = self.dataset()
        subjects = list(set([im.category() for im in D]))
        (trainset, valset) = ([], [])
        for s in subjects:
            S = [d for d in D if d.category() == s]
            if len(S) > 3:
                valset = valset + S[0:2]   # two examples per subject
                trainset = trainset + S[2:]  # rest
            else:
                trainset = trainset + S
        # FIX: sample without replacement (the original could select duplicate
        # validation images) and clamp valsize so np.random.choice cannot raise
        valset = list(np.random.choice(valset, min(valsize, len(valset)), replace=False))
        return (trainset, valset)

Classes

class FaceScrub (datadir)
Expand source code Browse git
class FaceScrub(object):
    """Interface to the FaceScrub face dataset.

    Expects `facescrub_actors.txt` and `facescrub_actresses.txt` in `datadir`,
    each a tab-separated file with a header row and columns
    (name, image_id, face_id, url, bbox, sha256), where bbox is
    'xmin,ymin,xmax,ymax'.
    """

    def __init__(self, datadir):
        # Root directory containing the facescrub metadata files
        self._datadir = datadir
        self._dataset = []  # cache of parsed and validated ImageDetections

    def __repr__(self):
        return str('<vipy.data.facescrub: %s>' % self._datadir)

    def __len__(self):
        # Number of validated images; zero until validate() has been run
        return len(self._dataset)

    def _parse_csv(self, csvfile, imdir, gender):
        """Parse one metadata file into a list of ImageDetections, tagging each with the given GENDER attribute."""
        imset = []
        for (subjectname, imageid, faceid, url, bbox, sha256) in readcsv(csvfile, separator='\t')[1:]:
            categoryname = subjectname.replace(' ', '_')
            (xmin, ymin, xmax, ymax) = bbox.split(',')
            imset.append(ImageDetection(url=url,
                                        filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)),
                                        category=categoryname,
                                        xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                                        attributes={'GENDER': gender}))
        return imset

    def parse(self):
        """ Return a list of ImageDetections for all URLs in facescrub """
        imdir = remkdir(os.path.join(self._datadir, 'images'))
        # Actors are tagged male, actresses female; both files share one parser
        return (self._parse_csv(os.path.join(self._datadir, 'facescrub_actors.txt'), imdir, 'male')
                + self._parse_csv(os.path.join(self._datadir, 'facescrub_actresses.txt'), imdir, 'female'))

    def download(self):
        """ Download every URL in dataset and store in provided filename """
        return [d.download(ignoreErrors=True) for d in self.parse()]

    def validate(self):
        """Validate downloaded dataset and store cached list of valid bounding boxes and loadable images accessible with dataset()"""
        P = self.parse()
        D = []
        for (k, p) in enumerate(P):
            if k % 1000 == 0:
                print('[vipy.data.facescrub][%d/%d]: validating dataset... (successful download?, good bounding box?, loadable image?)' % (k, len(P)))
            # Keep only entries with a valid bounding box and an already-downloaded, loadable image
            if not p.invalid() and p.load(ignoreErrors=True, fetch=False) is not None:
                D.append(p.flush())  # flush pixel buffer so the cache stays small
        self._dataset = D  # cache
        return self

    def dataset(self):
        """Return the validated dataset, lazily running validate() on first access.

        FIX: the original recursed via self.validate().dataset(), which recurses
        forever when validation yields an empty dataset.
        """
        if len(self) == 0:
            self.validate()
        return self._dataset

    def stats(self):
        """Print the percentage of parsed URLs that were successfully downloaded and validated."""
        # FIX: the original printed the inverted ratio (parsed/validated) and never scaled to percent
        print('[vipy.data.facescrub]: %f percent downloaded' % (100.0 * float(len(self.dataset())) / float(max(1, len(self.parse())))))

    def subjects(self):
        """Return the unique subject (category) names in the validated dataset."""
        return list(set([im.category() for im in self.dataset()]))

    def split(self, valsize=128):
        """Split the validated dataset into (trainset, valset).

        Two images per subject (for subjects with more than three images) are
        held out, then up to valsize of these are sampled without replacement.
        """
        D = self.dataset()
        subjects = list(set([im.category() for im in D]))
        (trainset, valset) = ([], [])
        for s in subjects:
            S = [d for d in D if d.category() == s]
            if len(S) > 3:
                valset = valset + S[0:2]   # two examples per subject
                trainset = trainset + S[2:]  # rest
            else:
                trainset = trainset + S
        # FIX: sample without replacement (the original could select duplicate
        # validation images) and clamp valsize so np.random.choice cannot raise
        valset = list(np.random.choice(valset, min(valsize, len(valset)), replace=False))
        return (trainset, valset)

Methods

def dataset(self)
Expand source code Browse git
def dataset(self):
    """Return the cached validated dataset, lazily running validate() if the cache is empty.

    FIX: the original returned `self.validate().dataset()` when empty, which
    recurses without bound whenever validation produces an empty dataset.
    """
    if len(self) == 0:
        self.validate()
    return self._dataset
def download(self)

Download every URL in dataset and store in provided filename

Expand source code Browse git
def download(self):
    """ Download every URL in dataset and store in provided filename """
    downloaded = []
    for im in self.parse():
        # Best-effort fetch: failed downloads are tolerated, not raised
        downloaded.append(im.download(ignoreErrors=True))
    return downloaded
def parse(self)

Return a list of ImageDetections for all URLs in facescrub

Expand source code Browse git
def parse(self):
    """ Return a list of ImageDetections for all URLs in facescrub.

    Reads `facescrub_actors.txt` (GENDER male) and `facescrub_actresses.txt`
    (GENDER female), both tab-separated with a header row and columns
    (name, image_id, face_id, url, bbox, sha256), bbox = 'xmin,ymin,xmax,ymax'.
    """
    imdir = remkdir(os.path.join(self._datadir, 'images'))

    def rows_to_detections(csvfile, gender):
        # One ImageDetection per CSV row, skipping the header row
        imset = []
        for (subjectname, imageid, faceid, url, bbox, sha256) in readcsv(csvfile, separator='\t')[1:]:
            categoryname = subjectname.replace(' ', '_')
            (xmin, ymin, xmax, ymax) = bbox.split(',')
            imset.append(ImageDetection(url=url,
                                        filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)),
                                        category=categoryname,
                                        xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                                        attributes={'GENDER': gender}))
        return imset

    # FIX: the original duplicated the parsing loop verbatim for each file
    return (rows_to_detections(os.path.join(self._datadir, 'facescrub_actors.txt'), 'male')
            + rows_to_detections(os.path.join(self._datadir, 'facescrub_actresses.txt'), 'female'))
def split(self, valsize=128)
Expand source code Browse git
def split(self, valsize=128):
    """Split the validated dataset into (trainset, valset).

    For every subject with more than three images, the first two are held out
    for validation and the rest go to training; smaller subjects go entirely
    to training. Up to valsize held-out images are then sampled.
    """
    D = self.dataset()
    subjects = list(set([im.category() for im in D]))
    (trainset, valset) = ([], [])
    for s in subjects:
        S = [d for d in D if d.category() == s]
        if len(S) > 3:
            valset = valset + S[0:2]   # two examples per subject
            trainset = trainset + S[2:]  # rest
        else:
            trainset = trainset + S
    # FIX: sample without replacement (the original default replace=True could
    # put duplicate images in the validation set) and clamp valsize so
    # np.random.choice cannot raise when fewer than valsize images are held out
    valset = list(np.random.choice(valset, min(valsize, len(valset)), replace=False))
    return (trainset, valset)
def stats(self)
Expand source code Browse git
def stats(self):
    """Print the percentage of parsed URLs that were successfully downloaded and validated."""
    # FIX: the original computed the inverted ratio (parsed/validated) and did
    # not multiply by 100 despite printing "percent"; also guard divide-by-zero
    print('[vipy.data.facescrub]: %f percent downloaded' % (100.0 * float(len(self.dataset())) / float(max(1, len(self.parse())))))
def subjects(self)
Expand source code Browse git
def subjects(self):
    """Return the unique subject (category) names in the validated dataset."""
    seen = set()
    for im in self.dataset():
        seen.add(im.category())
    return list(seen)
def validate(self)

Validate downloaded dataset and store cached list of valid bounding boxes and loadable images accessible with dataset()

Expand source code Browse git
def validate(self):
    """Validate downloaded dataset and store cached list of valid bounding boxes and loadable images accessible with dataset()"""
    parsed = self.parse()
    total = len(parsed)
    valid = []
    for (idx, im) in enumerate(parsed):
        # Progress report every 1000 entries
        if idx % 1000 == 0:
            print('[vipy.data.facescrub][%d/%d]: validating dataset... (successful download?, good bounding box?, loadable image?)' % (idx, total))
        # Keep entries whose bounding box is valid and whose image file loads
        # (fetch=False: never downloads, only checks what is already on disk)
        if not im.invalid() and im.load(ignoreErrors=True, fetch=False) is not None:
            valid.append(im.flush())  # drop pixel buffer before caching
    self._dataset = valid  # cache
    return self