Module vipy.data.facescrub
import os
from vipy.util import readcsv, remkdir
from vipy.image import ImageDetection
import numpy as np


class FaceScrub(object):
    def __init__(self, datadir):
        self._datadir = datadir
        self._dataset = []  # parsed and validated ImageDetections

    def __repr__(self):
        return str('<vipy.data.facescrub: %s>' % self._datadir)

    def __len__(self):
        return len(self._dataset)

    def parse(self):
        """Return a list of ImageDetections for all URLs in FaceScrub"""
        imset = []
        imdir = remkdir(os.path.join(self._datadir, 'images'))
        csv_actors = readcsv(os.path.join(self._datadir, 'facescrub_actors.txt'), separator='\t')
        for (subjectname, imageid, faceid, url, bbox, sha256) in csv_actors[1:]:
            categoryname = subjectname.replace(' ', '_')
            (xmin, ymin, xmax, ymax) = [int(x) for x in bbox.split(',')]  # bounding box is stored as comma-separated integer coordinates
            imset.append(ImageDetection(url=url, filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)),
                                        category=categoryname, xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                                        attributes={'GENDER': 'male'}))
        csv_actresses = readcsv(os.path.join(self._datadir, 'facescrub_actresses.txt'), separator='\t')
        for (subjectname, imageid, faceid, url, bbox, sha256) in csv_actresses[1:]:
            categoryname = subjectname.replace(' ', '_')
            (xmin, ymin, xmax, ymax) = [int(x) for x in bbox.split(',')]  # bounding box is stored as comma-separated integer coordinates
            imset.append(ImageDetection(url=url, filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)),
                                        category=categoryname, xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                                        attributes={'GENDER': 'female'}))
        return imset

    def download(self):
        """Download every URL in the dataset, saving each image to the filename constructed in parse()"""
        return [d.download(ignoreErrors=True) for d in self.parse()]

    def validate(self):
        """Validate the downloaded dataset and cache the list of images with valid bounding boxes that load successfully, accessible with dataset()"""
        P = self.parse()
        D = []
        for (k, p) in enumerate(P):
            if k % 1000 == 0:
                print('[vipy.data.facescrub][%d/%d]: validating dataset... (successful download?, good bounding box?, loadable image?)' % (k, len(P)))
            if not p.invalid() and p.load(ignoreErrors=True, fetch=False) is not None:
                D.append(p.flush())  # keep the detection, release the loaded pixel buffer
        self._dataset = D  # cache
        return self

    def dataset(self):
        return self._dataset if len(self) > 0 else self.validate().dataset()

    def stats(self):
        print('[vipy.data.facescrub]: %.1f percent downloaded' % (100.0 * len(self.dataset()) / len(self.parse())))

    def subjects(self):
        return list(set([im.category() for im in self.dataset()]))

    def split(self, valsize=128):
        D = self.dataset()
        subjects = list(set([im.category() for im in D]))
        (trainset, valset) = ([], [])
        for s in subjects:
            S = [d for d in D if d.category() == s]
            if len(S) > 3:
                valset = valset + S[0:2]     # two examples per subject
                trainset = trainset + S[2:]  # rest
            else:
                trainset = trainset + S
        valset = list(np.random.choice(valset, min(valsize, len(valset)), replace=False))  # subsample to at most valsize validation images, without replacement
        return (trainset, valset)
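
A minimal end-to-end sketch of the intended workflow; the data directory path below is a placeholder, and it must already contain the FaceScrub metadata files facescrub_actors.txt and facescrub_actresses.txt:

from vipy.data.facescrub import FaceScrub

fs = FaceScrub('/path/to/facescrub')        # directory with the two metadata .txt files
ims = fs.parse()                            # one vipy.image.ImageDetection per URL
fs.download()                               # fetch every URL into <datadir>/images, ignoring errors
fs.validate()                               # cache only loadable images with valid bounding boxes
(trainset, valset) = fs.split(valsize=128)  # per-subject train/val split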
Classes
class FaceScrub (datadir)
The FaceScrub dataset of actor and actress face images: parses the FaceScrub metadata files into ImageDetections, downloads the source URLs, validates the downloaded images, and produces a train/validation split.
Methods
def dataset(self)
Return the cached list of validated ImageDetections, running validate() first if the cache is empty.
def download(self)
Download every URL in the dataset, saving each image to the filename constructed in parse(). Download errors are ignored, so dead URLs are skipped.
def parse(self)
Return a list of ImageDetections for all URLs in FaceScrub, parsed from facescrub_actors.txt and facescrub_actresses.txt in the data directory.
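A quick inspection sketch of what parse() produces (the path is a placeholder; attribute access here assumes vipy's public attributes dictionary, which is set at construction in parse()):

ims = FaceScrub('/path/to/facescrub').parse()
im = ims[0]
print(im.category())            # subject name with spaces replaced by underscores
print(im.attributes['GENDER'])  # 'male' or 'female', as set in parse()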
def split(self, valsize=128)
Split the validated dataset into a training set and a validation set: for each subject with more than three images, two are held out for validation and the rest go to training; the validation set is then subsampled to at most valsize images.
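A short usage sketch; dataset() triggers validate() automatically if the cache is empty, so split() can be called directly after download():

(trainset, valset) = FaceScrub('/path/to/facescrub').split(valsize=128)
print(len(valset))    # at most 128 held-out validation images
print(len(trainset))  # remaining images for training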
def stats(self)
Print the percentage of parsed URLs that survived download and validation.
def subjects(self)
Return the list of unique subject names in the validated dataset.
def validate(self)
Validate the downloaded dataset and cache the list of images with valid bounding boxes that load successfully; the cached list is accessible with dataset().
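Validation is the expensive step and only needs to run once; the result is cached on the object and reused by dataset(), subjects(), stats() and split(). A minimal sketch (path is a placeholder):

fs = FaceScrub('/path/to/facescrub').validate()
print(len(fs))             # number of validated ImageDetections
print(len(fs.subjects()))  # number of unique subjects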