Module vipy.dataset

import os
import numpy as np
from vipy.util import findpkl, toextension, filepath, filebase, jsonlist, ishtml, ispkl, filetail, temphtml, env, cache
from vipy.util import listpkl, listext, templike, tempdir, remkdir, tolist, fileext, writelist, tempcsv, to_iterable
from vipy.util import newpathroot, listjson, extlist, filefull, tempdir, groupbyasdict, try_import, shufflelist, catcher
from vipy.util import is_email_address, findjson, findimages, isimageurl, countby, chunkgen, chunkgenbysize, dividelist, chunklist
from vipy.util import findvideos, truncate_string
from vipy.globals import log
import random
import shutil
import copy 
import gc 
import itertools
from pathlib import Path
import functools
import concurrent.futures as cf
import vipy.parallel


class Dataset():
    """vipy.dataset.Dataset() class
    
    Common class to manipulate large sets of objects in parallel

    Args:
        - dataset [list, tuple, set, obj]: a python built-in type that supports indexing or a generic object that supports indexing and has a length
        - id [str]: an optional id of this dataset, which provides a descriptive name of the dataset
        - loader [callable]: a callable loader that will construct the object from a raw data element in dataset.  This is useful for custom deserialization or on-demand transformations

    Datasets can be indexed, shuffled, iterated, minibatched, sorted, sampled, and partitioned.
    Datasets constructed of vipy objects are lazy loaded, delaying loading pixels until they are needed.

    ```python
    (trainset, valset, testset) = vipy.dataset.registry('mnist')

    (trainset, valset) = trainset.partition(0.9, 0.1)
    categories = trainset.set(lambda im: im.category())
    smaller = testset.take(1024)
    preprocessed = smaller.map(lambda im: im.resize(32, 32).gain(1/256))
    
    for b in preprocessed.minibatch(128):
        print(b)

    # visualize the dataset 
    (trainset, valset, testset) = vipy.dataset.registry('pascal_voc_2007')
    for im in trainset:
        im.mindim(1024).show().print(sleep=1).close()
    
    ```

    Datasets can be constructed from directories of json files or image files (`vipy.dataset.Dataset.from_directory`)
    Datasets can be constructed from a single json file containing a list of objects (`vipy.dataset.Dataset.from_json`)
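
    For example (a sketch; the paths are illustrative):

    ```python
    D = vipy.dataset.Dataset.from_directory('/path/to/jsondir', filetype='json')
    D = vipy.dataset.Dataset.from_json('/path/to/dataset.json')
    ```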
    
    ..note:: If a lambda function is provided as the loader then this dataset is not serializable.  Use self.load() then serialize
    """

    __slots__ = ('_id', '_ds', '_idx', '_loader', '_type')
    def __init__(self, dataset, id=None, loader=None):
        assert loader is None or callable(loader)
        
        self._id = id
        self._ds = dataset if not isinstance(dataset, Dataset) else dataset._ds
        self._idx = None if not isinstance(dataset, Dataset) else dataset._idx   # random access on-demand
        self._loader = loader if not isinstance(dataset, Dataset) else dataset._loader  # not serializable if lambda is provided

        try:
            self._type = str(type(self._loader(dataset[0]) if self._loader else dataset[0]))  # peek at first element, cached
        except:
            self._type = None


    @classmethod
    def from_directory(cls, indir, filetype='json', id=None):
        """Recursively search indir for filetype, construct a dataset from all discovered files of that type"""
        if filetype == 'json':
            return cls([x for f in findjson(indir) for x in to_iterable(vipy.load(f))], id=id)
        elif filetype.lower() in ['jpg','jpeg','images']:
            return cls([vipy.image.Image(filename=f) for f in findimages(indir)], id=id)            
        elif filetype.lower() in ['mp4','videos']:
            return cls([vipy.video.Video(filename=f) for f in findvideos(indir)], id=id)            
        else:
            raise ValueError('unsupported file type "%s"' % filetype)

    @classmethod
    def from_image_urls(cls, urls, id=None):
        """Construct a dataset from a list of image URLs"""
        return cls([vipy.image.Image(url=url) for url in to_iterable(urls) if isimageurl(url)], id=id)
        
    @classmethod
    def from_json(cls, jsonfile, id=None):
        return cls([x for x in to_iterable(vipy.load(jsonfile))], id=id)

    @classmethod
    def cast(cls, obj):
        return cls(obj) if not isinstance(obj, Dataset) else obj
    
    def __repr__(self):
        fields = ['id=%s' % truncate_string(self.id(), maxlen=80)] if self.id() else []
        fields += ['len=%d' % self.len()] if self.len() is not None else []
        fields += ['type=%s' % self._type] if self._type else []
        return str('<vipy.dataset.Dataset: %s>' % ', '.join(fields))

    def __iter__(self):            
        if self.is_streaming():
            for x in self._ds:  # iterable access (faster)
                yield self._loader(x) if self._loader is not None else x                 
        else:
            for k in range(len(self)):
                yield self[k]   # random access (slower)                


    def __getitem__(self, k):
        assert self.len() is not None, "dataset does not support indexing"
        
        idx = self.index()  # convert to random access on demand
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(idx), "invalid index"
            x = self._ds[idx[int(k)]]
            x = self._loader(x) if self._loader is not None else x
            return x
        elif isinstance(k, slice):
            X = [self._ds[k] for k in idx[k.start:k.stop:k.step]]
            X = [self._loader(x) for x in X] if self._loader is not None else X
            return X
        else:
            raise ValueError('invalid slice "%s"' % type(k))            

    def raw(self):
        """Return a view of this dataset without the loader"""
        return Dataset(self._ds, loader=None)
    
    def is_streaming(self):
        return self._idx is None

    def len(self):
        return len(self._idx) if self._idx is not None else (len(self._ds) if hasattr(self._ds, '__len__') else None)

    def __len__(self):
        len = self.len()
        if len is None:
            raise ValueError('dataset has no length')
        return len
    
    def __or__(self, other):
        assert isinstance(other, Dataset)
        return Union(self, other, id=self.id())
    
    def id(self, new_id=None):
        """Change the dataset ID to the provided ID, or return it if None"""
        if new_id is not None:
            self._id = new_id
            return self
        return self._id

    def index(self, index=None, strict=False):
        """Update the index, useful for filtering of large datasets"""
        if index is not None:
            assert not strict or index is None or (len(index)>0 and len(index)<=len(self) and max(index)<len(self) and min(index)>=0)            
            self._idx = index
            return self
        if self._idx is None:
            self._idx = list(range(len(self._ds)))  # on-demand index, only if underlying dataset has known length
        return self._idx
    
        
    def clone(self, deep=False):
        """Return a copy of the dataset object"""
        if not deep:
            return copy.copy(self) 
        else:
            return copy.deepcopy(self)
    
    def shuffle(self, shuffler=None):
        """Permute elements in this dataset uniformly at random in place using the optimal shuffling strategy for the dataset structure to maximize performance.
           This method will use either Dataset.streaming_shuffler (for iterable datasets) or Dataset.uniform_shuffler (for random access datasets)
        """
        assert shuffler is None or callable(shuffler)
        shuffler = shuffler if shuffler is not None else (Dataset.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
        return shuffler(self)

    def repeat(self, n):
        """Repeat the dataset n times.  If n=0, the dataset is unchanged, if n=1 the dataset is doubled in length, etc."""
        assert n>=0
        return self.index( self.index()*(n+1) )
    
    def tuple(self, mapper=None, flatten=False, reducer=None):
        """Return the dataset as a tuple, applying the optional mapper lambda on each element, applying optional flattener on sequences returned by mapper, and applying the optional reducer lambda on the final tuple, return a generator"""
        assert mapper is None or callable(mapper)
        assert reducer is None or callable(reducer)
        mapped = self.map(mapper) if mapper else self
        flattened = (y for x in mapped for y in x) if flatten else (x for x in mapped)
        reduced = reducer(flattened) if reducer else flattened
        return reduced

    def list(self, mapper=None, flatten=False):
        """Return a tuple as a list, loading into memory"""
        return self.tuple(mapper, flatten, reducer=list)

    def set(self, mapper=None, flatten=False):
        """Return the dataset as a set.  Mapper must be a lambda function that returns a hashable type"""
        return self.tuple(mapper=mapper, reducer=set, flatten=flatten)        

    def all(self, mapper):
        return self.tuple(mapper=mapper, reducer=all)
    
    def frequency(self, f):
        """Frequency counts for which lambda returns the same value.  For example f=lambda im: im.category() returns a dictionary of category names and counts in this category"""
        return countby(self.tuple(mapper=f))

    def balanced(self, f):
        """Is the dataset balanced (e.g. the frequencies returned from the lambda f are all the same)?"""
        return len(set(self.frequency(f).values())) == 1
    
    def count(self, f):
        """Counts for each element for which lamba returns true.  
        
        Args:
            f: [lambda] if provided, count the number of elements that return true.  

        Returns:
            A length of elements that satisfy f(v) = True [if f is not None]
        """
        return len(self.tuple(f, reducer=lambda X: [x for x in X if x is True]))

    def countby(self, f):
        return self.frequency(f)
    
    def filter(self, f):
        """In place filter with lambda function f, keeping those elements obj in-place where f(obj) evaluates true.  Callable should return bool"""
        assert callable(f)
        return self.index( [i for (b,i) in zip(self.localmap(f), self.index()) if b] )
    
    def take(self, n, inplace=False):
        """Randomly Take n elements from the dataset, and return a dataset (in-place or cloned). If n is greater than the size of the dataset, sample with replacement, if n is less than the size of the dataset, sample without replacement"""
        assert isinstance(n, int) and n>0
        D = self.clone() if not inplace else self
        return D.index(list((random.sample if n<= len(self) else random.choices)(D.index(), k=n)) )


    def groupby(self, f):
        """Group the dataset according to the callable f, returning dictionary of grouped datasets."""
        assert callable(f)        
        return {k:self.clone().index([x[1] for x in v]).id('%s:%s' % (self.id(),str(k))) for (k,v) in itertools.groupby(enumerate(self.sort(f).index()), lambda x: f(self[x[0]]))}

    def takeby(self, f, n):
        """Filter the dataset according to the callable f, take n from each group and return a dataset.  Callable should return bool.  If n==1, return a singleton"""
        d = self.clone().filter(f)
        return d.take(n) if n>1 else d.takeone()

    def takelist(self, n):
        """Take n elements and return list.  The elements are loaded and not cloned."""
        return self.take(n).list()

    def takeone(self):
        """Randomly take one element from the dataset and return a singleton"""
        return self[random.randint(0, len(self)-1)]

    def sample(self):
        """Return a single element sampled uniformly at random"""
        return self.takeone()
    
    def take_fraction(self, p, inplace=False):
        """Randomly take a percentage of the dataset, returning a clone or in-place"""
        assert p>=0 and p<=1, "invalid fraction '%s'" % p
        return self.take(n=int(len(self)*p), inplace=inplace)

    def inverse_frequency(self, f):
        """Return the inverse frequency of elements grouped by the callable f.  Returns a dictionary of the callable output to inverse frequency """
        attributes = self.set(f)
        frequency = self.frequency(f)
        return {a:(1/len(attributes))*(len(self)/frequency[a]) for a in attributes}  # (normalized) inverse frequency weight
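
    # Example (illustrative): inverse_frequency provides per-category weights that can be
    # used for importance sampling of an imbalanced dataset, e.g.
    #   w = trainset.inverse_frequency(lambda im: im.category())
    #   weights = [w[im.category()] for im in trainset]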
    
    def load(self):
        """Cache the entire dataset into memory"""
        return Dataset([x for x in self], id=self.id())
    
    def chunk(self, n):
        """Yield n chunks as list.  Last chunk will be ragged."""
        for (k,c) in enumerate(chunkgen(self, n)):
            yield list(c)

    def batch(self, n):
        """Yield batches of size n as datasets.  Last batch will be ragged.  Batches are not loaded.  Batches have appended id equal to the zero-indexed batch order"""
        for (k,b) in enumerate(chunkgenbysize(self, n)):  
            yield Dataset(b).id('%s:%d' % (self.id() if self.id() else '', k))
                                
    def minibatch(self, n, ragged=True, loader=None, bufsize=1024, accepter=None, preprocessor=None):
        """Yield preprocessed minibatches of size n of this dataset.

        To yield chunks of this dataset, suitable for minibatch training/testing

        ```python
        D = vipy.dataset.Dataset(...)
        for b in D.minibatch(n):
           print(b)
        ```
        
        To perform minibatch image downloading in parallel across four processes with the context manager:

        ```python
        D = vipy.dataset.registry('yfcc100m_url:train').take(128)
        with vipy.globals.parallel(4):
            for b in D.minibatch(16, loader=vipy.image.Transform.download, accepter=lambda im: im.is_downloaded()):
                print(b)  # complete minibatch that passed accepter
        ```

        Args:
            n [int]: The size of the minibatch
            ragged [bool]: If ragged=True, then the last chunk may be ragged with len(chunk)<n, otherwise it is skipped
            bufsize [int]:  The size of the buffer used in parallel processing of elements.  Useful for parallel loading
            accepter [callable]:  A callable that returns True|False on an element, where only elements that return True are included in the minibatch.  Useful for parallel loading of elements that may fail to download
            loader [callable]: A callable that is applied to every element of the dataset.  Useful for parallel loading

        Returns:        
            Iterator over `vipy.dataset.Dataset` elements of length n.  Minibatches will be yielded loaded and preprocessed (processing done concurrently if vipy.parallel.executor() is initialized)

        ..note:: The distributed iterator appends the minibatch index to the minibatch.id().  
        ..note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently

        """
        for (k,b) in enumerate(chunkgenbysize(vipy.parallel.iter(self, mapper=loader, bufsize=max(bufsize,n), accepter=accepter), n)): 
            if ragged or len(b) == n:
                yield Dataset.cast(b).id('%s:%d' % (self.id() if self.id() else '', k))                    
                    
                        
    def shift(self, m):
        """Circular shift the dataset m elements to the left, so that self[k+m] == self.shift(m)[k].  Circular shift for boundary handling so that self.shift(m)[-1] == self[m-1]"""
        return self.clone().index(self.index()[m:] + self.index()[0:m])

    def slice(self, start=0, stop=-1, step=1):
        """Slice the dataset to contain elements defined by slice(start, stop, step)"""
        return self.clone().index(self.index()[start:stop:step])
        
    def truncate(self, m):
        """Truncate the dataset to contain the first m elements only"""
        return self.slice(stop=m)
    
    def pipeline(self, n, m, ragged=True, prepad=True, postpad=True):
        """Yield pipelined minibatches of size n with pipeline length m.

        A pipelined minibatch is a tuple (head, tail) such that (head, tail) are minibatches at different indexes in the dataset.  
        Head corresponds to the current minibatch and tail corresponds to the minibatch left shifted by (m-1) minibatches.

        This structure is useful for yielding datasets for pipelined training where head contains the minibatch that will complete pipeline training on this iteration, and tail contains the 
        next minibatch to be inserted into the pipeline on this iteration.
        
        ```python
        D = vipy.dataset.Dataset(...)
        for (head, tail) in D.pipeline(n, m, prepad=False, postpad=False):
            assert head == D[0:n]
            assert tail == D[n*(m-1): n*(m-1)+n]
        ```

        Args:
            n [int]: The size of each minibatch
            m [int]:  The pipeline length in minibatches
            ragged [bool]: If ragged=True, then the last chunk may be ragged with len(chunk)<n, otherwise it is skipped
            prepad: If true, yield (head, tail) == (None, batch) when filling the pipeline
            postpad: If true, yield (head, tail) == (batch, None) when flushing the pipeline
        
        Returns:        
            Iterator over tuples (head,tail) of `vipy.dataset.Dataset` elements of length n where tail is left shifted by n*(m-1) elements. 
        
        .. note::  The distributed iterator is not order preserving over minibatches and yields minibatches as completed, however the tuple (head, tail) is order preserving within the pipeline
        .. note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently
        
        """
        pipeline = [] 
        for (k,b) in enumerate(self.minibatch(n, ragged=ragged)):  # not order preserving
            pipeline.append(b)  # order preserving within pipeline                        
            if k < m-1:
                if prepad:
                    yield( (None, b) )  
            else:
                yield( (pipeline.pop(0), b) )  # yield deque-like (minibatch, shifted minibatch) tuples
        for p in pipeline:
            if postpad:
                yield( (p, None) )


    def chunks(self, sizes):
        """Partition the dataset into chunks of size given by the tuple in partitions, and give the dataset suffix if provided"""
        assert sum(sizes) == len(self)

        i = 0
        datasets = []
        for n in sizes:
            datasets.append(self.clone().index(self.index()[i:i+n]))
            i += n
        return datasets
        
    def partition(self, trainfraction=0.9, valfraction=0.1, testfraction=0, trainsuffix=':train', valsuffix=':val', testsuffix=':test'):
        """Partition the dataset into the requested (train,val,test) fractions.  

        Args:
            trainfraction [float]: fraction of dataset for training set
            valfraction [float]: fraction of dataset for validation set
            testfraction [float]: fraction of dataset for test set
            trainsuffix: If not None, append this string to the trainset ID
            valsuffix: If not None, append this string to the valset ID
            testsuffix: If not None, append this string to the testset ID        
        
        Returns:        
            (trainset, valset, testset) such that trainset is the first trainfraction of the dataset.  

        .. note:: This does not permute the dataset.  To randomize split, shuffle dataset first
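
        A minimal usage sketch (the toy dataset and fractions are illustrative):

        ```python
        D = vipy.dataset.Dataset(list(range(100)))
        (trainset, valset, testset) = D.partition(0.8, 0.1, 0.1)
        print(len(trainset), len(valset), len(testset))  # approximately (80, 10, 10)
        ```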

        """
        assert trainfraction >=0 and trainfraction <= 1, "invalid training set fraction '%f'" % trainfraction
        assert valfraction >=0 and valfraction <= 1, "invalid validation set fraction '%f'" % valfraction
        assert testfraction >=0 and testfraction <= 1, "invalid test set fraction '%f'" % testfraction
        assert abs(trainfraction + valfraction + testfraction - 1) < 1E-6, "fractions must sum to one"
        
        idx = self.index()
        (testidx, validx, trainidx) = dividelist(idx, (testfraction, valfraction, trainfraction))
            
        trainset = self.clone().index(trainidx)
        if trainsuffix and trainset.id():
            trainset.id(trainset.id() + trainsuffix)
        
        valset = self.clone().index(validx)
        if valsuffix and valset.id():
            valset.id(valset.id() + valsuffix)
        
        testset = self.clone().index(testidx)
        if testsuffix and testset.id():
            testset.id(testset.id() + testsuffix)
                
        return (trainset,valset,testset) if testfraction!=0 else (trainset, valset)

    def split(self, size):
        """Split the dataset into two datasets, one of length size, the other of length len(self)-size"""
        assert isinstance(size, int) and size>=0 and size<len(self)
        return self.partition(size/len(self), (len(self)-size)/len(self), 0, '', '', '')

    def even_split(self):
        """Split the dataset into two datasets, each half the size of the dataset.  If the dataset length is odd, then one element will be dropped"""
        return self.chunks((len(self)//2, len(self)//2, len(self)%2))[0:2]
        
    def streaming_map(self, mapper, accepter=None, bufsize=1024):
        """Returns a generator that will apply the mapper and yield only those elements that return True from the accepter.  Performs the map in parallel if used in the vipy.globals.parallel context manager"""
        return vipy.parallel.iter(self, mapper=mapper, accepter=accepter, bufsize=bufsize)
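
    # Example (illustrative, mirroring the minibatch example above): stream elements through a parallel
    # download, keeping only those that downloaded successfully, e.g.
    #   with vipy.globals.parallel(4):
    #       for im in D.streaming_map(vipy.image.Transform.download, accepter=lambda im: im.is_downloaded()):
    #           ...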
        
    def map(self, f_map, strict=True, oneway=False, ordered=False):        
        """Parallel map.

        To perform this in parallel across four threads:

        ```python
        D = vipy.dataset.Dataset(...)
        with vipy.globals.parallel(4):
            D = D.map(lambda v: ...)
        ```

        Args:
            f_map: [lambda] The lambda function to apply in parallel to all elements in the dataset.  This must return a JSON serializable object (or set oneway=True)
            strict: [bool] If true, raise exception on distributed map failures, otherwise the map will return only those that succeeded
            oneway: [bool] If true, do not pass back results unless exception.  This is useful for distributed processing
        
        Returns:
            A `vipy.dataset.Dataset` containing the elements f_map(v).  This operation is order preserving if ordered=True.

        .. note:: 
            - This method uses dask distributed and `vipy.batch.Batch` operations
            - Due to chunking, all error handling is caught by this method.  Use `vipy.batch.Batch` to leverage dask distributed futures error handling.
            - Operations must be chunked and serialized because each dask task comes with overhead, and lots of small tasks violates best practices
            - Serialized results are deserialized by the client and returned as a new dataset
        """
        assert f_map is None or callable(f_map), "invalid map function"

        # Identity
        if f_map is None:
            return self        

        # Parallel map 
        elif vipy.globals.cf() is not None:
            # This will fail on multiprocessing if dataset contains a loader lambda, or any element in the dataset contains a loader.  Use distributed instead
            assert ordered == False, "not order preserving, use localmap()"
            return Dataset(tuple(vipy.parallel.map(f_map, self)), id=self.id()) 
                                              
        # Distributed map
        elif vipy.globals.dask() is not None:
            from vipy.batch import Batch   # requires pip install vipy[all]                
            f_serialize = lambda x: x
            f_deserialize = lambda x: x
            f_oneway = lambda x, oneway=oneway: x if not x[0] or not oneway else (x[0], None)
            f_catcher = lambda f, *args, **kwargs: catcher(f, *args, **kwargs)  # catch exceptions when executing lambda, return (True, result) or (False, exception)
            f = lambda x, f_serializer=f_serialize, f_deserializer=f_deserialize, f_map=f_map, f_catcher=f_catcher, f_oneway=f_oneway: f_serializer(f_oneway(f_catcher(f_map, f_deserializer(x))))  # with closure capture
            
            S = [f_serialize(v) for v in self]  # local load, preprocess and serialize
            B = Batch(chunklist(S, 128), strict=False, warnme=False, minscatter=128)
            S = B.map(lambda X,f=f: [f(x) for x in X]).result()  # distributed, chunked, with caught exceptions, may return empty list
            V = [f_deserialize(x) for s in S for x in s]  # Local deserialization and chunk flattening
            
            # Error handling
            (good, bad) = ([r for (b,r) in V if b], [r for (b,r) in V if not b])  # catcher returns (True, result) or (False, exception string)
            if len(bad)>0:
                log.warning('Exceptions in distributed processing:\n%s\n\n[vipy.dataset.Dataset.map]: %d/%d items failed' % (str(bad), len(bad), len(self)))
                if strict:
                    raise ValueError('exceptions in distributed processing')
            return Dataset(good, id=self.id()) if not oneway else None

        # Local map
        else:
            return self.localmap(f_map)
        
    def localmap(self, f):
        """A map performed without any parallel processing"""
        return Dataset([f(x) for x in self], id=self.id())  # triggers load into memory        

    def zip(self, iter):
        """Returns a new dataset constructed by applying the callable on elements from zip(self,iter)"""
        return Dataset(zip(self,iter))
    
    def sort(self, f):
        """Sort the dataset in-place using the sortkey lambda function f

        To perform a sort of the dataset using some property of the instance, such as the object category (e.g. for vipy.image.ImageCategory) 

        ```python
        dataset.sort(lambda im: im.category())
        ```
        """
        idx = self.index()
        return self.index( [idx[j] for (j,x) in sorted(zip(range(len(self)), self.tuple(f)), key=lambda x: x[1])] )

    
    @staticmethod
    def uniform_shuffler(D):
        """A uniform shuffle on the dataset elements.  Iterable access will be slow due to random access"""
        idx = D.index()
        random.shuffle(idx)
        return D.index(idx)

    @staticmethod
    def streaming_shuffler(D):
        """A uniform shuffle (approximation) on the dataset elements for iterable access only"""
        assert D._idx is None, "streaming only"
        
        try_import('datasets', 'datasets'); from datasets import Dataset as HuggingfaceDataset;
        
        if isinstance(D._ds, (list, tuple)):
            D._ds = list(D._ds)
            random.shuffle(D._ds)  # in-place shuffle objects
                
        elif isinstance(D._ds, HuggingfaceDataset):
            # Special case: Arrow backed dataset            
            D._ds = D._ds.to_iterable_dataset()  # no random access
            D._ds = D._ds.shuffle()  # approximate shuffling for IterableDataset is much more efficient for __iter__ (returns a new shuffled dataset)
        else:
            raise ValueError('shuffle error')
        return D
    
    @staticmethod
    def identity_shuffler(D):
        """Shuffler that does nothing"""
        return D


class Paged(Dataset):
    """ Paged dataset.

    A paged dataset is a dataset of length N=M*P constructed from M archive files (the pages) each containing P elements (the pagesize).  
    The paged dataset must be constructed with tuples of (pagesize, filename).  
    Pages are fetched, loaded and cached on demand using the loader, preserving the most recently used cachesize pages

    ```python
    D = vipy.dataset.Paged([(64, 'archive1.pkl'), (64, 'archive2.pkl')], lambda x,y: vipy.load(y))
    ```

    .. note :: Shuffling this dataset is biased.  Shuffling will be performed to mix the indexes, but not uniformly at random.  The goal is to preserve data locality to minimize cache misses.
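
    Random access loads and caches pages on demand (a sketch; the archive filenames and page contents are illustrative):

    ```python
    D = vipy.dataset.Paged([(64, 'archive%d.pkl' % k) for k in range(16)], lambda n,f: vipy.load(f))
    x = D[100]   # loads 'archive1.pkl' (page 100 // 64) into the cache, then returns element 100 % 64 of that page
    ```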
    """
    
    def __init__(self, pagelist, loader, id=None, strict=True, index=None, cachesize=32):        
        super().__init__(dataset=pagelist, id=id, loader=loader)
        self.index(index if index else list(range(sum([p[0] for p in pagelist]))))

        assert callable(loader), "page loader required"
        assert not strict or len(set([x[0] for x in self._ds])) == 1  # pagesizes all the same 

        self._cachesize = cachesize
        self._pagecache = {}
        self._ds = list(self._ds)
        self._pagesize = self._ds[0][0]  # (pagesize, pklfile) tuples        

    def shuffle(self, shuffler=None):
        """Permute elements while preserve page locality to minimize cache misses"""
        shuffler = shuffler if shuffler is not None else functools.partial(Paged.chunk_shuffler, chunksize=int(1.5*self._pagesize))
        return shuffler(self)
        
    def __getitem__(self, k):
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(self._idx), "invalid index"
            page = self._idx[int(k)] // self._pagesize
            if page not in self._pagecache:
                self._pagecache[page] = self._loader(*self._ds[page])  # load and cache new page
                if len(self._pagecache) > self._cachesize:
                    self._pagecache.pop(list(self._pagecache.keys())[0])  # remove oldest
            x = self._pagecache[page][int(k) % self._pagesize]
            return x
        elif isinstance(k, slice):
            return [self[i] for i in range(len(self))[k.start if k.start else 0:k.stop if k.stop else len(self):k.step if k.step else 1]]  # expensive
        else:
            raise ValueError('invalid index type "%s"' % type(k))            

    def flush(self):
        self._pagecache = {}
        return self

    @staticmethod
    def chunk_shuffler(D, chunker, chunksize=64):
        """Split dataset into len(D)/chunksize non-overlapping chunks with some common property returned by chunker, shuffle chunk order and shuffle within chunks.  

           - If chunksize=1 then this is equivalent to uniform_shuffler
           - chunker must be a callable of some property that is used to group into chunks
            
        """
        assert callable(chunker)
        return D.randomize().sort(chunker).index([i for I in shufflelist([shufflelist(I) for I in chunkgenbysize(D.index(), chunksize)]) for i in I])
    
    
class Union(Dataset):
    """vipy.dataset.Union() class
    
    Common class to manipulate groups of vipy.dataset.Dataset objects in parallel

    Usage:
    
        >>> cifar10 = vipy.dataset.registry('cifar10')
        >>> mnist = vipy.dataset.registry('mnist')
        >>> dataset = vipy.dataset.Union(mnist, cifar10)
        >>> dataset = mnist | cifar10

    Args:
        *args: one or more `vipy.dataset.Dataset` objects to combine into a union
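
    The union iterates over its member datasets in round-robin order (a minimal sketch with toy datasets):

        >>> U = vipy.dataset.Dataset([1,2,3], id='a') | vipy.dataset.Dataset(['x','y'], id='b')
        >>> list(U)
        [1, 'x', 2, 'y', 3]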
    """

    __slots__ = ('_id', '_ds', '_idx', '_loader', '_type')    
    def __init__(self, *args, **kwargs):
        assert all(isinstance(d, (Dataset, )) for d in args), "invalid datasets"
        
        datasets = [d for d in args]  # order preserving
        assert all([isinstance(d, Dataset) for d in datasets]), "Invalid datasets '%s'" % str([type(d) for d in datasets])

        datasets = [j for i in datasets for j in (i.datasets() if isinstance(i, Union) else (i,))]  # flatten unions        
        self._ds = datasets
        self._idx = None
        self._id = kwargs['id'] if 'id' in kwargs else None

        self._loader = None  # individual datasets have loaders
        self._type = None

    def is_streaming(self):
        return self._idx is None and all(d.is_streaming() for d in self.datasets())

    def __len__(self):
        return sum(d.__len__() for d in self.datasets()) if self._idx is None else len(self._idx)
    
    def __iter__(self):
        if self.is_streaming():
            k = -1
            iter = [d.__iter__() for d in self.datasets()]  # round-robin

            for m in range(len(self.datasets())):
                try:
                    while True:
                        k = (k + 1) % len(iter)                                    
                        yield next(iter[k])  # assumes ordered
                except StopIteration:
                    iter.pop(k)
                    k -= 1
            
        else:
            self.index()  # force random access                    
            for (i,j) in self._idx:
                yield self._ds[i][j]  # random access (slower)                

    def __getitem__(self, k):
        self.index()  # force random access        
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(self._idx), "invalid index"
            (i,j) = self._idx[int(k)]            
            return self._ds[i][j]
        elif isinstance(k, slice):
            return [self._ds[i][j] for (i,j) in self._idx[k.start:k.stop:k.step]]
        else:
            raise ValueError('invalid index type "%s"' % type(k))

    def __repr__(self):
        fields = ['id=%s' % truncate_string(self.id(), maxlen=64)] if self.id() else []
        fields += ['len=%d' % len(self)]
        fields += ['union=%s' % str(tuple([truncate_string(d.id(), maxlen=80) for d in self._ds]))]
        return str('<vipy.dataset.Dataset: %s>' % (', '.join(fields)))

    def index(self, index=None, strict=False):
        """Update the index, useful for filtering of large datasets"""
        if index is not None:
            self._idx = index
            return self
        if self._idx is None:
            # Index on-demand: zipped (dataset index, element index) tuples, in round-robin dataset order [(0,0),(1,0),...,(0,n),(1,n),...]            
            lengths = [len(d) for d in self.datasets()]            
            self._idx = [c for r in [[(i,j) for i in range(len(self.datasets()))] for j in range(max(lengths))] for c in r if c[1]<lengths[c[0]]]
        return self._idx
    
    def clone(self, deep=False):
        """Return a copy of the dataset object"""
        D = super().clone(deep=deep)
        D._ds =  [d.clone(deep=deep) for d in D._ds]
        return D
    
    def datasets(self):
        """Return the dataset union elements, useful for generating unions of unions"""
        return list(self._ds)

    def shuffle(self, shuffler=None):
        """Permute elements in this dataset uniformly at random in place using the best shuffler for the dataset structure"""
        shuffler = shuffler if shuffler is not None else (Union.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
        return shuffler(self)
    
    @staticmethod
    def streaming_shuffler(D):
        """A uniform shuffle (approximation) on the dataset elements for iterable access only"""
        assert D._idx is None, "iterable dataset only"
        D._ds = [Dataset.streaming_shuffler(d) for d in D._ds]  # shuffle dataset shards
        random.shuffle(D._ds)  # shuffle union order
        return D
    

def registry(name=None, datadir=None, freeze=True, clean=False, download=False, split='train'):
    """Common entry point for loading datasets by name.

    Usage:
    
        >>> trainset = vipy.dataset.registry('cifar10', split='train')             # return a training split
        >>> valset = vipy.dataset.registry('cifar10:val', datadir='/tmp/cifar10')  # download to a custom location
        >>> datasets = vipy.dataset.registry(('cifar10:train','cifar100:train'))   # return a union
        >>> vipy.dataset.registry()                                                # print allowable datasets

    Args:
       name [str]: The string name for the dataset.  If tuple, return a `vipy.dataset.Union`.  If None, return the list of registered datasets.  Append name:train, name:val, name:test to output the requested split, or use the split keyword.
       datadir [str]: A path to a directory to store data.  Defaults to environment variable VIPY_DATASET_REGISTRY_HOME (then VIPY_CACHE if not found).  Also uses HF_HOME for huggingface datasets.  Datasets will be stored in datadir/name
       freeze [bool]:  If true, disable reference cycle counting for the loaded object (which will never contain cycles anyway) 
       clean [bool]: If true, force a redownload of the dataset to correct for partial download errors
       download [bool]: If true, force a redownload of the dataset to correct for partial download errors.  This is a synonym for clean=True
       split [str]: return 'train', 'val' or 'test' split.  If None, return (trainset, valset, testset) tuple

    Datasets:
       'mnist','cifar10','cifar100','caltech101','caltech256','oxford_pets','sun397', 'food101','stanford_dogs',
       'flickr30k','oxford_fgvc_aircraft','oxford_flowers_102','eurosat','d2d','ethzshapes','coil100','kthactions',
       'yfcc100m','yfcc100m_url','tiny_imagenet','coyo300m','coyo700m','pascal_voc_2007','coco_2014', 'ava',
       'activitynet', 'open_images_v7', 'imagenet', 'imagenet21k', 'visualgenome' ,'widerface','meva_kf1',
       'objectnet','lfw','inaturalist_2021','kinetics','hmdb','places365','ucf101','lvis','kitti',
       'imagenet_localization','laion2b','datacomp_1b','imagenet2014_det','imagenet_faces','youtubeBB',
       'pip_370k','pip_175k','cap','cap_pad','cap_detection','tiny_virat'

    Returns:
       (trainset, valset, testset) tuple where each is a `vipy.dataset.Dataset` or None, or a single split if name has a ":SPLIT" suffix or split kwarg provided
    """
    
    import vipy.data

    datasets = ('mnist','cifar10','cifar100','caltech101','caltech256','oxford_pets','sun397', 'stanford_dogs','coil100',
                'flickr30k','oxford_fgvc_aircraft','oxford_flowers_102', 'food101', 'eurosat','d2d','ethzshapes','kthactions',
                'yfcc100m','yfcc100m_url','tiny_imagenet','coyo300m','coyo700m','pascal_voc_2007','coco_2014', 'ava',
                'activitynet','open_images_v7','imagenet','imagenet21k','visualgenome','widerface', 'youtubeBB',
                'objectnet','lfw','inaturalist_2021','kinetics','hmdb','places365','ucf101','kitti','meva_kf1',
                'lvis','imagenet_localization','laion2b','datacomp_1b','imagenet2014_det','imagenet_faces',
                'pip_175k','pip_370k','cap','cap_pad','cap_detection','tiny_virat')  # Add to docstring too...
    
    if name is None:
        return tuple(sorted(datasets))
    if isinstance(name, (tuple, list)):
        assert all(n.startswith(datasets) for n in name)
        assert split is not None or all(':' in n for n in name)
        return Union(*(registry(n, datadir=datadir, freeze=freeze, clean=clean, download=download, split=split) for n in name))    
    
    (name, split) = name.split(':',1) if name.count(':')>0 else (name, split)
    if name not in datasets:
        raise ValueError('unknown dataset "%s" - choose from "%s"' % (name, ', '.join(sorted(datasets))))
    if split not in [None, 'train', 'test', 'val']:
        raise ValueError('unknown split "%s" - choose from "%s"' % (split, ', '.join([str(None), 'train', 'test', 'val'])))

    datadir = remkdir(datadir if datadir is not None else (env('VIPY_DATASET_REGISTRY_HOME') if 'VIPY_DATASET_REGISTRY_HOME' in env() else cache()))
    namedir = Path(datadir)/name    
    if (clean or download) and name in datasets and os.path.exists(namedir):
        log.info('Removing cached dataset "%s"' % namedir)
        shutil.rmtree(namedir)  # delete cached subtree to force redownload ...
        
    if freeze:
        gc.disable()
        
    (trainset, valset, testset) = (None, None, None)    
    if name == 'mnist':
        (trainset, testset) = vipy.data.hf.mnist()        
    elif name == 'cifar10':
        (trainset, testset) = vipy.data.hf.cifar10()        
    elif name == 'cifar100':
        (trainset, testset) = vipy.data.hf.cifar100()        
    elif name == 'caltech101':
        trainset = vipy.data.caltech101.Caltech101(namedir)        
    elif name == 'caltech256':
        trainset = vipy.data.caltech256.Caltech256(namedir)
    elif name == 'oxford_pets':
        (trainset, testset) = vipy.data.hf.oxford_pets()
    elif name == 'sun397':
        (trainset, valset, testset) = vipy.data.hf.sun397()
    elif name == 'stanford_dogs':
        trainset = vipy.data.stanford_dogs.StanfordDogs(namedir)
    elif name == 'food101':
        trainset = vipy.data.food101.Food101(namedir)
    elif name == 'eurosat':
        trainset = vipy.data.eurosat.EuroSAT(namedir)
    elif name == 'd2d':
        trainset = vipy.data.d2d.D2D(namedir)
    elif name == 'coil100':
        trainset = vipy.data.coil100.COIL100(namedir)
    elif name == 'kthactions':
        (trainset, testset) = vipy.data.kthactions.KTHActions(namedir).split()
    elif name == 'ethzshapes':
        trainset = vipy.data.ethzshapes.ETHZShapes(namedir)        
    elif name == 'flickr30k':
        trainset = vipy.data.hf.flickr30k()
    elif name == 'oxford_fgvc_aircraft':
        trainset = vipy.data.hf.oxford_fgvc_aircraft()
    elif name == 'oxford_flowers_102':
        trainset = vipy.data.oxford_flowers_102.Flowers102(namedir)
    elif name == 'yfcc100m':
        (trainset, _, valset, _) = vipy.data.hf.yfcc100m()  
    elif name == 'yfcc100m_url':
        (_, trainset, _, valset) = vipy.data.hf.yfcc100m()  
    elif name == 'tiny_imagenet':
        (trainset, valset) = vipy.data.hf.tiny_imagenet()
    elif name == 'coyo300m':
        trainset = vipy.data.hf.coyo300m()
    elif name == 'coyo700m':
        trainset = vipy.data.hf.coyo700m()
    elif name == 'datacomp_1b':
        trainset = vipy.data.hf.datacomp_1b()
    elif name == 'pascal_voc_2007':
        (trainset, valset, testset) = vipy.data.hf.pascal_voc_2007()
    elif name == 'coco_2014':
        trainset = vipy.data.coco.COCO_2014(namedir)
    elif name == 'ava':
        ava = vipy.data.ava.AVA(namedir)
        (trainset, valset) = (ava.trainset(), ava.valset())
    elif name == 'activitynet':
        activitynet = vipy.data.activitynet.ActivityNet(namedir)  # ActivityNet 200
        (trainset, valset, testset) = (activitynet.trainset(), activitynet.valset(), activitynet.testset())
    elif name == 'open_images_v7':
        trainset = vipy.data.openimages.open_images_v7(namedir)
    elif name == 'imagenet':
        imagenet = vipy.data.imagenet.Imagenet2012(namedir)
        (trainset, valset) = (imagenet.classification_trainset(), imagenet.classification_valset())
    elif name == 'imagenet_faces':
        trainset = vipy.data.imagenet.Imagenet2012(Path(datadir)/'imagenet').faces()
    elif name == 'imagenet21k':
        trainset = vipy.data.imagenet.Imagenet21K(namedir)
    elif name == 'visualgenome':
        trainset = vipy.data.visualgenome.VisualGenome(namedir)  # visualgenome-1.4
    elif name == 'widerface':
        trainset = vipy.data.widerface.WiderFace(namedir, split='train')
        valset = vipy.data.widerface.WiderFace(namedir, split='val')
        testset = vipy.data.widerface.WiderFace(namedir, split='test')                                      
    elif name == 'objectnet':
        assert split is None or split == 'test', "objectnet is a test set"
        testset = vipy.data.objectnet.Objectnet(namedir)
    elif name == 'lfw':
        trainset = vipy.data.lfw.LFW(namedir)
    elif name == 'inaturalist_2021':
        dataset = vipy.data.inaturalist.iNaturalist2021(namedir)
        (trainset, valset) = (dataset.trainset(), dataset.valset())
    elif name == 'kinetics':
        dataset = vipy.data.kinetics.Kinetics700(namedir)  # Kinetics700
        (trainset, valset, testset) = (dataset.trainset(), dataset.valset(), dataset.testset())
    elif name == 'hmdb':
        trainset = vipy.dataset.Dataset(vipy.data.hmdb.HMDB(namedir).dataset(), id='hmdb')
    elif name == 'places365':
        places = vipy.data.places.Places365(namedir)
        (trainset, valset) = (places.trainset(), places.valset())
    elif name == 'ucf101':
        trainset = vipy.data.ucf101.UCF101(namedir)
    elif name == 'kitti':
        trainset = vipy.data.kitti.KITTI(namedir, split='train')
        testset = vipy.data.kitti.KITTI(namedir, split='test')                                      
    elif name == 'lvis':
        lvis = vipy.data.lvis.LVIS(namedir)
        (trainset, valset) = (lvis.trainset(), lvis.valset())
    elif name == 'imagenet_localization':
        trainset = vipy.data.imagenet.Imagenet2012(Path(datadir)/'imagenet').localization_trainset()
    elif name == 'imagenet2014_det':
        imagenet2014_det = vipy.data.imagenet.Imagenet2014_DET(namedir)
        (trainset, valset, testset) = (imagenet2014_det.trainset(), imagenet2014_det.valset(), imagenet2014_det.testset())
    elif name == 'laion2b':
        trainset = vipy.data.hf.laion2b()
    elif name == 'youtubeBB':
        trainset = vipy.data.youtubeBB.YoutubeBB(namedir)
    elif name == 'meva_kf1':
        trainset = vipy.data.meva.KF1(namedir).dataset()  # consider using "with vipy.globals.multiprocessing(pct=0.5):"
    elif name == 'pip_175k':
        trainset = vipy.data.pip.PIP_175k(namedir)
    elif name == 'pip_370k':
        trainset = vipy.data.pip.PIP_370k_stabilized(namedir)
    elif name == 'cap':
        trainset = vipy.data.cap.CAP_classification_clip(namedir)
    elif name == 'cap_pad':
        trainset = vipy.data.cap.CAP_classification_pad(namedir)        
    elif name == 'cap_detection':
        trainset = vipy.data.cap.CAP_detection(namedir)
    elif name == 'tiny_virat':
        dataset = vipy.data.tiny_virat.TinyVIRAT(namedir)
        (trainset, valset, testset) = (dataset.trainset(), dataset.valset(), dataset.testset())
    else:
        raise ValueError('unknown dataset "%s" - choose from "%s"' % (name, ', '.join(sorted(datasets))))
    
    if freeze:
        gc.enable()
        gc.collect()
        gc.freeze()  # python-3.7

    if split == 'train':
        return trainset
    elif split == 'val':
        return valset
    elif split == 'test':
        return testset
    else:
        return (trainset, valset, testset)

Functions

def registry(name=None, datadir=None, freeze=True, clean=False, download=False, split='train')

Common entry point for loading datasets by name.

Usage

>>> trainset = vipy.dataset.registry('cifar10', split='train')             # return a training split
>>> valset = vipy.dataset.registry('cifar10:val', datadir='/tmp/cifar10')  # download to a custom location
>>> datasets = vipy.dataset.registry(('cifar10:train','cifar100:train'))   # return a union
>>> vipy.dataset.registry()                                                # print allowable datasets

Args

name [str]: The string name for the dataset. If tuple, return a Union. If None, return the list of registered datasets. Append name:train, name:val, name:test to output the requested split, or use the split keyword.
datadir [str]: A path to a directory to store data. Defaults to environment variable VIPY_DATASET_REGISTRY_HOME (then VIPY_CACHE if not found). Also uses HF_HOME for huggingface datasets. Datasets will be stored in datadir/name.
freeze [bool]: If true, disable reference cycle counting for the loaded object (which will never contain cycles anyway).
clean [bool]: If true, force a redownload of the dataset to correct for partial download errors.
download [bool]: If true, force a redownload of the dataset to correct for partial download errors. This is a synonym for clean=True.
split [str]: Return the 'train', 'val' or 'test' split. If None, return the (trainset, valset, testset) tuple.

Datasets

'mnist','cifar10','cifar100','caltech101','caltech256','oxford_pets','sun397', 'food101','stanford_dogs', 'flickr30k','oxford_fgvc_aircraft','oxford_flowers_102','eurosat','d2d','ethzshapes','coil100','kthactions', 'yfcc100m','yfcc100m_url','tiny_imagenet','coyo300m','coyo700m','pascal_voc_2007','coco_2014', 'ava', 'activitynet', 'open_images_v7', 'imagenet', 'imagenet21k', 'visualgenome' ,'widerface','meva_kf1', 'objectnet','lfw','inaturalist_2021','kinetics','hmdb','places365','ucf101','lvis','kitti', 'imagenet_localization','laion2b','datacomp_1b','imagenet2014_det','imagenet_faces','youtubeBB', 'pip_370k','pip_175k','cap','cap_pad','cap_detection','tiny_virat'

Returns

(trainset, valset, testset) tuple where each is a Dataset or None, or a single split if name has a ":SPLIT" suffix or split kwarg provided
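
A minimal usage sketch (the dataset name is illustrative):

>>> trainset = vipy.dataset.registry('cifar10')                                 # default split='train'
>>> (trainset, valset, testset) = vipy.dataset.registry('cifar10', split=None)  # all splits; valset may be None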

Expand source code Browse git
def registry(name=None, datadir=None, freeze=True, clean=False, download=False, split='train'):
    """Common entry point for loading datasets by name.

    Usage:
    
        >>> trainset = vipy.dataset.registry('cifar10', split='train')             # return a training split
        >>> valset = vipy.dataset.registry('cifar10:val', datadir='/tmp/cifar10')  # download to a custom location
        >>> datasets = vipy.dataset.registry(('cifar10:train','cifar100:train'))   # return a union
        >>> vipy.dataset.registry()                                                # print allowable datasets

    Args:
       name [str]: The string name for the dataset.  If tuple, return a `vipy.dataset.Union`.  If None, return the list of registered datasets.  Append name:train, name:val, name:test to output the requested split, or use the split keyword.
       datadir [str]: A path to a directory to store data.  Defaults to environment variable VIPY_DATASET_REGISTRY_HOME (then VIPY_CACHE if not found).  Also uses HF_HOME for huggingface datasets.  Datasets will be stored in datadir/name
       freeze [bool]:  If true, disable reference cycle counting for the loaded object (which will never contain cycles anyway) 
       clean [bool]: If true, force a redownload of the dataset to correct for partial download errors
       download [bool]: If true, force a redownload of the dataset to correct for partial download errors.  This is a synonym for clean=True
       split [str]: return 'train', 'val' or 'test' split.  If None, return (trainset, valset, testset) tuple

    Datasets:
       'mnist','cifar10','cifar100','caltech101','caltech256','oxford_pets','sun397', 'food101','stanford_dogs',
       'flickr30k','oxford_fgvc_aircraft','oxford_flowers_102','eurosat','d2d','ethzshapes','coil100','kthactions',
       'yfcc100m','yfcc100m_url','tiny_imagenet','coyo300m','coyo700m','pascal_voc_2007','coco_2014', 'ava',
       'activitynet', 'open_images_v7', 'imagenet', 'imagenet21k', 'visualgenome' ,'widerface','meva_kf1',
       'objectnet','lfw','inaturalist_2021','kinetics','hmdb','places365','ucf101','lvis','kitti',
       'imagenet_localization','laion2b','datacomp_1b','imagenet2014_det','imagenet_faces','youtubeBB',
       'pip_370k','pip_175k','cap','cap_pad','cap_detection','tiny_virat'

    Returns:
       (trainset, valset, testset) tuple where each is a `vipy.dataset.Dataset` or None, or a single split if name has a ":SPLIT" suffix or split kwarg provided
    """
    
    import vipy.data

    datasets = ('mnist','cifar10','cifar100','caltech101','caltech256','oxford_pets','sun397', 'stanford_dogs','coil100',
                'flickr30k','oxford_fgvc_aircraft','oxford_flowers_102', 'food101', 'eurosat','d2d','ethzshapes','kthactions',
                'yfcc100m','yfcc100m_url','tiny_imagenet','coyo300m','coyo700m','pascal_voc_2007','coco_2014', 'ava',
                'activitynet','open_images_v7','imagenet','imagenet21k','visualgenome','widerface', 'youtubeBB',
                'objectnet','lfw','inaturalist_2021','kinetics','hmdb','places365','ucf101','kitti','meva_kf1',
                'lvis','imagenet_localization','laion2b','datacomp_1b','imagenet2014_det','imagenet_faces',
                'pip_175k','pip_370k','cap','cap_pad','cap_detection','tiny_virat')  # Add to docstring too...
    
    if name is None:
        return tuple(sorted(datasets))
    if isinstance(name, (tuple, list)):
        assert all(n.startswith(datasets) for n in name)
        assert split is not None or all(':' in n for n in name)
        return Union(*(registry(n, datadir=datadir, freeze=freeze, clean=clean, download=download, split=split) for n in name))    
    
    (name, split) = name.split(':',1) if name.count(':')>0 else (name, split)
    if name not in datasets:
        raise ValueError('unknown dataset "%s" - choose from "%s"' % (name, ', '.join(sorted(datasets))))
    if split not in [None, 'train', 'test', 'val']:
        raise ValueError('unknown split "%s" - choose from "%s"' % (split, ', '.join([str(None), 'train', 'test', 'val'])))

    datadir = remkdir(datadir if datadir is not None else (env('VIPY_DATASET_REGISTRY_HOME') if 'VIPY_DATASET_REGISTRY_HOME' in env() else cache()))
    namedir = Path(datadir)/name    
    if (clean or download) and name in datasets and os.path.exists(namedir):
        log.info('Removing cached dataset "%s"' % namedir)
        shutil.rmtree(namedir)  # delete cached subtree to force redownload ...
        
    if freeze:
        gc.disable()
        
    (trainset, valset, testset) = (None, None, None)    
    if name == 'mnist':
        (trainset, testset) = vipy.data.hf.mnist()        
    elif name == 'cifar10':
        (trainset, testset) = vipy.data.hf.cifar10()        
    elif name == 'cifar100':
        (trainset, testset) = vipy.data.hf.cifar100()        
    elif name == 'caltech101':
        trainset = vipy.data.caltech101.Caltech101(namedir)        
    elif name == 'caltech256':
        trainset = vipy.data.caltech256.Caltech256(namedir)
    elif name == 'oxford_pets':
        (trainset, testset) = vipy.data.hf.oxford_pets()
    elif name == 'sun397':
        (trainset, valset, testset) = vipy.data.hf.sun397()
    elif name == 'stanford_dogs':
        trainset = vipy.data.stanford_dogs.StanfordDogs(namedir)
    elif name == 'food101':
        trainset = vipy.data.food101.Food101(namedir)
    elif name == 'eurosat':
        trainset = vipy.data.eurosat.EuroSAT(namedir)
    elif name == 'd2d':
        trainset = vipy.data.d2d.D2D(namedir)
    elif name == 'coil100':
        trainset = vipy.data.coil100.COIL100(namedir)
    elif name == 'kthactions':
        (trainset, testset) = vipy.data.kthactions.KTHActions(namedir).split()
    elif name == 'ethzshapes':
        trainset = vipy.data.ethzshapes.ETHZShapes(namedir)        
    elif name == 'flickr30k':
        trainset = vipy.data.hf.flickr30k()
    elif name == 'oxford_fgvc_aircraft':
        trainset = vipy.data.hf.oxford_fgvc_aircraft()
    elif name == 'oxford_flowers_102':
        trainset = vipy.data.oxford_flowers_102.Flowers102(namedir)
    elif name == 'yfcc100m':
        (trainset, _, valset, _) = vipy.data.hf.yfcc100m()  
    elif name == 'yfcc100m_url':
        (_, trainset, _, valset) = vipy.data.hf.yfcc100m()  
    elif name == 'tiny_imagenet':
        (trainset, valset) = vipy.data.hf.tiny_imagenet()
    elif name == 'coyo300m':
        trainset = vipy.data.hf.coyo300m()
    elif name == 'coyo700m':
        trainset = vipy.data.hf.coyo700m()
    elif name == 'datacomp_1b':
        trainset = vipy.data.hf.datacomp_1b()
    elif name == 'pascal_voc_2007':
        (trainset, valset, testset) = vipy.data.hf.pascal_voc_2007()
    elif name == 'coco_2014':
        trainset = vipy.data.coco.COCO_2014(namedir)
    elif name == 'ava':
        ava = vipy.data.ava.AVA(namedir)
        (trainset, valset) = (ava.trainset(), ava.valset())
    elif name == 'activitynet':
        activitynet = vipy.data.activitynet.ActivityNet(namedir)  # ActivityNet 200
        (trainset, valset, testset) = (activitynet.trainset(), activitynet.valset(), activitynet.testset())
    elif name == 'open_images_v7':
        trainset = vipy.data.openimages.open_images_v7(namedir)
    elif name == 'imagenet':
        imagenet = vipy.data.imagenet.Imagenet2012(namedir)
        (trainset, valset) = (imagenet.classification_trainset(), imagenet.classification_valset())
    elif name == 'imagenet_faces':
        trainset = vipy.data.imagenet.Imagenet2012(Path(datadir)/'imagenet').faces()
    elif name == 'imagenet21k':
        trainset = vipy.data.imagenet.Imagenet21K(namedir)
    elif name == 'visualgenome':
        trainset = vipy.data.visualgenome.VisualGenome(namedir)  # visualgenome-1.4
    elif name == 'widerface':
        trainset = vipy.data.widerface.WiderFace(namedir, split='train')
        valset = vipy.data.widerface.WiderFace(namedir, split='val')
        testset = vipy.data.widerface.WiderFace(namedir, split='test')                                      
    elif name == 'objectnet':
        assert split is None or split == 'test', "objectnet is a test set"
        testset = vipy.data.objectnet.Objectnet(namedir)
    elif name == 'lfw':
        trainset = vipy.data.lfw.LFW(namedir)
    elif name == 'inaturalist_2021':
        dataset = vipy.data.inaturalist.iNaturalist2021(namedir)
        (trainset, valset) = (dataset.trainset(), dataset.valset())
    elif name == 'kinetics':
        dataset = vipy.data.kinetics.Kinetics700(namedir)  # Kinetics700
        (trainset, valset, testset) = (dataset.trainset(), dataset.valset(), dataset.testset())
    elif name == 'hmdb':
        trainset = vipy.dataset.Dataset(vipy.data.hmdb.HMDB(namedir).dataset(), id='hmdb')
    elif name == 'places365':
        places = vipy.data.places.Places365(namedir)
        (trainset, valset) = (places.trainset(), places.valset())
    elif name == 'ucf101':
        trainset = vipy.data.ucf101.UCF101(namedir)
    elif name == 'kitti':
        trainset = vipy.data.kitti.KITTI(namedir, split='train')
        testset = vipy.data.kitti.KITTI(namedir, split='test')                                      
    elif name == 'lvis':
        lvis = vipy.data.lvis.LVIS(namedir)
        (trainset, valset) = (lvis.trainset(), lvis.valset())
    elif name == 'imagenet_localization':
        trainset = vipy.data.imagenet.Imagenet2012(Path(datadir)/'imagenet').localization_trainset()
    elif name == 'imagenet2014_det':
        imagenet2014_det = vipy.data.imagenet.Imagenet2014_DET(namedir)
        (trainset, valset, testset) = (imagenet2014_det.trainset(), imagenet2014_det.valset(), imagenet2014_det.testset())
    elif name == 'laion2b':
        trainset = vipy.data.hf.laion2b()
    elif name == 'youtubeBB':
        trainset = vipy.data.youtubeBB.YoutubeBB(namedir)
    elif name == 'meva_kf1':
        trainset = vipy.data.meva.KF1(namedir).dataset()  # consider using "with vipy.globals.multiprocessing(pct=0.5):"
    elif name == 'pip_175k':
        trainset = vipy.data.pip.PIP_175k(namedir)
    elif name == 'pip_370k':
        trainset = vipy.data.pip.PIP_370k_stabilized(namedir)
    elif name == 'cap':
        trainset = vipy.data.cap.CAP_classification_clip(namedir)
    elif name == 'cap_pad':
        trainset = vipy.data.cap.CAP_classification_pad(namedir)        
    elif name == 'cap_detection':
        trainset = vipy.data.cap.CAP_detection(namedir)
    elif name == 'tiny_virat':
        dataset = vipy.data.tiny_virat.TinyVIRAT(namedir)
        (trainset, valset, testset) = (dataset.trainset(), dataset.valset(), dataset.testset())
    else:
        raise ValueError('unknown dataset "%s" - choose from "%s"' % (name, ', '.join(sorted(datasets))))
    
    if freeze:
        gc.enable()
        gc.collect()
        gc.freeze()  # python-3.7

    if split == 'train':
        return trainset
    elif split == 'val':
        return valset
    elif split == 'test':
        return testset
    else:
        return (trainset, valset, testset)
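
A short usage sketch of the registry tail shown above. This assumes registry accepts the dataset name and an optional split keyword (as the return logic suggests) and that the named dataset is available locally; both assumptions are flagged in the comments.

```python
import vipy.dataset

# request a single split by name (split keyword assumed from the return logic above)
trainset = vipy.dataset.registry('cifar10', split='train')

# or request all splits at once, as in the class docstring examples
(trainset, valset, testset) = vipy.dataset.registry('mnist')
```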

Classes

class Dataset (dataset, id=None, loader=None)

vipy.dataset.Dataset() class

Common class to manipulate large sets of objects in parallel

Args

  • dataset [list, tuple, set, obj]: a python built-in type that supports indexing or a generic object that supports indexing and has a length
  • id [str]: an optional id of this dataset, which provides a descriptive name of the dataset
  • loader [callable]: a callable loader that will construct the object from a raw data element in dataset. This is useful for custom deserialization or on-demand transformations.

Datasets can be indexed, shuffled, iterated, minibatched, sorted, sampled, and partitioned. Datasets constructed of vipy objects are lazy loaded, delaying loading pixels until they are needed.
(trainset, valset, testset) = vipy.dataset.registry('mnist')

(trainset, valset) = trainset.partition(0.9, 0.1)
categories = trainset.set(lambda im: im.category())
smaller = testset.take(1024)
preprocessed = smaller.map(lambda im: im.resize(32, 32).gain(1/256))

for b in preprocessed.minibatch(128):
    print(b)

# visualize the dataset 
(trainset, valset, testset) = vipy.dataset.registry('pascal_voc_2007')
for im in trainset:
    im.mindim(1024).show().print(sleep=1).close()

Datasets can be constructed from directories of json files or image files (Dataset.from_directory()). Datasets can be constructed from a single json file containing a list of objects (Dataset.from_json()).

Note: if a lambda function is provided as the loader then this dataset is not serializable. Use self.load() first, then serialize.

Expand source code Browse git
class Dataset():
    """vipy.dataset.Dataset() class
    
    Common class to manipulate large sets of objects in parallel

    Args:
        - dataset [list, tuple, set, obj]: a python built-in type that supports indexing or a generic object that supports indexing and has a length
        - id [str]: an optional id of this dataset, which provides a descriptive name of the dataset
        - loader [callable]: a callable loader that will construct the object from a raw data element in dataset.  This is useful for custom deserialization or on demand transformations

    Datasets can be indexed, shuffled, iterated, minibatched, sorted, sampled, partitioned.
    Datasets constructed of vipy objects are lazy loaded, delaying loading pixels until they are needed

    ```python
    (trainset, valset, testset) = vipy.dataset.registry('mnist')

    (trainset, valset) = trainset.partition(0.9, 0.1)
    categories = trainset.set(lambda im: im.category())
    smaller = testset.take(1024)
    preprocessed = smaller.map(lambda im: im.resize(32, 32).gain(1/256))
    
    for b in preprocessed.minibatch(128):
        print(b)

    # visualize the dataset 
    (trainset, valset, testset) = vipy.dataset.registry('pascal_voc_2007')
    for im in trainset:
        im.mindim(1024).show().print(sleep=1).close()
    
    ```

    Datasets can be constructed from directories of json files or image files (`vipy.dataset.Dataset.from_directory`)
    Datasets can be constructed from a single json file containing a list of objects (`vipy.dataset.Dataset.from_json`)
    
    .. note:: If a lambda function is provided as the loader then this dataset is not serializable.  Use self.load() then serialize
    """

    __slots__ = ('_id', '_ds', '_idx', '_loader', '_type')
    def __init__(self, dataset, id=None, loader=None):
        assert loader is None or callable(loader)
        
        self._id = id
        self._ds = dataset if not isinstance(dataset, Dataset) else dataset._ds
        self._idx = None if not isinstance(dataset, Dataset) else dataset._idx   # random access on-demand
        self._loader = loader if not isinstance(dataset, Dataset) else dataset._loader  # not serializable if lambda is provided

        try:
            self._type = str(type(self._loader(dataset[0]) if self._loader else dataset[0]))  # peek at first element, cached
        except:
            self._type = None


    @classmethod
    def from_directory(cls, indir, filetype='json', id=None):
        """Recursively search indir for filetype, construct a dataset from all discovered files of that type"""
        if filetype == 'json':
            return cls([x for f in findjson(indir) for x in to_iterable(vipy.load(f))], id=id)
        elif filetype.lower() in ['jpg','jpeg','images']:
            return cls([vipy.image.Image(filename=f) for f in findimages(indir)], id=id)            
        elif filetype.lower() in ['mp4','videos']:
            return cls([vipy.video.Video(filename=f) for f in findvideos(indir)], id=id)            
        else:
            raise ValueError('unsupported file type "%s"' % filetype)

    @classmethod
    def from_image_urls(cls, urls, id=None):
        """Construct a dataset from a list of image URLs"""
        return cls([vipy.image.Image(url=url) for url in to_iterable(urls) if isimageurl(url)], id=id)
        
    @classmethod
    def from_json(cls, jsonfile, id=None):
        return cls([x for x in to_iterable(vipy.load(jsonfile))], id=id)

    @classmethod
    def cast(cls, obj):
        return cls(obj) if not isinstance(obj, Dataset) else obj
    
    def __repr__(self):
        fields = ['id=%s' % truncate_string(self.id(), maxlen=80)] if self.id() else []
        fields += ['len=%d' % self.len()] if self.len() is not None else []
        fields += ['type=%s' % self._type] if self._type else []
        return str('<vipy.dataset.Dataset: %s>' % ', '.join(fields))

    def __iter__(self):            
        if self.is_streaming():
            for x in self._ds:  # iterable access (faster)
                yield self._loader(x) if self._loader is not None else x                 
        else:
            for k in range(len(self)):
                yield self[k]   # random access (slower)                


    def __getitem__(self, k):
        assert self.len() is not None, "dataset does not support indexing"
        
        idx = self.index()  # convert to random access on demand
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(idx), "invalid index"
            x = self._ds[idx[int(k)]]
            x = self._loader(x) if self._loader is not None else x
            return x
        elif isinstance(k, slice):
            X = [self._ds[k] for k in idx[k.start:k.stop:k.step]]
            X = [self._loader(x) for x in X] if self._loader is not None else X
            return X
        else:
            raise ValueError('invalid slice "%s"' % type(k))            

    def raw(self):
        """Return a view of this dataset without the loader"""
        return Dataset(self._ds, loader=None)
    
    def is_streaming(self):
        return self._idx is None

    def len(self):
        return len(self._idx) if self._idx is not None else (len(self._ds) if hasattr(self._ds, '__len__') else None)

    def __len__(self):
        len = self.len()
        if len is None:
            raise ValueError('dataset has no length')
        return len
    
    def __or__(self, other):
        assert isinstance(other, Dataset)
        return Union(self, other, id=self.id())
    
    def id(self, new_id=None):
        """Change the dataset ID to the provided ID, or return it if None"""
        if new_id is not None:
            self._id = new_id
            return self
        return self._id

    def index(self, index=None, strict=False):
        """Update the index, useful for filtering of large datasets"""
        if index is not None:
            assert not strict or index is None or (len(index)>0 and len(index)<=len(self) and max(index)<len(self) and min(index)>=0)            
            self._idx = index
            return self
        if self._idx is None:
            self._idx = list(range(len(self._ds)))  # on-demand index, only if underlying dataset has known length
        return self._idx
    
        
    def clone(self, deep=False):
        """Return a copy of the dataset object"""
        if not deep:
            return copy.copy(self) 
        else:
            return copy.deepcopy(self)
    
    def shuffle(self, shuffler=None):
        """Permute elements in this dataset uniformly at random in place using the optimal shuffling strategy for the dataset structure to maximize performance.
           This method will use either Dataset.streaming_shuffler (for iterable datasets) or Dataset.uniform_shuffler (for random access datasets)
        """
        assert shuffler is None or callable(shuffler)
        shuffler = shuffler if shuffler is not None else (Dataset.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
        return shuffler(self)

    def repeat(self, n):
        """Repeat the dataset n times.  If n=0, the dataset is unchanged, if n=1 the dataset is doubled in length, etc."""
        assert n>=0
        return self.index( self.index()*(n+1) )
    
    def tuple(self, mapper=None, flatten=False, reducer=None):
        """Return the dataset as a tuple, applying the optional mapper lambda on each element, applying optional flattener on sequences returned by mapper, and applying the optional reducer lambda on the final tuple, return a generator"""
        assert mapper is None or callable(mapper)
        assert reducer is None or callable(reducer)
        mapped = self.map(mapper) if mapper else self
        flattened = (y for x in mapped for y in x) if flatten else (x for x in mapped)
        reduced = reducer(flattened) if reducer else flattened
        return reduced

    def list(self, mapper=None, flatten=False):
        """Return a tuple as a list, loading into memory"""
        return self.tuple(mapper, flatten, reducer=list)

    def set(self, mapper=None, flatten=False):
        """Return the dataset as a set.  Mapper must be a lambda function that returns a hashable type"""
        return self.tuple(mapper=mapper, reducer=set, flatten=flatten)        

    def all(self, mapper):
        return self.tuple(mapper=mapper, reducer=all)
    
    def frequency(self, f):
        """Frequency counts for which lambda returns the same value.  For example f=lambda im: im.category() returns a dictionary of category names and counts in this category"""
        return countby(self.tuple(mapper=f))

    def balanced(self, f):
        """Is the dataset balanced (e.g. the frequencies returned from the lambda f are all the same)?"""
        return len(set(self.frequency(f).values())) == 1
    
    def count(self, f):
        """Counts for each element for which lamba returns true.  
        
        Args:
            f: [lambda] if provided, count the number of elements that return true.  

        Returns:
            A length of elements that satisfy f(v) = True [if f is not None]
        """
        return len(self.tuple(f, reducer=lambda X: [x for x in X if x is True]))

    def countby(self, f):
        return self.frequency(f)
    
    def filter(self, f):
        """In place filter with lambda function f, keeping those elements obj in-place where f(obj) evaluates true.  Callable should return bool"""
        assert callable(f)
        return self.index( [i for (b,i) in zip(self.localmap(f), self.index()) if b] )
    
    def take(self, n, inplace=False):
        """Randomly Take n elements from the dataset, and return a dataset (in-place or cloned). If n is greater than the size of the dataset, sample with replacement, if n is less than the size of the dataset, sample without replacement"""
        assert isinstance(n, int) and n>0
        D = self.clone() if not inplace else self
        return D.index(list((random.sample if n<= len(self) else random.choices)(D.index(), k=n)) )


    def groupby(self, f):
        """Group the dataset according to the callable f, returning dictionary of grouped datasets."""
        assert callable(f)        
        return {k:self.clone().index([x[1] for x in v]).id('%s:%s' % (self.id(),str(k))) for (k,v) in itertools.groupby(enumerate(self.sort(f).index()), lambda x: f(self[x[0]]))}

    def takeby(self, f, n):
        """Filter the dataset according to the callable f, take n from each group and return a dataset.  Callable should return bool.  If n==1, return a singleton"""
        d = self.clone().filter(f)
        return d.take(n) if n>1 else d.takeone()

    def takelist(self, n):
        """Take n elements and return list.  The elements are loaded and not cloned."""
        return self.take(n).list()

    def takeone(self):
        """Randomly take one element from the dataset and return a singleton"""
        return self[random.randint(0, len(self)-1)]

    def sample(self):
        """Return a single element sampled uniformly at random"""
        return self.takeone()
    
    def take_fraction(self, p, inplace=False):
        """Randomly take a percentage of the dataset, returning a clone or in-place"""
        assert p>=0 and p<=1, "invalid fraction '%s'" % p
        return self.take(n=int(len(self)*p), inplace=inplace)

    def inverse_frequency(self, f):
        """Return the inverse frequency of elements grouped by the callable f.  Returns a dictionary of the callable output to inverse frequency """
        attributes = self.set(f)
        frequency = self.frequency(f)
        return {a:(1/len(attributes))*(len(self)/frequency[a]) for a in attributes}  # (normalized) inverse frequency weight
    
    def load(self):
        """Cache the entire dataset into memory"""
        return Dataset([x for x in self], id=self.id())
    
    def chunk(self, n):
        """Yield n chunks as list.  Last chunk will be ragged."""
        for (k,c) in enumerate(chunkgen(self, n)):
            yield list(c)

    def batch(self, n):
        """Yield batches of size n as datasets.  Last batch will be ragged.  Batches are not loaded.  Batches have appended id equal to the zero-indexed batch order"""
        for (k,b) in enumerate(chunkgenbysize(self, n)):  
            yield Dataset(b).id('%s:%d' % (self.id() if self.id() else '', k))
                                
    def minibatch(self, n, ragged=True, loader=None, bufsize=1024, accepter=None, preprocessor=None):
        """Yield preprocessed minibatches of size n of this dataset.

        To yield chunks of this dataset, suitable for minibatch training/testing

        ```python
        D = vipy.dataset.Dataset(...)
        for b in D.minibatch(n):
           print(b)
        ```
        
        To perform minibatch image downloading in parallel across four processes with the context manager:

        ```python
        D = vipy.dataset.registry('yfcc100m_url:train').take(128)
        with vipy.globals.parallel(4):
            for b in D.minibatch(16, loader=vipy.image.Transform.download, accepter=lambda im: im.is_downloaded()):
                print(b)  # complete minibatch that passed accepter
        ```

        Args:
            n [int]: The size of the minibatch
            ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
            bufsize [int]:  The size of the buffer used in parallel processing of elements.  Useful for parallel loading
            accepter [callable]:  A callable that returns true|false on an element, where only elements that return true are included in the minibatch.  useful for parallel loading of elements that may fail to download
            loader [callable]: A callable that is applied to every element of the dataset.  Useful for parallel loading

        Returns:        
            Iterator over `vipy.dataset.Dataset` elements of length n.  Minibatches will be yielded loaded and preprocessed (processing done concurrently if vipy.parallel.executor() is initialized)

        .. note:: The distributed iterator appends the minibatch index to the minibatch.id().  
        .. note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently

        """
        for (k,b) in enumerate(chunkgenbysize(vipy.parallel.iter(self, mapper=loader, bufsize=max(bufsize,n), accepter=accepter), n)): 
            if ragged or len(b) == n:
                yield Dataset.cast(b).id('%s:%d' % (self.id() if self.id() else '', k))                    
                    
                        
    def shift(self, m):
        """Circular shift the dataset m elements to the left, so that self[k+m] == self.shift(m)[k].  Circular shift for boundary handling so that self.shift(m)[-1] == self[m-1]"""
        return self.clone().index(self.index()[m:] + self.index()[0:m])

    def slice(self, start=0, stop=-1, step=1):
        """Slice the dataset to contain elements defined by slice(start, stop, step)"""
        return self.clone().index(self.index()[start:stop:step])
        
    def truncate(self, m):
        """Truncate the dataset to contain the first m elements only"""
        return self.slice(stop=m)
    
    def pipeline(self, n, m, ragged=True, prepad=True, postpad=True):
        """Yield pipelined minibatches of size n with pipeline length m.

        A pipelined minibatch is a tuple (head, tail) such that (head, tail) are minibatches at different indexes in the dataset.  
        Head corresponds to the current minibatch and tail corresponds to the minibatch left shifted by (m-1) minibatches.

        This structure is useful for yielding datasets for pipelined training where head contains the minibatch that will complete pipeline training on this iteration, and tail contains the 
        next minibatch to be inserted into the pipeline on this iteration.
        
        ```python
        D = vipy.dataset.Dataset(...)
        for (head, tail) in D.pipeline(n, m, prepad=False, postpad=False):
            assert head == D[0:n]
            assert tail == D[n*(m-1): n*(m-1)+n]
        ```

        Args:
            n [int]: The size of each minibatch
            m [int]:  The pipeline length in minibatches
            ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
            prepad: If true, yield (head, tail) == (None, batch) when filling the pipeline
            postpad: If true, yield (head, tail) == (batch, None) when flushing the pipeline
        
        Returns:        
            Iterator over tuples (head,tail) of `vipy.dataset.Dataset` elements of length n where tail is left shifted by n*(m-1) elements. 
        
        .. note::  The distributed iterator is not order preserving over minibatches and yields minibatches as completed, however the tuple (head, tail) is order preserving within the pipeline
        .. note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently
        
        """
        pipeline = [] 
        for (k,b) in enumerate(self.minibatch(n, ragged=ragged)):  # not order preserving
            pipeline.append(b)  # order preserving within pipeline                        
            if k < m-1:
                if prepad:
                    yield( (None, b) )  
            else:
                yield( (pipeline.pop(0), b) )  # yield deque-like (minibatch, shifted minibatch) tuples
        for p in pipeline:
            if postpad:
                yield( (p, None) )


    def chunks(self, sizes):
        """Partition the dataset into chunks of size given by the tuple in partitions, and give the dataset suffix if provided"""
        assert sum(sizes) == len(self)

        i = 0
        datasets = []
        for n in sizes:
            datasets.append(self.clone().index(self.index()[i:i+n]))
            i += n
        return datasets
        
    def partition(self, trainfraction=0.9, valfraction=0.1, testfraction=0, trainsuffix=':train', valsuffix=':val', testsuffix=':test'):
        """Partition the dataset into the requested (train,val,test) fractions.  

        Args:
            trainfraction [float]: fraction of dataset for training set
            valfraction [float]: fraction of dataset for validation set
            testfraction [float]: fraction of dataset for test set
            trainsuffix: If not None, append this string to the trainset ID
            valsuffix: If not None, append this string to the valset ID
            testsuffix: If not None, append this string to the testset ID        
        
        Returns:        
            (trainset, valset, testset) such that trainset is the first trainfraction of the dataset.  

        .. note:: This does not permute the dataset.  To randomize split, shuffle dataset first

        """
        assert trainfraction >=0 and trainfraction <= 1, "invalid training set fraction '%f'" % trainfraction
        assert valfraction >=0 and valfraction <= 1, "invalid validation set fraction '%f'" % valfraction
        assert testfraction >=0 and testfraction <= 1, "invalid test set fraction '%f'" % testfraction
        assert abs(trainfraction + valfraction + testfraction - 1) < 1E-6, "fractions must sum to one"
        
        idx = self.index()
        (testidx, validx, trainidx) = dividelist(idx, (testfraction, valfraction, trainfraction))
            
        trainset = self.clone().index(trainidx)
        if trainsuffix and trainset.id():
            trainset.id(trainset.id() + trainsuffix)
        
        valset = self.clone().index(validx)
        if valsuffix and valset.id():
            valset.id(valset.id() + valsuffix)
        
        testset = self.clone().index(testidx)
        if testsuffix and testset.id():
            testset.id(testset.id() + testsuffix)
                
        return (trainset,valset,testset) if testfraction!=0 else (trainset, valset)

    def split(self, size):
        """Split the dataset into two datasets, one of length size, the other of length len(self)-size"""
        assert isinstance(size, int) and size>=0 and size<len(self)
        return self.partition(size/len(self), (len(self)-size)/len(self), 0, '', '', '')

    def even_split(self):
        """Split the dataset into two datasets, each half the size of the dataset.  If the dataset length is odd, then one element will be dropped"""
        return self.chunks((len(self)//2, len(self)//2, len(self)%2))[0:2]
        
    def streaming_map(self, mapper, accepter=None, bufsize=1024):
        """Returns a generator that will apply the mapper and yield only those elements that return True from the accepter.  Performs the map in parallel if used in the vipy.globals.parallel context manager"""
        return vipy.parallel.iter(self, mapper=mapper, accepter=accepter, bufsize=bufsize)
        
    def map(self, f_map, strict=True, oneway=False, ordered=False):        
        """Parallel map.

        To perform this in parallel across four threads:

        ```python
        D = vipy.dataset.Dataset(...)
        with vipy.globals.parallel(4):
            D = D.map(lambda v: ...)
        ```

        Args:
            f_map: [lambda] The lambda function to apply in parallel to all elements in the dataset.  This must return a JSON serializable object (or set oneway=True)
            strict: [bool] If true, raise exception on distributed map failures, otherwise the map will return only those that succeeded
            oneway: [bool] If true, do not pass back results unless exception.  This is useful for distributed processing
        
        Returns:
            A `vipy.dataset.Dataset` containing the elements f_map(v).  This operation is order preserving if ordered=True.

        .. note:: 
            - This method uses dask distributed and `vipy.batch.Batch` operations
            - Due to chunking, all error handling is caught by this method.  Use `vipy.batch.Batch` to leverage dask distributed futures error handling.
        - Operations must be chunked and serialized because each dask task comes with overhead, and lots of small tasks violate best practices
        - Serialized results are deserialized by the client and returned as a new dataset
        """
        assert f_map is None or callable(f_map), "invalid map function"

        # Identity
        if f_map is None:
            return self        

        # Parallel map 
        elif vipy.globals.cf() is not None:
            # This will fail on multiprocessing if dataset contains a loader lambda, or any element in the dataset contains a loader.  Use distributed instead
            assert ordered == False, "not order preserving, use localmap()"
            return Dataset(tuple(vipy.parallel.map(f_map, self)), id=self.id()) 
                                              
        # Distributed map
        elif vipy.globals.dask() is not None:
            from vipy.batch import Batch   # requires pip install vipy[all]                
            f_serialize = lambda x: x
            f_deserialize = lambda x: x
            f_oneway = lambda x, oneway=oneway: x if not x[0] or not oneway else (x[0], None)
            f_catcher = lambda f, *args, **kwargs: catcher(f, *args, **kwargs)  # catch exceptions when executing lambda, return (True, result) or (False, exception)
            f = lambda x, f_serializer=f_serialize, f_deserializer=f_deserialize, f_map=f_map, f_catcher=f_catcher, f_oneway=f_oneway: f_serializer(f_oneway(f_catcher(f_map, f_deserializer(x))))  # with closure capture
            
            S = [f_serialize(v) for v in self]  # local load, preprocess and serialize
            B = Batch(chunklist(S, 128), strict=False, warnme=False, minscatter=128)
            S = B.map(lambda X,f=f: [f(x) for x in X]).result()  # distributed, chunked, with caught exceptions, may return empty list
            V = [f_deserialize(x) for s in S for x in s]  # Local deserialization and chunk flattening
            
            # Error handling
            (good, bad) = ([r for (b,r) in V if b], [r for (b,r) in V if not b])  # catcher returns (True, result) or (False, exception string)
            if len(bad)>0:
                log.warning('Exceptions in distributed processing:\n%s\n\n[vipy.dataset.Dataset.map]: %d/%d items failed' % (str(bad), len(bad), len(self)))
                if strict:
                    raise ValueError('exceptions in distributed processing')
            return Dataset(good, id=self.id()) if not oneway else None

        # Local map
        else:
            return self.localmap(f_map)
        
    def localmap(self, f):
        """A map performed without any parallel processing"""
        return Dataset([f(x) for x in self], id=self.id())  # triggers load into memory        

    def zip(self, iter):
        """Returns a new dataset constructed by applying the callable on elements from zip(self,iter)"""
        return Dataset(zip(self,iter))
    
    def sort(self, f):
        """Sort the dataset in-place using the sortkey lambda function f

        To perform a sort of the dataset using some property of the instance, such as the object category (e.g. for vipy.image.ImageCategory) 

        ```python
        dataset.sort(lambda im: im.category())
        ```
        """
        idx = self.index()
        return self.index( [idx[j] for (j,x) in sorted(zip(range(len(self)), self.tuple(f)), key=lambda x: x[1])] )

    
    @staticmethod
    def uniform_shuffler(D):
        """A uniform shuffle on the dataset elements.  Iterable access will be slow due to random access"""
        idx = D.index()
        random.shuffle(idx)
        return D.index(idx)

    @staticmethod
    def streaming_shuffler(D):
        """A uniform shuffle (approximation) on the dataset elements for iterable access only"""
        assert D._idx is None, "streaming only"
        
        try_import('datasets', 'datasets'); from datasets import Dataset as HuggingfaceDataset;
        
        if isinstance(D._ds, (list, tuple)):
            D._ds = list(D._ds)
            random.shuffle(D._ds)  # in-place shuffle objects
                
        elif isinstance(D._ds, HuggingfaceDataset):
            # Special case: Arrow backed dataset            
            D._ds = D._ds.to_iterable_dataset()  # no random access
            D._ds = D._ds.shuffle()  # shuffle() returns a new IterableDataset; approximate shuffling is much more efficient for __iter__
        else:
            raise ValueError('shuffle error')
        return D
    
    @staticmethod
    def identity_shuffler(D):
        """Shuffler that does nothing"""
        return D

Subclasses

Static methods

def cast(obj)
def from_directory(indir, filetype='json', id=None)

Recursively search indir for filetype, construct a dataset from all discovered files of that type

def from_image_urls(urls, id=None)

Construct a dataset from a list of image URLs
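
A brief sketch combining the two constructors documented above; the directory path and URLs below are placeholders.

```python
import vipy.dataset

# build a dataset from every json file found recursively under a directory
D = vipy.dataset.Dataset.from_directory('/path/to/annotations', filetype='json')

# build a dataset of lazy-loaded images from a list of URLs (non-image URLs are skipped)
U = vipy.dataset.Dataset.from_image_urls(['https://example.com/a.jpg',
                                          'https://example.com/b.jpg'], id='my_urls')
```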

def from_json(jsonfile, id=None)
def identity_shuffler(D)

Shuffler that does nothing

Expand source code Browse git
@staticmethod
def identity_shuffler(D):
    """Shuffler that does nothing"""
    return D
def streaming_shuffler(D)

A uniform shuffle (approximation) on the dataset elements for iterable access only

Expand source code Browse git
@staticmethod
def streaming_shuffler(D):
    """A uniform shuffle (approximation) on the dataset elements for iterable access only"""
    assert D._idx is None, "streaming only"
    
    try_import('datasets', 'datasets'); from datasets import Dataset as HuggingfaceDataset;
    
    if isinstance(D._ds, (list, tuple)):
        D._ds = list(D._ds)
        random.shuffle(D._ds)  # in-place shuffle objects
            
    elif isinstance(D._ds, HuggingfaceDataset):
        # Special case: Arrow backed dataset            
        D._ds = D._ds.to_iterable_dataset()  # no random access
        D._ds = D._ds.shuffle()  # shuffle() returns a new IterableDataset; approximate shuffling is much more efficient for __iter__
    else:
        raise ValueError('shuffle error')
    return D
def uniform_shuffler(D)

A uniform shuffle on the dataset elements. Iterable access will be slow due to random access

Expand source code Browse git
@staticmethod
def uniform_shuffler(D):
    """A uniform shuffle on the dataset elements.  Iterable access will be slow due to random access"""
    idx = D.index()
    random.shuffle(idx)
    return D.index(idx)

Methods

def all(self, mapper)
Expand source code Browse git
def all(self, mapper):
    return self.tuple(mapper=mapper, reducer=all)
def balanced(self, f)

Is the dataset balanced (e.g. the frequencies returned from the lambda f are all the same)?

Expand source code Browse git
def balanced(self, f):
    """Is the dataset balanced (e.g. the frequencies returned from the lambda f are all the same)?"""
    return len(set(self.frequency(f).values())) == 1
def batch(self, n)

Yield batches of size n as datasets. Last batch will be ragged. Batches are not loaded. Batches have appended id equal to the zero-indexed batch order

Expand source code Browse git
def batch(self, n):
    """Yield batches of size n as datasets.  Last batch will be ragged.  Batches are not loaded.  Batches have appended id equal to the zero-indexed batch order"""
    for (k,b) in enumerate(chunkgenbysize(self, n)):  
        yield Dataset(b).id('%s:%d' % (self.id() if self.id() else '', k))
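
A minimal sketch of iterating over fixed-size batches using a toy list-backed dataset; the batch size is arbitrary.

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(1000)), id='digits')

for b in D.batch(256):
    # each b is a vipy.dataset.Dataset; the last batch is ragged (length 232 here)
    print(b.id(), len(b))   # 'digits:0' 256, ..., 'digits:3' 232
```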
def chunk(self, n)

Yield n chunks as list. Last chunk will be ragged.

Expand source code Browse git
def chunk(self, n):
    """Yield n chunks as list.  Last chunk will be ragged."""
    for (k,c) in enumerate(chunkgen(self, n)):
        yield list(c)
def chunks(self, sizes)

Partition the dataset into chunks of size given by the tuple in partitions, and give the dataset suffix if provided

Expand source code Browse git
def chunks(self, sizes):
    """Partition the dataset into chunks of size given by the tuple in partitions, and give the dataset suffix if provided"""
    assert sum(sizes) == len(self)

    i = 0
    datasets = []
    for n in sizes:
        datasets.append(self.clone().index(self.index()[i:i+n]))
        i += n
    return datasets
def clone(self, deep=False)

Return a copy of the dataset object

Expand source code Browse git
def clone(self, deep=False):
    """Return a copy of the dataset object"""
    if not deep:
        return copy.copy(self) 
    else:
        return copy.deepcopy(self)
def count(self, f)

Count the number of elements for which the lambda f returns true.

Args

f
[lambda] a callable that returns a boolean for each element

Returns

The number of elements v in the dataset for which f(v) is True

Expand source code Browse git
def count(self, f):
    """Counts for each element for which lamba returns true.  
    
    Args:
        f: [lambda] if provided, count the number of elements that return true.  

    Returns:
        A length of elements that satisfy f(v) = True [if f is not None]
    """
    return len(self.tuple(f, reducer=lambda X: [x for x in X if x is True]))
def countby(self, f)
Expand source code Browse git
def countby(self, f):
    return self.frequency(f)
def even_split(self)

Split the dataset into two datasets, each half the size of the dataset. If the dataset length is odd, then one element will be dropped

Expand source code Browse git
def even_split(self):
    """Split the dataset into two datasets, each half the size of the dataset.  If the dataset length is odd, then one element will be dropped"""
    return self.chunks((len(self)//2, len(self)//2, len(self)%2))[0:2]
def filter(self, f)

In-place filter with lambda function f, keeping those elements obj for which f(obj) evaluates true. Callable should return bool

Expand source code Browse git
def filter(self, f):
    """In place filter with lambda function f, keeping those elements obj in-place where f(obj) evaluates true.  Callable should return bool"""
    assert callable(f)
    return self.index( [i for (b,i) in zip(self.localmap(f), self.index()) if b] )
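
A minimal sketch of in-place filtering on a toy list-backed dataset; with vipy images the predicate would typically inspect im.category() or similar.

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(10)))
D.filter(lambda x: x % 2 == 0)   # keeps 0, 2, 4, 6, 8 by updating the index in place
assert len(D) == 5
```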
def frequency(self, f)

Frequency counts for which the lambda returns the same value. For example, f=lambda im: im.category() returns a dictionary mapping each category name to the count of elements in that category

Expand source code Browse git
def frequency(self, f):
    """Frequency counts for which lambda returns the same value.  For example f=lambda im: im.category() returns a dictionary of category names and counts in this category"""
    return countby(self.tuple(mapper=f))
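
A minimal sketch of frequency and balanced on a toy dataset; with vipy images the lambda would typically be lambda im: im.category().

```python
import vipy.dataset

D = vipy.dataset.Dataset(['cat', 'dog', 'cat', 'bird'])

print(D.frequency(lambda x: x))   # e.g. {'cat': 2, 'dog': 1, 'bird': 1}
print(D.balanced(lambda x: x))    # False, since the counts differ
```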
def groupby(self, f)

Group the dataset according to the callable f, returning dictionary of grouped datasets.

Expand source code Browse git
def groupby(self, f):
    """Group the dataset according to the callable f, returning dictionary of grouped datasets."""
    assert callable(f)        
    return {k:self.clone().index([x[1] for x in v]).id('%s:%s' % (self.id(),str(k))) for (k,v) in itertools.groupby(enumerate(self.sort(f).index()), lambda x: f(self[x[0]]))}
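
A minimal sketch of grouping a toy dataset by a key function; each value in the returned dictionary is itself a Dataset restricted to that group.

```python
import vipy.dataset

D = vipy.dataset.Dataset([1, 2, 3, 4, 5, 6], id='numbers')

groups = D.groupby(lambda x: x % 2)     # {0: evens, 1: odds}, with ids 'numbers:0' and 'numbers:1'
for (k, d) in groups.items():
    print(k, d.list())                  # 0 [2, 4, 6] and 1 [1, 3, 5]
```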
def id(self, new_id=None)

Change the dataset ID to the provided ID, or return it if None

Expand source code Browse git
def id(self, new_id=None):
    """Change the dataset ID to the provided ID, or return it if None"""
    if new_id is not None:
        self._id = new_id
        return self
    return self._id
def index(self, index=None, strict=False)

Update the index, useful for filtering of large datasets

Expand source code Browse git
def index(self, index=None, strict=False):
    """Update the index, useful for filtering of large datasets"""
    if index is not None:
        assert not strict or index is None or (len(index)>0 and len(index)<=len(self) and max(index)<len(self) and min(index)>=0)            
        self._idx = index
        return self
    if self._idx is None:
        self._idx = list(range(len(self._ds)))  # on-demand index, only if underlying dataset has known length
    return self._idx
def inverse_frequency(self, f)

Return the inverse frequency of elements grouped by the callable f. Returns a dictionary of the callable output to inverse frequency

Expand source code Browse git
def inverse_frequency(self, f):
    """Return the inverse frequency of elements grouped by the callable f.  Returns a dictionary of the callable output to inverse frequency """
    attributes = self.set(f)
    frequency = self.frequency(f)
    return {a:(1/len(attributes))*(len(self)/frequency[a]) for a in attributes}  # (normalized) inverse frequency weight
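
A minimal sketch of the normalized inverse-frequency weights on a toy dataset; such weights are commonly used for class-balanced sampling or loss weighting.

```python
import vipy.dataset

D = vipy.dataset.Dataset(['cat', 'cat', 'cat', 'dog'])

w = D.inverse_frequency(lambda x: x)
# two classes, so 'cat' gets (1/2)*(4/3) and 'dog' gets (1/2)*(4/1), per the formula above
print(w)   # {'cat': 0.666..., 'dog': 2.0}
```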
def is_streaming(self)
Expand source code Browse git
def is_streaming(self):
    return self._idx is None
def len(self)
Expand source code Browse git
def len(self):
    return len(self._idx) if self._idx is not None else (len(self._ds) if hasattr(self._ds, '__len__') else None)
def list(self, mapper=None, flatten=False)

Return a tuple as a list, loading into memory

Expand source code Browse git
def list(self, mapper=None, flatten=False):
    """Return a tuple as a list, loading into memory"""
    return self.tuple(mapper, flatten, reducer=list)
def load(self)

Cache the entire dataset into memory

Expand source code Browse git
def load(self):
    """Cache the entire dataset into memory"""
    return Dataset([x for x in self], id=self.id())
def localmap(self, f)

A map performed without any parallel processing

Expand source code Browse git
def localmap(self, f):
    """A map performed without any parallel processing"""
    return Dataset([f(x) for x in self], id=self.id())  # triggers load into memory        
def map(self, f_map, strict=True, oneway=False, ordered=False)

Parallel map.

To perform this in parallel across four threads:

D = vipy.dataset.Dataset(...)
with vipy.globals.parallel(4):
    D = D.map(lambda v: ...)

Args

f_map
[lambda] The lambda function to apply in parallel to all elements in the dataset. This must return a JSON serializable object (or set oneway=True)
strict
[bool] If true, raise exception on distributed map failures, otherwise the map will return only those that succeeded
oneway
[bool] If true, do not pass back results unless exception. This is useful for distributed processing

Returns

A Dataset containing the elements f_map(v). This operation is order preserving if ordered=True.

Note

  • This method uses dask distributed and Batch operations
  • Due to chunking, all error handling is caught by this method. Use Batch to leverage dask distributed futures error handling.
  • Operations must be chunked and serialized because each dask task comes with overhead, and lots of small tasks violate best practices
  • Serialized results are deserialized by the client and returned as a new dataset
Expand source code Browse git
def map(self, f_map, strict=True, oneway=False, ordered=False):        
    """Parallel map.

    To perform this in parallel across four threads:

    ```python
    D = vipy.dataset.Dataset(...)
    with vipy.globals.parallel(4):
        D = D.map(lambda v: ...)
    ```

    Args:
        f_map: [lambda] The lambda function to apply in parallel to all elements in the dataset.  This must return a JSON serializable object (or set oneway=True)
        strict: [bool] If true, raise exception on distributed map failures, otherwise the map will return only those that succeeded
        oneway: [bool] If true, do not pass back results unless exception.  This is useful for distributed processing
    
    Returns:
        A `vipy.dataset.Dataset` containing the elements f_map(v).  This operation is order preserving if ordered=True.

    .. note:: 
        - This method uses dask distributed and `vipy.batch.Batch` operations
        - Due to chunking, all error handling is caught by this method.  Use `vipy.batch.Batch` to leverage dask distributed futures error handling.
        - Operations must be chunked and serialized because each dask task comes with overhead, and lots of small tasks violate best practices
        - Serialized results are deserialized by the client and returned as a new dataset
    """
    assert f_map is None or callable(f_map), "invalid map function"

    # Identity
    if f_map is None:
        return self        

    # Parallel map 
    elif vipy.globals.cf() is not None:
        # This will fail on multiprocessing if dataset contains a loader lambda, or any element in the dataset contains a loader.  Use distributed instead
        assert ordered == False, "not order preserving, use localmap()"
        return Dataset(tuple(vipy.parallel.map(f_map, self)), id=self.id()) 
                                          
    # Distributed map
    elif vipy.globals.dask() is not None:
        from vipy.batch import Batch   # requires pip install vipy[all]                
        f_serialize = lambda x: x
        f_deserialize = lambda x: x
        f_oneway = lambda x, oneway=oneway: x if not x[0] or not oneway else (x[0], None)
        f_catcher = lambda f, *args, **kwargs: catcher(f, *args, **kwargs)  # catch exceptions when executing lambda, return (True, result) or (False, exception)
        f = lambda x, f_serializer=f_serialize, f_deserializer=f_deserialize, f_map=f_map, f_catcher=f_catcher, f_oneway=f_oneway: f_serializer(f_oneway(f_catcher(f_map, f_deserializer(x))))  # with closure capture
        
        S = [f_serialize(v) for v in self]  # local load, preprocess and serialize
        B = Batch(chunklist(S, 128), strict=False, warnme=False, minscatter=128)
        S = B.map(lambda X,f=f: [f(x) for x in X]).result()  # distributed, chunked, with caught exceptions, may return empty list
        V = [f_deserialize(x) for s in S for x in s]  # Local deserialization and chunk flattening
        
        # Error handling
        (good, bad) = ([r for (b,r) in V if b], [r for (b,r) in V if not b])  # catcher returns (True, result) or (False, exception string)
        if len(bad)>0:
            log.warning('Exceptions in distributed processing:\n%s\n\n[vipy.dataset.Dataset.map]: %d/%d items failed' % (str(bad), len(bad), len(self)))
            if strict:
                raise ValueError('exceptions in distributed processing')
        return Dataset(good, id=self.id()) if not oneway else None

    # Local map
    else:
        return self.localmap(f_map)
def minibatch(self, n, ragged=True, loader=None, bufsize=1024, accepter=None, preprocessor=None)

Yield preprocessed minibatches of size n of this dataset.

To yield chunks of this dataset, suitable for minibatch training/testing

D = vipy.dataset.Dataset(...)
for b in D.minibatch(n):
   print(b)

To perform minibatch image downloading in parallel across four processes with the context manager:

D = vipy.dataset.registry('yfcc100m_url:train').take(128)
with vipy.globals.parallel(4):
    for b in D.minibatch(16, loader=vipy.image.Transform.download, accepter=lambda im: im.is_downloaded()):
        print(b)  # complete minibatch that passed accepter

Args

n [int]: The size of the minibatch
ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
bufsize [int]: The size of the buffer used in parallel processing of elements. Useful for parallel loading
accepter [callable]: A callable that returns true|false on an element, where only elements that return true are included in the minibatch. Useful for parallel loading of elements that may fail to download
loader [callable]: A callable that is applied to every element of the dataset. Useful for parallel loading

Returns:
Iterator over Dataset elements of length n. Minibatches will be yielded loaded and preprocessed (processing done concurrently if vipy.parallel.executor() is initialized)

Note: The distributed iterator appends the minibatch index to the minibatch.id().

Note: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently

Expand source code Browse git
def minibatch(self, n, ragged=True, loader=None, bufsize=1024, accepter=None, preprocessor=None):
    """Yield preprocessed minibatches of size n of this dataset.

    To yield chunks of this dataset, suitable for minibatch training/testing

    ```python
    D = vipy.dataset.Dataset(...)
    for b in D.minibatch(n):
       print(b)
    ```
    
    To perform minibatch image downloading in parallel across four processes with the context manager:

    ```python
    D = vipy.dataset.registry('yfcc100m_url:train').take(128)
    with vipy.globals.parallel(4):
        for b in D.minibatch(16, loader=vipy.image.Transform.download, accepter=lambda im: im.is_downloaded()):
            print(b)  # complete minibatch that passed accepter
    ```

    Args:
        n [int]: The size of the minibatch
        ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
        bufsize [int]:  The size of the buffer used in parallel processing of elements.  Useful for parallel loading
        accepter [callable]:  A callable that returns true|false on an element, where only elements that return true are included in the minibatch.  useful for parallel loading of elements that may fail to download
        loader [callable]: A callable that is applied to every element of the dataset.  Useful for parallel loading

    Returns:        
        Iterator over `vipy.dataset.Dataset` elements of length n.  Minibatches will be yielded loaded and preprocessed (processing done concurrently if vipy.parallel.executor() is initialized)

    .. note:: The distributed iterator appends the minibatch index to the minibatch.id().  
    .. note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently

    """
    for (k,b) in enumerate(chunkgenbysize(vipy.parallel.iter(self, mapper=loader, bufsize=max(bufsize,n), accepter=accepter), n)): 
        if ragged or len(b) == n:
            yield Dataset.cast(b).id('%s:%d' % (self.id() if self.id() else '', k))                    
def partition(self, trainfraction=0.9, valfraction=0.1, testfraction=0, trainsuffix=':train', valsuffix=':val', testsuffix=':test')

Partition the dataset into the requested (train,val,test) fractions.

Args

trainfraction [float]: fraction of dataset for training set
valfraction [float]: fraction of dataset for validation set
testfraction [float]: fraction of dataset for test set
trainsuffix
If not None, append this string to the trainset ID
valsuffix
If not None, append this string to the valset ID
testsuffix
If not None, append this string to the testset ID

Returns:
(trainset, valset, testset) such that trainset is the first trainfraction of the dataset.

Note: This does not permute the dataset. To randomize split, shuffle dataset first

Expand source code Browse git
def partition(self, trainfraction=0.9, valfraction=0.1, testfraction=0, trainsuffix=':train', valsuffix=':val', testsuffix=':test'):
    """Partition the dataset into the requested (train,val,test) fractions.  

    Args:
        trainfraction [float]: fraction of dataset for training set
        valfraction [float]: fraction of dataset for validation set
        testfraction [float]: fraction of dataset for test set
        trainsuffix: If not None, append this string to the trainset ID
        valsuffix: If not None, append this string to the valset ID
        testsuffix: If not None, append this string to the testset ID        
    
    Returns:        
        (trainset, valset, testset) such that trainset is the first trainfraction of the dataset.  

    .. note:: This does not permute the dataset.  To randomize split, shuffle dataset first

    """
    assert trainfraction >=0 and trainfraction <= 1, "invalid training set fraction '%f'" % trainfraction
    assert valfraction >=0 and valfraction <= 1, "invalid validation set fraction '%f'" % valfraction
    assert testfraction >=0 and testfraction <= 1, "invalid test set fraction '%f'" % testfraction
    assert abs(trainfraction + valfraction + testfraction - 1) < 1E-6, "fractions must sum to one"
    
    idx = self.index()
    (testidx, validx, trainidx) = dividelist(idx, (testfraction, valfraction, trainfraction))
        
    trainset = self.clone().index(trainidx)
    if trainsuffix and trainset.id():
        trainset.id(trainset.id() + trainsuffix)
    
    valset = self.clone().index(validx)
    if valsuffix and valset.id():
        valset.id(valset.id() + valsuffix)
    
    testset = self.clone().index(testidx)
    if testsuffix and testset.id():
        testset.id(testset.id() + testsuffix)
            
    return (trainset,valset,testset) if testfraction!=0 else (trainset, valset)
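
A minimal sketch following the note above: shuffle first so the split is randomized, then partition into train/val/test fractions. Note that shuffling a never-indexed list-backed dataset dispatches to the streaming shuffler, which imports the huggingface datasets package in this implementation.

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(1000)), id='toy')

D = D.shuffle()                                    # randomize order so the split is random
(trainset, valset, testset) = D.partition(0.8, 0.1, 0.1)
print(len(trainset), len(valset), len(testset))    # approximately 800 100 100
print(trainset.id())                               # 'toy:train'
```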
def pipeline(self, n, m, ragged=True, prepad=True, postpad=True)

Yield pipelined minibatches of size n with pipeline length m.

A pipelined minibatch is a tuple (head, tail) such that (head, tail) are minibatches at different indexes in the dataset.
Head corresponds to the current minibatch and tail corresponds to the minibatch left shifted by (m-1) minibatches.

This structure is useful for yielding datasets for pipelined training where head contains the minibatch that will complete pipeline training on this iteration, and tail contains the next minibatch to be inserted into the pipeline on this iteration.

D = vipy.dataset.Dataset(...)
for (head, tail) in D.pipeline(n, m, prepad=False, postpad=False):
    assert head == D[0:n]
    assert tail == D[n*(m-1): n*(m-1)+n]

Args

n [int]: The size of each minibatch
m [int]: The pipeline length in minibatches
ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
prepad
If true, yield (head, tail) == (None, batch) when filling the pipeline
postpad
If true, yield (head, tail) == (batch, None) when flushing the pipeline

Returns:
Iterator over tuples (head,tail) of Dataset elements of length n where tail is left shifted by n*(m-1) elements.

Note: The distributed iterator is not order preserving over minibatches and yields minibatches as completed, however the tuple (head, tail) is order preserving within the pipeline

Note: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently

Expand source code Browse git
def pipeline(self, n, m, ragged=True, prepad=True, postpad=True):
    """Yield pipelined minibatches of size n with pipeline length m.

    A pipelined minibatch is a tuple (head, tail) such that (head, tail) are minibatches at different indexes in the dataset.  
    Head corresponds to the current minibatch and tail corresponds to the minibatch left shifted by (m-1) minibatches.

    This structure is useful for yielding datasets for pipelined training where head contains the minibatch that will complete pipeline training on this iteration, and tail contains the 
    next minibatch to be inserted into the pipeline on this iteration.
    
    ```python
    D = vipy.dataset.Dataset(...)
    for (head, tail) in D.pipeline(n, m, prepad=False, postpad=False):
        assert head == D[0:n]
        assert tail == D[n*(m-1): n*(m-1)+n]
    ```

    Args:
        n [int]: The size of each minibatch
        m [int]:  The pipeline length in minibatches
        ragged [bool]: If ragged=true, then the last chunk will be ragged with len(chunk)<n, else skipped
        prepad: If true, yield (head, tail) == (None, batch) when filling the pipeline
        postpad: If true, yield (head, tail) == (batch, None) when flushing the pipeline
    
    Returns:        
        Iterator over tuples (head,tail) of `vipy.dataset.Dataset` elements of length n where tail is left shifted by n*(m-1) elements. 
    
    .. note::  The distributed iterator is not order preserving over minibatches and yields minibatches as completed, however the tuple (head, tail) is order preserving within the pipeline
    .. note:: If there exists a vipy.parallel.executor(), then loading and preprocessing will be performed concurrently
    
    """
    pipeline = [] 
    for (k,b) in enumerate(self.minibatch(n, ragged=ragged)):  # not order preserving
        pipeline.append(b)  # order preserving within pipeline                        
        if k < m-1:
            if prepad:
                yield( (None, b) )  
        else:
            yield( (pipeline.pop(0), b) )  # yield deque-like (minibatch, shifted minibatch) tuples
    for p in pipeline:
        if postpad:
            yield( (p, None) )
def raw(self)

Return a view of this dataset without the loader

Expand source code Browse git
def raw(self):
    """Return a view of this dataset without the loader"""
    return Dataset(self._ds, loader=None)
def repeat(self, n)

Repeat the dataset n times. If n=0, the dataset is unchanged, if n=1 the dataset is doubled in length, etc.

Expand source code Browse git
def repeat(self, n):
    """Repeat the dataset n times.  If n=0, the dataset is unchanged, if n=1 the dataset is doubled in length, etc."""
    assert n>=0
    return self.index( self.index()*(n+1) )
def sample(self)

Return a single element sampled uniformly at random

Expand source code Browse git
def sample(self):
    """Return a single element sampled uniformly at random"""
    return self.takeone()
def set(self, mapper=None, flatten=False)

Return the dataset as a set. Mapper must be a lambda function that returns a hashable type

Expand source code Browse git
def set(self, mapper=None, flatten=False):
    """Return the dataset as a set.  Mapper must be a lambda function that returns a hashable type"""
    return self.tuple(mapper=mapper, reducer=set, flatten=flatten)        
def shift(self, m)

Circular shift the dataset m elements to the left, so that self[k+m] == self.shift(m)[k]. Circular shift for boundary handling so that self.shift(m)[-1] == self[m-1]

Expand source code Browse git
def shift(self, m):
    """Circular shift the dataset m elements to the left, so that self[k+m] == self.shift(m)[k].  Circular shift for boundary handling so that self.shift(m)[-1] == self[m-1]"""
    return self.clone().index(self.index()[m:] + self.index()[0:m])
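
A minimal sketch illustrating the circular-shift property stated above on a toy dataset.

```python
import vipy.dataset

D = vipy.dataset.Dataset([10, 20, 30, 40, 50])
S = D.shift(2)

assert S[0] == D[2]      # self.shift(m)[k] == self[k+m]
assert S[-1] == D[1]     # circular boundary: self.shift(m)[-1] == self[m-1]
print(S.list())          # [30, 40, 50, 10, 20]
```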
def shuffle(self, shuffler=None)

Permute elements in this dataset uniformly at random in place using the optimal shuffling strategy for the dataset structure to maximize performance. This method will use either Dataset.streaming_shuffler (for iterable datasets) or Dataset.uniform_shuffler (for random access datasets)

Expand source code Browse git
def shuffle(self, shuffler=None):
    """Permute elements in this dataset uniformly at random in place using the optimal shuffling strategy for the dataset structure to maximize performance.
       This method will use either Dataset.streaming_shuffler (for iterable datasets) or Dataset.uniform_shuffler (for random access datasets)
    """
    assert shuffler is None or callable(shuffler)
    shuffler = shuffler if shuffler is not None else (Dataset.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
    return shuffler(self)
def slice(self, start=0, stop=-1, step=1)

Slice the dataset to contain elements defined by slice(start, stop, step)

def slice(self, start=0, stop=-1, step=1):
    """Slice the dataset to contain elements defined by slice(start, stop, step)"""
    return self.clone().index(self.index()[start:stop:step])
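
For example, on a toy dataset (note that the default stop=-1 excludes the final element unless an explicit stop is given):

```python
import vipy.dataset

D = vipy.dataset.Dataset(list('abcdef'), id='toy')
assert D.slice(0, 4, 2).list() == ['a', 'c']   # elements at indices 0 and 2
assert len(D.slice()) == 5                     # default stop=-1 drops the last element
```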
def sort(self, f)

Sort the dataset in-place using the sortkey lambda function f

To perform a sort of the dataset using some property of the instance, such as the object category (e.g. for vipy.image.ImageCategory)

dataset.sort(lambda im: im.category())
def sort(self, f):
    """Sort the dataset in-place using the sortkey lambda function f

    To perform a sort of the dataset using some property of the instance, such as the object category (e.g. for vipy.image.ImageCategory) 

    ```python
    dataset.sort(lambda im: im.category())
    ```
    """
    idx = self.index()
    return self.index( [idx[j] for (j,x) in sorted(zip(range(len(self)), self.tuple(f)), key=lambda x: x[1])] )
def split(self, size)

Split the dataset into two datasets, one of length size, the other of length len(self)-size

def split(self, size):
    """Split the dataset into two datasets, one of length size, the other of length len(self)-size"""
    assert isinstance(size, int) and size>=0 and size<len(self)
    return self.partition(size/len(self), (len(self)-size)/len(self), 0, '', '', '')
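
A sketch on a toy dataset; split(size) delegates to partition with the corresponding fractions, so the two returned datasets hold size and len(self)-size elements per the docstring:

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(10)), id='toy')
(A, B) = D.split(3)
print(len(A), len(B))   # expected 3 and 7, per the docstring
```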
def streaming_map(self, mapper, accepter=None, bufsize=1024)

Returns a generator that will apply the mapper and yield only those elements that return True from the accepter. Performs the map in parallel if used in the vipy.globals.parallel context manager

def streaming_map(self, mapper, accepter=None, bufsize=1024):
    """Returns a generator that will apply the mapper and yield only those elements that return True from the accepter.  Performs the map in parallel if used in the vipy.globals.parallel context manager"""
    return vipy.parallel.iter(self, mapper=mapper, accepter=accepter, bufsize=bufsize)
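
A sketch of streaming a mapper with an accepter over a toy dataset; the accepter is assumed here to filter the mapped values, and the loop runs concurrently only when a vipy parallel context (per the docstring) is active:

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(100)), id='toy')

# square each element and keep only the even results (accepter assumed to see mapped values)
for y in D.streaming_map(mapper=lambda x: x * x, accepter=lambda y: y % 2 == 0):
    print(y)
```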
def take(self, n, inplace=False)

Randomly take n elements from the dataset and return a dataset (in-place or cloned). If n is greater than the size of the dataset, sample with replacement; otherwise sample without replacement

def take(self, n, inplace=False):
    """Randomly Take n elements from the dataset, and return a dataset (in-place or cloned). If n is greater than the size of the dataset, sample with replacement, if n is less than the size of the dataset, sample without replacement"""
    assert isinstance(n, int) and n>0
    D = self.clone() if not inplace else self
    return D.index(list((random.sample if n<= len(self) else random.choices)(D.index(), k=n)) )
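
For example, on a toy dataset of ten elements:

```python
import vipy.dataset

D = vipy.dataset.Dataset(list(range(10)), id='toy')
assert len(D.take(3)) == 3     # n <= len(D): sampled without replacement
assert len(D.take(20)) == 20   # n > len(D): sampled with replacement
assert len(D) == 10            # inplace=False (the default) leaves D unchanged
```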
def take_fraction(self, p, inplace=False)

Randomly take a percentage of the dataset, returning a clone or in-place

def take_fraction(self, p, inplace=False):
    """Randomly take a percentage of the dataset, returning a clone or in-place"""
    assert p>=0 and p<=1, "invalid fraction '%s'" % p
    return self.take(n=int(len(self)*p), inplace=inplace)
def takeby(self, f, n)

Filter the dataset according to the callable f (which should return bool), then randomly take n of the matching elements and return a dataset. If n==1, return a singleton

def takeby(self, f, n):
    """Filter the dataset according to the callable f, take n from each group and return a dataset.  Callable should return bool.  If n==1, return a singleton"""
    d = self.clone().filter(f)
    return d.take(n) if n>1 else d.takeone()
def takelist(self, n)

Take n elements and return list. The elements are loaded and not cloned.

def takelist(self, n):
    """Take n elements and return list.  The elements are loaded and not cloned."""
    return self.take(n).list()
def takeone(self)

Randomly take one element from the dataset and return a singleton

def takeone(self):
    """Randomly take one element from the dataset and return a singleton"""
    return self[random.randint(0, len(self)-1)]
def truncate(self, m)

Truncate the dataset to contain the first m elements only

def truncate(self, m):
    """Truncate the dataset to contain the first m elements only"""
    return self.slice(stop=m)
def tuple(self, mapper=None, flatten=False, reducer=None)

Return the dataset as a tuple: apply the optional mapper lambda to each element, optionally flatten sequences returned by the mapper, and apply the optional reducer lambda to the result. If no reducer is provided, a generator is returned

def tuple(self, mapper=None, flatten=False, reducer=None):
    """Return the dataset as a tuple, applying the optional mapper lambda on each element, applying optional flattener on sequences returned by mapper, and applying the optional reducer lambda on the final tuple, return a generator"""
    assert mapper is None or callable(mapper)
    assert reducer is None or callable(reducer)
    mapped = self.map(mapper) if mapper else self
    flattened = (y for x in mapped for y in x) if flatten else (x for x in mapped)
    reduced = reducer(flattened) if reducer else flattened
    return reduced
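
A sketch combining a mapper and a reducer; here collections.Counter is used as the reducer to count mapped values on a toy dataset:

```python
import collections
import vipy.dataset

D = vipy.dataset.Dataset(['cat', 'dog', 'cat', 'bird'], id='toy')
counts = D.tuple(mapper=lambda x: x[0], reducer=collections.Counter)   # count by first letter
print(counts)   # Counter({'c': 2, 'd': 1, 'b': 1})
```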
def zip(self, iter)

Returns a new dataset constructed from the element tuples of zip(self, iter)

def zip(self, iter):
    """Returns a new dataset constructed by applying the callable on elements from zip(self,iter)"""
    return Dataset(zip(self,iter))
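
A sketch on a toy dataset, assuming the resulting zip-backed dataset is consumed by a single iteration (the underlying zip iterator is not random access):

```python
import vipy.dataset

D = vipy.dataset.Dataset(['a', 'b', 'c'], id='toy')
for pair in D.zip(range(3)):
    print(pair)   # ('a', 0), ('b', 1), ('c', 2)
```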
class Paged (pagelist, loader, id=None, strict=True, index=None, cachesize=32)

Paged dataset.

A paged dataset is a dataset of length N=M*P constructed from M archive files (the pages) each containing P elements (the pagesize).
The paged dataset must be constructed with tuples of (pagesize, filename).
Pages are fetched, loaded and cached on demand by the loader, keeping at most cachesize recently loaded pages in memory

D = vipy.dataset.Paged([(64, 'archive1.pkl'), (64, 'archive2.pkl')], lambda x,y: vipy.load(y))

.. note:: Shuffling this dataset is biased. Shuffling will be performed to mix the indexes, but not uniformly at random. The goal is to preserve data locality to minimize cache misses.

class Paged(Dataset):
    """ Paged dataset.

    A paged dataset is a dataset of length N=M*P constructed from M archive files (the pages) each containing P elements (the pagesize).  
    The paged dataset must be constructed with tuples of (pagesize, filename).  
    Pages are fetched, loaded and cached on demand by the loader, keeping at most cachesize recently loaded pages in memory

    ```python
    D = vipy.dataset.Paged([(64, 'archive1.pkl'), (64, 'archive2.pkl')], lambda x,y: vipy.load(y))
    ```

    .. note:: Shuffling this dataset is biased.  Shuffling will be performed to mix the indexes, but not uniformly at random.  The goal is to preserve data locality to minimize cache misses.
    """
    
    def __init__(self, pagelist, loader, id=None, strict=True, index=None, cachesize=32):        
        super().__init__(dataset=pagelist,
                         id=id,
                         loader=loader).index(index if index else list(range(sum([p[0] for p in pagelist]))))

        assert callable(loader), "page loader required"
        assert not strict or len(set([x[0] for x in self._ds])) == 1  # pagesizes all the same 

        self._cachesize = cachesize
        self._pagecache = {}
        self._ds = list(self._ds)
        self._pagesize = self._ds[0][0]  # (pagesize, pklfile) tuples        

    def shuffle(self, shuffler=None):
        """Permute elements while preserve page locality to minimize cache misses"""
        shuffler = shuffler if shuffler is not None else functools.partial(Paged.chunk_shuffler, chunksize=int(1.5*self._pagesize))
        return shuffler(self)
        
    def __getitem__(self, k):
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(self._idx), "invalid index"
            page = self._idx[int(k)] // self._pagesize
            if page not in self._pagecache:
                self._pagecache[page] = self._loader(*self._ds[page])  # load and cache new page
                if len(self._pagecache) > self._cachesize:
                    self._pagecache.pop(list(self._pagecache.keys())[0])  # remove oldest
            x = self._pagecache[page][self._idx[int(k)] % self._pagesize]  # offset within the page, consistent with the (possibly shuffled) index
            return x
        elif isinstance(k, slice):
            return [self[i] for i in range(len(self))[k.start if k.start else 0:k.stop if k.stop else len(self):k.step if k.step else 1]]  # expensive
        else:
            raise ValueError('invalid index type "%s"' % type(k))            

    def flush(self):
        self._pagecache = {}
        return self

    @staticmethod
    def chunk_shuffler(D, chunker, chunksize=64):
        """Split dataset into len(D)/chunksize non-overlapping chunks with some common property returned by chunker, shuffle chunk order and shuffle within chunks.  

           - If chunksize=1 then this is equivalent to uniform_shuffler
           - chunker must be a callable of some property that is used to group into chunks
            
        """
        assert callable(chunker)
        return D.randomize().sort(chunker).index([i for I in shufflelist([shufflelist(I) for I in chunkgenbysize(D.index(), chunksize)]) for i in I])
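
A more complete sketch that writes two pickled pages to a temporary directory and reads them back on demand; the pickle-based loader below is illustrative, and any per-page deserializer matching the (pagesize, filename) calling convention shown in __getitem__ would work:

```python
import os, pickle, tempfile
import vipy.dataset

# build two pages of 64 consecutive integers each
pagesize = 64
tmpdir = tempfile.mkdtemp()
pagelist = []
for k in range(2):
    f = os.path.join(tmpdir, 'page%d.pkl' % k)
    with open(f, 'wb') as fh:
        pickle.dump(list(range(k * pagesize, (k + 1) * pagesize)), fh)
    pagelist.append((pagesize, f))

# the loader is called with the (pagesize, filename) tuple unpacked as two arguments
D = vipy.dataset.Paged(pagelist, loader=lambda n, f: pickle.load(open(f, 'rb')))
print(len(D), D[0], D[100])   # pages are fetched and cached on demand
```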

Ancestors

vipy.dataset.Dataset

Static methods

def chunk_shuffler(D, chunker, chunksize=64)

Split dataset into len(D)/chunksize non-overlapping chunks with some common property returned by chunker, shuffle chunk order and shuffle within chunks.

  • If chunksize=1 then this is equivalent to uniform_shuffler
  • chunker must be a callable of some property that is used to group into chunks
@staticmethod
def chunk_shuffler(D, chunker, chunksize=64):
    """Split dataset into len(D)/chunksize non-overlapping chunks with some common property returned by chunker, shuffle chunk order and shuffle within chunks.  

       - If chunksize=1 then this is equivalent to uniform_shuffler
       - chunker must be a callable of some property that is used to group into chunks
        
    """
    assert callable(chunker)
    return D.randomize().sort(chunker).index([i for I in shufflelist([shufflelist(I) for I in chunkgenbysize(D.index(), chunksize)]) for i in I])

Methods

def flush(self)
def flush(self):
    self._pagecache = {}
    return self
def shuffle(self, shuffler=None)

Permute elements while preserving page locality to minimize cache misses

def shuffle(self, shuffler=None):
    """Permute elements while preserve page locality to minimize cache misses"""
    shuffler = shuffler if shuffler is not None else functools.partial(Paged.chunk_shuffler, chunksize=int(1.5*self._pagesize))
    return shuffler(self)

Inherited members

class Union (*args, **kwargs)

vipy.dataset.Union() class

Common class to manipulate groups of vipy.dataset.Dataset objects in parallel

Usage

>>> cifar10 = vipy.dataset.registry('cifar10')
>>> mnist = vipy.dataset.registry('mnist')
>>> dataset = vipy.dataset.Union(mnist, cifar10)
>>> dataset = mnist | cifar10

Args

*args: one or more vipy.dataset.Dataset objects to combine
id [str]: an optional id for the union

class Union(Dataset):
    """vipy.dataset.Union() class
    
    Common class to manipulate groups of vipy.dataset.Dataset objects in parallel

    Usage:
    
        >>> cifar10 = vipy.dataset.registry('cifar10')
        >>> mnist = vipy.dataset.registry('mnist')
        >>> dataset = vipy.dataset.Union(mnist, cifar10)
        >>> dataset = mnist | cifar10

    Args:
        *args: one or more vipy.dataset.Dataset objects to combine
        id [str]: an optional id for the union
    """

    __slots__ = ('_id', '_ds', '_idx', '_loader', '_type')    
    def __init__(self, *args, **kwargs):
        assert all(isinstance(d, (Dataset, )) for d in args), "invalid datasets"
        
        datasets = [d for d in args]  # order preserving
        assert all([isinstance(d, Dataset) for d in datasets]), "Invalid datasets '%s'" % str([type(d) for d in datasets])

        datasets = [j for i in datasets for j in (i.datasets() if isinstance(i, Union) else (i,))]  # flatten unions        
        self._ds = datasets
        self._idx = None
        self._id = kwargs['id'] if 'id' in kwargs else None

        self._loader = None  # individual datasets have loaders
        self._type = None

    def is_streaming(self):
        return self._idx is None and all(d.is_streaming() for d in self.datasets())

    def __len__(self):
        return sum(d.__len__() for d in self.datasets()) if self._idx is None else len(self._idx)
    
    def __iter__(self):
        if self.is_streaming():
            k = -1
            iter = [d.__iter__() for d in self.datasets()]  # round-robin

            for m in range(len(self.datasets())):
                try:
                    while True:
                        k = (k + 1) % len(iter)                                    
                        yield next(iter[k])  # assumes ordered
                except StopIteration:
                    iter.pop(k)
                    k -= 1
            
        else:
            self.index()  # force random access                    
            for (i,j) in self._idx:
                yield self._ds[i][j]  # random access (slower)                

    def __getitem__(self, k):
        self.index()  # force random access        
        if isinstance(k, (int, np.uint64)):
            assert abs(k) < len(self._idx), "invalid index"
            (i,j) = self._idx[int(k)]            
            return self._ds[i][j]
        elif isinstance(k, slice):
            return [self._ds[i][j] for (i,j) in self._idx[k.start:k.stop:k.step]]
        else:
            raise ValueError('invalid index type "%s"' % type(k))

    def __repr__(self):
        fields = ['id=%s' % truncate_string(self.id(), maxlen=64)] if self.id() else []
        fields += ['len=%d' % len(self)]
        fields += ['union=%s' % str(tuple([truncate_string(d.id(), maxlen=80) for d in self._ds]))]
        return str('<vipy.dataset.Dataset: %s>' % (', '.join(fields)))

    def index(self, index=None, strict=False):
        """Update the index, useful for filtering of large datasets"""
        if index is not None:
            self._idx = index
            return self
        if self._idx is None:
            # Index on-demand: zipped (dataset index, element index) tuples, in round-robin dataset order [(0,0),(1,0),...,(0,n),(1,n),...]            
            lengths = [len(d) for d in self.datasets()]            
            self._idx = [c for r in [[(i,j) for i in range(len(self.datasets()))] for j in range(max(lengths))] for c in r if c[1]<lengths[c[0]]]
        return self._idx
    
    def clone(self, deep=False):
        """Return a copy of the dataset object"""
        D = super().clone(deep=deep)
        D._ds =  [d.clone(deep=deep) for d in D._ds]
        return D
    
    def datasets(self):
        """Return the dataset union elements, useful for generating unions of unions"""
        return list(self._ds)

    def shuffle(self, shuffler=None):
        """Permute elements in this dataset uniformly at random in place using the best shuffler for the dataset structure"""
        shuffler = shuffler if shuffler is not None else (Union.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
        return shuffler(self)
    
    @staticmethod
    def streaming_shuffler(D):
        """A uniform shuffle (approximation) on the dataset elements for iterable access only"""
        assert D._idx is None, "iterable dataset only"
        D._ds = [Dataset.streaming_shuffler(d) for d in D._ds]  # shuffle dataset shards
        random.shuffle(D._ds)  # shuffle union order
        return D
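
A short sketch of combining two toy datasets:

```python
import vipy.dataset

A = vipy.dataset.Dataset([1, 2, 3], id='a')
B = vipy.dataset.Dataset(['x', 'y'], id='b')
U = vipy.dataset.Union(A, B)

assert len(U) == len(A) + len(B)
assert len(U.datasets()) == 2
print(list(U))   # round-robin interleaving, e.g. [1, 'x', 2, 'y', 3]
```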

Ancestors

vipy.dataset.Dataset

Methods

def datasets(self)

Return the dataset union elements, useful for generating unions of unions

def datasets(self):
    """Return the dataset union elements, useful for generating unions of unions"""
    return list(self._ds)
def is_streaming(self)
def is_streaming(self):
    return self._idx is None and all(d.is_streaming() for d in self.datasets())
def shuffle(self, shuffler=None)

Permute elements in this dataset uniformly at random in place using the best shuffler for the dataset structure

def shuffle(self, shuffler=None):
    """Permute elements in this dataset uniformly at random in place using the best shuffler for the dataset structure"""
    shuffler = shuffler if shuffler is not None else (Union.streaming_shuffler if self.is_streaming() else Dataset.uniform_shuffler)
    return shuffler(self)

Inherited members