Module vipy.data.activitynet

Expand source code Browse git
import os
from vipy.util import filetail, remkdir, readjson, groupbyasdict
import vipy.downloader
from vipy.video import VideoCategory, Video
import numpy as np


# Download location of the ActivityNet v1.3 annotation index (a single JSON file).
# Dataset landing page: http://activity-net.org/download.html
# NOTE(review): this is a raw EC2 hostname served over plain HTTP -- confirm it is still the live mirror
URL = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/files/activity_net.v1-3.min.json'


class ActivityNet(object):
    """Accessor for the ActivityNet v1.3 annotation index (``activity_net.v1-3.min.json``).

    The JSON index is fetched into ``datadir`` on construction when it is not
    already there.  The ``trainset()``/``valset()``/``testset()`` methods turn the
    entries of its ``database`` mapping into vipy video objects.
    """

    def __init__(self, datadir):
        """ActivityNet, provide a datadir='/path/to/store/activitynet'.

        The annotation JSON is downloaded into ``datadir`` if it is not already present.
        """
        self._url = URL
        # remkdir comes from vipy.util; presumably it creates the directory when missing -- confirm there
        self.datadir = remkdir(datadir)
        if not self._isdownloaded():
            self.download()

    def __repr__(self):
        return str('<vipy.data.activitynet: "%s">' % self.datadir)

    def _jsonfile(self):
        """Return the local path of the annotation JSON, named after the tail of the download URL."""
        return os.path.join(self.datadir, filetail(self._url))

    def _database(self):
        """Load the annotation JSON and return its 'database' mapping.

        Raises:
            AssertionError: if the annotation JSON is not present in ``datadir``.
        """
        assert self._isdownloaded(), "Dataset not downloaded.  download() first or manually download '%s' into '%s'" % (self._url, self.datadir)
        return readjson(self._jsonfile())['database']

    def download(self):
        """Download the annotation JSON into ``datadir`` and return self so calls can be chained."""
        vipy.downloader.download(self._url, self._jsonfile())
        return self

    def _isdownloaded(self):
        """Return True if the annotation JSON already exists in ``datadir``."""
        return os.path.exists(self._jsonfile())

    def _dataset(self, subset):
        """Return one VideoCategory per annotation of every database entry whose 'subset' equals ``subset``.

        Each annotation contributes its 'label' as the category and the two
        elements of its 'segment' as start and end seconds.
        """
        return [VideoCategory(url=v['url'],
                              filename=os.path.join(self.datadir, youtubeid),
                              category=a['label'],
                              startsec=float(a['segment'][0]),
                              endsec=float(a['segment'][1]))
                for (youtubeid, v) in self._database().items()
                if v['subset'] == subset  # decide per video, before expanding its annotations
                for a in v['annotations']]

    def trainset(self):
        """Return the annotated clips of the 'training' subset."""
        return self._dataset('training')

    def testset(self):
        """ActivityNet test set does not include any annotations"""
        return [Video(url=v['url'], filename=os.path.join(self.datadir, youtubeid))
                for (youtubeid, v) in self._database().items()
                if v['subset'] == 'testing']

    def valset(self):
        """Return the annotated clips of the 'validation' subset."""
        return self._dataset('validation')

    def categories(self):
        """Return the set of category labels that occur in the training set."""
        return {v.category() for v in self.trainset()}

    def analysis(self):
        """Print the ten largest and ten smallest training categories.

        Returns:
            A dict mapping each training category to its number of instances.
        """
        d_category_to_trainsize = {k: len(v) for (k, v) in groupbyasdict(self.trainset(), lambda x: x.category()).items()}

        # A single ascending sort by instance count serves both ends of the ranking
        by_count = sorted(d_category_to_trainsize.items(), key=lambda x: x[1])

        top10 = by_count[-10:]
        print('top10 categories by number of instances in training set:')
        print(top10)

        bottom10 = by_count[0:10]
        print('bottom-10 categories by number of instances in training set:')
        print(bottom10)

        return d_category_to_trainsize

Classes

class ActivityNet (datadir)

ActivityNet: provide a datadir='/path/to/store/activitynet' in which to store the dataset.

Expand source code Browse git
class ActivityNet(object):
    """Accessor for the ActivityNet v1.3 annotation index (``activity_net.v1-3.min.json``).

    The JSON index is fetched into ``datadir`` on construction when it is not
    already there.  The ``trainset()``/``valset()``/``testset()`` methods turn the
    entries of its ``database`` mapping into vipy video objects.
    """

    def __init__(self, datadir):
        """ActivityNet, provide a datadir='/path/to/store/activitynet'.

        The annotation JSON is downloaded into ``datadir`` if it is not already present.
        """
        self._url = URL
        # remkdir comes from vipy.util; presumably it creates the directory when missing -- confirm there
        self.datadir = remkdir(datadir)
        if not self._isdownloaded():
            self.download()

    def __repr__(self):
        return str('<vipy.data.activitynet: "%s">' % self.datadir)

    def _jsonfile(self):
        """Return the local path of the annotation JSON, named after the tail of the download URL."""
        return os.path.join(self.datadir, filetail(self._url))

    def _database(self):
        """Load the annotation JSON and return its 'database' mapping.

        Raises:
            AssertionError: if the annotation JSON is not present in ``datadir``.
        """
        assert self._isdownloaded(), "Dataset not downloaded.  download() first or manually download '%s' into '%s'" % (self._url, self.datadir)
        return readjson(self._jsonfile())['database']

    def download(self):
        """Download the annotation JSON into ``datadir`` and return self so calls can be chained."""
        vipy.downloader.download(self._url, self._jsonfile())
        return self

    def _isdownloaded(self):
        """Return True if the annotation JSON already exists in ``datadir``."""
        return os.path.exists(self._jsonfile())

    def _dataset(self, subset):
        """Return one VideoCategory per annotation of every database entry whose 'subset' equals ``subset``.

        Each annotation contributes its 'label' as the category and the two
        elements of its 'segment' as start and end seconds.
        """
        return [VideoCategory(url=v['url'],
                              filename=os.path.join(self.datadir, youtubeid),
                              category=a['label'],
                              startsec=float(a['segment'][0]),
                              endsec=float(a['segment'][1]))
                for (youtubeid, v) in self._database().items()
                if v['subset'] == subset  # decide per video, before expanding its annotations
                for a in v['annotations']]

    def trainset(self):
        """Return the annotated clips of the 'training' subset."""
        return self._dataset('training')

    def testset(self):
        """ActivityNet test set does not include any annotations"""
        return [Video(url=v['url'], filename=os.path.join(self.datadir, youtubeid))
                for (youtubeid, v) in self._database().items()
                if v['subset'] == 'testing']

    def valset(self):
        """Return the annotated clips of the 'validation' subset."""
        return self._dataset('validation')

    def categories(self):
        """Return the set of category labels that occur in the training set."""
        return {v.category() for v in self.trainset()}

    def analysis(self):
        """Print the ten largest and ten smallest training categories.

        Returns:
            A dict mapping each training category to its number of instances.
        """
        d_category_to_trainsize = {k: len(v) for (k, v) in groupbyasdict(self.trainset(), lambda x: x.category()).items()}

        # A single ascending sort by instance count serves both ends of the ranking
        by_count = sorted(d_category_to_trainsize.items(), key=lambda x: x[1])

        top10 = by_count[-10:]
        print('top10 categories by number of instances in training set:')
        print(top10)

        bottom10 = by_count[0:10]
        print('bottom-10 categories by number of instances in training set:')
        print(bottom10)

        return d_category_to_trainsize

Methods

def analysis(self)
Expand source code Browse git
def analysis(self):
    """Print the ten largest and ten smallest training categories.

    The training set is grouped by category and counted; the ranking is
    printed in ascending order of instance count.

    Returns:
        A dict mapping each training category to its number of instances.
    """
    # The original also computed self.categories() into an unused local, which
    # loaded and expanded the whole training set a second time for nothing.
    d_category_to_trainsize = {k: len(v) for (k, v) in groupbyasdict(self.trainset(), lambda x: x.category()).items()}

    # A single ascending sort by instance count serves both ends of the ranking
    by_count = sorted(d_category_to_trainsize.items(), key=lambda x: x[1])

    top10 = by_count[-10:]
    print('top10 categories by number of instances in training set:')
    print(top10)

    bottom10 = by_count[0:10]
    print('bottom-10 categories by number of instances in training set:')
    print(bottom10)

    return d_category_to_trainsize
def categories(self)
Expand source code Browse git
def categories(self):
    """Return the set of distinct category labels found in the training set."""
    training_clips = self.trainset()
    return {clip.category() for clip in training_clips}
def download(self)
Expand source code Browse git
def download(self):
    """Fetch the annotation JSON from ``self._url`` into ``self.datadir``.

    The local file is named after the tail of the URL.

    Returns:
        self, so calls can be chained.
    """
    # Use the URL stored on the instance (the one the "not downloaded" messages
    # report) rather than reaching past it to the module constant.
    destination = os.path.join(self.datadir, filetail(self._url))
    vipy.downloader.download(self._url, destination)
    return self
def testset(self)

ActivityNet test set does not include any annotations

Expand source code Browse git
def testset(self):
    """ActivityNet test set does not include any annotations"""
    assert self._isdownloaded(), "Dataset not downloaded.  download() first or manually download '%s' into '%s'" % (self._url, self.datadir)        
    json = readjson(os.path.join(self.datadir, filetail(URL)))
    return [Video(url=v['url'], filename=os.path.join(self.datadir, youtubeid)) for (youtubeid, v) in json['database'].items() if v['subset'] == 'testing']
def trainset(self)
Expand source code Browse git
def trainset(self):
    """Return whatever ``_dataset`` builds for the 'training' subset."""
    subset = 'training'
    return self._dataset(subset)
def valset(self)
Expand source code Browse git
def valset(self):
    """Return whatever ``_dataset`` builds for the 'validation' subset."""
    subset = 'validation'
    return self._dataset(subset)