# ANALYSIS.py 2.24 KB
__author__ = 'chunk'

import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG with a fixed value (the sum of the character codes
# of "whoami") so that random draws made through it are repeatable.
np.random.seed(sum(map(ord, "whoami")))
# Ask pyplot for scientific-notation tick labels on both axes. This runs at
# import time and goes through the pyplot state machine. Per matplotlib's
# documentation, scilimits=(m, n) applies the notation to values outside
# 10**m .. 10**n.
plt.ticklabel_format(style='sci', axis='both', scilimits=(1, 4))

# Absolute path of the directory that contains this file.
# NOTE(review): not referenced by the functions visible in this file, which
# use paths relative to the current working directory instead.
package_dir = os.path.dirname(os.path.abspath(__file__))


def anal_ILSVRC(src_path='../res/file-tag.tsv', dst_path='../res/test.tsv'):
    """Summarise the ILSVRC file-tag table and write a randomly labelled copy.

    Reads a headerless, tab-separated table with the columns ``hash``,
    ``width``, ``height``, ``size`` and ``quality``, prints several summary
    views of it, then writes the rows sorted by ``(size, quality)`` with an
    extra 0/1 ``class`` column drawn from a Bernoulli(0.3) distribution.

    Args:
        src_path: TSV file to read. The default is the path this function
            has always used; it is resolved against the current working
            directory.
        dst_path: TSV file to write (no header row, no index column).

    Returns:
        None. Results are printed to stdout and written to ``dst_path``.
    """
    columns = ['hash', 'width', 'height', 'size', 'quality']
    df_ILS = pd.read_csv(src_path, names=columns, sep='\t')

    # Columns are selected with brackets throughout: ``df.size`` is the
    # DataFrame's own element-count attribute and shadows a column that is
    # named "size", so attribute access would yield an int, not the column.
    sizes = df_ILS['size']
    qualities = df_ILS['quality']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]

    # Row positions at roughly the 1/3, 2/3 and 99.99% marks of a sorted
    # column. ``iloc`` needs integer positions, so the products are truncated.
    marks = [int(frac * length) for frac in (1.0 / 3, 2.0 / 3, 0.9999)]

    ## Size
    print(sizes.sort_values().iloc[marks])
    # Recorded output from an earlier run:
    #     7082     108514
    #     3826     150389
    #     8761    4814541

    # Bucket counts; 102400 = 100 * 1024 and 153600 = 150 * 1024.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Recorded bucket counts from earlier runs:
    #     (-,100K,150K,+):
    #         4519
    #         6163
    #         4831
    #     (-,100K,500K,+):
    #         4519
    #         10932
    #         62

    ## Quality
    print(qualities.sort_values().iloc[marks])
    # Recorded output from an earlier run:
    #     13507     96
    #     831       96
    #     6529     100

    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # One Bernoulli(p=0.3) draw per row, attached in df_new's (sorted) row
    # order by giving the Series the same index as df_new.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    df_new['class'] = pd.Series(rand_class, index=df_new.index)

    print(rand_class)
    print(df_new)

    df_new.to_csv(dst_path, header=False, index=False, sep='\t')


def anal_ILSVRC_Test(src_path='../res/file-tag-test.tsv'):
    """Print summary statistics for the labelled ILSVRC test table.

    Reads a headerless, tab-separated table with the columns ``hash``,
    ``width``, ``height``, ``size``, ``quality`` and ``class``, then prints
    the table, a ``describe()`` summary of the ``size`` column, and the
    number of rows in each of three size buckets.

    Args:
        src_path: TSV file to read. The default is the path this function
            has always used; it is resolved against the current working
            directory.

    Returns:
        None. Everything is printed to stdout.
    """
    columns = ['hash', 'width', 'height', 'size', 'quality', 'class']
    df_ILS_T = pd.read_csv(src_path, names=columns, sep='\t')
    print(df_ILS_T)

    # Bracket access is required here: ``df.size`` is the DataFrame's own
    # element-count attribute and shadows a column that is named "size".
    sizes = df_ILS_T['size']
    print(sizes.describe())

    # Bucket counts; 102400 = 100 * 1024 and 153600 = 150 * 1024.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())









if __name__ == '__main__':
    # Script entry point: only the test-table summary runs by default.
    # Alternative kept for reference -- time one run of the full analysis
    # (timeit is not imported in the visible part of this file):
    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    anal_ILSVRC_Test()