"""Exploratory statistics over tab-separated image tag files.

Each analysis reads a headerless TSV whose leading columns are
``hash, width, height, size, quality`` and prints summary numbers to stdout.
Paths are relative to the current working directory (``../res/...``).

The ``size`` column is always accessed as ``df['size']``: ``DataFrame.size``
is a built-in property (the element count) and takes precedence over
attribute-style column access.
"""
__author__ = 'chunk'

import os

import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed: scipy's ``rvs`` draws from NumPy's global RNG by default, so the
# random class assignment in anal_ILSVRC() is repeatable between runs.
np.random.seed(sum(map(ord, "whoami")))
plt.ticklabel_format(style='sci', axis='both', scilimits=(1, 4))

package_dir = os.path.dirname(os.path.abspath(__file__))

# Columns shared by every tag file handled in this module.
_TAG_COLUMNS = ['hash', 'width', 'height', 'size', 'quality']

# Relative positions (within the sorted data) sampled by the analyses.
_SAMPLE_FRACTIONS = (1.0 / 3, 2.0 / 3, 0.9999)


def _read_tags(path, extra_columns=()):
    """Read a headerless tag TSV, naming the base columns plus *extra_columns*."""
    return pd.read_csv(path, names=_TAG_COLUMNS + list(extra_columns), sep='\t')


def _print_bucket_counts(values, low, high):
    """Print how many entries fall in (-inf, low], (low, high] and (high, +inf)."""
    print(values[values <= low].count())
    print(values[(values > low) & (values <= high)].count())
    print(values[values > high].count())


def _values_at_fractions(values, fractions=_SAMPLE_FRACTIONS):
    """Return the sorted *values* sampled at the given relative positions.

    Positions are truncated to integers because ``.iloc`` only accepts
    integer positions. An empty series is returned unchanged.
    """
    ordered = values.sort_values()
    count = len(ordered)
    if count == 0:
        return ordered
    positions = [int(fraction * count) for fraction in fractions]
    return ordered.iloc[positions]


def anal_ILSVRC():
    """Summarise ``../res/file-tag.tsv`` and write a randomly labelled copy.

    Prints the rows with size below 2,000,000, the frame description, the
    size bucket counts (<=102400, (102400, 153600], >153600) and the quality
    values at the 1/3, 2/3 and 0.9999 positions. The frame is then sorted by
    (size, quality), given a Bernoulli(0.3) ``class`` column, and written to
    ``../res/test.tsv`` without header or index.
    """
    df_ILS = _read_tags('../res/file-tag.tsv')
    sizes = df_ILS['size']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())

    length = df_ILS.shape[0]

    # Recorded output of an earlier run; presumably (index, size) at the
    # 1/3, 2/3 and 0.9999 positions of the sorted sizes:
    #   7082    108514
    #   3826    150389
    #   8761    4814541

    _print_bucket_counts(sizes, 102400, 153600)
    # Recorded bucket counts from earlier runs:
    #   (-,100K,150K,+): 4519 6163 4831
    #   (-,100K,500K,+): 4519 10932 62

    ## Quality
    print(_values_at_fractions(df_ILS['quality']))
    # Recorded output of an earlier run:
    #   13507    96
    #   831      96
    #   6529     100

    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # A plain array is assigned by position, so the labels follow the sorted
    # row order rather than the original index.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    df_new['class'] = rand_class
    print(rand_class[:100])
    print(df_new)

    df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')


def anal_ILSVRC_Test():
    """Summarise ``../res/file-tag-test.tsv`` and append a zeroed ``class2`` column.

    The file is read with six columns (base columns plus ``class``) and is
    overwritten in place with seven.
    NOTE(review): because the input is rewritten with an extra column, a
    second run would read a seven-column file with six names - confirm this
    is only meant to be run once per file.
    """
    df_ILS_T = _read_tags('../res/file-tag-test.tsv', ['class'])
    sizes = df_ILS_T['size']

    print(df_ILS_T)
    print(sizes.describe())
    _print_bucket_counts(sizes, 102400, 153600)

    length = df_ILS_T.shape[0]
    df_ILS_T['class2'] = np.zeros(length, np.int32)
    df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')


def anal_0000():
    """Print size statistics for ``../res/file-tag-test.tsv`` and show a histogram.

    Prints the size description, the sizes at the 1/3, 2/3 and 0.9999
    positions, and the bucket counts (<=166500, (166500, 187500], >187500),
    then displays a 100-bin histogram of the size column.
    """
    df_ILS = _read_tags('../res/file-tag-test.tsv', ['chosen', 'class'])
    sizes = df_ILS['size']

    print(sizes.describe())
    print(_values_at_fractions(sizes))
    _print_bucket_counts(sizes, 166500, 187500)

    df_ILS.hist(column='size', bins=100)
    plt.show()


def pre_crop(min_side=300):
    """Print the frame shape and the shape of rows at least *min_side* square.

    :param min_side: minimum width and height (inclusive) a row must have to
        be counted; defaults to the 300 used by the original analysis.
    """
    df_ILS = _read_tags('../res/file-tag-test.tsv', ['chosen', 'class'])
    print(df_ILS.shape)

    large_enough = (df_ILS['width'] >= min_side) & (df_ILS['height'] >= min_side)
    print(df_ILS[large_enough].shape)
    # Recorded results (threshold, matching rows, fraction of all rows):
    #   300x300  4213  0.917  *
    #   200x200  4534  0.987
    #   400x400   932  0.202


if __name__ == '__main__':
    # Other analyses that can be run from here instead:
    # anal_ILSVRC()
    # anal_ILSVRC_Test()
    # anal_0000()
    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    pre_crop()