ANALYSIS.py
1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
__author__ = 'chunk'
import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global generator with a fixed value (the summed code points
# of "whoami") so that random draws are repeatable between runs.
np.random.seed(sum(ord(ch) for ch in "whoami"))

# Request scientific-notation tick labels on both axes.
plt.ticklabel_format(axis='both', style='sci', scilimits=(1, 4))

# Absolute path of the directory that holds this file.
package_dir = os.path.split(os.path.abspath(__file__))[0]
def anal_ILSVRC(src_path='../res/file-tag.tsv', dst_path='../res/test.tsv'):
    """Summarise the ILSVRC file-tag table and write a randomly labelled copy.

    Reads a headerless tab-separated table with the columns ``hash``,
    ``width``, ``height``, ``size`` and ``quality``, prints several
    summaries of the ``size`` and ``quality`` columns, then sorts the rows
    by ``size`` and ``quality``, appends a ``class`` column drawn from a
    Bernoulli(0.3) distribution and writes the result as a headerless TSV.

    Args:
        src_path: Path of the input TSV. The default is the location the
            script originally hard-coded (relative to the working
            directory).
        dst_path: Path of the output TSV. The default is the location the
            script originally hard-coded.

    Returns:
        None. Results are printed and written to ``dst_path``.
    """
    df_ILS = pd.read_csv(
        src_path,
        names=['hash', 'width', 'height', 'size', 'quality'],
        sep='\t',
    )

    # NOTE: ``DataFrame.size`` is a built-in attribute (the total number of
    # elements) and wins over a column of the same name under attribute
    # access, so the column is always read as ``df_ILS['size']``.
    size_col = df_ILS['size']
    quality_col = df_ILS['quality']

    print(df_ILS[size_col < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]
    # ``.iloc`` only accepts integer positions, so the fractional ranks
    # (1/3, 2/3 and 0.9999 of the row count) are truncated explicitly.
    positions = [int(frac * length) for frac in (1.0 / 3, 2.0 / 3, 0.9999)]

    ## Size
    print(size_col.sort_values().iloc[positions])
    # Output recorded by the original author:
    #   7082 108514
    #   3826 150389
    #   8761 4814541

    # Row counts in three size bands split at 102400 and 153600.
    print(size_col[size_col <= 102400].count())
    print(size_col[(size_col > 102400) & (size_col <= 153600)].count())
    print(size_col[size_col > 153600].count())
    # Output recorded by the original author:
    #   (-,100K,150K,+):
    #   4519
    #   6163
    #   4831
    #   (-,100K,500K,+):
    #   4519
    #   10932
    #   62

    ## Quality
    print(quality_col.sort_values().iloc[positions])
    # Output recorded by the original author:
    #   13507 96
    #   831 96
    #   6529 100

    df_new = df_ILS.sort_values(by=['size', 'quality'], ascending=True)
    print(df_new)

    # One 0/1 draw per row; no random_state is passed, so the draws follow
    # SciPy's default generator. Indexing the Series by ``df_new.index``
    # attaches the draws to the rows in their sorted order.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    df_new['class'] = pd.Series(rand_class, index=df_new.index)
    print(rand_class)
    print(df_new)

    df_new.to_csv(dst_path, header=False, index=False, sep='\t')
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # To time a single run instead, wrap the call with
    # timeit.timeit("anal_ILSVRC()",
    #               setup="from __main__ import anal_ILSVRC", number=1).
    anal_ILSVRC()