# ANALYSIS.py (3.36 KB)
__author__ = 'chunk'
import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global RNG with a fixed integer (the sum of the character codes
# of "whoami") so random draws repeat from run to run.
np.random.seed(sum(map(ord, "whoami")))
# Runs at import time through the pyplot interface: requests scientific tick
# notation on both axes with scilimits=(1, 4).
plt.ticklabel_format(style='sci', axis='both', scilimits=(1, 4))
# Absolute path of the directory containing this file; not referenced by the
# code visible in this file.
package_dir = os.path.dirname(os.path.abspath(__file__))
def anal_ILSVRC():
    """Summarise image sizes/qualities and write a randomly labelled copy.

    Reads '../res/file-tag.tsv' (tab separated, no header; columns hash,
    width, height, size, quality), prints several summaries, then writes the
    rows sorted by (size, quality) with an extra random 0/1 'class' column to
    '../res/test.tsv' (no header, no index).

    Both paths are resolved against the current working directory.
    """
    df_ILS = pd.read_csv('../res/file-tag.tsv', names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')

    # The column must be taken with [] and not attribute access:
    # `DataFrame.size` is the element count of the frame, not this column.
    sizes = df_ILS['size']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]
    # Positions of the 1/3, 2/3 and 0.9999 quantiles in the sorted column.
    # .iloc needs integer positions, so the fractional products are truncated.
    positions = [int(length * frac) for frac in (1.0 / 3, 2.0 / 3, 0.9999)]

    print(sizes.sort_values().iloc[positions])
    # Output recorded by the original author (row index, size):
    # 7082 108514
    # 3826 150389
    # 8761 4814541

    # Row counts in three size bands: <=100 KiB, (100 KiB, 150 KiB], >150 KiB.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Recorded counts:
    # (-,100K,150K,+):
    # 4519
    # 6163
    # 4831
    # (-,100K,500K,+):
    # 4519
    # 10932
    # 62

    ## Quality
    print(df_ILS['quality'].sort_values().iloc[positions])
    # Recorded output (row index, quality):
    # 13507 96
    # 831 96
    # 6529 100

    df_new = df_ILS.sort_values(by=['size', 'quality'], ascending=True)
    print(df_new)

    # One Bernoulli(p=0.3) draw per row, assigned in the sorted row order.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    # df_new['class'] = pd.Series(rand_class, index=df_new.index)
    df_new['class'] = rand_class
    print(rand_class[:100])
    print(df_new)

    df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')
def anal_ILSVRC_Test():
    """Print size statistics for the test table and append a 'class2' column.

    Reads '../res/file-tag-test.tsv' (tab separated, no header; columns hash,
    width, height, size, quality, class), prints the table, a description of
    the size column and the row counts in three size bands, then adds an
    all-zero int32 'class2' column and writes the table back to the same path.

    NOTE: the input file is overwritten in place. After one run it holds
    seven columns, one more than the six names this function reads with.
    """
    df_ILS_T = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t')
    print(df_ILS_T)

    # Use [] access: `DataFrame.size` is the frame's element count, not the
    # 'size' column.
    sizes = df_ILS_T['size']
    print(sizes.describe())

    # Row counts in three size bands: <=100 KiB, (100 KiB, 150 KiB], >150 KiB.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())

    length = df_ILS_T.shape[0]
    df_ILS_T['class2'] = np.zeros(length, np.int32)

    df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')
def anal_0000():
    """Print size statistics for the test table and show a size histogram.

    Reads '../res/file-tag-test.tsv' (tab separated, no header; columns hash,
    width, height, size, quality, chosen, class), prints a description of the
    size column, the sizes at the 1/3, 2/3 and 0.9999 positions of the sorted
    column and the row counts in three size bands, then displays a 100-bin
    histogram of the sizes.
    """
    df_ILS = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'], sep='\t')
    length = df_ILS.shape[0]

    # Use [] access: `DataFrame.size` is the frame's element count, not the
    # 'size' column.
    sizes = df_ILS['size']
    print(sizes.describe())

    # .iloc needs integer positions, so the fractional products are truncated.
    positions = [int(length * frac) for frac in (1.0 / 3, 2.0 / 3, 0.9999)]
    print(sizes.sort_values().iloc[positions])

    # Row counts in three size bands split at 166500 and 187500 bytes.
    print(sizes[sizes <= 166500].count())
    print(sizes[(sizes > 166500) & (sizes <= 187500)].count())
    print(sizes[sizes > 187500].count())

    df_ILS.hist(column='size', bins=100)
    plt.show()
def pre_crop():
df_ILS = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality','chosen','class'], sep='\t')
print df_ILS.shape
print df_ILS[(df_ILS['width'] >= 300) & (df_ILS['height'] >= 300)].shape
# 300x300 4213 0.917 *
# 200x200 4534 0.987
# 400x400 932 0.202
if __name__ == '__main__':
    # Script entry point: only the crop-size check runs; the other analyses
    # are kept below as commented-out alternatives.
    # anal_ILSVRC()
    # anal_ILSVRC_Test()
    # anal_0000()
    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    pre_crop()