92d488d8
Chunk
.
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
__author__ = 'chunk'
import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global random number generator with a fixed value (the sum of
# the character codes of "whoami") so that random draws are repeatable.
np.random.seed(sum(map(ord, "whoami")))
# Request scientific-notation tick labels on both axes through pyplot's
# module-level interface, with scilimits=(1, 4).
# NOTE(review): this runs at import time, before the visible code creates any
# figure explicitly -- confirm that this is intended.
plt.ticklabel_format(style='sci', axis='both', scilimits=(1, 4))
# Absolute path of the directory that contains this file.
package_dir = os.path.dirname(os.path.abspath(__file__))
def anal_ILSVRC(src_path='../res/file-tag.tsv', dst_path='../res/test.tsv'):
    """Print statistics for the ILSVRC file-tag table and write a labelled copy.

    Reads a headerless tab-separated file whose columns are ``hash``,
    ``width``, ``height``, ``size`` and ``quality``, prints several summaries
    of the ``size`` and ``quality`` columns, then sorts the rows by ``size``
    and ``quality`` (ascending), appends a random 0/1 ``class`` column drawn
    from a Bernoulli(0.3) distribution and writes the result to ``dst_path``
    as tab-separated text without header or index.

    Args:
        src_path: Path of the input TSV file. Defaults to the location the
            script has always read from.
        dst_path: Path of the TSV file to write. Defaults to the location the
            script has always written to.

    Returns:
        None. Results are printed and written to ``dst_path``.
    """
    df_ILS = pd.read_csv(src_path, names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')

    # The column has to be read with df['size']: attribute access (df.size)
    # resolves to the built-in DataFrame.size property (total element count),
    # not to the column named 'size'.
    sizes = df_ILS['size']
    quality = df_ILS['quality']

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]
    # Row positions at 1/3, 2/3 and 99.99% of the sorted data. iloc needs
    # integer positions, so the fractional products are truncated; an empty
    # table yields no positions instead of an out-of-bounds lookup.
    fractions = (1.0 / 3, 2.0 / 3, 0.9999)
    positions = [int(fraction * length) for fraction in fractions] if length else []

    # The sorted column is a pandas Series, so iloc keeps the original row labels.
    print(sizes.sort_values().iloc[positions])
    # Recorded output:
    #   7082 108514
    #   3826 150389
    #   8761 4814541

    # Row counts in three size buckets split at 102400 and 153600.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Recorded output:
    #   (-,100K,150K,+):
    #   4519
    #   6163
    #   4831
    #   (-,100K,500K,+):
    #   4519
    #   10932
    #   62

    ## Quality
    print(quality.sort_values().iloc[positions])
    # Recorded output:
    #   13507 96
    #   831 96
    #   6529 100

    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # One Bernoulli(0.3) draw per row, aligned to the sorted frame's index.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    df_new['class'] = pd.Series(rand_class, index=df_new.index)
    print(rand_class)
    print(df_new)

    df_new.to_csv(dst_path, header=False, index=False, sep='\t')
def anal_ILSVRC_Test(src_path='../res/file-tag-test.tsv'):
    """Print the test-set file-tag table and a summary of its ``size`` column.

    Reads a headerless tab-separated file whose columns are ``hash``,
    ``width``, ``height``, ``size``, ``quality`` and ``class``, prints the
    whole table, the descriptive statistics of ``size`` and the number of rows
    in three ``size`` buckets split at 102400 and 153600.

    Args:
        src_path: Path of the input TSV file. Defaults to the location the
            script has always read from.

    Returns:
        None. Results are printed only.
    """
    df_ILS_T = pd.read_csv(src_path, names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t')

    # The column has to be read with df['size']: attribute access (df.size)
    # resolves to the built-in DataFrame.size property (total element count),
    # not to the column named 'size'.
    sizes = df_ILS_T['size']

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df_ILS_T)
    print(sizes.describe())

    # Row counts for size <= 102400, 102400 < size <= 153600, and size > 153600.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
if __name__ == '__main__':
    # Script entry point: run the test-set analysis.
    # To time the training-set analysis instead, use:
    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    anal_ILSVRC_Test()
|