# ANALYSIS.py 3.36 KB
# Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127


__author__ = 'chunk'

import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global generator with a fixed integer (the sum of the code
# points of "whoami") so draws from that generator repeat between runs.
np.random.seed(sum(ord(ch) for ch in "whoami"))

# Ask pyplot for scientific-notation tick labels on both axes.
plt.ticklabel_format(axis='both', style='sci', scilimits=(1, 4))

# Absolute path of the directory that contains this file.
package_dir = os.path.split(os.path.abspath(__file__))[0]


def anal_ILSVRC():
    """Summarise ``../res/file-tag.tsv`` and write a sorted, labelled copy.

    Reads the headerless tab-separated table (columns ``hash``, ``width``,
    ``height``, ``size``, ``quality``), prints several summaries of the
    ``size`` and ``quality`` columns, sorts the rows by ``size`` then
    ``quality``, appends a random 0/1 ``class`` column drawn from
    Bernoulli(0.3), and writes the result to ``../res/test.tsv``
    (tab-separated, no header, no index).

    Both paths are resolved relative to the current working directory.
    Returns None.
    """
    df_ILS = pd.read_csv('../res/file-tag.tsv', names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')

    # Always use bracket access for this column: ``df_ILS.size`` is the
    # DataFrame's element-count property (an int), not the 'size' column.
    sizes = df_ILS['size']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]

    # Row positions at 1/3, 2/3 and 0.9999 of the table length. ``iloc`` only
    # accepts integer positions, so the products are truncated.
    positions = [int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]

    # ``Series.order()`` was removed from pandas; ``sort_values()`` replaces it.
    print(sizes.sort_values().iloc[positions])
    # Output recorded from an earlier run:
    #   7082     108514
    #   3826     150389
    #   8761    4814541

    # Row counts in three size buckets split at 102400 and 153600.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Output recorded from earlier runs:
    #   (-,100K,150K,+):
    #       4519
    #       6163
    #       4831
    #   (-,100K,500K,+):
    #       4519
    #       10932
    #       62

    ## Quality
    print(df_ILS['quality'].sort_values().iloc[positions])
    # Output recorded from an earlier run:
    #   13507     96
    #   831       96
    #   6529     100

    # ``DataFrame.sort()`` was removed from pandas; ``sort_values()`` replaces it.
    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # One Bernoulli(0.3) draw per row. A plain array is assigned by position,
    # i.e. in the sorted row order of ``df_new``.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    # df_new['class'] = pd.Series(rand_class, index=df_new.index)
    df_new['class'] = rand_class

    print(rand_class[:100])
    print(df_new)

    df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')


def anal_ILSVRC_Test():
    """Summarise ``../res/file-tag-test.tsv`` and append a zero ``class2`` column.

    Reads the headerless tab-separated table (columns ``hash``, ``width``,
    ``height``, ``size``, ``quality``, ``class``), prints the table, the
    ``describe()`` summary of ``size`` and the row counts in three size
    buckets split at 102400 and 153600, then adds an all-zero int32 column
    ``class2`` and writes the table back to the same path (tab-separated,
    no header, no index).

    The path is resolved relative to the current working directory.
    Returns None.

    NOTE(review): the input file is overwritten with one extra column, so a
    second run on the same file no longer sees the 6-column layout.
    """
    df_ILS_T = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality','class'], sep='\t')

    # Always use bracket access for this column: ``df_ILS_T.size`` is the
    # DataFrame's element-count property (an int), not the 'size' column.
    sizes = df_ILS_T['size']

    print(df_ILS_T)
    print(sizes.describe())

    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())

    length = df_ILS_T.shape[0]
    df_ILS_T['class2'] = np.zeros(length, np.int32)
    df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')

def anal_0000():
    """Print size statistics for ``../res/file-tag-test.tsv`` and plot a histogram.

    Reads the headerless tab-separated table (columns ``hash``, ``width``,
    ``height``, ``size``, ``quality``, ``chosen``, ``class``), prints the
    ``describe()`` summary of ``size``, the sorted ``size`` values at 1/3,
    2/3 and 0.9999 of the table length, and the row counts in three buckets
    split at 166500 and 187500. Finally draws a 100-bin histogram of ``size``
    and shows it with ``plt.show()``.

    The path is resolved relative to the current working directory.
    Returns None.
    """
    df_ILS = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality','chosen','class'], sep='\t')
    length = df_ILS.shape[0]

    # Always use bracket access for this column: ``df_ILS.size`` is the
    # DataFrame's element-count property (an int), not the 'size' column.
    sizes = df_ILS['size']
    print(sizes.describe())

    # ``iloc`` only accepts integer positions, so the products are truncated;
    # ``sort_values()`` replaces the removed ``Series.order()``.
    positions = [int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]
    print(sizes.sort_values().iloc[positions])

    print(sizes[sizes <= 166500].count())
    print(sizes[(sizes > 166500) & (sizes <= 187500)].count())
    print(sizes[sizes > 187500].count())

    df_ILS.hist(column='size',bins=100)
    plt.show()


def pre_crop(min_side=300):
    """Print how many rows of ``../res/file-tag-test.tsv`` are at least ``min_side`` square.

    Reads the headerless tab-separated table (columns ``hash``, ``width``,
    ``height``, ``size``, ``quality``, ``chosen``, ``class``) and prints the
    shape of the whole table followed by the shape of the subset whose
    ``width`` and ``height`` are both ``>= min_side``.

    The path is resolved relative to the current working directory.

    Args:
        min_side: Minimum value required of both ``width`` and ``height``.
            Defaults to 300, the threshold that was previously hard-coded.

    Returns:
        None.
    """
    df_ILS = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality','chosen','class'], sep='\t')
    # Single-argument parenthesised print runs unchanged on Python 2 and 3.
    print(df_ILS.shape)

    large_enough = (df_ILS['width'] >= min_side) & (df_ILS['height'] >= min_side)
    print(df_ILS[large_enough].shape)

    # Results recorded from earlier runs (threshold, then two figures that
    # look like the matching row count and its fraction of the table; the
    # meaning of the trailing '*' is not stated - TODO confirm):
    # 300x300 4213 0.917 *
    # 200x200 4534 0.987
    # 400x400 932 0.202


if __name__ == '__main__':
    # Script entry point: only pre_crop() is run. The other analyses are
    # kept here, disabled, to be switched on by hand:
    #   anal_ILSVRC()
    #   anal_ILSVRC_Test()
    #   anal_0000()
    #   print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    pre_crop()