test_whole.py 5.62 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175


__author__ = 'chunk'

from ..mspark import SC
from ..common import *
from ..mdata import ILSVRC, ILSVRC_S

from pyspark.mllib.regression import LabeledPoint
import happybase


def test_whole():
    cols0 = [
        'cf_pic:data',
        'cf_info:width',
        'cf_info:height',
        'cf_info:size',
        'cf_info:capacity',
        'cf_info:quality',
        'cf_info:rate',
        'cf_tag:chosen',
        'cf_tag:class'
    ]
    cols1 = [
        'cf_pic:data',
        'cf_info:width',
        'cf_info:height',
        'cf_info:size',
        'cf_info:capacity',
        'cf_info:quality',
        'cf_info:rate',
        'cf_tag:chosen',
        'cf_tag:class',
        'cf_feat:bid',
    ]

    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')

    # rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
    # .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
    # .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
    # .mapValues(lambda items: SC.rddfeat_ILS(items))

    rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False).mapValues(
        lambda data: [data] + SC.rddinfo_ILS(data))
    rdd_data_ext = rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=0.2)).filter(lambda x: x != None)

    rdd_data = rdd_data.union(rdd_data_ext).mapValues(lambda items: SC.rddfeat_ILS(items))

    print len(rdd_data.collect())

    # sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
    # withdata=True)


def test_whole_ext(category='Train_100'):
    timer = Timer()

    print '[time]category:', category

    print '[time]formating table...'
    timer.mark()
    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
    dil.delete_table()
    # dil.format()
    dil.store_img()
    timer.report()

    print '[time]reading table...'
    timer.mark()
    table_name = dil.table_name
    connection = happybase.Connection('HPC-server')
    tables = connection.tables()
    if table_name not in tables:
        families_compressed = {'cf_pic': dict(max_versions=2, compression='LZO'),
                               'cf_info': dict(max_versions=2, compression='LZO'),
                               'cf_tag': dict(max_versions=2, compression='LZO'),
                               'cf_feat': dict(max_versions=2, compression='LZO'),
                               }
        families = {'cf_pic': dict(max_versions=1),
                    'cf_info': dict(max_versions=1),
                    'cf_tag': dict(max_versions=1),
                    'cf_feat': dict(max_versions=1),
                    }
        connection.create_table(name=table_name, families=families)
    table = connection.table(name=table_name)

    cols = ['cf_pic:data']
    list_data = []
    for key, data in table.scan(columns=cols):
        data = data['cf_pic:data']
        list_data.append((key, data))
    timer.report()

    print '[time]processing...'
    timer.mark()
    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
    rdd_data = sparker.sc.parallelize(list_data, 30) \
        .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
        .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.05)) \
        .mapValues(lambda items: SC.rddfeat_ILS(items))
    items = rdd_data.collect()
    timer.report()

    print '[time]writing table...'
    timer.mark()
    try:
        with table.batch(batch_size=5000) as b:
            for item in items:
                imgname, imginfo = item[0], item[1]
                b.put(imgname,
                      {
                          'cf_pic:data': imginfo[0],
                          'cf_info:width': str(imginfo[1]),
                          'cf_info:height': str(imginfo[2]),
                          'cf_info:size': str(imginfo[3]),
                          'cf_info:capacity': str(imginfo[4]),
                          'cf_info:quality': str(imginfo[5]),
                          'cf_info:rate': str(imginfo[6]),
                          'cf_tag:chosen': str(imginfo[7]),
                          'cf_tag:class': str(imginfo[8]),
                          'cf_feat:ibd': imginfo[9],
                      })
    except ValueError:
        raise
    timer.report()


def test_whole_sp(category='Train_100'):
    timer = Timer()

    print '[time]category:', category

    print '[time]formating table...'
    timer.mark()
    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
    dil.delete_table()
    # dil.format()
    dil.store_img()
    timer.report()

    table_name = dil.table_name
    feattype = 'ibd'
    cols = [
        'cf_pic:data',
        'cf_info:width',
        'cf_info:height',
        'cf_info:size',
        'cf_info:capacity',
        'cf_info:quality',
        'cf_info:rate',
        'cf_tag:chosen',
        'cf_tag:class',
        'cf_feat:' + feattype,
    ]

    print '[time]processing...'
    timer.mark()
    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
    rdd_data = sparker.read_hbase(table_name, func=SC.rddparse_data_ILS,
                                  collect=False, parallelism=30) \
        .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
        .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
        .mapValues(lambda items: SC.rddfeat_ILS(items, feattype))

    print '[time]writing table...'
    sparker.write_hbase(table_name, rdd_data, fromrdd=True, columns=cols,
                        withdata=True)
    timer.report()