Commit 8bddd8b35aabd5333192647471c7e198890081cf
1 parent: 1c2a3fa0
Exists in master and in 1 other branch
Guess what? Through all the 'debugs' and 'f**ks', we have finally finished the pyspark & hbase integration!
Showing 5 changed files with 22 additions and 12 deletions
mdata/ILSVRC_S.py
@@ -361,7 +361,7 @@ class DataILSVRC_S(DataDumperBase):
         if readforward:
             self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)

-        rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x))
+        rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None)
         self.rdd_data = self.rdd_data.union(rdd_data_ext)

         if not writeback:
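The filter added in this hunk guards the union that follows: SC.rddembed_ILS returns None for rows it cannot embed, and without the filter those Nones would be unioned into self.rdd_data and break every downstream stage. A minimal standalone sketch of the pattern, with a hypothetical embed() standing in for SC.rddembed_ILS (the toy data is made up; `is not None` is the idiomatic spelling of the hunk's `x != None`):

    from pyspark import SparkContext

    sc = SparkContext(appName='filter-none-sketch')
    rdd = sc.parallelize([('a.jpg', 'img_a'), ('b.jpg', 'img_b'), ('c.jpg', 'img_c')])

    def embed(pair):
        # Stand-in for SC.rddembed_ILS: returns None when embedding fails.
        key, img = pair
        if key == 'b.jpg':
            return None
        return (key + '.stego', img + '+payload')

    ext = rdd.map(embed).filter(lambda x: x is not None)  # drop failed rows
    print rdd.union(ext).collect()                        # originals + stego copies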
@@ -454,7 +454,10 @@ class DataILSVRC_S(DataDumperBase):
         if readforward:
             self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)

-        self.rdd_data = self.rdd_data.map(lambda x: SC.rddfeat_ILS(x))
+        self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddfeat_ILS(items))
+
+        # print self.rdd_data.collect()[0]
+        # return

         if not writeback:
             return self.rdd_data
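The second hunk switches from map to mapValues, which changes what the callback receives: map hands the function the whole (key, value) pair and expects it to rebuild the pair, while mapValues passes only the value and re-attaches the key automatically. That is what lets rddfeat_ILS (see the SC.py hunk below) drop its row-unpacking boilerplate and take items directly. A toy illustration, reusing the SparkContext from the sketch above:

    pairs = sc.parallelize([('k1', [1, 2]), ('k2', [3, 4])])

    # The lambda sees only the value list; the key survives untouched.
    print pairs.mapValues(lambda items: items + [sum(items)]).collect()
    # [('k1', [1, 2, 3]), ('k2', [3, 4, 7])]

As a side benefit, mapValues preserves the RDD's partitioner, which a plain map cannot guarantee.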
mspark/SC.py
@@ -56,9 +56,16 @@ def rddparse_data_ILS(raw_row):


 def rddparse_all_ILS(raw_row):
+    """
+    Deprecated.
+    """
     key = raw_row[0]
     items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
+
+    # @TODO
+    # N.B. "ValueError: No JSON object could be decoded" arises because the spark-hbase IO is string-based.
     data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in items[1:]]
+
     return (key, data)

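The new @TODO comment pins down the underlying constraint: the spark-hbase IO moves every cell as a string, so typed values must survive a str()/json.loads() round trip, and json.loads blows up on any cell holding a bare non-JSON string. A hypothetical illustration of the row layout this parser assumes (cell names and values are invented for the example):

    import json

    raw_row = ('0001.jpg',
               'cf_pic:data:<jpeg bytes>--%--cf_info:width:512--%--cf_info:rate:0.1')
    key = raw_row[0]
    items = raw_row[1].split('--%--')
    data = [items[0].split('cf_pic:data:')[-1]] + \
           [json.loads(item.split(':')[-1]) for item in items[1:]]
    print key, data[1:]   # 0001.jpg [512, 0.1]
    # A cell such as 'cf_info:name:foo' would raise
    # "ValueError: No JSON object could be decoded", the failure noted above.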
@@ -145,17 +152,15 @@ def _get_feat(image, feattype='ibd', **kwargs):
     return desc


-def rddfeat_ILS(row, feattype='ibd', **kwargs):
-    items = row[1]
-    capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7])
+def rddfeat_ILS(items, feattype='ibd', **kwargs):
     try:
         tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
         tmpf_src.write(items[0])
         tmpf_src.seek(0)

         desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist())
-
-        return (row[0], row[1].append(desc))
+        # print 'desccccccccccccccccccc', desc
+        return items + [desc]

     except Exception as e:
         print e
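Besides dropping the unused capacity/rate/chosen unpacking, the rewritten return fixes a classic Python trap: list.append() mutates in place and returns None, so the old (row[0], row[1].append(desc)) always evaluated to (key, None) and silently discarded the row. In isolation:

    items = ['pixels', 'width']
    print items.append('desc')   # None -- append() returns nothing
    print items                  # ['pixels', 'width', 'desc']
    print items + ['feat']       # ['pixels', 'width', 'desc', 'feat'], a fresh
                                 # list, which is what `items + [desc]` returns now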
@@ -174,6 +179,8 @@ def format_out(row, cols):
     """
     puts = []
     key = row[0]
+    if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
+        print row
     for data, col in zip(row[1], cols):
         puts.append((key, [key] + col + [str(data)]))
     return puts
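The hard-coded key in format_out is a per-row debugging probe. One caveat: a print inside a transformation runs on the executors, so on a real cluster its output lands in the executor stderr logs rather than the driver console. A driver-side alternative, assuming rdd_data is the (key, values) pair RDD used elsewhere in this commit:

    # lookup() ships only the matching values back to the driver.
    print rdd_data.lookup('04650c488a2b163ca8a1f52da6022f03.jpg')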
res/toembed
test/test_data.py
@@ -72,9 +72,9 @@ def test_ILSVRC_S():

     # dils.format()

-    # dils._extract_data(mode='spark', writeback=True)
-    dils._embed_data(mode='spark', rate=0.1, readforward=True, writeback=True)
-    # dils._extract_feat(mode='spark', feattype='ibd', readforward=False, writeback=False)
+    dils._extract_data(mode='spark', writeback=False)
+    dils._embed_data(mode='spark', rate=0.1, readforward=False, writeback=False)
+    dils._extract_feat(mode='spark', feattype='ibd', readforward=False, writeback=True)


 if __name__ == '__main__':
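The test now exercises the whole pipeline in one pass. Reading the flags together (an inference from this commit, not documented behavior): readforward=True makes a stage re-read its input from HBase, readforward=False chains it onto the in-memory RDD left by the previous stage, and writeback decides whether the stage persists its output. Annotated, the new configuration is:

    dils._extract_data(mode='spark', writeback=False)      # HBase -> RDD, kept in memory
    dils._embed_data(mode='spark', rate=0.1,
                     readforward=False, writeback=False)   # embed the in-memory RDD
    dils._extract_feat(mode='spark', feattype='ibd',
                       readforward=False, writeback=True)  # features written back to HBase

so only the final feature-extraction stage writes to HBase.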