Commit d47ae6ce8a8a4197938768893f939b9e10e1b244 (1 parent: 8bddd8b3)
Exists in master and in 1 other branch (staged).
Showing 4 changed files with 58 additions and 16 deletions.
Show diff stats
mdata/ILSVRC.py
... | ... | @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): |
291 | 291 | |
292 | 292 | return table |
293 | 293 | |
294 | + def delete_table(self, table_name=None, disable=True): | |
295 | + if table_name == None: | |
296 | + table_name = self.table_name | |
297 | + | |
298 | + if self.connection is None: | |
299 | + c = happybase.Connection('HPC-server') | |
300 | + self.connection = c | |
301 | + | |
302 | + tables = self.connection.tables() | |
303 | + if table_name not in tables: | |
304 | + return False | |
305 | + else: | |
306 | + try: | |
307 | + self.connection.delete_table(table_name, disable) | |
308 | + except: | |
309 | + print 'Exception when deleting table.' | |
310 | + raise | |
311 | + return True | |
294 | 312 | |
295 | 313 | def store_img(self): |
296 | 314 | if self.table == None: | ... | ... |
mdata/ILSVRC_S.py
... | ... | @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): |
81 | 81 | |
82 | 82 | return table |
83 | 83 | |
84 | + def delete_table(self, table_name=None, disable=True): | |
85 | + if table_name == None: | |
86 | + table_name = self.table_name | |
87 | + | |
88 | + if self.connection is None: | |
89 | + c = happybase.Connection('HPC-server') | |
90 | + self.connection = c | |
91 | + | |
92 | + tables = self.connection.tables() | |
93 | + if table_name not in tables: | |
94 | + return False | |
95 | + else: | |
96 | + try: | |
97 | + self.connection.delete_table(table_name, disable) | |
98 | + except: | |
99 | + print 'Exception when deleting table.' | |
100 | + raise | |
101 | + return True | |
102 | + | |
84 | 103 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): |
85 | 104 | """ |
86 | 105 | Tempfile is our friend. (?) |
... | ... | @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): |
256 | 275 | if not writeback: |
257 | 276 | return self.rdd_data |
258 | 277 | else: |
259 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
278 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) | |
260 | 279 | |
261 | 280 | else: |
262 | 281 | raise Exception("Unknown mode!") |
... | ... | @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): |
361 | 380 | if readforward: |
362 | 381 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) |
363 | 382 | |
364 | - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None) | |
383 | + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) | |
365 | 384 | self.rdd_data = self.rdd_data.union(rdd_data_ext) |
366 | 385 | |
367 | 386 | if not writeback: |
368 | 387 | return self.rdd_data |
369 | 388 | else: |
370 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
389 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) | |
371 | 390 | |
372 | 391 | else: |
373 | 392 | raise Exception("Unknown mode!") |
... | ... | @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): |
462 | 481 | if not writeback: |
463 | 482 | return self.rdd_data |
464 | 483 | else: |
465 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
484 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False) | |
466 | 485 | |
467 | 486 | |
468 | 487 | else: | ... | ... |
mspark/SC.py
... | ... | @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): |
99 | 99 | tmpf.close() |
100 | 100 | |
101 | 101 | |
102 | -def rddembed_ILS(row): | |
102 | +def rddembed_ILS(row, rate=None): | |
103 | 103 | """ |
104 | 104 | input: |
105 | 105 | e.g. row =('row1',[1,3400,'hello']) |
... | ... | @@ -107,7 +107,7 @@ def rddembed_ILS(row): |
107 | 107 | newrow = ('row2',[34,5400,'embeded']) |
108 | 108 | """ |
109 | 109 | items = row[1] |
110 | - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7]) | |
110 | + capacity, chosen = int(items[4]), int(items[7]) | |
111 | 111 | if chosen == 0: |
112 | 112 | return None |
113 | 113 | try: |
... | ... | @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): |
169 | 169 | tmpf_src.close() |
170 | 170 | |
171 | 171 | |
172 | -def format_out(row, cols): | |
172 | +def format_out(row, cols, withdata=False): | |
173 | 173 | """ |
174 | 174 | input: |
175 | 175 | e.g. row =('row1',[1,3400,'hello']) |
... | ... | @@ -179,10 +179,14 @@ def format_out(row, cols): |
179 | 179 | """ |
180 | 180 | puts = [] |
181 | 181 | key = row[0] |
182 | - if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
183 | - print row | |
184 | - for data, col in zip(row[1], cols): | |
185 | - puts.append((key, [key] + col + [str(data)])) | |
182 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
183 | + # print row | |
184 | + if not withdata: | |
185 | + for data, col in zip(row[1][1:], cols[1:]): | |
186 | + puts.append((key, [key] + col + [str(data)])) | |
187 | + else: | |
188 | + for data, col in zip(row[1], cols): | |
189 | + puts.append((key, [key] + col + [str(data)])) | |
186 | 190 | return puts |
187 | 191 | |
188 | 192 | |
... | ... | @@ -239,7 +243,7 @@ class Sparker(object): |
239 | 243 | else: |
240 | 244 | return hbase_rdd |
241 | 245 | |
242 | - def write_hbase(self, table_name, data, fromrdd=False, columns=None): | |
246 | + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): | |
243 | 247 | """ |
244 | 248 | Data Format: (Deprecated) |
245 | 249 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] |
... | ... | @@ -264,7 +268,7 @@ class Sparker(object): |
264 | 268 | else: |
265 | 269 | rdd_data = data |
266 | 270 | |
267 | - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( | |
271 | + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
268 | 272 | conf=hconf, |
269 | 273 | keyConverter=hparams["writeKeyConverter"], |
270 | 274 | valueConverter=hparams["writeValueConverter"]) | ... | ... |
test/test_data.py
... | ... | @@ -59,9 +59,10 @@ def test_ILSVRC(): |
59 | 59 | def test_ILSVRC_S(): |
60 | 60 | timer = Timer() |
61 | 61 | |
62 | - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
63 | - # dil.format() | |
64 | - # dil.store_img() | |
62 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
63 | + dil.delete_table() | |
64 | + dil.format() | |
65 | + dil.store_img() | |
65 | 66 | |
66 | 67 | |
67 | 68 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ... | ... |