Commit d47ae6ce8a8a4197938768893f939b9e10e1b244
1 parent
8bddd8b3
Exists in
master
and in
1 other branch
staged.
Showing
4 changed files
with
58 additions
and
16 deletions
Show diff stats
mdata/ILSVRC.py
@@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): | @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): | ||
291 | 291 | ||
292 | return table | 292 | return table |
293 | 293 | ||
294 | + def delete_table(self, table_name=None, disable=True): | ||
295 | + if table_name == None: | ||
296 | + table_name = self.table_name | ||
297 | + | ||
298 | + if self.connection is None: | ||
299 | + c = happybase.Connection('HPC-server') | ||
300 | + self.connection = c | ||
301 | + | ||
302 | + tables = self.connection.tables() | ||
303 | + if table_name not in tables: | ||
304 | + return False | ||
305 | + else: | ||
306 | + try: | ||
307 | + self.connection.delete_table(table_name, disable) | ||
308 | + except: | ||
309 | + print 'Exception when deleting table.' | ||
310 | + raise | ||
311 | + return True | ||
294 | 312 | ||
295 | def store_img(self): | 313 | def store_img(self): |
296 | if self.table == None: | 314 | if self.table == None: |
mdata/ILSVRC_S.py
@@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): | @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): | ||
81 | 81 | ||
82 | return table | 82 | return table |
83 | 83 | ||
84 | + def delete_table(self, table_name=None, disable=True): | ||
85 | + if table_name == None: | ||
86 | + table_name = self.table_name | ||
87 | + | ||
88 | + if self.connection is None: | ||
89 | + c = happybase.Connection('HPC-server') | ||
90 | + self.connection = c | ||
91 | + | ||
92 | + tables = self.connection.tables() | ||
93 | + if table_name not in tables: | ||
94 | + return False | ||
95 | + else: | ||
96 | + try: | ||
97 | + self.connection.delete_table(table_name, disable) | ||
98 | + except: | ||
99 | + print 'Exception when deleting table.' | ||
100 | + raise | ||
101 | + return True | ||
102 | + | ||
84 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | 103 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): |
85 | """ | 104 | """ |
86 | Tempfile is our friend. (?) | 105 | Tempfile is our friend. (?) |
@@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): | ||
256 | if not writeback: | 275 | if not writeback: |
257 | return self.rdd_data | 276 | return self.rdd_data |
258 | else: | 277 | else: |
259 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 278 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) |
260 | 279 | ||
261 | else: | 280 | else: |
262 | raise Exception("Unknown mode!") | 281 | raise Exception("Unknown mode!") |
@@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): | @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): | ||
361 | if readforward: | 380 | if readforward: |
362 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) | 381 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) |
363 | 382 | ||
364 | - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None) | 383 | + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) |
365 | self.rdd_data = self.rdd_data.union(rdd_data_ext) | 384 | self.rdd_data = self.rdd_data.union(rdd_data_ext) |
366 | 385 | ||
367 | if not writeback: | 386 | if not writeback: |
368 | return self.rdd_data | 387 | return self.rdd_data |
369 | else: | 388 | else: |
370 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 389 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) |
371 | 390 | ||
372 | else: | 391 | else: |
373 | raise Exception("Unknown mode!") | 392 | raise Exception("Unknown mode!") |
@@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): | ||
462 | if not writeback: | 481 | if not writeback: |
463 | return self.rdd_data | 482 | return self.rdd_data |
464 | else: | 483 | else: |
465 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 484 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False) |
466 | 485 | ||
467 | 486 | ||
468 | else: | 487 | else: |
mspark/SC.py
@@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | ||
99 | tmpf.close() | 99 | tmpf.close() |
100 | 100 | ||
101 | 101 | ||
102 | -def rddembed_ILS(row): | 102 | +def rddembed_ILS(row, rate=None): |
103 | """ | 103 | """ |
104 | input: | 104 | input: |
105 | e.g. row =('row1',[1,3400,'hello']) | 105 | e.g. row =('row1',[1,3400,'hello']) |
@@ -107,7 +107,7 @@ def rddembed_ILS(row): | @@ -107,7 +107,7 @@ def rddembed_ILS(row): | ||
107 | newrow = ('row2',[34,5400,'embeded']) | 107 | newrow = ('row2',[34,5400,'embeded']) |
108 | """ | 108 | """ |
109 | items = row[1] | 109 | items = row[1] |
110 | - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7]) | 110 | + capacity, chosen = int(items[4]), int(items[7]) |
111 | if chosen == 0: | 111 | if chosen == 0: |
112 | return None | 112 | return None |
113 | try: | 113 | try: |
@@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): | @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): | ||
169 | tmpf_src.close() | 169 | tmpf_src.close() |
170 | 170 | ||
171 | 171 | ||
172 | -def format_out(row, cols): | 172 | +def format_out(row, cols, withdata=False): |
173 | """ | 173 | """ |
174 | input: | 174 | input: |
175 | e.g. row =('row1',[1,3400,'hello']) | 175 | e.g. row =('row1',[1,3400,'hello']) |
@@ -179,10 +179,14 @@ def format_out(row, cols): | @@ -179,10 +179,14 @@ def format_out(row, cols): | ||
179 | """ | 179 | """ |
180 | puts = [] | 180 | puts = [] |
181 | key = row[0] | 181 | key = row[0] |
182 | - if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | ||
183 | - print row | ||
184 | - for data, col in zip(row[1], cols): | ||
185 | - puts.append((key, [key] + col + [str(data)])) | 182 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': |
183 | + # print row | ||
184 | + if not withdata: | ||
185 | + for data, col in zip(row[1][1:], cols[1:]): | ||
186 | + puts.append((key, [key] + col + [str(data)])) | ||
187 | + else: | ||
188 | + for data, col in zip(row[1], cols): | ||
189 | + puts.append((key, [key] + col + [str(data)])) | ||
186 | return puts | 190 | return puts |
187 | 191 | ||
188 | 192 | ||
@@ -239,7 +243,7 @@ class Sparker(object): | @@ -239,7 +243,7 @@ class Sparker(object): | ||
239 | else: | 243 | else: |
240 | return hbase_rdd | 244 | return hbase_rdd |
241 | 245 | ||
242 | - def write_hbase(self, table_name, data, fromrdd=False, columns=None): | 246 | + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): |
243 | """ | 247 | """ |
244 | Data Format: (Deprecated) | 248 | Data Format: (Deprecated) |
245 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] | 249 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] |
@@ -264,7 +268,7 @@ class Sparker(object): | @@ -264,7 +268,7 @@ class Sparker(object): | ||
264 | else: | 268 | else: |
265 | rdd_data = data | 269 | rdd_data = data |
266 | 270 | ||
267 | - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( | 271 | + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( |
268 | conf=hconf, | 272 | conf=hconf, |
269 | keyConverter=hparams["writeKeyConverter"], | 273 | keyConverter=hparams["writeKeyConverter"], |
270 | valueConverter=hparams["writeValueConverter"]) | 274 | valueConverter=hparams["writeValueConverter"]) |
test/test_data.py
@@ -59,9 +59,10 @@ def test_ILSVRC(): | @@ -59,9 +59,10 @@ def test_ILSVRC(): | ||
59 | def test_ILSVRC_S(): | 59 | def test_ILSVRC_S(): |
60 | timer = Timer() | 60 | timer = Timer() |
61 | 61 | ||
62 | - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ||
63 | - # dil.format() | ||
64 | - # dil.store_img() | 62 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |
63 | + dil.delete_table() | ||
64 | + dil.format() | ||
65 | + dil.store_img() | ||
65 | 66 | ||
66 | 67 | ||
67 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | 68 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |