Commit d47ae6ce8a8a4197938768893f939b9e10e1b244 (1 parent: 8bddd8b3).
Exists in master and in 1 other branch: staged.
Showing 4 changed files with 58 additions and 16 deletions.
Show diff stats
mdata/ILSVRC.py
| @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): | @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): | ||
| 291 | 291 | ||
| 292 | return table | 292 | return table |
| 293 | 293 | ||
| 294 | + def delete_table(self, table_name=None, disable=True): | ||
| 295 | + if table_name == None: | ||
| 296 | + table_name = self.table_name | ||
| 297 | + | ||
| 298 | + if self.connection is None: | ||
| 299 | + c = happybase.Connection('HPC-server') | ||
| 300 | + self.connection = c | ||
| 301 | + | ||
| 302 | + tables = self.connection.tables() | ||
| 303 | + if table_name not in tables: | ||
| 304 | + return False | ||
| 305 | + else: | ||
| 306 | + try: | ||
| 307 | + self.connection.delete_table(table_name, disable) | ||
| 308 | + except: | ||
| 309 | + print 'Exception when deleting table.' | ||
| 310 | + raise | ||
| 311 | + return True | ||
| 294 | 312 | ||
| 295 | def store_img(self): | 313 | def store_img(self): |
| 296 | if self.table == None: | 314 | if self.table == None: |
mdata/ILSVRC_S.py
| @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): | @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): | ||
| 81 | 81 | ||
| 82 | return table | 82 | return table |
| 83 | 83 | ||
| 84 | + def delete_table(self, table_name=None, disable=True): | ||
| 85 | + if table_name == None: | ||
| 86 | + table_name = self.table_name | ||
| 87 | + | ||
| 88 | + if self.connection is None: | ||
| 89 | + c = happybase.Connection('HPC-server') | ||
| 90 | + self.connection = c | ||
| 91 | + | ||
| 92 | + tables = self.connection.tables() | ||
| 93 | + if table_name not in tables: | ||
| 94 | + return False | ||
| 95 | + else: | ||
| 96 | + try: | ||
| 97 | + self.connection.delete_table(table_name, disable) | ||
| 98 | + except: | ||
| 99 | + print 'Exception when deleting table.' | ||
| 100 | + raise | ||
| 101 | + return True | ||
| 102 | + | ||
| 84 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | 103 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): |
| 85 | """ | 104 | """ |
| 86 | Tempfile is our friend. (?) | 105 | Tempfile is our friend. (?) |
| @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 256 | if not writeback: | 275 | if not writeback: |
| 257 | return self.rdd_data | 276 | return self.rdd_data |
| 258 | else: | 277 | else: |
| 259 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 278 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) |
| 260 | 279 | ||
| 261 | else: | 280 | else: |
| 262 | raise Exception("Unknown mode!") | 281 | raise Exception("Unknown mode!") |
| @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): | @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): | ||
| 361 | if readforward: | 380 | if readforward: |
| 362 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) | 381 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) |
| 363 | 382 | ||
| 364 | - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None) | 383 | + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) |
| 365 | self.rdd_data = self.rdd_data.union(rdd_data_ext) | 384 | self.rdd_data = self.rdd_data.union(rdd_data_ext) |
| 366 | 385 | ||
| 367 | if not writeback: | 386 | if not writeback: |
| 368 | return self.rdd_data | 387 | return self.rdd_data |
| 369 | else: | 388 | else: |
| 370 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 389 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) |
| 371 | 390 | ||
| 372 | else: | 391 | else: |
| 373 | raise Exception("Unknown mode!") | 392 | raise Exception("Unknown mode!") |
| @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 462 | if not writeback: | 481 | if not writeback: |
| 463 | return self.rdd_data | 482 | return self.rdd_data |
| 464 | else: | 483 | else: |
| 465 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | 484 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False) |
| 466 | 485 | ||
| 467 | 486 | ||
| 468 | else: | 487 | else: |
mspark/SC.py
| @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | ||
| 99 | tmpf.close() | 99 | tmpf.close() |
| 100 | 100 | ||
| 101 | 101 | ||
| 102 | -def rddembed_ILS(row): | 102 | +def rddembed_ILS(row, rate=None): |
| 103 | """ | 103 | """ |
| 104 | input: | 104 | input: |
| 105 | e.g. row =('row1',[1,3400,'hello']) | 105 | e.g. row =('row1',[1,3400,'hello']) |
| @@ -107,7 +107,7 @@ def rddembed_ILS(row): | @@ -107,7 +107,7 @@ def rddembed_ILS(row): | ||
| 107 | newrow = ('row2',[34,5400,'embeded']) | 107 | newrow = ('row2',[34,5400,'embeded']) |
| 108 | """ | 108 | """ |
| 109 | items = row[1] | 109 | items = row[1] |
| 110 | - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7]) | 110 | + capacity, chosen = int(items[4]), int(items[7]) |
| 111 | if chosen == 0: | 111 | if chosen == 0: |
| 112 | return None | 112 | return None |
| 113 | try: | 113 | try: |
| @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): | @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): | ||
| 169 | tmpf_src.close() | 169 | tmpf_src.close() |
| 170 | 170 | ||
| 171 | 171 | ||
| 172 | -def format_out(row, cols): | 172 | +def format_out(row, cols, withdata=False): |
| 173 | """ | 173 | """ |
| 174 | input: | 174 | input: |
| 175 | e.g. row =('row1',[1,3400,'hello']) | 175 | e.g. row =('row1',[1,3400,'hello']) |
| @@ -179,10 +179,14 @@ def format_out(row, cols): | @@ -179,10 +179,14 @@ def format_out(row, cols): | ||
| 179 | """ | 179 | """ |
| 180 | puts = [] | 180 | puts = [] |
| 181 | key = row[0] | 181 | key = row[0] |
| 182 | - if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | ||
| 183 | - print row | ||
| 184 | - for data, col in zip(row[1], cols): | ||
| 185 | - puts.append((key, [key] + col + [str(data)])) | 182 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': |
| 183 | + # print row | ||
| 184 | + if not withdata: | ||
| 185 | + for data, col in zip(row[1][1:], cols[1:]): | ||
| 186 | + puts.append((key, [key] + col + [str(data)])) | ||
| 187 | + else: | ||
| 188 | + for data, col in zip(row[1], cols): | ||
| 189 | + puts.append((key, [key] + col + [str(data)])) | ||
| 186 | return puts | 190 | return puts |
| 187 | 191 | ||
| 188 | 192 | ||
| @@ -239,7 +243,7 @@ class Sparker(object): | @@ -239,7 +243,7 @@ class Sparker(object): | ||
| 239 | else: | 243 | else: |
| 240 | return hbase_rdd | 244 | return hbase_rdd |
| 241 | 245 | ||
| 242 | - def write_hbase(self, table_name, data, fromrdd=False, columns=None): | 246 | + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): |
| 243 | """ | 247 | """ |
| 244 | Data Format: (Deprecated) | 248 | Data Format: (Deprecated) |
| 245 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] | 249 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] |
| @@ -264,7 +268,7 @@ class Sparker(object): | @@ -264,7 +268,7 @@ class Sparker(object): | ||
| 264 | else: | 268 | else: |
| 265 | rdd_data = data | 269 | rdd_data = data |
| 266 | 270 | ||
| 267 | - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( | 271 | + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( |
| 268 | conf=hconf, | 272 | conf=hconf, |
| 269 | keyConverter=hparams["writeKeyConverter"], | 273 | keyConverter=hparams["writeKeyConverter"], |
| 270 | valueConverter=hparams["writeValueConverter"]) | 274 | valueConverter=hparams["writeValueConverter"]) |
test/test_data.py
| @@ -59,9 +59,10 @@ def test_ILSVRC(): | @@ -59,9 +59,10 @@ def test_ILSVRC(): | ||
| 59 | def test_ILSVRC_S(): | 59 | def test_ILSVRC_S(): |
| 60 | timer = Timer() | 60 | timer = Timer() |
| 61 | 61 | ||
| 62 | - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ||
| 63 | - # dil.format() | ||
| 64 | - # dil.store_img() | 62 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |
| 63 | + dil.delete_table() | ||
| 64 | + dil.format() | ||
| 65 | + dil.store_img() | ||
| 65 | 66 | ||
| 66 | 67 | ||
| 67 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | 68 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |