Commit d47ae6ce8a8a4197938768893f939b9e10e1b244
1 parent
8bddd8b3
Exists in
master
and in
1 other branch
staged.
Showing
4 changed files
with
58 additions
and
16 deletions
Show diff stats
mdata/ILSVRC.py
| ... | ... | @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): |
| 291 | 291 | |
| 292 | 292 | return table |
| 293 | 293 | |
| 294 | + def delete_table(self, table_name=None, disable=True): | |
| 295 | + if table_name == None: | |
| 296 | + table_name = self.table_name | |
| 297 | + | |
| 298 | + if self.connection is None: | |
| 299 | + c = happybase.Connection('HPC-server') | |
| 300 | + self.connection = c | |
| 301 | + | |
| 302 | + tables = self.connection.tables() | |
| 303 | + if table_name not in tables: | |
| 304 | + return False | |
| 305 | + else: | |
| 306 | + try: | |
| 307 | + self.connection.delete_table(table_name, disable) | |
| 308 | + except: | |
| 309 | + print 'Exception when deleting table.' | |
| 310 | + raise | |
| 311 | + return True | |
| 294 | 312 | |
| 295 | 313 | def store_img(self): |
| 296 | 314 | if self.table == None: | ... | ... |
mdata/ILSVRC_S.py
| ... | ... | @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): |
| 81 | 81 | |
| 82 | 82 | return table |
| 83 | 83 | |
| 84 | + def delete_table(self, table_name=None, disable=True): | |
| 85 | + if table_name == None: | |
| 86 | + table_name = self.table_name | |
| 87 | + | |
| 88 | + if self.connection is None: | |
| 89 | + c = happybase.Connection('HPC-server') | |
| 90 | + self.connection = c | |
| 91 | + | |
| 92 | + tables = self.connection.tables() | |
| 93 | + if table_name not in tables: | |
| 94 | + return False | |
| 95 | + else: | |
| 96 | + try: | |
| 97 | + self.connection.delete_table(table_name, disable) | |
| 98 | + except: | |
| 99 | + print 'Exception when deleting table.' | |
| 100 | + raise | |
| 101 | + return True | |
| 102 | + | |
| 84 | 103 | def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): |
| 85 | 104 | """ |
| 86 | 105 | Tempfile is our friend. (?) |
| ... | ... | @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): |
| 256 | 275 | if not writeback: |
| 257 | 276 | return self.rdd_data |
| 258 | 277 | else: |
| 259 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
| 278 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) | |
| 260 | 279 | |
| 261 | 280 | else: |
| 262 | 281 | raise Exception("Unknown mode!") |
| ... | ... | @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): |
| 361 | 380 | if readforward: |
| 362 | 381 | self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) |
| 363 | 382 | |
| 364 | - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None) | |
| 383 | + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) | |
| 365 | 384 | self.rdd_data = self.rdd_data.union(rdd_data_ext) |
| 366 | 385 | |
| 367 | 386 | if not writeback: |
| 368 | 387 | return self.rdd_data |
| 369 | 388 | else: |
| 370 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
| 389 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True) | |
| 371 | 390 | |
| 372 | 391 | else: |
| 373 | 392 | raise Exception("Unknown mode!") |
| ... | ... | @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): |
| 462 | 481 | if not writeback: |
| 463 | 482 | return self.rdd_data |
| 464 | 483 | else: |
| 465 | - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) | |
| 484 | + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False) | |
| 466 | 485 | |
| 467 | 486 | |
| 468 | 487 | else: | ... | ... |
mspark/SC.py
| ... | ... | @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): |
| 99 | 99 | tmpf.close() |
| 100 | 100 | |
| 101 | 101 | |
| 102 | -def rddembed_ILS(row): | |
| 102 | +def rddembed_ILS(row, rate=None): | |
| 103 | 103 | """ |
| 104 | 104 | input: |
| 105 | 105 | e.g. row =('row1',[1,3400,'hello']) |
| ... | ... | @@ -107,7 +107,7 @@ def rddembed_ILS(row): |
| 107 | 107 | newrow = ('row2',[34,5400,'embeded']) |
| 108 | 108 | """ |
| 109 | 109 | items = row[1] |
| 110 | - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7]) | |
| 110 | + capacity, chosen = int(items[4]), int(items[7]) | |
| 111 | 111 | if chosen == 0: |
| 112 | 112 | return None |
| 113 | 113 | try: |
| ... | ... | @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): |
| 169 | 169 | tmpf_src.close() |
| 170 | 170 | |
| 171 | 171 | |
| 172 | -def format_out(row, cols): | |
| 172 | +def format_out(row, cols, withdata=False): | |
| 173 | 173 | """ |
| 174 | 174 | input: |
| 175 | 175 | e.g. row =('row1',[1,3400,'hello']) |
| ... | ... | @@ -179,10 +179,14 @@ def format_out(row, cols): |
| 179 | 179 | """ |
| 180 | 180 | puts = [] |
| 181 | 181 | key = row[0] |
| 182 | - if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 183 | - print row | |
| 184 | - for data, col in zip(row[1], cols): | |
| 185 | - puts.append((key, [key] + col + [str(data)])) | |
| 182 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 183 | + # print row | |
| 184 | + if not withdata: | |
| 185 | + for data, col in zip(row[1][1:], cols[1:]): | |
| 186 | + puts.append((key, [key] + col + [str(data)])) | |
| 187 | + else: | |
| 188 | + for data, col in zip(row[1], cols): | |
| 189 | + puts.append((key, [key] + col + [str(data)])) | |
| 186 | 190 | return puts |
| 187 | 191 | |
| 188 | 192 | |
| ... | ... | @@ -239,7 +243,7 @@ class Sparker(object): |
| 239 | 243 | else: |
| 240 | 244 | return hbase_rdd |
| 241 | 245 | |
| 242 | - def write_hbase(self, table_name, data, fromrdd=False, columns=None): | |
| 246 | + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): | |
| 243 | 247 | """ |
| 244 | 248 | Data Format: (Deprecated) |
| 245 | 249 | e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] |
| ... | ... | @@ -264,7 +268,7 @@ class Sparker(object): |
| 264 | 268 | else: |
| 265 | 269 | rdd_data = data |
| 266 | 270 | |
| 267 | - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( | |
| 271 | + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
| 268 | 272 | conf=hconf, |
| 269 | 273 | keyConverter=hparams["writeKeyConverter"], |
| 270 | 274 | valueConverter=hparams["writeValueConverter"]) | ... | ... |
test/test_data.py
| ... | ... | @@ -59,9 +59,10 @@ def test_ILSVRC(): |
| 59 | 59 | def test_ILSVRC_S(): |
| 60 | 60 | timer = Timer() |
| 61 | 61 | |
| 62 | - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
| 63 | - # dil.format() | |
| 64 | - # dil.store_img() | |
| 62 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
| 63 | + dil.delete_table() | |
| 64 | + dil.format() | |
| 65 | + dil.store_img() | |
| 65 | 66 | |
| 66 | 67 | |
| 67 | 68 | dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ... | ... |