Commit d47ae6ce8a8a4197938768893f939b9e10e1b244

Authored by Chunk
1 parent 8bddd8b3
Exists in master and in 1 other branch refactor

staged.

mdata/ILSVRC.py
@@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase): @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase):
291 291
292 return table 292 return table
293 293
  294 + def delete_table(self, table_name=None, disable=True):
  295 + if table_name == None:
  296 + table_name = self.table_name
  297 +
  298 + if self.connection is None:
  299 + c = happybase.Connection('HPC-server')
  300 + self.connection = c
  301 +
  302 + tables = self.connection.tables()
  303 + if table_name not in tables:
  304 + return False
  305 + else:
  306 + try:
  307 + self.connection.delete_table(table_name, disable)
  308 + except:
  309 + print 'Exception when deleting table.'
  310 + raise
  311 + return True
294 312
295 def store_img(self): 313 def store_img(self):
296 if self.table == None: 314 if self.table == None:
mdata/ILSVRC_S.py
@@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase): @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase):
81 81
82 return table 82 return table
83 83
  84 + def delete_table(self, table_name=None, disable=True):
  85 + if table_name == None:
  86 + table_name = self.table_name
  87 +
  88 + if self.connection is None:
  89 + c = happybase.Connection('HPC-server')
  90 + self.connection = c
  91 +
  92 + tables = self.connection.tables()
  93 + if table_name not in tables:
  94 + return False
  95 + else:
  96 + try:
  97 + self.connection.delete_table(table_name, disable)
  98 + except:
  99 + print 'Exception when deleting table.'
  100 + raise
  101 + return True
  102 +
84 def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): 103 def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
85 """ 104 """
86 Tempfile is our friend. (?) 105 Tempfile is our friend. (?)
@@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase): @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase):
256 if not writeback: 275 if not writeback:
257 return self.rdd_data 276 return self.rdd_data
258 else: 277 else:
259 - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) 278 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True)
260 279
261 else: 280 else:
262 raise Exception("Unknown mode!") 281 raise Exception("Unknown mode!")
@@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase): @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase):
361 if readforward: 380 if readforward:
362 self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) 381 self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
363 382
364 - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None) 383 + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None)
365 self.rdd_data = self.rdd_data.union(rdd_data_ext) 384 self.rdd_data = self.rdd_data.union(rdd_data_ext)
366 385
367 if not writeback: 386 if not writeback:
368 return self.rdd_data 387 return self.rdd_data
369 else: 388 else:
370 - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) 389 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True)
371 390
372 else: 391 else:
373 raise Exception("Unknown mode!") 392 raise Exception("Unknown mode!")
@@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase): @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase):
462 if not writeback: 481 if not writeback:
463 return self.rdd_data 482 return self.rdd_data
464 else: 483 else:
465 - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols) 484 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False)
466 485
467 486
468 else: 487 else:
@@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None):
99 tmpf.close() 99 tmpf.close()
100 100
101 101
102 -def rddembed_ILS(row): 102 +def rddembed_ILS(row, rate=None):
103 """ 103 """
104 input: 104 input:
105 e.g. row =('row1',[1,3400,'hello']) 105 e.g. row =('row1',[1,3400,'hello'])
@@ -107,7 +107,7 @@ def rddembed_ILS(row): @@ -107,7 +107,7 @@ def rddembed_ILS(row):
107 newrow = ('row2',[34,5400,'embeded']) 107 newrow = ('row2',[34,5400,'embeded'])
108 """ 108 """
109 items = row[1] 109 items = row[1]
110 - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7]) 110 + capacity, chosen = int(items[4]), int(items[7])
111 if chosen == 0: 111 if chosen == 0:
112 return None 112 return None
113 try: 113 try:
@@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs): @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs):
169 tmpf_src.close() 169 tmpf_src.close()
170 170
171 171
172 -def format_out(row, cols): 172 +def format_out(row, cols, withdata=False):
173 """ 173 """
174 input: 174 input:
175 e.g. row =('row1',[1,3400,'hello']) 175 e.g. row =('row1',[1,3400,'hello'])
@@ -179,10 +179,14 @@ def format_out(row, cols): @@ -179,10 +179,14 @@ def format_out(row, cols):
179 """ 179 """
180 puts = [] 180 puts = []
181 key = row[0] 181 key = row[0]
182 - if key == '04650c488a2b163ca8a1f52da6022f03.jpg':  
183 - print row  
184 - for data, col in zip(row[1], cols):  
185 - puts.append((key, [key] + col + [str(data)])) 182 + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
  183 + # print row
  184 + if not withdata:
  185 + for data, col in zip(row[1][1:], cols[1:]):
  186 + puts.append((key, [key] + col + [str(data)]))
  187 + else:
  188 + for data, col in zip(row[1], cols):
  189 + puts.append((key, [key] + col + [str(data)]))
186 return puts 190 return puts
187 191
188 192
@@ -239,7 +243,7 @@ class Sparker(object): @@ -239,7 +243,7 @@ class Sparker(object):
239 else: 243 else:
240 return hbase_rdd 244 return hbase_rdd
241 245
242 - def write_hbase(self, table_name, data, fromrdd=False, columns=None): 246 + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False):
243 """ 247 """
244 Data Format: (Deprecated) 248 Data Format: (Deprecated)
245 e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] 249 e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
@@ -264,7 +268,7 @@ class Sparker(object): @@ -264,7 +268,7 @@ class Sparker(object):
264 else: 268 else:
265 rdd_data = data 269 rdd_data = data
266 270
267 - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( 271 + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
268 conf=hconf, 272 conf=hconf,
269 keyConverter=hparams["writeKeyConverter"], 273 keyConverter=hparams["writeKeyConverter"],
270 valueConverter=hparams["writeValueConverter"]) 274 valueConverter=hparams["writeValueConverter"])
test/test_data.py
@@ -59,9 +59,10 @@ def test_ILSVRC(): @@ -59,9 +59,10 @@ def test_ILSVRC():
59 def test_ILSVRC_S(): 59 def test_ILSVRC_S():
60 timer = Timer() 60 timer = Timer()
61 61
62 - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')  
63 - # dil.format()  
64 - # dil.store_img() 62 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
  63 + dil.delete_table()
  64 + dil.format()
  65 + dil.store_img()
65 66
66 67
67 dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') 68 dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')