Commit d47ae6ce8a8a4197938768893f939b9e10e1b244

Authored by Chunk
1 parent 8bddd8b3
Exists in master and in 1 other branch refactor

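Add delete_table() to the ILSVRC data dumpers, add a withdata flag to the HBase write path (write_hbase / format_out), and pass rate through to rddembed_ILS.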

mdata/ILSVRC.py
... ... @@ -291,6 +291,24 @@ class DataILSVRC(DataDumperBase):
291 291  
292 292 return table
293 293  
  294 + def delete_table(self, table_name=None, disable=True):
  295 + if table_name == None:
  296 + table_name = self.table_name
  297 +
  298 + if self.connection is None:
  299 + c = happybase.Connection('HPC-server')
  300 + self.connection = c
  301 +
  302 + tables = self.connection.tables()
  303 + if table_name not in tables:
  304 + return False
  305 + else:
  306 + try:
  307 + self.connection.delete_table(table_name, disable)
  308 + except:
  309 + print 'Exception when deleting table.'
  310 + raise
  311 + return True
294 312  
295 313 def store_img(self):
296 314 if self.table == None:
... ...
mdata/ILSVRC_S.py
... ... @@ -81,6 +81,25 @@ class DataILSVRC_S(DataDumperBase):
81 81  
82 82 return table
83 83  
  84 + def delete_table(self, table_name=None, disable=True):
  85 + if table_name == None:
  86 + table_name = self.table_name
  87 +
  88 + if self.connection is None:
  89 + c = happybase.Connection('HPC-server')
  90 + self.connection = c
  91 +
  92 + tables = self.connection.tables()
  93 + if table_name not in tables:
  94 + return False
  95 + else:
  96 + try:
  97 + self.connection.delete_table(table_name, disable)
  98 + except:
  99 + print 'Exception when deleting table.'
  100 + raise
  101 + return True
  102 +
84 103 def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
85 104 """
86 105 Tempfile is our friend. (?)
... ... @@ -256,7 +275,7 @@ class DataILSVRC_S(DataDumperBase):
256 275 if not writeback:
257 276 return self.rdd_data
258 277 else:
259   - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols)
  278 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True)
260 279  
261 280 else:
262 281 raise Exception("Unknown mode!")
... ... @@ -361,13 +380,13 @@ class DataILSVRC_S(DataDumperBase):
361 380 if readforward:
362 381 self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
363 382  
364   - rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None)
  383 + rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None)
365 384 self.rdd_data = self.rdd_data.union(rdd_data_ext)
366 385  
367 386 if not writeback:
368 387 return self.rdd_data
369 388 else:
370   - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols)
  389 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=True)
371 390  
372 391 else:
373 392 raise Exception("Unknown mode!")
... ... @@ -462,7 +481,7 @@ class DataILSVRC_S(DataDumperBase):
462 481 if not writeback:
463 482 return self.rdd_data
464 483 else:
465   - self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols)
  484 + self.sparkcontex.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, withdata=False)
466 485  
467 486  
468 487 else:
... ...
mspark/SC.py
... ... @@ -99,7 +99,7 @@ def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None):
99 99 tmpf.close()
100 100  
101 101  
102   -def rddembed_ILS(row):
  102 +def rddembed_ILS(row, rate=None):
103 103 """
104 104 input:
105 105 e.g. row =('row1',[1,3400,'hello'])
... ... @@ -107,7 +107,7 @@ def rddembed_ILS(row):
107 107 newrow = ('row2',[34,5400,'embeded'])
108 108 """
109 109 items = row[1]
110   - capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7])
  110 + capacity, chosen = int(items[4]), int(items[7])
111 111 if chosen == 0:
112 112 return None
113 113 try:
... ... @@ -169,7 +169,7 @@ def rddfeat_ILS(items, feattype='ibd', **kwargs):
169 169 tmpf_src.close()
170 170  
171 171  
172   -def format_out(row, cols):
  172 +def format_out(row, cols, withdata=False):
173 173 """
174 174 input:
175 175 e.g. row =('row1',[1,3400,'hello'])
... ... @@ -179,10 +179,14 @@ def format_out(row, cols):
179 179 """
180 180 puts = []
181 181 key = row[0]
182   - if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
183   - print row
184   - for data, col in zip(row[1], cols):
185   - puts.append((key, [key] + col + [str(data)]))
  182 + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
  183 + # print row
  184 + if not withdata:
  185 + for data, col in zip(row[1][1:], cols[1:]):
  186 + puts.append((key, [key] + col + [str(data)]))
  187 + else:
  188 + for data, col in zip(row[1], cols):
  189 + puts.append((key, [key] + col + [str(data)]))
186 190 return puts
187 191  
188 192  
... ... @@ -239,7 +243,7 @@ class Sparker(object):
239 243 else:
240 244 return hbase_rdd
241 245  
242   - def write_hbase(self, table_name, data, fromrdd=False, columns=None):
  246 + def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False):
243 247 """
244 248 Data Format: (Deprecated)
245 249 e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
... ... @@ -264,7 +268,7 @@ class Sparker(object):
264 268 else:
265 269 rdd_data = data
266 270  
267   - rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset(
  271 + rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
268 272 conf=hconf,
269 273 keyConverter=hparams["writeKeyConverter"],
270 274 valueConverter=hparams["writeValueConverter"])
... ...
test/test_data.py
... ... @@ -59,9 +59,10 @@ def test_ILSVRC():
59 59 def test_ILSVRC_S():
60 60 timer = Timer()
61 61  
62   - # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
63   - # dil.format()
64   - # dil.store_img()
  62 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
  63 + dil.delete_table()
  64 + dil.format()
  65 + dil.store_img()
65 66  
66 67  
67 68 dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
... ...