Commit 0bd44a28a8510a0496ca6271fe2a362dd152c584

Authored by Chunk
1 parent 66c71f34
Exists in master and in 1 other branch: refactor

staged.

Showing 2 changed files with 46 additions and 12 deletions   Show diff stats
mdata/ILSVRC_S.py
@@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase): @@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase):
244 elif mode == 'spark': 244 elif mode == 'spark':
245 if self.sparker == None: 245 if self.sparker == None:
246 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', 246 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
247 - master='spark://HPC-server:7077') 247 + master='spark://HPC-server:7077')
248 248
249 cols = [ 249 cols = [
250 'cf_pic:data', 250 'cf_pic:data',
@@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase): @@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase):
267 267
268 268
269 self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, 269 self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
270 - collect=False).mapValues( 270 + collect=False).mapValues(
271 lambda data: [data] + SC.rddinfo_ILS(data)) 271 lambda data: [data] + SC.rddinfo_ILS(data))
272 272
273 if not writeback: 273 if not writeback:
274 return self.rdd_data 274 return self.rdd_data
275 else: 275 else:
276 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, 276 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
277 - withdata=withdata) 277 + withdata=withdata)
278 278
279 else: 279 else:
280 raise Exception("Unknown mode!") 280 raise Exception("Unknown mode!")
@@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase): @@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase):
367 elif mode == 'spark': 367 elif mode == 'spark':
368 if self.sparker == None: 368 if self.sparker == None:
369 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', 369 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
370 - master='spark://HPC-server:7077') 370 + master='spark://HPC-server:7077')
371 371
372 cols = [ 372 cols = [
373 'cf_pic:data', 373 'cf_pic:data',
@@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase): @@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase):
392 return self.rdd_data 392 return self.rdd_data
393 else: 393 else:
394 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, 394 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
395 - withdata=withdata) 395 + withdata=withdata)
396 396
397 else: 397 else:
398 raise Exception("Unknown mode!") 398 raise Exception("Unknown mode!")
@@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase): @@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase):
461 elif mode == 'spark': 461 elif mode == 'spark':
462 if self.sparker == None: 462 if self.sparker == None:
463 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', 463 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
464 - master='spark://HPC-server:7077') 464 + master='spark://HPC-server:7077')
465 465
466 cols = [ 466 cols = [
467 'cf_pic:data', 467 'cf_pic:data',
@@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase): @@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase):
488 return self.rdd_data 488 return self.rdd_data
489 else: 489 else:
490 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, 490 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
491 - withdata=withdata) 491 + withdata=withdata)
492 492
493 493
494 else: 494 else:
@@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase): @@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase):
505 INDEX = [] 505 INDEX = []
506 X = [] 506 X = []
507 Y = [] 507 Y = []
  508 + if mode == "local":
  509 + dict_dataset = {}
  510 +
  511 + if feattype == 'coef': # raw
  512 + with open(self.list_file, 'rb') as tsvfile:
  513 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  514 + for line in tsvfile:
  515 + hash = line[0]
  516 + tag = line[-1]
  517 + image = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg')
  518 + if image:
  519 + im = Jpeg(image, key=sample_key)
  520 + dict_dataset[hash] = (tag, im.getCoefMatrix(channel='Y'))
  521 +
  522 + for tag, feat in dict_dataset.values():
  523 + X.append(feat.tolist())
  524 + Y.append(int(tag))
508 525
509 - if mode == "hbase": 526 + else:
  527 + with open(self.list_file, 'rb') as tsvfile:
  528 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  529 + for line in tsvfile:
  530 + hash = line[0]
  531 + tag = line[-1]
  532 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  533 + if path_feat:
  534 + with open(path_feat, 'rb') as featfile:
  535 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
  536 +
  537 + for tag, feat in dict_dataset.values():
  538 + # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  539 + X.append(np.array(feat).ravel().tolist())
  540 + Y.append(int(tag))
  541 +
  542 + elif mode == "hbase":
510 if self.table == None: 543 if self.table == None:
511 self.table = self.get_table() 544 self.table = self.get_table()
512 545
513 col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype 546 col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
514 for key, data in self.table.scan(columns=[col_feat, col_tag]): 547 for key, data in self.table.scan(columns=[col_feat, col_tag]):
515 - X.append(  
516 - [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist]) 548 + # X.append(
  549 + # [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
  550 + X.append(np.array(json.loads(data[col_feat])).ravel().tolist())
517 Y.append(int(data[col_tag])) 551 Y.append(int(data[col_tag]))
518 552
519 elif mode == "spark" or mode == "cluster": 553 elif mode == "spark" or mode == "cluster":
520 if self.sparker == None: 554 if self.sparker == None:
521 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', 555 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
522 - master='spark://HPC-server:7077') 556 + master='spark://HPC-server:7077')
523 557
524 rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False) 558 rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False)
525 if not collect: 559 if not collect:
test/test_model.py
@@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S(): @@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S():
148 def test_THEANO_crop(): 148 def test_THEANO_crop():
149 149
150 timer.mark() 150 timer.mark()
151 - dilc = ILSVRC.DataILSVRC(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil') 151 + dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil')
152 X, Y = dilc.load_data(mode='local', feattype='coef') 152 X, Y = dilc.load_data(mode='local', feattype='coef')
153 timer.report() 153 timer.report()
154 154