Commit 0bd44a28a8510a0496ca6271fe2a362dd152c584

Authored by Chunk
1 parent 66c71f34
Exists in master and in 1 other branch: refactor

Status: staged.

Showing 2 changed files with 46 additions and 12 deletions   Show diff stats
mdata/ILSVRC_S.py
... ... @@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase):
244 244 elif mode == 'spark':
245 245 if self.sparker == None:
246 246 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
247   - master='spark://HPC-server:7077')
  247 + master='spark://HPC-server:7077')
248 248  
249 249 cols = [
250 250 'cf_pic:data',
... ... @@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase):
267 267  
268 268  
269 269 self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
270   - collect=False).mapValues(
  270 + collect=False).mapValues(
271 271 lambda data: [data] + SC.rddinfo_ILS(data))
272 272  
273 273 if not writeback:
274 274 return self.rdd_data
275 275 else:
276 276 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
277   - withdata=withdata)
  277 + withdata=withdata)
278 278  
279 279 else:
280 280 raise Exception("Unknown mode!")
... ... @@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase):
367 367 elif mode == 'spark':
368 368 if self.sparker == None:
369 369 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
370   - master='spark://HPC-server:7077')
  370 + master='spark://HPC-server:7077')
371 371  
372 372 cols = [
373 373 'cf_pic:data',
... ... @@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase):
392 392 return self.rdd_data
393 393 else:
394 394 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
395   - withdata=withdata)
  395 + withdata=withdata)
396 396  
397 397 else:
398 398 raise Exception("Unknown mode!")
... ... @@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase):
461 461 elif mode == 'spark':
462 462 if self.sparker == None:
463 463 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
464   - master='spark://HPC-server:7077')
  464 + master='spark://HPC-server:7077')
465 465  
466 466 cols = [
467 467 'cf_pic:data',
... ... @@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase):
488 488 return self.rdd_data
489 489 else:
490 490 self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols,
491   - withdata=withdata)
  491 + withdata=withdata)
492 492  
493 493  
494 494 else:
... ... @@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase):
505 505 INDEX = []
506 506 X = []
507 507 Y = []
  508 + if mode == "local":
  509 + dict_dataset = {}
  510 +
  511 + if feattype == 'coef': # raw
  512 + with open(self.list_file, 'rb') as tsvfile:
  513 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  514 + for line in tsvfile:
  515 + hash = line[0]
  516 + tag = line[-1]
  517 + image = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg')
  518 + if image:
  519 + im = Jpeg(image, key=sample_key)
  520 + dict_dataset[hash] = (tag, im.getCoefMatrix(channel='Y'))
  521 +
  522 + for tag, feat in dict_dataset.values():
  523 + X.append(feat.tolist())
  524 + Y.append(int(tag))
508 525  
509   - if mode == "hbase":
  526 + else:
  527 + with open(self.list_file, 'rb') as tsvfile:
  528 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  529 + for line in tsvfile:
  530 + hash = line[0]
  531 + tag = line[-1]
  532 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  533 + if path_feat:
  534 + with open(path_feat, 'rb') as featfile:
  535 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
  536 +
  537 + for tag, feat in dict_dataset.values():
  538 + # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  539 + X.append(np.array(feat).ravel().tolist())
  540 + Y.append(int(tag))
  541 +
  542 + elif mode == "hbase":
510 543 if self.table == None:
511 544 self.table = self.get_table()
512 545  
513 546 col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
514 547 for key, data in self.table.scan(columns=[col_feat, col_tag]):
515   - X.append(
516   - [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
  548 + # X.append(
  549 + # [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
  550 + X.append(np.array(json.loads(data[col_feat])).ravel().tolist())
517 551 Y.append(int(data[col_tag]))
518 552  
519 553 elif mode == "spark" or mode == "cluster":
520 554 if self.sparker == None:
521 555 self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
522   - master='spark://HPC-server:7077')
  556 + master='spark://HPC-server:7077')
523 557  
524 558 rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False)
525 559 if not collect:
... ...
test/test_model.py
... ... @@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S():
148 148 def test_THEANO_crop():
149 149  
150 150 timer.mark()
151   - dilc = ILSVRC.DataILSVRC(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil')
  151 + dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil')
152 152 X, Y = dilc.load_data(mode='local', feattype='coef')
153 153 timer.report()
154 154  
... ...