Commit 0bd44a28a8510a0496ca6271fe2a362dd152c584
1 parent: 66c71f34
Exists in master and in 1 other branch
staged.
Showing 2 changed files with 46 additions and 12 deletions
Show diff stats
mdata/ILSVRC_S.py
| @@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 244 | elif mode == 'spark': | 244 | elif mode == 'spark': |
| 245 | if self.sparker == None: | 245 | if self.sparker == None: |
| 246 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', | 246 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', |
| 247 | - master='spark://HPC-server:7077') | 247 | + master='spark://HPC-server:7077') |
| 248 | 248 | ||
| 249 | cols = [ | 249 | cols = [ |
| 250 | 'cf_pic:data', | 250 | 'cf_pic:data', |
| @@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase): | @@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase): | ||
| 267 | 267 | ||
| 268 | 268 | ||
| 269 | self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, | 269 | self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, |
| 270 | - collect=False).mapValues( | 270 | + collect=False).mapValues( |
| 271 | lambda data: [data] + SC.rddinfo_ILS(data)) | 271 | lambda data: [data] + SC.rddinfo_ILS(data)) |
| 272 | 272 | ||
| 273 | if not writeback: | 273 | if not writeback: |
| 274 | return self.rdd_data | 274 | return self.rdd_data |
| 275 | else: | 275 | else: |
| 276 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, | 276 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, |
| 277 | - withdata=withdata) | 277 | + withdata=withdata) |
| 278 | 278 | ||
| 279 | else: | 279 | else: |
| 280 | raise Exception("Unknown mode!") | 280 | raise Exception("Unknown mode!") |
| @@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 367 | elif mode == 'spark': | 367 | elif mode == 'spark': |
| 368 | if self.sparker == None: | 368 | if self.sparker == None: |
| 369 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', | 369 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', |
| 370 | - master='spark://HPC-server:7077') | 370 | + master='spark://HPC-server:7077') |
| 371 | 371 | ||
| 372 | cols = [ | 372 | cols = [ |
| 373 | 'cf_pic:data', | 373 | 'cf_pic:data', |
| @@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 392 | return self.rdd_data | 392 | return self.rdd_data |
| 393 | else: | 393 | else: |
| 394 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, | 394 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, |
| 395 | - withdata=withdata) | 395 | + withdata=withdata) |
| 396 | 396 | ||
| 397 | else: | 397 | else: |
| 398 | raise Exception("Unknown mode!") | 398 | raise Exception("Unknown mode!") |
| @@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 461 | elif mode == 'spark': | 461 | elif mode == 'spark': |
| 462 | if self.sparker == None: | 462 | if self.sparker == None: |
| 463 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', | 463 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', |
| 464 | - master='spark://HPC-server:7077') | 464 | + master='spark://HPC-server:7077') |
| 465 | 465 | ||
| 466 | cols = [ | 466 | cols = [ |
| 467 | 'cf_pic:data', | 467 | 'cf_pic:data', |
| @@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase): | @@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase): | ||
| 488 | return self.rdd_data | 488 | return self.rdd_data |
| 489 | else: | 489 | else: |
| 490 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, | 490 | self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, |
| 491 | - withdata=withdata) | 491 | + withdata=withdata) |
| 492 | 492 | ||
| 493 | 493 | ||
| 494 | else: | 494 | else: |
| @@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase): | @@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase): | ||
| 505 | INDEX = [] | 505 | INDEX = [] |
| 506 | X = [] | 506 | X = [] |
| 507 | Y = [] | 507 | Y = [] |
| 508 | + if mode == "local": | ||
| 509 | + dict_dataset = {} | ||
| 510 | + | ||
| 511 | + if feattype == 'coef': # raw | ||
| 512 | + with open(self.list_file, 'rb') as tsvfile: | ||
| 513 | + tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
| 514 | + for line in tsvfile: | ||
| 515 | + hash = line[0] | ||
| 516 | + tag = line[-1] | ||
| 517 | + image = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg') | ||
| 518 | + if image: | ||
| 519 | + im = Jpeg(image, key=sample_key) | ||
| 520 | + dict_dataset[hash] = (tag, im.getCoefMatrix(channel='Y')) | ||
| 521 | + | ||
| 522 | + for tag, feat in dict_dataset.values(): | ||
| 523 | + X.append(feat.tolist()) | ||
| 524 | + Y.append(int(tag)) | ||
| 508 | 525 | ||
| 509 | - if mode == "hbase": | 526 | + else: |
| 527 | + with open(self.list_file, 'rb') as tsvfile: | ||
| 528 | + tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
| 529 | + for line in tsvfile: | ||
| 530 | + hash = line[0] | ||
| 531 | + tag = line[-1] | ||
| 532 | + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | ||
| 533 | + if path_feat: | ||
| 534 | + with open(path_feat, 'rb') as featfile: | ||
| 535 | + dict_dataset[hash] = (tag, json.loads(featfile.read())) | ||
| 536 | + | ||
| 537 | + for tag, feat in dict_dataset.values(): | ||
| 538 | + # X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | ||
| 539 | + X.append(np.array(feat).ravel().tolist()) | ||
| 540 | + Y.append(int(tag)) | ||
| 541 | + | ||
| 542 | + elif mode == "hbase": | ||
| 510 | if self.table == None: | 543 | if self.table == None: |
| 511 | self.table = self.get_table() | 544 | self.table = self.get_table() |
| 512 | 545 | ||
| 513 | col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | 546 | col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype |
| 514 | for key, data in self.table.scan(columns=[col_feat, col_tag]): | 547 | for key, data in self.table.scan(columns=[col_feat, col_tag]): |
| 515 | - X.append( | ||
| 516 | - [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist]) | 548 | + # X.append( |
| 549 | + # [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist]) | ||
| 550 | + X.append(np.array(json.loads(data[col_feat])).ravel().tolist()) | ||
| 517 | Y.append(int(data[col_tag])) | 551 | Y.append(int(data[col_tag])) |
| 518 | 552 | ||
| 519 | elif mode == "spark" or mode == "cluster": | 553 | elif mode == "spark" or mode == "cluster": |
| 520 | if self.sparker == None: | 554 | if self.sparker == None: |
| 521 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', | 555 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', |
| 522 | - master='spark://HPC-server:7077') | 556 | + master='spark://HPC-server:7077') |
| 523 | 557 | ||
| 524 | rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False) | 558 | rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False) |
| 525 | if not collect: | 559 | if not collect: |
test/test_model.py
| @@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S(): | @@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S(): | ||
| 148 | def test_THEANO_crop(): | 148 | def test_THEANO_crop(): |
| 149 | 149 | ||
| 150 | timer.mark() | 150 | timer.mark() |
| 151 | - dilc = ILSVRC.DataILSVRC(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil') | 151 | + dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil') |
| 152 | X, Y = dilc.load_data(mode='local', feattype='coef') | 152 | X, Y = dilc.load_data(mode='local', feattype='coef') |
| 153 | timer.report() | 153 | timer.report() |
| 154 | 154 |