From 0bd44a28a8510a0496ca6271fe2a362dd152c584 Mon Sep 17 00:00:00 2001 From: Chunk Date: Mon, 20 Apr 2015 14:52:14 +0800 Subject: [PATCH] staged. --- mdata/ILSVRC_S.py | 56 +++++++++++++++++++++++++++++++++++++++++++++----------- test/test_model.py | 2 +- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/mdata/ILSVRC_S.py b/mdata/ILSVRC_S.py index ac3c72e..77da0da 100644 --- a/mdata/ILSVRC_S.py +++ b/mdata/ILSVRC_S.py @@ -244,7 +244,7 @@ class DataILSVRC_S(DataDumperBase): elif mode == 'spark': if self.sparker == None: self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', - master='spark://HPC-server:7077') + master='spark://HPC-server:7077') cols = [ 'cf_pic:data', @@ -267,14 +267,14 @@ class DataILSVRC_S(DataDumperBase): self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, - collect=False).mapValues( + collect=False).mapValues( lambda data: [data] + SC.rddinfo_ILS(data)) if not writeback: return self.rdd_data else: self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, - withdata=withdata) + withdata=withdata) else: raise Exception("Unknown mode!") @@ -367,7 +367,7 @@ class DataILSVRC_S(DataDumperBase): elif mode == 'spark': if self.sparker == None: self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', - master='spark://HPC-server:7077') + master='spark://HPC-server:7077') cols = [ 'cf_pic:data', @@ -392,7 +392,7 @@ class DataILSVRC_S(DataDumperBase): return self.rdd_data else: self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, - withdata=withdata) + withdata=withdata) else: raise Exception("Unknown mode!") @@ -461,7 +461,7 @@ class DataILSVRC_S(DataDumperBase): elif mode == 'spark': if self.sparker == None: self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', - master='spark://HPC-server:7077') + master='spark://HPC-server:7077') cols = [ 'cf_pic:data', @@ -488,7 +488,7 @@ class DataILSVRC_S(DataDumperBase): return self.rdd_data else: self.sparker.write_hbase(self.table_name, self.rdd_data, fromrdd=True, columns=cols, - withdata=withdata) + withdata=withdata) else: @@ -505,21 +505,55 @@ class DataILSVRC_S(DataDumperBase): INDEX = [] X = [] Y = [] + if mode == "local": + dict_dataset = {} + + if feattype == 'coef': # raw + with open(self.list_file, 'rb') as tsvfile: + tsvfile = csv.reader(tsvfile, delimiter='\t') + for line in tsvfile: + hash = line[0] + tag = line[-1] + image = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg') + if image: + im = Jpeg(image, key=sample_key) + dict_dataset[hash] = (tag, im.getCoefMatrix(channel='Y')) + + for tag, feat in dict_dataset.values(): + X.append(feat.tolist()) + Y.append(int(tag)) - if mode == "hbase": + else: + with open(self.list_file, 'rb') as tsvfile: + tsvfile = csv.reader(tsvfile, delimiter='\t') + for line in tsvfile: + hash = line[0] + tag = line[-1] + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) + if path_feat: + with open(path_feat, 'rb') as featfile: + dict_dataset[hash] = (tag, json.loads(featfile.read())) + + for tag, feat in dict_dataset.values(): + # X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) + X.append(np.array(feat).ravel().tolist()) + Y.append(int(tag)) + + elif mode == "hbase": if self.table == None: self.table = self.get_table() col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype for key, data in self.table.scan(columns=[col_feat, col_tag]): - X.append( - [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist]) + # X.append( + # [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist]) + X.append(np.array(json.loads(data[col_feat])).ravel().tolist()) Y.append(int(data[col_tag])) elif mode == "spark" or mode == "cluster": if self.sparker == None: self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', - master='spark://HPC-server:7077') + master='spark://HPC-server:7077') rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False) if not collect: diff --git a/test/test_model.py b/test/test_model.py index e1c1199..2152a52 100755 --- a/test/test_model.py +++ b/test/test_model.py @@ -148,7 +148,7 @@ def test_SVM_ILSVRC_S(): def test_THEANO_crop(): timer.mark() - dilc = ILSVRC.DataILSVRC(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil') + dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil') X, Y = dilc.load_data(mode='local', feattype='coef') timer.report() -- libgit2 0.21.2