Commit 5426d9db04ff39181aca5bd19a3c98cf35fab3a7 (1 parent: b69f7a48)
Exists in: refactor
almost finished.
Showing 3 changed files with 11 additions and 62 deletions
Show diff stats
mspark/SC.py
... | ... | @@ -4,6 +4,7 @@ __author__ = 'chunk' |
4 | 4 | from ..common import * |
5 | 5 | from .dependencies import * |
6 | 6 | from . import * |
7 | +import rdd | |
7 | 8 | from .rdd import * |
8 | 9 | |
9 | 10 | import sys |
... | ... | @@ -110,7 +111,7 @@ class Sparker(object): |
110 | 111 | rdd_data = data |
111 | 112 | |
112 | 113 | rdd_data.flatMap( |
113 | - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
114 | + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
114 | 115 | conf=hconf, |
115 | 116 | keyConverter=hparams["writeKeyConverter"], |
116 | 117 | valueConverter=hparams["writeValueConverter"]) | ... | ... |
test/test_data.py
... | ... | @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): |
47 | 47 | timer.report() |
48 | 48 | |
49 | 49 | |
50 | -def test_ILSVRC_S_LOCAL(): | |
50 | +def test_ILSVRC_S_LOCAL(category='Train_100'): | |
51 | 51 | timer = Timer() |
52 | 52 | |
53 | 53 | timer.mark() |
54 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
54 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) | |
55 | 55 | dil.delete_table() |
56 | 56 | dil.format() |
57 | 57 | dil.store_img() |
58 | 58 | timer.report() |
59 | 59 | |
60 | - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
60 | + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) | |
61 | 61 | |
62 | 62 | # dils._extract_data(mode='hbase', writeback=True) |
63 | 63 | # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) |
... | ... | @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): |
90 | 90 | timer.report() |
91 | 91 | |
92 | 92 | |
93 | -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): | |
93 | +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None): | |
94 | 94 | timer = Timer() |
95 | 95 | |
96 | 96 | timer.mark() |
... | ... | @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): |
112 | 112 | timer.report() |
113 | 113 | |
114 | 114 | |
115 | -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'): | |
115 | +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'): | |
116 | 116 | timer = Timer() |
117 | 117 | |
118 | 118 | # timer.mark() | ... | ... |
test/test_model.py
... | ... | @@ -3,9 +3,8 @@ __author__ = 'chunk' |
3 | 3 | from sklearn import cross_validation |
4 | 4 | from pyspark.mllib.regression import LabeledPoint |
5 | 5 | from ..common import * |
6 | -from ..mdata import CV, ILSVRC, ILSVRC_S | |
6 | +from ..mdata import ILSVRC, ILSVRC_S | |
7 | 7 | from ..mmodel.svm import SVM |
8 | -from ..mmodel.theano import THEANO | |
9 | 8 | |
10 | 9 | import gzip |
11 | 10 | import cPickle |
... | ... | @@ -15,36 +14,6 @@ timer = Timer() |
15 | 14 | package_dir = os.path.dirname(os.path.abspath(__file__)) |
16 | 15 | |
17 | 16 | |
18 | -def test_SVM_CV(): | |
19 | - timer.mark() | |
20 | - dcv = CV.DataCV() | |
21 | - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s | |
22 | - # X, Y = dcv.load_data(mode='hbase') # 21.682754s | |
23 | - # X, Y = dcv.load_data(mode='spark') # 29.549597s | |
24 | - timer.report() | |
25 | - | |
26 | - timer.mark() | |
27 | - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s | |
28 | - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s | |
29 | - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s | |
30 | - msvm = SVM.ModelSVM(toolset='spark') | |
31 | - | |
32 | - msvm.train(X, Y) | |
33 | - timer.report() | |
34 | - | |
35 | - timer.mark() | |
36 | - for path, subdirs, files in os.walk('data/467/'): | |
37 | - for name in files: | |
38 | - imgpath = os.path.join(path, name) | |
39 | - feat = dcv.get_feat(imgpath, 'hog') | |
40 | - print name, msvm.predict(feat) | |
41 | - timer.report() | |
42 | - | |
43 | - timer.mark() | |
44 | - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib | |
45 | - timer.report() # 27.421949s for svm_lib | |
46 | - | |
47 | - | |
48 | 17 | def test_SVM_ILSVRC(): |
49 | 18 | timer.mark() |
50 | 19 | dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') |
... | ... | @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): |
119 | 88 | # print scores |
120 | 89 | # timer.report() |
121 | 90 | |
91 | + | |
122 | 92 | def test_SVM_ILSVRC_TEST(): |
123 | 93 | timer.mark() |
124 | 94 | |
... | ... | @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): |
131 | 101 | timer.report() |
132 | 102 | |
133 | 103 | timer.mark() |
134 | - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401) | |
104 | + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401) | |
135 | 105 | timer.report() # |
136 | 106 | # timer.mark() |
137 | 107 | # print 'or like this:' |
... | ... | @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): |
145 | 115 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') |
146 | 116 | # rdd_dataset = dils.load_data(mode='spark') # pass |
147 | 117 | X, Y = dils.load_data(mode='hbase') # pass |
148 | - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1])) | |
118 | + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1])) | |
149 | 119 | timer.report() |
150 | 120 | |
151 | 121 | timer.mark() |
... | ... | @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): |
171 | 141 | # test_SVM_ILSVRC_SPARK() |
172 | 142 | |
173 | 143 | |
174 | -def test_THEANO_mnist(): | |
175 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | |
176 | - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500) | |
177 | - | |
178 | - | |
179 | -def test_THEANO_crop(): | |
180 | - timer.mark() | |
181 | - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil') | |
182 | - X, Y = dilc.load_data(mode='local', feattype='coef') | |
183 | - print X[0],Y | |
184 | - timer.report() | |
185 | - | |
186 | - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0) | |
187 | - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f: | |
188 | - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f) | |
189 | - | |
190 | - timer.mark() | |
191 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | |
192 | - mtheano._train_cnn(X, Y) | |
193 | - timer.report() | |
194 | - | |
195 | - | |
196 | 144 | if __name__ == '__main__': |
197 | 145 | # test_SVM_CV() |
198 | 146 | test_SVM_ILSVRC() | ... | ... |