Commit 5426d9db04ff39181aca5bd19a3c98cf35fab3a7 (1 parent: b69f7a48; exists in branch: refactor)
almost finished.
Showing 3 changed files with 11 additions and 62 deletions.
Show diff stats
mspark/SC.py
| ... | ... | @@ -4,6 +4,7 @@ __author__ = 'chunk' |
| 4 | 4 | from ..common import * |
| 5 | 5 | from .dependencies import * |
| 6 | 6 | from . import * |
| 7 | +import rdd | |
| 7 | 8 | from .rdd import * |
| 8 | 9 | |
| 9 | 10 | import sys |
| ... | ... | @@ -110,7 +111,7 @@ class Sparker(object): |
| 110 | 111 | rdd_data = data |
| 111 | 112 | |
| 112 | 113 | rdd_data.flatMap( |
| 113 | - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
| 114 | + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
| 114 | 115 | conf=hconf, |
| 115 | 116 | keyConverter=hparams["writeKeyConverter"], |
| 116 | 117 | valueConverter=hparams["writeValueConverter"]) | ... | ... |
test/test_data.py
| ... | ... | @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): |
| 47 | 47 | timer.report() |
| 48 | 48 | |
| 49 | 49 | |
| 50 | -def test_ILSVRC_S_LOCAL(): | |
| 50 | +def test_ILSVRC_S_LOCAL(category='Train_100'): | |
| 51 | 51 | timer = Timer() |
| 52 | 52 | |
| 53 | 53 | timer.mark() |
| 54 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
| 54 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) | |
| 55 | 55 | dil.delete_table() |
| 56 | 56 | dil.format() |
| 57 | 57 | dil.store_img() |
| 58 | 58 | timer.report() |
| 59 | 59 | |
| 60 | - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
| 60 | + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) | |
| 61 | 61 | |
| 62 | 62 | # dils._extract_data(mode='hbase', writeback=True) |
| 63 | 63 | # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) |
| ... | ... | @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): |
| 90 | 90 | timer.report() |
| 91 | 91 | |
| 92 | 92 | |
| 93 | -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): | |
| 93 | +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None): | |
| 94 | 94 | timer = Timer() |
| 95 | 95 | |
| 96 | 96 | timer.mark() |
| ... | ... | @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): |
| 112 | 112 | timer.report() |
| 113 | 113 | |
| 114 | 114 | |
| 115 | -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'): | |
| 115 | +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'): | |
| 116 | 116 | timer = Timer() |
| 117 | 117 | |
| 118 | 118 | # timer.mark() | ... | ... |
test/test_model.py
| ... | ... | @@ -3,9 +3,8 @@ __author__ = 'chunk' |
| 3 | 3 | from sklearn import cross_validation |
| 4 | 4 | from pyspark.mllib.regression import LabeledPoint |
| 5 | 5 | from ..common import * |
| 6 | -from ..mdata import CV, ILSVRC, ILSVRC_S | |
| 6 | +from ..mdata import ILSVRC, ILSVRC_S | |
| 7 | 7 | from ..mmodel.svm import SVM |
| 8 | -from ..mmodel.theano import THEANO | |
| 9 | 8 | |
| 10 | 9 | import gzip |
| 11 | 10 | import cPickle |
| ... | ... | @@ -15,36 +14,6 @@ timer = Timer() |
| 15 | 14 | package_dir = os.path.dirname(os.path.abspath(__file__)) |
| 16 | 15 | |
| 17 | 16 | |
| 18 | -def test_SVM_CV(): | |
| 19 | - timer.mark() | |
| 20 | - dcv = CV.DataCV() | |
| 21 | - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s | |
| 22 | - # X, Y = dcv.load_data(mode='hbase') # 21.682754s | |
| 23 | - # X, Y = dcv.load_data(mode='spark') # 29.549597s | |
| 24 | - timer.report() | |
| 25 | - | |
| 26 | - timer.mark() | |
| 27 | - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s | |
| 28 | - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s | |
| 29 | - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s | |
| 30 | - msvm = SVM.ModelSVM(toolset='spark') | |
| 31 | - | |
| 32 | - msvm.train(X, Y) | |
| 33 | - timer.report() | |
| 34 | - | |
| 35 | - timer.mark() | |
| 36 | - for path, subdirs, files in os.walk('data/467/'): | |
| 37 | - for name in files: | |
| 38 | - imgpath = os.path.join(path, name) | |
| 39 | - feat = dcv.get_feat(imgpath, 'hog') | |
| 40 | - print name, msvm.predict(feat) | |
| 41 | - timer.report() | |
| 42 | - | |
| 43 | - timer.mark() | |
| 44 | - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib | |
| 45 | - timer.report() # 27.421949s for svm_lib | |
| 46 | - | |
| 47 | - | |
| 48 | 17 | def test_SVM_ILSVRC(): |
| 49 | 18 | timer.mark() |
| 50 | 19 | dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') |
| ... | ... | @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): |
| 119 | 88 | # print scores |
| 120 | 89 | # timer.report() |
| 121 | 90 | |
| 91 | + | |
| 122 | 92 | def test_SVM_ILSVRC_TEST(): |
| 123 | 93 | timer.mark() |
| 124 | 94 | |
| ... | ... | @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): |
| 131 | 101 | timer.report() |
| 132 | 102 | |
| 133 | 103 | timer.mark() |
| 134 | - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401) | |
| 104 | + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401) | |
| 135 | 105 | timer.report() # |
| 136 | 106 | # timer.mark() |
| 137 | 107 | # print 'or like this:' |
| ... | ... | @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): |
| 145 | 115 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') |
| 146 | 116 | # rdd_dataset = dils.load_data(mode='spark') # pass |
| 147 | 117 | X, Y = dils.load_data(mode='hbase') # pass |
| 148 | - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1])) | |
| 118 | + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1])) | |
| 149 | 119 | timer.report() |
| 150 | 120 | |
| 151 | 121 | timer.mark() |
| ... | ... | @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): |
| 171 | 141 | # test_SVM_ILSVRC_SPARK() |
| 172 | 142 | |
| 173 | 143 | |
| 174 | -def test_THEANO_mnist(): | |
| 175 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | |
| 176 | - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500) | |
| 177 | - | |
| 178 | - | |
| 179 | -def test_THEANO_crop(): | |
| 180 | - timer.mark() | |
| 181 | - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil') | |
| 182 | - X, Y = dilc.load_data(mode='local', feattype='coef') | |
| 183 | - print X[0],Y | |
| 184 | - timer.report() | |
| 185 | - | |
| 186 | - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0) | |
| 187 | - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f: | |
| 188 | - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f) | |
| 189 | - | |
| 190 | - timer.mark() | |
| 191 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | |
| 192 | - mtheano._train_cnn(X, Y) | |
| 193 | - timer.report() | |
| 194 | - | |
| 195 | - | |
| 196 | 144 | if __name__ == '__main__': |
| 197 | 145 | # test_SVM_CV() |
| 198 | 146 | test_SVM_ILSVRC() | ... | ... |