Commit 5426d9db04ff39181aca5bd19a3c98cf35fab3a7
1 parent: b69f7a48
Exists in: refactor
almost finished.
Showing 3 changed files with 11 additions and 62 deletions
Show diff stats
mspark/SC.py
| @@ -4,6 +4,7 @@ __author__ = 'chunk' | @@ -4,6 +4,7 @@ __author__ = 'chunk' | ||
| 4 | from ..common import * | 4 | from ..common import * |
| 5 | from .dependencies import * | 5 | from .dependencies import * |
| 6 | from . import * | 6 | from . import * |
| 7 | +import rdd | ||
| 7 | from .rdd import * | 8 | from .rdd import * |
| 8 | 9 | ||
| 9 | import sys | 10 | import sys |
| @@ -110,7 +111,7 @@ class Sparker(object): | @@ -110,7 +111,7 @@ class Sparker(object): | ||
| 110 | rdd_data = data | 111 | rdd_data = data |
| 111 | 112 | ||
| 112 | rdd_data.flatMap( | 113 | rdd_data.flatMap( |
| 113 | - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | 114 | + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( |
| 114 | conf=hconf, | 115 | conf=hconf, |
| 115 | keyConverter=hparams["writeKeyConverter"], | 116 | keyConverter=hparams["writeKeyConverter"], |
| 116 | valueConverter=hparams["writeValueConverter"]) | 117 | valueConverter=hparams["writeValueConverter"]) |
test/test_data.py
| @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): | @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): | ||
| 47 | timer.report() | 47 | timer.report() |
| 48 | 48 | ||
| 49 | 49 | ||
| 50 | -def test_ILSVRC_S_LOCAL(): | 50 | +def test_ILSVRC_S_LOCAL(category='Train_100'): |
| 51 | timer = Timer() | 51 | timer = Timer() |
| 52 | 52 | ||
| 53 | timer.mark() | 53 | timer.mark() |
| 54 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | 54 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) |
| 55 | dil.delete_table() | 55 | dil.delete_table() |
| 56 | dil.format() | 56 | dil.format() |
| 57 | dil.store_img() | 57 | dil.store_img() |
| 58 | timer.report() | 58 | timer.report() |
| 59 | 59 | ||
| 60 | - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | 60 | + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) |
| 61 | 61 | ||
| 62 | # dils._extract_data(mode='hbase', writeback=True) | 62 | # dils._extract_data(mode='hbase', writeback=True) |
| 63 | # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) | 63 | # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) |
| @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): | @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): | ||
| 90 | timer.report() | 90 | timer.report() |
| 91 | 91 | ||
| 92 | 92 | ||
| 93 | -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): | 93 | +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None): |
| 94 | timer = Timer() | 94 | timer = Timer() |
| 95 | 95 | ||
| 96 | timer.mark() | 96 | timer.mark() |
| @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): | @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): | ||
| 112 | timer.report() | 112 | timer.report() |
| 113 | 113 | ||
| 114 | 114 | ||
| 115 | -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'): | 115 | +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'): |
| 116 | timer = Timer() | 116 | timer = Timer() |
| 117 | 117 | ||
| 118 | # timer.mark() | 118 | # timer.mark() |
test/test_model.py
| @@ -3,9 +3,8 @@ __author__ = 'chunk' | @@ -3,9 +3,8 @@ __author__ = 'chunk' | ||
| 3 | from sklearn import cross_validation | 3 | from sklearn import cross_validation |
| 4 | from pyspark.mllib.regression import LabeledPoint | 4 | from pyspark.mllib.regression import LabeledPoint |
| 5 | from ..common import * | 5 | from ..common import * |
| 6 | -from ..mdata import CV, ILSVRC, ILSVRC_S | 6 | +from ..mdata import ILSVRC, ILSVRC_S |
| 7 | from ..mmodel.svm import SVM | 7 | from ..mmodel.svm import SVM |
| 8 | -from ..mmodel.theano import THEANO | ||
| 9 | 8 | ||
| 10 | import gzip | 9 | import gzip |
| 11 | import cPickle | 10 | import cPickle |
| @@ -15,36 +14,6 @@ timer = Timer() | @@ -15,36 +14,6 @@ timer = Timer() | ||
| 15 | package_dir = os.path.dirname(os.path.abspath(__file__)) | 14 | package_dir = os.path.dirname(os.path.abspath(__file__)) |
| 16 | 15 | ||
| 17 | 16 | ||
| 18 | -def test_SVM_CV(): | ||
| 19 | - timer.mark() | ||
| 20 | - dcv = CV.DataCV() | ||
| 21 | - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s | ||
| 22 | - # X, Y = dcv.load_data(mode='hbase') # 21.682754s | ||
| 23 | - # X, Y = dcv.load_data(mode='spark') # 29.549597s | ||
| 24 | - timer.report() | ||
| 25 | - | ||
| 26 | - timer.mark() | ||
| 27 | - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s | ||
| 28 | - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s | ||
| 29 | - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s | ||
| 30 | - msvm = SVM.ModelSVM(toolset='spark') | ||
| 31 | - | ||
| 32 | - msvm.train(X, Y) | ||
| 33 | - timer.report() | ||
| 34 | - | ||
| 35 | - timer.mark() | ||
| 36 | - for path, subdirs, files in os.walk('data/467/'): | ||
| 37 | - for name in files: | ||
| 38 | - imgpath = os.path.join(path, name) | ||
| 39 | - feat = dcv.get_feat(imgpath, 'hog') | ||
| 40 | - print name, msvm.predict(feat) | ||
| 41 | - timer.report() | ||
| 42 | - | ||
| 43 | - timer.mark() | ||
| 44 | - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib | ||
| 45 | - timer.report() # 27.421949s for svm_lib | ||
| 46 | - | ||
| 47 | - | ||
| 48 | def test_SVM_ILSVRC(): | 17 | def test_SVM_ILSVRC(): |
| 49 | timer.mark() | 18 | timer.mark() |
| 50 | dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') | 19 | dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') |
| @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): | @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): | ||
| 119 | # print scores | 88 | # print scores |
| 120 | # timer.report() | 89 | # timer.report() |
| 121 | 90 | ||
| 91 | + | ||
| 122 | def test_SVM_ILSVRC_TEST(): | 92 | def test_SVM_ILSVRC_TEST(): |
| 123 | timer.mark() | 93 | timer.mark() |
| 124 | 94 | ||
| @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): | @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): | ||
| 131 | timer.report() | 101 | timer.report() |
| 132 | 102 | ||
| 133 | timer.mark() | 103 | timer.mark() |
| 134 | - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401) | 104 | + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401) |
| 135 | timer.report() # | 105 | timer.report() # |
| 136 | # timer.mark() | 106 | # timer.mark() |
| 137 | # print 'or like this:' | 107 | # print 'or like this:' |
| @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): | @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): | ||
| 145 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') | 115 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') |
| 146 | # rdd_dataset = dils.load_data(mode='spark') # pass | 116 | # rdd_dataset = dils.load_data(mode='spark') # pass |
| 147 | X, Y = dils.load_data(mode='hbase') # pass | 117 | X, Y = dils.load_data(mode='hbase') # pass |
| 148 | - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1])) | 118 | + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1])) |
| 149 | timer.report() | 119 | timer.report() |
| 150 | 120 | ||
| 151 | timer.mark() | 121 | timer.mark() |
| @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): | @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): | ||
| 171 | # test_SVM_ILSVRC_SPARK() | 141 | # test_SVM_ILSVRC_SPARK() |
| 172 | 142 | ||
| 173 | 143 | ||
| 174 | -def test_THEANO_mnist(): | ||
| 175 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | ||
| 176 | - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500) | ||
| 177 | - | ||
| 178 | - | ||
| 179 | -def test_THEANO_crop(): | ||
| 180 | - timer.mark() | ||
| 181 | - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil') | ||
| 182 | - X, Y = dilc.load_data(mode='local', feattype='coef') | ||
| 183 | - print X[0],Y | ||
| 184 | - timer.report() | ||
| 185 | - | ||
| 186 | - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0) | ||
| 187 | - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f: | ||
| 188 | - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f) | ||
| 189 | - | ||
| 190 | - timer.mark() | ||
| 191 | - mtheano = THEANO.ModelTHEANO(toolset='cnn') | ||
| 192 | - mtheano._train_cnn(X, Y) | ||
| 193 | - timer.report() | ||
| 194 | - | ||
| 195 | - | ||
| 196 | if __name__ == '__main__': | 144 | if __name__ == '__main__': |
| 197 | # test_SVM_CV() | 145 | # test_SVM_CV() |
| 198 | test_SVM_ILSVRC() | 146 | test_SVM_ILSVRC() |