Commit 5426d9db04ff39181aca5bd19a3c98cf35fab3a7

Authored by Chunk
1 parent b69f7a48
Exists in refactor

almost finished.

mspark/SC.py
... ... @@ -4,6 +4,7 @@ __author__ = 'chunk'
4 4 from ..common import *
5 5 from .dependencies import *
6 6 from . import *
  7 +import rdd
7 8 from .rdd import *
8 9  
9 10 import sys
... ... @@ -110,7 +111,7 @@ class Sparker(object):
110 111 rdd_data = data
111 112  
112 113 rdd_data.flatMap(
113   - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
  114 + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
114 115 conf=hconf,
115 116 keyConverter=hparams["writeKeyConverter"],
116 117 valueConverter=hparams["writeValueConverter"])
... ...
test/test_data.py
... ... @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'):
47 47 timer.report()
48 48  
49 49  
50   -def test_ILSVRC_S_LOCAL():
  50 +def test_ILSVRC_S_LOCAL(category='Train_100'):
51 51 timer = Timer()
52 52  
53 53 timer.mark()
54   - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
  54 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
55 55 dil.delete_table()
56 56 dil.format()
57 57 dil.store_img()
58 58 timer.report()
59 59  
60   - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
  60 + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
61 61  
62 62 # dils._extract_data(mode='hbase', writeback=True)
63 63 # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True)
... ... @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'):
90 90 timer.report()
91 91  
92 92  
93   -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None):
  93 +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None):
94 94 timer = Timer()
95 95  
96 96 timer.mark()
... ... @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None):
112 112 timer.report()
113 113  
114 114  
115   -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'):
  115 +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'):
116 116 timer = Timer()
117 117  
118 118 # timer.mark()
... ...
test/test_model.py
... ... @@ -3,9 +3,8 @@ __author__ = 'chunk'
3 3 from sklearn import cross_validation
4 4 from pyspark.mllib.regression import LabeledPoint
5 5 from ..common import *
6   -from ..mdata import CV, ILSVRC, ILSVRC_S
  6 +from ..mdata import ILSVRC, ILSVRC_S
7 7 from ..mmodel.svm import SVM
8   -from ..mmodel.theano import THEANO
9 8  
10 9 import gzip
11 10 import cPickle
... ... @@ -15,36 +14,6 @@ timer = Timer()
15 14 package_dir = os.path.dirname(os.path.abspath(__file__))
16 15  
17 16  
18   -def test_SVM_CV():
19   - timer.mark()
20   - dcv = CV.DataCV()
21   - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s
22   - # X, Y = dcv.load_data(mode='hbase') # 21.682754s
23   - # X, Y = dcv.load_data(mode='spark') # 29.549597s
24   - timer.report()
25   -
26   - timer.mark()
27   - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s
28   - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s
29   - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s
30   - msvm = SVM.ModelSVM(toolset='spark')
31   -
32   - msvm.train(X, Y)
33   - timer.report()
34   -
35   - timer.mark()
36   - for path, subdirs, files in os.walk('data/467/'):
37   - for name in files:
38   - imgpath = os.path.join(path, name)
39   - feat = dcv.get_feat(imgpath, 'hog')
40   - print name, msvm.predict(feat)
41   - timer.report()
42   -
43   - timer.mark()
44   - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib
45   - timer.report() # 27.421949s for svm_lib
46   -
47   -
48 17 def test_SVM_ILSVRC():
49 18 timer.mark()
50 19 dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig')
... ... @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE():
119 88 # print scores
120 89 # timer.report()
121 90  
  91 +
122 92 def test_SVM_ILSVRC_TEST():
123 93 timer.mark()
124 94  
... ... @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST():
131 101 timer.report()
132 102  
133 103 timer.mark()
134   - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401)
  104 + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401)
135 105 timer.report() #
136 106 # timer.mark()
137 107 # print 'or like this:'
... ... @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK():
145 115 dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000')
146 116 # rdd_dataset = dils.load_data(mode='spark') # pass
147 117 X, Y = dils.load_data(mode='hbase') # pass
148   - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1]))
  118 + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1]))
149 119 timer.report()
150 120  
151 121 timer.mark()
... ... @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S():
171 141 # test_SVM_ILSVRC_SPARK()
172 142  
173 143  
174   -def test_THEANO_mnist():
175   - mtheano = THEANO.ModelTHEANO(toolset='cnn')
176   - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500)
177   -
178   -
179   -def test_THEANO_crop():
180   - timer.mark()
181   - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')
182   - X, Y = dilc.load_data(mode='local', feattype='coef')
183   - print X[0],Y
184   - timer.report()
185   -
186   - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)
187   - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f:
188   - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f)
189   -
190   - timer.mark()
191   - mtheano = THEANO.ModelTHEANO(toolset='cnn')
192   - mtheano._train_cnn(X, Y)
193   - timer.report()
194   -
195   -
196 144 if __name__ == '__main__':
197 145 # test_SVM_CV()
198 146 test_SVM_ILSVRC()
... ...