Commit 5426d9db04ff39181aca5bd19a3c98cf35fab3a7

Authored by Chunk
1 parent b69f7a48
Exists in refactor

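Almost finished: call format_out through the rdd module in Sparker, parameterize the ILSVRC test categories (defaulting to 'Train_100'), and remove the CV and THEANO tests and imports.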

@@ -4,6 +4,7 @@ __author__ = 'chunk' @@ -4,6 +4,7 @@ __author__ = 'chunk'
4 from ..common import * 4 from ..common import *
5 from .dependencies import * 5 from .dependencies import *
6 from . import * 6 from . import *
  7 +import rdd
7 from .rdd import * 8 from .rdd import *
8 9
9 import sys 10 import sys
@@ -110,7 +111,7 @@ class Sparker(object): @@ -110,7 +111,7 @@ class Sparker(object):
110 rdd_data = data 111 rdd_data = data
111 112
112 rdd_data.flatMap( 113 rdd_data.flatMap(
113 - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( 114 + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
114 conf=hconf, 115 conf=hconf,
115 keyConverter=hparams["writeKeyConverter"], 116 keyConverter=hparams["writeKeyConverter"],
116 valueConverter=hparams["writeValueConverter"]) 117 valueConverter=hparams["writeValueConverter"])
test/test_data.py
@@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'):
47 timer.report() 47 timer.report()
48 48
49 49
50 -def test_ILSVRC_S_LOCAL(): 50 +def test_ILSVRC_S_LOCAL(category='Train_100'):
51 timer = Timer() 51 timer = Timer()
52 52
53 timer.mark() 53 timer.mark()
54 - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') 54 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
55 dil.delete_table() 55 dil.delete_table()
56 dil.format() 56 dil.format()
57 dil.store_img() 57 dil.store_img()
58 timer.report() 58 timer.report()
59 59
60 - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') 60 + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category)
61 61
62 # dils._extract_data(mode='hbase', writeback=True) 62 # dils._extract_data(mode='hbase', writeback=True)
63 # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) 63 # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True)
@@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'):
90 timer.report() 90 timer.report()
91 91
92 92
93 -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): 93 +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None):
94 timer = Timer() 94 timer = Timer()
95 95
96 timer.mark() 96 timer.mark()
@@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None):
112 timer.report() 112 timer.report()
113 113
114 114
115 -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'): 115 +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'):
116 timer = Timer() 116 timer = Timer()
117 117
118 # timer.mark() 118 # timer.mark()
test/test_model.py
@@ -3,9 +3,8 @@ __author__ = 'chunk' @@ -3,9 +3,8 @@ __author__ = 'chunk'
3 from sklearn import cross_validation 3 from sklearn import cross_validation
4 from pyspark.mllib.regression import LabeledPoint 4 from pyspark.mllib.regression import LabeledPoint
5 from ..common import * 5 from ..common import *
6 -from ..mdata import CV, ILSVRC, ILSVRC_S 6 +from ..mdata import ILSVRC, ILSVRC_S
7 from ..mmodel.svm import SVM 7 from ..mmodel.svm import SVM
8 -from ..mmodel.theano import THEANO  
9 8
10 import gzip 9 import gzip
11 import cPickle 10 import cPickle
@@ -15,36 +14,6 @@ timer = Timer() @@ -15,36 +14,6 @@ timer = Timer()
15 package_dir = os.path.dirname(os.path.abspath(__file__)) 14 package_dir = os.path.dirname(os.path.abspath(__file__))
16 15
17 16
18 -def test_SVM_CV():  
19 - timer.mark()  
20 - dcv = CV.DataCV()  
21 - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s  
22 - # X, Y = dcv.load_data(mode='hbase') # 21.682754s  
23 - # X, Y = dcv.load_data(mode='spark') # 29.549597s  
24 - timer.report()  
25 -  
26 - timer.mark()  
27 - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s  
28 - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s  
29 - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s  
30 - msvm = SVM.ModelSVM(toolset='spark')  
31 -  
32 - msvm.train(X, Y)  
33 - timer.report()  
34 -  
35 - timer.mark()  
36 - for path, subdirs, files in os.walk('data/467/'):  
37 - for name in files:  
38 - imgpath = os.path.join(path, name)  
39 - feat = dcv.get_feat(imgpath, 'hog')  
40 - print name, msvm.predict(feat)  
41 - timer.report()  
42 -  
43 - timer.mark()  
44 - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib  
45 - timer.report() # 27.421949s for svm_lib  
46 -  
47 -  
48 def test_SVM_ILSVRC(): 17 def test_SVM_ILSVRC():
49 timer.mark() 18 timer.mark()
50 dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') 19 dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig')
@@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE():
119 # print scores 88 # print scores
120 # timer.report() 89 # timer.report()
121 90
  91 +
122 def test_SVM_ILSVRC_TEST(): 92 def test_SVM_ILSVRC_TEST():
123 timer.mark() 93 timer.mark()
124 94
@@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST():
131 timer.report() 101 timer.report()
132 102
133 timer.mark() 103 timer.mark()
134 - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401) 104 + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401)
135 timer.report() # 105 timer.report() #
136 # timer.mark() 106 # timer.mark()
137 # print 'or like this:' 107 # print 'or like this:'
@@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK():
145 dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') 115 dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000')
146 # rdd_dataset = dils.load_data(mode='spark') # pass 116 # rdd_dataset = dils.load_data(mode='spark') # pass
147 X, Y = dils.load_data(mode='hbase') # pass 117 X, Y = dils.load_data(mode='hbase') # pass
148 - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1])) 118 + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1]))
149 timer.report() 119 timer.report()
150 120
151 timer.mark() 121 timer.mark()
@@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S():
171 # test_SVM_ILSVRC_SPARK() 141 # test_SVM_ILSVRC_SPARK()
172 142
173 143
174 -def test_THEANO_mnist():  
175 - mtheano = THEANO.ModelTHEANO(toolset='cnn')  
176 - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500)  
177 -  
178 -  
179 -def test_THEANO_crop():  
180 - timer.mark()  
181 - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')  
182 - X, Y = dilc.load_data(mode='local', feattype='coef')  
183 - print X[0],Y  
184 - timer.report()  
185 -  
186 - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)  
187 - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f:  
188 - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f)  
189 -  
190 - timer.mark()  
191 - mtheano = THEANO.ModelTHEANO(toolset='cnn')  
192 - mtheano._train_cnn(X, Y)  
193 - timer.report()  
194 -  
195 -  
196 if __name__ == '__main__': 144 if __name__ == '__main__':
197 # test_SVM_CV() 145 # test_SVM_CV()
198 test_SVM_ILSVRC() 146 test_SVM_ILSVRC()