Commit 54e2adda230421077d556c3d775d9f399b82652e
1 parent d642d837
Exists in master and in 1 other branch: staged
Showing 3 changed files with 179 additions and 58 deletions
mspark/SC.py
@@ -247,6 +247,77 @@ def format_out(row, cols, withdata=False):
         puts.append((key, [key] + col + [str(data)]))
     return puts

+# scconf = SparkConf()
+# scconf.setSparkHome("HPC-server") \
+#     .setMaster("spark://HPC-server:7077") \
+#     .setAppName("example")
+# sc = SparkContext(conf=scconf)
+#
+#
+# def read_hbase(table_name, func=None, collect=False):
+#     """
+#     ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data
+#
+#     Filter format:
+#         columns=['cf1:col1', 'cf1:col2']
+#         or
+#         columns=['cf1']
+#
+#     """
+#
+#     hconf = {
+#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
+#         # "hbase.zookeeper.quorum": self.host,
+#         "hbase.mapreduce.inputtable": table_name,
+#     }
+#
+#     hbase_rdd = sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
+#                                    keyClass=hparams["readKeyClass"],
+#                                    valueClass=hparams["readValueClass"],
+#                                    keyConverter=hparams["readKeyConverter"],
+#                                    valueConverter=hparams["readValueConverter"],
+#                                    conf=hconf)
+#
+#     parser = func if func != None else rddparse_data_CV
+#     hbase_rdd = hbase_rdd.map(lambda x: parser(x))
+#
+#     if collect:
+#         return hbase_rdd.collect()
+#     else:
+#         return hbase_rdd
+#
+#
+# def write_hbase(table_name, data, fromrdd=False, columns=None, withdata=False):
+#     """
+#     Data Format: (Deprecated)
+#         e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
+#
+#     Data(from dictionary):
+#         e.g. data = {'row1': [1, 3400, 'hello'], 'row2': [34, 5000, 'here in mine']},
+#              cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
+#     Data(from Rdd):
+#         e.g. data = [('row1', [1, 3400, 'hello']), ('row2', [34, 5000, 'here in mine'])],
+#              cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
+#     """
+#     hconf = {
+#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
+#         "hbase.mapreduce.inputtable": table_name,
+#         "hbase.mapred.outputtable": table_name,
+#         "mapreduce.outputformat.class": hparams["outputFormatClass"],
+#         "mapreduce.job.output.key.class": hparams["writeKeyClass"],
+#         "mapreduce.job.output.value.class": hparams["writeValueClass"],
+#     }
+#     cols = [col.split(':') for col in columns]
+#     if not fromrdd:
+#         rdd_data = sc.parallelize(data)
+#     else:
+#         rdd_data = data
+#
+#     rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
+#         conf=hconf,
+#         keyConverter=hparams["writeKeyConverter"],
+#         valueConverter=hparams["writeValueConverter"])
+

 class Sparker(object):
     def __init__(self, host='HPC-server', appname='NewPySparkApp', **kwargs):
@@ -283,9 +354,9 @@ class Sparker(object):
         """

         hconf = {
-            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
-            # "hbase.zookeeper.quorum": self.host,
-            "hbase.mapreduce.inputtable": table_name,
+            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
+            # "hbase.zookeeper.quorum": self.host,
+            "hbase.mapreduce.inputtable": table_name,
         }

         hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
@@ -316,12 +387,12 @@ class Sparker(object):
             cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
         """
         hconf = {
-            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
-            "hbase.mapreduce.inputtable": table_name,
-            "hbase.mapred.outputtable": table_name,
-            "mapreduce.outputformat.class": hparams["outputFormatClass"],
-            "mapreduce.job.output.key.class": hparams["writeKeyClass"],
-            "mapreduce.job.output.value.class": hparams["writeValueClass"],
+            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
+            "hbase.mapreduce.inputtable": table_name,
+            "hbase.mapred.outputtable": table_name,
+            "mapreduce.outputformat.class": hparams["outputFormatClass"],
+            "mapreduce.job.output.key.class": hparams["writeKeyClass"],
+            "mapreduce.job.output.value.class": hparams["writeValueClass"],
         }
         cols = [col.split(':') for col in columns]
         if not fromrdd:
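
The new module-level read_hbase/write_hbase helpers (kept commented out in this commit) mirror the Sparker methods but reference the module globals sc and hparams, presumably so the functions can be shipped to executors without pickling a Sparker instance. hparams itself is not part of this diff; a plausible sketch, assuming the converter classes from the stock Spark HBase examples (hbase_inputformat.py / hbase_outputformat.py), would be:

# Hypothetical hparams dict -- not part of this commit. The class names below
# are the converters shipped with Spark's bundled HBase examples; the actual
# SC.py may point at project-specific converters instead.
hparams = {
    "inputFormatClass": "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "readKeyClass": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "readValueClass": "org.apache.hadoop.hbase.client.Result",
    "readKeyConverter": "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    "readValueConverter": "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    "outputFormatClass": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "writeKeyClass": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "writeValueClass": "org.apache.hadoop.io.Writable",
    "writeKeyConverter": "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
    "writeValueConverter": "org.apache.spark.examples.pythonconverters.StringListToPutConverter",
}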
test/test_data.py
@@ -93,13 +93,13 @@ def test_ILSVRC_S_LOCAL():
 def test_ILSVRC_S_SPARK():
     timer = Timer()

-    # timer.mark()
-    # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
-    # dil.delete_table()
-    # dil.format()
-    # dil.store_img()
-    # timer.report()
-    # return
+    timer.mark()
+    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
+    dil.delete_table()
+    dil.format()
+    dil.store_img()
+    timer.report()
+    return

     dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Test_1')

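
Note that with this block re-enabled, test_ILSVRC_S_SPARK() rebuilds the HBase table, stores the images, and then hits the restored return, so the Spark half of the test below it is skipped until the block is commented out again. A minimal sketch of an alternative, using a hypothetical REBUILD_ILSVRC_TABLE environment variable so the rebuild can be toggled without editing the test:

import os

def test_ILSVRC_S_SPARK():
    timer = Timer()

    # Rebuild the source table only when explicitly requested
    # (hypothetical toggle; not part of this commit).
    if os.environ.get('REBUILD_ILSVRC_TABLE'):
        timer.mark()
        dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
        dil.delete_table()
        dil.format()
        dil.store_img()
        timer.report()
        return

    dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Test_1')
    # ... rest of the test unchanged ...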
test/test_whole.py
@@ -2,48 +2,98 @@ __author__ = 'chunk'

 from ..mspark import SC
 from pyspark.mllib.regression import LabeledPoint
-
-
-cols0 = [
-    'cf_pic:data',
-    'cf_info:width',
-    'cf_info:height',
-    'cf_info:size',
-    'cf_info:capacity',
-    'cf_info:quality',
-    'cf_info:rate',
-    'cf_tag:chosen',
-    'cf_tag:class'
-]
-cols1 = [
-    'cf_pic:data',
-    'cf_info:width',
-    'cf_info:height',
-    'cf_info:size',
-    'cf_info:capacity',
-    'cf_info:quality',
-    'cf_info:rate',
-    'cf_tag:chosen',
-    'cf_tag:class',
-    'cf_feat:bid',
-]
-
-sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
-
-rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
-    .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
-    .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
-    .mapValues(lambda items: SC.rddfeat_ILS(items))
-
-sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
-                    withdata=True)
-
-
-
-
-
-
-
+import happybase
+
+def test_whole():
+    cols0 = [
+        'cf_pic:data',
+        'cf_info:width',
+        'cf_info:height',
+        'cf_info:size',
+        'cf_info:capacity',
+        'cf_info:quality',
+        'cf_info:rate',
+        'cf_tag:chosen',
+        'cf_tag:class'
+    ]
+    cols1 = [
+        'cf_pic:data',
+        'cf_info:width',
+        'cf_info:height',
+        'cf_info:size',
+        'cf_info:capacity',
+        'cf_info:quality',
+        'cf_info:rate',
+        'cf_tag:chosen',
+        'cf_tag:class',
+        'cf_feat:bid',
+    ]
+
+    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
+
+    # rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
+    #     .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
+    #     .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
+    #     .mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False).mapValues(
+        lambda data: [data] + SC.rddinfo_ILS(data))
+    rdd_data_ext = rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=0.2)).filter(lambda x: x != None)
+
+    rdd_data = rdd_data.union(rdd_data_ext).mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    print len(rdd_data.collect())
+
+    # sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
+    #                     withdata=True)
+
+
+def test_whole_ext():
+    table_name = "ILSVRC2013_DET_val-Test_1"
+    connection = happybase.Connection('HPC-server')
+    tables = connection.tables()
+    if table_name not in tables:
+        families = {'cf_pic': dict(),
+                    'cf_info': dict(max_versions=10),
+                    'cf_tag': dict(),
+                    'cf_feat': dict(),
+                    }
+        connection.create_table(name=table_name, families=families)
+    table = connection.table(name=table_name)
+
+    cols = ['cf_pic:data']
+    list_data = []
+    for key, data in table.scan(columns=cols):
+        data = data['cf_pic:data']
+        list_data.append((key, data))
+
+    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
+    rdd_data = sparker.sc.parallelize(list_data, 20) \
+        .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
+        .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
+        .mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    rrr = rdd_data.collect()
+    print "-----------------", len(rrr), "===================="
+    print "+++++++++++++++++", rrr[0], "**********************"
+    # try:
+    #     with table.batch(batch_size=5000) as b:
+    #         for imgname, imginfo in rdd_data.collect().items():
+    #             b.put(imgname,
+    #                   {
+    #                       'cf_pic:data': imginfo[0],
+    #                       'cf_info:width': str(imginfo[1]),
+    #                       'cf_info:height': str(imginfo[2]),
+    #                       'cf_info:size': str(imginfo[3]),
+    #                       'cf_info:capacity': str(imginfo[4]),
+    #                       'cf_info:quality': str(imginfo[5]),
+    #                       'cf_info:rate': str(imginfo[6]),
+    #                       'cf_tag:chosen': str(imginfo[7]),
+    #                       'cf_tag:class': str(imginfo[8]),
+    #                       'cf_feat:' + feattype: imginfo[9],
+    #                   })
+    # except ValueError:
+    #     raise



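
One caveat on the commented-out batch write in test_whole_ext(): rdd_data.collect() returns a list of (rowkey, value) tuples, not a dict, so calling .items() on it would raise AttributeError if the block were re-enabled, and feattype is undefined in that scope. A corrected sketch under those assumptions, using 'bid' for feattype to match the cf_feat:bid column in cols1:

feattype = 'bid'  # assumption: matches the 'cf_feat:bid' column used above
try:
    with table.batch(batch_size=5000) as b:
        # collect() yields (rowkey, imginfo) tuples -- iterate directly,
        # without the .items() call.
        for imgname, imginfo in rdd_data.collect():
            b.put(imgname,
                  {
                      'cf_pic:data': imginfo[0],
                      'cf_info:width': str(imginfo[1]),
                      'cf_info:height': str(imginfo[2]),
                      'cf_info:size': str(imginfo[3]),
                      'cf_info:capacity': str(imginfo[4]),
                      'cf_info:quality': str(imginfo[5]),
                      'cf_info:rate': str(imginfo[6]),
                      'cf_tag:chosen': str(imginfo[7]),
                      'cf_tag:class': str(imginfo[8]),
                      'cf_feat:' + feattype: imginfo[9],
                  })
except ValueError:
    raise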