Commit d642d837fa065c133187e9c8f8dbc0283103bfe6
1 parent
489c5608
Exists in
master
and in
1 other branch
staged.
Showing
2 changed files
with
69 additions
and
13 deletions
Show diff stats
mspark/SC.py
| @@ -155,6 +155,7 @@ def rddembed_ILS(row, rate=None): | @@ -155,6 +155,7 @@ def rddembed_ILS(row, rate=None): | ||
| 155 | tmpf_src.close() | 155 | tmpf_src.close() |
| 156 | tmpf_dst.close() | 156 | tmpf_dst.close() |
| 157 | 157 | ||
| 158 | + | ||
| 158 | def rddembed_ILS_EXT(row, rate=None): | 159 | def rddembed_ILS_EXT(row, rate=None): |
| 159 | """ | 160 | """ |
| 160 | input: | 161 | input: |
| @@ -188,7 +189,7 @@ def rddembed_ILS_EXT(row, rate=None): | @@ -188,7 +189,7 @@ def rddembed_ILS_EXT(row, rate=None): | ||
| 188 | raw = tmpf_dst.read() | 189 | raw = tmpf_dst.read() |
| 189 | index = md5(raw).hexdigest() | 190 | index = md5(raw).hexdigest() |
| 190 | 191 | ||
| 191 | - return [row,(index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))] | 192 | + return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))] |
| 192 | 193 | ||
| 193 | except Exception as e: | 194 | except Exception as e: |
| 194 | print e | 195 | print e |
| @@ -282,10 +283,10 @@ class Sparker(object): | @@ -282,10 +283,10 @@ class Sparker(object): | ||
| 282 | """ | 283 | """ |
| 283 | 284 | ||
| 284 | hconf = { | 285 | hconf = { |
| 285 | - "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", | ||
| 286 | - #"hbase.zookeeper.quorum": self.host, | ||
| 287 | - "hbase.mapreduce.inputtable": table_name, | ||
| 288 | - } | 286 | + "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", |
| 287 | + # "hbase.zookeeper.quorum": self.host, | ||
| 288 | + "hbase.mapreduce.inputtable": table_name, | ||
| 289 | + } | ||
| 289 | 290 | ||
| 290 | hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"], | 291 | hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"], |
| 291 | keyClass=hparams["readKeyClass"], | 292 | keyClass=hparams["readKeyClass"], |
| @@ -315,14 +316,13 @@ class Sparker(object): | @@ -315,14 +316,13 @@ class Sparker(object): | ||
| 315 | cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] | 316 | cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] |
| 316 | """ | 317 | """ |
| 317 | hconf = { | 318 | hconf = { |
| 318 | - "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", | ||
| 319 | - #"hbase.zookeeper.quorum": self.host, | ||
| 320 | - "hbase.mapreduce.inputtable": table_name, | ||
| 321 | - "hbase.mapred.outputtable": table_name, | ||
| 322 | - "mapreduce.outputformat.class": hparams["outputFormatClass"], | ||
| 323 | - "mapreduce.job.output.key.class": hparams["writeKeyClass"], | ||
| 324 | - "mapreduce.job.output.value.class": hparams["writeValueClass"], | ||
| 325 | - } | 319 | + "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", # "hbase.zookeeper.quorum": self.host, |
| 320 | + "hbase.mapreduce.inputtable": table_name, | ||
| 321 | + "hbase.mapred.outputtable": table_name, | ||
| 322 | + "mapreduce.outputformat.class": hparams["outputFormatClass"], | ||
| 323 | + "mapreduce.job.output.key.class": hparams["writeKeyClass"], | ||
| 324 | + "mapreduce.job.output.value.class": hparams["writeValueClass"], | ||
| 325 | + } | ||
| 326 | cols = [col.split(':') for col in columns] | 326 | cols = [col.split(':') for col in columns] |
| 327 | if not fromrdd: | 327 | if not fromrdd: |
| 328 | rdd_data = self.sc.parallelize(data) | 328 | rdd_data = self.sc.parallelize(data) |
| @@ -0,0 +1,56 @@ | @@ -0,0 +1,56 @@ | ||
| 1 | +__author__ = 'chunk' | ||
| 2 | + | ||
| 3 | +from ..mspark import SC | ||
| 4 | +from pyspark.mllib.regression import LabeledPoint | ||
| 5 | + | ||
| 6 | + | ||
| 7 | +cols0 = [ | ||
| 8 | + 'cf_pic:data', | ||
| 9 | + 'cf_info:width', | ||
| 10 | + 'cf_info:height', | ||
| 11 | + 'cf_info:size', | ||
| 12 | + 'cf_info:capacity', | ||
| 13 | + 'cf_info:quality', | ||
| 14 | + 'cf_info:rate', | ||
| 15 | + 'cf_tag:chosen', | ||
| 16 | + 'cf_tag:class' | ||
| 17 | +] | ||
| 18 | +cols1 = [ | ||
| 19 | + 'cf_pic:data', | ||
| 20 | + 'cf_info:width', | ||
| 21 | + 'cf_info:height', | ||
| 22 | + 'cf_info:size', | ||
| 23 | + 'cf_info:capacity', | ||
| 24 | + 'cf_info:quality', | ||
| 25 | + 'cf_info:rate', | ||
| 26 | + 'cf_tag:chosen', | ||
| 27 | + 'cf_tag:class', | ||
| 28 | + 'cf_feat:bid', | ||
| 29 | +] | ||
| 30 | + | ||
| 31 | +sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077') | ||
| 32 | + | ||
| 33 | +rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \ | ||
| 34 | + .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \ | ||
| 35 | + .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \ | ||
| 36 | + .mapValues(lambda items: SC.rddfeat_ILS(items)) | ||
| 37 | + | ||
| 38 | +sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1, | ||
| 39 | + withdata=True) | ||
| 40 | + | ||
| 41 | + | ||
| 42 | + | ||
| 43 | + | ||
| 44 | + | ||
| 45 | + | ||
| 46 | + | ||
| 47 | + | ||
| 48 | + | ||
| 49 | + | ||
| 50 | + | ||
| 51 | + | ||
| 52 | + | ||
| 53 | + | ||
| 54 | + | ||
| 55 | + | ||
| 56 | + |