Commit d642d837fa065c133187e9c8f8dbc0283103bfe6

Authored by Chunk
1 parent 489c5608
Exists in master and in 1 other branch: refactor

staged.

Showing 2 changed files with 69 additions and 13 deletions. Show diff stats
mspark/SC.py
... ... @@ -155,6 +155,7 @@ def rddembed_ILS(row, rate=None):
155 155 tmpf_src.close()
156 156 tmpf_dst.close()
157 157  
  158 +
158 159 def rddembed_ILS_EXT(row, rate=None):
159 160 """
160 161 input:
... ... @@ -188,7 +189,7 @@ def rddembed_ILS_EXT(row, rate=None):
188 189 raw = tmpf_dst.read()
189 190 index = md5(raw).hexdigest()
190 191  
191   - return [row,(index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))]
  192 + return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))]
192 193  
193 194 except Exception as e:
194 195 print e
... ... @@ -282,10 +283,10 @@ class Sparker(object):
282 283 """
283 284  
284 285 hconf = {
285   - "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
286   - #"hbase.zookeeper.quorum": self.host,
287   - "hbase.mapreduce.inputtable": table_name,
288   - }
  286 + "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
  287 + # "hbase.zookeeper.quorum": self.host,
  288 + "hbase.mapreduce.inputtable": table_name,
  289 + }
289 290  
290 291 hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
291 292 keyClass=hparams["readKeyClass"],
... ... @@ -315,14 +316,13 @@ class Sparker(object):
315 316 cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
316 317 """
317 318 hconf = {
318   - "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
319   - #"hbase.zookeeper.quorum": self.host,
320   - "hbase.mapreduce.inputtable": table_name,
321   - "hbase.mapred.outputtable": table_name,
322   - "mapreduce.outputformat.class": hparams["outputFormatClass"],
323   - "mapreduce.job.output.key.class": hparams["writeKeyClass"],
324   - "mapreduce.job.output.value.class": hparams["writeValueClass"],
325   - }
  319 + "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", # "hbase.zookeeper.quorum": self.host,
  320 + "hbase.mapreduce.inputtable": table_name,
  321 + "hbase.mapred.outputtable": table_name,
  322 + "mapreduce.outputformat.class": hparams["outputFormatClass"],
  323 + "mapreduce.job.output.key.class": hparams["writeKeyClass"],
  324 + "mapreduce.job.output.value.class": hparams["writeValueClass"],
  325 + }
326 326 cols = [col.split(':') for col in columns]
327 327 if not fromrdd:
328 328 rdd_data = self.sc.parallelize(data)
... ...
test/test_whole.py 0 → 100644
... ... @@ -0,0 +1,56 @@
  1 +__author__ = 'chunk'
  2 +
  3 +from ..mspark import SC
  4 +from pyspark.mllib.regression import LabeledPoint
  5 +
  6 +
  7 +cols0 = [
  8 + 'cf_pic:data',
  9 + 'cf_info:width',
  10 + 'cf_info:height',
  11 + 'cf_info:size',
  12 + 'cf_info:capacity',
  13 + 'cf_info:quality',
  14 + 'cf_info:rate',
  15 + 'cf_tag:chosen',
  16 + 'cf_tag:class'
  17 +]
  18 +cols1 = [
  19 + 'cf_pic:data',
  20 + 'cf_info:width',
  21 + 'cf_info:height',
  22 + 'cf_info:size',
  23 + 'cf_info:capacity',
  24 + 'cf_info:quality',
  25 + 'cf_info:rate',
  26 + 'cf_tag:chosen',
  27 + 'cf_tag:class',
  28 + 'cf_feat:bid',
  29 +]
  30 +
  31 +sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
  32 +
  33 +rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
  34 + .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
  35 + .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
  36 + .mapValues(lambda items: SC.rddfeat_ILS(items))
  37 +
  38 +sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
  39 + withdata=True)
  40 +
  41 +
  42 +
  43 +
  44 +
  45 +
  46 +
  47 +
  48 +
  49 +
  50 +
  51 +
  52 +
  53 +
  54 +
  55 +
  56 +
... ...