Commit f69baeb66d0df4b82e33012523d21133c76cbe9a

Authored by Chunk
1 parent ca73c96f

spark streaming init.

.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
@@ -14,6 +14,8 @@ import ConfigParser
 
 import numpy as np
 
+package_dir = os.path.dirname(os.path.abspath(__file__))
+
 
 class Timer():
     def __init__(self):
@@ -66,7 +68,8 @@ def get_env_variable(var_name, default=False):
     import StringIO
     import ConfigParser
 
-    env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env")
+    res_envfile = os.path.join(package_dir, 'res', '.env')
+    env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile)
     try:
         config = StringIO.StringIO()
         config.write("[DATA]\n")
@@ -95,7 +98,8 @@ def get_env_variable(var_name, default=False):
 
 
 def load_env(default=False):
-    env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env")
+    res_envfile = os.path.join(package_dir, 'res', '.env')
+    env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile)
     try:
         config = StringIO.StringIO()
         config.write("[DATA]\n")
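
Why this change matters: with the old default "res/.env", the path was resolved against the process's current working directory, so get_env_variable() and load_env() broke whenever the program was launched from anywhere but the project root. Anchoring the path to the module's own location makes it stable. A minimal standalone sketch of the idea (not the project's code):

    import os

    # Resolve resources relative to this file, not to os.getcwd().
    package_dir = os.path.dirname(os.path.abspath(__file__))
    env_file = os.environ.get('PROJECT_ENV_FILE',
                              os.path.join(package_dir, 'res', '.env'))
    print(env_file)  # same absolute path no matter where the script is run from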
@@ -1,9 +1,10 @@
 __author__ = 'chunk'
 
 from ..common import *
+from .dependencies import *
+from . import *
 
 import sys
-from dependencies import *
 from pyspark import SparkConf, SparkContext
 from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD
 from pyspark.mllib.regression import LabeledPoint
@@ -12,23 +13,6 @@ import json
 import pickle
 
 
-hparams = dict(
-    inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat",
-    readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
-    readValueClass="org.apache.hadoop.hbase.client.Result",
-    readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
-    readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter",
-
-    outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
-    writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
-    # writeValueClass="org.apache.hadoop.io.Writable",
-    writeValueClass="org.apache.hadoop.hbase.client.Put",
-    writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
-    writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter",
-
-)
-
-
 def parse_cv(raw_row):
     """
     input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
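
The docstring above pins down the wire format parse_cv receives: each HBase row arrives as a (key, value) pair whose value packs every cell as "family:qualifier:payload", joined by the "--%--" sentinel. A hypothetical parse of a row in that shape (the sample data is made up):

    raw_row = (u'key0', u'cf_feat:hog:[0.056273]--%--cf_tag:hog:True')

    key, packed = raw_row
    for cell in packed.split('--%--'):
        # Split on the first two colons only; the payload itself may contain ':'.
        family, qualifier, payload = cell.split(':', 2)
        print(key, family, qualifier, payload)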
mspark/SSC.py 0 → 100644
@@ -0,0 +1,61 @@
+__author__ = 'chunk'
+
+from ..common import *
+from . import *
+from .dependencies import *
+from .SC import *
+
+import sys
+from pyspark import SparkConf, SparkContext
+from pyspark.streaming import StreamingContext
+
+
+class StreamSparker(Sparker):
+    def __init__(self, host='HPC-server', appname='NewPySparkStreamingApp', source='localhost', port=9999, **kwargs):
+        Sparker.__init__(self, host, appname)
+
+        self.source = source
+        self.port = port
+        self.ssc = StreamingContext(sparkContext=self.sc, batchDuration=1)
+
+    def start(self):
+        self.ssc.start()
+        self.ssc.awaitTermination()
+
+    def set_datasource(self, source='localhost', port=9999):
+        self.source = source
+        self.port = port
+
+    def _word_count(self):
+        lines = self.ssc.socketTextStream(self.source, self.port)
+        words = lines.flatMap(lambda line: line.split(" "))
+        pairs = words.map(lambda word: (word, 1))
+        wordCounts = pairs.reduceByKey(lambda x, y: x + y)
+
+        wordCounts.pprint()
+
+        self.start()
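
batchDuration=1 cuts the stream into one-second micro-batches, and note that _word_count() ends by calling self.start(), so it blocks in awaitTermination() until the job is killed. A hypothetical driver, assuming the package is importable and that Sparker (defined in SC.py) sets up self.sc against the given master host:

    # Terminal 1: a toy text source on the port StreamSparker listens to.
    #   $ nc -lk 9999
    # Terminal 2: submit this driver with spark-submit.
    from mspark.SSC import StreamSparker

    sparker = StreamSparker(appname='WordCountDemo',
                            source='localhost', port=9999)
    sparker._word_count()  # builds the DStream pipeline, then blocks in start()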
mspark/__init__.py
@@ -1,1 +1,18 @@
 __author__ = 'chunk'
+
+hparams = dict(
+    inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat",
+    readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
+    readValueClass="org.apache.hadoop.hbase.client.Result",
+    readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
+    readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter",
+
+    outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
+    writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
+    # writeValueClass="org.apache.hadoop.io.Writable",
+    writeValueClass="org.apache.hadoop.hbase.client.Put",
+    writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
+    writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter",
+
+)
+
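Moving hparams into the package __init__.py lets both SC.py and SSC.py pick it up via "from . import *". The keys mirror the parameters of PySpark's Hadoop I/O API: the read-side entries line up with SparkContext.newAPIHadoopRDD, the write-side ones with RDD.saveAsNewAPIHadoopDataset, and the converter classes live in the Spark examples jar, which must be on the classpath. A hedged sketch of a read, in the style of Spark's bundled hbase_inputformat.py example (the quorum host and table name here are hypothetical):

    from pyspark import SparkContext
    from mspark import hparams

    sc = SparkContext(appName='HBaseReadDemo')
    conf = {'hbase.zookeeper.quorum': 'HPC-server',  # assumption: ZK on the master
            'hbase.mapreduce.inputtable': 'some_table'}
    rdd = sc.newAPIHadoopRDD(hparams['inputFormatClass'],
                             hparams['readKeyClass'],
                             hparams['readValueClass'],
                             keyConverter=hparams['readKeyConverter'],
                             valueConverter=hparams['readValueConverter'],
                             conf=conf)
    print(rdd.first())  # (row key, packed cell string) after the converters run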