Commit f69baeb66d0df4b82e33012523d21133c76cbe9a
1 parent ca73c96f
Exists in master and in 2 other branches

spark streaming init.

Showing 5 changed files with 87 additions and 21 deletions
.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
common.py
@@ -14,6 +14,8 @@ import ConfigParser
 
 import numpy as np
 
+package_dir = os.path.dirname(os.path.abspath(__file__))
+
 
 class Timer():
     def __init__(self):
@@ -66,7 +68,8 @@ def get_env_variable(var_name, default=False):
     import StringIO
     import ConfigParser
 
-    env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env")
+    res_envfile = os.path.join(package_dir, 'res', '.env')
+    env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile)
     try:
         config = StringIO.StringIO()
         config.write("[DATA]\n")
@@ -95,7 +98,8 @@ def get_env_variable(var_name, default=False):
 
 
 def load_env(default=False):
-    env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env")
+    res_envfile = os.path.join(package_dir, 'res', '.env')
+    env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile)
     try:
         config = StringIO.StringIO()
         config.write("[DATA]\n")
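The change in common.py swaps the CWD-relative default "res/.env" for a path anchored to the module's own directory, so the env file resolves correctly even when the process is launched from elsewhere (spark-submit, cron, an IDE). A minimal standalone sketch of the pattern — the names mirror the diff, but this is illustrative, not the project file itself:

    import os

    # Directory containing this module, independent of the caller's CWD.
    package_dir = os.path.dirname(os.path.abspath(__file__))

    # Old default: resolved against os.getcwd(), so it pointed at the wrong
    # place whenever the script was started outside the project root.
    # env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env")

    # New default: always resolves next to the package itself.
    res_envfile = os.path.join(package_dir, 'res', '.env')
    env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile)
    print(env_file)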
mspark/SC.py
@@ -1,9 +1,10 @@
 __author__ = 'chunk'
 
 from ..common import *
+from .dependencies import *
+from . import *
 
 import sys
-from dependencies import *
 from pyspark import SparkConf, SparkContext
 from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD
 from pyspark.mllib.regression import LabeledPoint
@@ -12,23 +13,6 @@ import json
 import pickle
 
 
-hparams = dict(
-    inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat",
-    readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
-    readValueClass="org.apache.hadoop.hbase.client.Result",
-    readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
-    readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter",
-
-    outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
-    writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
-    # writeValueClass="org.apache.hadoop.io.Writable",
-    writeValueClass="org.apache.hadoop.hbase.client.Put",
-    writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
-    writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter",
-
-)
-
-
 def parse_cv(raw_row):
     """
     input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
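The import fix in mspark/SC.py matters beyond style: `from dependencies import *` is a Python 2 implicit relative import, which breaks under absolute-import semantics and can silently shadow a top-level module of the same name. The explicit forms also bring the package-level names (notably hparams, moved to mspark/__init__.py below) into the module. A sketch of the distinction, assuming the package layout implied by the diff:

    # Inside a package module such as mspark/SC.py (illustrative; only
    # meaningful when imported as part of the mspark package):

    # Old — implicit relative import, Python 2 only:
    # from dependencies import *

    # New — explicit relative imports, valid in Python 2 and 3:
    from .dependencies import *   # sibling module mspark/dependencies.py
    from . import *               # package-level names from mspark/__init__.py, e.g. hparams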
(new file — filename not captured in this view)
@@ -0,0 +1,61 @@
+__author__ = 'chunk'
+
+from ..common import *
+from . import *
+from .dependencies import *
+from .SC import *
+
+import sys
+from pyspark import SparkConf, SparkContext
+from pyspark.streaming import StreamingContext
+
+
+class StreamSparker(Sparker):
+    def __init__(self, host='HPC-server', appname='NewPySparkStreamingApp', source='localhost', port=9999, **kwargs):
+        Sparker.__init__(self, host, appname)
+
+        self.source = source
+        self.port = port
+        self.ssc = StreamingContext(sparkContext=self.sc, batchDuration=1)
+
+    def start(self):
+        self.ssc.start()
+        self.ssc.awaitTermination()
+
+
+    def set_datasource(self, source='localhost', port=9999):
+        self.source = source
+        self.port = port
+
+    def _word_count(self):
+        lines = self.ssc.socketTextStream(self.source, self.port)
+        words = lines.flatMap(lambda line: line.split(" "))
+        pairs = words.map(lambda word: (word, 1))
+        wordCounts = pairs.reduceByKey(lambda x, y: x + y)
+
+        wordCounts.pprint()
+
+        self.start()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
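The new file (its name is not shown in this view) adds StreamSparker, a subclass of Sparker from SC.py that wraps the SparkContext in a StreamingContext with a 1-second batch interval; `_word_count` is the canonical socket-source word count, and `start()` blocks until the streaming context terminates. Assuming the class as committed, a driver session would look roughly like this — the import path is a guess, since the filename was not captured:

    # Terminal 1: feed text into the socket that socketTextStream reads.
    #   $ nc -lk 9999

    # Terminal 2: build and run the streaming pipeline.
    from mspark.SSC import StreamSparker   # module name assumed, not shown in the diff

    sparker = StreamSparker(host='HPC-server', appname='NewPySparkStreamingApp',
                            source='localhost', port=9999)
    sparker._word_count()   # flatMap -> map -> reduceByKey -> pprint, then start()

Each 1-second batch prints its word counts until the context is stopped.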
mspark/__init__.py
@@ -1 +1,18 @@
 __author__ = 'chunk'
+
+hparams = dict(
+    inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat",
+    readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
+    readValueClass="org.apache.hadoop.hbase.client.Result",
+    readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
+    readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter",
+
+    outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
+    writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable",
+    # writeValueClass="org.apache.hadoop.io.Writable",
+    writeValueClass="org.apache.hadoop.hbase.client.Put",
+    writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
+    writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter",
+
+)
+
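Hoisting hparams into mspark/__init__.py makes the HBase I/O configuration a package-level constant that SC.py and the new streaming module can both pick up via `from . import *`. Its keys line up with the keyword arguments of PySpark's Hadoop I/O API; a rough sketch of the read side (the ZooKeeper quorum and table name are placeholders, and the pythonconverters classes require the spark-examples jar on the classpath):

    from pyspark import SparkContext
    from mspark import hparams   # the dict defined above

    sc = SparkContext(appName='hbase-read-sketch')

    # Hadoop configuration for the scan; host and table are placeholders.
    conf = {
        'hbase.zookeeper.quorum': 'HPC-server',
        'hbase.mapreduce.inputtable': 'some_table',
    }

    # The read-side converters turn each HBase row into a (key string,
    # value string) pair on the Python side.
    rdd = sc.newAPIHadoopRDD(
        inputFormatClass=hparams['inputFormatClass'],
        keyClass=hparams['readKeyClass'],
        valueClass=hparams['readValueClass'],
        keyConverter=hparams['readKeyConverter'],
        valueConverter=hparams['readValueConverter'],
        conf=conf,
    )
    print(rdd.first())

The write-side entries mirror this through `saveAsNewAPIHadoopDataset`, with StringListToPutConverter assembling HBase Puts from string lists.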
| 18 | + |