Commit f69baeb66d0df4b82e33012523d21133c76cbe9a
1 parent: ca73c96f
Exists in master and in 2 other branches
spark streaming init.
Showing 5 changed files with 87 additions and 21 deletions
Show diff stats
.idea/ImageR.iml
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | <module type="PYTHON_MODULE" version="4"> |
3 | 3 | <component name="NewModuleRootManager"> |
4 | 4 | <content url="file://$MODULE_DIR$" /> |
5 | - <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" /> | |
5 | + <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" /> | |
6 | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
7 | 7 | </component> |
8 | 8 | </module> |
9 | 9 | \ No newline at end of file | ... | ... |
common.py
... | ... | @@ -14,6 +14,8 @@ import ConfigParser |
14 | 14 | |
15 | 15 | import numpy as np |
16 | 16 | |
17 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | |
18 | + | |
17 | 19 | |
18 | 20 | class Timer(): |
19 | 21 | def __init__(self): |
... | ... | @@ -66,7 +68,8 @@ def get_env_variable(var_name, default=False): |
66 | 68 | import StringIO |
67 | 69 | import ConfigParser |
68 | 70 | |
69 | - env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env") | |
71 | + res_envfile = os.path.join(package_dir, 'res', '.env') | |
72 | + env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile) | |
70 | 73 | try: |
71 | 74 | config = StringIO.StringIO() |
72 | 75 | config.write("[DATA]\n") |
... | ... | @@ -95,7 +98,8 @@ def get_env_variable(var_name, default=False): |
95 | 98 | |
96 | 99 | |
97 | 100 | def load_env(default=False): |
98 | - env_file = os.environ.get('PROJECT_ENV_FILE', "res/.env") | |
101 | + res_envfile = os.path.join(package_dir, 'res', '.env') | |
102 | + env_file = os.environ.get('PROJECT_ENV_FILE', res_envfile) | |
99 | 103 | try: |
100 | 104 | config = StringIO.StringIO() |
101 | 105 | config.write("[DATA]\n") | ... | ... |
mspark/SC.py
1 | 1 | __author__ = 'chunk' |
2 | 2 | |
3 | 3 | from ..common import * |
4 | +from .dependencies import * | |
5 | +from . import * | |
4 | 6 | |
5 | 7 | import sys |
6 | -from dependencies import * | |
7 | 8 | from pyspark import SparkConf, SparkContext |
8 | 9 | from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD |
9 | 10 | from pyspark.mllib.regression import LabeledPoint |
... | ... | @@ -12,23 +13,6 @@ import json |
12 | 13 | import pickle |
13 | 14 | |
14 | 15 | |
15 | -hparams = dict( | |
16 | - inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat", | |
17 | - readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable", | |
18 | - readValueClass="org.apache.hadoop.hbase.client.Result", | |
19 | - readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter", | |
20 | - readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter", | |
21 | - | |
22 | - outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat", | |
23 | - writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable", | |
24 | - # writeValueClass="org.apache.hadoop.io.Writable", | |
25 | - writeValueClass="org.apache.hadoop.hbase.client.Put", | |
26 | - writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter", | |
27 | - writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter", | |
28 | - | |
29 | -) | |
30 | - | |
31 | - | |
32 | 16 | def parse_cv(raw_row): |
33 | 17 | """ |
34 | 18 | input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True') | ... | ... |
... | ... | @@ -0,0 +1,61 @@ |
1 | +__author__ = 'chunk' | |
2 | + | |
3 | +from ..common import * | |
4 | +from . import * | |
5 | +from .dependencies import * | |
6 | +from .SC import * | |
7 | + | |
8 | +import sys | |
9 | +from pyspark import SparkConf, SparkContext | |
10 | +from pyspark.streaming import StreamingContext | |
11 | + | |
12 | + | |
13 | +class StreamSparker(Sparker): | |
14 | + def __init__(self, host='HPC-server', appname='NewPySparkStreamingApp', source='localhost', port=9999, **kwargs): | |
15 | + Sparker.__init__(self, host, appname) | |
16 | + | |
17 | + self.source = source | |
18 | + self.port = port | |
19 | + self.ssc = StreamingContext(sparkContext=self.sc, batchDuration=1) | |
20 | + | |
21 | + def start(self): | |
22 | + self.ssc.start() | |
23 | + self.ssc.awaitTermination() | |
24 | + | |
25 | + | |
26 | + def set_datasource(self, source='localhost', port=9999): | |
27 | + self.source = source | |
28 | + self.port = port | |
29 | + | |
30 | + def _word_count(self): | |
31 | + lines = self.ssc.socketTextStream(self.source, self.port) | |
32 | + words = lines.flatMap(lambda line: line.split(" ")) | |
33 | + pairs = words.map(lambda word: (word, 1)) | |
34 | + wordCounts = pairs.reduceByKey(lambda x, y: x + y) | |
35 | + | |
36 | + wordCounts.pprint() | |
37 | + | |
38 | + self.start() | |
39 | + | |
40 | + | |
41 | + | |
42 | + | |
43 | + | |
44 | + | |
45 | + | |
46 | + | |
47 | + | |
48 | + | |
49 | + | |
50 | + | |
51 | + | |
52 | + | |
53 | + | |
54 | + | |
55 | + | |
56 | + | |
57 | + | |
58 | + | |
59 | + | |
60 | + | |
61 | + | ... | ... |
mspark/__init__.py
1 | 1 | __author__ = 'chunk' |
2 | + | |
3 | +hparams = dict( | |
4 | + inputFormatClass="org.apache.hadoop.hbase.mapreduce.TableInputFormat", | |
5 | + readKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable", | |
6 | + readValueClass="org.apache.hadoop.hbase.client.Result", | |
7 | + readKeyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter", | |
8 | + readValueConverter="org.apache.spark.examples.pythonconverters.CustomHBaseResultToStringConverter", | |
9 | + | |
10 | + outputFormatClass="org.apache.hadoop.hbase.mapreduce.TableOutputFormat", | |
11 | + writeKeyClass="org.apache.hadoop.hbase.io.ImmutableBytesWritable", | |
12 | + # writeValueClass="org.apache.hadoop.io.Writable", | |
13 | + writeValueClass="org.apache.hadoop.hbase.client.Put", | |
14 | + writeKeyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter", | |
15 | + writeValueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter", | |
16 | + | |
17 | +) | |
18 | + | ... | ... |