Commit 4737e1662edbd91e50699151b52004c479a429e1
1 parent 9a5cac33
Exists in master and in 2 other branches staged.
Showing 19 changed files with 221 additions and 84 deletions
Show diff stats
.idea/ImageR.iml
@@ -2,7 +2,7 @@ | @@ -2,7 +2,7 @@ | ||
2 | <module type="PYTHON_MODULE" version="4"> | 2 | <module type="PYTHON_MODULE" version="4"> |
3 | <component name="NewModuleRootManager"> | 3 | <component name="NewModuleRootManager"> |
4 | <content url="file://$MODULE_DIR$" /> | 4 | <content url="file://$MODULE_DIR$" /> |
5 | - <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" /> | 5 | + <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" /> |
6 | <orderEntry type="sourceFolder" forTests="false" /> | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
7 | </component> | 7 | </component> |
8 | </module> | 8 | </module> |
9 | \ No newline at end of file | 9 | \ No newline at end of file |
run.sh
@@ -1,63 +0,0 @@ | @@ -1,63 +0,0 @@ | ||
1 | -#!/bin/zsh | ||
2 | -# chunk @ 2014 | ||
3 | -######################################################################################## | ||
4 | -## | ||
5 | -## F**k World! | ||
6 | -## | ||
7 | -## | ||
8 | -## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath` | ||
9 | -## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath` | ||
10 | -## | ||
11 | -## spark-submit \ | ||
12 | -## --driver-memory 1g \ | ||
13 | -## --executor-memory 1g \ | ||
14 | -## --executor-cores 2 \ | ||
15 | -## --deploy-mode client \ | ||
16 | -## --master yarn \ | ||
17 | -## --class "FuckWorld" \ | ||
18 | -## $APP_JAR $ARGS | ||
19 | -## | ||
20 | -## spark-class org.apache.spark.deploy.yarn.Client \ | ||
21 | -## --num-executors 2 \ | ||
22 | -## --executor-cores 2 \ | ||
23 | -## --driver-memory 1g \ | ||
24 | -## --executor-memory 1g \ | ||
25 | -## --name "F**k World" \ | ||
26 | -## --jar $APP_JAR \ | ||
27 | -## --class "FuckWorld" \ | ||
28 | -## --args $ARGS | ||
29 | -## | ||
30 | -##spark-submit \ | ||
31 | -## --driver-memory 1g \ | ||
32 | -## --executor-memory 1g \ | ||
33 | -## --executor-cores 2 \ | ||
34 | -## --master spark://HPC-server:7077 \ | ||
35 | -## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | ||
36 | -## $APP_JAR $ARGS | ||
37 | -######################################################################################## | ||
38 | - | ||
39 | -source /home/hadoop/.zshrc | ||
40 | -v env1 | ||
41 | - | ||
42 | -export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python | ||
43 | -export SPARK_CLASSPATH=`hbase classpath` | ||
44 | -export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar | ||
45 | - | ||
46 | -#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip | ||
47 | -# --py-files $COMPRESSED \ | ||
48 | -COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip | ||
49 | - | ||
50 | -APP=test_spark.py | ||
51 | -#APP=test_model.py | ||
52 | -ARGS= | ||
53 | - | ||
54 | -spark-submit \ | ||
55 | - --driver-memory 1g \ | ||
56 | - --executor-memory 2g \ | ||
57 | - --executor-cores 2 \ | ||
58 | - --master spark://HPC-server:7077 \ | ||
59 | - --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | ||
60 | - --py-files $COMPRESSED \ | ||
61 | - $APP $ARGS | ||
62 | - | ||
63 | - |
@@ -0,0 +1,63 @@ | @@ -0,0 +1,63 @@ | ||
1 | +#!/bin/zsh | ||
2 | +# chunk @ 2014 | ||
3 | +######################################################################################## | ||
4 | +## | ||
5 | +## F**k World! | ||
6 | +## | ||
7 | +## | ||
8 | +## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath` | ||
9 | +## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath` | ||
10 | +## | ||
11 | +## spark-submit \ | ||
12 | +## --driver-memory 1g \ | ||
13 | +## --executor-memory 1g \ | ||
14 | +## --executor-cores 2 \ | ||
15 | +## --deploy-mode client \ | ||
16 | +## --master yarn \ | ||
17 | +## --class "FuckWorld" \ | ||
18 | +## $APP_JAR $ARGS | ||
19 | +## | ||
20 | +## spark-class org.apache.spark.deploy.yarn.Client \ | ||
21 | +## --num-executors 2 \ | ||
22 | +## --executor-cores 2 \ | ||
23 | +## --driver-memory 1g \ | ||
24 | +## --executor-memory 1g \ | ||
25 | +## --name "F**k World" \ | ||
26 | +## --jar $APP_JAR \ | ||
27 | +## --class "FuckWorld" \ | ||
28 | +## --args $ARGS | ||
29 | +## | ||
30 | +##spark-submit \ | ||
31 | +## --driver-memory 1g \ | ||
32 | +## --executor-memory 1g \ | ||
33 | +## --executor-cores 2 \ | ||
34 | +## --master spark://HPC-server:7077 \ | ||
35 | +## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | ||
36 | +## $APP_JAR $ARGS | ||
37 | +######################################################################################## | ||
38 | + | ||
39 | +source /home/hadoop/.zshrc | ||
40 | +v env1 | ||
41 | + | ||
42 | +export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python | ||
43 | +export SPARK_CLASSPATH=`hbase classpath` | ||
44 | +export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar | ||
45 | + | ||
46 | +#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip | ||
47 | +# --py-files $COMPRESSED \ | ||
48 | +COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip | ||
49 | + | ||
50 | +APP=test_spark.py | ||
51 | +#APP=test_model.py | ||
52 | +ARGS= | ||
53 | + | ||
54 | +spark-submit \ | ||
55 | + --driver-memory 1g \ | ||
56 | + --executor-memory 2g \ | ||
57 | + --executor-cores 2 \ | ||
58 | + --master spark://HPC-server:7077 \ | ||
59 | + --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | ||
60 | + --py-files $COMPRESSED \ | ||
61 | + $APP $ARGS | ||
62 | + | ||
63 | + |
@@ -0,0 +1,23 @@ | @@ -0,0 +1,23 @@ | ||
1 | +#!/bin/zsh | ||
2 | +# chunk @ 2014 | ||
3 | + | ||
4 | +#################################################################### | ||
5 | +## environment variables | ||
6 | +#################################################################### | ||
7 | +export TERM=xterm | ||
8 | +source /home/hadoop/.zshrc | ||
9 | +v env0 | ||
10 | + | ||
11 | +#################################################################### | ||
12 | +## additional files list | ||
13 | +#################################################################### | ||
14 | +FILE=hehe.json | ||
15 | + | ||
16 | +#scrapy runspider spider/test.py | ||
17 | +cd ./spider/mspider/ | ||
18 | +[ -f $FILE ] && rm $FILE | ||
19 | +scrapy crawl dmoz -o $FILE | ||
20 | + | ||
21 | + | ||
22 | + | ||
23 | + |
spider/__init__.py
@@ -0,0 +1,44 @@ | @@ -0,0 +1,44 @@ | ||
1 | +[{"link": ["/"], "title": ["Top"]}, | ||
2 | +{"link": ["/Computers/"], "title": ["Computers"]}, | ||
3 | +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | ||
4 | +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | ||
5 | +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | ||
6 | +{"link": [], "title": []}, | ||
7 | +{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, | ||
8 | +{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, | ||
9 | +{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, | ||
10 | +{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, | ||
11 | +{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, | ||
12 | +{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, | ||
13 | +{"link": ["/"], "title": ["Top"]}, | ||
14 | +{"link": ["/Computers/"], "title": ["Computers"]}, | ||
15 | +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | ||
16 | +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | ||
17 | +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | ||
18 | +{"link": [], "title": []}, | ||
19 | +{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, | ||
20 | +{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, | ||
21 | +{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, | ||
22 | +{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, | ||
23 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, | ||
24 | +{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, | ||
25 | +{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, | ||
26 | +{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, | ||
27 | +{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, | ||
28 | +{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, | ||
29 | +{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, | ||
30 | +{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, | ||
31 | +{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, | ||
32 | +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, | ||
33 | +{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, | ||
34 | +{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, | ||
35 | +{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, | ||
36 | +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, | ||
37 | +{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, | ||
38 | +{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, | ||
39 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, | ||
40 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, | ||
41 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, | ||
42 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, | ||
43 | +{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, | ||
44 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] | ||
0 | \ No newline at end of file | 45 | \ No newline at end of file |
No preview for this file type
@@ -0,0 +1,19 @@ | @@ -0,0 +1,19 @@ | ||
1 | +# -*- coding: utf-8 -*- | ||
2 | + | ||
3 | +# Define here the models for your scraped items | ||
4 | +# | ||
5 | +# See documentation in: | ||
6 | +# http://doc.scrapy.org/en/latest/topics/items.html | ||
7 | + | ||
8 | +import scrapy | ||
9 | + | ||
10 | + | ||
11 | +class MspiderItem(scrapy.Item): | ||
12 | + # define the fields for your item here like: | ||
13 | + # name = scrapy.Field() | ||
14 | + pass | ||
15 | + | ||
16 | +class DmozItem(scrapy.Item): | ||
17 | + title = scrapy.Field() | ||
18 | + link = scrapy.Field() | ||
19 | + desc = scrapy.Field() |
No preview for this file type
@@ -0,0 +1,11 @@ | @@ -0,0 +1,11 @@ | ||
1 | +# -*- coding: utf-8 -*- | ||
2 | + | ||
3 | +# Define your item pipelines here | ||
4 | +# | ||
5 | +# Don't forget to add your pipeline to the ITEM_PIPELINES setting | ||
6 | +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html | ||
7 | + | ||
8 | + | ||
9 | +class MspiderPipeline(object): | ||
10 | + def process_item(self, item, spider): | ||
11 | + return item |
@@ -0,0 +1,17 @@ | @@ -0,0 +1,17 @@ | ||
1 | +# -*- coding: utf-8 -*- | ||
2 | + | ||
3 | +# Scrapy settings for mspider project | ||
4 | +# | ||
5 | +# For simplicity, this file contains only the most important settings by | ||
6 | +# default. All the other settings are documented here: | ||
7 | +# | ||
8 | +# http://doc.scrapy.org/en/latest/topics/settings.html | ||
9 | +# | ||
10 | + | ||
11 | +BOT_NAME = 'mspider' | ||
12 | + | ||
13 | +SPIDER_MODULES = ['mspider.spiders'] | ||
14 | +NEWSPIDER_MODULE = 'mspider.spiders' | ||
15 | + | ||
16 | +# Crawl responsibly by identifying yourself (and your website) on the user-agent | ||
17 | +#USER_AGENT = 'mspider (+http://www.yourdomain.com)' |
No preview for this file type
No preview for this file type
@@ -0,0 +1,27 @@ | @@ -0,0 +1,27 @@ | ||
1 | +__author__ = 'chunk' | ||
2 | + | ||
3 | +import scrapy | ||
4 | +from ..items import DmozItem | ||
5 | + | ||
6 | +class DmozSpider(scrapy.Spider): | ||
7 | + name = "dmoz" | ||
8 | + allowed_domains = ["dmoz.org"] | ||
9 | + start_urls = [ | ||
10 | + "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", | ||
11 | + "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" | ||
12 | + ] | ||
13 | + | ||
14 | + def parse(self, response): | ||
15 | + """ | ||
16 | + This is the default callback used by Scrapy to process downloaded responses. | ||
17 | + The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow. | ||
18 | + This method, as well as any other Request callback, must return an iterable of Request and/or Item objects. | ||
19 | + | ||
20 | + Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html | ||
21 | + """ | ||
22 | + for sel in response.xpath('//ul/li'): | ||
23 | + item = DmozItem() | ||
24 | + item['title'] = sel.xpath('a/text()').extract() | ||
25 | + item['link'] = sel.xpath('a/@href').extract() | ||
26 | + | ||
27 | + yield item |
No preview for this file type
@@ -0,0 +1,11 @@ | @@ -0,0 +1,11 @@ | ||
1 | +# Automatically created by: scrapy startproject | ||
2 | +# | ||
3 | +# For more information about the [deploy] section see: | ||
4 | +# http://doc.scrapy.org/en/latest/topics/scrapyd.html | ||
5 | + | ||
6 | +[settings] | ||
7 | +default = mspider.settings | ||
8 | + | ||
9 | +[deploy] | ||
10 | +#url = http://localhost:6800/ | ||
11 | +project = mspider |