Commit 4737e1662edbd91e50699151b52004c479a429e1
Parent: 9a5cac33
Exists in master and in 2 other branches (including staged).
Showing 19 changed files with 221 additions and 84 deletions.
.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
run.sh
@@ -1,63 +0,0 @@
-#!/bin/zsh
-# chunk @ 2014
-########################################################################################
-##
-## F**k World!
-##
-##
-## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath`
-## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath`
-##
-## spark-submit \
-##     --driver-memory 1g \
-##     --executor-memory 1g \
-##     --executor-cores 2 \
-##     --deploy-mode client \
-##     --master yarn \
-##     --class "FuckWorld" \
-##     $APP_JAR $ARGS
-##
-## spark-class org.apache.spark.deploy.yarn.Client \
-##     --num-executors 2 \
-##     --executor-cores 2 \
-##     --driver-memory 1g \
-##     --executor-memory 1g \
-##     --name "F**k World" \
-##     --jar $APP_JAR \
-##     --class "FuckWorld" \
-##     --args $ARGS
-##
-##spark-submit \
-##     --driver-memory 1g \
-##     --executor-memory 1g \
-##     --executor-cores 2 \
-##     --master spark://HPC-server:7077 \
-##     --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
-##     $APP_JAR $ARGS
-########################################################################################
-
-source /home/hadoop/.zshrc
-v env1
-
-export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python
-export SPARK_CLASSPATH=`hbase classpath`
-export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar
-
-#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip
-# --py-files $COMPRESSED \
-COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip
-
-APP=test_spark.py
-#APP=test_model.py
-ARGS=
-
-spark-submit \
-    --driver-memory 1g \
-    --executor-memory 2g \
-    --executor-cores 2 \
-    --master spark://HPC-server:7077 \
-    --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
-    --py-files $COMPRESSED \
-    $APP $ARGS
-
-
@@ -0,0 +1,63 @@
+#!/bin/zsh
+# chunk @ 2014
+########################################################################################
+##
+## F**k World!
+##
+##
+## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath`
+## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath`
+##
+## spark-submit \
+##     --driver-memory 1g \
+##     --executor-memory 1g \
+##     --executor-cores 2 \
+##     --deploy-mode client \
+##     --master yarn \
+##     --class "FuckWorld" \
+##     $APP_JAR $ARGS
+##
+## spark-class org.apache.spark.deploy.yarn.Client \
+##     --num-executors 2 \
+##     --executor-cores 2 \
+##     --driver-memory 1g \
+##     --executor-memory 1g \
+##     --name "F**k World" \
+##     --jar $APP_JAR \
+##     --class "FuckWorld" \
+##     --args $ARGS
+##
+##spark-submit \
+##     --driver-memory 1g \
+##     --executor-memory 1g \
+##     --executor-cores 2 \
+##     --master spark://HPC-server:7077 \
+##     --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
+##     $APP_JAR $ARGS
+########################################################################################
+
+source /home/hadoop/.zshrc
+v env1
+
+export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python
+export SPARK_CLASSPATH=`hbase classpath`
+export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar
+
+#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip
+# --py-files $COMPRESSED \
+COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip
+
+APP=test_spark.py
+#APP=test_model.py
+ARGS=
+
+spark-submit \
+    --driver-memory 1g \
+    --executor-memory 2g \
+    --executor-cores 2 \
+    --master spark://HPC-server:7077 \
+    --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
+    --py-files $COMPRESSED \
+    $APP $ARGS
+
+
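This script submits test_spark.py to the standalone Spark master, shipping the project as per-module zip files via --py-files so their packages are importable on the executors. The driver script itself is not part of this diff; the following is only a hypothetical sketch of what such a driver might look like (app name and the toy job are assumptions, not taken from the repository):

```python
# Hypothetical sketch only -- the real test_spark.py is not shown in this commit.
# Assumes the zips listed in COMPRESSED (mdata.zip, mfeat.zip, ...) are shipped
# via --py-files, which makes their top-level packages importable on executors.
# The master URL and Python interpreter come from the submit script above.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("ImageR-test")
sc = SparkContext(conf=conf)

# trivial job to verify the cluster and the shipped Python environment
rdd = sc.parallelize(range(1000), numSlices=4)
print(rdd.map(lambda x: x * x).sum())

sc.stop()
```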
@@ -0,0 +1,23 @@
+#!/bin/zsh
+# chunk @ 2014
+
+####################################################################
+## environment variables
+####################################################################
+export TERM=xterm
+source /home/hadoop/.zshrc
+v env0
+
+####################################################################
+## additional files list
+####################################################################
+FILE=hehe.json
+
+#scrapy runspider spider/test.py
+cd ./spider/mspider/
+[ -f $FILE ] && rm $FILE   # `scrapy crawl -o` appends to an existing file, so clear the old export first
+scrapy crawl dmoz -o $FILE
+
+
+
+
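The same crawl can also be started from Python rather than the shell. This is a hedged sketch, not part of the commit, assuming a reasonably modern Scrapy (1.x) and that it is run from the project directory where scrapy.cfg lives:

```python
# Sketch: programmatic equivalent of `scrapy crawl dmoz -o hehe.json`.
# Assumes the working directory is the mspider project root (where scrapy.cfg lives).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("FEED_FORMAT", "json")    # export items as a JSON list
settings.set("FEED_URI", "hehe.json")  # same output file the shell script uses

process = CrawlerProcess(settings)
process.crawl("dmoz")   # spider name as declared in the spider class
process.start()         # blocks until the crawl finishes
```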
spider/__init__.py
@@ -0,0 +1,44 @@
+[{"link": ["/"], "title": ["Top"]},
+{"link": ["/Computers/"], "title": ["Computers"]},
+{"link": ["/Computers/Programming/"], "title": ["Programming"]},
+{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
+{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
+{"link": [], "title": []},
+{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]},
+{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]},
+{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]},
+{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]},
+{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]},
+{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]},
+{"link": ["/"], "title": ["Top"]},
+{"link": ["/Computers/"], "title": ["Computers"]},
+{"link": ["/Computers/Programming/"], "title": ["Programming"]},
+{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
+{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
+{"link": [], "title": []},
+{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]},
+{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]},
+{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]},
+{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]},
+{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]},
+{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]},
+{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]},
+{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]},
+{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]},
+{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]},
+{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]},
+{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]},
+{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]},
+{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]},
+{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]},
+{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]},
+{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]},
+{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]},
+{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]},
+{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]},
+{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]},
+{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]},
+{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]},
+{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]},
+{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]},
+{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}]
\ No newline at end of file
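This checked-in JSON is the feed produced by `scrapy crawl dmoz -o hehe.json`: one object per `<li>` element, with title and link as single-element lists because XPath extract() returns a list. A small post-processing sketch (assumed, not part of the commit) shows how the feed might be read back and flattened:

```python
import json

# Assumed reader for the exported feed; "hehe.json" is the file name used in the run script above.
with open("hehe.json") as f:
    items = json.load(f)

# extract() yields lists, so unwrap the single-element lists and skip empty rows
pairs = [(it["title"][0], it["link"][0])
         for it in items if it["title"] and it["link"]]
for title, link in pairs:
    print("%s -> %s" % (title, link))
```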
No preview for this file type
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class MspiderItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+class DmozItem(scrapy.Item):
+    title = scrapy.Field()
+    link = scrapy.Field()
+    desc = scrapy.Field()
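These are the item definitions for the project: the template items.py generated by scrapy startproject plus DmozItem. Scrapy items behave like dicts restricted to their declared fields; a brief usage sketch (not taken from the repository, import path assumed from the package layout):

```python
from mspider.items import DmozItem  # package name assumed from SPIDER_MODULES = ['mspider.spiders']

item = DmozItem(title=["Dive Into Python 3"], link=["http://www.diveintopython.net/"])
item["desc"] = ["free online book"]   # declared field: allowed
# item["author"] = "Mark Pilgrim"     # undeclared field: would raise KeyError
print(dict(item))                     # items convert cleanly to plain dicts
```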
No preview for this file type
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class MspiderPipeline(object):
+    def process_item(self, item, spider):
+        return item
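The generated pipeline is still a pass-through. As an illustration only (assumed, not in this commit), a pipeline that drops items scraped without a link could look like this; it would also need to be registered in ITEM_PIPELINES (see the settings note below):

```python
from scrapy.exceptions import DropItem


class FilterEmptyLinkPipeline(object):
    """Assumed example pipeline: discard items whose 'link' field is empty."""

    def process_item(self, item, spider):
        if not item.get("link"):
            raise DropItem("missing link in %r" % item)
        return item
```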
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for mspider project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+# http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'mspider'
+
+SPIDER_MODULES = ['mspider.spiders']
+NEWSPIDER_MODULE = 'mspider.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'mspider (+http://www.yourdomain.com)'
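Note that ITEM_PIPELINES is not set here, so MspiderPipeline is never invoked. If it were to be enabled, the setting would look roughly like the following (the line is an assumption, not part of the commit):

```python
# Assumed addition to mspider/settings.py -- not present in this commit.
ITEM_PIPELINES = {
    'mspider.pipelines.MspiderPipeline': 300,  # order value 0-1000; lower numbers run earlier
}
```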
No preview for this file type
No preview for this file type
@@ -0,0 +1,27 @@
+__author__ = 'chunk'
+
+import scrapy
+from ..items import DmozItem
+
+class DmozSpider(scrapy.Spider):
+    name = "dmoz"
+    allowed_domains = ["dmoz.org"]
+    start_urls = [
+        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
+        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+    ]
+
+    def parse(self, response):
+        """
+        This is the default callback used by Scrapy to process downloaded responses.
+        The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow.
+        This method, as well as any other Request callback, must return an iterable of Request and/or Item objects.
+
+        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
+        """
+        for sel in response.xpath('//ul/li'):
+            item = DmozItem()
+            item['title'] = sel.xpath('a/text()').extract()
+            item['link'] = sel.xpath('a/@href').extract()
+
+            yield item
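DmozItem declares a desc field, but parse() never fills it, which is why every exported object above carries only link and title. The classic dmoz tutorial fills desc from the text nodes of each `<li>`; a hedged variant (assumed, not part of this commit) would look like:

```python
# Assumed variant of the committed spider (not in this commit): also fills 'desc'.
import scrapy
from mspider.items import DmozItem  # same item the committed spider imports via ..items


class DmozDescSpider(scrapy.Spider):
    name = "dmoz_desc"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()  # text inside the <li> but outside the <a>
            yield item
```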
No preview for this file type
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = mspider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = mspider