From 4737e1662edbd91e50699151b52004c479a429e1 Mon Sep 17 00:00:00 2001 From: Chunk Date: Thu, 26 Mar 2015 13:13:20 +0800 Subject: [PATCH] staged. --- .idea/ImageR.iml | 2 +- run.sh | 63 --------------------------------------------------------------- run_spark.sh | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ run_spider.sh | 23 +++++++++++++++++++++++ spider/__init__.py | 2 +- spider/mspider/hehe.json | 44 ++++++++++++++++++++++++++++++++++++++++++++ spider/mspider/mspider/__init__.py | 0 spider/mspider/mspider/__init__.pyc | Bin 0 -> 163 bytes spider/mspider/mspider/items.py | 19 +++++++++++++++++++ spider/mspider/mspider/items.pyc | Bin 0 -> 709 bytes spider/mspider/mspider/pipelines.py | 11 +++++++++++ spider/mspider/mspider/settings.py | 17 +++++++++++++++++ spider/mspider/mspider/settings.pyc | Bin 0 -> 273 bytes spider/mspider/mspider/spiders/__init__.py | 4 ++++ spider/mspider/mspider/spiders/__init__.pyc | Bin 0 -> 171 bytes spider/mspider/mspider/spiders/test000.py | 27 +++++++++++++++++++++++++++ spider/mspider/mspider/spiders/test000.pyc | Bin 0 -> 1172 bytes spider/mspider/scrapy.cfg | 11 +++++++++++ spider/test.py | 19 ------------------- 19 files changed, 221 insertions(+), 84 deletions(-) delete mode 100755 run.sh create mode 100755 run_spark.sh create mode 100755 run_spider.sh create mode 100644 spider/mspider/hehe.json create mode 100644 spider/mspider/mspider/__init__.py create mode 100644 spider/mspider/mspider/__init__.pyc create mode 100644 spider/mspider/mspider/items.py create mode 100644 spider/mspider/mspider/items.pyc create mode 100644 spider/mspider/mspider/pipelines.py create mode 100644 spider/mspider/mspider/settings.py create mode 100644 spider/mspider/mspider/settings.pyc create mode 100644 spider/mspider/mspider/spiders/__init__.py create mode 100644 spider/mspider/mspider/spiders/__init__.pyc create mode 100644 spider/mspider/mspider/spiders/test000.py create mode 100644 spider/mspider/mspider/spiders/test000.pyc create mode 100644 spider/mspider/scrapy.cfg delete mode 100644 spider/test.py diff --git a/.idea/ImageR.iml b/.idea/ImageR.iml index 6d00e52..4f21c2d 100644 --- a/.idea/ImageR.iml +++ b/.idea/ImageR.iml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/run.sh b/run.sh deleted file mode 100755 index 0ba2638..0000000 --- a/run.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/zsh -# chunk @ 2014 -######################################################################################## -## -## F**k World! -## -## -## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath` -## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath` -## -## spark-submit \ -## --driver-memory 1g \ -## --executor-memory 1g \ -## --executor-cores 2 \ -## --deploy-mode client \ -## --master yarn \ -## --class "FuckWorld" \ -## $APP_JAR $ARGS -## -## spark-class org.apache.spark.deploy.yarn.Client \ -## --num-executors 2 \ -## --executor-cores 2 \ -## --driver-memory 1g \ -## --executor-memory 1g \ -## --name "F**k World" \ -## --jar $APP_JAR \ -## --class "FuckWorld" \ -## --args $ARGS -## -##spark-submit \ -## --driver-memory 1g \ -## --executor-memory 1g \ -## --executor-cores 2 \ -## --master spark://HPC-server:7077 \ -## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ -## $APP_JAR $ARGS -######################################################################################## - -source /home/hadoop/.zshrc -v env1 - -export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python -export SPARK_CLASSPATH=`hbase classpath` -export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar - -#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip -# --py-files $COMPRESSED \ -COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip - -APP=test_spark.py -#APP=test_model.py -ARGS= - -spark-submit \ - --driver-memory 1g \ - --executor-memory 2g \ - --executor-cores 2 \ - --master spark://HPC-server:7077 \ - --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ - --py-files $COMPRESSED \ - $APP $ARGS - - diff --git a/run_spark.sh b/run_spark.sh new file mode 100755 index 0000000..0ba2638 --- /dev/null +++ b/run_spark.sh @@ -0,0 +1,63 @@ +#!/bin/zsh +# chunk @ 2014 +######################################################################################## +## +## F**k World! +## +## +## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath` +## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath` +## +## spark-submit \ +## --driver-memory 1g \ +## --executor-memory 1g \ +## --executor-cores 2 \ +## --deploy-mode client \ +## --master yarn \ +## --class "FuckWorld" \ +## $APP_JAR $ARGS +## +## spark-class org.apache.spark.deploy.yarn.Client \ +## --num-executors 2 \ +## --executor-cores 2 \ +## --driver-memory 1g \ +## --executor-memory 1g \ +## --name "F**k World" \ +## --jar $APP_JAR \ +## --class "FuckWorld" \ +## --args $ARGS +## +##spark-submit \ +## --driver-memory 1g \ +## --executor-memory 1g \ +## --executor-cores 2 \ +## --master spark://HPC-server:7077 \ +## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ +## $APP_JAR $ARGS +######################################################################################## + +source /home/hadoop/.zshrc +v env1 + +export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python +export SPARK_CLASSPATH=`hbase classpath` +export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar + +#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip +# --py-files $COMPRESSED \ +COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip + +APP=test_spark.py +#APP=test_model.py +ARGS= + +spark-submit \ + --driver-memory 1g \ + --executor-memory 2g \ + --executor-cores 2 \ + --master spark://HPC-server:7077 \ + --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ + --py-files $COMPRESSED \ + $APP $ARGS + + diff --git a/run_spider.sh b/run_spider.sh new file mode 100755 index 0000000..f343d09 --- /dev/null +++ b/run_spider.sh @@ -0,0 +1,23 @@ +#!/bin/zsh +# chunk @ 2014 + +#################################################################### +## environment variables +#################################################################### +export export TERM=xterm +source /home/hadoop/.zshrc +v env0 + +#################################################################### +## additional files list +#################################################################### +FILE=hehe.json + +#scrapy runspider spider/test.py +cd ./spider/mspider/ +[ -f $FILE ] && rm $FILE +scrapy crawl dmoz -o $FILE + + + + diff --git a/spider/__init__.py b/spider/__init__.py index 683eeb4..a1459cf 100644 --- a/spider/__init__.py +++ b/spider/__init__.py @@ -1 +1 @@ -__author__ = 'hadoop' +__author__ = 'chunk' diff --git a/spider/mspider/hehe.json b/spider/mspider/hehe.json new file mode 100644 index 0000000..ba62262 --- /dev/null +++ b/spider/mspider/hehe.json @@ -0,0 +1,44 @@ +[{"link": ["/"], "title": ["Top"]}, +{"link": ["/Computers/"], "title": ["Computers"]}, +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, +{"link": [], "title": []}, +{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, +{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, +{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, +{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, +{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, +{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, +{"link": ["/"], "title": ["Top"]}, +{"link": ["/Computers/"], "title": ["Computers"]}, +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, +{"link": [], "title": []}, +{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, +{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, +{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, +{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, +{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, +{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, +{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, +{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, +{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, +{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, +{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, +{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, +{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, +{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, +{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, +{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, +{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, +{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, +{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, +{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, +{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] \ No newline at end of file diff --git a/spider/mspider/mspider/__init__.py b/spider/mspider/mspider/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/spider/mspider/mspider/__init__.py diff --git a/spider/mspider/mspider/__init__.pyc b/spider/mspider/mspider/__init__.pyc new file mode 100644 index 0000000..93ce4b9 Binary files /dev/null and b/spider/mspider/mspider/__init__.pyc differ diff --git a/spider/mspider/mspider/items.py b/spider/mspider/mspider/items.py new file mode 100644 index 0000000..dd3420f --- /dev/null +++ b/spider/mspider/mspider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class MspiderItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass + +class DmozItem(scrapy.Item): + title = scrapy.Field() + link = scrapy.Field() + desc = scrapy.Field() diff --git a/spider/mspider/mspider/items.pyc b/spider/mspider/mspider/items.pyc new file mode 100644 index 0000000..7a90464 Binary files /dev/null and b/spider/mspider/mspider/items.pyc differ diff --git a/spider/mspider/mspider/pipelines.py b/spider/mspider/mspider/pipelines.py new file mode 100644 index 0000000..267bbeb --- /dev/null +++ b/spider/mspider/mspider/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class MspiderPipeline(object): + def process_item(self, item, spider): + return item diff --git a/spider/mspider/mspider/settings.py b/spider/mspider/mspider/settings.py new file mode 100644 index 0000000..7def0d4 --- /dev/null +++ b/spider/mspider/mspider/settings.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for mspider project +# +# For simplicity, this file contains only the most important settings by +# default. All the other settings are documented here: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# + +BOT_NAME = 'mspider' + +SPIDER_MODULES = ['mspider.spiders'] +NEWSPIDER_MODULE = 'mspider.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'mspider (+http://www.yourdomain.com)' diff --git a/spider/mspider/mspider/settings.pyc b/spider/mspider/mspider/settings.pyc new file mode 100644 index 0000000..d58205b Binary files /dev/null and b/spider/mspider/mspider/settings.pyc differ diff --git a/spider/mspider/mspider/spiders/__init__.py b/spider/mspider/mspider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/spider/mspider/mspider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/spider/mspider/mspider/spiders/__init__.pyc b/spider/mspider/mspider/spiders/__init__.pyc new file mode 100644 index 0000000..44aef51 Binary files /dev/null and b/spider/mspider/mspider/spiders/__init__.pyc differ diff --git a/spider/mspider/mspider/spiders/test000.py b/spider/mspider/mspider/spiders/test000.py new file mode 100644 index 0000000..3a23b9a --- /dev/null +++ b/spider/mspider/mspider/spiders/test000.py @@ -0,0 +1,27 @@ +__author__ = 'chunk' + +import scrapy +from ..items import DmozItem + +class DmozSpider(scrapy.Spider): + name = "dmoz" + allowed_domains = ["dmoz.org"] + start_urls = [ + "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", + "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" + ] + + def parse(self, response): + """ + This is the default callback used by Scrapy to process downloaded responses + The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow. + This method, as well as any other Request callback, must return an iterable of Request and/or Item objects. + + Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html + """ + for sel in response.xpath('//ul/li'): + item = DmozItem() + item['title'] = sel.xpath('a/text()').extract() + item['link'] = sel.xpath('a/@href').extract() + + yield item diff --git a/spider/mspider/mspider/spiders/test000.pyc b/spider/mspider/mspider/spiders/test000.pyc new file mode 100644 index 0000000..8eb720d Binary files /dev/null and b/spider/mspider/mspider/spiders/test000.pyc differ diff --git a/spider/mspider/scrapy.cfg b/spider/mspider/scrapy.cfg new file mode 100644 index 0000000..4e0b532 --- /dev/null +++ b/spider/mspider/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# http://doc.scrapy.org/en/latest/topics/scrapyd.html + +[settings] +default = mspider.settings + +[deploy] +#url = http://localhost:6800/ +project = mspider diff --git a/spider/test.py b/spider/test.py deleted file mode 100644 index 6af0de4..0000000 --- a/spider/test.py +++ /dev/null @@ -1,19 +0,0 @@ -__author__ = 'chunk' - - - - - - - - - - - - - - - - - - -- libgit2 0.21.2