From a9d3ceff2adf356037aa157b5edc7a78b43ccc37 Mon Sep 17 00:00:00 2001 From: Chunk Date: Thu, 26 Mar 2015 20:12:49 +0800 Subject: [PATCH] Douban as an example. --- run_spider.sh | 17 +++++++++++------ spider/mspider/hehe.json | 45 +-------------------------------------------- spider/mspider/mspider/items.py | 10 ++++++---- spider/mspider/mspider/items.pyc | Bin 709 -> 0 bytes spider/mspider/mspider/pipelines.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ spider/mspider/mspider/pipelines.pyc | Bin 0 -> 3021 bytes spider/mspider/mspider/settings.py | 20 +++++++++++++++++++- spider/mspider/mspider/settings.pyc | Bin 273 -> 0 bytes spider/mspider/mspider/spiders/test000.py | 53 ++++++++++++++++++++++++++++++++++++++++++----------- spider/mspider/mspider/spiders/test000.pyc | Bin 1172 -> 0 bytes spider/mspider/useragents.txt | 30 ++++++++++++++++++++++++++++++ 11 files changed, 177 insertions(+), 66 deletions(-) create mode 100644 spider/mspider/mspider/pipelines.pyc create mode 100644 spider/mspider/useragents.txt diff --git a/run_spider.sh b/run_spider.sh index f343d09..78ec310 100755 --- a/run_spider.sh +++ b/run_spider.sh @@ -1,22 +1,27 @@ #!/bin/zsh # chunk @ 2014 +######################################################################################## +## +## F**k World! +## +######################################################################################## -#################################################################### +############################################## ## environment variables -#################################################################### -export export TERM=xterm +############################################## +export export TERM=linux source /home/hadoop/.zshrc v env0 -#################################################################### +############################################## ## additional files list -#################################################################### +############################################## FILE=hehe.json #scrapy runspider spider/test.py cd ./spider/mspider/ [ -f $FILE ] && rm $FILE -scrapy crawl dmoz -o $FILE +scrapy crawl douban -o $FILE diff --git a/spider/mspider/hehe.json b/spider/mspider/hehe.json index ba62262..9ea46a4 100644 --- a/spider/mspider/hehe.json +++ b/spider/mspider/hehe.json @@ -1,44 +1 @@ -[{"link": ["/"], "title": ["Top"]}, -{"link": ["/Computers/"], "title": ["Computers"]}, -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, -{"link": [], "title": []}, -{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, -{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, -{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, -{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, -{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, -{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, -{"link": ["/"], "title": ["Top"]}, -{"link": ["/Computers/"], "title": ["Computers"]}, -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, -{"link": [], "title": []}, -{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, -{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, -{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, -{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, -{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, -{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, -{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, -{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, -{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, -{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, -{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, -{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, -{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, -{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, -{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, -{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, -{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, -{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, -{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, -{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, -{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] \ No newline at end of file +[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}] \ No newline at end of file diff --git a/spider/mspider/mspider/items.py b/spider/mspider/mspider/items.py index dd3420f..625158b 100644 --- a/spider/mspider/mspider/items.py +++ b/spider/mspider/mspider/items.py @@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item): # name = scrapy.Field() pass -class DmozItem(scrapy.Item): - title = scrapy.Field() - link = scrapy.Field() - desc = scrapy.Field() +class DoubanItem(scrapy.Item): + ind = scrapy.Field() + name = scrapy.Field() + director = scrapy.Field() + rate = scrapy.Field() + diff --git a/spider/mspider/mspider/items.pyc b/spider/mspider/mspider/items.pyc index 7a90464..6e0ab4a 100644 Binary files a/spider/mspider/mspider/items.pyc and b/spider/mspider/mspider/items.pyc differ diff --git a/spider/mspider/mspider/pipelines.py b/spider/mspider/mspider/pipelines.py index 267bbeb..e2535a9 100644 --- a/spider/mspider/mspider/pipelines.py +++ b/spider/mspider/mspider/pipelines.py @@ -5,7 +5,75 @@ # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +from .items import DoubanItem + +from hashlib import md5 +import happybase + + +class HbaseDumper(object): + def __init__(self, Item, tablename=None): + self.Item = Item # class not object + self.table_name = tablename if tablename != None else self.Item.__name__ + self.table = None + self.connection = None + self.sparkcontex = None + + def get_table(self): + if self.table != None: + return self.table + + if self.connection is None: + c = happybase.Connection('HPC-server') + self.connection = c + + tables = self.connection.tables() + if self.table_name not in tables: + families = self.Item.fields + self.connection.create_table(name=self.table_name, families=families) + + table = self.connection.table(name=self.table_name) + + self.table = table + + return table + + def store_item(self, item): + if self.table == None: + self.table = self.get_table() + data = {} + for key in item.keys(): + data[key + ':'] = item[key] + + self.table.put(item['ind'], data) + + def store_items(self, items): + if self.table == None: + self.table = self.get_table() + + dict_databuf = {} + for item in items: + data = {} + for key in item.keys(): + data[key + ':'] = item[key] + dict_databuf[item['ind']] = data + + try: + with self.table.batch(batch_size=5000) as b: + for rowkey, data in dict_databuf.items(): + b.put(rowkey, data) + except ValueError: + raise + pass + class MspiderPipeline(object): + def __init__(self): + self.hbasedumper = HbaseDumper(DoubanItem) + def process_item(self, item, spider): + try: + self.hbasedumper.store_item(item) + except: + raise return item diff --git a/spider/mspider/mspider/pipelines.pyc b/spider/mspider/mspider/pipelines.pyc new file mode 100644 index 0000000..4580230 Binary files /dev/null and b/spider/mspider/mspider/pipelines.pyc differ diff --git a/spider/mspider/mspider/settings.py b/spider/mspider/mspider/settings.py index 7def0d4..9d876f8 100644 --- a/spider/mspider/mspider/settings.py +++ b/spider/mspider/mspider/settings.py @@ -5,7 +5,7 @@ # For simplicity, this file contains only the most important settings by # default. All the other settings are documented here: # -# http://doc.scrapy.org/en/latest/topics/settings.html +# http://doc.scrapy.org/en/latest/topics/settings.html # BOT_NAME = 'mspider' @@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'mspider (+http://www.yourdomain.com)' + + + +##Custom for Chunk + +# anti-banning +COOKIES_ENABLED = True +# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36' +DOWNLOADER_MIDDLEWARES = { + 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None, + 'random_useragent.RandomUserAgentMiddleware': 400 +} +# http://www.useragentstring.com/pages/useragentstring.php +USER_AGENT_LIST = 'useragents.txt' +# DOWNLOAD_DELAY = 0.1 + +# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } + diff --git a/spider/mspider/mspider/settings.pyc b/spider/mspider/mspider/settings.pyc index d58205b..b90b201 100644 Binary files a/spider/mspider/mspider/settings.pyc and b/spider/mspider/mspider/settings.pyc differ diff --git a/spider/mspider/mspider/spiders/test000.py b/spider/mspider/mspider/spiders/test000.py index 3a23b9a..1fbc1ea 100644 --- a/spider/mspider/mspider/spiders/test000.py +++ b/spider/mspider/mspider/spiders/test000.py @@ -1,16 +1,29 @@ +# -*- coding: utf-8 -*- __author__ = 'chunk' +from ..items import DoubanItem + import scrapy -from ..items import DmozItem +from scrapy import FormRequest +from scrapy.http import Request +from scrapy.utils.response import get_base_url +from urlparse import urljoin +import re +from hashlib import md5 + -class DmozSpider(scrapy.Spider): - name = "dmoz" - allowed_domains = ["dmoz.org"] +class DoubanSpider(scrapy.Spider): + name = "douban" + allowed_domains = ["douban.com"] start_urls = [ - "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", - "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" + "http://movie.douban.com/tag/", ] + def start_requests(self): + return [FormRequest("http://movie.douban.com/tag/", + cookies={'bid': "SCAM2676P0o"}, + callback=self.parse)] + def parse(self, response): """ This is the default callback used by Scrapy to process downloaded responses @@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider): Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html """ - for sel in response.xpath('//ul/li'): - item = DmozItem() - item['title'] = sel.xpath('a/text()').extract() - item['link'] = sel.xpath('a/@href').extract() + base_url = get_base_url(response) + for url in response.xpath('//@href').extract(): + url = urljoin(base_url, url.strip()) + if re.match('http://movie.douban.com/tag/\w+', url, re.U): + return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) + + + def parse_list(self, response): + base_url = get_base_url(response) + for url in response.xpath('//@href').extract(): + url = urljoin(base_url, url.strip()) + if re.match('http://movie.douban.com/subject/\w+', url, re.U): + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) + + def parse_item(self, response): + item = DoubanItem() + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() + item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract() + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract() + + return item - yield item + def parse_details(self, response): + pass diff --git a/spider/mspider/mspider/spiders/test000.pyc b/spider/mspider/mspider/spiders/test000.pyc index 8eb720d..32ed789 100644 Binary files a/spider/mspider/mspider/spiders/test000.pyc and b/spider/mspider/mspider/spiders/test000.pyc differ diff --git a/spider/mspider/useragents.txt b/spider/mspider/useragents.txt new file mode 100644 index 0000000..a67dabb --- /dev/null +++ b/spider/mspider/useragents.txt @@ -0,0 +1,30 @@ +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36 +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0) +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64) +Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0) +Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0) +Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) +Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko +Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0 +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0 +Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0 +Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 +Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 \ No newline at end of file -- libgit2 0.21.2