Douban as an example.

Chunk
1 parent 4737e166
Showing 11 changed files with 177 additions and 66 deletions Show diff stats
run_spider.sh
spider/mspider/hehe.json
spider/mspider/mspider/items.py
spider/mspider/mspider/items.pyc
spider/mspider/mspider/pipelines.py
spider/mspider/mspider/pipelines.pyc
spider/mspider/mspider/settings.py
spider/mspider/mspider/settings.pyc
spider/mspider/mspider/spiders/test000.py
spider/mspider/mspider/spiders/test000.pyc
spider/mspider/useragents.txt
 #!/bin/zsh
 # chunk @ 2014
+########################################################################################
+##
+## F**k World!
+##
+########################################################################################
-####################################################################
+##############################################
 ## environment variables
-####################################################################
-export export TERM=xterm
+##############################################
+export export TERM=linux
 source /home/hadoop/.zshrc
 v env0
-####################################################################
+##############################################
 ## additional files list
-####################################################################
+##############################################
 FILE=hehe.json
 #scrapy runspider spider/test.py
 cd ./spider/mspider/
 [ -f $FILE ] && rm $FILE
-scrapy crawl dmoz -o $FILE
+scrapy crawl douban -o $FILE
-[{"link": ["/"], "title": ["Top"]},
-{"link": ["/Computers/"], "title": ["Computers"]},
-{"link": ["/Computers/Programming/"], "title": ["Programming"]},
-{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
-{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
-{"link": [], "title": []},
-{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]},
-{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]},
-{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]},
-{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]},
-{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]},
-{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]},
-{"link": ["/"], "title": ["Top"]},
-{"link": ["/Computers/"], "title": ["Computers"]},
-{"link": ["/Computers/Programming/"], "title": ["Programming"]},
-{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
-{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
-{"link": [], "title": []},
-{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]},
-{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]},
-{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]},
-{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]},
-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]},
-{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]},
-{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]},
-{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]},
-{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]},
-{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]},
-{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]},
-{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]},
-{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]},
-{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]},
-{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]},
-{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]},
-{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]},
-{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]},
-{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]},
-{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]},
-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]},
-{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]},
-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]},
-{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]},
-{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]},
-{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}]
 \ No newline at end of file
+[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}]
 \ No newline at end of file
@@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item):
     # name = scrapy.Field()
     pass
-class DmozItem(scrapy.Item):
-    title = scrapy.Field()
-    link = scrapy.Field()
-    desc = scrapy.Field()
+class DoubanItem(scrapy.Item):
+    ind = scrapy.Field()
+    name = scrapy.Field()
+    director = scrapy.Field()
+    rate = scrapy.Field()
+
@@ -5,7 +5,75 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from .items import DoubanItem
+
+from hashlib import md5
+import happybase
+
+
+class HbaseDumper(object):
+    def __init__(self, Item, tablename=None):
+        self.Item = Item  # class not object
+        self.table_name = tablename if tablename != None else self.Item.__name__
+        self.table = None
+        self.connection = None
+        self.sparkcontex = None
+
+    def get_table(self):
+        if self.table != None:
+            return self.table
+
+        if self.connection is None:
+            c = happybase.Connection('HPC-server')
+            self.connection = c
+
+        tables = self.connection.tables()
+        if self.table_name not in tables:
+            families = self.Item.fields
+            self.connection.create_table(name=self.table_name, families=families)
+
+        table = self.connection.table(name=self.table_name)
+
+        self.table = table
+
+        return table
+
+    def store_item(self, item):
+        if self.table == None:
+            self.table = self.get_table()
+        data = {}
+        for key in item.keys():
+            data[key + ':'] = item[key]
+
+        self.table.put(item['ind'], data)
+
+    def store_items(self, items):
+        if self.table == None:
+            self.table = self.get_table()
+
+        dict_databuf = {}
+        for item in items:
+            data = {}
+            for key in item.keys():
+                data[key + ':'] = item[key]
+            dict_databuf[item['ind']] = data
+
+        try:
+            with self.table.batch(batch_size=5000) as b:
+                for rowkey, data in dict_databuf.items():
+                    b.put(rowkey, data)
+        except ValueError:
+            raise
+            pass
+
 class MspiderPipeline(object):
+    def __init__(self):
+        self.hbasedumper = HbaseDumper(DoubanItem)
+
     def process_item(self, item, spider):
+        try:
+            self.hbasedumper.store_item(item)
+        except:
+            raise
         return item
@@ -5,7 +5,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #
 BOT_NAME = 'mspider'
@@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'mspider (+http://www.yourdomain.com)'
+
+
+
+##Custom for Chunk
+
+# anti-banning
+COOKIES_ENABLED = True
+# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
+    'random_useragent.RandomUserAgentMiddleware': 400
+}
+# http://www.useragentstring.com/pages/useragentstring.php
+USER_AGENT_LIST = 'useragents.txt'
+# DOWNLOAD_DELAY = 0.1
+
+# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
+
+# -*- coding: utf-8 -*-
 __author__ = 'chunk'
+from ..items import DoubanItem
+
 import scrapy
-from ..items import DmozItem
+from scrapy import FormRequest
+from scrapy.http import Request
+from scrapy.utils.response import get_base_url
+from urlparse import urljoin
+import re
+from hashlib import md5
+
-class DmozSpider(scrapy.Spider):
-    name = "dmoz"
-    allowed_domains = ["dmoz.org"]
+class DoubanSpider(scrapy.Spider):
+    name = "douban"
+    allowed_domains = ["douban.com"]
     start_urls = [
-        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
-        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+        "http://movie.douban.com/tag/",
     ]
+    def start_requests(self):
+        return [FormRequest("http://movie.douban.com/tag/",
+                            cookies={'bid': "SCAM2676P0o"},
+                            callback=self.parse)]
+
     def parse(self, response):
         """
         This is the default callback used by Scrapy to process downloaded responses
@@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider):
         Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
         """
-        for sel in response.xpath('//ul/li'):
-            item = DmozItem()
-            item['title'] = sel.xpath('a/text()').extract()
-            item['link'] = sel.xpath('a/@href').extract()
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match('http://movie.douban.com/tag/\w+', url, re.U):
+                return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
+
+
+    def parse_list(self, response):
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match('http://movie.douban.com/subject/\w+', url, re.U):
+                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = DoubanItem()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
+        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
+
+        return item
-            yield item
+    def parse_details(self, response):
+        pass
@@ -0,0 +1,30 @@
+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36
+Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)
+Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
+Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)
+Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
+Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko
+Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0
+Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0
+Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
 \ No newline at end of file
1	-[{"link": ["/"], "title": ["Top"]},
2	-{"link": ["/Computers/"], "title": ["Computers"]},
3	-{"link": ["/Computers/Programming/"], "title": ["Programming"]},
4	-{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
5	-{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
6	-{"link": [], "title": []},
7	-{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]},
8	-{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]},
9	-{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]},
10	-{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]},
11	-{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]},
12	-{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]},
13	-{"link": ["/"], "title": ["Top"]},
14	-{"link": ["/Computers/"], "title": ["Computers"]},
15	-{"link": ["/Computers/Programming/"], "title": ["Programming"]},
16	-{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
17	-{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
18	-{"link": [], "title": []},
19	-{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]},
20	-{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]},
21	-{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]},
22	-{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]},
23	-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]},
24	-{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]},
25	-{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]},
26	-{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]},
27	-{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]},
28	-{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]},
29	-{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]},
30	-{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]},
31	-{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]},
32	-{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]},
33	-{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]},
34	-{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]},
35	-{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]},
36	-{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]},
37	-{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]},
38	-{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]},
39	-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]},
40	-{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]},
41	-{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]},
42	-{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]},
43	-{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]},
44	-{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}]
45	\ No newline at end of file	1	\ No newline at end of file
		2	+[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}]
46	\ No newline at end of file	3	\ No newline at end of file
	@@ -5,7 +5,75 @@		@@ -5,7 +5,75 @@
5	# Don't forget to add your pipeline to the ITEM_PIPELINES setting	5	# Don't forget to add your pipeline to the ITEM_PIPELINES setting
6	# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html	6	# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7		7
		8	+from .items import DoubanItem
		9	+
		10	+from hashlib import md5
		11	+import happybase
		12	+
		13	+
		14	+class HbaseDumper(object):
		15	+ def __init__(self, Item, tablename=None):
		16	+ self.Item = Item # class not object
		17	+ self.table_name = tablename if tablename != None else self.Item.__name__
		18	+ self.table = None
		19	+ self.connection = None
		20	+ self.sparkcontex = None
		21	+
		22	+ def get_table(self):
		23	+ if self.table != None:
		24	+ return self.table
		25	+
		26	+ if self.connection is None:
		27	+ c = happybase.Connection('HPC-server')
		28	+ self.connection = c
		29	+
		30	+ tables = self.connection.tables()
		31	+ if self.table_name not in tables:
		32	+ families = self.Item.fields
		33	+ self.connection.create_table(name=self.table_name, families=families)
		34	+
		35	+ table = self.connection.table(name=self.table_name)
		36	+
		37	+ self.table = table
		38	+
		39	+ return table
		40	+
		41	+ def store_item(self, item):
		42	+ if self.table == None:
		43	+ self.table = self.get_table()
		44	+ data = {}
		45	+ for key in item.keys():
		46	+ data[key + ':'] = item[key]
		47	+
		48	+ self.table.put(item['ind'], data)
		49	+
		50	+ def store_items(self, items):
		51	+ if self.table == None:
		52	+ self.table = self.get_table()
		53	+
		54	+ dict_databuf = {}
		55	+ for item in items:
		56	+ data = {}
		57	+ for key in item.keys():
		58	+ data[key + ':'] = item[key]
		59	+ dict_databuf[item['ind']] = data
		60	+
		61	+ try:
		62	+ with self.table.batch(batch_size=5000) as b:
		63	+ for rowkey, data in dict_databuf.items():
		64	+ b.put(rowkey, data)
		65	+ except ValueError:
		66	+ raise
		67	+ pass
		68	+
8		69
9	class MspiderPipeline(object):	70	class MspiderPipeline(object):
		71	+ def __init__(self):
		72	+ self.hbasedumper = HbaseDumper(DoubanItem)
		73	+
10	def process_item(self, item, spider):	74	def process_item(self, item, spider):
		75	+ try:
		76	+ self.hbasedumper.store_item(item)
		77	+ except:
		78	+ raise
11	return item	79	return item
	@@ -5,7 +5,7 @@		@@ -5,7 +5,7 @@
5	# For simplicity, this file contains only the most important settings by	5	# For simplicity, this file contains only the most important settings by
6	# default. All the other settings are documented here:	6	# default. All the other settings are documented here:
7	#	7	#
8	-# http://doc.scrapy.org/en/latest/topics/settings.html	8	+# http://doc.scrapy.org/en/latest/topics/settings.html
9	#	9	#
10		10
11	BOT_NAME = 'mspider'	11	BOT_NAME = 'mspider'
	@@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders'		@@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders'
15		15
16	# Crawl responsibly by identifying yourself (and your website) on the user-agent	16	# Crawl responsibly by identifying yourself (and your website) on the user-agent
17	#USER_AGENT = 'mspider (+http://www.yourdomain.com)'	17	#USER_AGENT = 'mspider (+http://www.yourdomain.com)'
		18	+
		19	+
		20	+
		21	+##Custom for Chunk
		22	+
		23	+# anti-banning
		24	+COOKIES_ENABLED = True
		25	+# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
		26	+DOWNLOADER_MIDDLEWARES = {
		27	+ 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
		28	+ 'random_useragent.RandomUserAgentMiddleware': 400
		29	+}
		30	+# http://www.useragentstring.com/pages/useragentstring.php
		31	+USER_AGENT_LIST = 'useragents.txt'
		32	+# DOWNLOAD_DELAY = 0.1
		33	+
		34	+# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
		35	+
		1	+# -*- coding: utf-8 -*-
1	__author__ = 'chunk'	2	__author__ = 'chunk'
2		3
		4	+from ..items import DoubanItem
		5	+
3	import scrapy	6	import scrapy
4	-from ..items import DmozItem	7	+from scrapy import FormRequest
		8	+from scrapy.http import Request
		9	+from scrapy.utils.response import get_base_url
		10	+from urlparse import urljoin
		11	+import re
		12	+from hashlib import md5
		13	+
5		14
6	-class DmozSpider(scrapy.Spider):
7	- name = "dmoz"
8	- allowed_domains = ["dmoz.org"]	15	+class DoubanSpider(scrapy.Spider):
		16	+ name = "douban"
		17	+ allowed_domains = ["douban.com"]
9	start_urls = [	18	start_urls = [
10	- "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
11	- "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"	19	+ "http://movie.douban.com/tag/",
12	]	20	]
13		21
		22	+ def start_requests(self):
		23	+ return [FormRequest("http://movie.douban.com/tag/",
		24	+ cookies={'bid': "SCAM2676P0o"},
		25	+ callback=self.parse)]
		26	+
14	def parse(self, response):	27	def parse(self, response):
15	"""	28	"""
16	This is the default callback used by Scrapy to process downloaded responses	29	This is the default callback used by Scrapy to process downloaded responses
	@@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider):		@@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider):
19		32
20	Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html	33	Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
21	"""	34	"""
22	- for sel in response.xpath('//ul/li'):
23	- item = DmozItem()
24	- item['title'] = sel.xpath('a/text()').extract()
25	- item['link'] = sel.xpath('a/@href').extract()	35	+ base_url = get_base_url(response)
		36	+ for url in response.xpath('//@href').extract():
		37	+ url = urljoin(base_url, url.strip())
		38	+ if re.match('http://movie.douban.com/tag/\w+', url, re.U):
		39	+ return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
		40	+
		41	+
		42	+ def parse_list(self, response):
		43	+ base_url = get_base_url(response)
		44	+ for url in response.xpath('//@href').extract():
		45	+ url = urljoin(base_url, url.strip())
		46	+ if re.match('http://movie.douban.com/subject/\w+', url, re.U):
		47	+ return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
		48	+
		49	+ def parse_item(self, response):
		50	+ item = DoubanItem()
		51	+ item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
		52	+ item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
		53	+ item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
		54	+
		55	+ return item
26		56
27	- yield item	57	+ def parse_details(self, response):
		58	+ pass
	@@ -0,0 +1,30 @@		@@ -0,0 +1,30 @@
		1	+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36
		2	+Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36
		3	+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
		4	+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
		5	+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
		6	+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)
		7	+Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
		8	+Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)
		9	+Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
		10	+Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko
		11	+Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
		12	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
		13	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
		14	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
		15	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0
		16	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)
		17	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)
		18	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)
		19	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)
		20	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)
		21	+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205
		22	+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0
		23	+Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0
		24	+Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		25	+Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		26	+Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		27	+Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		28	+Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		29	+Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
		30	+Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
0	\ No newline at end of file	31	\ No newline at end of file