Commit a9d3ceff2adf356037aa157b5edc7a78b43ccc37
1 parent 4737e166
Exists in master and in 2 other branches
Douban as an example.
Showing 11 changed files with 177 additions and 66 deletions
run_spider.sh
1 | 1 | #!/bin/zsh |
2 | 2 | # chunk @ 2014 |
3 | +######################################################################################## | |
4 | +## | |
5 | +## F**k World! | |
6 | +## | |
7 | +######################################################################################## | |
3 | 8 | |
4 | -#################################################################### | |
9 | +############################################## | |
5 | 10 | ## environment variables |
6 | -#################################################################### | |
7 | -export export TERM=xterm | |
11 | +############################################## | |
12 | +export TERM=linux | |
8 | 13 | source /home/hadoop/.zshrc |
9 | 14 | v env0 |
10 | 15 | |
11 | -#################################################################### | |
16 | +############################################## | |
12 | 17 | ## additional files list |
13 | -#################################################################### | |
18 | +############################################## | |
14 | 19 | FILE=hehe.json |
15 | 20 | |
16 | 21 | #scrapy runspider spider/test.py |
17 | 22 | cd ./spider/mspider/ |
18 | 23 | [ -f $FILE ] && rm $FILE |
19 | -scrapy crawl dmoz -o $FILE | |
24 | +scrapy crawl douban -o $FILE | |
20 | 25 | |
21 | 26 | |
22 | 27 | ... | ... |
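For a quick sanity check of the crawl output, a short script along these lines could load the exported feed (a minimal sketch, assuming it is run from the repository root and that hehe.json holds the JSON list that "scrapy crawl douban -o hehe.json" writes):

    # sketch: inspect the exported feed (path assumed from run_spider.sh)
    import json

    with open('spider/mspider/hehe.json') as f:
        items = json.load(f)  # "-o file.json" exports a JSON list of items

    for item in items:
        # field values are lists because the spider fills them via .extract()
        print(item)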
spider/mspider/hehe.json
1 | -[{"link": ["/"], "title": ["Top"]}, | |
2 | -{"link": ["/Computers/"], "title": ["Computers"]}, | |
3 | -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | |
4 | -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | |
5 | -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | |
6 | -{"link": [], "title": []}, | |
7 | -{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, | |
8 | -{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, | |
9 | -{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, | |
10 | -{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, | |
11 | -{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, | |
12 | -{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, | |
13 | -{"link": ["/"], "title": ["Top"]}, | |
14 | -{"link": ["/Computers/"], "title": ["Computers"]}, | |
15 | -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | |
16 | -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | |
17 | -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | |
18 | -{"link": [], "title": []}, | |
19 | -{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, | |
20 | -{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, | |
21 | -{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, | |
22 | -{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, | |
23 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, | |
24 | -{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, | |
25 | -{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, | |
26 | -{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, | |
27 | -{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, | |
28 | -{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, | |
29 | -{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, | |
30 | -{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, | |
31 | -{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, | |
32 | -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, | |
33 | -{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, | |
34 | -{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, | |
35 | -{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, | |
36 | -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, | |
37 | -{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, | |
38 | -{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, | |
39 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, | |
40 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, | |
41 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, | |
42 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, | |
43 | -{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, | |
44 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] | |
45 | 1 | \ No newline at end of file |
2 | +[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}] | |
46 | 3 | \ No newline at end of file | ... | ... |
spider/mspider/mspider/items.py
... | ... | @@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item): |
13 | 13 | # name = scrapy.Field() |
14 | 14 | pass |
15 | 15 | |
16 | -class DmozItem(scrapy.Item): | |
17 | - title = scrapy.Field() | |
18 | - link = scrapy.Field() | |
19 | - desc = scrapy.Field() | |
16 | +class DoubanItem(scrapy.Item): | |
17 | + ind = scrapy.Field() | |
18 | + name = scrapy.Field() | |
19 | + director = scrapy.Field() | |
20 | + rate = scrapy.Field() | |
21 | + | ... | ... |
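Since scrapy.Item subclasses behave like dicts, the new DoubanItem can be exercised directly from a shell; a minimal sketch (the values below are made up for illustration):

    # sketch: DoubanItem is dict-like; only declared fields may be set
    from mspider.items import DoubanItem

    item = DoubanItem()
    item['ind'] = 'row-key-123'      # hypothetical rowkey value
    item['name'] = [u'Cinderella']
    item['rate'] = [u'7.0']
    print(dict(item))                # {'ind': 'row-key-123', 'name': [...], 'rate': [...]}
    # setting an undeclared field, e.g. item['year'] = 2015, raises KeyError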
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
... | ... | @@ -5,7 +5,75 @@ |
5 | 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting |
6 | 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html |
7 | 7 | |
8 | +from .items import DoubanItem | |
9 | + | |
10 | +from hashlib import md5 | |
11 | +import happybase | |
12 | + | |
13 | + | |
14 | +class HbaseDumper(object): | |
15 | + def __init__(self, Item, tablename=None): | |
16 | + self.Item = Item # class not object | |
17 | + self.table_name = tablename if tablename != None else self.Item.__name__ | |
18 | + self.table = None | |
19 | + self.connection = None | |
20 | + self.sparkcontex = None | |
21 | + | |
22 | + def get_table(self): | |
23 | + if self.table != None: | |
24 | + return self.table | |
25 | + | |
26 | + if self.connection is None: | |
27 | + c = happybase.Connection('HPC-server') | |
28 | + self.connection = c | |
29 | + | |
30 | + tables = self.connection.tables() | |
31 | + if self.table_name not in tables: | |
32 | + families = self.Item.fields | |
33 | + self.connection.create_table(name=self.table_name, families=families) | |
34 | + | |
35 | + table = self.connection.table(name=self.table_name) | |
36 | + | |
37 | + self.table = table | |
38 | + | |
39 | + return table | |
40 | + | |
41 | + def store_item(self, item): | |
42 | + if self.table == None: | |
43 | + self.table = self.get_table() | |
44 | + data = {} | |
45 | + for key in item.keys(): | |
46 | + data[key + ':'] = item[key] | |
47 | + | |
48 | + self.table.put(item['ind'], data) | |
49 | + | |
50 | + def store_items(self, items): | |
51 | + if self.table == None: | |
52 | + self.table = self.get_table() | |
53 | + | |
54 | + dict_databuf = {} | |
55 | + for item in items: | |
56 | + data = {} | |
57 | + for key in item.keys(): | |
58 | + data[key + ':'] = item[key] | |
59 | + dict_databuf[item['ind']] = data | |
60 | + | |
61 | + try: | |
62 | + with self.table.batch(batch_size=5000) as b: | |
63 | + for rowkey, data in dict_databuf.items(): | |
64 | + b.put(rowkey, data) | |
65 | + except ValueError: | |
66 | + raise | |
67 | + pass | |
68 | + | |
8 | 69 | |
9 | 70 | class MspiderPipeline(object): |
71 | + def __init__(self): | |
72 | + self.hbasedumper = HbaseDumper(DoubanItem) | |
73 | + | |
10 | 74 | def process_item(self, item, spider): |
75 | + try: | |
76 | + self.hbasedumper.store_item(item) | |
77 | + except: | |
78 | + raise | |
11 | 79 | return item | ... | ... |
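To check that HbaseDumper actually wrote something, a read-back with happybase could look like this (a minimal sketch; 'HPC-server' is the Thrift host hard-coded in get_table(), and the table name falls back to the item class name, i.e. 'DoubanItem'):

    # sketch: read back rows written by HbaseDumper (assumes the HBase Thrift
    # server is reachable under the hostname 'HPC-server')
    import happybase

    connection = happybase.Connection('HPC-server')
    table = connection.table('DoubanItem')   # default table name = Item.__name__

    for rowkey, data in table.scan(limit=10):
        # store_item() writes each field under the column "<field>:"
        print((rowkey, data))
    connection.close()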
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
... | ... | @@ -5,7 +5,7 @@ |
5 | 5 | # For simplicity, this file contains only the most important settings by |
6 | 6 | # default. All the other settings are documented here: |
7 | 7 | # |
8 | -# http://doc.scrapy.org/en/latest/topics/settings.html | |
8 | +# http://doc.scrapy.org/en/latest/topics/settings.html | |
9 | 9 | # |
10 | 10 | |
11 | 11 | BOT_NAME = 'mspider' |
... | ... | @@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders' |
15 | 15 | |
16 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
17 | 17 | #USER_AGENT = 'mspider (+http://www.yourdomain.com)' |
18 | + | |
19 | + | |
20 | + | |
21 | +##Custom for Chunk | |
22 | + | |
23 | +# anti-banning | |
24 | +COOKIES_ENABLED = True | |
25 | +# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36' | |
26 | +DOWNLOADER_MIDDLEWARES = { | |
27 | + 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None, | |
28 | + 'random_useragent.RandomUserAgentMiddleware': 400 | |
29 | +} | |
30 | +# http://www.useragentstring.com/pages/useragentstring.php | |
31 | +USER_AGENT_LIST = 'useragents.txt' | |
32 | +# DOWNLOAD_DELAY = 0.1 | |
33 | + | |
34 | +# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | |
35 | + | ... | ... |
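ITEM_PIPELINES stays commented out here, so the HBase pipeline above is not yet enabled by these settings. The user-agent rotation relies on the external random_useragent package; if that package is unavailable, a minimal stand-in downloader middleware might look like this (a sketch only, assuming useragents.txt contains one user-agent string per line, as in the file added below):

    # sketch: minimal replacement for random_useragent.RandomUserAgentMiddleware
    import random

    class RandomUserAgentMiddleware(object):
        def __init__(self, ua_file):
            with open(ua_file) as f:
                self.user_agents = [line.strip() for line in f if line.strip()]

        @classmethod
        def from_crawler(cls, crawler):
            # USER_AGENT_LIST is the setting defined above
            return cls(crawler.settings.get('USER_AGENT_LIST', 'useragents.txt'))

        def process_request(self, request, spider):
            # pick a fresh user agent for every outgoing request
            request.headers.setdefault('User-Agent', random.choice(self.user_agents))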
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
1 | +# -*- coding: utf-8 -*- | |
1 | 2 | __author__ = 'chunk' |
2 | 3 | |
4 | +from ..items import DoubanItem | |
5 | + | |
3 | 6 | import scrapy |
4 | -from ..items import DmozItem | |
7 | +from scrapy import FormRequest | |
8 | +from scrapy.http import Request | |
9 | +from scrapy.utils.response import get_base_url | |
10 | +from urlparse import urljoin | |
11 | +import re | |
12 | +from hashlib import md5 | |
13 | + | |
5 | 14 | |
6 | -class DmozSpider(scrapy.Spider): | |
7 | - name = "dmoz" | |
8 | - allowed_domains = ["dmoz.org"] | |
15 | +class DoubanSpider(scrapy.Spider): | |
16 | + name = "douban" | |
17 | + allowed_domains = ["douban.com"] | |
9 | 18 | start_urls = [ |
10 | - "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", | |
11 | - "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" | |
19 | + "http://movie.douban.com/tag/", | |
12 | 20 | ] |
13 | 21 | |
22 | + def start_requests(self): | |
23 | + return [FormRequest("http://movie.douban.com/tag/", | |
24 | + cookies={'bid': "SCAM2676P0o"}, | |
25 | + callback=self.parse)] | |
26 | + | |
14 | 27 | def parse(self, response): |
15 | 28 | """ |
16 | 29 | This is the default callback used by Scrapy to process downloaded responses |
... | ... | @@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider): |
19 | 32 | |
20 | 33 | Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html |
21 | 34 | """ |
22 | - for sel in response.xpath('//ul/li'): | |
23 | - item = DmozItem() | |
24 | - item['title'] = sel.xpath('a/text()').extract() | |
25 | - item['link'] = sel.xpath('a/@href').extract() | |
35 | + base_url = get_base_url(response) | |
36 | + for url in response.xpath('//@href').extract(): | |
37 | + url = urljoin(base_url, url.strip()) | |
38 | + if re.match('http://movie.douban.com/tag/\w+', url, re.U): | |
39 | + return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | |
40 | + | |
41 | + | |
42 | + def parse_list(self, response): | |
43 | + base_url = get_base_url(response) | |
44 | + for url in response.xpath('//@href').extract(): | |
45 | + url = urljoin(base_url, url.strip()) | |
46 | + if re.match('http://movie.douban.com/subject/\w+', url, re.U): | |
47 | + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
48 | + | |
49 | + def parse_item(self, response): | |
50 | + item = DoubanItem() | |
51 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() | |
52 | + item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract() | |
53 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract() | |
54 | + | |
55 | + return item | |
26 | 56 | |
27 | - yield item | |
57 | + def parse_details(self, response): | |
58 | + pass | ... | ... |
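As written, parse() and parse_list() return the first matching Request from inside the loop, so only a single link per page is followed, and the 'ind' field that the HBase pipeline uses as rowkey is never filled. A sketch of what the link-following and rowkey logic might look like instead (the md5-of-URL rowkey is only an assumption, suggested by the otherwise unused md5 import; imports as in the file above):

    # sketch: yield every matching request and derive a rowkey for 'ind'
    def parse(self, response):
        base_url = get_base_url(response)
        for url in response.xpath('//@href').extract():
            url = urljoin(base_url, url.strip())
            if re.match('http://movie.douban.com/tag/\w+', url, re.U):
                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)

    def parse_item(self, response):
        item = DoubanItem()
        item['ind'] = md5(response.url.encode('utf-8')).hexdigest()  # assumed rowkey scheme
        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
        yield item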
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
spider/mspider/useragents.txt
... | ... | @@ -0,0 +1,30 @@ |
1 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36 | |
2 | +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36 | |
3 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0) | |
4 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) | |
5 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) | |
6 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64) | |
7 | +Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0) | |
8 | +Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0) | |
9 | +Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) | |
10 | +Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko | |
11 | +Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko | |
12 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) | |
13 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) | |
14 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E) | |
15 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0 | |
16 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8) | |
17 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0) | |
18 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts) | |
19 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215) | |
20 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57) | |
21 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205 | |
22 | +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0 | |
23 | +Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0 | |
24 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
25 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
26 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
27 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
28 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
29 | +Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
30 | +Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | |
0 | 31 | \ No newline at end of file | ... | ... |