Commit a9d3ceff2adf356037aa157b5edc7a78b43ccc37
1 parent: 4737e166
Exists in master and in 2 other branches
Douban as an example.
Showing 11 changed files with 177 additions and 66 deletions
Show diff stats
run_spider.sh
1 | #!/bin/zsh | 1 | #!/bin/zsh |
2 | # chunk @ 2014 | 2 | # chunk @ 2014 |
3 | +######################################################################################## | ||
4 | +## | ||
5 | +## F**k World! | ||
6 | +## | ||
7 | +######################################################################################## | ||
3 | 8 | ||
4 | -#################################################################### | 9 | +############################################## |
5 | ## environment variables | 10 | ## environment variables |
6 | -#################################################################### | ||
7 | -export export TERM=xterm | 11 | +############################################## |
12 | +export export TERM=linux | ||
8 | source /home/hadoop/.zshrc | 13 | source /home/hadoop/.zshrc |
9 | v env0 | 14 | v env0 |
10 | 15 | ||
11 | -#################################################################### | 16 | +############################################## |
12 | ## additional files list | 17 | ## additional files list |
13 | -#################################################################### | 18 | +############################################## |
14 | FILE=hehe.json | 19 | FILE=hehe.json |
15 | 20 | ||
16 | #scrapy runspider spider/test.py | 21 | #scrapy runspider spider/test.py |
17 | cd ./spider/mspider/ | 22 | cd ./spider/mspider/ |
18 | [ -f $FILE ] && rm $FILE | 23 | [ -f $FILE ] && rm $FILE |
19 | -scrapy crawl dmoz -o $FILE | 24 | +scrapy crawl douban -o $FILE |
20 | 25 | ||
21 | 26 | ||
22 | 27 |
spider/mspider/hehe.json
1 | -[{"link": ["/"], "title": ["Top"]}, | ||
2 | -{"link": ["/Computers/"], "title": ["Computers"]}, | ||
3 | -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | ||
4 | -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | ||
5 | -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | ||
6 | -{"link": [], "title": []}, | ||
7 | -{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, | ||
8 | -{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, | ||
9 | -{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, | ||
10 | -{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, | ||
11 | -{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, | ||
12 | -{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, | ||
13 | -{"link": ["/"], "title": ["Top"]}, | ||
14 | -{"link": ["/Computers/"], "title": ["Computers"]}, | ||
15 | -{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | ||
16 | -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | ||
17 | -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | ||
18 | -{"link": [], "title": []}, | ||
19 | -{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, | ||
20 | -{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, | ||
21 | -{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, | ||
22 | -{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, | ||
23 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, | ||
24 | -{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, | ||
25 | -{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, | ||
26 | -{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, | ||
27 | -{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, | ||
28 | -{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, | ||
29 | -{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, | ||
30 | -{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, | ||
31 | -{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, | ||
32 | -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, | ||
33 | -{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, | ||
34 | -{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, | ||
35 | -{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, | ||
36 | -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, | ||
37 | -{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, | ||
38 | -{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, | ||
39 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, | ||
40 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, | ||
41 | -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, | ||
42 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, | ||
43 | -{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, | ||
44 | -{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] | ||
45 | \ No newline at end of file | 1 | \ No newline at end of file |
2 | +[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}] | ||
46 | \ No newline at end of file | 3 | \ No newline at end of file |
spider/mspider/mspider/items.py
@@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item): | @@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item): | ||
13 | # name = scrapy.Field() | 13 | # name = scrapy.Field() |
14 | pass | 14 | pass |
15 | 15 | ||
16 | -class DmozItem(scrapy.Item): | ||
17 | - title = scrapy.Field() | ||
18 | - link = scrapy.Field() | ||
19 | - desc = scrapy.Field() | 16 | +class DoubanItem(scrapy.Item): |
17 | + ind = scrapy.Field() | ||
18 | + name = scrapy.Field() | ||
19 | + director = scrapy.Field() | ||
20 | + rate = scrapy.Field() | ||
21 | + |
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
@@ -5,7 +5,75 @@ | @@ -5,7 +5,75 @@ | ||
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting | 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting |
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html | 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html |
7 | 7 | ||
8 | +from .items import DoubanItem | ||
9 | + | ||
10 | +from hashlib import md5 | ||
11 | +import happybase | ||
12 | + | ||
13 | + | ||
14 | +class HbaseDumper(object): | ||
15 | + def __init__(self, Item, tablename=None): | ||
16 | + self.Item = Item # class not object | ||
17 | + self.table_name = tablename if tablename != None else self.Item.__name__ | ||
18 | + self.table = None | ||
19 | + self.connection = None | ||
20 | + self.sparkcontex = None | ||
21 | + | ||
22 | + def get_table(self): | ||
23 | + if self.table != None: | ||
24 | + return self.table | ||
25 | + | ||
26 | + if self.connection is None: | ||
27 | + c = happybase.Connection('HPC-server') | ||
28 | + self.connection = c | ||
29 | + | ||
30 | + tables = self.connection.tables() | ||
31 | + if self.table_name not in tables: | ||
32 | + families = self.Item.fields | ||
33 | + self.connection.create_table(name=self.table_name, families=families) | ||
34 | + | ||
35 | + table = self.connection.table(name=self.table_name) | ||
36 | + | ||
37 | + self.table = table | ||
38 | + | ||
39 | + return table | ||
40 | + | ||
41 | + def store_item(self, item): | ||
42 | + if self.table == None: | ||
43 | + self.table = self.get_table() | ||
44 | + data = {} | ||
45 | + for key in item.keys(): | ||
46 | + data[key + ':'] = item[key] | ||
47 | + | ||
48 | + self.table.put(item['ind'], data) | ||
49 | + | ||
50 | + def store_items(self, items): | ||
51 | + if self.table == None: | ||
52 | + self.table = self.get_table() | ||
53 | + | ||
54 | + dict_databuf = {} | ||
55 | + for item in items: | ||
56 | + data = {} | ||
57 | + for key in item.keys(): | ||
58 | + data[key + ':'] = item[key] | ||
59 | + dict_databuf[item['ind']] = data | ||
60 | + | ||
61 | + try: | ||
62 | + with self.table.batch(batch_size=5000) as b: | ||
63 | + for rowkey, data in dict_databuf.items(): | ||
64 | + b.put(rowkey, data) | ||
65 | + except ValueError: | ||
66 | + raise | ||
67 | + pass | ||
68 | + | ||
8 | 69 | ||
9 | class MspiderPipeline(object): | 70 | class MspiderPipeline(object): |
71 | + def __init__(self): | ||
72 | + self.hbasedumper = HbaseDumper(DoubanItem) | ||
73 | + | ||
10 | def process_item(self, item, spider): | 74 | def process_item(self, item, spider): |
75 | + try: | ||
76 | + self.hbasedumper.store_item(item) | ||
77 | + except: | ||
78 | + raise | ||
11 | return item | 79 | return item |
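spider/mspider/mspider/pipelines.pyc (file name missing from the page; inferred from the sibling .pyc entries and the stated count of 11 changed files)
No preview for this file type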
spider/mspider/mspider/settings.py
@@ -5,7 +5,7 @@ | @@ -5,7 +5,7 @@ | ||
5 | # For simplicity, this file contains only the most important settings by | 5 | # For simplicity, this file contains only the most important settings by |
6 | # default. All the other settings are documented here: | 6 | # default. All the other settings are documented here: |
7 | # | 7 | # |
8 | -# http://doc.scrapy.org/en/latest/topics/settings.html | 8 | +# http://doc.scrapy.org/en/latest/topics/settings.html |
9 | # | 9 | # |
10 | 10 | ||
11 | BOT_NAME = 'mspider' | 11 | BOT_NAME = 'mspider' |
@@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders' | @@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders' | ||
15 | 15 | ||
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
17 | #USER_AGENT = 'mspider (+http://www.yourdomain.com)' | 17 | #USER_AGENT = 'mspider (+http://www.yourdomain.com)' |
18 | + | ||
19 | + | ||
20 | + | ||
21 | +##Custom for Chunk | ||
22 | + | ||
23 | +# anti-banning | ||
24 | +COOKIES_ENABLED = True | ||
25 | +# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36' | ||
26 | +DOWNLOADER_MIDDLEWARES = { | ||
27 | + 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None, | ||
28 | + 'random_useragent.RandomUserAgentMiddleware': 400 | ||
29 | +} | ||
30 | +# http://www.useragentstring.com/pages/useragentstring.php | ||
31 | +USER_AGENT_LIST = 'useragents.txt' | ||
32 | +# DOWNLOAD_DELAY = 0.1 | ||
33 | + | ||
34 | +# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | ||
35 | + |
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
1 | +# -*- coding: utf-8 -*- | ||
1 | __author__ = 'chunk' | 2 | __author__ = 'chunk' |
2 | 3 | ||
4 | +from ..items import DoubanItem | ||
5 | + | ||
3 | import scrapy | 6 | import scrapy |
4 | -from ..items import DmozItem | 7 | +from scrapy import FormRequest |
8 | +from scrapy.http import Request | ||
9 | +from scrapy.utils.response import get_base_url | ||
10 | +from urlparse import urljoin | ||
11 | +import re | ||
12 | +from hashlib import md5 | ||
13 | + | ||
5 | 14 | ||
6 | -class DmozSpider(scrapy.Spider): | ||
7 | - name = "dmoz" | ||
8 | - allowed_domains = ["dmoz.org"] | 15 | +class DoubanSpider(scrapy.Spider): |
16 | + name = "douban" | ||
17 | + allowed_domains = ["douban.com"] | ||
9 | start_urls = [ | 18 | start_urls = [ |
10 | - "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", | ||
11 | - "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" | 19 | + "http://movie.douban.com/tag/", |
12 | ] | 20 | ] |
13 | 21 | ||
22 | + def start_requests(self): | ||
23 | + return [FormRequest("http://movie.douban.com/tag/", | ||
24 | + cookies={'bid': "SCAM2676P0o"}, | ||
25 | + callback=self.parse)] | ||
26 | + | ||
14 | def parse(self, response): | 27 | def parse(self, response): |
15 | """ | 28 | """ |
16 | This is the default callback used by Scrapy to process downloaded responses | 29 | This is the default callback used by Scrapy to process downloaded responses |
@@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider): | @@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider): | ||
19 | 32 | ||
20 | Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html | 33 | Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html |
21 | """ | 34 | """ |
22 | - for sel in response.xpath('//ul/li'): | ||
23 | - item = DmozItem() | ||
24 | - item['title'] = sel.xpath('a/text()').extract() | ||
25 | - item['link'] = sel.xpath('a/@href').extract() | 35 | + base_url = get_base_url(response) |
36 | + for url in response.xpath('//@href').extract(): | ||
37 | + url = urljoin(base_url, url.strip()) | ||
38 | + if re.match('http://movie.douban.com/tag/\w+', url, re.U): | ||
39 | + return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | ||
40 | + | ||
41 | + | ||
42 | + def parse_list(self, response): | ||
43 | + base_url = get_base_url(response) | ||
44 | + for url in response.xpath('//@href').extract(): | ||
45 | + url = urljoin(base_url, url.strip()) | ||
46 | + if re.match('http://movie.douban.com/subject/\w+', url, re.U): | ||
47 | + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | ||
48 | + | ||
49 | + def parse_item(self, response): | ||
50 | + item = DoubanItem() | ||
51 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() | ||
52 | + item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract() | ||
53 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract() | ||
54 | + | ||
55 | + return item | ||
26 | 56 | ||
27 | - yield item | 57 | + def parse_details(self, response): |
58 | + pass |
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
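useragents.txt (file name missing from the page; inferred from USER_AGENT_LIST in settings.py, directory not shown)
@@ -0,0 +1,30 @@ | @@ -0,0 +1,30 @@ | ||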
1 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36 | ||
2 | +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36 | ||
3 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0) | ||
4 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) | ||
5 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) | ||
6 | +Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64) | ||
7 | +Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0) | ||
8 | +Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0) | ||
9 | +Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0) | ||
10 | +Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko | ||
11 | +Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko | ||
12 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) | ||
13 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) | ||
14 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E) | ||
15 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0 | ||
16 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8) | ||
17 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0) | ||
18 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts) | ||
19 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215) | ||
20 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57) | ||
21 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205 | ||
22 | +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0 | ||
23 | +Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0 | ||
24 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
25 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
26 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
27 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
28 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
29 | +Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
30 | +Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 | ||
0 | \ No newline at end of file | 31 | \ No newline at end of file |