Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e
1 parent: a9d3ceff
Exists in master and in 2 other branches

    Baidu staged.

Showing 13 changed files with 68 additions and 16 deletions
.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
run_spider.sh
@@ -11,7 +11,7 @@
 ##############################################
 export TERM=linux
 source /home/hadoop/.zshrc
-v env0
+v env1
 
 ##############################################
 ## additional files list
@@ -21,8 +21,8 @@ FILE=hehe.json
 #scrapy runspider spider/test.py
 cd ./spider/mspider/
 [ -f $FILE ] && rm $FILE
-scrapy crawl douban -o $FILE
-
+#scrapy crawl douban -o $FILE
+scrapy crawl baidu -o $FILE
 
 
 
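Note: the script activates a virtualenv (`v env1`; the `v` helper is presumably defined in the sourced .zshrc, which is not shown here) and then reruns the crawl into hehe.json. For reference, the same crawl can be started from Python instead of the shell wrapper; a minimal sketch using Scrapy's cmdline helper, run from spider/mspider/ where the project config lives:

    # sketch: Python equivalent of `scrapy crawl baidu -o hehe.json`
    # (run from spider/mspider/, the directory run_spider.sh cds into)
    from scrapy import cmdline

    cmdline.execute('scrapy crawl baidu -o hehe.json'.split())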
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
@@ -73,7 +73,8 @@ class MspiderPipeline(object):
 
     def process_item(self, item, spider):
         try:
-            self.hbasedumper.store_item(item)
+            self.hbasedumper.store_item(item)  # one by one
+            # self.hbasedumper.store_items(item)  # bulk put
         except:
             raise
         return item
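Note: the pipeline keeps the per-item put and leaves the bulk variant commented out. Neither HBaseDumper method appears in this diff, so the sketch below is only an assumption about how the two calls might differ if the dumper wraps happybase and keys rows by the item's md5 `ind` field; host, table, and column-family names are placeholders, not from the commit.

    # hypothetical HBaseDumper sketch, assuming happybase; names are placeholders
    import happybase

    class HBaseDumper(object):
        def __init__(self, host='localhost', table='douban'):
            self.conn = happybase.Connection(host)
            self.table = self.conn.table(table)
            # batch() with batch_size buffers puts and flushes every 100 rows
            self.batch = self.table.batch(batch_size=100)

        def store_item(self, item):
            # "one by one": one Thrift round-trip per item
            self.table.put(item['ind'], {'cf:name': item['name'],
                                         'cf:director': item['director'],
                                         'cf:rate': item['rate']})

        def store_items(self, item):
            # "bulk put": buffered, so many items share one round-trip
            self.batch.put(item['ind'], {'cf:name': item['name'],
                                         'cf:director': item['director'],
                                         'cf:rate': item['rate']})

One plausible reason to keep the bulk call disabled for now: rows put through a batch only reach HBase when the batch flushes, so a crash mid-crawl could lose the buffered tail.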
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
@@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = {
 }
 # http://www.useragentstring.com/pages/useragentstring.php
 USER_AGENT_LIST = 'useragents.txt'
-# DOWNLOAD_DELAY = 0.1
+DOWNLOAD_DELAY = 1
 
-# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
+ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
 
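Note: raising DOWNLOAD_DELAY from the commented-out 0.1 to a fixed 1 second throttles the crawl to roughly one request per second per domain, and uncommenting ITEM_PIPELINES routes every scraped item through MspiderPipeline (priority 300), which is what makes the HBase dump above actually run. For reference, Scrapy 0.24 can also randomize or auto-tune the delay; a sketch of an alternative settings block, not part of this commit:

    # alternative throttling for Scrapy 0.24 -- not in this commit
    DOWNLOAD_DELAY = 1
    RANDOMIZE_DOWNLOAD_DELAY = True      # wait 0.5*DELAY .. 1.5*DELAY
    AUTOTHROTTLE_ENABLED = True          # adapt the delay to server latency
    AUTOTHROTTLE_START_DELAY = 1
    ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300}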
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
@@ -3,16 +3,16 @@ __author__ = 'chunk'
 
 from ..items import DoubanItem
 
-import scrapy
-from scrapy import FormRequest
+from scrapy import Spider,FormRequest
 from scrapy.http import Request
 from scrapy.utils.response import get_base_url
 from urlparse import urljoin
 import re
 from hashlib import md5
+import json
 
 
-class DoubanSpider(scrapy.Spider):
+class DoubanSpider(Spider):
     name = "douban"
     allowed_domains = ["douban.com"]
     start_urls = [
@@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/tag/\w+', url, re.U):
-                return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
+                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
 
 
     def parse_list(self, response):
@@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/subject/\w+', url, re.U):
-                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
 
     def parse_item(self, response):
         item = DoubanItem()
-        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
-        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
-        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
 
         return item
 
     def parse_details(self, response):
         pass
+
+
+
+class BaiduSpider(Spider):
+    name = "baidu"
+    allowed_domains = ["image.baidu.com"]
+    start_urls = [
+        "http://image.baidu.com/",
+    ]
+
+    def start_requests(self):
+        return [FormRequest("http://image.baidu.com/",
+                            cookies={'userid': "jhvigtgiq"},
+                            callback=self.parse)]
+
+    def parse(self, response):
+        """
+        This is the default callback used by Scrapy to process downloaded responses.
+        The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow.
+        This method, as well as any other Request callback, must return an iterable of Request and/or Item objects.
+
+        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
+        """
+        base_url = get_base_url(response)
+        r = re.compile('http://image.baidu.com/channel\S+')
+        for url in filter(r.match, response.xpath('//@href').extract()):
+            return Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list)
+
+
+    def parse_list(self, response):
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match('http://movie.douban.com/subject/\w+', url, re.U):
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = DoubanItem()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
+
+        return item
+
+    def parse_details(self, response):
+        pass
+
+
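Note: the one functional fix to DoubanSpider is parse_list switching from return to yield. A return inside the loop schedules only the first matching subject link, while yield turns the callback into a generator that schedules one Request per match. A stripped-down illustration (the is_match predicate is hypothetical, not from the commit):

    # hypothetical minimal callbacks illustrating the return -> yield fix
    def parse_list_first_only(self, response):
        for url in response.xpath('//@href').extract():
            if is_match(url):                                      # hypothetical
                return Request(url, callback=self.parse_item)      # first link only

    def parse_list_all(self, response):
        for url in response.xpath('//@href').extract():
            if is_match(url):                                      # hypothetical
                yield Request(url, callback=self.parse_item)       # every link

By the same logic, BaiduSpider.parse still returns inside its loop, so only the first channel link is followed, and its parse_list/parse_item bodies still carry the douban regex, cookie, and DoubanItem copied from DoubanSpider, consistent with the "Baidu staged." commit message.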
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type