Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e
1 parent a9d3ceff
Exists in master and in 2 other branches
Baidu staged.
Showing 13 changed files with 68 additions and 16 deletions
.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
run_spider.sh
@@ -11,7 +11,7 @@
 ##############################################
 export TERM=linux
 source /home/hadoop/.zshrc
-v env0
+v env1
 
 ##############################################
 ## additional files list
@@ -21,8 +21,8 @@ FILE=hehe.json
 #scrapy runspider spider/test.py
 cd ./spider/mspider/
 [ -f $FILE ] && rm $FILE
-scrapy crawl douban -o $FILE
-
+#scrapy crawl douban -o $FILE
+scrapy crawl baidu -o $FILE
 
 
 
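The `[ -f $FILE ] && rm $FILE` guard above matters because the -o feed option in Scrapy 0.24 appends to an existing output file rather than overwriting it, so a leftover hehe.json from a previous run would corrupt the new JSON feed. A rough Python equivalent of the crawl step, assuming the same layout (the subprocess call is illustrative, not part of this commit):

import os
import subprocess

FILE = 'hehe.json'
os.chdir('./spider/mspider/')
if os.path.isfile(FILE):
    os.remove(FILE)  # scrapy's -o appends, so stale output must be removed first
subprocess.check_call(['scrapy', 'crawl', 'baidu', '-o', FILE])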
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
@@ -73,7 +73,8 @@ class MspiderPipeline(object):
 
     def process_item(self, item, spider):
         try:
-            self.hbasedumper.store_item(item)
+            self.hbasedumper.store_item(item)  # one by one
+            # self.hbasedumper.store_items(item)  # bulk put
         except:
             raise
         return item
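The new comment sketches a bulk-put alternative to the per-item write. The HBaseDumper implementation is not part of this diff, so the following is only a hypothetical sketch of what store_item / store_items could look like, assuming happybase and a row key taken from the item's 'ind' field (the md5 digest computed in parse_item below); the host, table, and column-family names here are made up:

import happybase

class HBaseDumper(object):
    """Hypothetical dumper; the real one is not shown in this commit."""

    def __init__(self, host='localhost', table_name='douban'):
        self.conn = happybase.Connection(host)
        self.table = self.conn.table(table_name)

    def store_item(self, item):
        # one by one: one Thrift round trip per item
        self.table.put(item['ind'], {'cf:name': item['name'],
                                     'cf:director': item['director'],
                                     'cf:rate': item['rate']})

    def store_items(self, items):
        # bulk put: mutations are buffered and sent as a single batch
        with self.table.batch() as b:
            for it in items:
                b.put(it['ind'], {'cf:name': it['name'],
                                  'cf:director': it['director'],
                                  'cf:rate': it['rate']})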
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
@@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = {
 }
 # http://www.useragentstring.com/pages/useragentstring.php
 USER_AGENT_LIST = 'useragents.txt'
-# DOWNLOAD_DELAY = 0.1
+DOWNLOAD_DELAY = 1
 
-# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
+ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
 
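Both settings move from commented-out defaults to active values. For context: DOWNLOAD_DELAY is the wait in seconds between requests to the same site, and since Scrapy's RANDOMIZE_DOWNLOAD_DELAY defaults to True the actual pause varies between 0.5x and 1.5x of it; the 300 in ITEM_PIPELINES is an ordering value (0-1000, lower runs first). An annotated sketch of the resulting configuration (RANDOMIZE_DOWNLOAD_DELAY is shown only to spell out the default; it is not part of this diff):

DOWNLOAD_DELAY = 1               # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # Scrapy default: actual wait is 0.5x-1.5x DOWNLOAD_DELAY
ITEM_PIPELINES = {
    'mspider.pipelines.MspiderPipeline': 300,  # order 0-1000; lower runs first
}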
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
@@ -3,16 +3,16 @@ __author__ = 'chunk'
 
 from ..items import DoubanItem
 
-import scrapy
-from scrapy import FormRequest
+from scrapy import Spider, FormRequest
 from scrapy.http import Request
 from scrapy.utils.response import get_base_url
 from urlparse import urljoin
 import re
 from hashlib import md5
+import json
 
 
-class DoubanSpider(scrapy.Spider):
+class DoubanSpider(Spider):
     name = "douban"
     allowed_domains = ["douban.com"]
     start_urls = [
@@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/tag/\w+', url, re.U):
-                return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
+                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
 
 
     def parse_list(self, response):
@@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/subject/\w+', url, re.U):
-                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
 
     def parse_item(self, response):
         item = DoubanItem()
-        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
-        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
-        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
 
         return item
 
     def parse_details(self, response):
         pass
+
+
+
+class BaiduSpider(Spider):
+    name = "baidu"
+    allowed_domains = ["image.baidu.com"]
+    start_urls = [
+        "http://image.baidu.com/",
+    ]
+
+    def start_requests(self):
+        return [FormRequest("http://image.baidu.com/",
+                            cookies={'userid': "jhvigtgiq"},
+                            callback=self.parse)]
+
+    def parse(self, response):
+        """
+        This is the default callback used by Scrapy to process downloaded responses.
+        The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow.
+        This method, as well as any other Request callback, must return an iterable of Request and/or Item objects.
+
+        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
+        """
+        base_url = get_base_url(response)
+        r = re.compile('http://image.baidu.com/channel\S+')
+        for url in filter(r.match, response.xpath('//@href').extract()):
+            return Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list)
+
+
+    def parse_list(self, response):
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match('http://movie.douban.com/subject/\w+', url, re.U):
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = DoubanItem()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
+
+        return item
+
+    def parse_details(self, response):
+        pass
+
+
+
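The key semantic change in parse_list is return -> yield: return hands back a single Request and exits the callback at the first matching href, whereas yield turns the callback into a generator that emits a Request for every match, which is what lets the crawl fan out over all subject pages. A minimal, self-contained illustration of the difference (function names here are made up for the example):

def follow_first(urls):
    for url in urls:
        return url  # exits the function at the first element

def follow_all(urls):
    for url in urls:
        yield url   # generator: produces every element lazily

assert follow_first(['a', 'b', 'c']) == 'a'
assert list(follow_all(['a', 'b', 'c'])) == ['a', 'b', 'c']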
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
No preview for this file type