Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e
1 parent: a9d3ceff
Exists in master and in 2 other branches
Baidu staged.
Showing 13 changed files with 68 additions and 16 deletions
.idea/ImageR.iml
| ... | ... | @@ -2,7 +2,7 @@ |
| 2 | 2 | <module type="PYTHON_MODULE" version="4"> |
| 3 | 3 | <component name="NewModuleRootManager"> |
| 4 | 4 | <content url="file://$MODULE_DIR$" /> |
| 5 | - <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" /> | |
| 5 | + <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" /> | |
| 6 | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
| 7 | 7 | </component> |
| 8 | 8 | </module> |
| 9 | 9 | \ No newline at end of file | ... | ... |
run_spider.sh
| ... | ... | @@ -11,7 +11,7 @@ |
| 11 | 11 | ############################################## |
| 12 | 12 | export export TERM=linux |
| 13 | 13 | source /home/hadoop/.zshrc |
| 14 | -v env0 | |
| 14 | +v env1 | |
| 15 | 15 | |
| 16 | 16 | ############################################## |
| 17 | 17 | ## additional files list |
| ... | ... | @@ -21,8 +21,8 @@ FILE=hehe.json |
| 21 | 21 | #scrapy runspider spider/test.py |
| 22 | 22 | cd ./spider/mspider/ |
| 23 | 23 | [ -f $FILE ] && rm $FILE |
| 24 | -scrapy crawl douban -o $FILE | |
| 25 | - | |
| 24 | +#scrapy crawl douban -o $FILE | |
| 25 | +scrapy crawl baidu -o $FILE | |
| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | ... | ... |
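The crawl target flips from douban to baidu by commenting out one line and adding another; the rest of the script (env activation, deleting the stale output file) is untouched. For reference, the same invocation can be issued from Python through Scrapy's cmdline helper; a minimal sketch, assuming it runs from the project directory (the wrapper filename is hypothetical):

    # run_crawl.py -- hypothetical wrapper mirroring the shell line above;
    # must be executed from the directory containing scrapy.cfg
    from scrapy import cmdline

    SPIDER = "baidu"        # "douban" before this commit
    OUTFILE = "hehe.json"   # the same $FILE the script deletes before each run

    cmdline.execute(("scrapy crawl %s -o %s" % (SPIDER, OUTFILE)).split())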
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
| ... | ... | @@ -73,7 +73,8 @@ class MspiderPipeline(object): |
| 73 | 73 | |
| 74 | 74 | def process_item(self, item, spider): |
| 75 | 75 | try: |
| 76 | - self.hbasedumper.store_item(item) | |
| 76 | + self.hbasedumper.store_item(item) # one by one | |
| 77 | + # self.hbasedumper.store_items(item) # bulk put | |
| 77 | 78 | except: |
| 78 | 79 | raise |
| 79 | 80 | return item | ... | ... |
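The added comment pair records two write paths: store_item puts one row per call, while the commented store_items would batch rows into a bulk put. The dumper itself is not part of this diff; a minimal sketch of the two paths, assuming a happybase-backed HBase client (the class shape, host, table, and column-family names are all assumptions):

    import happybase

    class HbaseDumper(object):
        """Assumed shape of the project's dumper; names are placeholders."""

        def __init__(self, host='localhost', table='douban'):
            self.table = happybase.Connection(host).table(table)

        def store_item(self, item):
            # one-by-one: a round trip per item
            self.table.put(item['ind'], {'cf:name': item['name'],
                                         'cf:director': item['director'],
                                         'cf:rate': item['rate']})

        def store_items(self, items):
            # bulk put: mutations buffer client-side and flush when the batch closes
            with self.table.batch(batch_size=1000) as b:
                for item in items:
                    b.put(item['ind'], {'cf:name': item['name'],
                                        'cf:director': item['director'],
                                        'cf:rate': item['rate']})

The per-item path is simpler to reason about under failure; the batch path trades that for fewer round trips as item volume grows.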
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
| ... | ... | @@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = { |
| 29 | 29 | } |
| 30 | 30 | # http://www.useragentstring.com/pages/useragentstring.php |
| 31 | 31 | USER_AGENT_LIST = 'useragents.txt' |
| 32 | -# DOWNLOAD_DELAY = 0.1 | |
| 32 | +DOWNLOAD_DELAY = 1 | |
| 33 | 33 | |
| 34 | -# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | |
| 34 | +ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | |
| 35 | 35 | ... | ... |
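Two previously commented settings become active: a politeness delay and the item pipeline that feeds the HBase dumper. An annotated view of what each does (the jitter note describes Scrapy's documented default behaviour, not a change made in this commit):

    # settings.py, annotated
    DOWNLOAD_DELAY = 1  # wait ~1 second between requests to the same site
    # Unless RANDOMIZE_DOWNLOAD_DELAY is set to False, Scrapy multiplies this
    # delay by a random factor between 0.5 and 1.5 to look less mechanical.
    ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300}  # 0-1000, lower runs first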
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
| ... | ... | @@ -3,16 +3,16 @@ __author__ = 'chunk' |
| 3 | 3 | |
| 4 | 4 | from ..items import DoubanItem |
| 5 | 5 | |
| 6 | -import scrapy | |
| 7 | -from scrapy import FormRequest | |
| 6 | +from scrapy import Spider,FormRequest | |
| 8 | 7 | from scrapy.http import Request |
| 9 | 8 | from scrapy.utils.response import get_base_url |
| 10 | 9 | from urlparse import urljoin |
| 11 | 10 | import re |
| 12 | 11 | from hashlib import md5 |
| 12 | +import json | |
| 13 | 13 | |
| 14 | 14 | |
| 15 | -class DoubanSpider(scrapy.Spider): | |
| 15 | +class DoubanSpider(Spider): | |
| 16 | 16 | name = "douban" |
| 17 | 17 | allowed_domains = ["douban.com"] |
| 18 | 18 | start_urls = [ |
| ... | ... | @@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider): |
| 36 | 36 | for url in response.xpath('//@href').extract(): |
| 37 | 37 | url = urljoin(base_url, url.strip()) |
| 38 | 38 | if re.match('http://movie.douban.com/tag/\w+', url, re.U): |
| 39 | - return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | |
| 39 | + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | |
| 40 | 40 | |
| 41 | 41 | |
| 42 | 42 | def parse_list(self, response): |
| ... | ... | @@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider): |
| 44 | 44 | for url in response.xpath('//@href').extract(): |
| 45 | 45 | url = urljoin(base_url, url.strip()) |
| 46 | 46 | if re.match('http://movie.douban.com/subject/\w+', url, re.U): |
| 47 | - return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 47 | + yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 48 | 48 | |
| 49 | 49 | def parse_item(self, response): |
| 50 | 50 | item = DoubanItem() |
| 51 | - item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() | |
| 52 | - item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract() | |
| 53 | - item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract() | |
| 51 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8') | |
| 52 | + item['ind'] = md5(item['name']).hexdigest() | |
| 53 | + item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8') | |
| 54 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8') | |
| 54 | 55 | |
| 55 | 56 | return item |
| 56 | 57 | |
| 57 | 58 | def parse_details(self, response): |
| 58 | 59 | pass |
| 60 | + | |
| 61 | + | |
| 62 | + | |
| 63 | +class BaiduSpider(Spider): | |
| 64 | + name = "baidu" | |
| 65 | + allowed_domains = ["image.baidu.com"] | |
| 66 | + start_urls = [ | |
| 67 | + "http://image.baidu.com/", | |
| 68 | + ] | |
| 69 | + | |
| 70 | + def start_requests(self): | |
| 71 | + return [FormRequest("http://image.baidu.com/", | |
| 72 | + cookies={'userid': "jhvigtgiq"}, | |
| 73 | + callback=self.parse)] | |
| 74 | + | |
| 75 | + def parse(self, response): | |
| 76 | + """ | |
| 77 | + This is the default callback used by Scrapy to process downloaded responses | |
| 78 | + The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow. | |
| 79 | + This method, as well as any other Request callback, must return an iterable of Request and/or Item objects. | |
| 80 | + | |
| 81 | + Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html | |
| 82 | + """ | |
| 83 | + base_url = get_base_url(response) | |
| 84 | + r = re.compile('http://image.baidu.com/channel\S+') | |
| 85 | + for url in filter(r.match,response.xpath('//@href').extract()): | |
| 86 | + return Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list) | |
| 87 | + | |
| 88 | + | |
| 89 | + def parse_list(self, response): | |
| 90 | + base_url = get_base_url(response) | |
| 91 | + for url in response.xpath('//@href').extract(): | |
| 92 | + url = urljoin(base_url, url.strip()) | |
| 93 | + if re.match('http://movie.douban.com/subject/\w+', url, re.U): | |
| 94 | + yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 95 | + | |
| 96 | + def parse_item(self, response): | |
| 97 | + item = DoubanItem() | |
| 98 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8') | |
| 99 | + item['ind'] = md5(item['name']).hexdigest() | |
| 100 | + item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8') | |
| 101 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8') | |
| 102 | + | |
| 103 | + return item | |
| 104 | + | |
| 105 | + def parse_details(self, response): | |
| 106 | + pass | |
| 107 | + | |
| 108 | + | |
| 109 | + | ... | ... |
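Two changes in DoubanSpider carry the weight here. First, parse_list switches from return to yield: a return inside the for loop hands back only the first matching Request, while yield turns the callback into a generator that emits a Request per matching subject URL (BaiduSpider.parse still uses the return-in-loop form). Second, parse_item now indexes extract()[0] directly, which raises IndexError on any page where the XPath matches nothing; Scrapy 0.24 has no extract_first(), so a small guard helps. A minimal sketch (the helper is hypothetical, not in the diff), which applies equally to BaiduSpider.parse_item since it reuses the same XPaths:

    def first(selector_list, default=''):
        # guard extract()[0] against pages where the XPath matches nothing
        results = selector_list.extract()
        return results[0] if results else default

    # inside parse_item:
    #   item['name'] = first(response.xpath('//*[@id="content"]/h1/span[1]/text()')).encode('utf8')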
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type