Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e

Authored by Chunk
1 parent a9d3ceff

Stage Baidu image spider: add BaiduSpider, point run_spider.sh at the baidu crawl, and enable the HBase item pipeline.

.idea/ImageR.iml
... ... @@ -2,7 +2,7 @@
2 2 <module type="PYTHON_MODULE" version="4">
3 3 <component name="NewModuleRootManager">
4 4 <content url="file://$MODULE_DIR$" />
5   - <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
  5 + <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
6 6 <orderEntry type="sourceFolder" forTests="false" />
7 7 </component>
8 8 </module>
9 9 \ No newline at end of file
... ...
run_spider.sh
... ... @@ -11,7 +11,7 @@
11 11 ##############################################
# Use a sane terminal type for non-interactive (cron/ssh) runs.
# (Original read "export export TERM=linux" -- the duplicated keyword
# also exported a bogus variable named "export".)
export TERM=linux
# Load the hadoop user's shell environment (paths, virtualenv helpers).
source /home/hadoop/.zshrc
# Activate the 'env1' virtualenv -- `v` is presumably a helper defined
# in .zshrc; TODO confirm.
v env1
15 15  
16 16 ##############################################
17 17 ## additional files list
... ... @@ -21,8 +21,8 @@ FILE=hehe.json
# Earlier single-file invocation, kept for reference:
#scrapy runspider spider/test.py
# Run from the scrapy project root so settings.py is picked up; abort if
# the directory is missing rather than crawling/removing in the wrong place.
cd ./spider/mspider/ || exit 1
# Remove any stale output so `scrapy -o` writes a fresh file.
# $FILE is quoted to survive spaces/globs in the filename.
[ -f "$FILE" ] && rm "$FILE"
#scrapy crawl douban -o "$FILE"
scrapy crawl baidu -o "$FILE"
26 26  
27 27  
28 28  
... ...
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
... ... @@ -73,7 +73,8 @@ class MspiderPipeline(object):
73 73  
74 74 def process_item(self, item, spider):
75 75 try:
76   - self.hbasedumper.store_item(item)
  76 + self.hbasedumper.store_item(item) # one by one
  77 + # self.hbasedumper.store_items(item) # bulk put
77 78 except:
78 79 raise
79 80 return item
... ...
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
... ... @@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = {
29 29 }
30 30 # http://www.useragentstring.com/pages/useragentstring.php
31 31 USER_AGENT_LIST = 'useragents.txt'
32   -# DOWNLOAD_DELAY = 0.1
  32 +DOWNLOAD_DELAY = 1
33 33  
34   -# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
  34 +ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
35 35  
... ...
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
... ... @@ -3,16 +3,16 @@ __author__ = &#39;chunk&#39;
3 3  
4 4 from ..items import DoubanItem
5 5  
6   -import scrapy
7   -from scrapy import FormRequest
  6 +from scrapy import Spider,FormRequest
8 7 from scrapy.http import Request
9 8 from scrapy.utils.response import get_base_url
10 9 from urlparse import urljoin
11 10 import re
12 11 from hashlib import md5
  12 +import json
13 13  
14 14  
15   -class DoubanSpider(scrapy.Spider):
  15 +class DoubanSpider(Spider):
16 16 name = "douban"
17 17 allowed_domains = ["douban.com"]
18 18 start_urls = [
... ... @@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider):
36 36 for url in response.xpath('//@href').extract():
37 37 url = urljoin(base_url, url.strip())
38 38 if re.match('http://movie.douban.com/tag/\w+', url, re.U):
39   - return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
  39 + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
40 40  
41 41  
42 42 def parse_list(self, response):
... ... @@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider):
44 44 for url in response.xpath('//@href').extract():
45 45 url = urljoin(base_url, url.strip())
46 46 if re.match('http://movie.douban.com/subject/\w+', url, re.U):
47   - return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
  47 + yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
48 48  
49 49 def parse_item(self, response):
50 50 item = DoubanItem()
51   - item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
52   - item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
53   - item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
  51 + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
  52 + item['ind'] = md5(item['name']).hexdigest()
  53 + item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
  54 + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
54 55  
55 56 return item
56 57  
    def parse_details(self, response):
        # Placeholder callback -- detail pages are not scraped yet.
        pass
  60 +
  61 +
  62 +
class BaiduSpider(Spider):
    """Crawl image.baidu.com starting from the front page's channel links.

    NOTE(review): parse_list/parse_item still carry the douban URL
    pattern, douban cookie and DoubanItem from the spider this was
    copied from -- that pattern cannot match links found on baidu
    pages, so no items are produced yet.  Confirm intended targets
    before relying on this spider's output.
    """
    name = "baidu"
    allowed_domains = ["image.baidu.com"]
    start_urls = [
        "http://image.baidu.com/",
    ]

    def start_requests(self):
        # Seed the crawl with a fixed user cookie so responses are consistent.
        return [FormRequest("http://image.baidu.com/",
                            cookies={'userid': "jhvigtgiq"},
                            callback=self.parse)]

    def parse(self, response):
        """Follow every channel link on the front page.

        Default Scrapy callback; must return an iterable of Request
        and/or Item objects.
        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
        """
        base_url = get_base_url(response)
        channel = re.compile(r'http://image.baidu.com/channel\S+')
        for href in response.xpath('//@href').extract():
            # Resolve relative hrefs against the page base before matching
            # (base_url was previously computed but never used).
            url = urljoin(base_url, href.strip())
            if channel.match(url):
                # yield, not return: follow ALL channel links rather than
                # stopping at the first match (same fix this commit applied
                # to DoubanSpider.parse_list).
                yield Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list)

    def parse_list(self, response):
        # NOTE(review): douban leftover -- this pattern will not match
        # anything on baidu channel pages; left as-is pending real targets.
        base_url = get_base_url(response)
        for url in response.xpath('//@href').extract():
            url = urljoin(base_url, url.strip())
            if re.match('http://movie.douban.com/subject/\w+', url, re.U):
                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)

    def parse_item(self, response):
        # NOTE(review): scrapes douban movie fields; unreachable until
        # parse_list is pointed at real baidu URLs.
        item = DoubanItem()
        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
        # Deterministic row key derived from the encoded title.
        item['ind'] = md5(item['name']).hexdigest()
        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')

        return item

    def parse_details(self, response):
        # Placeholder -- detail pages are not scraped yet.
        pass
  107 +
  108 +
  109 +
... ...
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
test_data.pyc 0 → 100644
No preview for this file type