Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e

Authored by Chunk
1 parent a9d3ceff

Baidu staged.

.idea/ImageR.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
@@ -11,7 +11,7 @@
 ##############################################
 export TERM=linux
 source /home/hadoop/.zshrc
-v env0
+v env1
 
 ##############################################
 ## additional files list
@@ -21,8 +21,8 @@ FILE=hehe.json
 #scrapy runspider spider/test.py
 cd ./spider/mspider/
 [ -f $FILE ] && rm $FILE
-scrapy crawl douban -o $FILE
-
+#scrapy crawl douban -o $FILE
+scrapy crawl baidu -o $FILE
 
 
 
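
Note on the crawl line: Scrapy's -o feed export appends to an existing file rather than truncating it, which is why the script removes $FILE before each run. As a hedged sketch, the same feed could be configured in settings.py instead of on the command line (FEED_URI and FEED_FORMAT are standard Scrapy feed-export settings; the values below are illustrative, simply mirroring -o hehe.json):

    # Sketch: settings.py equivalent of `scrapy crawl baidu -o hehe.json`.
    # FEED_URI / FEED_FORMAT are standard Scrapy feed-export settings;
    # the values are illustrative, mirroring the run script above.
    FEED_URI = 'hehe.json'
    FEED_FORMAT = 'json'  # the CLI infers this format from the .json extension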
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
@@ -73,7 +73,8 @@ class MspiderPipeline(object):
 
     def process_item(self, item, spider):
         try:
-            self.hbasedumper.store_item(item)
+            self.hbasedumper.store_item(item)  # one by one
+            # self.hbasedumper.store_items(item)  # bulk put
         except:
             raise
         return item
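
The new comments distinguish a per-item put from a bulk put. As a hedged sketch of what the commented-out bulk path could look like if hbasedumper wraps a happybase table (the HBaseDumper class, its attributes, and the column names are assumptions for illustration, not this repo's actual code; Connection.table(), Table.put() and Table.batch() are real happybase calls):

    import happybase

    class HBaseDumper(object):
        # Hypothetical dumper, for illustration only: contrasts the active
        # per-item put with a buffered bulk put via happybase's Batch.
        def __init__(self, host, table_name):
            self.table = happybase.Connection(host).table(table_name)

        def store_item(self, item):
            # one Thrift round trip per item (the path the pipeline keeps active)
            self.table.put(item['ind'], {'cf:name': item['name'],
                                         'cf:director': item['director'],
                                         'cf:rate': item['rate']})

        def store_items(self, items):
            # mutations buffered and flushed in batches of 100
            with self.table.batch(batch_size=100) as batch:
                for item in items:
                    batch.put(item['ind'], {'cf:name': item['name'],
                                            'cf:director': item['director'],
                                            'cf:rate': item['rate']})

Since process_item receives one item at a time, a real bulk path would also need the pipeline to buffer items and flush them, e.g. from close_spider.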
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
@@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = {
 }
 # http://www.useragentstring.com/pages/useragentstring.php
 USER_AGENT_LIST = 'useragents.txt'
-# DOWNLOAD_DELAY = 0.1
+DOWNLOAD_DELAY = 1
 
-# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
+ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
 
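
Both settings are active now: DOWNLOAD_DELAY = 1 throttles requests to the same domain, and the ITEM_PIPELINES entry switches MspiderPipeline on (300 is its position in the 0-1000 pipeline order; lower numbers run first). For reference, the neighboring throttling knobs and their stock behavior (all three are standard Scrapy settings):

    DOWNLOAD_DELAY = 1                  # base delay (seconds) between requests to one domain
    RANDOMIZE_DOWNLOAD_DELAY = True     # default: actual wait is 0.5x-1.5x DOWNLOAD_DELAY
    CONCURRENT_REQUESTS_PER_DOMAIN = 8  # default cap per domain; the delay applies per slot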
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
@@ -3,16 +3,16 @@ __author__ = 'chunk'
 
 from ..items import DoubanItem
 
-import scrapy
-from scrapy import FormRequest
+from scrapy import Spider,FormRequest
 from scrapy.http import Request
 from scrapy.utils.response import get_base_url
 from urlparse import urljoin
 import re
 from hashlib import md5
+import json
 
 
-class DoubanSpider(scrapy.Spider):
+class DoubanSpider(Spider):
     name = "douban"
     allowed_domains = ["douban.com"]
     start_urls = [
@@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/tag/\w+', url, re.U):
-                return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
+                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
 
 
     def parse_list(self, response):
@@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider):
         for url in response.xpath('//@href').extract():
             url = urljoin(base_url, url.strip())
             if re.match('http://movie.douban.com/subject/\w+', url, re.U):
-                return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
 
     def parse_item(self, response):
         item = DoubanItem()
-        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
-        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
-        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
 
         return item
 
     def parse_details(self, response):
         pass
+
+
+
+class BaiduSpider(Spider):
+    name = "baidu"
+    allowed_domains = ["image.baidu.com"]
+    start_urls = [
+        "http://image.baidu.com/",
+    ]
+
+    def start_requests(self):
+        return [FormRequest("http://image.baidu.com/",
+                            cookies={'userid': "jhvigtgiq"},
+                            callback=self.parse)]
+
+    def parse(self, response):
+        """
+        This is the default callback used by Scrapy to process downloaded responses.
+        The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow.
+        This method, as well as any other Request callback, must return an iterable of Request and/or Item objects.
+
+        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
+        """
+        base_url = get_base_url(response)
+        r = re.compile('http://image.baidu.com/channel\S+')
+        for url in filter(r.match, response.xpath('//@href').extract()):
+            return Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list)
+
+
+    def parse_list(self, response):
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match('http://movie.douban.com/subject/\w+', url, re.U):
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = DoubanItem()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8')
+        item['ind'] = md5(item['name']).hexdigest()
+        item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8')
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8')
+
+        return item
+
+    def parse_details(self, response):
+        pass
+
+
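
Two notes on this diff. First, the return-to-yield change in DoubanSpider.parse_list is behavioral, not cosmetic: return exits the callback at the first matching link, while yield turns the callback into a generator that emits a Request for every match. A minimal standalone sketch of the difference, using a toy URL list:

    def first_match(urls):
        for url in urls:
            if url.startswith('http://movie.douban.com/subject/'):
                return [url]  # stops after the first hit

    def all_matches(urls):
        for url in urls:
            if url.startswith('http://movie.douban.com/subject/'):
                yield url  # keeps scanning; the spider gets one Request per hit

    urls = ['http://movie.douban.com/subject/1/',
            'http://movie.douban.com/subject/2/']
    assert first_match(urls) == urls[:1]
    assert list(all_matches(urls)) == urls

Second, consistent with the "Baidu staged." commit message, BaiduSpider is scaffolding: its parse_list and parse_item still carry DoubanSpider's URL filter, cookie, and DoubanItem fields, and its parse still returns from inside the loop, so only the first matching channel link would be followed.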
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
test_data.pyc 0 → 100644
No preview for this file type