Commit b5e0cb7308fddfa442501b13ce05dfc4c532a57e
1 parent: a9d3ceff
Exists in master and in 2 other branches
Baidu staged.
Showing 13 changed files with 68 additions and 16 deletions
.idea/ImageR.iml
| ... | ... | @@ -2,7 +2,7 @@ |
| 2 | 2 | <module type="PYTHON_MODULE" version="4"> |
| 3 | 3 | <component name="NewModuleRootManager"> |
| 4 | 4 | <content url="file://$MODULE_DIR$" /> |
| 5 | - <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" /> | |
| 5 | + <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" /> | |
| 6 | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
| 7 | 7 | </component> |
| 8 | 8 | </module> |
| 9 | 9 | \ No newline at end of file | ... | ... |
run_spider.sh
| ... | ... | @@ -11,7 +11,7 @@ |
| 11 | 11 | ############################################## |
| 12 | 12 | export export TERM=linux |
| 13 | 13 | source /home/hadoop/.zshrc |
| 14 | -v env0 | |
| 14 | +v env1 | |
| 15 | 15 | |
| 16 | 16 | ############################################## |
| 17 | 17 | ## additional files list |
| ... | ... | @@ -21,8 +21,8 @@ FILE=hehe.json |
| 21 | 21 | #scrapy runspider spider/test.py |
| 22 | 22 | cd ./spider/mspider/ |
| 23 | 23 | [ -f $FILE ] && rm $FILE |
| 24 | -scrapy crawl douban -o $FILE | |
| 25 | - | |
| 24 | +#scrapy crawl douban -o $FILE | |
| 25 | +scrapy crawl baidu -o $FILE | |
| 26 | 26 | |
| 27 | 27 | |
| 28 | 28 | ... | ... |
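The crawl target flips from douban to baidu by commenting out one line and adding another; the rest of the script (env activation, deleting the stale output file) is untouched. For reference, the same invocation can be issued from Python through Scrapy's cmdline helper; a minimal sketch, assuming it runs from the project directory (the wrapper filename is hypothetical):

    # run_crawl.py -- hypothetical wrapper mirroring the shell line above;
    # must be executed from the directory containing scrapy.cfg
    from scrapy import cmdline

    SPIDER = "baidu"        # "douban" before this commit
    OUTFILE = "hehe.json"   # the same $FILE the script deletes before each run

    cmdline.execute(("scrapy crawl %s -o %s" % (SPIDER, OUTFILE)).split())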
spider/mspider/hehe.json
No preview for this file type
spider/mspider/mspider/__init__.pyc
No preview for this file type
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
| ... | ... | @@ -73,7 +73,8 @@ class MspiderPipeline(object): |
| 73 | 73 | |
| 74 | 74 | def process_item(self, item, spider): |
| 75 | 75 | try: |
| 76 | - self.hbasedumper.store_item(item) | |
| 76 | + self.hbasedumper.store_item(item) # one by one | |
| 77 | + # self.hbasedumper.store_items(item) # bulk put | |
| 77 | 78 | except: |
| 78 | 79 | raise |
| 79 | 80 | return item | ... | ... |
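The added comment pair records two write paths: store_item puts one row per call, while the commented store_items would batch rows into a bulk put. The dumper itself is not part of this diff; a minimal sketch of the two paths, assuming a happybase-backed HBase client (the class shape, host, table, and column-family names are all assumptions):

    import happybase

    class HbaseDumper(object):
        """Assumed shape of the project's dumper; names are placeholders."""

        def __init__(self, host='localhost', table='douban'):
            self.table = happybase.Connection(host).table(table)

        def store_item(self, item):
            # one-by-one: a round trip per item
            self.table.put(item['ind'], {'cf:name': item['name'],
                                         'cf:director': item['director'],
                                         'cf:rate': item['rate']})

        def store_items(self, items):
            # bulk put: mutations buffer client-side and flush when the batch closes
            with self.table.batch(batch_size=1000) as b:
                for item in items:
                    b.put(item['ind'], {'cf:name': item['name'],
                                        'cf:director': item['director'],
                                        'cf:rate': item['rate']})

The per-item path is simpler to reason about under failure; the batch path trades that for fewer round trips as item volume grows.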
spider/mspider/mspider/pipelines.pyc
No preview for this file type
spider/mspider/mspider/settings.py
| ... | ... | @@ -29,7 +29,7 @@ DOWNLOADER_MIDDLEWARES = { |
| 29 | 29 | } |
| 30 | 30 | # http://www.useragentstring.com/pages/useragentstring.php |
| 31 | 31 | USER_AGENT_LIST = 'useragents.txt' |
| 32 | -# DOWNLOAD_DELAY = 0.1 | |
| 32 | +DOWNLOAD_DELAY = 1 | |
| 33 | 33 | |
| 34 | -# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | |
| 34 | +ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, } | |
| 35 | 35 | ... | ... |
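Two previously commented settings become active: a politeness delay and the item pipeline that feeds the HBase dumper. An annotated view of what each does (the jitter note describes Scrapy's documented default behaviour, not a change made in this commit):

    # settings.py, annotated
    DOWNLOAD_DELAY = 1  # wait ~1 second between requests to the same site
    # Unless RANDOMIZE_DOWNLOAD_DELAY is set to False, Scrapy multiplies this
    # delay by a random factor between 0.5 and 1.5 to look less mechanical.
    ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300}  # 0-1000, lower runs first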
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/__init__.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
| ... | ... | @@ -3,16 +3,16 @@ __author__ = 'chunk' |
| 3 | 3 | |
| 4 | 4 | from ..items import DoubanItem |
| 5 | 5 | |
| 6 | -import scrapy | |
| 7 | -from scrapy import FormRequest | |
| 6 | +from scrapy import Spider,FormRequest | |
| 8 | 7 | from scrapy.http import Request |
| 9 | 8 | from scrapy.utils.response import get_base_url |
| 10 | 9 | from urlparse import urljoin |
| 11 | 10 | import re |
| 12 | 11 | from hashlib import md5 |
| 12 | +import json | |
| 13 | 13 | |
| 14 | 14 | |
| 15 | -class DoubanSpider(scrapy.Spider): | |
| 15 | +class DoubanSpider(Spider): | |
| 16 | 16 | name = "douban" |
| 17 | 17 | allowed_domains = ["douban.com"] |
| 18 | 18 | start_urls = [ |
| ... | ... | @@ -36,7 +36,7 @@ class DoubanSpider(scrapy.Spider): |
| 36 | 36 | for url in response.xpath('//@href').extract(): |
| 37 | 37 | url = urljoin(base_url, url.strip()) |
| 38 | 38 | if re.match('http://movie.douban.com/tag/\w+', url, re.U): |
| 39 | - return Request(url,cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | |
| 39 | + return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list) | |
| 40 | 40 | |
| 41 | 41 | |
| 42 | 42 | def parse_list(self, response): |
| ... | ... | @@ -44,15 +44,66 @@ class DoubanSpider(scrapy.Spider): |
| 44 | 44 | for url in response.xpath('//@href').extract(): |
| 45 | 45 | url = urljoin(base_url, url.strip()) |
| 46 | 46 | if re.match('http://movie.douban.com/subject/\w+', url, re.U): |
| 47 | - return Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 47 | + yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 48 | 48 | |
| 49 | 49 | def parse_item(self, response): |
| 50 | 50 | item = DoubanItem() |
| 51 | - item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() | |
| 52 | - item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract() | |
| 53 | - item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract() | |
| 51 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8') | |
| 52 | + item['ind'] = md5(item['name']).hexdigest() | |
| 53 | + item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8') | |
| 54 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8') | |
| 54 | 55 | |
| 55 | 56 | return item |
| 56 | 57 | |
| 57 | 58 | def parse_details(self, response): |
| 58 | 59 | pass |
| 60 | + | |
| 61 | + | |
| 62 | + | |
| 63 | +class BaiduSpider(Spider): | |
| 64 | + name = "baidu" | |
| 65 | + allowed_domains = ["image.baidu.com"] | |
| 66 | + start_urls = [ | |
| 67 | + "http://image.baidu.com/", | |
| 68 | + ] | |
| 69 | + | |
| 70 | + def start_requests(self): | |
| 71 | + return [FormRequest("http://image.baidu.com/", | |
| 72 | + cookies={'userid': "jhvigtgiq"}, | |
| 73 | + callback=self.parse)] | |
| 74 | + | |
| 75 | + def parse(self, response): | |
| 76 | + """ | |
| 77 | + This is the default callback used by Scrapy to process downloaded responses | |
| 78 | + The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow. | |
| 79 | + This method, as well as any other Request callback, must return an iterable of Request and/or Item objects. | |
| 80 | + | |
| 81 | + Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html | |
| 82 | + """ | |
| 83 | + base_url = get_base_url(response) | |
| 84 | + r = re.compile('http://image.baidu.com/channel\S+') | |
| 85 | + for url in filter(r.match,response.xpath('//@href').extract()): | |
| 86 | + return Request(url, cookies={'userid': "jhvigtgiq"}, callback=self.parse_list) | |
| 87 | + | |
| 88 | + | |
| 89 | + def parse_list(self, response): | |
| 90 | + base_url = get_base_url(response) | |
| 91 | + for url in response.xpath('//@href').extract(): | |
| 92 | + url = urljoin(base_url, url.strip()) | |
| 93 | + if re.match('http://movie.douban.com/subject/\w+', url, re.U): | |
| 94 | + yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item) | |
| 95 | + | |
| 96 | + def parse_item(self, response): | |
| 97 | + item = DoubanItem() | |
| 98 | + item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0].encode('utf8') | |
| 99 | + item['ind'] = md5(item['name']).hexdigest() | |
| 100 | + item['director'] = json.dumps(response.xpath('//a[@rel="v:directedBy"]/text()').extract(), ensure_ascii=False).encode('utf8') | |
| 101 | + item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()[0].encode('utf-8') | |
| 102 | + | |
| 103 | + return item | |
| 104 | + | |
| 105 | + def parse_details(self, response): | |
| 106 | + pass | |
| 107 | + | |
| 108 | + | |
| 109 | + | ... | ... |
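Two changes in DoubanSpider carry the weight here. First, parse_list switches from return to yield: a return inside the for loop hands back only the first matching Request, while yield turns the callback into a generator that emits a Request per matching subject URL (BaiduSpider.parse still uses the return-in-loop form). Second, parse_item now indexes extract()[0] directly, which raises IndexError on any page where the XPath matches nothing; Scrapy 0.24 has no extract_first(), so a small guard helps. A minimal sketch (the helper is hypothetical, not in the diff), which applies equally to BaiduSpider.parse_item since it reuses the same XPaths:

    def first(selector_list, default=''):
        # guard extract()[0] against pages where the XPath matches nothing
        results = selector_list.extract()
        return results[0] if results else default

    # inside parse_item:
    #   item['name'] = first(response.xpath('//*[@id="content"]/h1/span[1]/text()')).encode('utf8')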
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type