test000.py
2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
__author__ = 'chunk'
from ..items import DoubanItem
import scrapy
from scrapy import FormRequest
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from urlparse import urljoin
import re
from hashlib import md5
class DoubanSpider(scrapy.Spider):
    """Crawl douban.com movies: tag index -> per-tag listing pages -> movie subject pages.

    Flow: start_requests -> parse (tag index) -> parse_list (tag listing)
    -> parse_item (movie subject page, yields a DoubanItem).
    """
    name = "douban"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://movie.douban.com/tag/",
    ]

    # Sent with every request; douban uses the 'bid' cookie to serve normal
    # pages instead of bouncing anonymous clients.
    _COOKIES = {'bid': "SCAM2676P0o"}

    # Compiled once; raw strings so \w is a regex class, not a string escape,
    # and the dots in the host are literal.
    _TAG_RE = re.compile(r'http://movie\.douban\.com/tag/\w+', re.U)
    _SUBJECT_RE = re.compile(r'http://movie\.douban\.com/subject/\w+', re.U)

    def start_requests(self):
        """Seed the crawl with the tag index page, carrying the bid cookie."""
        return [FormRequest("http://movie.douban.com/tag/",
                            cookies=self._COOKIES,
                            callback=self.parse)]

    def parse(self, response):
        """Default callback for the tag index page.

        Yields one Request per tag-listing link. (The original `return`
        inside the loop aborted after the first match, dropping every
        other tag page.)
        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
        """
        base_url = get_base_url(response)
        for href in response.xpath('//@href').extract():
            url = urljoin(base_url, href.strip())
            if self._TAG_RE.match(url):
                yield Request(url, cookies=self._COOKIES,
                              callback=self.parse_list)

    def parse_list(self, response):
        """Callback for a tag listing page: follow every movie subject link."""
        base_url = get_base_url(response)
        for href in response.xpath('//@href').extract():
            url = urljoin(base_url, href.strip())
            if self._SUBJECT_RE.match(url):
                yield Request(url, cookies=self._COOKIES,
                              callback=self.parse_item)

    def parse_item(self, response):
        """Scrape a movie subject page into a DoubanItem.

        Each field holds the (possibly empty) list that xpath().extract()
        returns; downstream pipelines are expected to handle lists.
        """
        item = DoubanItem()
        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
        return item

    def parse_details(self, response):
        # Placeholder: no detail-page parsing implemented yet.
        pass