settings.py
1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
# Scrapy settings for mspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project identity: the bot name plus the package that holds the spiders.
BOT_NAME = "mspider"

# The same package is used both for spider discovery (SPIDER_MODULES) and as
# the destination for newly generated spiders (NEWSPIDER_MODULE), so it is
# spelled out once and reused.
NEWSPIDER_MODULE = "mspider.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mspider (+http://www.yourdomain.com)'
##Custom for Chunk
# anti-banning
# Keep the cookie middleware switched on.
COOKIES_ENABLED = True

# A single fixed user agent is intentionally left disabled; the header is
# rotated by the random_useragent middleware configured below instead.
# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'

# Mapping a middleware to None disables it; an integer is its ordering
# priority. The stock user-agent middleware is turned off so that the random
# one is the only component setting the header.
# NOTE(review): the 'scrapy.contrib...' path is the pre-1.0 Scrapy location of
# this middleware -- confirm it matches the installed Scrapy version.
DOWNLOADER_MIDDLEWARES = {
    "scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware": None,
    "random_useragent.RandomUserAgentMiddleware": 400,
}

# File consumed by the random user-agent middleware (presumably one user-agent
# string per line -- verify against the middleware). Candidate strings:
# http://www.useragentstring.com/pages/useragentstring.php
USER_AGENT_LIST = "useragents.txt"
# Both URL schemes are served by the same scrapy_webdriver download handler,
# so the mapping is built from the scheme names and that single handler path.
DOWNLOAD_HANDLERS = dict.fromkeys(
    ("http", "https"),
    "scrapy_webdriver.download.WebdriverDownloadHandler",
)

# Spider-side counterpart of the webdriver download handler.
SPIDER_MIDDLEWARES = {
    "scrapy_webdriver.middlewares.WebdriverSpiderMiddleware": 543,
}

# Browser driven by scrapy_webdriver (or any other from selenium.webdriver).
WEBDRIVER_BROWSER = "PhantomJS"

# Optional parameters handed to the webdriver. The service arguments below
# turn on debug output and debug-level webdriver logging, and switch image
# loading off.
WEBDRIVER_OPTIONS = {
    "service_args": [
        "--debug=true",
        "--load-images=false",
        "--webdriver-loglevel=debug",
    ],
}
# DUPEFILTER_CLASS = 'mymidlleware.CustomFilter'
# Item pipelines to run, mapped to their ordering value.
# NOTE(review): 'mymidlleware' looks like a misspelling of 'mymiddleware', but
# the string must match the real module name -- verify before renaming either.
ITEM_PIPELINES = {
    "mymidlleware.DuplicatesPipeline": 543,
}
# DOWNLOAD_DELAY = 1
# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }