# settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for mspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#


BOT_NAME = 'mspider'

SPIDER_MODULES = ['mspider.spiders']
NEWSPIDER_MODULE = 'mspider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mspider (+http://www.yourdomain.com)'



## Custom settings for this project (anti-banning, webdriver rendering, pipelines)

# anti-banning
COOKIES_ENABLED = True
# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'random_useragent.RandomUserAgentMiddleware': 400
}
# http://www.useragentstring.com/pages/useragentstring.php
USER_AGENT_LIST = 'useragents.txt'

DOWNLOAD_HANDLERS = {
    'http': 'scrapy_webdriver.download.WebdriverDownloadHandler',
    'https': 'scrapy_webdriver.download.WebdriverDownloadHandler',
}

SPIDER_MIDDLEWARES = {
    'scrapy_webdriver.middlewares.WebdriverSpiderMiddleware': 543,
}

WEBDRIVER_BROWSER = 'PhantomJS'  # Or any other from selenium.webdriver

# Optional passing of parameters to the webdriver
WEBDRIVER_OPTIONS = {
    'service_args': ['--debug=true', '--load-images=false', '--webdriver-loglevel=debug']
}

# DUPEFILTER_CLASS = 'mymidlleware.CustomFilter'

ITEM_PIPELINES = {
    'mymidlleware.DuplicatesPipeline': 543,
}

# DOWNLOAD_DELAY = 1
# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }