__author__ = 'hadoop'

import logging
import os
import sys

from twisted.internet import reactor

from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings

from mspider.spiders.larvae import Larvae

# Make sure the package directory is importable when this script is run directly.
package_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(1, package_dir)


def run_spider():
    """Run the single Larvae spider and stop the reactor when it closes."""
    logfile = open('spider.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()

    if 'SCRAPY_SETTINGS_MODULE' not in os.environ:
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'mspider.settings'

    spider = Larvae()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    log.start(loglevel=log.INFO)
    log.msg('Running reactor...')
    reactor.run()
    log.msg('Reactor stopped.')


class SpiderHandler(object):
    """Run every spider in the project on a single Twisted reactor."""

    def __init__(self):
        self.spiderCounter = 0

    def setupCrawler(self, spiderName):
        # One Crawler per spider; each one reports back through spiderClosed().
        crawler = Crawler(get_project_settings())
        crawler.signals.connect(self.spiderClosed, signal=signals.spider_closed)
        # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        spider = crawler.spiders.create(spiderName)
        crawler.crawl(spider)
        crawler.start()

    def spiderClosed(self):
        # Stop the reactor only after the last spider has closed.
        self.spiderCounter -= 1
        if self.spiderCounter == 0:
            reactor.stop()
            log.msg('Reactor stopped.')

    def run(self):
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        log.msg('Running reactor...')
        for spiderName in crawler.spiders.list():
            log.msg(spiderName)
            self.spiderCounter += 1
            self.setupCrawler(spiderName)
        reactor.run()


if __name__ == '__main__':
    handler = SpiderHandler()
    handler.run()
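
# Note: scrapy.log, Crawler.configure() and crawler.spiders used above belong to
# the pre-1.0 Scrapy API and were removed in Scrapy 1.0. A minimal sketch of the
# same "run every spider" idea on current Scrapy, assuming the project settings
# module is still mspider.settings, might look like the commented function below;
# it is an untested alternative, not part of the original script.
#
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     def run_all_spiders():
#         process = CrawlerProcess(get_project_settings())
#         for spider_name in process.spider_loader.list():
#             process.crawl(spider_name)  # schedule each spider on the shared reactor
#         process.start()  # blocks until all scheduled crawls have finished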