# helper.py (2.04 KB)
__author__ = 'hadoop'

import os, sys
import logging

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from mspider.spiders.larvae import Larvae
from scrapy.utils.project import get_project_settings
from scrapy import cmdline
import json

# Absolute directory containing this file, inserted at position 1 of the
# module search path.
# NOTE(review): this runs after the `mspider` imports above, so it cannot help
# those imports resolve -- confirm whether it was meant to run earlier.
package_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(1, package_dir)


def run_spider():
    """Run the ``Larvae`` spider once and block until the reactor stops.

    A file log observer configured at ``logging.DEBUG`` writes to
    ``spider.log`` (opened in ``'w'`` mode, so the file is truncated on every
    call).  If ``SCRAPY_SETTINGS_MODULE`` is not present in the environment it
    is set to ``mspider.settings`` before the project settings are loaded.

    The crawler's ``spider_closed`` signal is connected to ``reactor.stop``,
    so ``reactor.run()`` returns once the spider has closed.  The log observer
    is detached and the log file closed on the way out, including when an
    exception is raised.
    """
    with open('spider.log', 'w') as logfile:
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()
        try:
            # Only supply a default; an explicitly exported settings module
            # chosen by the caller is left untouched.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'mspider.settings')

            spider = Larvae()
            crawler = Crawler(get_project_settings())
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
            log.start(loglevel=log.INFO)
            log.msg('Running reactor...')
            reactor.run()
            log.msg('Reactor stopped.')
        finally:
            # Detach the observer before the file is closed so that nothing
            # logged afterwards is written to a closed handle.
            log_observer.stop()


class SpiderHandler(object):
    """Run every spider known to the project inside one Twisted reactor.

    ``run`` starts one crawler per spider name and then runs the reactor;
    ``spiderClosed`` stops the reactor once the last started spider has
    reported ``spider_closed``.
    """

    # Number of spiders that have been started and have not closed yet.
    spiderCounter = 0

    def setupCrawler(self, spiderName):
        """Create, configure and start a crawler for one spider.

        Args:
            spiderName: Name handed to ``crawler.spiders.create`` to build
                the spider instance.
        """
        crawler = Crawler(get_project_settings())
        # Report to spiderClosed rather than wiring the signal straight to
        # reactor.stop: the reactor must only stop when the counter of
        # running spiders reaches zero, not when the first spider closes.
        crawler.signals.connect(self.spiderClosed, signal=signals.spider_closed)
        crawler.configure()

        spider = crawler.spiders.create(spiderName)

        crawler.crawl(spider)
        crawler.start()

    def spiderClosed(self):
        """Handle a ``spider_closed`` signal.

        Decrements the running-spider counter and, when it reaches zero,
        stops the reactor and logs that it was stopped.
        """
        self.spiderCounter -= 1

        if self.spiderCounter == 0:
            reactor.stop()
            # Logged only when the reactor is actually stopped, not on every
            # individual spider shutdown.
            log.msg('Reactor stopped.')

    def run(self):
        """Start a crawler for every listed spider and run the reactor.

        A throw-away crawler is configured only to enumerate the spider
        names.  If no spiders are listed the method returns without running
        the reactor, because no ``spider_closed`` signal would ever arrive
        to stop it.
        """
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        log.msg('Running reactor...')

        spiderNames = crawler.spiders.list()
        if not spiderNames:
            log.msg('No spiders found; reactor not started.')
            return

        for spiderName in spiderNames:
            log.msg(spiderName)
            self.spiderCounter += 1
            self.setupCrawler(spiderName)

        reactor.run()


if __name__ == '__main__':
    # Script entry point: start a crawler for every spider the project lists.
    SpiderHandler().run()