1b6f5e02
Chunk
re-spidering
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
__author__ = 'hadoop'
import os, sys
import logging
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from mspider.spiders.larvae import Larvae
from scrapy.utils.project import get_project_settings
from scrapy import cmdline
import json
# Absolute directory holding this script, resolved from ``__file__`` so the
# result does not depend on the current working directory.
package_dir = os.path.dirname(os.path.abspath(__file__))
# Put that directory on the import path at index 1 (just after the entry
# sys.path[0]) so packages that sit beside this script can be imported.
sys.path.insert(1, package_dir)
def run_spider():
    """Crawl with the ``Larvae`` spider and block until it has closed.

    A DEBUG-level copy of the Scrapy log is written to ``spider.log`` in the
    current working directory (the file is truncated on every call).  The
    ``SCRAPY_SETTINGS_MODULE`` environment variable is set to
    ``'mspider.settings'`` only when it is not already defined.

    The call returns after the Twisted reactor stops, which is triggered by
    the crawler's ``spider_closed`` signal.  The log observer is stopped and
    the log file closed on the way out, even if setting up or running the
    crawl raises.
    """
    with open('spider.log', 'w') as logfile:
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()
        try:
            # Respect a settings module chosen by the caller's environment.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'mspider.settings')

            spider = Larvae()
            settings = get_project_settings()
            crawler = Crawler(settings)
            # Only one spider is run here, so its closing ends the reactor.
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()

            log.start(loglevel=log.INFO)
            log.msg('Running reactor...')
            reactor.run()  # blocks until reactor.stop() is called
            log.msg('Reactor stopped.')
        finally:
            # Detach the observer before the file is closed so that no later
            # log event is written to a closed handle.
            log_observer.stop()
class SpiderHandler(object):
    """Run every spider registered in the project inside one reactor.

    Each spider gets its own ``Crawler``.  The handler counts the spiders it
    has started and stops the Twisted reactor when the last one reports
    ``spider_closed``.
    """

    # Number of spiders started by run() that have not closed yet.  The
    # in-place updates through ``self`` create a per-instance value; the
    # class attribute only supplies the initial 0.
    spiderCounter = 0

    def setupCrawler(self, spiderName):
        """Create, configure and start a crawler for the spider *spiderName*.

        The crawler's ``spider_closed`` signal is connected to
        :meth:`spiderClosed` so the handler can tell when all spiders are
        done.
        """
        crawler = Crawler(get_project_settings())
        crawler.signals.connect(self.spiderClosed, signal=signals.spider_closed)
        crawler.configure()
        spider = crawler.spiders.create(spiderName)
        crawler.crawl(spider)
        crawler.start()

    def spiderClosed(self):
        """Note that one spider has closed; stop the reactor after the last."""
        self.spiderCounter -= 1
        if self.spiderCounter == 0:
            reactor.stop()
            log.msg('Reactor stopped.')

    def run(self):
        """Start a crawler for every known spider and block until all close.

        Returns immediately, without starting the reactor, when the project
        has no spiders: in that case no ``spider_closed`` signal would ever
        fire and ``reactor.run()`` would block forever.
        """
        # This crawler is used only to enumerate the spider names; each
        # spider is run by its own crawler created in setupCrawler().
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        log.msg('Running reactor...')

        spider_names = crawler.spiders.list()
        if not spider_names:
            log.msg('No spiders found; reactor not started.')
            return

        for spiderName in spider_names:
            log.msg(spiderName)
            self.spiderCounter += 1
            self.setupCrawler(spiderName)
        reactor.run()  # blocks until spiderClosed() stops the reactor
if __name__ == '__main__':
    # Script entry point: run all of the project's spiders and block until
    # the handler stops the reactor.
    SpiderHandler().run()
|