mspider/mymidlleware.py

__author__ = 'chunk'

import os

from scrapy.dupefilters import RFPDupeFilter  # 'scrapy.dupefilter' in Scrapy < 1.0

class CustomFilter(RFPDupeFilter):
    """
    A dupe filter that deduplicates requests on an id extracted from the URL,
    rather than on the full request fingerprint.
    http://stackoverflow.com/questions/12553117/how-to-filter-duplicate-requests-based-on-url-in-scrapy
    """

    def __getid(self, url):
        # Take the last path segment of the URL as the id,
        # e.g. "http://example.com/item/12345" -> "12345".
        mm = url.split("/")[-1]
        return mm

    def request_seen(self, request):
        fp = self.__getid(request.url)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
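
# Usage sketch: point Scrapy's standard DUPEFILTER_CLASS setting at this
# class in the project's settings.py; the module path below assumes this
# file's location within the project.
#
#     DUPEFILTER_CLASS = 'mspider.mymidlleware.CustomFilter'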


from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    """
    Drop items whose 'id' field has already been seen by this pipeline.
    http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    """
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
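
# Usage sketch: register the pipeline through Scrapy's standard
# ITEM_PIPELINES setting in settings.py (the value is the run order);
# this assumes items expose an 'id' field, as process_item() above expects.
#
#     ITEM_PIPELINES = {
#         'mspider.mymidlleware.DuplicatesPipeline': 300,
#     }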