__author__ = 'chunk'

import os

# NOTE(review): `scrapy.dupefilter` is the legacy module path (renamed to
# `scrapy.dupefilters` in Scrapy 1.0+) — confirm against the pinned scrapy version.
from scrapy.dupefilter import RFPDupeFilter
from scrapy.exceptions import DropItem
from scrapy.utils.request import request_fingerprint  # kept: may be used by other modules importing *


class CustomFilter(RFPDupeFilter):
    """A dupe filter that deduplicates on an id extracted from the URL
    instead of the full request fingerprint.

    http://stackoverflow.com/questions/12553117/how-to-filter-duplicate-requests-based-on-url-in-scrapy
    """

    def __getid(self, url):
        """Return the dedup key for *url*: its trailing path segment.

        e.g. "http://host/items/123" -> "123"
        """
        return url.split("/")[-1]

    def request_seen(self, request):
        """Return True if this request's id has been seen before.

        On the first sighting, record the id in memory and (when a
        persistence file is configured) append it to ``self.file``.
        """
        fp = self.__getid(request.url)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
        # Explicit False (was an implicit None): matches RFPDupeFilter's
        # request_seen contract and is truthiness-compatible with callers.
        return False


class DuplicatesPipeline(object):
    """Item pipeline that drops items whose 'id' field was already seen.

    http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    """

    def __init__(self):
        # ids of items already passed through this pipeline instance
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Pass *item* through once; raise DropItem on any repeat of its 'id'."""
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['id'])
        return item