Commit a6bc62deb035db9a114822ebfa076a3e42e706df
1 parent
6472fb0c
Exists in
master
add script
Showing
10 changed files
with
47 additions
and
11 deletions
Show diff stats
crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
| ... | ... | @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' |
| 16 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
| 17 | 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' |
| 18 | 18 | |
| 19 | -ITEM_PIPELINES = { | |
| 20 | - 'crawler.pipelines.CrawlerPipeline': 300 | |
| 21 | -} | |
| 19 | +#ITEM_PIPELINES = { | |
| 20 | +# 'crawler.pipelines.CrawlerPipeline': 300 | |
| 21 | +#} | ... | ... |
crawler/crawler/settings.pyc
No preview for this file type
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
| 1 | -from scrapy.spider import BaseSpider | |
| 1 | +from scrapy.spider import BaseSpider, Spider | |
| 2 | 2 | from scrapy.http import Request |
| 3 | 3 | from crawler.items import CrawlerItem |
| 4 | 4 | import re |
| 5 | 5 | |
| 6 | -class Crawler(BaseSpider): | |
| 6 | +class Crawler(Spider): #Scrapy 0.22 | |
| 7 | +#class Crawler(BaseSpider): #Scrapy 0.18 | |
| 7 | 8 | name = 'img' |
| 8 | - allowed_domains = ['www.cs.tsinghua.edu.cn'] | |
| 9 | - start_urls = ['http://www.cs.tsinghua.edu.cn'] | |
| 10 | - urlSet = set() | |
| 9 | + | |
def __init__(self):
    """Configure the spider from ./input.txt (written by the run script).

    input.txt protocol:
      line 1: mode -- 1 = crawl everything under one domain,
                      2 = targets come from an input file (lists left empty here)
      line 2: the allowed domain (mode 1) or the input file name (mode 2)

    Sets:
      self.urlSet         -- URLs already scheduled, to avoid duplicate requests
      self.mode, self.tmp -- parsed mode and its argument
      self.allowed_domains / self.start_urls -- per Scrapy spider convention
    """
    self.urlSet = set()
    # Use a context manager so the file handle is closed deterministically;
    # the original `open(...).readlines()` leaked the handle until GC.
    with open('input.txt') as fr:
        lines = fr.readlines()
    self.mode = int(lines[0].strip())
    self.tmp = lines[1].strip()
    if self.mode == 1:
        self.allowed_domains = [self.tmp]
        self.start_urls = ['http://' + self.tmp]
    else:
        # Mode 2: domains/URLs are populated elsewhere -- start empty.
        self.allowed_domains = []
        self.start_urls = []
| 11 | 21 | |
| 12 | 22 | def parse(self, response): |
| 13 | 23 | html = response.body |
| ... | ... | @@ -21,13 +31,12 @@ class Crawler(BaseSpider): |
| 21 | 31 | |
| 22 | 32 | for url in urlList: |
| 23 | 33 | if url not in self.urlSet: |
| 24 | - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse) | |
| 25 | - #pass | |
| 34 | + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse) | |
| 26 | 35 | |
| 27 | 36 | item = CrawlerItem() |
| 28 | 37 | item['url'] = response.url |
| 29 | 38 | item['imgList'] = [] |
| 30 | 39 | for img in imgList: |
| 31 | - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"')) | |
| 40 | + item['imgList'].append(img.strip('"')) | |
| 32 | 41 | |
| 33 | 42 | yield item | ... | ... |
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
| ... | ... | @@ -0,0 +1,25 @@ |
#!/bin/bash
# Driver for the "img" scrapy spider.
#
# Usage:
#   run 1 allowed_domain   -- crawl all imgs in allowed_domain
#   run 2 input_file       -- crawl imgs and attach the tags from input_file
#
# Both modes use the same protocol: the mode and its argument are written
# to crawler/crawler/input.txt, which the spider's __init__ reads.

if [ "$#" -lt 2 ]; then
    echo 'Usage:'
    echo '1. run 1 allowed_domain'
    echo 'crawl all imgs in the allowed_domain'
    echo '2. run 2 input_file'
    echo 'crawl the imgs and attach the corresponding tags in input_file'
    exit 1
fi

# Validate the mode before touching the filesystem (the original only
# rejected bad modes after cd'ing, and duplicated the two valid branches).
if [ "$1" != 1 ] && [ "$1" != 2 ]; then
    echo 'Invalid mode!'
    exit 1
fi

# Abort if the project directory is missing rather than running scrapy
# from the wrong working directory.
cd crawler/crawler || exit 1
input_file=input.txt

# Line 1: mode; line 2: argument (domain or file name). Quoted to survive
# arguments containing spaces or glob characters.
printf '%s\n%s\n' "$1" "$2" > "${input_file}"
scrapy crawl img