Commit a6bc62deb035db9a114822ebfa076a3e42e706df
1 parent: 6472fb0c
Exists in: master
add script
Showing 10 changed files with 47 additions and 11 deletions
Show diff stats
crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
... | ... | @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' |
16 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
17 | 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' |
18 | 18 | |
19 | -ITEM_PIPELINES = { | |
20 | - 'crawler.pipelines.CrawlerPipeline': 300 | |
21 | -} | |
19 | +#ITEM_PIPELINES = { | |
20 | +# 'crawler.pipelines.CrawlerPipeline': 300 | |
21 | +#} | ... | ... |
crawler/crawler/settings.pyc
No preview for this file type
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
1 | -from scrapy.spider import BaseSpider | |
1 | +from scrapy.spider import BaseSpider, Spider | |
2 | 2 | from scrapy.http import Request |
3 | 3 | from crawler.items import CrawlerItem |
4 | 4 | import re |
5 | 5 | |
6 | -class Crawler(BaseSpider): | |
6 | +class Crawler(Spider): #Scrapy 0.22 | |
7 | +#class Crawler(BaseSpider): #Scrapy 0.18 | |
7 | 8 | name = 'img' |
8 | - allowed_domains = ['www.cs.tsinghua.edu.cn'] | |
9 | - start_urls = ['http://www.cs.tsinghua.edu.cn'] | |
10 | - urlSet = set() | |
9 | + | |
10 | + def __init__(self): | |
11 | + self.urlSet = set() | |
12 | + fr = open('input.txt').readlines() | |
13 | + self.mode = int(fr[0].strip()) | |
14 | + self.tmp = fr[1].strip() | |
15 | + if self.mode == 1: | |
16 | + self.allowed_domains = [self.tmp,] | |
17 | + self.start_urls = ['http://' + self.tmp, ] | |
18 | + else: | |
19 | + self.allowed_domains = [] | |
20 | + self.start_urls = [] | |
11 | 21 | |
12 | 22 | def parse(self, response): |
13 | 23 | html = response.body |
... | ... | @@ -21,13 +31,12 @@ class Crawler(BaseSpider): |
21 | 31 | |
22 | 32 | for url in urlList: |
23 | 33 | if url not in self.urlSet: |
24 | - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse) | |
25 | - #pass | |
34 | + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse) | |
26 | 35 | |
27 | 36 | item = CrawlerItem() |
28 | 37 | item['url'] = response.url |
29 | 38 | item['imgList'] = [] |
30 | 39 | for img in imgList: |
31 | - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"')) | |
40 | + item['imgList'].append(img.strip('"')) | |
32 | 41 | |
33 | 42 | yield item | ... | ... |
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
... | ... | @@ -0,0 +1,25 @@ |
1 | +#!/bin/bash | |
2 | + | |
3 | +if [ $# -lt 2 ]; then | |
4 | + echo 'Usage:' | |
5 | + echo '1. run 1 allowed_domain' | |
6 | + echo 'crawl all imgs in the allowed_domain' | |
7 | + echo '2. run 2 input_file' | |
8 | + echo 'crawl the imgs and attach the corresponding tags in input_file' | |
9 | + exit 0 | |
10 | +fi | |
11 | + | |
12 | +cd crawler/crawler | |
13 | +input_file=input.txt | |
14 | + | |
15 | +if [ $1 = 1 ]; then | |
16 | + echo $1 > ${input_file} | |
17 | + echo $2 >> ${input_file} | |
18 | + scrapy crawl img | |
19 | +elif [ $1 = 2 ]; then | |
20 | + echo $1 > ${input_file} | |
21 | + echo $2 >> ${input_file} | |
22 | + scrapy crawl img | |
23 | +else | |
24 | + echo 'Invalid mode!' | |
25 | +fi | ... | ... |