Commit a6bc62deb035db9a114822ebfa076a3e42e706df
1 parent
6472fb0c
Exists in
master
add script
Showing
10 changed files
with
47 additions
and
11 deletions
Show diff stats
crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
@@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' | @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' | ||
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' | 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' |
18 | 18 | ||
19 | -ITEM_PIPELINES = { | ||
20 | - 'crawler.pipelines.CrawlerPipeline': 300 | ||
21 | -} | 19 | +#ITEM_PIPELINES = { |
20 | +# 'crawler.pipelines.CrawlerPipeline': 300 | ||
21 | +#} |
crawler/crawler/settings.pyc
No preview for this file type
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
1 | -from scrapy.spider import BaseSpider | 1 | +from scrapy.spider import BaseSpider, Spider |
2 | from scrapy.http import Request | 2 | from scrapy.http import Request |
3 | from crawler.items import CrawlerItem | 3 | from crawler.items import CrawlerItem |
4 | import re | 4 | import re |
5 | 5 | ||
6 | -class Crawler(BaseSpider): | 6 | +class Crawler(Spider): #Scrapy 0.22 |
7 | +#class Crawler(BaseSpider): #Scrapy 0.18 | ||
7 | name = 'img' | 8 | name = 'img' |
8 | - allowed_domains = ['www.cs.tsinghua.edu.cn'] | ||
9 | - start_urls = ['http://www.cs.tsinghua.edu.cn'] | ||
10 | - urlSet = set() | 9 | + |
10 | + def __init__(self): | ||
11 | + self.urlSet = set() | ||
12 | + fr = open('input.txt').readlines() | ||
13 | + self.mode = int(fr[0].strip()) | ||
14 | + self.tmp = fr[1].strip() | ||
15 | + if self.mode == 1: | ||
16 | + self.allowed_domains = [self.tmp,] | ||
17 | + self.start_urls = ['http://' + self.tmp, ] | ||
18 | + else: | ||
19 | + self.allowed_domains = [] | ||
20 | + self.start_urls = [] | ||
11 | 21 | ||
12 | def parse(self, response): | 22 | def parse(self, response): |
13 | html = response.body | 23 | html = response.body |
@@ -21,13 +31,12 @@ class Crawler(BaseSpider): | @@ -21,13 +31,12 @@ class Crawler(BaseSpider): | ||
21 | 31 | ||
22 | for url in urlList: | 32 | for url in urlList: |
23 | if url not in self.urlSet: | 33 | if url not in self.urlSet: |
24 | - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse) | ||
25 | - #pass | 34 | + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse) |
26 | 35 | ||
27 | item = CrawlerItem() | 36 | item = CrawlerItem() |
28 | item['url'] = response.url | 37 | item['url'] = response.url |
29 | item['imgList'] = [] | 38 | item['imgList'] = [] |
30 | for img in imgList: | 39 | for img in imgList: |
31 | - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"')) | 40 | + item['imgList'].append(img.strip('"')) |
32 | 41 | ||
33 | yield item | 42 | yield item |
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
@@ -0,0 +1,25 @@ | @@ -0,0 +1,25 @@ | ||
#!/bin/bash
# Launch the 'img' scrapy spider in one of two modes:
#   run 1 allowed_domain  -> crawl all imgs in the allowed_domain
#   run 2 input_file      -> crawl imgs and attach the corresponding tags
# The mode and its argument are handed to the spider via input.txt
# (first line: mode, second line: argument), which the spider reads
# in its __init__.

if [ $# -lt 2 ]; then
    echo 'Usage:'
    echo '1. run 1 allowed_domain'
    echo 'crawl all imgs in the allowed_domain'
    echo '2. run 2 input_file'
    echo 'crawl the imgs and attach the corresponding tags in input_file'
    exit 0
fi

# Abort if the project directory is missing; otherwise scrapy would be
# invoked from the wrong working directory.
cd crawler/crawler || exit 1
input_file=input.txt

case "$1" in
    1|2)
        # Both modes share the same handshake file layout, so the two
        # previously duplicated branches are merged.  Quoting prevents
        # word-splitting/globbing of user-supplied arguments.
        echo "$1" > "${input_file}"
        echo "$2" >> "${input_file}"
        scrapy crawl img
        ;;
    *)
        echo 'Invalid mode!'
        ;;
esac