Commit a6bc62deb035db9a114822ebfa076a3e42e706df

Authored by Peng Xu
1 parent 6472fb0c
Exists in master

add script

crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/input.txt 0 → 100644
... ... @@ -0,0 +1,2 @@
  1 +1
  2 +www.cs.tsinghua.edu.cn
... ...
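
The two lines of input.txt are the spider's runtime configuration: line 1 selects the mode (1 crawls a single domain) and line 2 names the target. A minimal sketch of how the new __init__ in ImgCrawler.py (further down in this commit) consumes it — the commit itself uses open(...).readlines() rather than a context manager:

    with open('input.txt') as f:
        mode = int(f.readline().strip())  # 1 -> crawl everything under one domain
        target = f.readline().strip()     # mode 1: the allowed domain; mode 2: an input file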
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
... ... @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders'
16 16 # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 17 #USER_AGENT = 'crawler (+http://www.yourdomain.com)'
18 18  
19   -ITEM_PIPELINES = {
20   - 'crawler.pipelines.CrawlerPipeline': 300
21   -}
  19 +#ITEM_PIPELINES = {
  20 +# 'crawler.pipelines.CrawlerPipeline': 300
  21 +#}
... ...
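
Commenting out ITEM_PIPELINES disables CrawlerPipeline, so yielded items now go only to whatever feed exporter scrapy is invoked with. The pipeline class itself is not part of this diff; as a hedged illustration only, a Scrapy item pipeline is any class exposing process_item(item, spider) — the body below is a guess, not the repo's code:

    class CrawlerPipeline(object):
        # Hypothetical body -- crawler/pipelines.py is not shown in this commit.
        def process_item(self, item, spider):
            # e.g. persist item['url'] / item['imgList'] somewhere
            return item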
crawler/crawler/settings.pyc
No preview for this file type
crawler/crawler/spiders/.ImgCrawler.py.swp 0 → 100644
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
1   -from scrapy.spider import BaseSpider
  1 +from scrapy.spider import BaseSpider, Spider
2 2 from scrapy.http import Request
3 3 from crawler.items import CrawlerItem
4 4 import re
5 5  
6   -class Crawler(BaseSpider):
  6 +class Crawler(Spider): #Scrapy 0.22
  7 +#class Crawler(BaseSpider): #Scrapy 0.18
7 8 name = 'img'
8   - allowed_domains = ['www.cs.tsinghua.edu.cn']
9   - start_urls = ['http://www.cs.tsinghua.edu.cn']
10   - urlSet = set()
  9 +
  10 + def __init__(self):
  11 + self.urlSet = set()
  12 + fr = open('input.txt').readlines()
  13 + self.mode = int(fr[0].strip())
  14 + self.tmp = fr[1].strip()
  15 + if self.mode == 1:
  16 + self.allowed_domains = [self.tmp,]
  17 + self.start_urls = ['http://' + self.tmp, ]
  18 + else:
  19 + self.allowed_domains = []
  20 + self.start_urls = []
11 21  
12 22 def parse(self, response):
13 23 html = response.body
... ... @@ -21,13 +31,12 @@ class Crawler(BaseSpider):
21 31  
22 32 for url in urlList:
23 33 if url not in self.urlSet:
24   - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse)
25   - #pass
  34 + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse)
26 35  
27 36 item = CrawlerItem()
28 37 item['url'] = response.url
29 38 item['imgList'] = []
30 39 for img in imgList:
31   - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"'))
  40 + item['imgList'].append(img.strip('"'))
32 41  
33 42 yield item
... ...
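
The hunk above omits the unchanged extraction lines (old lines 13-20) that build urlList and imgList from response.body. Judging by the url.strip('"') calls, they are re.findall() patterns whose capturing group includes the surrounding quotes; a self-contained sketch of that assumption:

    import re

    # Sample markup only -- the real regexes live in the unchanged lines of ImgCrawler.py.
    html = '<a href="/about"><img src="/logo.png">'
    urlList = re.findall(r'href=("[^"]*")', html)   # ['"/about"']
    imgList = re.findall(r'src=("[^"]*")', html)    # ['"/logo.png"']

Note also that in mode 2 the new __init__ leaves allowed_domains empty, so the self.allowed_domains[0] added in parse() would raise IndexError there; only mode 1 appears functional as of this commit.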
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
run.sh 0 → 100755
... ... @@ -0,0 +1,25 @@
  1 +#!/bin/bash
  2 +
  3 +if [ $# -lt 2 ]; then
  4 + echo 'Usage:'
  5 + echo '1. run 1 allowed_domain'
  6 + echo 'crawl all imgs in the allowed_domain'
  7 + echo '2. run 2 input_file'
  8 + echo 'crawl the imgs and attach the corresponding tags in input_file'
  9 + exit 0
  10 +fi
  11 +
  12 +cd crawler/crawler
  13 +input_file=input.txt
  14 +
  15 +if [ $1 = 1 ]; then
  16 + echo $1 > ${input_file}
  17 + echo $2 >> ${input_file}
  18 + scrapy crawl img
  19 +elif [ $1 = 2 ]; then
  20 + echo $1 > ${input_file}
  21 + echo $2 >> ${input_file}
  22 + scrapy crawl img
  23 +else
  24 + echo 'Invalid mode!'
  25 +fi
... ...
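
Example invocation, matching mode 1 of the usage text and the committed input.txt: ./run.sh 1 www.cs.tsinghua.edu.cn, which writes the two lines into crawler/crawler/input.txt and then runs scrapy crawl img from that directory. Modes 1 and 2 currently execute identical commands; they differ only in how the spider interprets the second line of input.txt.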