Commit a6bc62deb035db9a114822ebfa076a3e42e706df
1 parent
6472fb0c
Exists in
master
add script
Showing
10 changed files
with
47 additions
and
11 deletions
Show diff stats
crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
@@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' | @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders' | ||
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent |
17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' | 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' |
18 | 18 | ||
19 | -ITEM_PIPELINES = { | ||
20 | - 'crawler.pipelines.CrawlerPipeline': 300 | ||
21 | -} | 19 | +#ITEM_PIPELINES = { |
20 | +# 'crawler.pipelines.CrawlerPipeline': 300 | ||
21 | +#} |
crawler/crawler/settings.pyc
No preview for this file type
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
1 | -from scrapy.spider import BaseSpider | 1 | +from scrapy.spider import BaseSpider, Spider |
2 | from scrapy.http import Request | 2 | from scrapy.http import Request |
3 | from crawler.items import CrawlerItem | 3 | from crawler.items import CrawlerItem |
4 | import re | 4 | import re |
5 | 5 | ||
6 | -class Crawler(BaseSpider): | 6 | +class Crawler(Spider): #Scrapy 0.22 |
7 | +#class Crawler(BaseSpider): #Scrapy 0.18 | ||
7 | name = 'img' | 8 | name = 'img' |
8 | - allowed_domains = ['www.cs.tsinghua.edu.cn'] | ||
9 | - start_urls = ['http://www.cs.tsinghua.edu.cn'] | ||
10 | - urlSet = set() | 9 | + |
10 | + def __init__(self): | ||
11 | + self.urlSet = set() | ||
12 | + fr = open('input.txt').readlines() | ||
13 | + self.mode = int(fr[0].strip()) | ||
14 | + self.tmp = fr[1].strip() | ||
15 | + if self.mode == 1: | ||
16 | + self.allowed_domains = [self.tmp,] | ||
17 | + self.start_urls = ['http://' + self.tmp, ] | ||
18 | + else: | ||
19 | + self.allowed_domains = [] | ||
20 | + self.start_urls = [] | ||
11 | 21 | ||
12 | def parse(self, response): | 22 | def parse(self, response): |
13 | html = response.body | 23 | html = response.body |
@@ -21,13 +31,12 @@ class Crawler(BaseSpider): | @@ -21,13 +31,12 @@ class Crawler(BaseSpider): | ||
21 | 31 | ||
22 | for url in urlList: | 32 | for url in urlList: |
23 | if url not in self.urlSet: | 33 | if url not in self.urlSet: |
24 | - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse) | ||
25 | - #pass | 34 | + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse) |
26 | 35 | ||
27 | item = CrawlerItem() | 36 | item = CrawlerItem() |
28 | item['url'] = response.url | 37 | item['url'] = response.url |
29 | item['imgList'] = [] | 38 | item['imgList'] = [] |
30 | for img in imgList: | 39 | for img in imgList: |
31 | - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"')) | 40 | + item['imgList'].append(img.strip('"')) |
32 | 41 | ||
33 | yield item | 42 | yield item |
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
@@ -0,0 +1,25 @@ | @@ -0,0 +1,25 @@ | ||
#!/bin/bash
# Launch the 'img' scrapy spider in one of two modes:
#   run 1 allowed_domain  -> crawl all imgs in the allowed_domain
#   run 2 input_file      -> crawl imgs and attach the corresponding tags
# The mode and its argument are handed to the spider via input.txt
# (first line: mode, second line: argument), which the spider reads
# in its __init__.

if [ $# -lt 2 ]; then
    echo 'Usage:'
    echo '1. run 1 allowed_domain'
    echo 'crawl all imgs in the allowed_domain'
    echo '2. run 2 input_file'
    echo 'crawl the imgs and attach the corresponding tags in input_file'
    exit 0
fi

# Abort if the project directory is missing; otherwise scrapy would be
# invoked from the wrong working directory.
cd crawler/crawler || exit 1
input_file=input.txt

case "$1" in
    1|2)
        # Both modes share the same handshake file layout, so the two
        # previously duplicated branches are merged.  Quoting prevents
        # word-splitting/globbing of user-supplied arguments.
        echo "$1" > "${input_file}"
        echo "$2" >> "${input_file}"
        scrapy crawl img
        ;;
    *)
        echo 'Invalid mode!'
        ;;
esac