Commit a6bc62deb035db9a114822ebfa076a3e42e706df

Authored by Peng Xu
1 parent 6472fb0c
Exists in master

add script

crawler/crawler/__init__.pyc
No preview for this file type
crawler/crawler/input.txt 0 → 100644
... ... @@ -0,0 +1,2 @@
  1 +1
  2 +www.cs.tsinghua.edu.cn
... ...
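
The two lines of input.txt are the spider's runtime configuration: line 1 selects the mode (1 crawls a single domain) and line 2 names the target. A minimal sketch of how the new __init__ in ImgCrawler.py (further down in this commit) consumes it — the commit itself uses open(...).readlines() rather than a context manager:

    with open('input.txt') as f:
        mode = int(f.readline().strip())  # 1 -> crawl everything under one domain
        target = f.readline().strip()     # mode 1: the allowed domain; mode 2: an input file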
crawler/crawler/items.pyc
No preview for this file type
crawler/crawler/settings.py
... ... @@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders'
16 16 # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 17 #USER_AGENT = 'crawler (+http://www.yourdomain.com)'
18 18  
19   -ITEM_PIPELINES = {
20   - 'crawler.pipelines.CrawlerPipeline': 300
21   -}
  19 +#ITEM_PIPELINES = {
  20 +# 'crawler.pipelines.CrawlerPipeline': 300
  21 +#}
... ...
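
Commenting out ITEM_PIPELINES disables CrawlerPipeline, so yielded items now go only to whatever feed exporter scrapy is invoked with. The pipeline class itself is not part of this diff; as a hedged illustration only, a Scrapy item pipeline is any class exposing process_item(item, spider) — the body below is a guess, not the repo's code:

    class CrawlerPipeline(object):
        # Hypothetical body -- crawler/pipelines.py is not shown in this commit.
        def process_item(self, item, spider):
            # e.g. persist item['url'] / item['imgList'] somewhere
            return item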
crawler/crawler/settings.pyc
No preview for this file type
crawler/crawler/spiders/.ImgCrawler.py.swp 0 → 100644
No preview for this file type
crawler/crawler/spiders/ImgCrawler.py
1   -from scrapy.spider import BaseSpider
  1 +from scrapy.spider import BaseSpider, Spider
2 2 from scrapy.http import Request
3 3 from crawler.items import CrawlerItem
4 4 import re
5 5  
6   -class Crawler(BaseSpider):
  6 +class Crawler(Spider): #Scrapy 0.22
  7 +#class Crawler(BaseSpider): #Scrapy 0.18
7 8 name = 'img'
8   - allowed_domains = ['www.cs.tsinghua.edu.cn']
9   - start_urls = ['http://www.cs.tsinghua.edu.cn']
10   - urlSet = set()
  9 +
  10 + def __init__(self):
  11 + self.urlSet = set()
  12 + fr = open('input.txt').readlines()
  13 + self.mode = int(fr[0].strip())
  14 + self.tmp = fr[1].strip()
  15 + if self.mode == 1:
  16 + self.allowed_domains = [self.tmp,]
  17 + self.start_urls = ['http://' + self.tmp, ]
  18 + else:
  19 + self.allowed_domains = []
  20 + self.start_urls = []
11 21  
12 22 def parse(self, response):
13 23 html = response.body
... ... @@ -21,13 +31,12 @@ class Crawler(BaseSpider):
21 31  
22 32 for url in urlList:
23 33 if url not in self.urlSet:
24   - yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse)
25   - #pass
  34 + yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse)
26 35  
27 36 item = CrawlerItem()
28 37 item['url'] = response.url
29 38 item['imgList'] = []
30 39 for img in imgList:
31   - item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"'))
  40 + item['imgList'].append(img.strip('"'))
32 41  
33 42 yield item
... ...
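
The hunk above omits the unchanged extraction lines (old lines 13-20) that build urlList and imgList from response.body. Judging by the url.strip('"') calls, they are re.findall() patterns whose capturing group includes the surrounding quotes; a self-contained sketch of that assumption:

    import re

    # Sample markup only -- the real regexes live in the unchanged lines of ImgCrawler.py.
    html = '<a href="/about"><img src="/logo.png">'
    urlList = re.findall(r'href=("[^"]*")', html)   # ['"/about"']
    imgList = re.findall(r'src=("[^"]*")', html)    # ['"/logo.png"']

Note also that in mode 2 the new __init__ leaves allowed_domains empty, so the self.allowed_domains[0] added in parse() would raise IndexError there; only mode 1 appears functional as of this commit.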
crawler/crawler/spiders/ImgCrawler.pyc
No preview for this file type
crawler/crawler/spiders/__init__.pyc
No preview for this file type
run.sh 0 → 100755
... ... @@ -0,0 +1,25 @@
  1 +#!/bin/bash
  2 +
  3 +if [ $# -lt 2 ]; then
  4 + echo 'Usage:'
  5 + echo '1. run 1 allowed_domain'
  6 + echo 'crawl all imgs in the allowed_domain'
  7 + echo '2. run 2 input_file'
  8 + echo 'crawl the imgs and attach the corresponding tags in input_file'
  9 + exit 0
  10 +fi
  11 +
  12 +cd crawler/crawler
  13 +input_file=input.txt
  14 +
  15 +if [ $1 = 1 ]; then
  16 + echo $1 > ${input_file}
  17 + echo $2 >> ${input_file}
  18 + scrapy crawl img
  19 +elif [ $1 = 2 ]; then
  20 + echo $1 > ${input_file}
  21 + echo $2 >> ${input_file}
  22 + scrapy crawl img
  23 +else
  24 + echo 'Invalid mode!'
  25 +fi
... ...
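
Example invocation, matching mode 1 of the usage text and the committed input.txt: ./run.sh 1 www.cs.tsinghua.edu.cn, which writes the two lines into crawler/crawler/input.txt and then runs scrapy crawl img from that directory. Modes 1 and 2 currently execute identical commands; they differ only in how the spider interprets the second line of input.txt.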