Commit a9d3ceff2adf356037aa157b5edc7a78b43ccc37

Authored by Chunk
1 parent 4737e166

Douban as an example.

 #!/bin/zsh
 # chunk @ 2014
+########################################################################################
+##
+## F**k World!
+##
+########################################################################################
 
-####################################################################
+##############################################
 ## environment variables
-####################################################################
-export export TERM=xterm
+##############################################
+export TERM=linux
 source /home/hadoop/.zshrc
 v env0
 
-####################################################################
+##############################################
 ## additional files list
-####################################################################
+##############################################
 FILE=hehe.json
 
 #scrapy runspider spider/test.py
 cd ./spider/mspider/
 [ -f $FILE ] && rm $FILE
-scrapy crawl dmoz -o $FILE
+scrapy crawl douban -o $FILE
spider/mspider/hehe.json
1 -[{"link": ["/"], "title": ["Top"]},  
2 -{"link": ["/Computers/"], "title": ["Computers"]},  
3 -{"link": ["/Computers/Programming/"], "title": ["Programming"]},  
4 -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},  
5 -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},  
6 -{"link": [], "title": []},  
7 -{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]},  
8 -{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]},  
9 -{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]},  
10 -{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]},  
11 -{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]},  
12 -{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]},  
13 -{"link": ["/"], "title": ["Top"]},  
14 -{"link": ["/Computers/"], "title": ["Computers"]},  
15 -{"link": ["/Computers/Programming/"], "title": ["Programming"]},  
16 -{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},  
17 -{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},  
18 -{"link": [], "title": []},  
19 -{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]},  
20 -{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]},  
21 -{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]},  
22 -{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]},  
23 -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]},  
24 -{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]},  
25 -{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]},  
26 -{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]},  
27 -{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]},  
28 -{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]},  
29 -{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]},  
30 -{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]},  
31 -{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]},  
32 -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]},  
33 -{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]},  
34 -{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]},  
35 -{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]},  
36 -{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]},  
37 -{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]},  
38 -{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]},  
39 -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]},  
40 -{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]},  
41 -{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]},  
42 -{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]},  
43 -{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]},  
44 -{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}]  
45 \ No newline at end of file 1 \ No newline at end of file
  2 +[{"director": ["\u80af\u5c3c\u601d\u00b7\u5e03\u62c9\u7eb3"], "rate": ["7.0"], "name": ["\u7070\u59d1\u5a18 Cinderella"]}]
46 \ No newline at end of file 3 \ No newline at end of file
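The \uXXXX sequences in the new record are ordinary JSON Unicode escapes; decoded, they read 灰姑娘 Cinderella and 肯尼思·布拉纳 (Kenneth Branagh). A minimal sketch of reading the feed back, run from spider/mspider/ where the script above writes it:

# -*- coding: utf-8 -*-
import json

# Read the feed written by `scrapy crawl douban -o hehe.json`.
with open('hehe.json') as f:
    records = json.load(f)

movie = records[0]
print(movie['name'][0])       # 灰姑娘 Cinderella
print(movie['director'][0])   # 肯尼思·布拉纳 (Kenneth Branagh)
print(movie['rate'][0])       # 7.0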
spider/mspider/mspider/items.py
@@ -13,7 +13,9 @@ class MspiderItem(scrapy.Item):
     # name = scrapy.Field()
     pass
 
-class DmozItem(scrapy.Item):
-    title = scrapy.Field()
-    link = scrapy.Field()
-    desc = scrapy.Field()
+class DoubanItem(scrapy.Item):
+    ind = scrapy.Field()       # row key used by the HBase pipeline
+    name = scrapy.Field()
+    director = scrapy.Field()
+    rate = scrapy.Field()
+
spider/mspider/mspider/items.pyc
No preview for this file type
spider/mspider/mspider/pipelines.py
@@ -5,7 +5,75 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
+from .items import DoubanItem
+
+import happybase
+
+
+class HbaseDumper(object):
+    """Dump scraped items into an HBase table through the Thrift gateway."""
+
+    def __init__(self, Item, tablename=None):
+        self.Item = Item  # the item class itself, not an instance
+        self.table_name = tablename if tablename is not None else self.Item.__name__
+        self.table = None
+        self.connection = None
+
+    def get_table(self):
+        """Return the HBase table, connecting and creating it on first use."""
+        if self.table is not None:
+            return self.table
+
+        if self.connection is None:
+            self.connection = happybase.Connection('HPC-server')
+
+        # One column family per item field.
+        if self.table_name not in self.connection.tables():
+            families = self.Item.fields
+            self.connection.create_table(name=self.table_name, families=families)
+
+        self.table = self.connection.table(name=self.table_name)
+        return self.table
+
+    def store_item(self, item):
+        if self.table is None:
+            self.table = self.get_table()
+        # 'field:' addresses the empty qualifier in the 'field' column family.
+        data = {}
+        for key in item.keys():
+            data[key + ':'] = item[key]
+        self.table.put(item['ind'], data)
+
+    def store_items(self, items):
+        if self.table is None:
+            self.table = self.get_table()
+
+        dict_databuf = {}
+        for item in items:
+            data = {}
+            for key in item.keys():
+                data[key + ':'] = item[key]
+            dict_databuf[item['ind']] = data
+
+        # Batched writes; the context manager flushes any remainder on exit.
+        with self.table.batch(batch_size=5000) as b:
+            for rowkey, data in dict_databuf.items():
+                b.put(rowkey, data)
+
 
 class MspiderPipeline(object):
+    def __init__(self):
+        self.hbasedumper = HbaseDumper(DoubanItem)
+
     def process_item(self, item, spider):
+        self.hbasedumper.store_item(item)
         return item
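The dumper can be exercised on its own; a minimal sketch, assuming the hard-coded Thrift gateway 'HPC-server' is reachable, happybase is installed, and the mspider package is importable from the project root. The 'row-0001' key is made up for illustration:

from mspider.items import DoubanItem
from mspider.pipelines import HbaseDumper

item = DoubanItem()
item['ind'] = 'row-0001'          # hypothetical row key; store_item() uses it as the HBase rowkey
item['name'] = 'Cinderella'
item['director'] = 'Kenneth Branagh'
item['rate'] = '7.0'

dumper = HbaseDumper(DoubanItem)  # table name defaults to the class name, 'DoubanItem'
dumper.store_item(item)           # connects and creates the table on first use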
spider/mspider/mspider/pipelines.pyc 0 → 100644
No preview for this file type
spider/mspider/mspider/settings.py
@@ -5,7 +5,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-# http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
 BOT_NAME = 'mspider'
@@ -15,3 +15,21 @@ NEWSPIDER_MODULE = 'mspider.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'mspider (+http://www.yourdomain.com)'
+
+
+
+## Custom for Chunk
+
+# anti-banning
+COOKIES_ENABLED = True
+# USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
+    'random_useragent.RandomUserAgentMiddleware': 400
+}
+# http://www.useragentstring.com/pages/useragentstring.php
+USER_AGENT_LIST = 'useragents.txt'
+# DOWNLOAD_DELAY = 0.1
+
+# ITEM_PIPELINES = {'mspider.pipelines.MspiderPipeline': 300, }
+
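Note that ITEM_PIPELINES is left commented out, so the HBase-backed MspiderPipeline added above is never registered with Scrapy; activating it takes exactly the line in the comment:

# spider/mspider/mspider/settings.py
ITEM_PIPELINES = {
    'mspider.pipelines.MspiderPipeline': 300,
}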
spider/mspider/mspider/settings.pyc
No preview for this file type
spider/mspider/mspider/spiders/test000.py
+# -*- coding: utf-8 -*-
 __author__ = 'chunk'
 
+from ..items import DoubanItem
+
 import scrapy
-from ..items import DmozItem
+from scrapy import FormRequest
+from scrapy.http import Request
+from scrapy.utils.response import get_base_url
+from urlparse import urljoin
+import re
+from hashlib import md5
+
 
-class DmozSpider(scrapy.Spider):
-    name = "dmoz"
-    allowed_domains = ["dmoz.org"]
+class DoubanSpider(scrapy.Spider):
+    name = "douban"
+    allowed_domains = ["douban.com"]
     start_urls = [
-        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
-        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+        "http://movie.douban.com/tag/",
     ]
 
+    def start_requests(self):
+        # Pre-set the 'bid' cookie so Douban serves pages to the crawler.
+        return [FormRequest("http://movie.douban.com/tag/",
+                            cookies={'bid': "SCAM2676P0o"},
+                            callback=self.parse)]
+
     def parse(self, response):
         """
         This is the default callback used by Scrapy to process downloaded responses
@@ -19,9 +32,27 @@ class DmozSpider(scrapy.Spider):
 
         Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
         """
-        for sel in response.xpath('//ul/li'):
-            item = DmozItem()
-            item['title'] = sel.xpath('a/text()').extract()
-            item['link'] = sel.xpath('a/@href').extract()
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match(r'http://movie\.douban\.com/tag/\w+', url, re.U):
+                # yield, not return, so every matching tag page is followed
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_list)
+
+    def parse_list(self, response):
+        base_url = get_base_url(response)
+        for url in response.xpath('//@href').extract():
+            url = urljoin(base_url, url.strip())
+            if re.match(r'http://movie\.douban\.com/subject/\w+', url, re.U):
+                yield Request(url, cookies={'bid': "SCAM2676P0o"}, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = DoubanItem()
+        # Row key for the HBase pipeline; md5 is imported above, presumably for this.
+        item['ind'] = md5(response.url).hexdigest()
+        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
+        item['director'] = response.xpath('//a[@rel="v:directedBy"]/text()').extract()
+        item['rate'] = response.xpath('//*[@property="v:average"]/text()').extract()
+        return item
 
-            yield item
+    def parse_details(self, response):
+        pass
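The three item XPaths can be smoke-tested offline with Scrapy's Selector; a minimal sketch against hypothetical, trimmed-down markup in the shape of a Douban subject page:

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

# Hypothetical markup, shaped like a Douban movie subject page.
html = u'''
<div id="content">
  <h1><span property="v:itemreviewed">灰姑娘 Cinderella</span></h1>
</div>
<a rel="v:directedBy">肯尼思·布拉纳</a>
<strong property="v:average">7.0</strong>
'''

sel = Selector(text=html)
print(sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract())  # name
print(sel.xpath('//a[@rel="v:directedBy"]/text()').extract())       # director
print(sel.xpath('//*[@property="v:average"]/text()').extract())     # rate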
spider/mspider/mspider/spiders/test000.pyc
No preview for this file type
spider/mspider/useragents.txt 0 → 100644
@@ -0,0 +1,30 @@
+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36
+Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)
+Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
+Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)
+Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)
+Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko
+Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0
+Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0
+Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
+Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
\ No newline at end of file
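For context, the RandomUserAgentMiddleware registered in settings.py picks one line of this file per request. A minimal sketch of the idea, not the actual random_useragent implementation:

import random

class RandomUserAgentSketch(object):
    """Minimal sketch of a random user-agent downloader middleware."""

    def __init__(self, ua_file):
        with open(ua_file) as f:
            self.user_agents = [line.strip() for line in f if line.strip()]

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST as set in settings.py above
        return cls(crawler.settings.get('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # One user agent drawn at random for each outgoing request.
        request.headers.setdefault('User-Agent', random.choice(self.user_agents))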