1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
| from sbdspider.scrapy_redis.spiders import RedisSpider
from scrapy.http import Request from sbdspider.items import SbdspiderItem import requests import re import datetime
class SobaidupanSpider(RedisSpider):
    """Crawl www.sobaidupan.com and emit one item per resource page.

    Every fetched page is scanned for a resource download link; when one is
    found, the page fields are extracted with regular expressions and the
    intermediate download page is fetched to recover the final share URL.
    All links on every page are followed recursively.
    """

    name = "sobaidu"

    # File-extension groups used by classifyRes to choose a category id.
    ckm_music = ('mp3', 'wav', 'mid', 'wma', 'cda', 'acc')
    ckm_picture = ('jpg', 'jpeg', 'png', 'gif', 'psd', 'bmp', 'svg', 'tga')
    ckm_ebook = ('txt', 'pdf', 'mobi', 'azw', 'mbp', 'ebx')
    ckm_docfile = ('doc', 'docx', 'wps', 'ppt', 'xls', 'xlsx')
    ckm_app = ('apk', 'ipa', 'sis', 'sisx', 'xap')
    # The trailing comma is required: ('torrent') is a plain string, which
    # would turn the membership test into a substring test.
    ckm_torrent = ('torrent',)
    ckm_movie = ('mkv', 'rmvb', 'mp4', 'rm', 'avi', 'wmv', 'asf', 'asx',
                 'mpg', 'mpeg', 'mpe', '3gp', 'flv', 'f4v', 'vob', 'mov')
    ckm_apeflac = ('ape', 'flac')
    # Keywords that mark a title as tutorial/teaching material.
    ckm_teach = (u'教程', u'入门', u'精讲', u'详解', u'课程')

    allowed_domains = ["www.sobaidupan.com"]
    redis_key = "sobaidupan:start_urls"
    start_urls = ['http://www.sobaidupan.com/']

    # Seconds to wait for the intermediate download page before giving up.
    resolve_timeout = 10

    def start_requests(self):
        """Yield the initial requests for every URL in ``start_urls``."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, errback=self.errback)

    def parse(self, response):
        """Yield the item found on ``response`` (if any), then follow links.

        Only real items are yielded; pages without a resource produce no
        item but still have their links followed.
        """
        item = self.parse_item(response)
        if item is not None:
            yield item

        for href in response.css('a::attr(href)').extract():
            if not href:
                continue
            yield Request(response.urljoin(href), callback=self.parse)

    @staticmethod
    def _group_or(match, default=''):
        """Return the first capture group of ``match``, or ``default``."""
        return match.group(1) if match is not None else default

    def parse_item(self, response):
        """Build a ``SbdspiderItem`` from a resource page.

        Returns ``None`` when the page has no download link or title, when
        the intermediate download page cannot be fetched, or when that page
        does not contain a redirect URL.
        """
        text = response.text

        res = re.search(
            r'href="(http://sbdp\.baidudaquan\.com/down\.asp\?id=.+?)"', text)
        title = re.search(r'<h1>(.+?)</h1>', text)
        if res is None or title is None:
            return None

        # NOTE(review): this is a blocking call inside a Scrapy callback; the
        # timeout bounds the stall, but a Scrapy Request would be preferable.
        try:
            ssource = requests.get(res.group(1), timeout=self.resolve_timeout)
        except requests.RequestException as exc:
            self.logger.warning(
                "Could not resolve download link %s: %r", res.group(1), exc)
            return None
        ssource.encoding = 'utf-8'

        resurl = re.search(r"URL=(.+?)'", ssource.text)
        if resurl is None:
            return None

        uid = re.search(r'user-(\d*)-1\.html', text)
        name = re.search(u'<div align="center">用户名:(.+?)</div></td>', text)
        avatar = re.search(
            r'<img src="(.+?)" width="100" height="100" border="0">', text)
        ressize = re.search(u'<B>资源大小:</B>(.+?) <b>', text)
        description = re.search(u'<B>资源类别:</B>(.+?)</div>', text)
        sharetime = re.search(u'<b>分享日期:</b>(.+?)</div>', text)

        item = SbdspiderItem()
        item['tid'] = 1
        item['cid'] = self.classifyRes(title.group(1))
        # Uploader fields are optional on the page; a missing one must not
        # abort the whole callback (and with it the link extraction).
        item['uid'] = self._group_or(uid)
        item['name'] = self._group_or(name)
        item['avatar'] = self._group_or(avatar)
        item['title'] = title.group(1)
        item['size'] = self._group_or(ressize, '未知')
        item['url'] = resurl.group(1)
        item['pwd'] = ''
        item['description'] = self._group_or(description)
        item['available'] = 1
        if sharetime is not None:
            item['sharetime'] = sharetime.group(1)
        else:
            # Fall back to the crawl time when the page shows no share date.
            item['sharetime'] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
        return item

    def classifyRes(self, title):
        """Map a resource title to a numeric category id.

        The text after the last ``.`` (or the whole title when it has no
        dot) is lower-cased and compared with the extension groups.  If no
        group matches, the same text is searched for teaching keywords.

        Returns:
            2 music, 3 picture, 4 ebook, 5 document, 6 torrent, 7 app,
            8 movie, 9 ape/flac, 10 teaching material, otherwise 1.
        """
        ext_title = title.rsplit('.', 1)[-1].lower()

        categories = (
            (self.ckm_music, 2),
            (self.ckm_picture, 3),
            (self.ckm_ebook, 4),
            (self.ckm_docfile, 5),
            (self.ckm_torrent, 6),
            (self.ckm_app, 7),
            (self.ckm_movie, 8),
            (self.ckm_apeflac, 9),
        )
        for extensions, classid in categories:
            if ext_title in extensions:
                return classid

        if any(keyword in ext_title for keyword in self.ckm_teach):
            return 10
        return 1

    def errback(self, failure):
        """Log a failed request instead of discarding it silently."""
        self.logger.error("Request failed: %r", failure)
|