UU Blog

Python基于Scrapy框架写个麻雀虽小五脏俱全的爬虫

这个东西写有半个月了,最近工作忙没空理会,看了下数据,跑得还不错。

用各种框架和开源项目配合,站在巨人的肩膀上,轻轻松松完成一个麻雀虽小五脏俱全的爬虫。

采集URL,大规模URL去重,分类,入库,反爬虫。而完成这些,只需要寥寥不到三百行代码。

Scrapy工程目录如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
./
├── crawls # 开启持久化会产生一些记录文件
├── sbdspider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py # 定义要采集的字段
│   ├── items.pyc
│   ├── middlewares # 中间件 主要是随机选择 UserAgent和代理IP 主要用来反爬虫
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── RandomProxy.py
│   │   ├── RandomProxy.pyc
│   │   ├── RandomUserAgent.py
│   │   └── RandomUserAgent.pyc
│   ├── middlewares.py
│   ├── pipelines.py # 入库MySQL
│   ├── pipelines.pyc
│   ├── scrapy_redis # 用的九茶的模块 用Bloomfilter+redis去重
│   │   ├── BloomfilterOnRedis.py
│   │   ├── BloomfilterOnRedis.pyc
│   │   ├── connection.py
│   │   ├── connection.pyc
│   │   ├── dupefilter.py
│   │   ├── dupefilter.pyc
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── isExists.py
│   │   ├── pipelines.py
│   │   ├── queue.py
│   │   ├── queue.pyc
│   │   ├── scheduler.py
│   │   ├── scheduler.pyc
│   │   ├── spiders.py
│   │   ├── spiders.pyc
│   │   └── tests.py
│   ├── settings.py # 配置 pipeline、middlewares的引用声明主要在这里
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── sobaidupan.py # 爬虫主体 主要是提取数据 分类
│       └── sobaidupan.pyc
└── scrapy.cfg

首先是字段的定义,我需要保存哪些信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import scrapy

class SbdspiderItem(scrapy.Item):
    """Fields collected for one shared netdisk resource and the user who shared it."""

    tid = scrapy.Field()  # netdisk type ID
    cid = scrapy.Field()  # resource category ID
    uid = scrapy.Field()  # ID of the user who owns the resource
    name = scrapy.Field()  # user name
    avatar = scrapy.Field()  # user avatar image URL
    title = scrapy.Field()  # resource title
    size = scrapy.Field()  # resource size
    url = scrapy.Field()  # resource URL
    pwd = scrapy.Field()  # resource password
    description = scrapy.Field()  # resource description
    available = scrapy.Field()  # whether the resource is available
    sharetime = scrapy.Field()  # time the resource was shared

然后我设计了这样的数据库表来保存它们:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
-- phpMyAdmin SQL Dump
-- version 4.6.6
-- https://www.phpmyadmin.net/
--
-- Host: localhost
-- Generation Time: 2017-03-10 05:44:53
-- 服务器版本: 5.5.53-log
-- PHP Version: 5.5.38

SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";


/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;

--
-- Database: `yzy_data`
--

-- --------------------------------------------------------

--
-- 表的结构 `yzy_class`
--

-- Lookup table of resource categories (category id -> display name).
CREATE TABLE `yzy_class` (
`id` int(11) NOT NULL,
`cname` varchar(10) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;

--
-- Dumping data for table `yzy_class`
--
-- Resources only need to store the category ID: this avoids repeating the
-- category name in every row, saves space, and keeps the column easy to
-- index once the data set grows.
INSERT INTO `yzy_class` (`id`, `cname`) VALUES
(1, '其它'),
(2, '音乐'),
(3, '图片'),
(4, '电子书'),
(5, '文档'),
(6, '种子'),
(7, '手机APP'),
(8, '影视'),
(9, '无损音乐'),
(10, '教程');

-- --------------------------------------------------------

--
-- 表的结构 `yzy_resources`
--
-- This is the main table: nearly every field defined in items.py is stored here.
-- `uid` holds the `yzy_users`.`id` of the sharing user (0 when the user row
-- could not be created), not the site's own user id.
CREATE TABLE `yzy_resources` (
  `id` int(11) NOT NULL,
  `tid` tinyint(3) UNSIGNED NOT NULL,
  `cid` tinyint(3) UNSIGNED NOT NULL,
  `uid` int(11) NOT NULL,
  `title` varchar(80) NOT NULL,
  `size` varchar(10) NOT NULL,
  `url` varchar(255) NOT NULL,
  `pwd` varchar(10) NOT NULL,
  `description` varchar(100) NOT NULL,
  `available` tinyint(1) NOT NULL,
  -- The pipeline's INSERT writes `sharetime` as 'YYYY-MM-DD HH:MM:SS';
  -- without this column every resource INSERT fails with "Unknown column".
  `sharetime` datetime NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;

-- --------------------------------------------------------

--
-- 表的结构 `yzy_type`
--
-- Lookup table of netdisk types, kept so that resources from several
-- different netdisk providers can be collected later on.
CREATE TABLE `yzy_type` (
`id` int(11) NOT NULL,
`name` char(10) NOT NULL,
`ename` char(10) NOT NULL,
`shortname` char(4) NOT NULL,
`url` varchar(255) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;

--
-- 转存表中的数据 `yzy_type`
--

INSERT INTO `yzy_type` (`id`, `name`, `ename`, `shortname`, `url`) VALUES
(1, '百度网盘', 'dupan', '度盘', 'https://pan.baidu.com/');

-- --------------------------------------------------------

--
-- 表的结构 `yzy_users`
--
-- Stores netdisk user information.
-- NOTE(review): `tid` is NOT NULL and has no default, so an INSERT that omits
-- it is rejected when the server runs in strict SQL mode -- confirm that
-- every writer supplies it.
CREATE TABLE `yzy_users` (
`id` int(11) NOT NULL,
`tid` tinyint(4) NOT NULL,
`uid` varchar(20) NOT NULL,
`uname` varchar(20) NOT NULL,
`avatar` varchar(255) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;

--
-- Indexes for dumped tables
--

--
-- Indexes for table `yzy_class`
--
ALTER TABLE `yzy_class`
ADD PRIMARY KEY (`id`);

--
-- Indexes for table `yzy_resources`
--
ALTER TABLE `yzy_resources`
ADD PRIMARY KEY (`id`);

--
-- Indexes for table `yzy_type`
--
ALTER TABLE `yzy_type`
ADD PRIMARY KEY (`id`);

--
-- Indexes for table `yzy_users`
--
ALTER TABLE `yzy_users`
ADD PRIMARY KEY (`id`);

--
-- 在导出的表使用AUTO_INCREMENT
--

--
-- 使用表AUTO_INCREMENT `yzy_class`
--
ALTER TABLE `yzy_class`
MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=11;
--
-- 使用表AUTO_INCREMENT `yzy_resources`
--
ALTER TABLE `yzy_resources`
MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
--
-- 使用表AUTO_INCREMENT `yzy_type`
--
ALTER TABLE `yzy_type`
MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=2;
--
-- 使用表AUTO_INCREMENT `yzy_users`
--
ALTER TABLE `yzy_users`
MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

注意看注释,还是很好理解这些表干嘛用的。

pipelines.py获取到的Items怎么处理

这里有两个类,其实是两种处理方式,一种是默认的,我改了一下,采集到的数据,以JSON的形式保存。

sobaiduPipeline才是重点,主要有两次插入数据,一次是插入用户数据,一次获取到用户ID后,插入到yzy_resources表。

数据库连接参数(MYSQL_HOST、MYSQL_USER、MYSQL_PASS、MYSQL_NAME)定义在 settings.py 里面

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json  
import MySQLdb
from scrapy.exceptions import DropItem
import settings


class SbdspiderPipeline(object):
    """Append every scraped item to ``items.jl``, one JSON object per line."""

    def __init__(self):
        """Open (and truncate) ``items.jl`` in the current working directory."""
        # Binary mode: process_item() writes explicitly encoded bytes, so the
        # output does not depend on the platform's default text encoding.
        self.file = open('items.jl', 'wb')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged."""
        line = json.dumps(dict(item)) + "\n"
        # json.dumps() emits pure ASCII by default, so encoding is lossless
        # and works whether ``line`` is a byte string or a text string.
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes (Scrapy pipeline hook)."""
        self.file.close()


# Persist items to MySQL
class sobaiduPipeline(object):
    """Store scraped items in MySQL.

    Each item causes up to two writes: the sharing user is looked up in (or
    added to) ``yzy_users``, then the resource itself is inserted into
    ``yzy_resources`` with that user's row id.

    All values are passed to the driver as query parameters. Titles, user
    names and descriptions are scraped text and may contain quotes, which
    would break (or inject into) a statement built by string formatting.
    """

    def __init__(self):
        """Open the connection described by the ``MYSQL_*`` settings."""
        self.conn = MySQLdb.connect(host=settings.MYSQL_HOST,
                                    user=settings.MYSQL_USER,
                                    passwd=settings.MYSQL_PASS,
                                    db=settings.MYSQL_NAME,
                                    charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()
        # Backward-compatible alias for the original (misspelled) attribute.
        self.curosr = self.cursor

    def process_item(self, item, spider):
        """Insert *item* into ``yzy_resources`` and return it.

        Database errors are reported and swallowed so that one bad row does
        not stop the crawl; the item is passed on either way.
        """
        try:
            userid = self.insert_user(item['uid'], item['name'],
                                      item['avatar'], item['tid'])
            self.cursor.execute(
                """INSERT INTO
                yzy_resources(tid,cid,uid,title,size,url,pwd,description,available,sharetime)
                VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (item['tid'], item['cid'], userid, item['title'],
                 item['size'], item['url'], item['pwd'],
                 item['description'], item['available'], item['sharetime']))
            # No-op for MyISAM tables; required for the row to persist if the
            # table uses a transactional engine.
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error:%d:%s" % (e.args[0], e.args[1]))

        return item

    def insert_user(self, uid, name, pic, tid=1):
        """Return the ``yzy_users.id`` for *uid*, inserting the user if needed.

        :param uid: the site's user id for the sharing user.
        :param name: user name, stored as ``uname``.
        :param pic: avatar URL.
        :param tid: netdisk type id stored with a newly created user; defaults
            to 1, the only type seeded in ``yzy_type``. ``yzy_users.tid`` is
            NOT NULL without a default, so omitting it from the INSERT fails
            on servers running in strict SQL mode.
        :returns: the user's row id, or 0 if it could not be found or created.
        """
        userid = 0
        try:
            self.cursor.execute(
                "SELECT id FROM yzy_users WHERE uid=%s LIMIT 1", (uid,))
            row = self.cursor.fetchone()
            if row is not None:
                userid = row[0]
            else:
                inserted = self.cursor.execute(
                    """INSERT INTO yzy_users(tid,uid,uname,avatar)
                    VALUES(%s,%s,%s,%s)""",
                    (tid, uid, name, pic))
                if inserted == 1:
                    userid = self.cursor.lastrowid
                    self.conn.commit()
        except MySQLdb.Error as e:
            print("Error:%d:%s" % (e.args[0], e.args[1]))

        return userid

sobaidupan.py 蜘蛛的主体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
from sbdspider.scrapy_redis.spiders import RedisSpider
# 这里我引入的是九茶的模块 RedisSpider
from scrapy.http import Request
from sbdspider.items import SbdspiderItem
import requests
import re
import datetime

class SobaidupanSpider(RedisSpider):
    """Spider for www.sobaidupan.com resource pages.

    Every crawled page is scanned for a resource detail block. When one is
    found, the intermediate download page is fetched to resolve the real
    share URL and a ``SbdspiderItem`` is emitted. Every link on the page is
    then followed with the same callback.
    """

    name = "sobaidu"

    # Category keymaps. The id after each tuple is the value classifyRes()
    # returns for it:
    # 1 other, 2 music, 3 picture, 4 e-book, 5 document, 6 torrent,
    # 7 mobile app, 8 video, 9 lossless music, 10 tutorial.
    # 'acc' is kept for backward compatibility; 'aac' is the actual audio
    # extension it was presumably meant to be.
    ckm_music = ('mp3', 'wav', 'mid', 'wma', 'cda', 'acc', 'aac')  # id 2
    ckm_picture = ('jpg', 'jpeg', 'png', 'gif', 'psd', 'bmp', 'svg', 'tga')  # id 3
    ckm_ebook = ('txt', 'pdf', 'mobi', 'azw', 'mbp', 'ebx')  # id 4
    ckm_docfile = ('doc', 'docx', 'wps', 'ppt', 'xls', 'xlsx')  # id 5
    ckm_app = ('apk', 'ipa', 'sis', 'sisx', 'xap')  # id 7
    # Trailing comma matters: ('torrent') is a plain string, which turns the
    # membership test into a substring match ('t', 'rent', '' all matched).
    ckm_torrent = ('torrent',)  # id 6
    ckm_movie = ('mkv', 'rmvb', 'mp4', 'rm', 'avi', 'wmv', 'asf', 'asx', 'mpg',
                 'mpeg', 'mpe', '3gp', 'flv', 'f4v', 'vob', 'mov')  # id 8
    ckm_apeflac = ('ape', 'flac')  # id 9
    ckm_teach = (u'教程', u'入门', u'精讲', u'详解', u'课程')  # id 10

    allowed_domains = ["www.sobaidupan.com"]
    redis_key = "sobaidupan:start_urls"
    start_urls = ['http://www.sobaidupan.com/']

    def start_requests(self):
        """Yield one request per entry of ``start_urls``."""
        for u in self.start_urls:
            yield Request(u, callback=self.parse,
                          errback=self.errback)

    def parse(self, response):
        """Yield the page's item (if it has one), then follow every link."""
        item = self.parse_item(response)
        # parse_item() returns None for pages that are not resource pages.
        if item is not None:
            yield item
        for href in response.css('a::attr(href)').extract():
            if not href:
                continue
            yield Request(response.urljoin(href), callback=self.parse,
                          errback=self.errback)

    @staticmethod
    def _group_or(match, default):
        """Return ``match.group(1)``, or *default* if the regex did not match."""
        return match.group(1) if match is not None else default

    # Extract the item fields
    def parse_item(self, response):
        """Build a ``SbdspiderItem`` from a resource page.

        :returns: the populated item, or ``None`` when the page has no title
            or download link, or when the real share URL cannot be resolved.
        """
        text = response.text
        title = re.search(r'<h1>(.+?)</h1>', text)
        res = re.search(r'href="(http://sbdp\.baidudaquan\.com/down\.asp\?id=.+?)"', text)
        if res is None or title is None:
            return None

        # NOTE: this is a blocking call made outside Scrapy's downloader, so
        # it bypasses the proxy and user-agent middlewares. The timeout keeps
        # an unresponsive host from stalling the crawl indefinitely.
        try:
            ssource = requests.get(res.group(1), timeout=10)
        except requests.RequestException as exc:
            self.logger.warning("Could not resolve %s: %s", res.group(1), exc)
            return None
        ssource.encoding = 'utf-8'
        resurl = re.search("URL=(.+?)'", ssource.text)
        # re.search("URL=(http://pan\.baidu\.com/share/link\?shareid=.+?)'",ssource.text)
        if resurl is None:
            return None

        uid = re.search(r'user-(\d*)-1\.html', text)
        name = re.search(u'<div align="center">用户名:(.+?)</div></td>', text)
        avatar = re.search(r'<img src="(.+?)" width="100" height="100" border="0">', text)
        ressize = re.search(u'<B>资源大小:</B>(.+?)&nbsp;<b>', text)
        description = re.search(u'<B>资源类别:</B>(.+?)</div>', text)
        sharetime = re.search(u'<b>分享日期:</b>(.+?)</div>', text)

        item = SbdspiderItem()
        item['tid'] = 1
        item['cid'] = self.classifyRes(title.group(1))
        # The user block is optional: fall back to empty strings instead of
        # raising AttributeError, which would lose the resource and abort
        # link following for the whole page.
        item['uid'] = self._group_or(uid, u'')
        item['name'] = self._group_or(name, u'')
        item['avatar'] = self._group_or(avatar, u'')
        item['title'] = title.group(1)
        # Unicode literal: a byte string here cannot be mixed with the other
        # unicode fields when they are formatted together on Python 2.
        item['size'] = self._group_or(ressize, u'未知')
        item['url'] = resurl.group(1)
        item['pwd'] = ''
        item['description'] = self._group_or(description, '')
        item['available'] = 1
        if sharetime is not None:
            item['sharetime'] = sharetime.group(1)
        else:
            # No share date on the page: use the time of the crawl.
            item['sharetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item

    # Roughly classify the resource
    def classifyRes(self, title):
        """Return the category id for *title*.

        The text after the last dot (the whole title if there is no dot) is
        compared case-insensitively with the extension keymaps. If nothing
        matches, the title is searched for tutorial keywords. The default
        category is 1 (other).
        """
        ext_title = title.split('.')[-1].lower()

        by_extension = (
            (self.ckm_music, 2),
            (self.ckm_picture, 3),
            (self.ckm_ebook, 4),
            (self.ckm_docfile, 5),
            (self.ckm_torrent, 6),
            (self.ckm_app, 7),
            (self.ckm_movie, 8),
            (self.ckm_apeflac, 9),
        )
        for extensions, classid in by_extension:
            if ext_title in extensions:
                return classid

        # Keywords describe the resource, not its extension, so look for
        # them anywhere in the title.
        if any(keyword in title for keyword in self.ckm_teach):
            return 10
        return 1

    def errback(self, failure):
        """Log a failed request instead of discarding it silently."""
        self.logger.error("Request failed: %r", failure)

多IP代理采集反爬虫 IPProxys+RandomUserAgent

先下载安装这个 IPProxyPool

搭建成功并且已经采集到代理数据后,执行 curl 'http://127.0.0.1:8000/?types=0&count=5&country=国内'(URL 要加引号,否则 shell 会把 & 当成后台运行符,后面的参数会丢失),可以看到返回 JSON 格式的数据。这样就成功一半了。

主要调用接口和随机切换代码 RandomProxy.py

1
2
3
4
5
6
7
8
9
10
11
12
class RandomProxy(object):
    """Middleware that sends each request through a randomly chosen proxy.

    The proxy list is fetched once, at construction time, from the local
    IPProxyPool HTTP API. Each entry is indexed as ``[ip, port, ...]``.
    """

    def __init__(self):
        """Load the proxy list from the IPProxyPool service on 127.0.0.1:8000."""
        self.r = requests.get(u'http://127.0.0.1:8000/?types=0&count=&country=国内')
        self.ip_ports = json.loads(self.r.text)

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` to a random proxy from the list.

        When the list is empty the request is left untouched, so it goes out
        directly instead of failing with IndexError from ``random.choice``.
        """
        if not self.ip_ports:
            return
        # Pick one entry at random
        ip_port = random.choice(self.ip_ports)
        # Build the proxy URL and attach it to the request
        request.meta['proxy'] = "http://%s:%s" % (ip_port[0], ip_port[1])

最后怎么在服务器上挂机采集,爬虫持久化

1
nohup scrapy crawl sobaidu -s JOBDIR=crawls/sobaidu-1 1>/dev/null 2>logfile.log &

数据样本,5天28万的数据,主要是代理IP质量不高,否则速度还能上一个台阶,还不是分布式。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
MariaDB [yzy_data]> select count(*) from yzy_resources;
+----------+
| count(*) |
+----------+
| 283888 |
+----------+
1 row in set (0.00 sec)

MariaDB [yzy_data]> select * from yzy_resources limit 10 \G;
*************************** 1. row ***************************
id: 1
tid: 1
cid: 1
uid: 0
title: 《帝王师刘伯温[精品]》.epub
size: 2.65 MB
url: http://pan.baidu.com/share/link?shareid=2676964745&uk=4194551491&fid=646837282049387
pwd:
description: 人物传记
available: 1
sharetime: 2015-11-17 04:59:00
*************************** 2. row ***************************
id: 2
tid: 1
cid: 4
uid: 0
title: 余念.txt
size: 338.77 KB
url: http://pan.baidu.com/s/1jIRZs7W
pwd:
description: /
available: 1
sharetime: 2017-02-14 07:37:00
*************************** 3. row ***************************
id: 3
tid: 1
cid: 4
uid: 0
title: 《千山记》石头与水(晋江金牌推荐超高积分01-13更新至完结).txt
size: 4.07 MB
url: http://pan.baidu.com/s/1geJHGll
pwd:
description: /
available: 1
sharetime: 2017-02-14 07:37:00
*************************** 4. row ***************************
id: 4
tid: 1
cid: 8
uid: 0
title: (微博:小小精灵玩家520)政宗くんのリベンジ 09.mp4
size: 195.44 MB
url: http://pan.baidu.com/s/1c13Bcp6
pwd:
description: 高清动漫下载区/2017年1月新番/政宗君的复仇/(微博:小小精灵玩家520)政宗くんのリベンジ 09.mp4
available: 1
sharetime: 2017-03-04 05:31:00
*************************** 5. row ***************************
id: 5
tid: 1
cid: 1
uid: 0
title: 04 Take It (Previously Unreleased).m4a
size: 5.71 MB
url: http://pan.baidu.com/s/1ntAxIZJ
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
*************************** 6. row ***************************
id: 6
tid: 1
cid: 1
uid: 0
title: 表情.zip
size: 4.96 MB
url: http://pan.baidu.com/s/1gdd1XYV
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
*************************** 7. row ***************************
id: 7
tid: 1
cid: 1
uid: 0
title: 【艾薇儿饭团】07年Flare杂志.rar
size: 563.13 KB
url: http://pan.baidu.com/share/link?shareid=3408670202&uk=1042245391
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
*************************** 8. row ***************************
id: 8
tid: 1
cid: 1
uid: 0
title: 【艾薇儿饭团】2003滚石杂志.rar
size: 3 MB
url: http://pan.baidu.com/share/link?shareid=424894405&uk=1042245391
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
*************************** 9. row ***************************
id: 9
tid: 1
cid: 1
uid: 0
title: 【饭团资源】致敬艾薇儿.rar
size: 75.64 MB
url: http://pan.baidu.com/share/link?shareid=1371654997&uk=1042245391
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
*************************** 10. row ***************************
id: 10
tid: 1
cid: 1
uid: 0
title: AVRIL.Candy.zip
size: 4.33 MB
url: http://pan.baidu.com/s/1ntCy8sx
pwd:
description: /
available: 1
sharetime: 2017-03-12 17:16:00
10 rows in set (0.00 sec)

项目下载地址

参考资料

给作者打一针鸡血