Installation: a Python environment (3.8+) is required.
pip install Scrapy
Use the scrapy command to create a project:
scrapy startproject mySpider
After the project is created, Scrapy prints a hint along these lines:
cd mySpider
scrapy genspider <spider_name> <target_domain>
Here <spider_name> is whatever you want to call the spider (for example baidu) and <target_domain> is the site to crawl (for example baidu.com).
The full command looks like this:
scrapy genspider baidu baidu.com
Project structure:
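For reference, a freshly generated project follows the standard Scrapy template sketched below (exact contents can differ slightly between Scrapy versions, and the spider file only appears after running genspider). Note that the code later in this article imports from lyy_spider, the project name the author actually used, rather than mySpider:
mySpider/
    scrapy.cfg                # deployment configuration
    mySpider/
        __init__.py
        items.py              # Item definitions
        middlewares.py        # spider and downloader middlewares
        pipelines.py          # item pipelines
        settings.py           # project settings
        spiders/
            __init__.py       # spiders created by genspider live here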
Open the newly created project in PyCharm, go into the settings, create a project-specific venv Python environment, then save and exit.
Open the terminal; you can see the project's venv is already activated.
In the terminal, run pip install scrapy to install the Scrapy dependency into the project's own environment.
Open the spider under the spiders directory (baidu in the example above; mine is called douban).
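The file that scrapy genspider generates is roughly the skeleton below (the exact template varies by Scrapy version, and this assumes the command was run with movie.douban.com as the domain); the rest of the walkthrough fills in the parse method:
import scrapy


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com"]

    def parse(self, response):
        pass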
The heart of the Scrapy framework is page parsing; that is all you need to focus on, since the framework ships with built-in persistence tools (JSON and other feed exports).
Modify items.py so that the scraped data can be assembled into Item objects:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# data scraped by the spider is assembled into Item objects
class MovieItem(scrapy.Item):
    title = scrapy.Field()    # movie title
    rank = scrapy.Field()     # rating score
    subject = scrapy.Field()  # one-line quote shown under the entry
Next, analyze the target page; the page itself is all we need to care about.
In the browser dev tools, right-click the li tag, choose Copy, then Copy selector, which gives:
#content > div > div.article > ol:nth-child(2) > li:nth-child(1)
Trim the copied result so that it matches every list item:
#content > div > div.article > ol > li
Paste it into the spider (see the "parse the page" comment):
import scrapy
from scrapy import Selector
from lyy_spider.items import MovieItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        print(response.request.headers['User-Agent'])
        # wrap the response in a Selector object
        sel = Selector(response)
        # parse the page: select every movie entry in the list
        list_items = sel.css('#content > div > div.article > ol > li')
        # iterate over the entries
        for list_item in list_items:
            # create a MovieItem for each entry
            movie_item = MovieItem()
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            yield movie_item
Modify settings.py to set a User-Agent; if you don't, the default User-Agent (which carries the project/spider name, something like xxxxspider) is used and the site can easily block it.
import random

# pool of User-Agent strings to choose from
USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
# pick a User-Agent at random (chosen once, when the settings module is loaded)
USER_AGENT = random.choice(USER_AGENT_LIST)
# randomize the download delay (only takes effect when DOWNLOAD_DELAY is set)
RANDOMIZE_DOWNLOAD_DELAY = True
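Because random.choice runs only once when the settings module is imported, every request in a single crawl still shares the same User-Agent. To rotate it per request you can use a downloader middleware instead; the sketch below is my own addition (the class name RandomUserAgentMiddleware and its placement in middlewares.py are assumptions, not part of the original project):
import random

from lyy_spider.settings import USER_AGENT_LIST  # reuse the list defined in settings.py


class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # overwrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
        return None  # returning None lets the request continue through the chain
It would then be enabled in settings.py via DOWNLOADER_MIDDLEWARES = {'lyy_spider.middlewares.RandomUserAgentMiddleware': 400}.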
Open the terminal and run scrapy crawl douban -o douban.csv; after a short wait the file shows up in the current directory.
The above captures only a single page of data.
To collect more, we can either construct the paginated URLs ourselves in start_requests (the approach used below) or follow the next/previous <a> links in the paginator (left commented out in the code, with a separate sketch after it).
The reworked douban.py:
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse
from lyy_spider.items import MovieItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    # start_urls = ["https://movie.douban.com/top250?start=0&filter="]

    def start_requests(self):
        # the Top 250 spans 10 pages of 25 movies each
        for page in range(10):
            yield Request(url=f'https://movie.douban.com/top250?start={page * 25}&filter=')

    def parse(self, response: HtmlResponse):
        print(response.request.headers['User-Agent'])
        # wrap the response in a Selector object
        sel = Selector(response)
        # parse the page: select every movie entry in the list
        list_items = sel.css('#content > div > div.article > ol > li')
        # iterate over the entries
        for list_item in list_items:
            # create a MovieItem for each entry
            movie_item = MovieItem()
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            yield movie_item
        # alternative: grab the paginator links and follow them
        # href_list = sel.css('div.paginator > a::attr(href)')
        # for href in href_list:
        #     url = response.urljoin(href.extract())
        #     yield Request(url=url)
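If you prefer following the paginator links instead of building the URLs by hand, the commented-out lines above can be promoted into a working spider. A minimal sketch (the spider name douban_links is made up for illustration; Scrapy's built-in duplicate filter keeps pages from being crawled twice):
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse
from lyy_spider.items import MovieItem


class DoubanLinkSpider(scrapy.Spider):
    name = "douban_links"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response: HtmlResponse):
        sel = Selector(response)
        # extract the movie entries exactly as before
        for list_item in sel.css('#content > div > div.article > ol > li'):
            movie_item = MovieItem()
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            yield movie_item
        # follow every link found in the paginator; duplicates are filtered automatically
        for href in sel.css('div.paginator > a::attr(href)').extract():
            yield Request(url=response.urljoin(href))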
Add the openpyxl dependency (used to export to Excel):
pip install openpyxl
Then modify pipelines.py; the core idea is to create a workbook when the pipeline is initialized:
def __init__(self):
    self.wb = openpyxl.Workbook()
    ...
The reworked pipelines.py in the project directory:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl


class ExcelPipeline:
    def __init__(self):
        # create a new workbook; wb.create_sheet() could add more sheets,
        # but here we simply use the active (default) sheet
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        # sheet name and header row
        self.ws.title = 'Top250'
        self.ws.append(('标题', '评分', '主题'))

    def close_spider(self, spider):
        # save the workbook once the spider finishes
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        title = item.get('title', '')
        rank = item.get('rank', '')
        subject = item.get('subject', '')
        self.ws.append((title, rank, subject))
        return item
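For the pipeline to receive any items it has to be registered in settings.py; a minimal sketch (the module path lyy_spider.pipelines matches the project name used throughout this article, and the number is the pipeline's priority, with lower values running first):
ITEM_PIPELINES = {
    "lyy_spider.pipelines.ExcelPipeline": 300,
}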
Saving to a MySQL database
Prepare the table-creation statement:
drop table if exists `tb_top_movie`;
create table `tb_top_movie` (
    movie_id int unsigned auto_increment primary key comment '编号',
    title    varchar(50)  not null   comment '标题',
    rating   decimal(3,1) not null   comment '评分',
    subject  varchar(200) default '' comment '主题'
) engine=innodb comment='Top电影表';
Register both pipelines in settings.py (note: this goes into settings.py, not pipelines.py). Giving them different priority values makes the execution order explicit; lower numbers run first:
ITEM_PIPELINES = {
    "lyy_spider.pipelines.ExcelPipeline": 300,
    "lyy_spider.pipelines.DBPipeline": 400,
}
Install the Python package for connecting to MySQL:
pip install pymysql
It is then pulled into pipelines.py with import pymysql.
The straightforward version, without batching:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl
import pymysql


class DBPipeline:
    def __init__(self):
        # open the MySQL connection; replace host/user/password/db
        # with your own connection settings
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='localhost', password='localhost',
                                    db='localhost', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # commit the inserted rows and close the MySQL connection
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        title = item.get('title', 'nil')
        rank = item.get('rank', '0')
        subject = item.get('subject', 'nil')
        self.cursor.execute(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            (title, rank, subject)
        )
        return item


class ExcelPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = 'Top250'
        self.ws.append(('标题', '评分', '主题'))

    def close_spider(self, spider):
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        title = item.get('title', 'nil')
        rank = item.get('rank', 'nil')
        subject = item.get('subject', 'nil')
        self.ws.append((title, rank, subject))
        return item
Inserting data with batching
For inserting a small amount of data (a single row at a time), execute is enough; for inserting data in bulk, use executemany.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl
import pymysql


class DBPipeline:
    def __init__(self):
        # open the MySQL connection; replace host/user/password/db
        # with your own connection settings
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='localhost', password='localhost',
                                    db='localhost', charset='utf8mb4')
        # get a cursor object
        self.cursor = self.conn.cursor()
        # buffer that collects rows for batch inserts
        self.data = []

    def close_spider(self, spider):
        # flush whatever is left in the buffer
        if len(self.data) > 0:
            self._write_to_db()
        # # clear data
        # self.data.clear()
        # close the MySQL connection
        self.conn.close()

    def process_item(self, item, spider):
        # buffer the row instead of inserting it immediately
        self.data.append((item.get('title', 'nil'), item.get('rank', '0'), item.get('subject', 'nil')))
        # write to MySQL once 100 rows have accumulated
        if len(self.data) == 100:
            self._write_to_db()
        return item

    # the insert logic is factored out into its own method
    def _write_to_db(self):
        # insert all buffered rows in one round trip
        self.cursor.executemany(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            self.data
        )
        # commit the batch
        self.conn.commit()
        # clear the buffer so the next batch starts empty
        self.data.clear()


class ExcelPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()
        # the active (default) sheet
        self.ws = self.wb.active
        # sheet name
        self.ws.title = 'Top250'
        # header row
        self.ws.append(('标题', '评分', '主题'))

    def close_spider(self, spider):
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        title = item.get('title', 'nil')
        rank = item.get('rank', 'nil')
        subject = item.get('subject', 'nil')
        self.ws.append((title, rank, subject))
        return item
Complete code: