LEEYANGY

关注TA

拼搏百天

加入社区2,064天
写了322,476字

该文章投稿至Nemo社区 Python 板块复制链接

scrapy爬虫框架学习

发布于 2023/03/25 21:42 7,300浏览 0回复 9,383字

安装过程？ --->> 需要有python环境(3.8+)

pip install Scrapy

使用 Scrapy 命令创建一个项目

scrapy startproject mySpider

创建成功后，如下提示：

cd yourproject
# scrapy genspider [baidu baidu.com]
参数解释：第一个 baidu 是爬虫的名字（比如 baidu 爬虫），baidu.com 是目标站点的域名
完整命令如下：
scrapy genspider baidu baidu.com

项目结构：

使用开发工具 PyCharm 打开刚创建好的项目，打开项目，打开设置，创建项目venv专属python环境，保存退出

打开终端，可以看到已经激活了项目的venv环境

在终端中输入 pip install scrapy 安装项目专属的scrapy依赖

打开spider目录下的爬虫(baidu，这里我的叫douban)

scrapy框架的重点是解析页面，只需要关心解析页面即可，框架自带了一些持久化工具(json......)

修改 items.py 将需要爬到的数据组装成Item对象

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# Scraped data must be assembled into Item objects.
class MovieItem(scrapy.Item):
    """Container for one movie entry scraped from the Top 250 list."""

    title = scrapy.Field()  # filled from the entry's `span.title` text
    rank = scrapy.Field()  # filled from `span.rating_num` (the rating score)
    subject = scrapy.Field()  # filled from `span.inq` (the one-line quote)

接下来是分析目标页面,只要关心页面

右击li标签----复制----复制selector，得到

#content > div > div.article > ol:nth-child(2) > li:nth-child(1)

将上面复制得到的结果修改为

#content > div > div.article > ol > li

粘贴到爬虫中(注释解析页面方式)

import scrapy
from scrapy import Selector

from lyy_spider.items import MovieItem


class DoubanSpider(scrapy.Spider):
    """Spider that scrapes the movie entries on the Douban Top 250 start page."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        """Yield one ``MovieItem`` for every list entry found in ``response``."""
        # Show which User-Agent header the request was sent with.
        print(response.request.headers['User-Agent'])

        # Wrap the response in a Selector and pick every <li> of the movie list.
        page = Selector(response)
        for entry in page.css('#content > div > div.article > ol > li'):
            # Each field holds the first matching text node, or None if absent.
            yield MovieItem(
                title=entry.css('span.title::text').extract_first(),
                rank=entry.css('span.rating_num::text').extract_first(),
                subject=entry.css('span.inq::text').extract_first(),
            )

修改 settings.py 添加 User-Agent（用户代理），如果不修改的话，默认会使用 Scrapy 自带的 User-Agent（形如 Scrapy/版本号 (+https://scrapy.org)），很容易被目标站点识别为爬虫

import random

# Pool of User-Agent strings the crawler may present to the target site.
USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
]
# Pick one User-Agent at random. This runs once, when the settings module is
# imported, so the chosen value stays the same for the whole crawl.
USER_AGENT = random.choice(USER_AGENT_LIST)

# Base delay (in seconds) between consecutive requests to the same site.
# RANDOMIZE_DOWNLOAD_DELAY only has an effect when DOWNLOAD_DELAY is non-zero,
# so a base delay must be set for the randomisation below to do anything.
DOWNLOAD_DELAY = 1

# Enable randomised download delays (a random multiple of DOWNLOAD_DELAY).
RANDOMIZE_DOWNLOAD_DELAY = True

打开终端输入 scrapy crawl douban -o douban.csv 稍等一会即可在当前目录下查看文件

上面是获取到的单页数据。

为了能获取更多数据，需要对下一页，上一页的a标签进行解析

改造后的douban.py

import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse

from lyy_spider.items import MovieItem


class DoubanSpider(scrapy.Spider):
    """Spider that crawls every page of the Douban Top 250 movie list."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]

    # The listing is paged through the `start` query parameter in steps of 25;
    # 10 pages of 25 entries cover the whole Top 250.
    PAGE_SIZE = 25
    PAGE_COUNT = 10

    def start_requests(self):
        """Yield one request per listing page (handled by ``parse``)."""
        for page in range(self.PAGE_COUNT):
            yield Request(url=f'https://movie.douban.com/top250?start={page * self.PAGE_SIZE}&filter=')

    def parse(self, response: HtmlResponse):
        """Yield one ``MovieItem`` for every list entry found in ``response``."""
        # Show which User-Agent header the request was sent with.
        print(response.request.headers['User-Agent'])
        # Wrap the response in a Selector object.
        sel = Selector(response)
        # Select every <li> of the movie list.
        list_items = sel.css('#content > div > div.article > ol > li')
        for list_item in list_items:
            # Assemble the scraped values into a MovieItem.
            movie_item = MovieItem()
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            yield movie_item

        # NOTE: an alternative to generating the page URLs up front is to read
        # the `div.paginator > a::attr(href)` links here, resolve each one with
        # response.urljoin() and yield a Request for it.

添加openpyxl依赖（导出excel）

pip install openpyxl

修改 pipelines.py 代码

def __init__(self):

wb = openpyxl.Workbook()

.........

改造目录下的pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import openpyxl


class ExcelPipeline:
    """Item pipeline that collects scraped movies into an Excel workbook.

    A header row is written when the pipeline is created, one row is appended
    per item, and the workbook is saved to disk when the spider closes.
    """

    def __init__(self):
        workbook = openpyxl.Workbook()
        # Use the sheet that a new workbook already has instead of creating one.
        sheet = workbook.active
        sheet.title = 'Top250'
        sheet.append(('标题', '评分', '主题'))
        self.wb = workbook
        self.ws = sheet

    def process_item(self, item, spider):
        """Append the item's title, rank and subject as one row; return the item."""
        row = tuple(item.get(key, '') for key in ('title', 'rank', 'subject'))
        self.ws.append(row)
        return item

    def close_spider(self, spider):
        """Write the workbook to disk."""
        self.wb.save('电影数据.xlsx')

保存到mysql数据库中

准备建表语句

drop table if exists `tb_top_movie`;
create table `tb_top_movie` (
    movie_id int unsigned auto_increment primary key comment '编号',
    title varchar(50) not null comment '标题',
    rating decimal(3,1) not null comment '评分',
    subject varchar(200) default '' comment '主题'
) engine=innodb comment ='Top电影表';

修改 settings.py，在 ITEM_PIPELINES 中注册新的 pipeline

# Item pipelines enabled for this project, keyed by import path.
# NOTE(review): both entries use the same value (300). Per the Scrapy docs the
# number determines the order in which pipelines run (lower first); give the
# two entries distinct values if their relative order matters.
ITEM_PIPELINES = {
    "lyy_spider.pipelines.ExcelPipeline": 300,
    "lyy_spider.pipelines.DBPipeline": 300,
}

安装python 连接mysql包

import pymysql
pip install pymysql

没有使用批处理的原始方法

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import openpyxl
import pymysql


class DBPipeline:
    """Item pipeline that inserts every scraped movie into ``tb_top_movie``.

    Rows are inserted one at a time with ``cursor.execute`` and committed once,
    when the spider closes.
    """

    def __init__(self):
        # Open the MySQL connection.
        # NOTE(review): the user/password/db values look like placeholders;
        # replace them with the real credentials.
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='localhost', password='localhost',
                                    db='localhost', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    @staticmethod
    def _field(item, key, default):
        """Return ``item``'s value for ``key``, or ``default`` if missing or None.

        ``item.get(key, default)`` alone is not enough: the spider stores None
        when a selector matches nothing, and None would be sent to MySQL as
        NULL.
        """
        value = item.get(key)
        return default if value is None else value

    def close_spider(self, spider):
        """Commit the pending inserts and release the cursor and connection."""
        try:
            self.conn.commit()
            self.cursor.close()
        finally:
            # Close the connection even if the commit failed.
            self.conn.close()

    def process_item(self, item, spider):
        """Insert one item as a row of ``tb_top_movie`` and return the item."""
        title = self._field(item, 'title', 'nil')
        rank = self._field(item, 'rank', '0')
        subject = self._field(item, 'subject', 'nil')
        self.cursor.execute(
            'insert into tb_top_movie (title,rating,subject) values (%s ,%s ,%s)',
            (title, rank, subject)
        )
        return item


class ExcelPipeline:
    """Item pipeline that writes every scraped movie to an Excel sheet.

    The sheet gets a header row up front and one row per item; the file is
    saved when the spider closes.
    """

    def __init__(self):
        self.wb = openpyxl.Workbook()
        # A new workbook already contains one sheet; reuse it.
        active_sheet = self.wb.active
        active_sheet.title = 'Top250'
        active_sheet.append(('标题', '评分', '主题'))
        self.ws = active_sheet

    def close_spider(self, spider):
        """Save the workbook to disk."""
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        """Append one row for ``item``, using 'nil' for absent fields."""
        row = (
            item.get('title', 'nil'),
            item.get('rank', 'nil'),
            item.get('subject', 'nil'),
        )
        self.ws.append(row)
        return item

使用批处理方式插入数据

对于只插入少量（例如 1 条）数据的场景，使用 execute 就足够了；插入大量数据时应使用 executemany

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import openpyxl
import pymysql


class DBPipeline:
    """Item pipeline that inserts scraped movies into ``tb_top_movie`` in batches.

    Items are buffered in memory and written with ``cursor.executemany`` once
    ``BATCH_SIZE`` rows have accumulated; any remainder is flushed when the
    spider closes.
    """

    # Number of buffered rows that triggers a write to MySQL.
    BATCH_SIZE = 100

    def __init__(self):
        # Open the MySQL connection.
        # NOTE(review): the user/password/db values look like placeholders;
        # replace them with the real credentials.
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='localhost', password='localhost',
                                    db='localhost', charset='utf8mb4')
        # Cursor used for every insert.
        self.cursor = self.conn.cursor()
        # Buffer of rows waiting to be written.
        self.data = []

    @staticmethod
    def _field(item, key, default):
        """Return ``item``'s value for ``key``, or ``default`` if missing or None.

        ``item.get(key, default)`` alone is not enough: the spider stores None
        when a selector matches nothing, and None would be sent to MySQL as
        NULL.
        """
        value = item.get(key)
        return default if value is None else value

    def close_spider(self, spider):
        """Flush any buffered rows, then release the cursor and connection."""
        try:
            if self.data:
                self._write_to_db()
            self.cursor.close()
        finally:
            # Close the connection even if the final flush failed.
            self.conn.close()

    def process_item(self, item, spider):
        """Buffer one item as a row, flushing when the batch is full."""
        self.data.append((
            self._field(item, 'title', 'nil'),
            self._field(item, 'rank', '0'),
            self._field(item, 'subject', 'nil'),
        ))
        # `>=` rather than `==` so a buffer that somehow grew past the
        # threshold is still flushed.
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
        return item

    def _write_to_db(self):
        """Insert all buffered rows in one call, commit, and empty the buffer."""
        self.cursor.executemany(
            'insert into tb_top_movie (title,rating,subject) values (%s ,%s ,%s)',
            self.data
        )
        self.conn.commit()
        # Empty the buffer so the next batch starts fresh.
        self.data.clear()


class ExcelPipeline:
    """Item pipeline that exports scraped movies to an Excel file.

    Creates the workbook and header row on construction, appends one row per
    item, and saves the file when the spider closes.
    """

    def __init__(self):
        book = openpyxl.Workbook()
        # The sheet selected by default in a new workbook.
        sheet = book.active
        # Name of the sheet.
        sheet.title = 'Top250'
        # Header row.
        sheet.append(('标题', '评分', '主题'))
        self.wb, self.ws = book, sheet

    def close_spider(self, spider):
        """Save the workbook to disk."""
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        """Append the item's fields as a row ('nil' when absent); return the item."""
        columns = ('title', 'rank', 'subject')
        self.ws.append(tuple(item.get(column, 'nil') for column in columns))
        return item

完整代码:

点了个评