

Crawling Novels from a Website with Python Multithreading

Published on 2018/05/23 11:00


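The script below crawls the front page of www.biquge.com.tw, starts one thread per book (StorySection) and then one thread per chapter (StoryContent), and saves each chapter as a .txt file under /Users/lyf/test/<book title>/. A Queue collects a completion message from every chapter thread.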

import requests
from bs4 import BeautifulSoup
from queue import Queue
from threading import Thread
import os

# Request headers
req_header = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': '__jsluid=ffa49d477b3fb0a1979c5482a6046d94; UM_distinctid=16385d0b4c63d6-0eab1fcdfee39d-33617f06-fa000-16385d0b4c74e8; CNZZDATA1272873873=1745087307-1526954737-%7C1526970941',
    'Host': 'www.biquge.com.tw',
    'Connection': 'keep-alive',
    'Referer': 'http://www.biquge.com.tw',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}


# Thread that fetches one chapter's title and content and saves them locally
class StoryContent(Thread):
    def __init__(self, title, name, section_url, q):
        super(StoryContent, self).__init__()
        self.title = title
        self.name = name
        self.section_url = section_url
        self.q = q

    def run(self):
        try:
            req_url = 'http://www.biquge.com.tw' + self.section_url
            # Request the chapter page with the shared request headers
            r = requests.get(req_url, headers=req_header)
            r.encoding = 'gb2312'

            # Parse the page
            soup = BeautifulSoup(r.text, "html.parser")
            # Chapter title
            section_name = soup.select('#wrapper .content_read .box_con .bookname h1')[0].text
            # Chapter body text
            section_text = soup.select('#wrapper .content_read .box_con #content')[0].text

            # Create the book's directory if needed; exist_ok avoids a race
            # when several chapter threads create it at the same time
            path = '/Users/lyf/test/' + self.title
            try:
                os.makedirs(path, exist_ok=True)
            except OSError:
                print('Failed to create %s' % path)
            # Write the chapter to its own file
            with open('/Users/lyf/test/' + self.title + '/' + self.name + '.txt', 'wb') as f:
                f.write(('\rChapter title: ' + section_name + '\r\n').encode('utf-8'))
                f.write(('\rChapter content:\n\n' + section_text + '\r\n').encode('utf-8'))
                self.q.put('Finished chapter: %s' % self.name)
        except Exception as e:
            print('Download failed: %s' % e)



# Thread that fetches a book's chapter list and starts one downloader thread per chapter
class StorySection(Thread):
    def __init__(self, title, href):
        super(StorySection, self).__init__()
        self.title = title
        self.href = href

    def run(self):
        # Request the book's index page with the shared request headers
        r = requests.get(self.href, headers=req_header)
        r.encoding = 'gb2312'
        # Parse the page
        soup = BeautifulSoup(r.text, "html.parser")
        # Chapter list links
        section_list = soup.select('#wrapper .box_con #list dl dd a')
        # Queue that collects status messages from the chapter threads
        q = Queue()
        # Keep references to the chapter threads
        thread_list = []

        for section in section_list:
            t = StoryContent(self.title, section.text, section.get('href'), q)
            t.start()
            thread_list.append(t)
        # Wait for every chapter thread to finish
        for t in thread_list:
            t.join()
        print('Finished downloading 《%s》' % self.title)

        while not q.empty():
            print(q.get())


# Crawl the site's front page and start one thread per book
def get_story():
    req_url = 'http://www.biquge.com.tw'
    # Request the front page with the shared request headers
    r = requests.get(req_url, headers=req_header)
    r.encoding = 'gb2312'
    # Parse the page
    soup = BeautifulSoup(r.text, "html.parser")
    # Every link on the page; book links are filtered out below
    story_list = soup.findAll('a')
    # Keep references to the book threads
    thread_list = []
    for story in story_list:
        href = story.get('href')
        title = story.text
        if title and href and href.endswith('/') and href.startswith('http://'):
            p = StorySection(title, href)
            p.start()
            thread_list.append(p)
    # Wait for every book thread to finish
    for p in thread_list:
        p.join()


get_story()

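One caveat with the approach above: it starts one thread per book and then one more per chapter, so the number of live threads (and open connections) is unbounded. Below is a minimal sketch of a bounded alternative using the standard library's ThreadPoolExecutor. The download_chapter helper, the headers dict, the example chapter paths, and max_workers=8 are all illustrative assumptions, not part of the original script.

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Minimal headers; the full req_header dict from the script above also works
headers = {'User-Agent': 'Mozilla/5.0'}

def download_chapter(section_url):
    # Hypothetical worker: fetch one chapter page and return its decoded HTML
    r = requests.get('http://www.biquge.com.tw' + section_url, headers=headers)
    r.encoding = 'gb2312'
    return r.text

# Placeholder chapter paths; in the script above these come from the book's index page
section_urls = ['/0_1/1.html', '/0_1/2.html']

# At most 8 requests in flight at once, instead of one thread per chapter
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(download_chapter, url) for url in section_urls]
    for future in as_completed(futures):
        try:
            html = future.result()
            # parse html with BeautifulSoup and write it to disk, as StoryContent does
        except Exception as e:
            print('Download failed: %s' % e)

The pool reuses a fixed set of worker threads, which keeps the crawl polite to the server and makes per-download error handling explicit through each future's result().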