import requests
from bs4 import BeautifulSoup
from multiprocessing import Process, Queue
from threading import Thread
import os
# HTTP request headers impersonating a desktop Chrome browser for
# www.biquge.com.tw.  The Cookie value is a captured session and may be
# stale.  NOTE(review): downstream calls pass this dict as `params=`,
# which puts it in the query string instead of the request headers —
# it should be passed as `headers=req_header`.
req_header = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '__jsluid=ffa49d477b3fb0a1979c5482a6046d94; UM_distinctid=16385d0b4c63d6-0eab1fcdfee39d-33617f06-fa000-16385d0b4c74e8; CNZZDATA1272873873=1745087307-1526954737-%7C1526970941',
'Host': 'www.biquge.com.tw',
'Connection': 'keep-alive',
'Referer': 'http://www.biquge.com.tw',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}
# Fetch the current chapter's title and content and save them to a local file.
class StoryContent(Thread):
    """Worker thread that downloads a single chapter page and writes it
    to ``<base dir>/<book title>/<chapter name>.txt`` as UTF-8.

    Completion is reported back to the parent through the shared queue.
    """

    def __init__(self, title, name, section_url, q):
        """
        :param title: book title, used as the output directory name
        :param name: chapter name, used as the output file name
        :param section_url: chapter path relative to the site root
        :param q: queue used to report completion to the spawning thread
        """
        super(StoryContent, self).__init__()
        self.title = title
        self.name = name
        self.section_url = section_url
        self.q = q

    def run(self):
        try:
            req_url = 'http://www.biquge.com.tw' + self.section_url
            # BUG FIX: the header dict must be sent as `headers=`, not
            # `params=` (which would serialize it into the query string).
            r = requests.get(req_url, headers=req_header)
            # Site serves GBK-family content; decode accordingly.
            r.encoding = 'gb2312'
            soup = BeautifulSoup(r.text, "html.parser")
            # Chapter title.
            section_name = soup.select('#wrapper .content_read .box_con .bookname h1')[0].text
            # Chapter body text.
            section_text = soup.select('#wrapper .content_read .box_con #content')[0].text
            # NOTE(review): hard-coded user path kept from the original script.
            path = os.path.join('/Users/lyf/test', self.title)
            if not os.path.exists(path):
                try:
                    # exist_ok=True avoids a race when several chapter
                    # threads create the same book directory concurrently.
                    os.makedirs(path, exist_ok=True)
                except OSError:
                    print('创建%s错误' % path)
            # Write chapter title and body as UTF-8.
            with open(os.path.join(path, self.name + '.txt'), 'wb+') as f:
                f.write(('\r章节名称:' + section_name + '\r\n').encode('utf-8'))
                f.write(('\r章节内容:\n\n' + section_text + '\r\n').encode('utf-8'))
            # BUG FIX: report the chapter name instead of the book title,
            # so progress output distinguishes individual chapters.
            self.q.put('下载完成章节:%s' % self.name)
        except Exception as e:
            # Narrowed from a bare `except:`; stay best-effort but say
            # what actually failed instead of swallowing the traceback.
            print('下载出错: %s' % e)
# Download every chapter of one book.
class StorySection(Thread):
    """Worker thread that fetches a book's index page, then spawns one
    :class:`StoryContent` thread per chapter and waits for them all."""

    def __init__(self, title, href):
        """
        :param title: book title
        :param href: absolute URL of the book's chapter-index page
        """
        super(StorySection, self).__init__()
        self.title = title
        self.href = href

    def run(self):
        # BUG FIX: send the header dict as `headers=`, not `params=`.
        r = requests.get(self.href, headers=req_header)
        r.encoding = 'gb2312'
        soup = BeautifulSoup(r.text, "html.parser")
        # Anchor elements of the chapter list.
        section_list = soup.select('#wrapper .box_con #list dl dd a')
        # Queue on which the chapter threads report completion.
        q = Queue()
        workers = []
        for section in section_list:
            worker = StoryContent(self.title, section.text, section.get('href'), q)
            worker.start()
            workers.append(worker)
        # Wait for every chapter thread to finish.
        for worker in workers:
            worker.join()
        # Drain the completion messages the workers queued.
        while not q.empty():
            print(q.get())
        # BUG FIX: originally printed before the join loop, announcing
        # completion while chapter downloads were still in flight.
        print('下载完成《%s》' % self.title)
# Crawl the site front page and download every listed book.
def get_story():
    """Fetch the biquge front page, pick out links that look like book
    index pages, and start one :class:`StorySection` thread per book,
    then block until all of them finish."""
    req_url = 'http://www.biquge.com.tw'
    # BUG FIX: send the header dict as `headers=`, not `params=`.
    r = requests.get(req_url, headers=req_header)
    r.encoding = 'gb2312'
    soup = BeautifulSoup(r.text, "html.parser")
    # Every anchor on the front page; filtered below.
    story_list = soup.findAll('a')
    workers = []
    for story in story_list:
        href = story.get('href')
        title = story.text
        # Book index pages are absolute http URLs ending in '/'.
        if title and href and href.endswith('/') and href.startswith('http://'):
            p = StorySection(title, href)
            p.start()
            # BUG FIX: the original never appended the started threads,
            # so the join loop below iterated an empty list and the
            # function returned before any download completed.
            workers.append(p)
    for p in workers:
        p.join()
# Run the crawler only when executed as a script, not on import.
# (The trailing `exit()` was redundant at end of module and is dropped.)
if __name__ == '__main__':
    get_story()