# 简单实现了下,自动保存豆瓣美女网站的图片到本地,仅作学习参考 (simple script that saves images from the Douban beauty site locally; for learning/reference only)
import requests
import os
from lxml import etree
import random
import string
import datetime
# Directory where downloaded images are saved (Windows-style path).
path = 'D://photos/'
# Browser-like User-Agent so the site serves the page to the scraper.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'}
def get_girlphoto(url):
    """Download every image found on one listing page.

    Fetches *url*, extracts all ``//div/a/img/@src`` image URLs, saves
    each image under ``path`` with a random 8-character filename (the
    URL's real extension is preserved), and reports whether the page
    has a "next page" link.

    :param url: listing-page URL to scrape.
    :return: ``True`` if a 下一页 (next page) link exists, ``False``
             otherwise — including when any error occurs.
    """
    try:
        # Timeout so a stalled connection cannot hang the crawl forever.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        selector = etree.HTML(resp.content.decode('utf-8'))
        # 获取图片的URL列表 — collect the image URLs on this page.
        girlphoto_urls = selector.xpath('//div/a/img/@src')
        # Create the target directory once, not on every loop iteration.
        if not os.path.exists(path):
            os.makedirs(path)
            print("path:{}创建成功".format(path))
        for item in girlphoto_urls:
            img = requests.get(item, headers=headers, timeout=10)
            # Take the real extension instead of blindly slicing the
            # last 4 characters (which breaks on '.jpeg' or '?x=1').
            ext = os.path.splitext(item)[1] or '.jpg'
            # 'with' closes the file; no explicit close() needed.
            with open(path + get_filename() + ext, 'wb') as f:
                f.write(img.content)
        # A 下一页 ("next page") link means there is more to crawl.
        titles = selector.xpath('//div/ul[@class="pagination"]/li/a[@title="下一页"]')
        return len(titles) > 0
    except Exception as e:
        # Best-effort scraper: report the error and stop paging
        # (explicit False instead of an implicit None).
        print("Exception", e)
        return False
def get_filename():
    """Return a random 8-character alphanumeric name for a saved image.

    Characters are sampled without replacement from ASCII letters and
    digits, so all eight characters are distinct.
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.sample(alphabet, 8))
if __name__ == '__main__':
    # URL template; pages are requested starting from offset 1.
    source = 'https://www.dbmeinv.com/?pager_offset={}'
    hasNext = True
    pageNo = 1
    starttime = datetime.datetime.now()
    while hasNext:
        print('正在爬取第[{}]页的数据'.format(pageNo))
        url = source.format(pageNo)
        # get_girlphoto may return None if it failed internally; coerce
        # to bool so the loop condition is always a real True/False.
        hasNext = bool(get_girlphoto(url))
        pageNo += 1
    endtime = datetime.datetime.now()
    # total_seconds() is correct for any duration; timedelta.seconds
    # silently drops the days component on runs longer than a day.
    print('数据爬取完毕,共耗时:{}s'.format((endtime - starttime).total_seconds()))