最新消息: USBMI致力于为网友们分享Windows、安卓、IOS等主流手机系统相关的资讯以及评测、同时提供相关教程、应用、软件下载等服务。

爬虫 爬取小说 简单实战

业界 admin 14浏览 0评论

文章目录

    • 前言
    • 主体
    • 1.借鉴和补充后的源码
    • 2.笔趣阁(改编)
      • 获取xpath路径
        • 路径 报错
        • 解决方法
    • 收尾
      • 全部源码

前言

最近学了一点python,想着搞个爬虫练一下手,记录一下学习过程

基于requests库和lxml库编写的爬虫
借鉴于python爬虫实战——小说爬取

讲的很详细且易懂
本文的例子只是在其基础上补充一下,可先看了上面的再回来看这里,更容易理解

主体

本文记载了两个本人练手的爬虫,爬取的网站分别是:
1.http://www.365kk/
2.某小说网站(跟1差不多,但存在一个坑)

Exception has occurred: IndexError list index out of range File “D:\Desktop\vscode\python\biquge_book.py”, line 111, in bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0] IndexError: list index out of range

1.借鉴和补充后的源码

import os
import requests
from lxml import etree
import time
import random


# 获取下一页链接的函数
def next_url(next_url_element):
    nxturl = 'http://www.365kk/255/255036/'
    # rfind('/') 获取最后一个'/'字符的索引
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl


# 数据清洗函数
def clean_data(filename, info,output_folder):
    """
    :param filename: 原文档名
    :param info: [bookTitle, author, update, introduction]
    """

    print("\n==== 数据清洗开始 ====")

    # 新的文件名
    btitle ="new"+info[0]+".txt"
    #new_filename ="new"+filename
    #保存到文件夹里
    new_filename = f"{output_folder}/{btitle}"

    # 打开两个文本文档
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')

    # 首先在新的文档中写入书籍信息
    f_new.write('==  《' + info[0] + '》\r\n')  # 标题
    f_new.write('==  ' + info[1] + '\r\n')     # 作者
    f_new.write('==  ' + info[2] + '\r\n')     # 最后更新时间
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('==  ' + info[3] + '\r\n')     # 简介
    f_new.write("=" * 10)
    f_new.write('\r\n')

    lines = f_old.readlines()  # 按行读取原文档中的内容
    empty_cnt = 0  # 用于记录连续的空行数

    # 遍历原文档中的每行
    for line in lines:
        if line == '\n':        # 如果当前是空行
            empty_cnt += 1      # 连续空行数+1
            if empty_cnt >= 2:  # 如果连续空行数不少于2
                continue        # 直接读取下一行,当前空行不写入
        else:                   # 如果当前不是空行
            empty_cnt = 0       # 连续空行数清零
        if line.startswith("\u3000\u3000"):  # 如果有段首缩进
            line = line[2:]                  # 删除段首缩进
            f_new.write(line)                # 写入当前行
        elif line.startswith("第"):          # 如果当前行是章节标题
            f_new.write("\r\n")              # 写入换行
            f_new.write("-" * 20)            # 写入20个'-'
            f_new.write("\r\n")              # 写入换行
            f_new.write(line)                # 写入章节标题
        else:                                # 如果当前行是未缩进的普通段落
            f_new.write(line)                # 保持原样写入

    f_old.close()
    f_new.close()
    print(f"\n==== 数据清洗完成,新文件已保存到 {new_filename} ====")

"""
# 请求头,使用前需填入自己浏览器的信息
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'fontFamily=null; fontColor=null; fontSize=null; bg=null; ASP.NET_SessionId=2xgsi3yqhveljdny3gp4vvox; bookid=255036; booklist=%257B%2522BookId%2522%253A255036%252C%2522ChapterId%2522%253A4147599%252C%2522ChapterName%2522%253A%2522%25u7B2C%25u4E00%25u7AE0%2520%25u5C0F%25u5C0F%25u7075%25u5A25%2522%257D',
    'Host': 'www.365kk',
    'Connection': 'keep-alive'
}
"""


'''
#反爬
headers_list = [
    {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
    }, {
        'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
    }, {
        'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+'
    }, {
        'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
    }, {
        'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
    }
]
'''

#headers = random.choice(headers_list)
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'fontFamily=null; fontColor=null; fontSize=null; bg=null; ASP.NET_SessionId=2xgsi3yqhveljdny3gp4vvox; bookid=255036; booklist=%257B%2522BookId%2522%253A255036%252C%2522ChapterId%2522%253A4147599%252C%2522ChapterName%2522%253A%2522%25u7B2C%25u4E00%25u7AE0%2520%25u5C0F%25u5C0F%25u7075%25u5A25%2522%257D',
    'Host': 'www.365kk',
    'Connection': 'keep-alive'
}
#headers['user-agent'] = random.choice(headers_list)['user-agent']



file = input('请建立一个存储图片的文件夹,输入文件夹名称即可: ')
y = os.path.exists(file)
if y == 1:
    #print('文件夹已存在')
    #file = input('请建立一个存储图片的文件夹,)输入文件夹名称即可: ')
    #os.mkdir(file)
    print('文件夹已存在,将保存进其中')
else:
    os.mkdir(file)
    print('文件夹不存在,已创建')
folder=file+'/'
 


# 小说主页
main_url = "http://www.365kk/255/255036/"

# 使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

# 将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

# 将文本内容创建为可解析元素
main_html = etree.HTML(main_text)


#bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
bookTitle =main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
#author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
author =main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
#update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
#introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
update=main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
introduction=main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[2]/text()')[0]

# 调试期间仅爬取六个页面
maxPages = 767
cnt = 0

# 记录上一章节的标题
lastTitle = ''

# 爬取起点
url = 'http://www.365kk/255/255036/4147599.html'

# 爬取终点
#endurl = 'http://www.365kk/255/255036/4148385.html'

#while url != endurl:
while True:
    cnt += 1  # 记录当前爬取的页面
    if cnt > maxPages:
        break  # 当爬取的页面数超过maxPages时停止
    
    #resp = requests.get(url, headers)
    try:
        resp = requests.get(url, headers)  # 超时设置为10秒
    except:
        for i in range(4):  # 循环去请求网站
            resp = requests.get(url, headers=headers, timeout=20)
            if resp.status_code == 200:
                break
            
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')

    # 输出爬取进度信息
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    #print(contents)
    file_name = f"{folder}/{bookTitle}"
    #file_name=bookTitle+'.txt'
    #with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
    with open(file_name + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # 章节标题改变
            f_new.write(title)      # 写入新的章节标题
            lastTitle = title   # 更新章节标题
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
        f_new.close( )

    # 获取"下一页"按钮指向的链接
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]

    # 传入函数next_url得到下一页链接
    url = next_url(next_url_element)

    sleepTime = random.randint(2, 6)  # 产生一个2~5之间的随机数
    time.sleep(sleepTime)             # 暂停2~5之间随机的秒数

    
clean_data(file_name + '.txt', [bookTitle, author, update, introduction],file+"/")
print("complete!")

2.笔趣阁(改编)

获取xpath路径

路径 报错

如果正确按照以上操作获取xpath路径,却报错:

Exception has occurred: IndexError list index out of range File “D:\Desktop\vscode\python\biquge_book.py”, line 111, in bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0] IndexError: list index out of range

就要去仔细看一下网页源代码,看有没有啥陷阱了

比如图上的框住的这个东西就会导致程序报错:

解决方法

服务器在返回response浏览器渲染时有时候会加上一些特殊的标签,查看源代码看不出来,导致xpath取值 为空,需print()打印出来才可以看出来!

小说主页
main_url = "https://www.bige3/book/26580/"

使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

调试用
print(main_text)

打印出来后就可以对照着网页源代码,找出错误原因了
我就是在仔细看了一遍后才发现

这东西貌似没有传给程序,导致出错
知道了原因,就很容易的解决问题了

先正常获取xpath路径
再根据打印内容更改
修改前
bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0]
修改后
bookTitle=main_html.xpath(‘/html/body/div[4]/div[2]/h1/text()’)[0]
很明显,改个数字就行了

import os
import requests
from lxml import etree
import time
import random


# 获取下一页链接的函数
def next_url(next_url_element):
    #nxturl = 'http://www.365kk/255/255036/'
    nxturl='https://www.bige3/book/66/'
    # rfind('/') 获取最后一个'/'字符的索引
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl


# 数据清洗函数
def clean_data(filename, info,output_folder):
    """
    :param filename: 原文档名
    :param info: [bookTitle, author, update, introduction]
    """

    print("\n==== 数据清洗开始 ====")

    # 新的文件名
    btitle ="new"+info[0]+".txt"
    #new_filename ="new"+filename
    #保存到文件夹里
    new_filename = f"{output_folder}/{btitle}"

    # 打开两个文本文档
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')

    # 首先在新的文档中写入书籍信息
    f_new.write('==  《' + info[0] + '》\r\n')  # 标题
    f_new.write('==  ' + info[1] + '\r\n')     # 作者
    f_new.write('==  ' + info[2] + '\r\n')     # 最后更新时间
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('==  ' + info[3] + '\r\n')     # 简介
    f_new.write("=" * 10)
    f_new.write('\r\n')

    lines = f_old.readlines()  # 按行读取原文档中的内容
    empty_cnt = 0  # 用于记录连续的空行数

    # 遍历原文档中的每行
    for line in lines:
        if line == '\n':        # 如果当前是空行
            empty_cnt += 1      # 连续空行数+1
            if empty_cnt >= 2:  # 如果连续空行数不少于2
                continue        # 直接读取下一行,当前空行不写入
        else:                   # 如果当前不是空行
            empty_cnt = 0       # 连续空行数清零
        if line.startswith("\u3000\u3000"):  # 如果有段首缩进
            line = line[2:]                  # 删除段首缩进
            f_new.write(line)                # 写入当前行
        elif line.startswith("第"):          # 如果当前行是章节标题
            f_new.write("\r\n")              # 写入换行
            f_new.write("-" * 20)            # 写入20个'-'
            f_new.write("\r\n")              # 写入换行
            f_new.write(line)                # 写入章节标题
        else:                                # 如果当前行是未缩进的普通段落
            f_new.write(line)                # 保持原样写入

    f_old.close()
    f_new.close()
    print(f"\n==== 数据清洗完成,新文件已保存到 {new_filename} ====")


# 请求头,使用前需填入自己浏览器的信息
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'hm=0614940c5550f8dc77073d3a2be486aa',
    #'Host': '',
    'Connection': 'keep-alive'
}


file = input('请建立一个存储图片的文件夹,输入文件夹名称即可: ')
y = os.path.exists(file)
if y == 1:
    #print('文件夹已存在')
    #file = input('请建立一个存储图片的文件夹,)输入文件夹名称即可: ')
    #os.mkdir(file)
    print('文件夹已存在,将保存进其中')
else:
    os.mkdir(file)
    print('文件夹不存在,已创建')
folder=file+'/'
 


# 小说主页
main_url = "https://www.bige3/book/26580/"

# 使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

# 将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

#调试用
#print(main_text)

# 将文本内容创建为可解析元素
main_html = etree.HTML(main_text)


#bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
bookTitle=main_html.xpath('/html/body/div[4]/div[2]/h1/text()')[0]
#author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
author =main_html.xpath('/html/body/div[4]/div[2]/div[2]/span[1]/text()')[0]
#update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
#introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
update=main_html.xpath('/html/body/div[4]/div[2]/div[2]/span[3]/text()')[0]
introduction=main_html.xpath('/html/body/div[4]/div[2]/div[4]/dl/dd/text()')[0]

# 调试期间仅爬取六个页面
maxPages = 6
cnt = 0

# 记录上一章节的标题
lastTitle = ''

# 爬取起点
url = 'https://www.bige3/book/26580/1.html'

# 爬取终点
endurl = 'https://www.bige3/book/26580/6.html'

#while url != endurl:
while True:
    cnt += 1  # 记录当前爬取的页面
    if cnt > maxPages:
        break  # 当爬取的页面数超过maxPages时停止
    
    resp = requests.get(url, headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="wap_none"]/text()')[0]
    contents = html.xpath('//*[@id="chaptercontent"]/text()')

    # 输出爬取进度信息
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    file_name = f"{folder}/{bookTitle}"
    #file_name=bookTitle+'.txt'
    #with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
    with open(file_name + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # 章节标题改变
            f_new.write(title)      # 写入新的章节标题
            lastTitle = title   # 更新章节标题
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
        f_new.close( )

    # 获取"下一页"按钮指向的链接
    next_url_element = html.xpath('//*[@class="Readpage pagedown"]/a[3]/@href')[0]

    # 传入函数next_url得到下一页链接
    url = next_url(next_url_element)

    sleepTime = random.randint(2, 6)  # 产生一个2~5之间的随机数
    time.sleep(sleepTime)             # 暂停2~5之间随机的秒数

    
clean_data(file_name + '.txt', [bookTitle, author, update, introduction],file+"/")
print("complete!")


收尾

全部源码

已传到仓库,
我的github仓库
我的gitee仓库https://gitee/aa2255/smalltask

文章目录

    • 前言
    • 主体
    • 1.借鉴和补充后的源码
    • 2.笔趣阁(改编)
      • 获取xpath路径
        • 路径 报错
        • 解决方法
    • 收尾
      • 全部源码

前言

最近学了一点python,想着搞个爬虫练一下手,记录一下学习过程

基于requests库和lxml库编写的爬虫
借鉴于python爬虫实战——小说爬取

讲的很详细且易懂
本文的例子只是在其基础上补充一下,可先看了上面的再回来看这里,更容易理解

主体

本文记载了两个本人练手的爬虫,爬取的网站分别是:
1.http://www.365kk/
2.某小说网站(跟1差不多,但存在一个坑)

Exception has occurred: IndexError list index out of range File “D:\Desktop\vscode\python\biquge_book.py”, line 111, in bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0] IndexError: list index out of range

1.借鉴和补充后的源码

import os
import requests
from lxml import etree
import time
import random


# 获取下一页链接的函数
def next_url(next_url_element):
    nxturl = 'http://www.365kk/255/255036/'
    # rfind('/') 获取最后一个'/'字符的索引
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl


# 数据清洗函数
def clean_data(filename, info,output_folder):
    """
    :param filename: 原文档名
    :param info: [bookTitle, author, update, introduction]
    """

    print("\n==== 数据清洗开始 ====")

    # 新的文件名
    btitle ="new"+info[0]+".txt"
    #new_filename ="new"+filename
    #保存到文件夹里
    new_filename = f"{output_folder}/{btitle}"

    # 打开两个文本文档
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')

    # 首先在新的文档中写入书籍信息
    f_new.write('==  《' + info[0] + '》\r\n')  # 标题
    f_new.write('==  ' + info[1] + '\r\n')     # 作者
    f_new.write('==  ' + info[2] + '\r\n')     # 最后更新时间
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('==  ' + info[3] + '\r\n')     # 简介
    f_new.write("=" * 10)
    f_new.write('\r\n')

    lines = f_old.readlines()  # 按行读取原文档中的内容
    empty_cnt = 0  # 用于记录连续的空行数

    # 遍历原文档中的每行
    for line in lines:
        if line == '\n':        # 如果当前是空行
            empty_cnt += 1      # 连续空行数+1
            if empty_cnt >= 2:  # 如果连续空行数不少于2
                continue        # 直接读取下一行,当前空行不写入
        else:                   # 如果当前不是空行
            empty_cnt = 0       # 连续空行数清零
        if line.startswith("\u3000\u3000"):  # 如果有段首缩进
            line = line[2:]                  # 删除段首缩进
            f_new.write(line)                # 写入当前行
        elif line.startswith("第"):          # 如果当前行是章节标题
            f_new.write("\r\n")              # 写入换行
            f_new.write("-" * 20)            # 写入20个'-'
            f_new.write("\r\n")              # 写入换行
            f_new.write(line)                # 写入章节标题
        else:                                # 如果当前行是未缩进的普通段落
            f_new.write(line)                # 保持原样写入

    f_old.close()
    f_new.close()
    print(f"\n==== 数据清洗完成,新文件已保存到 {new_filename} ====")

"""
# 请求头,使用前需填入自己浏览器的信息
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'fontFamily=null; fontColor=null; fontSize=null; bg=null; ASP.NET_SessionId=2xgsi3yqhveljdny3gp4vvox; bookid=255036; booklist=%257B%2522BookId%2522%253A255036%252C%2522ChapterId%2522%253A4147599%252C%2522ChapterName%2522%253A%2522%25u7B2C%25u4E00%25u7AE0%2520%25u5C0F%25u5C0F%25u7075%25u5A25%2522%257D',
    'Host': 'www.365kk',
    'Connection': 'keep-alive'
}
"""


'''
#反爬
headers_list = [
    {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
    }, {
        'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
    }, {
        'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+'
    }, {
        'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
    }, {
        'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
    }
]
'''

#headers = random.choice(headers_list)
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'fontFamily=null; fontColor=null; fontSize=null; bg=null; ASP.NET_SessionId=2xgsi3yqhveljdny3gp4vvox; bookid=255036; booklist=%257B%2522BookId%2522%253A255036%252C%2522ChapterId%2522%253A4147599%252C%2522ChapterName%2522%253A%2522%25u7B2C%25u4E00%25u7AE0%2520%25u5C0F%25u5C0F%25u7075%25u5A25%2522%257D',
    'Host': 'www.365kk',
    'Connection': 'keep-alive'
}
#headers['user-agent'] = random.choice(headers_list)['user-agent']



file = input('请建立一个存储图片的文件夹,输入文件夹名称即可: ')
y = os.path.exists(file)
if y == 1:
    #print('文件夹已存在')
    #file = input('请建立一个存储图片的文件夹,)输入文件夹名称即可: ')
    #os.mkdir(file)
    print('文件夹已存在,将保存进其中')
else:
    os.mkdir(file)
    print('文件夹不存在,已创建')
folder=file+'/'
 


# 小说主页
main_url = "http://www.365kk/255/255036/"

# 使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

# 将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

# 将文本内容创建为可解析元素
main_html = etree.HTML(main_text)


#bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
bookTitle =main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
#author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
author =main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
#update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
#introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
update=main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
introduction=main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[2]/text()')[0]

# 调试期间仅爬取六个页面
maxPages = 767
cnt = 0

# 记录上一章节的标题
lastTitle = ''

# 爬取起点
url = 'http://www.365kk/255/255036/4147599.html'

# 爬取终点
#endurl = 'http://www.365kk/255/255036/4148385.html'

#while url != endurl:
while True:
    cnt += 1  # 记录当前爬取的页面
    if cnt > maxPages:
        break  # 当爬取的页面数超过maxPages时停止
    
    #resp = requests.get(url, headers)
    try:
        resp = requests.get(url, headers)  # 超时设置为10秒
    except:
        for i in range(4):  # 循环去请求网站
            resp = requests.get(url, headers=headers, timeout=20)
            if resp.status_code == 200:
                break
            
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')

    # 输出爬取进度信息
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    #print(contents)
    file_name = f"{folder}/{bookTitle}"
    #file_name=bookTitle+'.txt'
    #with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
    with open(file_name + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # 章节标题改变
            f_new.write(title)      # 写入新的章节标题
            lastTitle = title   # 更新章节标题
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
        f_new.close( )

    # 获取"下一页"按钮指向的链接
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]

    # 传入函数next_url得到下一页链接
    url = next_url(next_url_element)

    sleepTime = random.randint(2, 6)  # 产生一个2~5之间的随机数
    time.sleep(sleepTime)             # 暂停2~5之间随机的秒数

    
clean_data(file_name + '.txt', [bookTitle, author, update, introduction],file+"/")
print("complete!")

2.笔趣阁(改编)

获取xpath路径

路径 报错

如果正确按照以上操作获取xpath路径,却报错:

Exception has occurred: IndexError list index out of range File “D:\Desktop\vscode\python\biquge_book.py”, line 111, in bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0] IndexError: list index out of range

就要去仔细看一下网页源代码,看有没有啥陷阱了

比如图上的框住的这个东西就会导致程序报错:

解决方法

服务器在返回response浏览器渲染时有时候会加上一些特殊的标签,查看源代码看不出来,导致xpath取值 为空,需print()打印出来才可以看出来!

小说主页
main_url = "https://www.bige3/book/26580/"

使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

调试用
print(main_text)

打印出来后就可以对照着网页源代码,找出错误原因了
我就是在仔细看了一遍后才发现

这东西貌似没有传给程序,导致出错
知道了原因,就很容易的解决问题了

先正常获取xpath路径
再根据打印内容更改
修改前
bookTitle=main_html.xpath(‘/html/body/div[5]/div[2]/h1/text()’)[0]
修改后
bookTitle=main_html.xpath(‘/html/body/div[4]/div[2]/h1/text()’)[0]
很明显,改个数字就行了

import os
import requests
from lxml import etree
import time
import random


# 获取下一页链接的函数
def next_url(next_url_element):
    #nxturl = 'http://www.365kk/255/255036/'
    nxturl='https://www.bige3/book/66/'
    # rfind('/') 获取最后一个'/'字符的索引
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl


# 数据清洗函数
def clean_data(filename, info,output_folder):
    """
    :param filename: 原文档名
    :param info: [bookTitle, author, update, introduction]
    """

    print("\n==== 数据清洗开始 ====")

    # 新的文件名
    btitle ="new"+info[0]+".txt"
    #new_filename ="new"+filename
    #保存到文件夹里
    new_filename = f"{output_folder}/{btitle}"

    # 打开两个文本文档
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')

    # 首先在新的文档中写入书籍信息
    f_new.write('==  《' + info[0] + '》\r\n')  # 标题
    f_new.write('==  ' + info[1] + '\r\n')     # 作者
    f_new.write('==  ' + info[2] + '\r\n')     # 最后更新时间
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('==  ' + info[3] + '\r\n')     # 简介
    f_new.write("=" * 10)
    f_new.write('\r\n')

    lines = f_old.readlines()  # 按行读取原文档中的内容
    empty_cnt = 0  # 用于记录连续的空行数

    # 遍历原文档中的每行
    for line in lines:
        if line == '\n':        # 如果当前是空行
            empty_cnt += 1      # 连续空行数+1
            if empty_cnt >= 2:  # 如果连续空行数不少于2
                continue        # 直接读取下一行,当前空行不写入
        else:                   # 如果当前不是空行
            empty_cnt = 0       # 连续空行数清零
        if line.startswith("\u3000\u3000"):  # 如果有段首缩进
            line = line[2:]                  # 删除段首缩进
            f_new.write(line)                # 写入当前行
        elif line.startswith("第"):          # 如果当前行是章节标题
            f_new.write("\r\n")              # 写入换行
            f_new.write("-" * 20)            # 写入20个'-'
            f_new.write("\r\n")              # 写入换行
            f_new.write(line)                # 写入章节标题
        else:                                # 如果当前行是未缩进的普通段落
            f_new.write(line)                # 保持原样写入

    f_old.close()
    f_new.close()
    print(f"\n==== 数据清洗完成,新文件已保存到 {new_filename} ====")


# 请求头,使用前需填入自己浏览器的信息
headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': 'hm=0614940c5550f8dc77073d3a2be486aa',
    #'Host': '',
    'Connection': 'keep-alive'
}


file = input('请建立一个存储图片的文件夹,输入文件夹名称即可: ')
y = os.path.exists(file)
if y == 1:
    #print('文件夹已存在')
    #file = input('请建立一个存储图片的文件夹,)输入文件夹名称即可: ')
    #os.mkdir(file)
    print('文件夹已存在,将保存进其中')
else:
    os.mkdir(file)
    print('文件夹不存在,已创建')
folder=file+'/'
 


# 小说主页
main_url = "https://www.bige3/book/26580/"

# 使用get方法请求网页
main_resp = requests.get(main_url, headers=headers)

# 将网页内容按utf-8规范解码为文本形式
main_text = main_resp.content.decode('utf-8')

#调试用
#print(main_text)

# 将文本内容创建为可解析元素
main_html = etree.HTML(main_text)


#bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
bookTitle=main_html.xpath('/html/body/div[4]/div[2]/h1/text()')[0]
#author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
author =main_html.xpath('/html/body/div[4]/div[2]/div[2]/span[1]/text()')[0]
#update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
#introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
update=main_html.xpath('/html/body/div[4]/div[2]/div[2]/span[3]/text()')[0]
introduction=main_html.xpath('/html/body/div[4]/div[2]/div[4]/dl/dd/text()')[0]

# 调试期间仅爬取六个页面
maxPages = 6
cnt = 0

# 记录上一章节的标题
lastTitle = ''

# 爬取起点
url = 'https://www.bige3/book/26580/1.html'

# 爬取终点
endurl = 'https://www.bige3/book/26580/6.html'

#while url != endurl:
while True:
    cnt += 1  # 记录当前爬取的页面
    if cnt > maxPages:
        break  # 当爬取的页面数超过maxPages时停止
    
    resp = requests.get(url, headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="wap_none"]/text()')[0]
    contents = html.xpath('//*[@id="chaptercontent"]/text()')

    # 输出爬取进度信息
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    file_name = f"{folder}/{bookTitle}"
    #file_name=bookTitle+'.txt'
    #with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
    with open(file_name + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # 章节标题改变
            f_new.write(title)      # 写入新的章节标题
            lastTitle = title   # 更新章节标题
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
        f_new.close( )

    # 获取"下一页"按钮指向的链接
    next_url_element = html.xpath('//*[@class="Readpage pagedown"]/a[3]/@href')[0]

    # 传入函数next_url得到下一页链接
    url = next_url(next_url_element)

    sleepTime = random.randint(2, 6)  # 产生一个2~5之间的随机数
    time.sleep(sleepTime)             # 暂停2~5之间随机的秒数

    
clean_data(file_name + '.txt', [bookTitle, author, update, introduction],file+"/")
print("complete!")


收尾

全部源码

已传到仓库,
我的github仓库
我的gitee仓库https://gitee/aa2255/smalltask

发布评论

评论列表 (0)

  1. 暂无评论