
Automate Novel Scraping with Python, Free Up Your Hands

IT Life | Author: 专注的阿熊 | Posted: 2021-04-02 17:06:21

# -*- coding: utf-8 -*-

import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Homepage domain; the actual URL was stripped from the original post, so it is left empty here
Host = ""

user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# Pick one User-Agent at random for this run
header = {'User-Agent': random.choice(user_agent)}

# Regex that pulls the href target out of an anchor tag
re_link = re.compile(r'<a href="(.*?)"')
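# For reference, this is roughly what re_link captures from an anchor tag.
# The sample markup below is hypothetical; the real HTML depends on the target site:
#     re_link.findall('<a href="/book/123/456.html">Chapter 1</a>')  ->  ['/book/123/456.html']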

class Bug_pa(object):

    def __init__(self, RootLink):
        """Initialize with the book's link (a one-element list as returned by scrapyRootLink)."""
        self.rootLink = RootLink

    def scarpylink(self):
        """Fetch the URL of every chapter of the book."""
        try:
            Link = self.rootLink[0]
            data = []
            res = requests.get(url=Link, headers=header)
            res.encoding = 'gbk'
            # Parse the page
            res = str(res.text)
            soup = BeautifulSoup(res, 'lxml')
            # Book title
            bookname = soup.select_one('#info > h1').text
            # Chapter links; the first 12 anchors are skipped (presumably a "latest chapters" block on the index page)
            ran = 1
            for htmls in soup.select('#list > dl > dd > a'):
                if ran > 12:
                    htmls = str(htmls)
                    links = re_link.findall(htmls)
                    data.append(Link + ''.join(links))
                else:
                    ran += 1
            if data:
                print('Chapter links fetched successfully')
                return {'bookname': bookname,
                        'links': data}
            else:
                return []
        except Exception:
            print('Error in scarpylink()')
            return []

    def scarpytext(self, url):
        """Fetch the text of a single chapter."""
        try:
            texts = []
            page = requests.get(url=url, headers=header)
            page.encoding = 'gbk'
            soup = BeautifulSoup(page.text, 'lxml')
            selctname = soup.select_one('.bookname > h1').text
            print('Chapter title fetched successfully')
            for text_list in soup.select('#content'):
                for tex in text_list:    # iterate the child nodes to get the raw text
                    tex = str(tex)
                    tex = tex.replace('<br/>', '').replace('\xa0\xa0\xa0\xa0', '')
                    texts.append(tex)
            if texts:
                print('Chapter text fetched successfully!')
                return {'name': selctname,
                        'text': texts}
        except Exception:
            print('Error in scarpytext()')
            return []

    def save(self, bookname, name, booktext):
        """Save a chapter to disk."""
        try:
            print(bookname)
            print(name)
            # If no folder exists for this book yet, create one named after the book
            if not os.path.exists(r'G:\book\-' + str(bookname)):
                os.mkdir(r'G:\book\-' + str(bookname))
            with open(r'G:\book\-' + str(bookname) + r'\-' + str(name) + '.txt', 'a', encoding='utf-8') as fp:
                fp.write(name + '\n')
                for txt in booktext:
                    fp.write(txt + '\n')
            print('Saved successfully')
            return True    # main() checks this return value
        except Exception:
            print('Error in save()')
            return []

    def rand_time(self):
        """Sleep for a random delay."""
        i = random.uniform(0, 30)
        time.sleep(i)

    def main(self):
        """Main routine: fetch the chapter links, then scrape and save each chapter."""
        try:
            # Chapter links of the book
            linkinfo = self.scarpylink()
            i = 1
            for link in linkinfo['links']:
                txt = self.scarpytext(link)
                if txt:
                    if self.save(linkinfo['bookname'], str(i) + '-' + txt['name'], txt['text']):
                        print('Saved OK:', txt['name'])
                else:
                    # txt is empty here, so report the failing link rather than txt['name']
                    print('Failed to fetch chapter:', link)
                i += 1
        except Exception:
            print('Error in main()')

def scrapyRootLink(url):
    """Collect the book links listed on a category page."""
    try:
        links = []
        res = requests.get(url=url, headers=header)
        res.encoding = 'gbk'
        date = BeautifulSoup(res.text, 'lxml')
        for lin in date.select('.s2 > :link'):
            lin = str(lin)
            link = re_link.findall(lin)
            if link:
                links.append(link)
        if links:
            print('Book links fetched successfully')
            return links
        else:
            print('Error in scrapyRootLink()')
            return []
    except Exception as e:
        print(e)
        return []

if __name__ == '__main__':
    rootlinks = []
    for i in range(1, 2):
        # Category page URLs; the URL prefix was stripped from the original post,
        # so only the trailing '%s.html' pattern remains here
        rlink = '%s.html' % i
        rootlinks.append(rlink)
    for rootlink in rootlinks:
        for alink in scrapyRootLink(rootlink):
            shu = Bug_pa(alink)
            shu.main()
            shu.rand_time()     # sleep a random 0-30 seconds after each book
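Since the site's domain and category URL were stripped from the listing above, here is a minimal sketch of how the Bug_pa class could be driven directly for a single book once you fill in the book's index-page URL yourself. The URL below is a placeholder, not from the original article:

    # Hypothetical, hand-filled book index URL -- replace with a real one
    book_index = 'https://example.com/book/123/'
    shu = Bug_pa([book_index])    # Bug_pa expects a one-element list, as returned by scrapyRootLink
    shu.main()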


