
[Original .NET] Celebrating the official release of .NET 7.0: a general-purpose novel-site crawler written in C#

Last edited by jdclang on 2022-11-11 12:58

I've always written crawlers in Python, and Python really is convenient for it, but I dislike (honestly, hate) Python's model for concurrent tasks. Locked down at home during the pandemic, and with .NET 7.0 just released, I used .NET 7 to write a so-called general-purpose novel-site crawler (you supply each site's XPath expressions when initializing the class). Comparing code size with the Python version, it isn't actually much longer. The only way it loses to Python in convenience is that HtmlAgilityPack isn't as pleasant as BeautifulSoup and only supports XPath, although XPath itself is handy enough.
The code looks long, but a good share of it is comments, added to help you follow along. I originally planned to add automatic fetching of a proxy pool, but on reflection the domestic pirate sites don't seem to have any anti-crawler measures, so I didn't bother; modify it yourself if you're interested.
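If a site did push back, the natural hook point is the HttpClientHandler the engine below already creates. A minimal sketch of wiring in a proxy (the address is a placeholder standing in for whatever your pool hands out, not anything from the original post):

[C#]
using System.Net;

// Hypothetical proxy wiring; "http://127.0.0.1:8080" is a placeholder for an
// address drawn from your own proxy pool.
var handler = new HttpClientHandler
{
    Proxy = new WebProxy("http://127.0.0.1:8080"),
    UseProxy = true
};
var httpClient = new HttpClient(handler);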

[C#]
using System.Diagnostics.CodeAnalysis;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using NLog;
using NLog.Targets;

namespace StoryBookSpyder;

public class SpyderEngine
{
    #region Fields and defaults

    // Logger instance
    private static Logger _logger = LogManager.GetCurrentClassLogger();

    // Novel properties
    private string _bookName;    // book title
    private string _encoding;    // page character encoding
    private string _baseAddress; // site root
    private string _url;         // chapter address

    public int _Max_Concurrency { get; set; } = 50;

    // XPath expressions for the content to scrape
    /// <summary>
    /// Book title
    /// </summary>
    public string _Xpath_Title { get; set; } = "/html/body/div[1]/h1/a";

    /// <summary>
    /// Chapter list entries
    /// </summary>
    public string _Xpath_Content { get; set; } = "//*[@class='chapter']/li";

    /// <summary>
    /// Next page of the chapter list
    /// </summary>
    public string _Xpath_Nextpage { get; set; } = "(//*[@class='page'])[1]/a";

    /// <summary>
    /// Chapter body
    /// </summary>
    public string _Xpath_ChapterContent { get; set; } = "(//*[@class='nr_nr'])[1]/div";

    /// <summary>
    /// Next page of the chapter body
    /// </summary>
    public string _Xpath_ChapterContent_Nextpage { get; set; } = "(//td[@class='next']//a)[2]";

    // Chapter list
    private List<Catalogue> bookCatalogues = new();

    static HttpClientHandler handler = new();
    internal static readonly HttpClient httpClient = new(handler);

    #endregion

    #region Engine initialization

    /// <summary>
    /// Initializes the crawler engine and records the site's base address;
    /// incomplete novel page addresses are completed against it.
    /// Pass null for any XPath parameter to keep the default above.
    /// </summary>
    /// <param name="baseAddress">site address</param>
    /// <param name="encoding">page character encoding</param>
    internal SpyderEngine(string baseAddress, string encoding, string? xpath_Title, string? xpath_content,
        string? xpath_nextpage)
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        handler.ServerCertificateCustomValidationCallback =
            HttpClientHandler.DangerousAcceptAnyServerCertificateValidator;
        _baseAddress = baseAddress;
        httpClient.BaseAddress = new Uri(_baseAddress);
        _encoding = encoding;
        if (xpath_Title is not null)
        {
            _Xpath_Title = xpath_Title;
        }
        if (xpath_content is not null)
        {
            _Xpath_Content = xpath_content;
        }
        if (xpath_nextpage is not null)
        {
            _Xpath_Nextpage = xpath_nextpage;
        }
        httpClient.DefaultRequestHeaders.Clear();
        httpClient.DefaultRequestHeaders.Add("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpClient.DefaultRequestHeaders.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        httpClient.DefaultRequestHeaders.Add("Cache-Control", "max-age=0");
        httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive");
        httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1");
        httpClient.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36 Edg/107.0.1418.35");
    }

    #endregion

    #region Crawling

    /// <summary>
    /// Runs the crawler.
    /// </summary>
    /// <param name="url">address of the page holding the chapter list</param>
    public async Task SpyderBook(string url)
    {
        await GetAllCatalogue(url);

        // Print the collected chapter list
        foreach (var bookCatalogue in bookCatalogues)
        {
            Console.WriteLine($"Chapter: {bookCatalogue._title}, URL: {bookCatalogue._catalogueUrl}");
        }

        // Fetch chapter contents from the chapter list, single-threaded:
        // foreach (var bookCatalogue in bookCatalogues)
        // {
        //     var catalogue = await GetChapterContents(bookCatalogue._catalogueUrl);
        //     bookCatalogue._content = catalogue;
        // }

        // Fetch chapter contents from the chapter list, multi-threaded
        Parallel.ForEach(bookCatalogues, new ParallelOptions() { MaxDegreeOfParallelism = _Max_Concurrency },
            (_catalogue, loopState) =>
            {
                try
                {
                    var catalogue = GetChapterContents(_catalogue._catalogueUrl).Result;
                    _catalogue._content = catalogue;
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Something went wrong");
                    Console.ReadLine();
                    _logger.Error($"Error: {ex.Message}, failing chapter: {_catalogue._catalogueUrl}");
                }
            });

        string result = String.Empty;
        foreach (var book in bookCatalogues)
        {
            // Console.WriteLine($"{book._title}---{book._content}");
            result += "\r\n" + book._title + "\r\n" + book._content;
        }

        // Strip junk from the novel text; adjust the patterns to your own needs.
        // The ad strings below are site-specific.
        Regex regex = new Regex("(?<=「).*?(?=」)");
        result = RemoveFragments.RemoveFragmentsBetween(result, '(', ')')
            .RemoveFragmentsBetween('(', ')')
            .RemoveFragmentsBetween('【', '】');
        var match = regex.Matches(result);
        try
        {
            for (int i = 0; i < match.Count; i++)
            {
                result = result.Replace(match[i].Value, match[i].Value.Replace("<br>", ""));
            }
        }
        catch (Exception e) { }
        result = result.Replace(" ", "")
            .Replace("-->>", "")
            .Replace("「「", "」「")
            .Replace("」」", "」「")
            .Replace("第一发布站:xxxx.coм", "")
            .Replace("www.xxxxx.com收藏不迷路!", "")
            .Replace("发布地址:<ref=\"http:www.xxxxx.com\"target=\"_blank\">", "")
            .Replace("</ref=\"http:>", "")
            .Replace("<br><br><br>", "")
            .Replace("<br><br>", "\r\n\r\n    ")
            .Replace("<br>", "");
        Regex regex_again = new Regex("&.*?;");
        result = regex_again.Replace(result, "");
        Console.WriteLine(result);
        Write2Txt(result, $"{AppDomain.CurrentDomain.BaseDirectory}{_bookName}\\{_bookName}.txt");
    }

    /// <summary>
    /// Collects all chapters into the chapter list.
    /// </summary>
    /// <param name="url">page containing the chapter list</param>
    private async Task GetAllCatalogue(string url)
    {
        // Fetch the html of the given page
        var html = await GetHtmlAsync(url);
        if (html is not null)
        {
            // Read the book title
            _bookName = await GetBookName(html);
            if (_bookName is not null)
            {
                // Read the chapter list
                GetBookCatalogue(html);
                // Check whether there is a next page
                var nextpage = await GetNextPage(html);
                // Guard against the same page being reachable under different addresses
                if (!nextpage.Equals(string.Empty) && nextpage != url.Replace(_baseAddress, "").Replace("/index.html", "_1/"))
                {
                    _logger.Info($"Found next page at {nextpage}");
                    await GetAllCatalogue(nextpage);
                }
            }
        }
        else
        {
            _logger.Error($"Task aborted: failed to fetch the chapter list, cannot reach {_url}");
            return;
        }
    }

    #endregion

    #region Fetching html

    /// <summary>
    /// Fetches the html of the given page.
    /// </summary>
    /// <param name="url">target page url</param>
    /// <returns></returns>
    private async Task<string> GetHtmlAsync(string url)
    {
        int _retry = 5;  // retry budget
        bool EOF = true; // retry switch
        string? pageAddress;
        string? responseString = String.Empty;
        if (string.IsNullOrEmpty(url))
        {
            pageAddress = _url;
        }
        else
        {
            pageAddress = url;
        }

        #region HttpRequestMessage pitfall, beware
        // Big pitfall here: an HttpRequestMessage cannot be sent more than once,
        // or it throws. So HttpRequestMessage was abandoned.
        // HttpRequestMessage request =
        //     new HttpRequestMessage(HttpMethod.Get, pageAddress);
        // request.Headers.Add("Accept",
        //     "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        // request.Headers.Add("Cache-Control", "max-age=0");
        // request.Headers.Add("Connection", "keep-alive");
        // // request.Headers.Add("Cookie",
        // //     "X_CACHE_KEY=b42c608e5a7d95dbcd5c1b890fbd5417; PHPSESSID=c13e4def29ec686f997e48184a2209cc");
        // request.Headers.Add("Referer", request.RequestUri.ToString());
        // request.Headers.Add("Upgrade-Insecure-Requests", "1");
        // request.Headers.Add("User-Agent",
        //     "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36 Edg/107.0.1418.35");
        #endregion

        while (EOF)
        {
            HttpResponseMessage? response = null;
            try
            {
                response = await httpClient.GetAsync(pageAddress);
                var responseMessage = response.EnsureSuccessStatusCode();
                if (responseMessage.IsSuccessStatusCode)
                {
                    // Convert the encoding to avoid mojibake in the response
                    var responseBody = await response.Content.ReadAsByteArrayAsync();
                    responseString = Encoding.GetEncoding(_encoding).GetString(responseBody);
                    _logger.Info($"Requested {pageAddress}, status code: {(int) response.StatusCode}, result: {response.ReasonPhrase}");
                    EOF = false;
                }
            }
            catch (HttpRequestException exception)
            {
                _logger.Warn($"Request to {response.RequestMessage.RequestUri} failed: {exception.Message}, retrying in 10 seconds...");
                Thread.Sleep(TimeSpan.FromSeconds(10));
                // Give up after 5 attempts
                _retry--;
                if (_retry == 0)
                {
                    EOF = false;
                    _logger.Error($"Repeated requests to {response.RequestMessage.RequestUri} failed, giving up.");
                }
            }
        }
        return responseString;
    }

    #endregion

    #region Book title

    /// <summary>
    /// Reads the book title; all subsequent work starts from here.
    /// </summary>
    /// <param name="html">html of the novel's index page</param>
    private async Task<string> GetBookName(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        // Parse the content and extract the book title
        var bookName = doc.DocumentNode.SelectSingleNode(_Xpath_Title).InnerText;
        _logger.Info($"Book title: 《{bookName}》");
        return bookName;
    }

    #endregion

    #region Chapter list

    private void GetBookCatalogue(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        // Extract the chapters on the current page.
        // Simplest hand-written XPath; the trailing "li" marks the element
        // to extract and differs from site to site.
        HtmlNodeCollection catalogue = doc.DocumentNode.SelectNodes(_Xpath_Content);
        foreach (var data in catalogue)
        {
            HtmlNode node = HtmlNode.CreateNode(data.OuterHtml);
            HtmlNode a = node.SelectSingleNode("//a");
            string u = a.Attributes["href"].Value;
            // Append the chapter to the chapter list
            Catalogue cl = new Catalogue(data.InnerText, u);
            bookCatalogues.Add(cl);
        }
    }

    private async Task<string?> GetNextPage(string html)
    {
        string? nextpageUrl = String.Empty;
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        // Detect and extract the next-page address.
        // XPath captured with a browser plugin; remember to append the element
        // marker afterwards, here the trailing "/a".
        HtmlNodeCollection page = doc.DocumentNode.SelectNodes(_Xpath_Nextpage);
        foreach (var data in page)
        {
            HtmlNode node = HtmlNode.CreateNode(data.OuterHtml);
            HtmlNode a = node.SelectSingleNode("//a");
            string u = a.Attributes["href"].Value;
            // Console.WriteLine($"{data.InnerText},url={u}");
            if (data.InnerText.Contains("下一页")) // "下一页" is the site's "next page" link text
            {
                nextpageUrl = u;
            }
        }
        return nextpageUrl;
    }

    #endregion

    #region Chapter contents

    /// <summary>
    /// Fetches a chapter's content, following its pagination.
    /// </summary>
    /// <param name="ChapterUrl"></param>
    /// <returns></returns>
    private async Task<string> GetChapterContents(string ChapterUrl)
    {
        bool eof = true;
        // Chapter content accumulator
        string? chapterContents = String.Empty;
        // Fetch the chapter html and read its content
        while (eof)
        {
            var chapterHtml = await GetHtmlAsync(ChapterUrl);
            if (chapterHtml is not null)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(chapterHtml);
                HtmlNodeCollection contentsList = doc.DocumentNode.SelectNodes(_Xpath_ChapterContent);
                foreach (var content in contentsList)
                {
                    // chapterContents += content.InnerText + Environment.NewLine;
                    // Use the line below instead to keep html tags such as <br>
                    // for special handling later
                    chapterContents += content.InnerHtml;
                }
                var nextpage = doc.DocumentNode.SelectSingleNode(_Xpath_ChapterContent_Nextpage);
                if (nextpage.InnerText.Trim().Equals("下一页")) // the site's "next page" link text
                {
                    ChapterUrl = nextpage.Attributes["href"].Value;
                }
                else
                {
                    eof = false;
                }
            }
        }
        return chapterContents;
    }

    #endregion

    private static void Write2Txt(string log, string filepath)
    {
        try
        {
            string folder = filepath.Substring(0, filepath.LastIndexOf('\\'));
            // Create the directory
            if (Directory.Exists(folder) == false)
            {
                Directory.CreateDirectory(folder);
            }
            // Delete the file if it already exists
            if (File.Exists(filepath) == true)
            {
                //FileStream fs = new FileStream(filepath, FileMode.Truncate, FileAccess.ReadWrite);
                //fs.Close();
                File.Delete(filepath);
                File.AppendAllText(filepath, log + "\r\n", Encoding.Default);
            }
            else
            {
                //FileStream fs = File.Create(filepath);
                //fs.Close();
                File.Create(filepath).Close();
                // Write the file contents
                File.AppendAllText(filepath, log + "\r\n", Encoding.Default);
            }
        }
        catch (Exception exception)
        {
            _logger.Error(exception.Message);
        }
    }
}
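The post doesn't include the Catalogue class or the RemoveFragmentsBetween helper the engine relies on. Their shapes can be inferred from the call sites; a minimal sketch that would satisfy them (the implementation details are my guesses, not the author's code):

[C#]
using System.Text;

namespace StoryBookSpyder;

// Minimal chapter record inferred from its usage above: a (title, url)
// constructor plus public _title/_catalogueUrl/_content fields.
public class Catalogue
{
    public string _title;
    public string _catalogueUrl;
    public string? _content;

    public Catalogue(string title, string catalogueUrl)
    {
        _title = title;
        _catalogueUrl = catalogueUrl;
    }
}

// One plausible implementation of the fragment-removal extension used above:
// drops everything between an opening and a closing delimiter, delimiters included.
public static class RemoveFragments
{
    public static string RemoveFragmentsBetween(this string text, char open, char close)
    {
        var sb = new StringBuilder(text.Length);
        int depth = 0;
        foreach (char c in text)
        {
            if (c == open) depth++;                  // entering a fragment
            else if (c == close && depth > 0) depth--; // leaving a fragment
            else if (depth == 0) sb.Append(c);       // keep text outside fragments
        }
        return sb.ToString();
    }
}

As a side note, since the post celebrates .NET 7: the Parallel.ForEach block that calls GetChapterContents(...).Result could be written with Parallel.ForEachAsync (available since .NET 6), which awaits naturally instead of blocking thread-pool threads. A sketch, not the author's code:

[C#]
// Alternative to Parallel.ForEach + .Result inside SpyderBook
await Parallel.ForEachAsync(bookCatalogues,
    new ParallelOptions { MaxDegreeOfParallelism = _Max_Concurrency },
    async (catalogue, ct) =>
    {
        catalogue._content = await GetChapterContents(catalogue._catalogueUrl);
    });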

Usage

[C#]
SpyderEngine spyderEngine = new SpyderEngine("http://m.xxxxxxxxx.net", "GBK", null, null, null);
await spyderEngine.SpyderBook("http://xxxxxxxxxxx/xxxx/xx/1220_1/");
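Passing null keeps the default XPath expressions. For a site with a different layout, pass the site's own expressions instead; a sketch with placeholder XPaths and addresses (not from any real site):

[C#]
// Hypothetical XPaths for a differently structured site
SpyderEngine spyderEngine = new SpyderEngine("http://m.example-novel-site.net", "UTF-8",
    "//div[@id='bookinfo']/h1",    // book title
    "//ul[@id='chapterlist']/li",  // chapter list entries
    "//div[@class='pagelist']/a"); // next page of the chapter list
await spyderEngine.SpyderBook("http://m.example-novel-site.net/book/1234/");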

I won't reveal the actual site address; it took me quite a while to find one that carries the latest chapters of 江山云罗.
