本帖最后由 jdclang 于 2022-11-11 12:58 编辑
之前一直用Python写爬虫,Python写爬虫也确实方便,不过实在不喜欢,甚至讨厌Python在任务并发上的模式,这次疫情被封控在家,加上.NET 7.0发布,顺手用.NET 7写了个所谓的小说网站通用爬虫(在初始化类的时候需要自己提供不同网站的XPATH),跟Python的代码量比较了一下,其实也没多多少内容。唯一没Python方便的就是HtmlAgilityPack没有BeautifulSoup方便,只支持XPATH方式,不过其实XPATH也挺方便的。
代码看着多,其实里面注释差不多占了不少内容,方便各位理解,本来还想加上自动获取代理池的,不过后来想想,国内盗版网站好像都没防爬虫机制,就懒得折腾了,有兴趣的自己修改吧。
[C#] 纯文本查看 复制代码
using System.Diagnostics.CodeAnalysis;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using NLog;
using NLog.Targets;

namespace StoryBookSpyder;

/// <summary>
/// Generic novel-site crawler: fetches a book's chapter list (following list
/// pagination), downloads every chapter concurrently (following in-chapter
/// pagination), strips site-specific junk from the text, and writes the whole
/// book to a .txt file. Per-site XPath expressions are supplied through the
/// public properties or the constructor.
/// </summary>
public class SpyderEngine
{
    #region State

    // Shared logger for the class.
    private static readonly Logger _logger = LogManager.GetCurrentClassLogger();

    private string _bookName;              // book title, set by GetAllCatalogue
    private readonly string _encoding;     // page character encoding, e.g. "GBK"
    private readonly string _baseAddress;  // site root, used to detect fake "next page" loops

    /// <summary>Maximum number of chapters downloaded concurrently.</summary>
    public int _Max_Concurrency { get; set; } = 50;

    /// <summary>XPath of the book-title node.</summary>
    public string _Xpath_Title { get; set; } = "/html/body/div[1]/h1/a";

    /// <summary>XPath of the chapter-list entries.</summary>
    public string _Xpath_Content { get; set; } = "//*[@class='chapter']/li";

    /// <summary>XPath of the chapter-list pager links.</summary>
    public string _Xpath_Nextpage { get; set; } = "(//*[@class='page'])[1]/a";

    /// <summary>XPath of the chapter body nodes.</summary>
    public string _Xpath_ChapterContent { get; set; } = "(//*[@class='nr_nr'])[1]/div";

    /// <summary>XPath of the "next page" link inside a chapter.</summary>
    public string _Xpath_ChapterContent_Nextpage { get; set; } = "(//td[@class='next']//a)[2]";

    // Chapter list accumulated by GetAllCatalogue, consumed by SpyderBook.
    private readonly List<Catalogue> bookCatalogues = new();

    // One shared HttpClient for the process (a client per request exhausts sockets).
    private static readonly HttpClientHandler handler = new();
    internal static readonly HttpClient httpClient = new(handler);

    // Compiled once instead of being re-created on every SpyderBook call.
    private static readonly Regex BracketRegex = new("(?<=「).*?(?=」)", RegexOptions.Compiled);
    private static readonly Regex HtmlEntityRegex = new("&.*?;", RegexOptions.Compiled);

    #endregion

    #region Engine initialization

    /// <summary>
    /// Initializes the crawler: registers legacy code pages (needed for GBK),
    /// disables TLS certificate validation (many pirate sites have broken
    /// certs), and configures browser-like default request headers.
    /// </summary>
    /// <param name="baseAddress">Site root address; relative URLs resolve against it.</param>
    /// <param name="encoding">Page character encoding, e.g. "GBK".</param>
    /// <param name="xpath_Title">Optional override for the title XPath.</param>
    /// <param name="xpath_content">Optional override for the chapter-list XPath.</param>
    /// <param name="xpath_nextpage">Optional override for the pager XPath.</param>
    internal SpyderEngine(string baseAddress, string encoding, string? xpath_Title,
        string? xpath_content, string? xpath_nextpage)
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        handler.ServerCertificateCustomValidationCallback =
            HttpClientHandler.DangerousAcceptAnyServerCertificateValidator;
        _baseAddress = baseAddress;
        httpClient.BaseAddress = new Uri(_baseAddress);
        _encoding = encoding;

        if (xpath_Title is not null) { _Xpath_Title = xpath_Title; }
        if (xpath_content is not null) { _Xpath_Content = xpath_content; }
        if (xpath_nextpage is not null) { _Xpath_Nextpage = xpath_nextpage; }

        httpClient.DefaultRequestHeaders.Clear();
        httpClient.DefaultRequestHeaders.Add("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpClient.DefaultRequestHeaders.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        httpClient.DefaultRequestHeaders.Add("Cache-Control", "max-age=0");
        httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive");
        httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1");
        httpClient.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36 Edg/107.0.1418.35");
    }

    #endregion

    #region Crawl a book

    /// <summary>
    /// Crawls a whole book: collects the chapter list, downloads all chapters
    /// concurrently, cleans the text and writes "&lt;bookName&gt;/&lt;bookName&gt;.txt"
    /// under the application base directory.
    /// </summary>
    /// <param name="url">Page containing the chapter list.</param>
    public async Task SpyderBook(string url)
    {
        await GetAllCatalogue(url);

        // Echo the collected chapter list.
        foreach (var bookCatalogue in bookCatalogues)
        {
            Console.WriteLine($"章节名:{bookCatalogue._title},章节地址:{bookCatalogue._catalogueUrl}");
        }

        // Download chapters concurrently. Parallel.ForEachAsync (await-based)
        // replaces the old Parallel.ForEach + .Result pattern, which blocked
        // thread-pool threads and risked starvation at high concurrency.
        await Parallel.ForEachAsync(bookCatalogues,
            new ParallelOptions { MaxDegreeOfParallelism = _Max_Concurrency },
            async (catalogue, _) =>
            {
                try
                {
                    catalogue._content = await GetChapterContents(catalogue._catalogueUrl);
                }
                catch (Exception ex)
                {
                    // Log and keep going; blocking here (the original called
                    // Console.ReadLine()) would stall the entire crawl.
                    _logger.Error($"出问题了,{ex.Message},出问题的章节是:{catalogue._catalogueUrl}");
                }
            });

    // StringBuilder instead of repeated string concatenation (O(n^2) on a whole book).
        var sb = new StringBuilder();
        foreach (var book in bookCatalogues)
        {
            sb.Append("\r\n").Append(book._title).Append("\r\n").Append(book._content);
        }
        string result = sb.ToString();

        // Strip site-specific junk; adjust the fragments/phrases to your site.
        result = RemoveFragments.RemoveFragmentsBetween(result, '(', ')')
            .RemoveFragmentsBetween('(', ')')
            .RemoveFragmentsBetween('【', '】');

        // Remove <br> tags that appear inside 「…」 dialogue quotes.
        foreach (Match match in BracketRegex.Matches(result))
        {
            result = result.Replace(match.Value, match.Value.Replace("<br>", ""));
        }

        result = result.Replace(" ", "")
            .Replace("-->>", "")
            .Replace("「「", "」「")
            .Replace("」」", "」「")
            .Replace("第一发布站:xxxx.coм", "")
            .Replace("www.xxxxx.com收藏不迷路!", "")
            // NOTE(review): these two literals were mangled by the forum paste;
            // quote escaping reconstructed from the surrounding markup — verify
            // against the target site's actual boilerplate.
            .Replace("发布地址:<ref=\"http:www.xxxxx.com\"target=\"_blank\">", "")
            .Replace("</ref=\"http:>", "")
            .Replace("<br><br><br>", "")
            .Replace("<br><br>", "\r\n\r\n ")
            .Replace("<br>", "");

        // Drop remaining HTML entities (&nbsp; etc.).
        result = HtmlEntityRegex.Replace(result, "");
        Console.WriteLine(result);

        Write2Txt(result, Path.Combine(AppDomain.CurrentDomain.BaseDirectory, _bookName, $"{_bookName}.txt"));
    }

    /// <summary>
    /// Collects the full chapter list into <see cref="bookCatalogues"/>,
    /// recursing through list pagination.
    /// </summary>
    /// <param name="url">Page containing (part of) the chapter list.</param>
    private async Task GetAllCatalogue(string url)
    {
        var html = await GetHtmlAsync(url);
        if (html is null)
        {
            _logger.Error($"任务非正常终止,获取章节列表失败,无法访问{url}");
            return;
        }

        _bookName = GetBookName(html);
        if (_bookName is null)
        {
            return;
        }

        GetBookCatalogue(html);

        // Some sites expose the same page under two URLs; the Replace check
        // avoids recursing forever onto the page we just fetched.
        var nextpage = GetNextPage(html);
        if (!string.IsNullOrEmpty(nextpage)
            && nextpage != url.Replace(_baseAddress, "").Replace("/index.html", "_1/"))
        {
            _logger.Info($"找到下一页,地址{nextpage}");
            await GetAllCatalogue(nextpage);
        }
    }

    #endregion

    #region Fetch a page

    /// <summary>
    /// Fetches a page and decodes it with <see cref="_encoding"/>, retrying up
    /// to 5 times with a 10-second pause on HTTP failures.
    /// </summary>
    /// <param name="url">Target page URL (absolute or relative to the base address).</param>
    /// <returns>The decoded HTML, or null when all retries failed.</returns>
    private async Task<string?> GetHtmlAsync(string url)
    {
        const int maxRetries = 5;

        // Note: HttpRequestMessage instances cannot be resent, which is why the
        // retry loop calls GetAsync directly and headers live on the client.
        for (int attempt = 1; attempt <= maxRetries; attempt++)
        {
            try
            {
                var response = await httpClient.GetAsync(url);
                response.EnsureSuccessStatusCode();

                // Decode bytes with the site's declared encoding to avoid mojibake.
                var responseBody = await response.Content.ReadAsByteArrayAsync();
                var text = Encoding.GetEncoding(_encoding).GetString(responseBody);
                _logger.Info($"请求{url},响应状态码: {(int)response.StatusCode}, 请求结果:{response.ReasonPhrase}");
                return text;
            }
            catch (HttpRequestException exception)
            {
                // Log the URL we requested; the original dereferenced the (possibly
                // null) response here and could throw NullReferenceException.
                _logger.Warn($"请求{url}失败,{exception.Message},10秒后重试...");
                if (attempt == maxRetries)
                {
                    _logger.Error($"多次请求{url}失败,已放弃。");
                    return null;
                }
                // Task.Delay, not Thread.Sleep: don't block the thread in async code.
                await Task.Delay(TimeSpan.FromSeconds(10));
            }
        }
        return null;
    }

    #endregion

    #region Parse pages

    /// <summary>
    /// Extracts the book title from the page via <see cref="_Xpath_Title"/>.
    /// </summary>
    /// <param name="html">HTML of the book's index page.</param>
    private string GetBookName(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var bookName = doc.DocumentNode.SelectSingleNode(_Xpath_Title).InnerText;
        _logger.Info($"获取书名: 《{bookName}》");
        return bookName;
    }

    /// <summary>
    /// Parses the chapter entries on one list page and appends them to
    /// <see cref="bookCatalogues"/>.
    /// </summary>
    private void GetBookCatalogue(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var catalogue = doc.DocumentNode.SelectNodes(_Xpath_Content);
        if (catalogue is null)
        {
            return;
        }

        foreach (var data in catalogue)
        {
            // Re-root each entry so the relative "//a" query stays inside it.
            var node = HtmlNode.CreateNode(data.OuterHtml);
            var anchor = node.SelectSingleNode("//a");
            var href = anchor.Attributes["href"].Value;
            bookCatalogues.Add(new Catalogue(data.InnerText, href));
        }
    }

    /// <summary>
    /// Finds the list pager's "下一页" (next page) link, if any.
    /// </summary>
    /// <returns>The next page's URL, or an empty string when on the last page.</returns>
    private string GetNextPage(string html)
    {
        var nextpageUrl = string.Empty;
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        var page = doc.DocumentNode.SelectNodes(_Xpath_Nextpage);
        if (page is null)
        {
            return nextpageUrl;
        }

        foreach (var data in page)
        {
            var node = HtmlNode.CreateNode(data.OuterHtml);
            var anchor = node.SelectSingleNode("//a");
            var href = anchor.Attributes["href"].Value;
            if (data.InnerText.Contains("下一页"))
            {
                nextpageUrl = href;
            }
        }
        return nextpageUrl;
    }

    #endregion

    #region Fetch chapter content

    /// <summary>
    /// Downloads one chapter's full text, following its "下一页" pagination.
    /// Keeps inner HTML (so &lt;br&gt; tags survive for later cleanup).
    /// </summary>
    /// <param name="ChapterUrl">First page of the chapter.</param>
    private async Task<string> GetChapterContents(string ChapterUrl)
    {
        var chapterContents = new StringBuilder();
        while (true)
        {
            var chapterHtml = await GetHtmlAsync(ChapterUrl);
            if (chapterHtml is null)
            {
                // Fetch failed permanently; the original looped forever here.
                break;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(chapterHtml);

            var contentsList = doc.DocumentNode.SelectNodes(_Xpath_ChapterContent);
            if (contentsList is not null)
            {
                foreach (var content in contentsList)
                {
                    chapterContents.Append(content.InnerHtml);
                }
            }

            var nextpage = doc.DocumentNode.SelectSingleNode(_Xpath_ChapterContent_Nextpage);
            if (nextpage is not null && nextpage.InnerText.Trim().Equals("下一页"))
            {
                ChapterUrl = nextpage.Attributes["href"].Value;
            }
            else
            {
                break;
            }
        }
        return chapterContents.ToString();
    }

    #endregion

    /// <summary>
    /// Writes the text to the given path, creating the directory if needed and
    /// overwriting any existing file.
    /// </summary>
    private static void Write2Txt(string log, string filepath)
    {
        try
        {
            var folder = Path.GetDirectoryName(filepath);
            if (!string.IsNullOrEmpty(folder))
            {
                // CreateDirectory is a no-op when the directory already exists.
                Directory.CreateDirectory(folder);
            }
            // One call replaces the original delete / create / append dance.
            File.WriteAllText(filepath, log + "\r\n", Encoding.Default);
        }
        catch (Exception exception)
        {
            _logger.Error(exception.Message);
        }
    }
}
调用方式
[C#] 纯文本查看 复制代码
// Create the engine for the target site (GBK-encoded pages, default XPaths),
// then crawl the book starting from its chapter-list page.
SpyderEngine spyderEngine = new("http://m.xxxxxxxxx.net", "GBK", null, null, null);
await spyderEngine.SpyderBook("http://xxxxxxxxxxx/xxxx/xx/1220_1/");
具体网站地址就不透露啦,好不容易找到个能爬最新江山云罗的网站