I spent the weekend learning web scraping, got the itch to try it out, and picked a random novel site from a Baidu search to test the waters.
You can search for the novel you want, pick a result when prompted, and the script then saves every chapter to local files in one batch.
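The script only relies on two third-party packages, requests and BeautifulSoup (beautifulsoup4); everything else comes from the standard library. If you have not installed them yet, the one-off setup is roughly:

pip install requests beautifulsoup4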
[Python]
import requests
from bs4 import BeautifulSoup
import os
import threading


# Replace characters that are illegal in Windows file names with full-width lookalikes
def FileName(STR):
    for i, j in ("/／", "\\＼", "?？", "|︱", "\"“", "*＊", "<＜", ">＞"):
        STR = STR.replace(i, j)
    return STR


# Fetch a single chapter
def catchOne(link, name):
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    res = requests.get(url=link, headers=headers)
    res.encoding = 'utf-8'
    # Parse the page
    co = BeautifulSoup(res.text, 'html.parser')
    soups = co.find_all('dd')
    title = co.find('h2').text
    # Collect every <dd> block together with its data-id so they can be sorted
    list = []
    for soup in soups:
        dict1 = {'id': soup['data-id'], 'content': soup.text}
        list.append(dict1)

    # Sort key: the numeric data-id of each block
    def order(e):
        return int(e['id'])
    list.sort(key=order, reverse=False)
    # Delete the useless trailing entries
    del list[-1]
    # del list[-1]
    # Save the chapter to a local file
    for i in list:
        with open('e:\\img\\' + name + '\\' + FileName(title) + '.txt', mode='ab') as f:
            f.write(bytes(i['content'], encoding='utf8'))


# Fetch every chapter and save it locally
def catchAllText(link, name):
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    res = requests.get(url=link, headers=headers)
    res.encoding = 'utf-8'
    # Parse the table of contents
    co = BeautifulSoup(res.text, 'html.parser')
    cos = co.find_all('a', class_='name')
    # Create the folder for this book
    os.mkdir('e:\\img\\' + name)
    for c in cos:
        target = link + c['href']
        title = c.text
        print('Fetching chapter: ' + title)
        t = threading.Thread(target=catchOne, args=(target, name,))
        # run() executes the target in the current thread, so chapters are fetched one by one
        t.run()


# Search for a novel and crawl the chosen result
def catchNovel():
    par = input('Enter a novel name to search for: ')
    link = 'https://www.aixs.la/search.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    data = {'key': par}
    res = requests.post(url=link, headers=headers, data=data)
    res.encoding = 'utf-8'
    # Parse the search results
    co = BeautifulSoup(res.text, 'html.parser')
    cos = co.find_all(class_='bigpic-book-name')
    list = []
    for i in cos:
        dit = {'id': cos.index(i) + 1, 'title': i.text, 'href': 'https://www.aixs.la' + i['href']}
        print(dit)
        list.append(dit)
    if len(list) != 0:
        id = input('Enter the id of the book you want: ')
        # Address of the chosen book
        targetUrl = list[int(id) - 1]['href']
        # Book title
        name = list[int(id) - 1]['title']
        res2 = requests.get(url=targetUrl, headers=headers)
        res2.encoding = 'utf-8'
        # Parse the book page
        co2 = BeautifulSoup(res2.text, 'html.parser')
        # Link to the table of contents
        cos2 = co2.find(class_='tab fl j-content-tab').find_all('a')[1]['href']
        mlUrl = 'https://www.aixs.la' + cos2
        catchAllText(mlUrl, name)
    else:
        print('No results found!')
        catchNovel()


catchNovel()
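When it runs, catchNovel() asks for a search keyword, prints the matching books as numbered dicts, asks which id to fetch, and then writes every chapter to its own .txt file under e:\img\<book name>\. One thing to watch: os.mkdir only creates the per-book subfolder, so the base folder e:\img must already exist, otherwise the script stops with a FileNotFoundError. A minimal guard you could add before calling catchAllText (BASE_DIR is just a name made up here, adjust the path to taste):

[Python]
import os

BASE_DIR = 'e:\\img'                   # base folder the script writes into
os.makedirs(BASE_DIR, exist_ok=True)   # create it if it does not exist yet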