Python爬虫新手,第一个项目是笔趣阁小说,感觉有很多可以改正的地方,欢迎大佬批评。
[Python] 纯文本查看 复制代码
# -*- coding: utf-8 -*-
"""Scraper for novels on www.bqg99.com: fetches chapter pages in sequence
and appends title + body text to a local file."""
import requests
from bs4 import BeautifulSoup
import time

# URL prefix of the book; '944' is the id of "Jian Lai" — to download a
# different book, look up its id on the site and change it here.
first_url = 'https://www.bqg99.com/book/944/'
# Mutable crawl state: 'whole' is the full URL of the next chapter page,
# 'index' is the next chapter number to request.
basic_url = {
    'whole': None,
    'index': 1
}
custom_header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35'
}
# Destination file for the downloaded novel.
output = 'D:/Data/novel.txt'


def update_link(url_dict):
    """Build the URL of the next chapter and advance the chapter index.

    Mutates *url_dict* ({'whole': str|None, 'index': int}) in place and
    returns it for convenience.
    """
    url_dict['whole'] = f"{first_url}{url_dict['index']}.html"
    url_dict['index'] += 1
    print(url_dict['whole'])
    return url_dict


def fetch_text(url_dict, request_header):
    """GET one chapter page and extract its title and body text.

    Returns {'topic': str, 'content': iterator of str}, or None on an
    HTTP 302 redirect — this site redirects when the chapter does not
    exist, i.e. we have read past the last chapter.
    """
    # BUGFIX: the original post had a forum URL fused onto this call and
    # the BeautifulSoup call below ("data = https://www.52pojie.cn/...")
    # — a SyntaxError; restored the plain calls.
    response = requests.get(url=url_dict['whole'], headers=request_header,
                            allow_redirects=False)
    print(f'status = {response.status_code}')
    if response.status_code == 302:
        return None
    soup = BeautifulSoup(response.text, 'lxml')
    article = soup.find(name='div', class_='content')
    chapter_topic = article.h1.text
    content_soup = article.find(name='div', id='chaptercontent',
                                class_='Readarea ReadAjax_content')
    # Drop the first <p>, which holds only the prev/next-chapter nav links.
    content_soup.p.decompose()
    return {
        'topic': chapter_topic,
        'content': content_soup.stripped_strings,
    }


def main():
    """Download chapters one by one until fetch_text signals the end."""
    # BUGFIX: open the output file with a context manager so it is closed
    # even when a request or parse raises mid-download (the original
    # relied on a bare close() at the end).
    with open(file=output, mode='a+', encoding='utf8') as novel:
        link = update_link(basic_url)
        text = fetch_text(url_dict=link, request_header=custom_header)
        while text is not None:
            novel.write(text['topic'])
            novel.write('\n')
            # (removed the no-op "line = line" from the original loop)
            for line in text['content']:
                novel.write(line)
                novel.write('\n')
            novel.write('\n\n')
            # Throttle to 1 request/second to be polite to the server
            # (the site does not appear to block scrapers, but still).
            time.sleep(1)
            link = update_link(link)
            text = fetch_text(url_dict=link, request_header=custom_header)


if __name__ == '__main__':
    main()