
Scraping girl-pic galleries with Python: one folder per set, async, parameter-controlled concurrency

This post was last edited by lifan1995 on 2022-11-6 11:56.

First post from a newcomer; if it breaks any rule I'm not aware of, moderators please feel free to delete it.
I've been lurking for years. After seeing a few crawler write-ups recently, I wrote this code to practise and to learn together with everyone (my Python is entirely self-taught). Please point out anything in the code that could be improved. Thanks, everyone.
The code is purely for learning.
Note:
Don't set the concurrency parameter sem_pic too high — do not disrupt the site's normal operation!!!
Don't set the concurrency parameter sem_pic too high — do not disrupt the site's normal operation!!!
Don't set the concurrency parameter sem_pic too high — do not disrupt the site's normal operation!!!
Important things get said three times.
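For readers new to asyncio: sem_pic caps concurrency because every image download must acquire a slot from the semaphore before it starts, so at most that many requests are in flight at once. Here is a minimal sketch of just that idea (illustrative names only; a sleep stands in for the real HTTP request):

[Python]
import asyncio

# Minimal illustration of how an asyncio.Semaphore caps concurrency.
# sem_pic and download_one mirror the names in the full script below;
# the sleep is a stand-in for the actual download.
sem_pic = asyncio.Semaphore(15)   # at most 15 downloads run at the same time

async def download_one(i):
    async with sem_pic:           # a 16th task waits here until a slot frees up
        await asyncio.sleep(1)    # placeholder for the real request
        print(f'finished {i}')

async def main():
    await asyncio.gather(*(download_one(i) for i in range(100)))

if __name__ == '__main__':
    asyncio.run(main())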

[Python]
import asyncio
import os
import re
import time

import aiohttp
from bs4 import BeautifulSoup

# Absolute path for downloads; it must already exist.
# Examples:
#   Linux:   '/home/lifan/workspace/download/www.tuiimg.com/'
#   Windows: 'E:/picture/test'
root_path = 'E:/picture/test'
url = 'https://www.tuiimg.com/meinv/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35",
    'Pragma': 'no-cache',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    "Content-Type": "text/html;charset=utf-8"
}

# Concurrency limit for file writes
sem_write = asyncio.Semaphore(500)
# Concurrency limit for listing pages (don't request too many at once)
sem_page = asyncio.Semaphore(10)
# Concurrency limit for image downloads. Individual images are 100-400 KB.
# At 5, the first 2 listing pages took 182.47 s at 3-4 MB/s;
# at 15, they took 73.12 s at 7-9 MB/s.
# Overall speed mostly depends on this value (and on the site's and your own bandwidth).
sem_pic = asyncio.Semaphore(15)


async def fetch_content(url):
    """
    Fetch the HTML of a page.
    :param url: page URL
    :return: response body as text
    """
    async with sem_page:
        print(f'Fetching page: {url}')
        max_retries = 3
        attempt = 0
        while True:
            try:
                async with aiohttp.ClientSession(
                        headers=headers, connector=aiohttp.TCPConnector(ssl=False)
                ) as session:
                    async with session.get(url, timeout=10) as resp:
                        return await resp.text()
            except asyncio.TimeoutError:
                if attempt < max_retries:
                    print('Timeout while fetching {}, retrying (attempt {})'.format(url, attempt))
                    attempt += 1
                else:
                    raise


async def write_pic(pic, img_path_name):
    """
    Write one image to disk.
    :param pic: image bytes
    :param img_path_name: full path of the image file
    """
    async with sem_write:
        with open(img_path_name, 'wb') as f:
            f.write(pic)
            print(f'Wrote image: {img_path_name}')


async def download_one(img_url, img_path):
    """
    Download a single image.
    :param img_url: image URL
    :param img_path: folder to store the image in
    """
    async with sem_pic:
        img_name = img_url.split("/")[-1]
        img_path_name = "{}/{}".format(img_path, img_name)
        if not os.path.exists(img_path_name):
            max_retries = 3
            attempt = 0
            while True:
                try:
                    async with aiohttp.ClientSession(
                            headers=headers, connector=aiohttp.TCPConnector(ssl=False)
                    ) as session:
                        async with session.get(img_url, timeout=10) as resp:
                            pic = await resp.read()
                            await write_pic(pic, img_path_name)
                    break
                except asyncio.TimeoutError:
                    if attempt < max_retries:
                        print('Timeout on {}, retrying (attempt {})'.format(img_path_name, attempt))
                        attempt += 1
                    else:
                        raise


async def page_pic(pic_page):
    """
    Handle one gallery: download every image into its own folder.
    :param pic_page: HTML of the gallery page
    """
    bs = BeautifulSoup(pic_page, 'lxml')
    title = bs.find('title').get_text()
    img_path = "{}/{}".format(root_path, title)
    if not os.path.exists(img_path):
        print(f'Creating folder: {img_path}')
        os.mkdir(img_path)
    else:
        print(f'Folder already exists: {img_path}')
    div = bs.find('div', {'class': "content"})
    img_temp_link = div.find('img')['src']
    img_base_link = img_temp_link[0:-5]  # drop the trailing '1.jpg' to get the base image URL
    allText = bs.find('i', id='allbtn').get_text()
    pattern = re.compile(r"\((.*?)\)")
    # the button text contains something like '(x/NN)'; NN is the total image count
    total = pattern.search(allText).group(1).split("/")[1]
    img_urls = []
    for i in range(1, int(total) + 1):
        img_url = img_base_link + str(i) + '.jpg'
        img_urls.append(img_url)
    task_img = [download_one(img_url, img_path) for img_url in img_urls]
    await asyncio.gather(*task_img)


async def page_main(url):
    """
    Handle one listing page: fetch every gallery linked from it.
    :param url: listing page URL
    :return: None
    """
    mainPageText = await fetch_content(url)
    bs = BeautifulSoup(mainPageText, 'lxml')
    a_all = bs.find_all('a', {'class': 'pic'})
    page_urls = []
    for a in a_all:
        page_urls.append(a['href'])
    tasks = [fetch_content(pic_url) for pic_url in page_urls]
    pic_pages = await asyncio.gather(*tasks)
    pics = [page_pic(pic_page) for pic_page in pic_pages]
    await asyncio.gather(*pics)


async def main():
    """
    Walk every listing page.
    """
    await target_folder()
    start = time.time()
    mainPageText = await fetch_content(url)
    bs = BeautifulSoup(mainPageText, 'lxml')
    page_count = bs.find('div', {'class': 'page'}).find('a', {'class': 'end'}).get_text()
    page_urls = []
    for i in range(1, int(page_count) + 1):
        page_url = f'{url}list_{i}.html'
        page_urls.append(page_url)
    tasks = [page_main(page_url) for page_url in page_urls]
    await asyncio.gather(*tasks)
    end = time.time()
    print(f'Elapsed: {end - start:.2f} s')


async def target_folder():
    """
    Check that the target folder exists; it is deliberately not created automatically.
    """
    if not os.path.exists(root_path):
        raise Exception(f'The specified path does not exist: {root_path}')


if __name__ == '__main__':
    # Use this on Linux.
    asyncio.run(main())
    # On Windows the call above reports an error after the run finishes
    # (harmless); you can use these two lines instead:
    # loop = asyncio.get_event_loop()
    # loop.run_until_complete(main())
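A note on the Windows comment at the end of the script: with Python 3.8+ the default Proactor event loop can print a harmless "Event loop is closed" error while asyncio.run() tears down aiohttp connections. One commonly suggested workaround (not verified against this exact script) is to switch to the selector event loop policy before calling asyncio.run():

[Python]
import asyncio
import sys

# Possible workaround for the harmless shutdown error that aiohttp + asyncio.run()
# can print on Windows (Python 3.8+); untested against this particular script.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

asyncio.run(main())  # main() as defined in the script above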
