Python:只下载福吧汇总第三页的美女图片
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
# 获取页面源代码
def get_source(url):
    """Fetch a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string, decoded as UTF-8.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with an error status.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    # The context manager releases the connection even if decoding raises;
    # the timeout keeps a stalled server from hanging the script forever.
    with requests.get(url=url, headers=headers, timeout=30) as resp:
        # Fail loudly instead of handing an error page to the parser.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
# 源代码解析 要求只获取jpg格式的链接
def parse_source(html):
    """Extract the JPG image links found in the article paragraphs.

    Args:
        html: Page source as a string.

    Returns:
        A list of the ``src`` attribute values that end in ``.jpg``
        (case-insensitive), in document order. Empty when nothing matches.
    """
    tree = etree.HTML(html)
    # etree.HTML may yield None when no document could be built
    # (e.g. empty input); guard so xpath is not called on None.
    if tree is None:
        return []
    imgs_url = tree.xpath("/html/body/section/div[1]/div/article/p//img/@src")
    # Match on the real suffix so ".JPG" is accepted and a dot-less
    # string that merely equals "jpg" is not.
    return [img_url for img_url in imgs_url if img_url.lower().endswith('.jpg')]
# 下载jpg图片
def download(jpg_url):
    """Download one image and save it under ``D:/jpg_download/``.

    The file is named after the last path segment of the URL.

    Args:
        jpg_url: Address of the image to download.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with an error status.
        OSError: If the target directory or file cannot be written.
    """
    import os

    save_dir = 'D:/jpg_download/'
    down_name = jpg_url.split('/')[-1]
    with requests.get(url=jpg_url, timeout=30) as resp:
        # Do not save an HTTP error body to disk as if it were an image.
        resp.raise_for_status()
        content = resp.content
    # Create the target directory on first use; exist_ok makes this safe
    # when several worker threads reach this line at the same time.
    os.makedirs(save_dir, exist_ok=True)
    with open(save_dir + down_name, mode='wb') as f:
        f.write(content)
    print(down_name, '下载完毕…')
# 主程序,线程池下载
def main():
    """Download every JPG linked from the target page using a thread pool.

    Fetches the page, extracts the JPG links, and submits one download
    task per link to a pool of 20 worker threads. A failed download is
    reported on stdout and does not stop the remaining ones.
    """
    url = 'https://fuliba2021.net/2021177.html/3'
    html = get_source(url)
    jpg_urls = parse_source(html)
    with ThreadPoolExecutor(20) as t:
        # Keep the futures: an exception raised inside a worker is stored
        # on its future and would otherwise vanish without a trace.
        futures = {t.submit(download, jpg_url): jpg_url for jpg_url in jpg_urls}
        for future, jpg_url in futures.items():
            # exception() waits for the task to finish, then returns the
            # exception it raised, or None on success.
            exc = future.exception()
            if exc is not None:
                print(jpg_url, '下载失败:', exc)
# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
最近又学了线程池,于是汇总一下自己所学到的爬虫知识,写了一个批量下载2022第一期福利汇总第三页图片的脚本。最近的三次元图片太给力啦,美胸美腿都有。