教程搬运工,来源:爬虫入门_6:高性能异步爬取及案例实战 - 知乎
目的:在爬虫中使用异步实现高性能的数据爬取操作
异步爬取的方式:
import requests
def get_content(url):
    """Download *url* and return the raw response body as bytes.

    Returns None (explicitly) when the status code is not 200.
    NOTE(review): relies on the module-level ``headers`` dict that the
    ``__main__`` guard defines before calling this — confirm call order.
    """
    print("正在爬取:", url)
    # requests.get is a blocking call: it returns only after the full response
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content
    return None  # was an implicit fall-through; made explicit for clarity
def parse_content(content):
    """Report how many bytes/items the downloaded body contains."""
    size = len(content)
    print("响应数据的长度为: ", size)
if __name__ == '__main__':
    # UA spoofing: pack the request headers into a dict
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    # The markdown paste had wrapped each URL in <...>, which would make
    # requests target an invalid address — the angle brackets are stripped here.
    urls = [
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16476.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16473.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202112/jianli16531.rar',
    ]
    # Serial baseline: each download blocks before the next one starts
    for url in urls:
        content = get_content(url)
        parse_content(content)
代码如下,其中以单线程串行的方式执行后的耗时为8秒,线程池方式执行后的耗时为2秒,大大地减少了运行时间。
import time
from multiprocessing.dummy import Pool # 导入线程池模块对应的类
def get_page(name):
    """Simulate one slow, blocking download of *name* (takes ~2 s).

    The parameter was originally named ``str``, which shadows the builtin;
    renamed (callers pass it positionally via pool.map, so this is safe).
    """
    print("正在下载: ", name)
    time.sleep(2)  # stand-in for a slow, blocking network transfer
    print("下载成功:", name)
if __name__ == '__main__':
    # UA headers kept for parity with the other examples (unused in this demo)
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    name_list = ['xiaozi', 'aa', 'bb', 'cc']
    start_time = time.time()
    # 1. Serial, single-threaded execution (~8 s for 4 items):
    # for name in name_list:
    #     get_page(name)
    # 2. Thread-pool execution (~2 s: the four sleeps overlap)
    pool = Pool(4)
    # Hand every element of name_list to get_page, one per worker
    pool.map(get_page, name_list)
    pool.close()  # no more tasks will be submitted...
    pool.join()   # ...wait for the workers to drain (was missing: pool leaked)
    end_time = time.time()
    print('%d second' % (end_time - start_time))
地址:梨视频网站
实现代码如下:
import requests
import time
from lxml import etree
from multiprocessing.dummy import Pool # 导入线程池模块对应的类
def get_video_data(dic):
    """Download one video and persist it under ./result/.

    ``dic`` must carry 'url' (the download link) and 'name' (the target
    filename). NOTE(review): relies on the module-level ``headers`` dict
    for the UA string — confirm it is defined before the pool runs this.
    """
    import os  # local import so the file's top-level import block stays untouched
    url = dic['url']
    print(dic['name'], '正在下载......')
    data = requests.get(url=url, headers=headers).content
    # Original crashed with FileNotFoundError when ./result/ did not exist
    os.makedirs('./result', exist_ok=True)
    # Persist the payload to disk
    with open('./result/' + dic['name'], 'wb') as fp:
        fp.write(data)
    print(dic['name'], '下载成功!!!')
if __name__ == '__main__':
    # UA spoofing: pack the request headers into a dict
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    # Thread-pool ground rule: hand the pool only blocking, time-consuming work.
    # Step 1: fetch the listing page and parse out each video's detail URL/name.
    # (The markdown paste had wrapped every URL in <...>, which would make all
    # requests fail — the angle brackets are stripped throughout.)
    url = 'https://www.pearvideo.com/category_5'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
    urls = []  # one {'name': ..., 'url': ...} dict per video
    for li in li_list:
        href = li.xpath('./div/a/@href')[0]
        video_id = href.split('_')[-1]  # numeric content id from the href
        video_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'  # video title
        # The status endpoint rejects requests without a matching Referer
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Referer': 'https://www.pearvideo.com/' + href
        }
        detail_url = 'https://www.pearvideo.com/videoStatus.jsp?contId={}'.format(video_id)
        video_json = requests.get(url=detail_url, headers=headers).json()
        video_url = video_json['videoInfo']['videos']['srcUrl']
        # The returned srcUrl embeds a throwaway timestamp segment; replacing it
        # with 'cont-<id>' yields the actually playable address
        video_url = video_url.replace(video_url.split('/')[-1].split('-')[0], 'cont-%s' % video_id)
        # Collect the link + name for the download workers
        dic = {
            'name': video_name,
            'url': video_url
        }
        urls.append(dic)
    # Step 2: fetch the video payloads through the pool (the slow, blocking part)
    start_time = time.time()
    pool = Pool(4)
    pool.map(get_video_data, urls)
    pool.close()
    pool.join()
    end_time = time.time()
    print('total time is %d second' % (end_time - start_time))
运行结果:
