最新python request使用异步协程async/await详解

【注意:此文章为博主原创文章!转载需注意,请带原文链接,至少也要是txt格式!】
在看本文之前,你必须要弄明白async/await的原理,可以参考:超级详细解释 Python 中的 async await 概念 虽然你也可以直接看本文,但是建议你还是先理解理解概念。
这里首先要确保你的python是3.9+的最新版本,其次请一定安装aiohttp,也要确保是最新版本。
首先我们通过一个大量的请求来做一个实验,【关于单一异步协程】代码如下:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File : 协程.py
# @Time : 2021/8/19
# @Desc : Baseline: async requests issued strictly one after another
#         (each request is awaited before the next one starts).
import asyncio
import time
import aiohttp
import ujson

# Browser-like request headers so the target server treats us as a normal client.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Connection": "close",
    "User-Agent": "Mozilla/5.0 (Windows NT 11.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4515.131 Safari/537.36 Edg/95.0.902.73",
    "Sec-Fetch-Site": "none", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate",
    "Sec-Fetch-Mode": "navigate",
    "sec-ch-ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"92\"",
    "sec-ch-ua-mobile": "?0", "Cache-Control": "max-age=0", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-User": "?1",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"}
cookies = {"siteToken": "abc", "siteUserId": "efd"}


async def main():
    """Post one request per shop id (100000-100499) and print matches.

    Requests are sequential: the coroutine awaits each response fully
    before issuing the next request.
    """
    async with aiohttp.ClientSession(headers=headers, cookies=cookies) as session:
        for num in range(100000, 100500):
            paramsPost = {"pregnantCycle": "", "s": "{}".format(num)}
            url = 'https://www.baidu.com/?queryMemberList'  # no placeholders -> plain string
            # Fix: the JSON parse used to sit OUTSIDE the try, so any error
            # page / non-JSON body crashed the entire run. Keep the whole
            # response handling inside the try.
            try:
                async with session.post(url, data=paramsPost, timeout=20) as resp:
                    status = resp.status
                    data = ujson.loads(await resp.text())
                    if status == 200 and data['data']['list']:
                        print("【当前商铺ID:{} --- 会员总数量:{}】\n[举例其中一条数据]:会员昵称--{},"
                              "会员真实姓名--{},会员手机--{}".format(num, data['data']['pageCond']['count'],
                                                           data['data']['list'][0]['nickname'],
                                                           data['data']['list'][0]['truename'],
                                                           data['data']['list'][0]['mobile']))
            except Exception as e:
                print(e)
            if num % 100 == 0:
                print("已遍历至商户ID:{}".format(num))


if __name__ == '__main__':
    tt = time.time()
    asyncio.run(main())
    print(time.time() - tt)
上面的脚本最终运行时间:193.5414 秒。注意上面的代码,结果默认是str格式,转换json一定要用ujson,处理字符速度贼快。
利用 asyncio 提高性能,在上面的案例中,我们await在每个单独的 HTTP 请求之后使用,这并不理想。我们可以改为“同时”将所有这些请求作为异步任务运行,然后在最后检查结果,使用asyncio.ensure_future和asyncio.gather。
如果实际发出请求的代码被分解为自己的协程函数,我们可以创建一个任务列表,由每个请求的任务组成。然后我们可以将这个列表解压到一个gather调用中,该调用将它们一起运行。当我们await调用 时asyncio.gather,我们将返回所有传入的任务的可迭代对象,并保持它们在列表中的顺序。
【关于多任务异步协程】代码如下:
# -*- coding:utf-8 -*-
# __author__ = shrimp
# __Blog__ = https://woj.app
# __Date__ = 2021/8/19
# __ver__ = python3
# Concurrent version: all 500 requests are scheduled as tasks and
# collected with asyncio.gather.
import asyncio
import time
import aiohttp
import ujson

# Browser-like request headers so the target server treats us as a normal client.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Connection": "close",
    "User-Agent": "Mozilla/5.0 (Windows NT 11.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4515.131 Safari/537.36 Edg/95.0.902.73",
    "Sec-Fetch-Site": "none", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate",
    "Sec-Fetch-Mode": "navigate",
    "sec-ch-ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"92\"",
    "sec-ch-ua-mobile": "?0", "Cache-Control": "max-age=0", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-User": "?1",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"}
cookies = {"siteToken": "abc", "siteUserId": "efd"}


async def get_pokemon(session, url, paramsPost):
    """POST one request; return (parsed JSON body, HTTP status)."""
    async with session.post(url, data=paramsPost, timeout=20) as resp:
        status = resp.status
        data = ujson.loads(await resp.text())
        return data, status


async def main():
    """Fan out all requests concurrently, then process results in order."""
    task_list = []
    async with aiohttp.ClientSession(headers=headers, cookies=cookies) as session:
        for num in range(100000, 100500):
            paramsPost = {"pregnantCycle": "", "s": "{}".format(num)}
            url = 'https://www.baidu.com/?queryMemberList'
            # create_task (modern replacement for ensure_future) schedules the
            # coroutine immediately; all requests end up in flight together.
            task_list.append(asyncio.create_task(get_pokemon(session, url, paramsPost)))
        # return_exceptions=True keeps one failed request (timeout, bad JSON)
        # from cancelling the whole batch.
        results = await asyncio.gather(*task_list, return_exceptions=True)
    # gather preserves submission order, so zipping with the same range
    # restores the shop id for each result.
    # Fix: the original formatted the leftover loop variable `num` here,
    # which printed 100499 for every single row.
    for num, result in zip(range(100000, 100500), results):
        if isinstance(result, Exception):
            print(result)
            continue
        data, status = result
        try:
            if status == 200 and data['data']['list']:
                print("【当前商铺ID:{} --- 会员总数量:{}】\n[举例其中一条数据]:会员昵称--{},"
                      "会员真实姓名--{},会员手机--{}".format(num, data['data']['pageCond']['count'],
                                                   data['data']['list'][0]['nickname'],
                                                   data['data']['list'][0]['truename'],
                                                   data['data']['list'][0]['mobile']))
        except Exception as e:
            print(e)


if __name__ == '__main__':
    tt = time.time()
    asyncio.run(main())
    print(time.time() - tt)
上面的脚本最终运行时间:19.5669 秒,运行了多次,就在这个结果附近。
这里经过调试发现,上面的循环共运行500次,当第一次进行session.post请求时,必然需要等待对方系统返回相应的数据,这个时候任务会进行下一个请求。 大家可以这么理解,我请求1之后,你愿意多久返回数据就多久,你慢慢返回数据,我直接去请求2,如果此时1返回数据了,我就先回来处理1的数据,2的请求继续等待,以此类推。【个人理解,其实也就是瞬间发出去500个请求,然后看哪个返回数据,就先处理哪个任务,毕竟发送请求不怎么耗费IO】
有点夸张,快了10倍。本着求真务实的原则,我又用多线程(设置40个线程),不用协程,脚本如下:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File : 多线程.py
# @Time : 2021/8/19
# Thread-pool version for comparison with the coroutine scripts.
import concurrent.futures
import time
import requests

# NOTE(review): this Session is shared by all 40 worker threads; requests is
# commonly used this way, but Session is not documented as fully thread-safe
# — confirm for production use.
session = requests.Session()
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Connection": "close",
    "User-Agent": "Mozilla/5.0 (Windows NT 11.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4515.131 Safari/537.36 Edg/95.0.902.73",
    "Sec-Fetch-Site": "none", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate",
    "Sec-Fetch-Mode": "navigate",
    "sec-ch-ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"92\"",
    "sec-ch-ua-mobile": "?0", "Cache-Control": "max-age=0", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-User": "?1",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"}
cookies = {"siteToken": "abc", "siteUserId": "efd"}


def main():
    # Standard entry point: kick off the threaded crawl.
    ret = async_task(0)
    print(ret)


def async_task(x):
    """Fan the 500 shop-id requests out over a 40-thread pool."""
    task_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as pool:
        for i in range(100000, 100500):
            task_list.append(pool.submit(long_time_task, x, i))
        for future in concurrent.futures.as_completed(task_list):
            # as_completed only yields finished futures, so .result() does not
            # block here; it also re-raises any exception from the worker.
            future.result()


def long_time_task(a1, shop_id):
    """Fetch one shop's member list and print matches.

    `a1` is an unused placeholder kept for the pool.submit call shape.
    Fix: the original built paramsGet from the undefined name `num`,
    raising NameError in every worker; the intended value is the shop id.
    (Also renamed the parameter from `id`, which shadowed the builtin.)
    """
    paramsGet = {"pregnantCycle": "", "s": "{}".format(shop_id)}
    response = session.get("https://www.baidu.com/?queryMemberList", params=paramsGet,
                           headers=headers, cookies=cookies, timeout=20)
    try:
        body = response.json()  # parse once instead of five separate calls
        if response.status_code == 200 and body['data']['list']:
            print("【当前商铺ID:{} --- 会员总数量:{}】\n[举例其中一条数据]:会员昵称--{},"
                  "会员真实姓名--{},会员手机--{}".format(shop_id, body['data']['pageCond']['count'],
                                               body['data']['list'][0]['nickname'],
                                               body['data']['list'][0]['truename'],
                                               body['data']['list'][0]['mobile']))
        if shop_id % 500 == 0:
            print("已遍历至商户ID:{}".format(shop_id))
    except Exception as e:
        print(e)
    return


if __name__ == '__main__':
    tt = time.time()
    main()
    print(time.time() - tt)
这个纯粹就是多线程,大家猜猜运行时长是多少???
运行多次结果都在13--19秒之间。。。
此刻的我,破口大骂,我CTMD协程,浪费感情么不是。。。你瞬间发N多个请求,对方不封你IP?因为你没办法控制协程任务数量啊!!!但是多线程你能啊,所以,,,老子要你有个乱用?!
如果多线程稍微设置多一点,是不是就和协程一样了,其实不然,对本机器所耗费的资源还是不同的。协程相对来说能比多线程节省一些资源。
布施恩德可便相知重
微信扫一扫打赏
支付宝扫一扫打赏