Python 异步协程
环境:python3.7.0
什么是协程
协程,英文名称为 Coroutine,又称微线程、纤程,是一种用户态的轻量级线程。
协程本质上是单进程的。相对于多线程,协程没有线程上下文切换的开销,没有原子操作加锁和同步的开销,编程模型也非常简单。
可以用协程来实现异步操作:发出一个请求之后,需要等待一定的时间才能得到响应;在这段等待过程中,程序可以先去做许多其他的事情,等得到响应之后再切换回来继续处理。这样可以充分利用 CPU 等资源,这就是异步协程的优势。
异步协程可以减少爬虫爬取文件所耗费的时间。
其中使用aiohttp库
sudo pip3 install aiohttp
Name: aiohttp
Version: 3.4.1
Summary: Async http client/server framework (asyncio)
Home-page: https://github.com/aio-libs/aiohttp
Author: Nikolay Kim
Author-email: fafhrd91@gmail.com
License: Apache 2
Location: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages
Requires: yarl, attrs, multidict, chardet, async-timeout
Required-by:
以下是使用异步协程与不使用异步协程爬取 unsplash 首页图片的对比。
unsplash.py
import requests
import os
import re
from time import time


class Spider(object):
    """Blocking crawler: downloads the latest images from the unsplash front page.

    Reconstructed from the garbled original; mirrors the asynchronous
    version of this script so the two timings are comparable.
    """

    def __init__(self, n=10):
        """n: number of result pages to crawl (12 images per page)."""
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        path = './download/unsplash'
        # makedirs(exist_ok=True) also creates a missing './download'
        # parent directory, which plain os.mkdir() would crash on.
        os.makedirs(path, exist_ok=True)
        self.path = path
        self.n = n
        self.num = 1  # running counter used in the progress message

    def getImagesLinks(self, page: int):
        """Return the list of full-size image URLs on the given result page."""
        url = 'https://unsplash.com/napi/photos'
        params = {
            'page': page,
            'per_page': '12',
            'order_by': 'latest'
        }
        links = []
        try:
            r = requests.get(url=url, params=params, timeout=60)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            for data in r.json():
                links.append(data['urls']['full'])
        except Exception as e:
            print(e.args)
        finally:
            # Best-effort: an empty list is returned on any failure.
            return links

    def save_img(self, url):
        """Fetch one image; return its raw bytes, or b'' on failure.

        The failure default must be bytes (not str): the caller writes the
        result to a file opened in binary mode.
        """
        content = b''
        try:
            r = requests.get(url, headers=self.headers, timeout=60)
            r.raise_for_status()
            content = r.content
        except Exception as e:
            print(e.args)
        finally:
            return content

    def download_img(self, url: str):
        """Download one image into self.path, skipping already-present files."""
        try:
            # '?' must be escaped inside the pattern: a bare '?' after '|'
            # is an invalid quantifier and makes re.split raise re.error.
            url_split = re.split(r'/|\?', url)
            # url_split[3] is the photo id segment of the unsplash URL.
            filename = url_split[3] + '.jpg'
            if os.path.exists(self.path + '/' + filename):
                print('下载失败,文件已存在')
            else:
                with open(self.path + '/' + filename, 'wb') as f:
                    f.write(self.save_img(url))
                print('成功下载第%d张图片' % (self.num))
                self.num += 1
        except Exception as e:
            print(e.args)

    def run(self):
        """Crawl pages 1..n sequentially, downloading each image in turn."""
        try:
            for i in range(1, self.n + 1):
                urls = self.getImagesLinks(i)
                for url in urls:
                    self.download_img(url)
        except Exception as e:
            print(e.args)


def main():
    """Time a single-page crawl with the blocking spider."""
    start = time()
    spider = Spider(n=1)
    spider.run()
    end = time()
    print(end - start, 's')


if __name__ == '__main__':
    main()
2.png
unsplash.py 爬取了 unsplash 首页的12张图片,共耗时 62.375s。
async_unsplash.py
import requests
import re
import os
import asyncio
import aiohttp
from time import time
class Spider(object):
    """Asynchronous crawler: downloads the latest unsplash front-page images.

    The page listing is fetched synchronously with `requests`; the image
    bodies themselves are fetched concurrently with `aiohttp` coroutines,
    which is where the speed-up over the blocking version comes from.
    """

    def __init__(self, n=10):
        """n: number of result pages to crawl (12 images per page)."""
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        path = './download/async_unsplash'
        # makedirs(exist_ok=True) also creates a missing './download'
        # parent directory, which plain os.mkdir() would crash on.
        os.makedirs(path, exist_ok=True)
        self.path = path
        self.n = n
        self.num = 1  # running counter used in the progress message

    def getImagesLinks(self, page: int):
        """Return the list of full-size image URLs on the given result page."""
        url = 'https://unsplash.com/napi/photos'
        params = {
            'page': page,
            'per_page': '12',
            'order_by': 'latest'
        }
        links = []
        try:
            r = requests.get(url=url, params=params, timeout=60)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            for data in r.json():
                links.append(data['urls']['full'])
        except Exception as e:
            print(e.args)
        finally:
            # Best-effort: an empty list is returned on any failure.
            return links

    @staticmethod
    def _filename_from_url(url: str):
        """Derive the local file name (photo id + '.jpg') from an image URL."""
        # BUG FIX: the original pattern r'/|?' is invalid (a bare '?' is a
        # quantifier with nothing to repeat) and made re.split raise
        # re.error before the try block could catch anything.
        url_split = re.split(r'/|\?', url)
        # url_split[3] is the photo id segment of the unsplash URL.
        return url_split[3] + '.jpg'

    async def save_img(self, url):
        """Fetch one image asynchronously; return its bytes, or b'' on failure.

        The failure default must be bytes (not str): the caller writes the
        result to a file opened in binary mode.
        """
        content = b''
        try:
            # ssl=False replaces the deprecated verify_ssl=False flag.  The
            # async-with block already closes the session, so the original
            # explicit session.close() was redundant.
            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
                response = await session.get(url, headers=self.headers, timeout=60)
                content = await response.read()
        except Exception as e:
            print(e.args)
        finally:
            return content

    async def download_img(self, url: str):
        """Download one image into self.path, skipping already-present files."""
        try:
            filename = self._filename_from_url(url)
            if os.path.exists(self.path + '/' + filename):
                print('下载失败,文件已存在')
            else:
                content = await self.save_img(url)
                with open(self.path + '/' + filename, 'wb') as f:
                    f.write(content)
                print('成功下载第%d张图片' % (self.num))
                self.num += 1
        except Exception as e:
            print(e.args)

    def run(self):
        """Crawl pages 1..n, downloading each page's images concurrently."""
        # Reuse one event loop for every page instead of fetching it anew
        # inside the loop body.
        loop = asyncio.get_event_loop()
        try:
            for i in range(1, self.n + 1):
                urls = self.getImagesLinks(i)
                if not urls:
                    # asyncio.wait() raises ValueError on an empty task set.
                    continue
                tasks = [asyncio.ensure_future(self.download_img(url)) for url in urls]
                loop.run_until_complete(asyncio.wait(tasks))
        except Exception as e:
            print(e.args)
def main():
    """Run a single-page crawl with the asynchronous spider and report the elapsed time."""
    started = time()
    Spider(n=1).run()
    elapsed = time() - started
    print(elapsed, 's')


if __name__ == '__main__':
    main()
1.png
async_unsplash.py 爬取了 unsplash 首页的12张图片,共耗时 16.740s。
通过上述两个事例,可以很明显看出异步协程对爬虫爬取文件速度的提升,对别的程序运行也有同样的作用
运行异步协程程序的时候,内存的占有比不用时大