Python 异步协程
环境:python3.7.0
什么是协程
协程,英文名称为 Coroutine,又称微线程、纤程,是一种用户态的轻量级线程。
协程本质上是单进程的。相对于多线程,协程没有线程上下文切换的开销,没有原子操作加锁和同步的开销,编程模型也非常简单。
可以用协程来实现异步操作:发出一个请求之后,需要等待一定的时间才能得到响应;在这段等待过程中,程序可以先去做许多其他的事情,等得到响应之后再切换回来继续处理。这样可以充分利用 CPU 等资源,这就是异步协程的优势。
异步协程可以减少爬虫爬取文件所耗费的时间。
其中使用aiohttp库
sudo pip3 install aiohttp
Name: aiohttp
Version: 3.4.1
Summary: Async http client/server framework (asyncio)
Home-page: https://github.com/aio-libs/aiohttp
Author: Nikolay Kim
Author-email: fafhrd91@gmail.com
License: Apache 2
Location: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages
Requires: yarl, attrs, multidict, chardet, async-timeout
Required-by:
以下是使用异步协程与不使用异步协程爬取 unsplash 首页图片的对比。
unsplash.py
import requests
import os
import re
from time import time


class Spider(object):
    """Blocking crawler: downloads the latest images from the unsplash front page.

    Reconstructed from the garbled original; mirrors the asynchronous
    version of this script so the two timings are comparable.
    """

    def __init__(self, n=10):
        """n: number of result pages to crawl (12 images per page)."""
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        path = './download/unsplash'
        # makedirs(exist_ok=True) also creates a missing './download'
        # parent directory, which plain os.mkdir() would crash on.
        os.makedirs(path, exist_ok=True)
        self.path = path
        self.n = n
        self.num = 1  # running counter used in the progress message

    def getImagesLinks(self, page: int):
        """Return the list of full-size image URLs on the given result page."""
        url = 'https://unsplash.com/napi/photos'
        params = {
            'page': page,
            'per_page': '12',
            'order_by': 'latest'
        }
        links = []
        try:
            r = requests.get(url=url, params=params, timeout=60)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            for data in r.json():
                links.append(data['urls']['full'])
        except Exception as e:
            print(e.args)
        finally:
            # Best-effort: an empty list is returned on any failure.
            return links

    def save_img(self, url):
        """Fetch one image; return its raw bytes, or b'' on failure.

        The failure default must be bytes (not str): the caller writes the
        result to a file opened in binary mode.
        """
        content = b''
        try:
            r = requests.get(url, headers=self.headers, timeout=60)
            r.raise_for_status()
            content = r.content
        except Exception as e:
            print(e.args)
        finally:
            return content

    def download_img(self, url: str):
        """Download one image into self.path, skipping already-present files."""
        try:
            # '?' must be escaped inside the pattern: a bare '?' after '|'
            # is an invalid quantifier and makes re.split raise re.error.
            url_split = re.split(r'/|\?', url)
            # url_split[3] is the photo id segment of the unsplash URL.
            filename = url_split[3] + '.jpg'
            if os.path.exists(self.path + '/' + filename):
                print('下载失败,文件已存在')
            else:
                with open(self.path + '/' + filename, 'wb') as f:
                    f.write(self.save_img(url))
                print('成功下载第%d张图片' % (self.num))
                self.num += 1
        except Exception as e:
            print(e.args)

    def run(self):
        """Crawl pages 1..n sequentially, downloading each image in turn."""
        try:
            for i in range(1, self.n + 1):
                urls = self.getImagesLinks(i)
                for url in urls:
                    self.download_img(url)
        except Exception as e:
            print(e.args)


def main():
    """Time a single-page crawl with the blocking spider."""
    start = time()
    spider = Spider(n=1)
    spider.run()
    end = time()
    print(end - start, 's')


if __name__ == '__main__':
    main()
2.png
unsplash.py 爬取了 unsplash 首页的12张图片,共耗时 62.375s。
async_unsplash.py
import requests
import re
import os
import asyncio
import aiohttp
from time import time
class Spider(object):
    """Asynchronous crawler: downloads the latest unsplash front-page images.

    The page listing is fetched synchronously with `requests`; the image
    bodies themselves are fetched concurrently with `aiohttp` coroutines,
    which is where the speed-up over the blocking version comes from.
    """

    def __init__(self, n=10):
        """n: number of result pages to crawl (12 images per page)."""
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        path = './download/async_unsplash'
        # makedirs(exist_ok=True) also creates a missing './download'
        # parent directory, which plain os.mkdir() would crash on.
        os.makedirs(path, exist_ok=True)
        self.path = path
        self.n = n
        self.num = 1  # running counter used in the progress message

    def getImagesLinks(self, page: int):
        """Return the list of full-size image URLs on the given result page."""
        url = 'https://unsplash.com/napi/photos'
        params = {
            'page': page,
            'per_page': '12',
            'order_by': 'latest'
        }
        links = []
        try:
            r = requests.get(url=url, params=params, timeout=60)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            for data in r.json():
                links.append(data['urls']['full'])
        except Exception as e:
            print(e.args)
        finally:
            # Best-effort: an empty list is returned on any failure.
            return links

    @staticmethod
    def _filename_from_url(url: str):
        """Derive the local file name (photo id + '.jpg') from an image URL."""
        # BUG FIX: the original pattern r'/|?' is invalid (a bare '?' is a
        # quantifier with nothing to repeat) and made re.split raise
        # re.error before the try block could catch anything.
        url_split = re.split(r'/|\?', url)
        # url_split[3] is the photo id segment of the unsplash URL.
        return url_split[3] + '.jpg'

    async def save_img(self, url):
        """Fetch one image asynchronously; return its bytes, or b'' on failure.

        The failure default must be bytes (not str): the caller writes the
        result to a file opened in binary mode.
        """
        content = b''
        try:
            # ssl=False replaces the deprecated verify_ssl=False flag.  The
            # async-with block already closes the session, so the original
            # explicit session.close() was redundant.
            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
                response = await session.get(url, headers=self.headers, timeout=60)
                content = await response.read()
        except Exception as e:
            print(e.args)
        finally:
            return content

    async def download_img(self, url: str):
        """Download one image into self.path, skipping already-present files."""
        try:
            filename = self._filename_from_url(url)
            if os.path.exists(self.path + '/' + filename):
                print('下载失败,文件已存在')
            else:
                content = await self.save_img(url)
                with open(self.path + '/' + filename, 'wb') as f:
                    f.write(content)
                print('成功下载第%d张图片' % (self.num))
                self.num += 1
        except Exception as e:
            print(e.args)

    def run(self):
        """Crawl pages 1..n, downloading each page's images concurrently."""
        # Reuse one event loop for every page instead of fetching it anew
        # inside the loop body.
        loop = asyncio.get_event_loop()
        try:
            for i in range(1, self.n + 1):
                urls = self.getImagesLinks(i)
                if not urls:
                    # asyncio.wait() raises ValueError on an empty task set.
                    continue
                tasks = [asyncio.ensure_future(self.download_img(url)) for url in urls]
                loop.run_until_complete(asyncio.wait(tasks))
        except Exception as e:
            print(e.args)
def main():
    """Run a single-page crawl with the asynchronous spider and report the elapsed time."""
    started = time()
    Spider(n=1).run()
    elapsed = time() - started
    print(elapsed, 's')


if __name__ == '__main__':
    main()
1.png
async_unsplash.py 爬取了 unsplash 首页的12张图片,共耗时 16.740s。
通过上述两个事例,可以很明显看出异步协程对爬虫爬取文件速度的提升,对别的程序运行也有同样的作用
运行异步协程程序的时候,内存的占有比不用时大