首页 > 编程知识 正文

用python3爬取代理服务器列表

时间:2023-05-05 20:57:28 阅读:214166 作者:922

from urllib.request import urlopen
from urllib.error import HTTPError
import re


def gethtml(url):
    """Fetch *url* and return the list of ``<td>`` tags on the page.

    Returns:
        A list of bs4 Tag objects (one per table cell), or ``None`` if the
        HTTP request fails or the response cannot be parsed.
    """
    # bs4/lxml are third-party; import lazily so the pure parsing helpers
    # below remain importable even where BeautifulSoup is not installed.
    from bs4 import BeautifulSoup
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    try:
        bsobj = BeautifulSoup(html.read(), 'lxml')
        labellist = bsobj.findAll('td')
    except AttributeError as e:
        print(e)
        return None
    return labellist


def getdict(html):
    """Pair each IP-address cell with the port cell that follows it.

    Args:
        html: iterable of tag-like objects exposing ``get_text()``
            (e.g. the ``<td>`` list returned by :func:`gethtml`).

    Returns:
        dict mapping IP string -> port string.
    """
    # BUG FIX: the original patterns had lost their backslashes —
    # '[0-9]{1,3}.' matched ANY character, and 'd{1,4}d$' matched literal
    # 'd's so no port was ever recognised. Restore raw-string escapes.
    regip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
    regp = re.compile(r'\d{1,5}$')
    pairs = {}   # renamed from 'dict' to avoid shadowing the builtin
    key = None   # guard: a port cell seen before any IP cell is ignored
    for item in html:
        text = item.get_text()
        if regip.match(text):
            key = text
        elif key is not None and regp.match(text):
            pairs[key] = text
    return pairs


def fwrite(dict, fname):
    """Append each ``ip:port`` pair in *dict* to file *fname*, one per line.

    The parameter is (regrettably) named ``dict`` to keep the original
    public signature; avoid shadowing builtins in new code.
    """
    # BUG FIX: the original wrote a literal 'n' instead of a newline
    # (stripped backslash) and never closed the handle on error; a
    # context manager closes it reliably.
    with open(fname, 'a') as fh:
        for key in dict:
            fh.write(key + ':' + dict[key] + '\n')


if __name__ == '__main__':
    url = 'http://www.66ip.cn/'
    fname = 'poxy.txt'
    for page in range(1, 20):
        # Page 1 lives at index.html; subsequent pages at <n>.html.
        if page == 1:
            newurl = url + 'index.html'
        else:
            newurl = url + str(page) + '.html'
        # BUG FIX: the original fetched the bare 'url' every iteration,
        # scraping page 1 nineteen times; fetch the page URL it just built.
        html = gethtml(newurl)
        # gethtml returns None on failure; skip the page instead of crashing.
        if html:
            proxies = getdict(html)
            fwrite(proxies, fname)
        print(u'完成第' + newurl + u'页')
    print(u'写入成功')

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。