
Building a Free IP Proxy Pool API with Scrapy, MongoDB, and FastAPI


Goal

To simplify later crawler deployments and avoid IP bans caused by frequent requests, we build an API that serves free IP proxies.

Approach

Data Crawling

Select websites that offer free IP proxies as data sources, crawl them with Scrapy (once every 5 minutes), and deploy and manage the crawlers with SpiderKeeper.

Data Validation

Not all crawled proxies are usable, so each proxy is validated before insertion; only working proxies are stored.

To keep the pool holding only live proxies, the existing proxies are also re-validated before each crawl and dead ones are deleted.
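The liveness check can be as simple as attempting a TCP connection to ip:port. Below is a minimal sketch using the standard-library socket module; it is a stand-in for the telnetlib call used in the pipeline later in this article (telnetlib is deprecated and was removed in Python 3.13):

import socket

def is_proxy_alive(ip: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to ip:port succeeds within timeout."""
    try:
        with socket.create_connection((ip, int(port)), timeout=timeout):
            return True
    except OSError:
        return False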

Data Storage

Each proxy record carries the fields ip, port, address, is_china (whether the proxy is inside China), and source (which site the proxy was scraped from). MongoDB serves as the persistent store; if faster responses are needed, an in-memory database such as Redis could be used instead.
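A stored document then looks roughly like the illustrative record below. A compound unique index on (ip, port) is one way to block duplicates at the database level; note that this index is my own suggestion, not part of the original pipeline:

# Illustrative document shape (values are made up):
# {"ip": "1.2.3.4", "port": "8080", "address": "Beijing",
#  "is_china": True, "source": "7yip"}

from pymongo import MongoClient, ASCENDING

client = MongoClient("localhost", 27017)   # assumed local instance
collection = client.ipproxy["7yip"]
# Enforce uniqueness at the database level instead of checking in code:
collection.create_index([("ip", ASCENDING), ("port", ASCENDING)], unique=True)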

API Design

The API returns the number of proxies requested via a num query parameter (default 1, maximum 100); it is implemented with FastAPI.
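A call and its JSON response would look roughly like this (the host and proxy values are illustrative, and the /fastapi/ipproxy prefix assumes the router mounting shown in main.py further below):

GET http://<host>/fastapi/ipproxy/china?num=2

{
  "msg": "Query succeeded!",
  "code": 200,
  "total": 2,
  "list": [
    {"ip": "1.2.3.4", "port": "8080", "address": "Beijing", "is_china": true, "source": "7yip"},
    {"ip": "5.6.7.8", "port": "3128", "address": "Shanghai", "is_china": true, "source": "89ip"}
  ]
}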

Data Crawling

Analyzing Request URLs

Each source exposes its listing pages through a simple URL template (a parameterization sketch follows the list):

https://www.7yip.cn/free/?action=china&page={页码}

http://www.89ip.cn/index_{页码}.html

https://ip.jiangxianli.com/?page={页码}&country=中国

https://www.kuaidaili.com/free/inha/{页码}/
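Since the sources differ only in their URL templates, the templates can be gathered in one mapping. A hypothetical sketch (the mapping and helper below are my own, not from the original project):

# Hypothetical mapping from source name to its paginated URL template.
URL_TEMPLATES = {
    "7yip":   "https://www.7yip.cn/free/?action=china&page={0}",
    "89ip":   "http://www.89ip.cn/index_{0}.html",
    "jxlip":  "https://ip.jiangxianli.com/?page={0}&country=中国",
    "kuaiip": "https://www.kuaidaili.com/free/inha/{0}/",
}

def page_urls(source: str, max_pages: int = 5):
    """Yield the first max_pages listing URLs for a given source."""
    for page in range(1, max_pages + 1):
        yield URL_TEMPLATES[source].format(page)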

Scrapy Implementation

Set the MongoDB connection parameters and related options in settings.py:

##### settings.py
# MongoDB connection parameters (values redacted by the original author)
MONGODB_HOST = '*.*.*.*'
MONGODB_PORT = *
MONGODB_USER = '*'
MONGODB_PASSWD = '*'
MONGODB_DB = 'ipproxy'

# Ignore robots.txt
ROBOTSTXT_OBEY = False

# Allow responses with non-200 status codes to be processed
HTTPERROR_ALLOW_ALL = True

# Enable the random User-Agent middleware (the built-in module path is
# 'downloadermiddlewares', plural)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'ipproxy.middlewares.RandomUserAgentMiddleware': 400,
}

# Uncomment so the pipeline validates and stores scraped items
ITEM_PIPELINES = {
    'ipproxy.pipelines.IpproxyPipeline': 300,
}

# Pool of User-Agents to pick from at random, to mimic browser traffic
USER_AGENT = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
]

Build the random User-Agent middleware:

##### middlewares.py
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RandomUserAgentMiddleware(UserAgentMiddleware):
    '''Attach a randomly chosen User-Agent to every request.'''

    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT is the list defined in settings.py
        return cls(user_agent=crawler.settings.get('USER_AGENT'))

    def process_request(self, request, spider):
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent

Define the item fields:

##### items.py
import scrapy


class IpproxyItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    address = scrapy.Field()
    is_china = scrapy.Field()
    source = scrapy.Field()

Crawl the listing pages. 7yip is shown as the example; the other sites can reuse this code with only a few parameter changes (see the run command after the code).

##### a7yip.py
# -*- coding: utf-8 -*-
import scrapy


class A7yipSpider(scrapy.Spider):
    name = '7yip'
    allowed_domains = ['www.7yip.cn']
    start_url = 'https://www.7yip.cn/free/?action=china&page={0}'
    max_page_size = 5

    def start_requests(self):
        for i in range(1, self.max_page_size + 1):
            yield scrapy.Request(self.start_url.format(i),
                                 callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        # Skip the header row of the proxy table
        results = response.xpath('/html/body/section/section/div[2]/table/tbody/tr')[1:]
        for record in results:
            fields = record.xpath('td/text()')
            proxy_item = {'source': '7yip', 'is_china': True}
            proxy_item['ip'] = fields[0].extract().strip()
            proxy_item['port'] = fields[1].extract().strip()
            proxy_item['address'] = fields[4].extract().strip()
            yield proxy_item
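The spider can then be run locally from the project root with the standard Scrapy command:

scrapy crawl 7yip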

Process the responses, validate, and write to MongoDB:

##### pipelines.py
import logging
import telnetlib

from pymongo import MongoClient
from scrapy.utils.project import get_project_settings

logging.getLogger().setLevel(logging.INFO)
settings = get_project_settings()


class IpproxyPipeline(object):

    @staticmethod
    def is_valid_ip(ip, port):
        # A proxy counts as usable if a TCP connection succeeds within 2s
        try:
            telnetlib.Telnet(ip, port, timeout=2)
            return True
        except OSError:
            return False

    def open_spider(self, spider):
        # Connect to MongoDB; credentials are passed directly to MongoClient
        # (Database.authenticate() was removed in pymongo 4.x)
        try:
            client = MongoClient(settings['MONGODB_HOST'],
                                 settings['MONGODB_PORT'],
                                 username=settings['MONGODB_USER'],
                                 password=settings['MONGODB_PASSWD'])
            logging.info('Connected to MongoDB!')
        except Exception as e:
            logging.error('MongoDB connection failed: {0}'.format(e))
            raise
        self.collection = client.ipproxy[spider.name]
        # Re-validate the proxies already in the pool and drop dead ones
        for record in self.collection.find():
            if not IpproxyPipeline.is_valid_ip(record['ip'], record['port']):
                self.collection.delete_one({'_id': record['_id']})
                logging.warning('{0}:{1} is dead and has been removed!'.format(
                    record['ip'], record['port']))

    def process_item(self, item, spider):
        # Insert only if the proxy is new and currently reachable;
        # both ip and port belong in the query filter
        exists = self.collection.count_documents(
            {'ip': item['ip'], 'port': item['port']}) > 0
        if not exists and IpproxyPipeline.is_valid_ip(item['ip'], item['port']):
            try:
                self.collection.insert_one(dict(item))
                logging.info('Written to {0}!'.format(spider.name))
            except Exception as e:
                logging.error('Write failed: {0}'.format(e))
        else:
            logging.warning('{0}:{1} is dead or already stored!'.format(
                item['ip'], item['port']))
        return item

    def close_spider(self, spider):
        logging.info('{0} finished crawling!'.format(spider.name))

Deploying and Running the Scheduled Crawler

Deploy and manage the crawlers with SpiderKeeper.
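SpiderKeeper runs on top of scrapyd; a typical local bring-up looks like this (default ports assumed). The periodic job, e.g. every 5 minutes per spider, is then configured in the SpiderKeeper web UI:

pip install scrapyd spiderkeeper scrapyd-client
scrapyd                                        # start the scrapyd service (port 6800)
spiderkeeper --server=http://localhost:6800    # start the SpiderKeeper UI (port 5000)
scrapyd-deploy --build-egg output.egg          # build the project egg to upload in the UI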


Viewing the Proxy Pool in MongoDB
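To spot-check the pool without a GUI client, a short pymongo script will do; a minimal sketch assuming a local, unauthenticated instance:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)   # assumed local instance
for source in ['89ip', '7yip', 'jxlip', 'kuaiip']:
    coll = client.ipproxy[source]
    # Print the proxy count and a small sample per source collection
    print(source, coll.count_documents({}),
          list(coll.find({}, {'_id': 0}).limit(3)))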


API Construction

The API is implemented with FastAPI; the core code is as follows:

##### ipproxy_query.py
# coding=utf-8
'''Query free proxy IPs. Sources: 89ip/7yip/jxlip/kuaiip; storage: MongoDB.'''
import logging

from pymongo import MongoClient

from module.config import base_config  # custom helper class that reads the config file

logging.getLogger().setLevel(logging.INFO)


def query_ipproxy(num: int) -> list:
    '''Fetch up to num free proxies from MongoDB.'''
    config = base_config().config['mongodb']
    # Connect to MongoDB (credentials passed directly to MongoClient;
    # Database.authenticate() was removed in pymongo 4.x)
    try:
        client = MongoClient(config['host'], config['port'],
                             username=config['user'],
                             password=config['password'])
        logging.info('Connected to MongoDB!')
    except Exception as e:
        logging.error('MongoDB connection failed: {0}'.format(e))
        raise
    result, source_list = [], ['89ip', '7yip', 'kuaiip', 'jxlip']
    for source in source_list:
        for proxy in client.ipproxy.get_collection(source).find():
            proxy.pop('_id')   # ObjectId is not JSON-serializable
            result.append(proxy)
    return result[:num]

##### ipproxy_router.py
# coding=utf-8
'''Routes for the free-proxy API; GET only.'''
from fastapi import APIRouter, Query

from module import ipproxy_query as iq

router = APIRouter()


@router.get('/china',
            tags=["Get free HTTP proxies"],
            description="Sources: 89ip/7yip/jxlip/kuaiip. "
                        "http://www.89ip.cn , https://www.7yip.cn/free/ , "
                        "https://ip.jiangxianli.com/ , https://www.kuaidaili.com/free/")
async def get_ipproxy(num: int = Query(1, description='Number of proxies requested',
                                       ge=1, le=100)):
    result = {}
    try:
        ipproxy_data = iq.query_ipproxy(num)
        result['msg'] = 'Query succeeded!'
        result['code'] = 200
        result['total'] = len(ipproxy_data)
        result['list'] = ipproxy_data
    except Exception:
        result['msg'] = 'Query failed!'
        result['code'] = 400
    return result

##### main.py
# -*- coding:utf-8 -*-
'''REST API based on FastAPI.'''
__author__ = 'owl'
from fastapi import FastAPI

from router import ipproxy_router as ipproxy

# OpenAPI documentation settings
app = FastAPI(title="Data management API",
              description="API for retrieving data",
              version="0.1.0",
              openapi_url="/fastapi/data_manger.json",
              docs_url="/fastapi/docs",
              redoc_url="/fastapi/redoc")

# Mount the proxy routes under the service prefix
app.include_router(ipproxy.router, prefix='/fastapi/ipproxy')
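To serve the app locally, a standard uvicorn invocation works (host and port are assumptions); /fastapi/docs then serves the interactive documentation:

pip install fastapi uvicorn
uvicorn main:app --host 0.0.0.0 --port 8000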

Deployed to a server behind an Nginx reverse proxy, the result can be seen at:

http://pandora128.cn/fastapi/docs
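A minimal Nginx reverse-proxy block for this layout might look like the following sketch; the server name and upstream port are assumptions, not taken from the original deployment:

server {
    listen 80;
    server_name pandora128.cn;              # assumed to match the demo URL above

    location /fastapi/ {
        proxy_pass http://127.0.0.1:8000;   # uvicorn from the run command above
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
    }
}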
