
Crawling Baidu Tieba: What Kind of "Bug" Is a Web Crawler?


Tools:

pyspider

Database:

mongodb

Approach:

1. Suppose you want to search Baidu Zhidao (百度知道) answers by two keywords, for example "购物" (shopping) and "价格" (price).
2. Build the crawl URL: the two keywords must first be converted to URL-encoded form (see a URL-encoding tutorial for the details; a small encoding sketch also follows the code below).
3. Extract the URL of every item on the search result list page.
4. Crawl each URL collected in step 3 and extract the question and the best answer from that page.
5. Repeat steps 2, 3 and 4 until all result pages have been processed.

Code:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pyspider.libs.base_handler import *
from urllib.parse import quote, unquote
from pymongo import MongoClient
import datetime
import time
import random

client = MongoClient("your-mongodb-uri")    # fill in your MongoDB connection string
db = client.your_db_name                    # fill in your database name


class Handler(BaseHandler):
    crawl_config = {}

    # Keywords are URL-encoded as GB2312, matching the ie=gbk parameter of Baidu Zhidao search.
    key_word1 = quote("keyword 1".encode("GB2312"))    # your first keyword, e.g. "购物"
    key_word2_list = ["keyword 2"]                      # your second keyword(s), e.g. "价格"
    key_word2_list = [quote(i.encode("GB2312")) for i in key_word2_list]
    url_format = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}"
    page_num = 76       # last page number
    start_page = 0      # first page number
    max_random = 5      # upper bound of the random delay factor
    headers1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Host": "zhidao.baidu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "your browser's User-Agent string",
    }
    fetch_count = 0     # running count of scheduled requests

    @every(minutes=24 * 60)
    def on_start(self):
        start_time = time.time()
        for kw2 in Handler.key_word2_list:
            kw = Handler.key_word1 + "+" + kw2
            for p in range(Handler.start_page, Handler.page_num):
                url = Handler.url_format.format(kw, p * 10)
                Handler.headers1["Referer"] = url
                Handler.fetch_count += 1    # accumulate the request count
                # Push each request further into the future so fetches are spread out.
                start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)
                self.crawl(url, callback=self.index_page, headers=Handler.headers1,
                           save={"url": url, "kw2": unquote(kw2, "GB2312"), "start_time": start_time},
                           exetime=start_time)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        start_time = response.save["start_time"]
        for i in response.doc("#wgt-list > dl").items():
            url = i("dt > a").attr("href")
            Handler.headers1["Referer"] = response.save["url"]
            Handler.fetch_count += 1    # accumulate the request count
            start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)
            self.crawl(url, callback=self.detail_page, headers=Handler.headers1,
                       save={"kw2": response.save["kw2"]}, exetime=start_time)

    @config(priority=2)
    def detail_page(self, response):
        data = {}
        data["url"] = response.url
        data["question"] = response.doc("#wgt-ask > h1").text().strip()
        # Take the best answer first; strip newlines and the "展开全部" ("show all") label.
        data["answer"] = response.doc("div.best-text").text().strip().replace("\n", "").replace("展开全部", "")
        if not data["answer"]:
            # Fall back to an ordinary answer when there is no best answer.
            data["answer"] = response.doc("div.answer-text").text().strip().replace("\n", "").replace("展开全部", "")
        data["kw2"] = response.save["kw2"]
        data["kw1"] = unquote(Handler.key_word1, "GB2312")
        # Current time truncated to whole seconds.
        data["crawl_time"] = datetime.datetime.strptime(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
        print(data)
        # Legacy pymongo API: insert when this (kw2, question) pair is new, otherwise update it.
        if db["your_collection_name"].find({"kw2": data["kw2"], "question": data["question"]}).count() == 0:
            result = db["your_collection_name"].insert(data)
        else:
            result = db["your_collection_name"].update(
                {"kw2": data["kw2"], "question": data["question"]}, data)
        print(result)
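Step 2 of the approach hinges on GB2312 URL encoding of the keywords. Here is a minimal standalone sketch of that encoding; the example keywords "购物" and "价格" come from the approach above, everything else is illustrative only:

from urllib.parse import quote, unquote

url_format = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}"

# Percent-encode the GB2312 bytes of each keyword, as the ie=gbk parameter expects.
kw1 = quote("购物".encode("GB2312"))
kw2 = quote("价格".encode("GB2312"))

print(url_format.format(kw1 + "+" + kw2, 0))   # search URL for the first result page
print(unquote(kw1, "GB2312"))                  # decodes back to 购物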
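The storage step at the end of detail_page uses the legacy pymongo calls find().count(), insert() and update(), which have been removed in pymongo 4.x. As a hedged sketch (the connection string, database and collection names below are placeholders, not from the original post), the same one-document-per-(kw2, question) behaviour can be written as a single upsert with the current API:

from pymongo import MongoClient

client = MongoClient("your-mongodb-uri")                       # placeholder connection string
collection = client["your_db_name"]["your_collection_name"]    # placeholder names

def save_answer(data):
    # Replace the stored document for this (kw2, question) pair, inserting it
    # if it does not exist yet (upsert), so each question is kept only once.
    result = collection.replace_one(
        {"kw2": data["kw2"], "question": data["question"]},
        data,
        upsert=True,
    )
    print(result.upserted_id or result.modified_count)

Collapsing the find/insert/update branches into one replace_one(..., upsert=True) call also avoids the race between the existence check and the write.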

 
