首页 > 编程知识 正文

mmdetection github,如何用爬虫爬取网页上的数据

时间:2023-05-05 13:58:18 阅读:254992 作者:4629

只爬取一个图集里面所有图片,目标地址url:http://www.mm131.com/xinggan/
代码速度还行,效果如下图:

用selenium翻页时:

1、图集翻页:启动浏览器,获取本页所有图集的 title 和 url_photo,保存到 mongodb,然后递归翻页,直到找不到“下一页”按钮就跳出结束(翻页时并没有设置总页数,而是一直点击到最后一页,总共大概154页)
2、图片翻页:再次启动一个浏览器,对每次图集翻页获取到的每个 url_photo 都循环一次,获取每张图片的 url_image(图片翻页时可以获取到总页数,所以直接循环点击,平均一个图集有30张图片)
3、对每一个url,获取一次图片,保存到本地
4、注意图片url有防盗链,下载时需要在请求头中写明自己来自哪里。
5、总共大约 154页 × 20个图集 × 30张 = 图片数量,我没有跑完,你可以试试。
6、selenium3.x已经放弃PhantomJS了,这里运行会出现警告,不过没关系,还是可以运行代码,也可以把这部分修改下。

所有代码如下:

"""Crawl the photo galleries listed under ``tar_url`` with Selenium.

Flow:
1. Page through the gallery list; store every gallery's title and URL in
   MongoDB.
2. Open each gallery in a second browser and page through its pictures.
3. Download every picture (sending a Referer header, because the image
   host rejects hot-linked requests) and save it to local disk.
"""
import os
import re
from hashlib import md5

import pymongo
import requests
from pyquery import PyQuery as pq
from requests import RequestException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Supplies SERVICE_ARGS, MONGO_URL, MONGO_DB and MONGO_TABLE.
from config import *  # noqa: F401,F403

# Link text of the "next page" control on both list and gallery pages.
NEXT_PAGE_TEXT = '下一页'
# Seconds before an HTTP request made through `requests` is abandoned.
REQUEST_TIMEOUT = 10
# How many times a timed-out page turn is retried before giving up.
MAX_RETRIES = 3
# Directory the pictures are written to (raw string: it contains backslashes).
IMAGE_DIR = r'E:\PycharmProjects\Crawler\MM131\photo'
# Headers sent with every picture request; the Referer defeats the
# image host's hot-link protection.
PIC_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.108 Safari/537.36',
    'Referer': 'http://www.mm131.com/xinggan/4210.html',
}

# PhantomJS is deprecated in Selenium 3.x and emits a warning. A headless
# Chrome driver is a drop-in alternative:
#
#     from selenium.webdriver.chrome.options import Options
#     chrome_options = Options()
#     chrome_options.add_argument('--headless')
#     chrome_options.add_argument('--disable-gpu')
#     browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)

tar_url = 'http://www.mm131.com/xinggan/'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser.get(tar_url)
wait = WebDriverWait(browser, 10)


def _click_next_page(waiter):
    """Click the "next page" link and wait for the new page's pager.

    Args:
        waiter: the ``WebDriverWait`` bound to the browser to drive.

    Raises:
        TimeoutException: the link never became clickable, or the
            ``.page_now`` element never appeared afterwards.
    """
    link = waiter.until(EC.element_to_be_clickable((By.LINK_TEXT, NEXT_PAGE_TEXT)))
    link.click()
    waiter.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.page_now')))


def next_to_page():
    """Walk every gallery-list page and crawl all galleries found on it.

    The current page is scraped *before* turning the page, so the first
    list page is included. Paging stops when no visible "next page" link
    remains or when a page turn keeps timing out. The list browser is
    always shut down on exit.
    """
    failures = 0
    page_done = False  # True once the galleries of the current page are handled
    try:
        while True:
            if not page_done:
                for url_photo in get_photo_url():
                    number = get_page_number(url_photo)
                    if number:
                        next_to_photo(url_photo, number)
                page_done = True

            # find_elements returns [] instead of raising when the link is absent.
            next_links = browser.find_elements(By.LINK_TEXT, NEXT_PAGE_TEXT)
            if not any(link.is_displayed() for link in next_links):
                print('所有图集翻页完毕')
                return

            print('图集下一页')
            try:
                _click_next_page(wait)
            except TimeoutException:
                failures += 1
                if failures > MAX_RETRIES:
                    print('图集翻页超时,停止')
                    return
                continue  # retry the click without re-scraping this page
            failures = 0
            page_done = False
    finally:
        browser.quit()


def get_photo_url():
    """Collect the gallery URLs shown on the current list page.

    Every gallery whose URL answers with HTTP 200 is stored in MongoDB
    (``title`` and ``url_photo``) and included in the result.

    Returns:
        list[str]: URLs of the reachable galleries, in page order.
    """
    doc = pq(browser.page_source)
    urls = []
    for item in doc('.list-left.public-box>dd').items():
        url = item.find('a').attr('href')
        if not url:
            continue
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except RequestException:
            # Unreachable or malformed link (e.g. a relative pager href): skip it.
            continue
        if response.status_code == 200:
            save_to_mongo({'title': item.text(), 'url_photo': url})
            urls.append(url)
    return urls


def get_page_number(url_photo):
    """Return how many picture pages the gallery at *url_photo* has.

    The count is the first integer in the text of the ``.page-ch``
    element(s) — presumably a label such as "共30页"; verify against the
    site's markup.

    Returns:
        int | None: the page count, or ``None`` when the page could not be
        fetched or contains no number.
    """
    try:
        doc = pq(url=url_photo, encoding='gbk')
    except Exception as exc:  # pyquery surfaces whatever its HTTP backend raises
        print('获取图片页数失败', url_photo, exc)
        return None
    match = re.search(r'\d+', doc.find('.page-ch').text() or '')
    if match is None:
        print('获取图片页数失败', url_photo)
        return None
    number = int(match.group())
    print('图片页数是:', number)
    return number


def _crawl_gallery(url_photo, number):
    """Open one gallery in its own browser and download all its pictures.

    Raises:
        TimeoutException: a page turn inside the gallery timed out.
    """
    browser_2 = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    try:
        browser_2.set_window_size(1400, 900)
        wait_2 = WebDriverWait(browser_2, 10)
        browser_2.get(url_photo)
        for page in range(1, number + 1):
            doc = pq(browser_2.page_source)
            imageUrl = doc.find('.content-pic>a>img').attr('src')
            print('-------------------------------------')
            download_image(imageUrl)
            if page < number:  # the last page has nothing to turn to
                _click_next_page(wait_2)
    finally:
        # Always release the per-gallery browser, even on failure.
        browser_2.quit()


def next_to_photo(url_photo, number, retries=MAX_RETRIES):
    """Download every picture of the gallery at *url_photo*.

    Args:
        url_photo: URL of the gallery's first page.
        number: number of picture pages in the gallery.
        retries: how many times the gallery is restarted after a timeout.
            Pictures already saved are not rewritten, because files are
            named by content hash.
    """
    print('正在获取图片所有链接')
    for _ in range(retries + 1):
        try:
            _crawl_gallery(url_photo, number)
        except TimeoutException:
            continue
        return
    print('图片翻页超时,放弃', url_photo)


def download_image(imageUrl):
    """Fetch the picture at *imageUrl* and save it to disk.

    The request carries ``PIC_HEADERS`` so that the Referer check of the
    image host passes. Failures are reported and otherwise ignored so one
    bad picture does not abort the crawl.
    """
    if not imageUrl:
        print('未找到图片地址')
        return
    try:
        response = requests.get(imageUrl, headers=PIC_HEADERS, timeout=REQUEST_TIMEOUT)
    except RequestException as exc:
        print('下载失败', imageUrl, exc)
        return
    if response.status_code == 200:
        save_image(response.content)
        print('下载成功', imageUrl)
    else:
        print('下载失败', imageUrl, response.status_code)


def save_image(content, directory=IMAGE_DIR):
    """Write *content* to ``<directory>/<md5 of content>.jpg``.

    Naming by content hash means an identical picture is stored only
    once; an existing file is left untouched.

    Args:
        content: raw bytes of the picture.
        directory: target directory; created if it does not exist.
    """
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def save_to_mongo(result):
    """Insert *result* into the ``MONGO_TABLE`` collection, reporting the outcome."""
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('存到mongodb成功', result)
    except Exception as exc:
        # Best effort: a storage failure must not stop the crawl.
        print('存到mongodb失败', result, exc)


if __name__ == '__main__':
    next_to_page()

# NOTE(review): the constants below are presumably the contents of the
# separate config.py module imported at the top — they are read long before
# this point, so they have no effect here. Confirm and move them to config.py.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
MONGO_URL = 'localhost'
MONGO_DB = '图片'
MONGO_TABLE = '图片'

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。