首页 > 编程知识 正文

表情包爬虫代码,表情包爬虫项目意义

时间:2023-05-06 02:27:57 阅读:280528 作者:4079

一、同步爬虫 import requestsfrom lxml import etreefrom urllib import requestimport osimport reimport timedef get_page_source(link): headers = { 'Referer': 'http://www.doutula.com/photo/list/?page=23', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0', 'Cookie': '__cfduid=d74bb1bdede33ae5fa88970198604232f1570874777; XSRF-TOKEN=eyJpdiI6IjE3ZUNSS1VJWXp2MzRENEhOdmlPSXc9PSIsInZhbHVlIjoiUFBVM25OSVBhZDRsSEhheGhGaVpLdFgyWU1TUmdoUGY2TFFxQ0ZkQUZvNjBONW94MmtmdDVHTEZ0TmMzWW5GNyIsIm1hYyI6IjY4NTQ5Yjk0MDVlOGViMWI1NTA4YWYyODI1N2NhNGJhMWFjMWQwMjI5NTEyMGQ2NTlmYWUzNGI4ZmVhMzkzNjQifQ%3D%3D; doutula_session=eyJpdiI6ImxmeFwvcDR1UVR0OTcrOVFPbnM4eCtnPT0iLCJ2YWx1ZSI6IjV4c3liSTF2VUtBellnbHJhNWxjWWk1QmZnRllRR0wwYnRvZjFzeTNjMFJkWEZlcWZiTlA4aEVXRUh6OWZKV3giLCJtYWMiOiJiZGU2ZTFkOTFhMTkyNjFkYmUwMTU1MGFiMWY0MDgxNWQ3MzQ4MDBmNmE4NjEyMzc1ODFjMDRjYmM2NGYxYjk0In0%3D; UM_distinctid=16dbf6ee8e4417-0d482538bfe3688-14377a40-144000-16dbf6ee8e652a; CNZZDATA1256911977=144637179-1570873422-%7C1570878822; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1570881136; Hm_lpvt_24b7d5cc1b26f24f256b6869b069278e=1570881170' } resp = requests.get(link, headers=headers) html = etree.HTML(resp.text) imgs = html.xpath("//div[@class='page-content text-center']//a//img[@class!='gif']") for img in imgs: img_url = img.get("data-original") # 图片链接 alt = img.get("alt") # 图片名字 alt = re.sub(r'[*?。,?,.,!!]', "", alt) # 替换非法命名字符 suffix = os.path.splitext(img_url)[-1] # 获取后缀 # print(img_url, alt, suffix) filename = alt + suffix request.urlretrieve(img_url, 'images/' + filename) # 保存图片 # print(etree.tostring(img))def main(): for i in range(1, 20): url = 'http://www.doutula.com/photo/list/?page={}'.format(i) get_page_source(url)if __name__ == '__main__': start = time.time() main() end = time.time() print(end - start) 二、异步模式

多线程用法:https://www.cnblogs.com/Eva-J/articles/8306047.html

import os
import re
import threading
import time
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

# queue.Queue is thread-safe, so producers and consumers need no extra locking.

# Request headers captured from a real browser session; shared by all threads
# instead of being rebuilt inside every call.  NOTE(review): the Cookie is
# session-bound and presumably stale — confirm it is still needed.
HEADERS = {
    'Referer': 'http://www.doutula.com/photo/list/?page=23',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0',
    'Cookie': '__cfduid=d74bb1bdede33ae5fa88970198604232f1570874777; XSRF-TOKEN=eyJpdiI6IjE3ZUNSS1VJWXp2MzRENEhOdmlPSXc9PSIsInZhbHVlIjoiUFBVM25OSVBhZDRsSEhheGhGaVpLdFgyWU1TUmdoUGY2TFFxQ0ZkQUZvNjBONW94MmtmdDVHTEZ0TmMzWW5GNyIsIm1hYyI6IjY4NTQ5Yjk0MDVlOGViMWI1NTA4YWYyODI1N2NhNGJhMWFjMWQwMjI5NTEyMGQ2NTlmYWUzNGI4ZmVhMzkzNjQifQ%3D%3D; doutula_session=eyJpdiI6ImxmeFwvcDR1UVR0OTcrOVFPbnM4eCtnPT0iLCJ2YWx1ZSI6IjV4c3liSTF2VUtBellnbHJhNWxjWWk1QmZnRllRR0wwYnRvZjFzeTNjMFJkWEZlcWZiTlA4aEVXRUh6OWZKV3giLCJtYWMiOiJiZGU2ZTFkOTFhMTkyNjFkYmUwMTU1MGFiMWY0MDgxNWQ3MzQ4MDBmNmE4NjEyMzc1ODFjMDRjYmM2NGYxYjk0In0%3D; UM_distinctid=16dbf6ee8e4417-0d482538bfe3688-14377a40-144000-16dbf6ee8e652a; CNZZDATA1256911977=144637179-1570873422-%7C1570878822; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1570881136; Hm_lpvt_24b7d5cc1b26f24f256b6869b069278e=1570881170',
}

# Characters that are illegal or awkward in filenames; compiled once.
_ILLEGAL_NAME_CHARS = re.compile(r'[*?。,?,.,!!/]')


class Get_Link(threading.Thread):
    """Producer thread: take listing-page URLs from page_queue, parse each
    page, and put (img_url, filename) tuples onto image_queue."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue    # listing-page URLs still to scrape
        self.image_queue = image_queue  # (img_url, filename) pairs to download

    def run(self):
        while True:
            if self.page_queue.empty():
                # All pages have been claimed; this producer is finished.
                break
            url = self.page_queue.get()
            self.get_link(url)

    def get_link(self, url):
        """Parse one listing page and queue every non-gif image found on it."""
        resp = requests.get(url, headers=HEADERS)
        html = etree.HTML(resp.text)
        # Lazy-loaded images keep the real URL in data-original; gifs are skipped.
        imgs = html.xpath("//div[@class='page-content text-center']//a//img[@class!='gif']")
        for img in imgs:
            img_url = img.get("data-original")                  # actual image URL
            alt = _ILLEGAL_NAME_CHARS.sub("", img.get("alt"))   # sanitized image name
            suffix = os.path.splitext(img_url)[-1]              # file extension
            self.image_queue.put((img_url, alt + suffix))       # hand off to downloaders


class Download_Image(threading.Thread):
    """Consumer thread: download queued images into the images2/ directory."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue    # watched only to decide when to stop
        self.image_queue = image_queue  # (img_url, filename) pairs produced by Get_Link

    def run(self):
        start = time.time()
        while True:
            try:
                # Bug fix: the original tested image_queue.empty() and could
                # exit while a producer was still parsing its last page
                # (page_queue empties BEFORE that page's images are queued),
                # silently dropping downloads.  A bounded blocking get lets
                # late items arrive before we decide all work is done.
                img_url, filename = self.image_queue.get(timeout=3)
            except Empty:
                if self.page_queue.empty():
                    # No pages left and no image arrived within the grace
                    # period: everything has been downloaded.
                    print(time.time() - start)  # per-thread running time
                    return
                continue
            request.urlretrieve(img_url, os.path.join('images2', filename))  # download
            print('over')


def main():
    page_queue = Queue(100)    # listing-page URLs
    image_queue = Queue(500)   # (img_url, filename) pairs
    # Bug fix: create the output directory before any download starts.
    os.makedirs('images2', exist_ok=True)
    for i in range(21, 40):    # download pages 21-39
        page_queue.put('http://www.doutula.com/photo/list/?page={}'.format(i))
    for _ in range(5):         # 5 producer (page-parsing) threads
        Get_Link(page_queue, image_queue).start()
    for _ in range(5):         # 5 consumer (download) threads
        Download_Image(page_queue, image_queue).start()


if __name__ == '__main__':
    main()

同步运行时间约120s;异步开启5个抓取线程和5个下载线程(共10个线程),运行时间约44s,异步大幅节约了时间。

运行截图:

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。