首页 > 编程知识 正文

python爬虫爬取图片案例,python爬虫爬取图片介绍

时间:2023-05-06 04:59:54 阅读:267164 作者:301

# encoding: utf-8
"""Simple multi-threaded image scraper — much faster than single-threaded.

Producer threads (get_page) fetch listing pages and collect image URLs +
filenames into shared lists; consumer threads (download_image) drain those
lists and save each image under ./images/.
"""
import os
import threading
import urllib.request

import requests
from bs4 import BeautifulSoup

FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'
PAGE_URL_LIST = []   # pages still to be scraped (shared work queue)
IMG_URL_LIST = []    # all image links found so far
NAME_LIST = []       # filenames, parallel to IMG_URL_LIST
gLock = threading.Lock()
# Set by main() once every producer thread has finished, so consumers can
# distinguish "queue momentarily empty" from "no more work will ever arrive".
_PRODUCERS_DONE = threading.Event()

for x in range(1, 100):
    PAGE_URL_LIST.append(FIRST_PAGE_URL.format(x))


def get_page():
    """Producer: pop page URLs and extract image links/titles until none remain."""
    while True:
        gLock.acquire()
        if len(PAGE_URL_LIST) == 0:
            gLock.release()
            break
        page_url = PAGE_URL_LIST.pop()
        gLock.release()
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.content, 'lxml')
        except Exception:
            # A single bad page must not kill the worker thread; skip it.
            continue
        for div in soup.find_all('div', class_='mala-text'):
            img = div.find('img')
            if img is None:
                continue
            link = img.get('src')
            title = img.get('alt')
            if not link or not title:
                # Missing src/alt would crash the filename build below.
                continue
            suffix = link.rsplit('/', 1)[-1].rsplit('.', 1)[-1]
            filename = title + '.' + suffix  # name plus extension
            with gLock:
                NAME_LIST.append(filename)
                IMG_URL_LIST.append(link)


def download_image():
    """Consumer: download queued images; exit once producers are done and the
    queue is drained (the original spun forever on an empty queue)."""
    while True:
        gLock.acquire()
        if len(IMG_URL_LIST) == 0:
            gLock.release()
            if _PRODUCERS_DONE.is_set():
                break
            _PRODUCERS_DONE.wait(0.1)  # brief wait instead of busy-spinning
            continue
        url = IMG_URL_LIST.pop()
        filename = NAME_LIST.pop()
        gLock.release()
        path = os.path.join('images', filename)
        try:
            urllib.request.urlretrieve(url, filename=path)
        except Exception:
            # Best-effort download: a failed image should not stop the worker.
            continue


def main():
    """Start 4 producer and 5 consumer threads and wait for all work to finish."""
    # urlretrieve fails if the target directory does not exist.
    os.makedirs('images', exist_ok=True)
    producers = [threading.Thread(target=get_page) for _ in range(4)]
    consumers = [threading.Thread(target=download_image) for _ in range(5)]
    for th in producers + consumers:
        th.start()
    for th in producers:
        th.join()
    _PRODUCERS_DONE.set()  # signal consumers that no more URLs will arrive
    for th in consumers:
        th.join()


if __name__ == "__main__":
    main()

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。