关键字:正则表达式 python 爬虫

调试不易,老程序员恳请:转载请注明出处。   https://blog.csdn.net/tjzzy/article/details/86453247





首先把 HTML 文档按标签拆分,而不是直接用 BeautifulSoup 的 find 搜索,因为我不确定微信公众号网页是以什么方式嵌入图片的。有时编辑会使用 <img class=... data-type=... src="..."> 来嵌入图片,有时则是 src 在前:<img class=... src="..." data-type=...>。



# -*- coding: utf-8 -*-
"""
Created on Mon Dec 24 14:04:34 2018

@author: Thor

Download the images referenced by the article pages listed in ``website.txt``.

Layout (all relative to the directory the script is started from):

* ``website.txt``   -- input, one page URL per line.
* ``filehtm.txt``   -- debug dump of the most recently fetched page.
* ``wechatimglu/``  -- output directory; images are saved as ``<index>.<ext>``
  and ``logfile.log`` records one ``succ``/``fail`` line per page.

Each page is split into tags with a regular expression (instead of using an
HTML parser) because the image URL may live in ``data-src`` or in ``src`` and
the attribute order varies between pages.
"""
import os
from datetime import datetime
from re import findall
import re
from pprint import pprint
from urllib.request import urlopen

# Output directory for the downloaded images.
image_path = os.path.join(os.curdir, "wechatimglu")

# A tag is treated as an image tag when it declares one of these data types.
IMAGE_TYPE_RE = re.compile(r'data-type="(gif|png|jpeg|jpg)')
# Attribute values are matched with [^"]+ so a capture can never run past the
# closing quote into the next attribute.
DATA_SRC_RE = re.compile(r'data-src="([^"]+)"')
IMG_SRC_RE = re.compile(r'img src="([^"]+)"')
IMG_LATER_SRC_RE = re.compile(r'img.+? src="([^"]+)"')
IMG_SRC_STYLE_RE = re.compile(r'img src="([^"]+)" style')
ANY_SRC_RE = re.compile(r'src="([^"]+)"')
# The image format is carried in a "...fmt=png" query parameter.
FMT_RE = re.compile(r"fmt=([A-Za-z0-9]+)")


def strIsNum(string):
    """Return True when *string* consists only of the digits 0-9."""
    return re.match(r"[0-9]+$", string) is not None


def existing_indices(directory="."):
    """Return the set of integer file stems found directly in *directory*.

    Only the top level is inspected.  Files whose name (without extension)
    is not purely numeric are ignored.  A missing directory yields an empty
    set.
    """
    _, _, files = next(os.walk(directory), (directory, [], []))
    stems = (os.path.splitext(name)[0] for name in files)
    return {int(stem) for stem in stems if strIsNum(stem)}


def prompt_start_index(taken):
    """Ask on stdin for the first image index and return it as an int.

    *taken* is the set of indices already used in the output directory.
    Empty input selects the next free index (highest existing + 1).  Input
    that is not an integer, or that names an index already in use, is
    rejected and the prompt repeats.  EOF on stdin propagates as EOFError
    instead of looping forever.
    """
    default_index = max(taken, default=0) + 1
    while True:
        print("please input your start index:")
        raw = input().strip()
        if raw:
            try:
                start = int(raw)
            except ValueError:
                print("'%s' is not a valid integer index." % raw)
                continue
        else:
            start = default_index
        if start in taken:
            print("The index has already existed, please change.\n if you just enter, the index will start with %s" % default_index)
            continue
        print("the start index is:%d,press any key to run,or ctrl-c to exit." % start)
        input()  # wait for confirmation; the typed text itself is ignored
        return start


def normalize_url(src):
    """Turn a protocol-relative URL (``//host/path``) into an ``http:`` URL."""
    return "http:" + src if src.startswith("//") else src


def extract_image_urls(content):
    """Return the image URLs found in the HTML text *content*.

    The text is split into tags (the part between ``<`` and ``>``) and each
    tag is searched separately.  URLs are returned in document order,
    protocol-relative URLs are made absolute, and duplicates are dropped.
    """
    found = []
    for tag in findall("<(.+?)>", content):
        if IMAGE_TYPE_RE.search(tag) is not None:
            found += DATA_SRC_RE.findall(tag)
            if "data-src" not in tag:  # image tag without data-src
                print("pppp", tag)
                if IMG_SRC_RE.search(tag) is not None:
                    found += IMG_SRC_RE.findall(tag)
                else:
                    # src is not the first attribute of the tag
                    found += IMG_LATER_SRC_RE.findall(tag)
        found += IMG_SRC_STYLE_RE.findall(tag)
        # Fallback, used only while nothing has been found on the page yet:
        # accept any src-like attribute (this also matches data-src).
        if not found and "data-src" in tag:
            found += ANY_SRC_RE.findall(tag)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(normalize_url(src) for src in found))


def image_extension(image_url, default="jpg"):
    """Return the file extension for *image_url*.

    The extension is the alphanumeric run following ``fmt=`` (so
    ``...wx_fmt=png&tp=webp`` gives ``png``); *default* is returned when the
    URL carries no such parameter or its value is empty.
    """
    match = FMT_RE.search(image_url)
    return match.group(1) if match else default


def download_images(image_urls, start_index):
    """Save every URL in *image_urls* into the current directory.

    Files are named ``<index>.<ext>`` with consecutive indices starting at
    *start_index*.  A URL that cannot be fetched is reported and skipped
    without consuming an index.  Returns the next unused index.
    """
    next_index = start_index
    for position, image_url in enumerate(image_urls, 1):
        try:
            with urlopen(image_url) as response:
                data = response.read()
        except Exception as e:  # best effort: one bad image must not stop the run
            print(e)
            print(image_url)
            continue
        print('开始下载第%s 张图片: %s' % (position, image_url))
        filename = "%d.%s" % (next_index, image_extension(image_url))
        with open(filename, "wb") as f:
            f.write(data)
        next_index += 1
    return next_index


def write_log(status, url):
    """Append a ``status|timestamp|url`` line to ``logfile.log``."""
    with open("logfile.log", "a+") as filelog:
        filelog.write("%s|%s|%s\n" % (status, datetime.now(), url))


def main():
    """Run the interactive download session."""
    # Remember where we started: website.txt and filehtm.txt live here.
    base_dir = os.getcwd()

    if not os.path.exists(image_path):
        os.mkdir(image_path)
        print("true")
    # Work inside the image directory so images and the log land there.
    try:
        os.chdir(image_path)
    except OSError as e:
        print(e)
        print("Failed to change directory")

    taken = existing_indices(".")
    if taken:
        print(max(taken))
    jpgindex = prompt_start_index(taken)

    with open(os.path.join(base_dir, "website.txt"), "r") as filesource:
        # Blank lines are not URLs; skip them instead of failing on urlopen("").
        urls = [line.strip() for line in filesource if line.strip()]
    print("Retrieving...")

    for url in urls:
        try:
            with urlopen(url) as fp:
                # Decode to text; undecodable bytes are dropped.
                content = fp.read().decode("gbk", errors="ignore")
        except Exception as e:  # best effort: record the failure, try the next page
            print(e)
            print(url)
            write_log("fail", url)
            continue

        # Keep a copy of the last fetched page for debugging.
        with open(os.path.join(base_dir, "filehtm.txt"), "w", errors="ignore") as f:
            f.write(content)

        result = extract_image_urls(content)
        pprint(result)
        jpgindex = download_images(result, jpgindex)
        write_log("succ" if result else "fail", url)


if __name__ == "__main__":
    main()


