from bs4 import BeautifulSoup
import requests


def one(url):
    """Fetch *url* and return the disease-guide text as one space-joined string.

    Grabs the <div class="hos-guide-box"> section and concatenates the
    stripped text of every <p>/<h2>/<h3>/<h4> element inside it.

    :param url: page URL to scrape
    :return: single string of all extracted text fragments
    :raises AttributeError: if no 'hos-guide-box' div exists on the page
    """
    # allow_redirects=False: a redirect here typically means the page moved/is gone,
    # so we deliberately do not follow it.
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.text, 'html.parser')
    # BeautifulSoup's second positional arg matches the CSS class.
    box = soup.find('div', 'hos-guide-box')
    tags = box.find_all(['p', 'h2', 'h3', 'h4'])
    # Join directly from a generator instead of the original index-loop + append.
    return " ".join(tag.text.strip() for tag in tags)
# NOTE: the XHR endpoint used below filters out irrelevant information
def main():
    """Fetch the disease list, scrape each disease's symptom page, save to .txt files.

    Posts to the SpeciesCate XHR endpoint, which returns a JSON list of
    "path~name" strings; each path becomes a symptom-page URL and each name
    becomes an output file name.
    """
    # NOTE(review): original comment said this URL no longer opens — verify the
    # endpoint still responds before relying on this script.
    r = requests.post('http://wapjbk.39.net/DiseaseArea/SpeciesCate',
                      data={'id': 4, 'cateId': 77}).json()
    list1 = []  # URL path fragments
    file = []   # disease/symptom names, used as output file names
    for entry in r:
        # Split once per entry (original split the same string twice).
        parts = entry.split('~')
        list1.append(parts[0])
        file.append(parts[1])
    # zip instead of range(len(...)) indexing.
    for path, name in zip(list1, file):
        url = 'http://wapjbk.39.net/' + path + '/zztz/'
        result1 = one(url)
        file1 = "D://dabao//爬虫练习//泌尿系统//" + name + ".txt"
        # 'with' guarantees the handle is closed; the original leaked one
        # file handle per iteration.
        with open(file1, 'a', encoding='utf-8') as f:
            f.write(result1)


if __name__ == '__main__':
    main()