爬虫爬取网页,反爬虫

优化的代码如下。

首先在循环中创建 20 个目录，然后在爬取过程中每次随机选择其中一个目录写入图片，每个目录最多写入 50000 个文件。

#coding=utf-8

import os
import random
import re
import sys
import urllib.request
from urllib import error

from bs4 import BeautifulSoup

# Category slugs appended to the site root to form the listing URLs.
# NOTE(review): 'lianglichemo' appears twice, so that category is crawled
# twice -- confirm whether the last entry was meant to be another category.
ls = ['meinv', 'zhenrenxiu', 'lianglichemo', 'rentiyishu', 'xiaohua', 'lianglichemo']

# Names of the entries currently present in the download root; a target
# directory for each image is later drawn at random from this list.
file_list = os.listdir('d:\\craw')

def validatetitle(title):
    """Return *title* with characters that are illegal in file names replaced.

    Each of the characters ``/``, backslash, ``:``, ``*``, ``?``, ``"``,
    ``<``, ``>`` and ``|`` is replaced by an underscore so the result can be
    used as a Windows file name.

    Args:
        title: The page title to sanitise.

    Returns:
        The sanitised title; unchanged when it contains none of the
        characters listed above.
    """
    rstr = r'[\\/:*?"<>|]'  # characters Windows forbids in file names
    new_title = re.sub(rstr, '_', title)  # replace each one with an underscore
    return new_title

def get_file_name():
    r"""Pick a random download directory that still has room and return it.

    Candidate names come from the module-level ``file_list`` and are tried in
    random order. A candidate that does not exist yet is created and used;
    an existing directory is used only while it holds fewer than 50000
    entries. The previous implementation retried recursively but discarded
    the result, so a full directory was still returned.

    Returns:
        str: The chosen directory path with a trailing backslash, ready to be
        concatenated with a file name.

    Raises:
        RuntimeError: If ``file_list`` is empty or every candidate is full.
    """
    base_dir = 'd:\\craw\\'
    max_entries = 50000

    # random.sample over the whole list yields a shuffled copy, so the first
    # usable candidate is a uniformly random choice among the usable ones.
    for name in random.sample(file_list, len(file_list)):
        path = base_dir + str(name)
        if os.path.isdir(path):
            if len(os.listdir(path)) >= max_entries:
                continue  # directory is full, try another one
        elif os.path.exists(path):
            continue  # a plain file with that name; cannot hold downloads
        else:
            os.mkdir(path)
            print("创建目录" + str(path))
        return str(path) + "\\"

    raise RuntimeError("no download directory with free capacity under " + base_dir)

# Walk every gallery of every category and download the first <img> of each
# gallery page. Gallery URLs look like http://www.7160.com/<category>/<id>;
# page 1 is index.html and page N is index_N.html.
page_count_pattern = re.compile(r'\d+')  # compiled once, reused for every gallery

for k in ls:
    for j in range(1, 101111):
        url_origin = 'http://www.7160.com/' + str(k) + '/' + str(j)
        print(url_origin)
        try:
            # The context manager closes the HTTP response once it is parsed.
            with urllib.request.urlopen(url_origin) as page_obj:
                page_soup = BeautifulSoup(page_obj, 'lxml')

            # The pager text contains '共' followed by the page count; a page
            # without that text raises AttributeError here, which the outer
            # handler reports before moving on to the next gallery.
            total_page_obj = page_soup.find(text=re.compile('共')).string
            match = page_count_pattern.search(total_page_obj)
            total_page = int(match.group()) if match is not None else 0

            for i in range(1, total_page + 1):
                if i == 1:
                    url = url_origin + '/index.html'
                else:
                    url = url_origin + '/index_' + str(i) + '.html'
                request = urllib.request.Request(url)
                try:
                    with urllib.request.urlopen(request) as res:
                        soup = BeautifulSoup(res, 'lxml')
                    title_obj = soup.find(attrs={'class': 'picmainer'})
                    if title_obj is not None:
                        print(url)
                        title = title_obj.h1.string
                        content = soup.find('img')
                        src = content.get('src')
                        file_name = validatetitle(title) + '.jpg'
                        # Resolve the target directory once: get_file_name()
                        # is random, so calling it again for the log line
                        # could report a different path than the one used.
                        save_path = str(get_file_name()) + file_name
                        urllib.request.urlretrieve(src, save_path)
                        print(save_path + "保存成功")
                except Exception as e:
                    # Best effort: a failing page must not stop the gallery.
                    print("异常" + str(e))
        except Exception as e:
            # Best effort: a failing gallery must not stop the crawl.
            print("异常" + str(e))