"""Python code: a hands-on Python crawler example.

Asks for a search keyword, pages through the Baidu image-search JSON
endpoint, appends every thumbnail URL it finds to ``testpic.txt`` and saves
each thumbnail as ``pic<N>.jpg`` in a local directory.
"""

import os
import re
import urllib.parse
import urllib.request

# Request headers. Referer is required, otherwise the server answers with a
# 403 error; User-Agent is needed so the request looks like a browser visit.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
    'referer': 'https://image.baidu.com',
}

# Search endpoint template; filled in by build_search_url().
# NOTE(review): the original line was truncated right after ``adpicid`` and
# had lost its ``&`` separators. The trailing ``word``, ``pn`` and ``rn``
# parameters are reconstructed from the ``.format(word=..., pagenum=...)``
# call and the step of 30 used by the paging loop -- confirm against the
# live endpoint.
URL = (
    'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
    '&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1'
    '&ie=utf-8&oe=utf-8&adpicid=&word={word}&pn={pagenum}&rn=30'
)

# Non-greedy match from the ``thumbURL`` key up to the first ``.jpg``.
THUMB_PATTERN = re.compile(r'thumbURL.*?\.jpg')

# NOTE(review): path reconstructed from a garbled literal -- confirm.
SAVE_DIR = r'C:\Users\87419\Desktop\pa'
LOG_PATH = 'testpic.txt'


def build_search_url(keyword, offset):
    """Return the search URL for a raw *keyword* at result *offset*.

    The keyword is percent-encoded as UTF-8 with no characters exempted.
    The original call passed ``'utf-8'`` as the second positional argument
    of ``quote``, which is ``safe`` rather than ``encoding``; its effective
    behaviour (``/`` gets escaped too) is kept, but spelled out explicitly.
    """
    encoded = urllib.parse.quote(keyword, safe='', encoding='utf-8')
    return URL.format(word=encoded, pagenum=str(offset))


def clean_thumb_url(match):
    """Strip the leading ``thumbURL":"`` JSON key from a regex match."""
    return match.replace('thumbURL":"', '')


def fetch_page(url):
    """Fetch *url* with the crawler headers and return the decoded body.

    Returns ``None`` when the request fails or the body is not valid UTF-8,
    so the caller can skip that page instead of aborting the whole crawl.
    """
    request = urllib.request.Request(url, headers=header)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')
    except (OSError, UnicodeDecodeError):
        # URLError / HTTPError are OSError subclasses.
        return None


def crawl(keyword, save_dir=SAVE_DIR, log_path=LOG_PATH,
          max_offset=3000, page_size=30):
    """Download thumbnails for *keyword* and return how many were saved.

    Offsets run ``page_size, 2 * page_size, ... max_offset``, matching the
    original loop which advanced the counter before issuing each request.
    """
    saved = 0
    # Create the target directory once; exist_ok avoids the check-then-create
    # race of a separate isdir() test.
    os.makedirs(save_dir, exist_ok=True)

    for offset in range(page_size, max_offset + 1, page_size):
        html = fetch_page(build_search_url(keyword, offset))
        if html is None:
            print('错误！')
            print('错误页数： ' + str(offset))
            continue

        with open(log_path, 'a', encoding='utf-8') as log:
            for raw_match in THUMB_PATTERN.findall(html):
                print(raw_match)
                image_url = clean_thumb_url(raw_match)
                print(image_url)
                log.write(image_url)
                log.write('\n')

                target = os.path.join(
                    save_dir, 'pic{num}.jpg'.format(num=saved))
                try:
                    urllib.request.urlretrieve(image_url, target)
                except (OSError, ValueError):
                    # One bad image (network error or malformed URL) must not
                    # end the run; the counter is not advanced, so file
                    # numbering stays gap-free.
                    continue
                saved += 1
    return saved


def main():
    """Prompt for a keyword, run the crawl and report the total."""
    keyword = input('请输入搜索关键字：')
    total = crawl(keyword)
    print('总滚动图像数为' + str(total))


if __name__ == '__main__':
    main()