python爬虫实例,python为什么叫爬虫

这里的小红书（RED）爬虫与上一篇的微信公众号文章爬虫大同小异，可以相互参考。需要提醒的是，在传入内容源地址时，首次进入文章时看到的地址并不是真正的地址；把该地址复制到其他浏览器中打开分析后可以发现，地址中“?_at=”后面带有参数，这才是真正的内容源地址。（这段时间有人反馈部分文章抓取不到，于是做了分析：小红书的文章使用不同的模板，div 的属性名中包含随机码。下面的代码已针对这一点做了修改，应该可以使用。采集到的三个主要字段都保存在 $relwvantart 中，可以打印出来查看，也可以配合与爬虫相关的自定义函数使用。）

/**
 * Crawler logic: fetch a Xiaohongshu (RED) article page and extract its fields.
 *
 * Downloads the HTML for the given URL through $this->getUrlContent(), extracts
 * the article body, title and images with regular expressions, hands every
 * image URL to $this->crabContentImg() / $this->crabImage(), and rewrites the
 * body so it references the 'save_path' values those helpers return.
 *
 * NOTE(review): the listing this method was restored from had damaged
 * punctuation and spacing, and several string literals were unreadable.
 * Syntax has been restored; every literal that had to be reconstructed is
 * flagged with NOTE(review) below and must be confirmed against a live page.
 *
 * @param string $centent_url Source URL of the article (see the sample below,
 *                            which carries a "?_at=" query parameter).
 *
 * @return array Associative array with the keys:
 *               - 'content': base64-encoded HTML of the article body;
 *               - 'title':   article title, or the meta description when the
 *                            title is empty (null when neither matched);
 *               - 'img':     array of saved gallery image paths; if the
 *                            gallery is empty, the saved path of the single
 *                            thumbnail when one was fetched, otherwise null.
 */
public function crawler($centent_url)
{
    // Sample source URL:
    // https://www.xiaohongshu.com/discovery/item/5a4ca319a7c9b8481ea24c7e?_at=36df0d880cae739ee71e7e94174a6d7c70351

    // NOTE(review): the six literals below were unreadable in the source
    // listing (their HTML tags had been stripped). They are reconstructed from
    // the surrounding comments and variable names only - confirm each one.
    $content_wrapper_open  = '<div class="content">';
    $content_wrapper_close = '</div>';
    $thumbnail_pattern     = '/<img[^>]+src="(.*)"/iUs';
    $title_pattern         = '/<h1 class="title"[^>]*>(.*)<\/h1>/iUs';
    $description_pattern   = '/<meta name="description" content="(.*)"/iUs';
    $gallery_pattern       = '/<span class="inner" style="background-image:(.*);"/iUs';

    // Receive the content source URL and download the page.
    $request = $centent_url;
    $html    = $this->getUrlContent($request);
    //print_r($html); exit;

    $relwvantart = array();

    // Get the random code that follows "data-v-" on the content div.
    preg_match_all('/class="content" data-v-(.*)>/iUs', $html, $temp_variable, PREG_PATTERN_ORDER);
    $variable = isset($temp_variable[1][0]) ? $temp_variable[1][0] : '';
    // The code comes from the scraped page, so quote it before it is spliced
    // into further patterns.
    $variable_re = preg_quote($variable, '/');
    //$variableimg = $temp_variable[1][1];
    //print_r($variableimg); exit;

    // Get the main content of the article.
    // NOTE(review): the tail of this pattern (after the random code) was
    // unreadable in the source listing; '>(.*)<\/div>' is a reconstruction.
    preg_match_all('/class="content" data-v-' . $variable_re . '>(.*)<\/div>/iUs', $html, $content, PREG_PATTERN_ORDER);
    $body = isset($content[1][0]) ? $content[1][0] : '';
    //$temp_content = $content[1][0];
    $temp_content = $content_wrapper_open . $body . $content_wrapper_close;
    //print_r($temp_content); exit;

    // Get the random code used on the images inside the article content.
    preg_match_all('/class="cell image-cell" data-v-(.*) data-v-' . $variable_re . '/iUs', $html, $temp_variable_img, PREG_PATTERN_ORDER);
    $temp_contentimg = isset($temp_variable_img[1][0]) ? $temp_variable_img[1][0] : '';
    //print_r($temp_contentimg); exit;

    // Collect the image URLs that appear in the content.
    preg_match_all('/img src="(.*)" data-v-' . preg_quote($temp_contentimg, '/') . '/iUs', $temp_content, $img_matches, PREG_PATTERN_ORDER);
    $temp_img = isset($img_matches[1]) ? $img_matches[1] : array();
    //print_r($temp_img); exit;

    if (!empty($temp_img)) {
        // Fetch each content image through the helper and remember the path
        // it reports under 'save_path'.
        $temp_content_img = array();
        foreach ($temp_img as $k => $v) {
            // Only a leading "//" (protocol-relative URL) gets the scheme;
            // a plain str_replace would also rewrite any later "//".
            $temp_v = preg_replace('#^//#', 'https://', $v);
            $res    = $this->crabContentImg($temp_v);
            $temp_content_img[$k] = $res['save_path'];
        }

        // Replace the image URLs in the content with the saved paths
        // (the original comment describes these as paths on OSS).
        foreach ($temp_content_img as $key => $value) {
            $temp_content = str_replace($temp_img[$key], $value, $temp_content);
        }

        // Get the thumbnail. Skipped when nothing matched, so the helper is
        // never called with an empty URL.
        preg_match_all($thumbnail_pattern, $html, $temp, PREG_PATTERN_ORDER);
        if (isset($temp[1][0])) {
            $img_url = preg_replace('#^//#', 'https://', $temp[1][0]);
            $res     = $this->crabImage($img_url);
            //print_r($res['save_path']); exit;
            $relwvantart['img'] = $res['save_path'];
        }
    }

    //print_r($content);
    //print_r(base64_encode($content)); exit;
    $relwvantart['content'] = base64_encode($temp_content);

    // Get the article title.
    preg_match_all($title_pattern, $html, $title, PREG_PATTERN_ORDER);
    $title = isset($title[1][0]) ? $title[1][0] : null;
    $relwvantart['title'] = $title;

    if (empty($title)) {
        // Fall back to the description in the page's meta tag.
        preg_match_all($description_pattern, $html, $desc, PREG_PATTERN_ORDER);
        $desc = isset($desc[1][0]) ? $desc[1][0] : null;
        $relwvantart['title'] = $desc;
    }

    // Get the gallery thumbnails.
    preg_match_all($gallery_pattern, $html, $temp, PREG_PATTERN_ORDER);
    $tempicon = isset($temp[1]) ? $temp[1] : array();
    //print_r($tempicon); exit;

    // A dedicated array is used here. The damaged listing reused $imgurl,
    // which already held the single thumbnail path as a string, so
    // "$imgurl[$i] = ..." was a string-offset write instead of an append.
    $gallery_paths = array();
    foreach ($tempicon as $k => $v) {
        // Normalise the captured value into an absolute URL.
        // NOTE(review): both search strings were unreadable in the source
        // listing; 'url(' and ')' are reconstructions - confirm.
        $url = str_replace('url(', 'https:', $v);
        $url = str_replace(')', '', $url);

        // Fetch the image through the helper (the original comment says it is
        // saved locally and uploaded to OSS) and keep the reported path.
        $res = $this->crabImage($url);
        //print_r($res); exit;
        $gallery_paths[$k] = $res['save_path'];
    }

    if (!empty($gallery_paths)) {
        $relwvantart['img'] = $gallery_paths;
    } elseif (!isset($relwvantart['img'])) {
        // Keep the key present even when no image was found at all.
        $relwvantart['img'] = null;
    }

    return $relwvantart;
}