爬虫爬取收费视频,爬取付费视频

items.py

flhz.py

pipelines.py

目标：访问福利吧论坛的福利摘要文章，以 JSON 格式保存所有福利摘要文章中的热门视频标题和链接；同时将所有福利摘要文章中的图片按目录保存（目录名称为当前图片所在页面 URL 中的 7 位数字），例如某福利文章的 URL 为 http://fulibus.net/20110

items.py #---coding : utf-8---importscrapyclassarticleitem (scrapy.item ) : title=scrapy.Field )

flhz.py# ----coding : utf-8---froms crapy.linkextractorsimportlinkextractorfromscrapy.spidersimportcrawlspider rulefromefrompy lassflhzspider(crawlspider ) : name=' flhz ' allowed _ domains=[ ' ] ] start _ URLs=[ ' http://fuli bus.net/catet tu girl.html articles _ list=link extractor (allow='d { 6，7 }. html )， restrict_xpaths='/Article ) COO rticle ' excerpt ' () (images _ list=link extractor (allow=' sinaimg (.cn/mm ) ) restrict _ XPaths='/articl icle dny _ extensions=' ' (rules=(rule (pages _ list )，rule ) Articles_list，callate foll lles callback=' parse _ image ' (#提取福利文的图像链接，发送请求，用parse_images分析响应(defparse_content(self， response (: article=article item ) article('title ' )=response.XPath (/h1/a/text ) )0).extra th span [ video _ title _ list=response.xist video _ link _ list=response.XPath (/block quote//a/@href ).extract ) video_tuple_list=zip ) video_tuple ) list video _ linkin video _ tuple _ list : video _ lip=video _ listyieldarticledefparse _ image (self， response ) : #获取请求每张图像请求标头内的Referer，将其中的7位数字作为保存目录#灵感来源如下图# 请参见dirname=response.request.headers.get (referer ) (-123360-5 ) dirname=response.request )。 ' w ' ) ASf:f.write(response.body ) except:OS.mkdir ) dirname ) withopen(dirname )/'response.URL(-1000

pipelines.py #---coding 3360 utf-8---importjsonclassflhzpipeline (object ) :def__init_ ) (self ) 330 ' w ' ) self.file.write (' [ ' ] def process _ item (self，item，spider ) : data=JSON.dumps ensure _ ascii=fascii=fare