一、python请求要抓取的url页面
要抓取的url http://news.baidu.com/ ,先python模拟请求该url
#!/usr/mlddt/python# -*- coding:utf-8 -*-import httplibclass NewsBaidu(object):def __init__(self):super(NewsBaidu,self).__init__()def request(self):conn = httplib.HTTPConnection('news.baidu.com') #请求的hostrequest_url = '/' #请求的网页路径body = '' #请求的参数headers = {} #请求所带的头信息,该参数是一个字典 conn.request('GET',request_url,body,headers)result = conn.getresponse()print u'获取百度新闻'print result.statusprint result.reasonif __name__ == '__main__':nb = NewsBaidu()nb.request()
运行效果
status = 200 表示请求成功；响应正文可以通过 result.read() 读取。
二、分析页面HTML
1、我们要抓取的内容,百度新闻左侧的列表的标题、href
2、加载re模块,正则匹配出我们要的内容, 我们先看看HTML样式
这是我们要抓取的上面一部分页面HTML,我们可以看到 <a rel="external nofollow" href="http://www.gov.cn/zhengce/2016-02/22/content_5044753.htm" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=1">总理发话 科技成果将堂堂正正走出"深闺"</a> ,包含我们想要的内容, 【总理发话 科技成果将堂堂正正走出"深闺"】 和href这部分的内容 【http://www.gov.cn/zhengce/2016-02/22/content_5044753.htm】 用正则提取出来
pattern = re.compile(r'<strong>.*?rel="external nofollow" rel="external nofollow" rel="external nofollow" href="(.*?)" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=[0-9]+">(.*?)</a>.*?strong>',re.S)
下面一部分要抓取的HTML内容,我就不再分析,原理都一样。
三、源码
#!/usr/mlddt/python# -*- coding:utf-8 -*-import httplibimport urllibimport reclass NewsBaidu(object):def __init__(self):super(NewsBaidu,self).__init__()self.f = open(u'百度新闻.txt','a')def request(self):try:conn = httplib.HTTPConnection('news.baidu.com') #请求的hostrequest_url = '/' #请求的网页路径body = '' #请求的参数headers = {} #请求所带的头信息,该参数是一个字典 conn.request('GET',request_url,body,headers)result = conn.getresponse()print u'获取百度新闻'print result.statusprint result.reason#print result.read()if result.status == 200:data = result.read()self.main(data)except Exception,e:print efinally:conn.close()self.f.close()def main(self,data):print u'获取中...'pattern = re.compile(r'<strong>.*?rel="external nofollow" rel="external nofollow" rel="external nofollow" href="(.*?)" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=[0-9]+">(.*?)</a>.*?strong>',re.S)items = re.findall(pattern,data)content = ''for item in items:content +=item[1].strip()+'t'+item[0].strip()+'tn'pattern = re.compile(r'<a rel="external nofollow" rel="external nofollow" rel="external nofollow" href="(.*?)" target="_blank" mon="r=1">(.*?)</a>',re.S)items = re.findall(pattern,data)for item in items:pattern = re.compile(r'^http://.*<a href="(.*)$',re.S) #url 对某些url再次正则获取url = re.findall(pattern,item[0])if url:u = url[0]else :u = item[0]content +=item[1].strip()+'t'+u.strip()+'tn'pattern = re.compile(r'<a rel="external nofollow" rel="external nofollow" rel="external nofollow" href="(.*?)" mon="ct=1&a=2&c=top&pn=[0-9]+" target="_blank">(.*?)</a>',re.S)items = re.findall(pattern,data)del items[0]for item in items:content +=item[1].strip()+'t'+item[0].strip()+'tn'self.f.write(content)print u'完成'if __name__ == '__main__':nb = NewsBaidu()nb.request()