python线程池,python多线程爬取大量数据

Python作为一种强大的脚本语言，经常被用于编写爬虫程序。下面是一个用Python多线程爬虫抓取代理服务器的例子。

首先，我在谷歌上搜索了包含代理服务器地址的网页，并决定从名为http://www.88181.com/的网站开始抓取。我在上面一共抓取了800个代理。

#!/usr/bin/env python

# coding: utf-8

import re
import threading
import time
import urllib2

# Shared result lists; the worker threads append to them.
rawProxyList = []      # [ip, port, addr] entries scraped from the listing pages
checkedProxyList = []  # (ip, port, addr, seconds) tuples for proxies that passed the check

# Letter -> digit table used to decode the obfuscated port numbers.
# NOTE(review): the listing is truncated after the 'b' key, so the value for
# 'b' and every later entry are missing. Restore them from the original
# article before running; only the entries that survived are kept here.
portdicts = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0'}

# Listing pages to scrape: proxy1.html through proxy8.html.
targets = ['http://www.88181.com/proxy%d.html' % i for i in range(1, 9)]

# Regular expression applied to every listing page.
# NOTE(review): the pattern text was lost when the listing was extracted; only
# fragments of its capture groups survive. The scraper reads groups 0-3 of
# each match as IP, encoded port, agent and address, so the real pattern must
# provide four groups. Restore it from the target page's markup before running.
p = re.compile(r'')

# Class that fetches proxies

class ProxyGet(threading.Thread):
    """Worker thread that scrapes one proxy listing page.

    Every row matched on the page is appended to the module-level
    ``rawProxyList`` as ``[ip, port, addr]``.
    """

    def __init__(self, target):
        """Store the URL of the listing page this thread will scrape."""
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        """Download ``self.target`` and collect each proxy matched by ``p``."""
        print('目标站点： ' + self.target)
        req = urllib2.urlopen(self.target)
        try:
            result = req.read()
        finally:
            # Release the connection even if the read fails.
            req.close()

        for row in p.findall(result):
            ip = row[0]
            # Each piece of the encoded port is a letter that portdicts maps
            # back to a digit.
            # NOTE(review): the separator character was lost in the listing;
            # '+' is assumed here -- confirm against the page source.
            port = ''.join(portdicts[x] for x in row[1].split('+'))
            # Convert the address text from CP936 to UTF-8.
            addr = row[3].decode('cp936').encode('utf-8')
            rawProxyList.append([ip, port, addr])

    def run(self):
        """Thread entry point: scrape the target page."""
        self.getProxy()

#检查代理的类

class ProxyCheck(threading.Thread):
    """Worker thread that tests a batch of proxies.

    Each proxy is used to fetch ``testUrl``; when the response body contains
    ``testStr`` the proxy is appended to the module-level
    ``checkedProxyList`` as ``(ip, port, addr, seconds_used)``.
    """

    def __init__(self, proxyList):
        """Store the batch of ``[ip, port, addr]`` entries to verify."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5  # passed as the timeout of every test request
        self.testUrl = 'http://www.baidu.com/'
        self.testStr = '030173'  # marker that must appear in the fetched page

    def checkProxy(self):
        """Try every proxy in the batch and record the ones that work."""
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            # Route HTTP traffic through ip:port of the proxy under test.
            proxyHandler = urllib2.ProxyHandler(
                {'http': r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [(
                'User-Agent',
                'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) '
                'Gecko/20100101 Firefox/22.0',
            )]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeused = time.time() - t1
            except Exception:
                # Best effort: any failure simply means this proxy is unusable.
                continue

            # NOTE(review): the comparison operator on the original
            # ``if pos 1`` line was lost; a containment test expresses the
            # evident intent (marker found in the page).
            if self.testStr in result:
                checkedProxyList.append(
                    (proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        """Thread entry point: verify the assigned batch."""
        self.checkProxy()

if __name__ == '__main__':
    # Start one fetch thread per listing page and wait for all of them.
    getThreads = [ProxyGet(target) for target in targets]
    for t in getThreads:
        t.start()
    for t in getThreads:
        t.join()

    print('.' * 10 + '共获得%s个代理' % len(rawProxyList) + '.' * 10)

    # Split the scraped proxies into 20 slices, one checker thread per slice.
    # The slice size is rounded up so that no proxy is left out.
    chunk = (len(rawProxyList) + 19) // 20
    checkThreads = [
        ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        for i in range(20)
    ]
    for t in checkThreads:
        t.start()
    for t in checkThreads:
        t.join()

    print('.' * 10 + '共有%s个代理通过了验证' % len(checkedProxyList) + '.' * 10)

    # Persist the verified proxies, ordered by response time (ascending).
    with open('proxy_list.txt', 'w') as f:
        for proxy in sorted(checkedProxyList, key=lambda item: item[3]):
            print('checked proxy is: %s:%s\t%s\t%s'
                  % (proxy[0], proxy[1], proxy[2], proxy[3]))
            f.write('%s:%s\t%s\t%s\n'
                    % (proxy[0], proxy[1], proxy[2], proxy[3]))

# Partial log of a sample run (article text "部分log"); it continues below:
# 对象网站： http://www.88181.com/proxy1.html

对象网站： http://www.88181.com/proxy2.html

对象网站： http://www.88181.com/proxy3.html

对象网站： http://www.88181.com/proxy4.html

对象网站： http://www.88181.com/proxy5.html

对象网站： http://www.88181.com/proxy6.html

对象网站： http://www.88181.com/proxy7.html

对象网站： http://www.88181.com/proxy8.html

.总共抓取了800个代理.

.共有478个代理通过了验证.

173.213.113.11:8089 United States 0.34155833817

173.213.113.113:3128 United States 0.347477912903

210.101.131.232:8080 韩国 首尔 0.418715000153