Python 多线程抓取代理服务器

文章转载自:https://blog.linuxeye.com/410.html

 代理服务器:http://www.proxy.com.ru

 

#coding: utf-8

import re
import threading
import time
import urllib2

import MySQLdb

  9 rawProxyList = []
 10 checkedProxyList = []
 11 
 12 #抓取代理网站
 13 targets = []
 14 for i in xrange(1, 23):
 15     target = r"http://www.proxy.com.ru/list_%d.html" % i
 16     targets.append(target)
 17     #print target + "\n"
 18 
 19 #抓取代理服务器正则
 20 p = re.compile(r‘‘‘<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>‘‘‘)
 21 
 22 #获取代理的类
 23 
 24 class ProxyGet(threading.Thread):
 25     def __init__(self, target):
 26         threading.Thread.__init__(self)
 27         self.target = target
 28 
 29 
 30     def getProxy(self):
 31         req = urllib2.Request(self.target)
 32         respnse = urllib2.urlopen(req)
 33         result = respnse.read()
 34         matches = p.findall(result)
 35         #print matches
 36         for row in matches:
 37             ip = row[1]
 38             port = row[2]
 39             addr = row[4].decode("cp936").encode("utf-8")
 40             proxy = [ip, port, addr]
 41             #print proxy
 42             rawProxyList.append(proxy)
 43 
 44 
 45     def run(self):
 46         self.getProxy()
 47 
 48 #核对代理是否有效的类
 49 class ProxyCheck(threading.Thread):
 50     def __init__(self,proxyList):
 51         threading.Thread.__init__(self)
 52         self.proxyList = proxyList
 53         self.timeout = 5
 54         self.testUrl = "http://www.baidu.com/"
 55         self.testStr = "030173"
 56 
 57     def checkProxy(self):
 58         cookies = urllib2.HTTPCookieProcessor()
 59         for proxy in self.proxyList:
 60             proxyHandler = urllib2.ProxyHandler({"http": rhttp://%s:%s %(proxy[0], proxy[1])})
 61             #print r‘http://%s:%s‘ %(proxy[0],proxy[1])
 62             opener = urllib2.build_opener(cookies, proxyHandler)
 63             opener.addheaders = [(User-agent, Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0)]
 64             #urllib2.install_opener(opener)
 65             t1 = time.time()
 66 
 67             try:
 68                 #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
 69                 req = opener.open(self.testUrl, timeout=self.timeout)
 70                 #print "urlopen is ok...."
 71                 result = req.read()
 72                 #print "read html...."
 73                 timeused = time.time() - t1
 74                 pos = result.find(self.testStr)
 75                 #print "pos is %s" %pos
 76 
 77                 if pos >= 1:
 78                     checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
 79                     print "ok ip: %s %s %s %s" %(proxy[0],proxy[1],proxy[2],timeused)
 80                 else:
 81                      continue
 82             except Exception, e:
 83                 #print e.message
 84                 continue
 85 
 86     def run(self):
 87         self.checkProxy()
 88 
 89 
 90 if __name__ == "__main__":
 91     getThreads = []
 92     checkThreads = []
 93 
 94 #对每个目标网站开启一个线程负责抓取代理
 95 for i in range(len(targets)):
 96     t = ProxyGet(targets[i])
 97     getThreads.append(t)
 98 
 99 for i in range(len(getThreads)):
100     getThreads[i].start()
101 
102 for i in range(len(getThreads)):
103     getThreads[i].join()
104 
105 print .*10 + "总共抓取了%s个代理" % len(rawProxyList) + .*10
106 
107 #开启20个线程负责校验,将抓取到的代理分成20份,每个线程校验一份
108 for i in range(20):
109     t = ProxyCheck(rawProxyList[((len(rawProxyList)+19)/20) * i:((len(rawProxyList)+19)/20) * (i+1)])
110     checkThreads.append(t)
111 
112 for i in range(len(checkThreads)):
113     checkThreads[i].start()
114 
115 for i in range(len(checkThreads)):
116     checkThreads[i].join()
117 
118 print .*10 + "总共抓取了%s个代理" % len(checkedProxyList) + .*10
119 
120 #插入数据库,四个字段ip, port, speed, addr
121 def db_insert(insert_list):
122     try:
123         conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset=utf8)
124         cursor = conn.cursor()
125         cursor.execute(delete from proxy)
126         cursor.execute(alter table proxy AUTO_INCREMENT=1)
127         cursor.executemany("INSERT INTO proxy(ip,port,speed,address) VALUES(%s, %s, %s,%s)", insert_list)
128         conn.commit()
129         cursor.close()
130         conn.close()
131 
132     except MySQLdb.Error, e:
133         print "Mysql Error %d: %s" %(e.args[0], e.args[1])
134 
135 #代理排序持久化
136 proxy_ok = []
137 for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
138     if proxy[3] < 8:
139         #print "checked proxy is: %s:%s\t%s\t%s" %(proxy[0],proxy[1],proxy[2],proxy[3])
140         proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
141 
142 db_insert(proxy_ok)

 

文章来自:http://www.cnblogs.com/nju2014/p/4614698.html
© 2021 jiaocheng.bubufx.com  联系我们
ICP备案:鲁ICP备09046678号-3