含有章节索引的 *PUG 文章通用模板 ::-- hoxide [2006-04-29 09:12:35]
1. 代理服务器测试
简述 校园网上外网要money, 所以离不开proxy了, 不过网上n多proxy哪个最快呢? 好像有很多实现的软件, 但是用起来都不顺手, 既然会写程序, 为什么不自己写一个小工具?
1.1. 代码
1 import urllib
2 from HTMLParser import HTMLParser
3 from string import letters
4 import time
5
6 import pprint
7
def parserhtmllist(htmldata):
    """Extract proxy (server, port) pairs from the proxy-list HTML page.

    Only ``<tr class="cells">`` rows are inspected.  The text chunks found
    inside the ``<td>`` cells of such a row are collected in document
    order.  A row is kept when its first chunk parses as an integer; its
    second and third chunks are then returned as one pair.

    htmldata -- HTML source of the page, as a string.

    Returns a list of ``(second_chunk, third_chunk)`` string tuples in
    document order.  Rows whose first chunk is not an integer, or that
    hold fewer than three chunks, are skipped.
    """
    class MyHTMLParser(HTMLParser):
        """Small state machine: 'none' -> 'cell' (in a row) -> 'celltd'."""

        def __init__(self):
            HTMLParser.__init__(self)
            self.cell_state = 'none'   # 'none', 'cell' or 'celltd'
            self.cell_rows = []        # one list of text chunks per row
            self.td_depth = 0          # number of <td> currently open

        def handle_starttag(self, tag, attrs):
            # A row of interest starts a fresh chunk list.
            if tag == 'tr' and ('class', 'cells') in attrs:
                self.td_depth = 0
                self.cell_state = 'cell'
                self.cell_rows.append([])
            # Entering a cell of such a row.
            if self.cell_state == 'cell' and tag == 'td':
                self.td_depth += 1
                self.cell_state = 'celltd'

        def handle_endtag(self, tag):
            if self.cell_state == 'celltd' and tag == 'td':
                self.td_depth -= 1
                if self.td_depth == 0:
                    self.cell_state = 'cell'
            if self.cell_state == 'cell' and tag == 'tr':
                self.cell_state = 'none'

        def handle_data(self, data):
            # Text is recorded only while inside a <td> of a tracked row.
            if self.cell_state == 'celltd' and self.td_depth >= 1:
                self.cell_rows[-1].append(data)

        def getlist(self):
            """Return the (chunk 2, chunk 3) pairs of the numbered rows."""
            pairs = []
            for chunks in self.cell_rows:
                try:
                    int(chunks[0])
                    pair = (chunks[1], chunks[2])
                except (ValueError, IndexError):
                    # Non-numeric first chunk or too few chunks: not a
                    # data row, so leave it out.
                    continue
                pairs.append(pair)
            return pairs

    parser = MyHTMLParser()
    parser.feed(htmldata)
    parser.close()
    return parser.getlist()
48
def getproxylist(proxylisturl, testurls,
                 proxies={}, maxtime=20, debug=True):
    """Download a proxy list and time every listed proxy on the test URLs.

    proxylisturl -- URL of the HTML page listing the proxies; its content
                    is parsed with parserhtmllist().
    testurls     -- sequence of URLs opened through each listed proxy.
    proxies      -- proxy mapping handed to urllib.FancyURLopener for
                    downloading the list page itself.  It is only read,
                    never mutated, so the shared default dict is harmless.
    maxtime      -- value recorded instead of an elapsed time when opening
                    a test URL raises IOError.  It is only a penalty
                    figure; no socket timeout is configured from it here.
    debug        -- when true, print one progress line per proxy.

    Returns a list of ``((server, port), [seconds, ...])`` pairs, one per
    proxy in page order, with one timing per entry of ``testurls``.
    An IOError while downloading the list page itself propagates.
    """
    listing = urllib.FancyURLopener(proxies).open(proxylisturl)
    try:
        data = listing.read()
    finally:
        listing.close()

    servers = parserhtmllist(data)
    timings = []
    for server, port in servers:
        proxy = {'http': 'http://%s:%s' % (server, port)}
        # Route the test requests through the candidate proxy itself, not
        # through `proxies`; otherwise every candidate would be timed over
        # the same route and the ranking would be meaningless.
        opener = urllib.FancyURLopener(proxy)
        elapsed = []
        timings.append(elapsed)
        if debug:
            print('testing %s:%s' % (server, port))
        for url in testurls:
            try:
                st = time.time()
                filehandle = opener.open(url)
                et = time.time()
            except IOError:
                elapsed.append(maxtime)
            else:
                # Only the time to open the URL is measured; release the
                # connection instead of leaking it.
                filehandle.close()
                elapsed.append(et - st)
    return list(zip(servers, timings))
70
if __name__ == '__main__':
    # Proxy listing page to scrape (China, page 1).
    proxylisturl = ('http://www.haozs.net/proxyip/index.php?'
                    'act=list&port=&type=&country=China&page=1')
    #proxylisturl = 'a.html'

    # Every proxy is timed against each of these URLs.
    testurls = ['http://www.google.com']

    # Rank the proxies by their summed timings, fastest first.
    M = sorted(getproxylist(proxylisturl, testurls),
               key=lambda entry: sum(entry[1]))

    print('\nResult(sorted):')
    for (server, port), times in M:
        print('%s:%s\t%g' % (server, port, sum(times)))
86
1.2. 分析
HTML 的处理有点麻烦, 这里用了 HTMLParser, 相信熟悉 Parser 的同学们肯定都明白.