含有章节索引的 *PUG 文章通用模板 ::-- ["zuroc"] [DateTime(2008-01-23T08:30:45Z)] TableOfContents
1. ZSPY
用python抓取网络
代码见 http://zspy.googlecode.com
写作中....
张沈鹏 [email protected] http://zsp.javaeye.com/
2008-1-23 16:42
1.1. 第一天PycURL
Pycurl http://pycurl.sourceforge.net/
外部libcurl的接口,C写的,比urllib快,功能强.支持限制循环重定向(redirect)陷阱的安全深度. 用于做网络爬虫,抓网页.
从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.
参考文献1,测试代码
1 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
2 import StringIO
3
4 html = StringIO.StringIO()
5
6 import pycurl
7 c = pycurl.Curl()
8
9 c.setopt(pycurl.URL, 'http://www.baidu.com')
10
11 #写的回调
12 c.setopt(pycurl.WRITEFUNCTION, html.write)
13
14 c.setopt(pycurl.FOLLOWLOCATION, 1)
15
16 #最大重定向次数,可以预防重定向陷阱
17 c.setopt(pycurl.MAXREDIRS, 5)
18
19 #访问,阻塞到访问结束
20 c.perform()
21
22 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
24
25 #输出百度首页的html
26 #print html.getvalue()
然后看看多线程,http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 有很多例子,还可做参考http://pycurl.sourceforge.net/doc/curlmultiobject.html
我自己改写了一个:)
1 #!/usr/bin/env python
2 #coding=utf-8
3
4 import threading
5 import pycurl
6 from cStringIO import StringIO
7
8 class UrlOpen(threading.Thread):
9 """异步下载网页"""
10
11 def __init__(self):
12 super(UrlOpen,self).__init__()
13 self.opener = pycurl.CurlMulti()
14 self.handle_list=[]
15
16 def add(self,url,recall,writer=StringIO()):
17 """
18 参数:网址,回调函数,存放临时数据的对象
19 """
20 c = pycurl.Curl()
21
22 #可以传给回调函数
23 c.url=url
24 c.content = writer
25 c.recall = recall
26 c.setopt(c.URL,url)
27 c.setopt(c.WRITEFUNCTION,c.content.write)
28
29 self.handle_list.append(c)
30 self.opener.add_handle(c)
31
32 def _remove(self,c):
33 c.close()
34 self.opener.remove_handle(c)
35 self.handle_list.remove(c)
36
37
38 def run(self):
39 num_handle=len(self.handle_list)
40 while 1:
41 ret = self.opener.select(10.0)
42 if ret == -1: continue
43 while 1:
44 num_handle_pre=num_handle
45 ret, num_handle =self.opener.perform()
46 #活动的连接数改变时
47 if num_handle!=num_handle_pre:
48 result=self.opener.info_read()
49 print result
50 for i in result[1]:
51 #成功
52 i.http_code = i.getinfo(i.HTTP_CODE)
53 self._remove(i)
54 i.recall(i)
55 for i in result[2]:
56 #失败,应该记录一下
57 self._remove(i)
58
59 if ret != pycurl.E_CALL_MULTI_PERFORM:
60 break
61
62 _opener=None
63 def urlopen(*arg,**key):
64 global _opener
65 if _opener is None:
66 _opener=UrlOpen()
67 _opener.add(*arg,**key)
68 _opener.start()
69 else:
70 _opener.add(*arg,**key)
71
72 def show(x):
73 print x.content.getvalue()
74 if __name__=="__main__":
75 urlopen("http://www.baidu.com/",show)
76 _opener.join()
又封装了一个异步打开网页的类和函数
#coding=utf-8 #coding=utf-8 import threading from cStringIO import StringIO import pycurl """ Asyn open url Author:[email protected] 2008-1-25 17:14 """ class UrlOpen(threading.Thread): """异步下载网页""" def __init__(self,): super(UrlOpen,self).__init__() self.opener = pycurl.CurlMulti() self.handle_list=[] self.waiting=[] def add(self,url,recall,catch=None,writer=StringIO()): """ 参数:网址,回调函数,存放临时数据的对象 """ if catch is None: def catch(curl,error_no,desp): #print "Error:%s - %s"%(error_no,desp) pass c = pycurl.Curl() #可以传给回调函数 c.url=url c.content = writer c.recall = recall c.catch=catch c.setopt(c.URL, url.encode('utf-8') if type(url) is unicode else url ) c.setopt(c.WRITEFUNCTION,c.content.write) self.waiting.append(c) def _add(self): waiting=self.waiting[:] self.waiting=[] for c in waiting: self.handle_list.append(c) self.opener.add_handle(c) def _remove(self,c): c.close() self.opener.remove_handle(c) self.handle_list.remove(c) def run(self): import select import time num_handle=0 while 1: if self.handle_list: ret = self.opener.select(1.0) if ret >= 0: while 1: num_handle_pre=num_handle ret, num_handle =self.opener.perform() #活动的连接数改变时 if num_handle!=num_handle_pre: result=self.opener.info_read() for i in result[1]: #成功 i.http_code = i.getinfo(i.HTTP_CODE) self._remove(i) i.recall(i) for i in result[2]: #失败,应该记录一下,或回调失败函数 #i为(<pycurl.Curl object at 0x00C04C80>, 6, 'Could not resolve host: www.msn.com (Domain name not found)') i[0].catch(*i) self._remove(i[0]) if ret != pycurl.E_CALL_MULTI_PERFORM: break else: time.sleep(1) self._add() _opener=None def urlopen(*arg,**key): global _opener if _opener is None: _opener=UrlOpen() _opener.start() _opener.add(*arg,**key) if __name__=="__main__": def show(x): print x.content.getvalue() print '--'*11 urlopen("http://www.baidu.com/",show) urlopen("http://www.google.com/",show) urlopen("http://www.sougou.com/",show) urlopen("http://www.yodao.com/",show) urlopen("http://www.yahoo.com/",show) urlopen("http://www.msn.com/",show) 
_opener.join()
1.1.1. 相关文献
PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337