含有章节索引的 *PUG 文章通用模板 ::-- ["zuroc"] [DateTime(2008-01-23T08:30:45Z)] TableOfContents

Include(CPUGnav)

1. ZSPY

用python抓取网络

代码见 http://zspy.googlecode.com

写作中....

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

1.1. 第一天PycURL

Pycurl http://pycurl.sourceforge.net/

外部libcurl的接口,C写的,比urllib快,功能强.支持限制循环重定向(redirect)陷阱的安全深度. 用于做网络爬虫,抓网页.

http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.

参考文献1,测试代码

   1 # Treat a string buffer like a file; `from cStringIO import StringIO` would likely perform better
   2 import StringIO
   3 
   4 html = StringIO.StringIO()
   5 
   6 import pycurl
   7 c = pycurl.Curl()
   8 
   9 c.setopt(pycurl.URL, 'http://www.baidu.com')
  10 
  11 # Write callback: the response body is appended to the StringIO buffer
  12 c.setopt(pycurl.WRITEFUNCTION, html.write)
  13 
  14 c.setopt(pycurl.FOLLOWLOCATION, 1)
  15 
  16 # Maximum number of redirects, guards against redirect loops/traps
  17 c.setopt(pycurl.MAXREDIRS, 5)
  18 
  19 # Perform the transfer; blocks until it finishes
  20 c.perform()
  21 
  22 # Prints: 200 (HTTP status code) http://www.baidu.com (effective URL)
  23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
  24 
  25 # Dump the fetched homepage HTML
  26 #print html.getvalue()

然后看看多线程,http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 有很多例子,还可做参考http://pycurl.sourceforge.net/doc/curlmultiobject.html

我自己改写了一个:)

   1 #!/usr/bin/env python
   2 #coding=utf-8
   3 
   4 import threading
   5 import pycurl
   6 from cStringIO import StringIO
   7 
   8 class UrlOpen(threading.Thread):
   9     """Download web pages asynchronously on a background thread."""
  10     
  11     def __init__(self):
  12         super(UrlOpen,self).__init__()
  13         self.opener = pycurl.CurlMulti()
  14         self.handle_list=[]
  15         
  16     def add(self,url,recall,writer=StringIO()):  # NOTE(review): mutable default — one StringIO shared by every call; pages would mix
  17         """
  18         Args: URL, success callback, object collecting the response body
  19         """
  20         c = pycurl.Curl()
  21         
  22         # Extra attributes stashed on the handle so the callback can reach them
  23         c.url=url
  24         c.content = writer
  25         c.recall = recall
  26         c.setopt(c.URL,url)
  27         c.setopt(c.WRITEFUNCTION,c.content.write)
  28 
  29         self.handle_list.append(c)
  30         self.opener.add_handle(c)
  31     
  32     def _remove(self,c):
  33         c.close()  # NOTE(review): libcurl expects remove_handle() before close() — verify order
  34         self.opener.remove_handle(c)
  35         self.handle_list.remove(c)
  36         
  37     
  38     def run(self):
  39         num_handle=len(self.handle_list)
  40         while 1:
  41             ret = self.opener.select(10.0)
  42             if ret == -1:  continue
  43             while 1:
  44                 num_handle_pre=num_handle
  45                 ret, num_handle =self.opener.perform()
  46                 # Only read results when the number of active transfers changed
  47                 if num_handle!=num_handle_pre:
  48                     result=self.opener.info_read()
  49                     print result
  50                     for i in result[1]:
  51                         # Finished successfully
  52                         i.http_code = i.getinfo(i.HTTP_CODE)
  53                         self._remove(i)
  54                         i.recall(i)
  55                     for i in result[2]:
  56                         # Failed; should be logged somewhere
  57                         self._remove(i)
  58 
  59                 if ret != pycurl.E_CALL_MULTI_PERFORM:
  60                     break
  61 
  62 _opener=None
  63 def urlopen(*arg,**key):
  64     # Lazily create and start the shared downloader thread on first use
  65     global _opener
  66     if _opener is None:
  67         _opener=UrlOpen()
  68         _opener.add(*arg,**key)
  69         _opener.start()
  70     else:
  71         _opener.add(*arg,**key)
  72         
  73 def show(x):
  74     print x.content.getvalue()
  75 if __name__=="__main__":
  76     urlopen("http://www.baidu.com/",show)
  77     _opener.join()

又封装了一个异步打开网页的类和函数

#coding=utf-8

import threading
from cStringIO import StringIO

import pycurl
"""
Asyn open url
Author:[email protected]
2008-1-25 17:14
"""

class UrlOpen(threading.Thread):
    """Asynchronously download web pages through one shared CurlMulti loop.

    Usage: create, queue URLs with add(), then start(). Each finished
    transfer invokes its `recall` callback with the pycurl handle;
    failures invoke `catch(curl, error_no, description)`.
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        self.handle_list = []
        # Handles queued by add(); moved onto the multi handle by run()
        # so only the worker thread touches self.opener.
        self.waiting = []

    def add(self, url, recall, catch=None, writer=None):
        """Queue a URL for download.

        url    -- address to fetch (unicode is encoded as UTF-8)
        recall -- success callback, called as recall(curl_handle)
        catch  -- error callback catch(curl, error_no, description);
                  defaults to a silent no-op
        writer -- object with a write() method collecting the body.
                  A fresh StringIO is created per call when omitted;
                  a shared default instance would interleave the
                  bodies of every queued page.
        """
        if catch is None:
            def catch(curl, error_no, desp):
                # Swallow errors by default; pass a real callback to log them.
                pass
        if writer is None:
            writer = StringIO()

        c = pycurl.Curl()

        # Stash extra attributes on the handle so callbacks can reach them.
        c.url = url
        c.content = writer
        c.recall = recall
        c.catch = catch
        c.setopt(c.URL,
            url.encode('utf-8') if type(url) is unicode else url
        )
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.waiting.append(c)

    def _add(self):
        # Drain the waiting queue into the multi handle.
        waiting = self.waiting[:]
        self.waiting = []
        for c in waiting:
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        # Detach from the multi handle BEFORE closing: libcurl requires
        # curl_multi_remove_handle before curl_easy_cleanup.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        import time
        num_handle = len(self.handle_list)
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        num_handle_pre = num_handle
                        ret, num_handle = self.opener.perform()
                        # Only read results when the number of active
                        # transfers changed.
                        if num_handle != num_handle_pre:
                            result = self.opener.info_read()
                            for i in result[1]:
                                # Finished successfully.
                                i.http_code = i.getinfo(i.HTTP_CODE)
                                self._remove(i)
                                i.recall(i)
                            for i in result[2]:
                                # i is (curl, error_no, description), e.g.
                                # (<Curl>, 6, 'Could not resolve host: ...')
                                i[0].catch(*i)
                                self._remove(i[0])
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                # Nothing in flight; avoid a busy loop.
                time.sleep(1)
            self._add()

# Shared downloader thread, created lazily on first use.
_opener = None

def urlopen(*args, **kwargs):
    """Queue a URL on the shared UrlOpen thread, starting it if needed.

    All arguments are forwarded unchanged to UrlOpen.add().
    """
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.start()
    _opener.add(*args, **kwargs)

if __name__=="__main__":
    # Demo: fetch several homepages concurrently and print each body
    # as its transfer completes, then wait for the worker thread.
    def show(x):
        # x is the finished pycurl handle; content holds the response body
        print x.content.getvalue()
        print '--'*11
    urlopen("http://www.baidu.com/",show)
    urlopen("http://www.google.com/",show)
    urlopen("http://www.sougou.com/",show)
    urlopen("http://www.yodao.com/",show)
    urlopen("http://www.yahoo.com/",show)
    urlopen("http://www.msn.com/",show)
    _opener.join()

1.1.1. 相关文献

2. 反馈

PageComment2