Differences between revisions 2 and 15 (spanning 13 versions)
Revision 2 as of 2008-01-23 08:33:02
Size: 6565
Editor: zuroc
Comment:
Revision 15 as of 2008-02-16 10:37:50
Size: 6999
Editor: zeehk
Comment:
Deletions are marked like this. Additions are marked like this.
Line 3: Line 3:
'''
Generic *PUG article template with a table of contents
'''
::-- ["zuroc"] [[[DateTime(2008-01-23T08:30:45Z)]]]
[[TableOfContents]]
## default navigation, please keep
[[Include(CPUGnav)]]


= ZSPY =
''Crawling the web with Python''

Code at https://zspy.googlecode.com
== Day 1 ==

Pycurl http://pycurl.sourceforge.net/

An interface to the external libcurl library, written in C. It is faster and more capable than urllib, and supports a bounded redirect depth that guards against circular-redirect traps. Well suited to web crawlers and page fetching.


Download pycurl-ssl-7.16.4.win32-py2.5.exe from http://pycurl.sourceforge.net/download/ and install it.
Code at http://zspy.googlecode.com

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

== PycURL ==

Pycurl http://pycurl.sourceforge.net/

An interface to the external libcurl library, written in C. It is faster and more capable than urllib, and supports a bounded redirect depth that guards against circular-redirect traps. Well suited to web crawlers and page fetching.

Download pycurl-ssl-7.16.4.win32-py2.5.exe from http://pycurl.sourceforge.net/download/ and install it.
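
The claim that it beats urllib on speed is easy to try for yourself. Below is a rough, unscientific timing sketch (the target URL is just an example, and a single run is dominated by network variance):

{{{
#!python
import time
import StringIO
import urllib2
import pycurl

URL = 'http://www.baidu.com'  # example target, swap in any page

def fetch_urllib2():
    return urllib2.urlopen(URL).read()

def fetch_pycurl():
    buf = StringIO.StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, URL)
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    c.perform()
    c.close()
    return buf.getvalue()

for name, fetch in (('urllib2', fetch_urllib2), ('pycurl', fetch_pycurl)):
    start = time.time()
    fetch()
    print '%-8s %.3f seconds' % (name, time.time() - start)
}}}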
Line 31: Line 17:
Line 33: Line 20:

#Treat a string like a file; you could also use from cStringIO import StringIO, which should perform a bit better
Line 35: Line 24:
#Treat a string like a file
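
The performance remark in the deleted comment is easy to verify. A quick sketch comparing the two modules (absolute numbers will vary by machine):

{{{
#!python
import time
import StringIO
import cStringIO

# write the same data through both implementations and time it
for mod in (StringIO, cStringIO):
    buf = mod.StringIO()
    start = time.time()
    for _ in xrange(100000):
        buf.write('0123456789')
    print mod.__name__, time.time() - start
}}}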
Line 60: Line 48:

=== Articles ===
==== 1. A quick look at PycURL ====
http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
PycURL is a Python binding for libcurl, written in C. libcurl is a free, easy-to-use client-side URL transfer library. It is very capable; the PycURL homepage lists its supported features as:

    supporting FTP, FTPS, HTTP, HTTPS, GOPHER, TELNET, DICT, FILE and LDAP. libcurl supports HTTPS certificates, HTTP POST, HTTP PUT, FTP uploading, kerberos, HTTP form based upload, proxies, cookies, user+password authentication, file transfer resume, http proxy tunneling and more!

That long list of protocols is impressive by itself, and the proxy and user-authentication support even more so. Unlike urllib2, this is not a pure-Python library but a C one, which makes it faster, though less pythonic and somewhat harder to learn. It has been ported to many platforms: Linux, Mac, Windows, and various Unixes.
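
Most of the features above cost only one setopt() call each. A minimal sketch of HTTP POST plus user+password authentication (the URL, credentials, and proxy address are placeholders, not working values):

{{{
#!python
import StringIO
import pycurl

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://example.com/login')    # placeholder URL
c.setopt(pycurl.POSTFIELDS, 'user=aaa&pass=bbb')    # HTTP POST body
c.setopt(pycurl.USERPWD, 'aaa:bbb')                 # user+password authentication
#c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')  # optional proxy
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.perform()
print c.getinfo(pycurl.HTTP_CODE)
c.close()
}}}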

I installed it and tested a small piece of code. It is a bit involved; the code follows:
Next, for multi-threaded use, http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ has plenty of examples, and http://pycurl.sourceforge.net/doc/curlmultiobject.html is also worth consulting.

I rewrote one myself :)
Line 78: Line 54:
{{{
#!python
import pycurl
c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://feeds.feedburner.com/solidot')
import StringIO
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
#c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
#c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
c.perform()
print b.getvalue()
}}}

{{{
#!python
#!/usr/bin/env python
#coding=utf-8

import threading
import pycurl
from cStringIO import StringIO

class UrlOpen(threading.Thread):
    """Download pages asynchronously."""

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        self.handle_list = []

    def add(self, url, recall, writer=None):
        """
        Args: url, callback function, object that buffers the fetched data
        """
        # build a fresh buffer per request; a shared default instance
        # would mix the content of different downloads together
        if writer is None:
            writer = StringIO()

        c = pycurl.Curl()

        # extra attributes so the callback can reach them
        c.url = url
        c.content = writer
        c.recall = recall
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _remove(self, c):
        # detach from the multi handle before closing the easy handle
        self.opener.remove_handle(c)
        self.handle_list.remove(c)
        c.close()

    def run(self):
        num_handle = len(self.handle_list)
        while 1:
            ret = self.opener.select(10.0)
            if ret == -1:
                continue
            while 1:
                num_handle_pre = num_handle
                ret, num_handle = self.opener.perform()
                # the number of active connections changed
                if num_handle != num_handle_pre:
                    result = self.opener.info_read()
                    print result  # debug: show what info_read returned
                    for i in result[1]:
                        # success
                        i.http_code = i.getinfo(i.HTTP_CODE)
                        self._remove(i)
                        i.recall(i)
                    for c_failed, errno, errmsg in result[2]:
                        # failure; this should be logged
                        self._remove(c_failed)

                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

_opener = None
def urlopen(*arg, **key):
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.add(*arg, **key)
        _opener.start()
    else:
        _opener.add(*arg, **key)

def show(x):
    print x.content.getvalue()

if __name__ == "__main__":
    urlopen("http://www.baidu.com/", show)
    _opener.join()
}}}
Line 91: Line 132:
The code above fetches the Solidot RSS feed. If you are behind a proxy server, just adjust the two commented-out lines. The PycURL homepage also carries a multi-threaded fetching example for those interested.
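
That multi-threaded example is built on the same CurlMulti interface used below; for reference, the core polling pattern from http://pycurl.sourceforge.net/doc/curlmultiobject.html looks roughly like this (a minimal single-threaded sketch; the URLs are just placeholders):

{{{
#!python
import StringIO
import pycurl

urls = ['http://www.baidu.com/', 'http://www.sohu.com/']  # placeholder targets

m = pycurl.CurlMulti()
handles = []  # keep references so the easy handles are not garbage collected
for url in urls:
    c = pycurl.Curl()
    c.buf = StringIO.StringIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, c.buf.write)
    m.add_handle(c)
    handles.append(c)

# start the transfers
while 1:
    ret, num_handles = m.perform()
    if ret != pycurl.E_CALL_MULTI_PERFORM:
        break

# poll until every transfer has finished
while num_handles:
    m.select(1.0)  # wait for activity on any of the connections
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

for c in handles:
    print c.getinfo(pycurl.EFFECTIVE_URL), len(c.buf.getvalue())
}}}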

==== 2. Learning the pycurl module in Python ====
Author: [email protected]
Source: Evil Octal Security Team (www.eviloctal.com)

1. Use getinfo to obtain more information:

{{{
#!python
#! /usr/bin/env python
# vi:ts=4:et
# $Id: test_getinfo.py,v 1.18 2003/05/01 19:35:01 mfx Exp $
# Author BY MSN:[email protected]

import time
import pycurl

## Callback function invoked when progress information is updated
# shows how far the download has got:
def progress(download_t, download_d, upload_t, upload_d):
    print "Total to download %d bytes, have %d bytes so far" % \
        (download_t, download_d)

url = "http://www.sohu.com/index.html"

print "Starting downloading", url
print
f = open("body.html", "wb")   # receives the returned page body
h = open("header.txt", "wb")  # receives the returned response headers
i = open("info.txt", "wb")    # receives the values read back with getinfo()
c = pycurl.Curl()
c.setopt(c.URL, url)                    # the address to fetch
c.setopt(c.WRITEDATA, f)                # write the page body into f
c.setopt(c.NOPROGRESS, 0)
c.setopt(c.PROGRESSFUNCTION, progress)  # install the progress callback
c.setopt(c.FOLLOWLOCATION, 1)
c.setopt(c.MAXREDIRS, 5)
c.setopt(c.WRITEHEADER, h)              # write the response headers into h
c.setopt(c.OPT_FILETIME, 1)
c.perform()                             # perform the transfer (blocking)

print
print "HTTP-code:", c.getinfo(c.HTTP_CODE)  # e.g. 200
buf = c.getinfo(c.HTTP_CODE)
i.write("HTTP-code:" + str(buf))            # record it in info.txt
print "Total-time:", c.getinfo(c.TOTAL_TIME)  # total download time, e.g. 0.795
buf = c.getinfo(c.TOTAL_TIME)
i.write('\r\n')
i.write("Total-time:" + str(buf))
print "Download speed: %.2f bytes/second" % c.getinfo(c.SPEED_DOWNLOAD)  # e.g. 261032.00 bytes/second
print "Document size: %d bytes" % c.getinfo(c.SIZE_DOWNLOAD)  # e.g. 207521 bytes
print "Effective URL:", c.getinfo(c.EFFECTIVE_URL)  # e.g. http://www.sohu.com/index.html
print "Content-type:", c.getinfo(c.CONTENT_TYPE)  # e.g. text/html
print "Namelookup-time:", c.getinfo(c.NAMELOOKUP_TIME)  # DNS lookup time, e.g. 0.065
print "Redirect-time:", c.getinfo(c.REDIRECT_TIME)  # e.g. 0.0
print "Redirect-count:", c.getinfo(c.REDIRECT_COUNT)  # e.g. 0
epoch = c.getinfo(c.INFO_FILETIME)
print "Filetime: %d (%s)" % (epoch, time.ctime(epoch))  # e.g. 1172361818 (Sun Feb 25 08:03:38 2007)
print
print "Header is in file 'header.txt', body is in file 'body.html'"

c.close()
f.close()
h.close()
i.close()
}}}

I also wrapped up another class and function for opening pages asynchronously. Compared with the first version, new requests go onto a waiting list and are attached to the CurlMulti object from the worker thread itself, and a catch callback can be supplied for failed transfers:

{{{
#!python
#coding=utf-8

import threading
import time
from cStringIO import StringIO

import pycurl
"""
Async open url
Author:[email protected]
2008-1-25 17:14
"""

class UrlOpen(threading.Thread):
    """Download pages asynchronously."""

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        self.handle_list = []
        self.waiting = []

    def add(self, url, recall, catch=None, writer=None):
        """
        Args: url, callback function, error callback, object that buffers the fetched data
        """
        if catch is None:
            def catch(curl, error_no, desp):
                # default error handler: ignore the failure
                #print "Error:%s - %s" % (error_no, desp)
                pass

        # build a fresh buffer per request; a shared default instance
        # would mix the content of different downloads together
        if writer is None:
            writer = StringIO()

        c = pycurl.Curl()

        # extra attributes so the callbacks can reach them
        c.url = url
        c.content = writer
        c.recall = recall
        c.catch = catch
        # libcurl wants a byte string, so encode unicode URLs first
        c.setopt(c.URL,
            url.encode('utf-8') if type(url) is unicode else url
        )
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.waiting.append(c)

    def _add(self):
        # attach queued requests from the worker thread itself
        waiting = self.waiting[:]
        self.waiting = []
        for c in waiting:
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        # detach from the multi handle before closing the easy handle
        self.opener.remove_handle(c)
        self.handle_list.remove(c)
        c.close()

    def run(self):
        num_handle = 0
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        num_handle_pre = num_handle
                        ret, num_handle = self.opener.perform()
                        # the number of active connections changed
                        if num_handle != num_handle_pre:
                            result = self.opener.info_read()
                            for i in result[1]:
                                # success
                                i.http_code = i.getinfo(i.HTTP_CODE)
                                self._remove(i)
                                i.recall(i)
                            for i in result[2]:
                                # failure; log it or invoke the error callback
                                # i looks like (<pycurl.Curl object>, 6, 'Could not resolve host: www.msn.com (Domain name not found)')
                                i[0].catch(*i)
                                self._remove(i[0])
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                time.sleep(1)
            self._add()

_opener = None
def urlopen(*arg, **key):
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.start()
    _opener.add(*arg, **key)

if __name__ == "__main__":
    def show(x):
        print x.content.getvalue()
        print '--' * 11
    urlopen("http://www.baidu.com/", show)
    urlopen("http://www.google.com/", show)
    urlopen("http://www.sougou.com/", show)
    urlopen("http://www.yodao.com/", show)
    urlopen("http://www.yahoo.com/", show)
    urlopen("http://www.msn.com/", show)
    _opener.join()
}}}
Line 156: Line 244:

2. Basic usage:
{{{
#!python
#!c:\python25\python
# vi:ts=4:et
# $Id: test_cb.py,v 1.14 2003/04/21 18:46:10 mfx Exp $
# Author BY MSN:[email protected]

import sys
import pycurl

## Callback function invoked when body data is ready
def body(buf):
    # print body data to standard output
    sys.stdout.write(buf)

## Callback function invoked when header data is ready
def header(buf):
    # print header data to standard output as well
    sys.stdout.write(buf)

c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://www.sohu.com/')  # the address to fetch
c.setopt(pycurl.WRITEFUNCTION, body)          # body() receives the response body
c.setopt(pycurl.HEADERFUNCTION, header)       # header() receives the response headers
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()                                   # perform the transfer (blocking)
c.close()
}}}

=== Related articles ===
 * A quick look at PycURL (PycURL简单学习) http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
 * Learning the pycurl module in Python (python中的pycurl模块学习) https://forum.eviloctal.com/read.php?tid=27337

Code at http://zspy.googlecode.com

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

Test code, following reference 1:

{{{
#!python
# treat a string like a file; from cStringIO import StringIO should perform a bit better
import StringIO

html = StringIO.StringIO()

import pycurl
c = pycurl.Curl()

c.setopt(pycurl.URL, 'http://www.baidu.com')

# write callback
c.setopt(pycurl.WRITEFUNCTION, html.write)

c.setopt(pycurl.FOLLOWLOCATION, 1)

# maximum number of redirects; guards against redirect traps
c.setopt(pycurl.MAXREDIRS, 5)

# perform the transfer; blocks until it finishes
c.perform()

# prints 200 (the HTTP status code) and http://www.baidu.com (the effective URL)
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)

# print the Baidu homepage HTML
#print html.getvalue()
}}}
