Differences between revisions 3 and 6 (spanning 3 versions)
Revision 3 as of 2008-01-23 08:33:39
Size: 6564
Editor: zuroc
Comment:
Revision 6 as of 2008-01-23 09:00:48
Size: 3300
Editor: zuroc
Comment:
Deletions are marked like this. Additions are marked like this.
Line 17: Line 17:
== 第一天 ==
写作中....

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

== 第一天PycURL ==
Line 20: Line 27:
Line 31: Line 39:
Line 33: Line 42:

#像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
Line 35: Line 46:
#像操作文件一样操作字符串
Line 61: Line 71:
然后看看多线程的例子
{{{
import os, sys
from cStringIO import StringIO

import pycurl


urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN", # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1], "rb").readlines()
except IndexError:
    # No file was specified
    pass

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.rstrip()
    c.body = StringIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    m.add_handle(c)

# get data
num_handles = len(m.handles)
while num_handles:
     while 1:
         ret, num_handles = m.perform()
         if ret != pycurl.E_CALL_MULTI_PERFORM:
             break
     # currently no more I/O is pending, could do something in the meantime
     # (display a progress bar, etc.)
     m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print "**********", c.url, "**********"
        print data
    else:
        print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data))

}}}
Line 65: Line 143:

=== 文献 ===
==== 1. PycURL简单学习 ====
http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
PycURL 是一个C语言写的 libcurl 的 Python 绑定库。libcurl 是一个自由的,并且容易使用的用在客户端的 URL 传输库。它的功能很强大,在 PycURL 的主页上介绍的支持的功能有:

    supporting FTP, FTPS, HTTP, HTTPS, GOPHER, TELNET, DICT, FILE and LDAP. libcurl supports HTTPS certificates, HTTP POST, HTTP PUT, FTP uploading, kerberos, HTTP form based upload, proxies, cookies, user+password authentication, file transfer resume, http proxy tunneling and more!

那一大堆的协议已经让人惊喜了,特别是还有代理服务器和用户认证之类的功能。这个库相对于 urllib2 来说,它不是纯 Python 的,它是一个 C 库,但因此速度更快,但它不是很 pythonic ,学起来有些复杂。它在多种平台下都有移植,象 Linux , Mac, Windows, 和多种Unix。

我安装了一个,并且测试了一小段代码,是有些复杂,代码如下:
{{{
#!python
            import pycurl
            c = pycurl.Curl()
            c.setopt(pycurl.URL, 'http://feeds.feedburner.com/solidot')
            import StringIO
            b = StringIO.StringIO()
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
    # c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
    # c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
            c.perform()
            print b.getvalue()
}}}
上述代码将会把奇客(Solidot)的RSS抓下来。如果有代理服务器,那么修改一下注释的两行即可。在 PycURL 的主页上还有一个多线程抓取的例子,有兴趣的可以看一看。

==== 2. python中的pycurl模块学习 ====
文章作者:[email protected]
信息来源:邪恶八进制信息安全团队(www.eviloctal.com)

1、使用getinfo来获得更多的信息:
{{{
#!python
#! /usr/bin/env python
# vi:ts=4:et
# $Id: test_getinfo.py,v 1.18 2003/05/01 19:35:01 mfx Exp $
# Author BY MSN:[email protected]
import time
import pycurl


## Callback function invoked when progress information is updated
#下面的函数用来显示下载的进度:
def progress(download_t, download_d, upload_t, upload_d):
  print "Total to download %d bytes, have %d bytes so far" % \
      (download_t, download_d)

url = "http://www.sohu.com/index.html"

print "Starting downloading", url
print
f = open("body.html", "wb") #新建一个文件并返回文件描述字,f用来保存返回的网页内容
h = open("header.txt", "wb")#h用来保存返回的包头header信息
i = open("info.txt","wb") #i用来保存getinfo()函数取回的信息
c = pycurl.Curl()
c.setopt(c.URL, url) #设置要访问的网址
c.setopt(c.WRITEDATA, f) #将返回的网页内容写入f文件描述字
c.setopt(c.NOPROGRESS, 0)
c.setopt(c.PROGRESSFUNCTION, progress)#调用过程函数
c.setopt(c.FOLLOWLOCATION, 1)
c.setopt(c.MAXREDIRS, 5)
c.setopt(c.WRITEHEADER, h)#将返回的包头header内容写入h文件描述字
c.setopt(c.OPT_FILETIME, 1)
c.perform() #执行上述访问网址的操作

print
print "HTTP-code:", c.getinfo(c.HTTP_CODE) #Outputs:200
buf=c.getinfo(c.HTTP_CODE)
i.write("HTTP-code:"+str(buf)) #将输出写入到i文件描述字中
print "Total-time:", c.getinfo(c.TOTAL_TIME) #下载总时间:0.795
buf=c.getinfo(c.TOTAL_TIME)
i.write('\r\n')
i.write("Total-time:"+str(buf))
print "Download speed: %.2f bytes/second" % c.getinfo(c.SPEED_DOWNLOAD) #下载速度:261032.00 bytes/second
print "Document size: %d bytes" % c.getinfo(c.SIZE_DOWNLOAD) #下载文档的大小:207521 bytes
print "Effective URL:", c.getinfo(c.EFFECTIVE_URL) #有效网址:http://www.sohu.com/index.html
print "Content-type:", c.getinfo(c.CONTENT_TYPE) #text/html
print "Namelookup-time:", c.getinfo(c.NAMELOOKUP_TIME) #DNS解析速度:0.065
print "Redirect-time:", c.getinfo(c.REDIRECT_TIME) #0.0
print "Redirect-count:", c.getinfo(c.REDIRECT_COUNT) #0
epoch = c.getinfo(c.INFO_FILETIME)
print "Filetime: %d (%s)" % (epoch, time.ctime(epoch)) #文件下载时间:1172361818 (Sun Feb 25 08:03:38 2007)
print
print "Header is in file 'header.txt', body is in file 'body.html'"

c.close()
f.close()
h.close()
}}}

2、简单用法:
{{{
#!python
#!c:\python25\python
# vi:ts=4:et
# $Id: test_cb.py,v 1.14 2003/04/21 18:46:10 mfx Exp $
# Author BY MSN:[email protected]
#
# Minimal pycurl example: stream a page, handing body and header data to
# Python callback functions as they arrive (Python 2 example).

import sys
import pycurl

## Callback function invoked when body data is ready
def body(buf):
  # Print body data to stdout
  sys.stdout.write(buf) # write the received chunk to standard output
 

## Callback function invoked when header data is ready
def header(buf):
  # Print header data to stdout (NOTE: unlike the upstream pycurl
  # example, this version writes headers to stdout, not stderr)
  sys.stdout.write(buf)

c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://www.sohu.com/') # the URL to fetch
c.setopt(pycurl.WRITEFUNCTION, body) # body() receives response-body chunks
c.setopt(pycurl.HEADERFUNCTION, header)# header() receives header lines
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform() # run the transfer (blocks until done)
c.close()
}}}
=== 相关文献 ===
 * PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
 * python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337

含有章节索引的 *PUG 文章通用模板 ::-- ["zuroc"] [DateTime(2008-01-23T08:30:45Z)] TableOfContents

Include(CPUGnav)

1. ZSPY

用python抓取网络

代码见 http://zspy.googlecode.com

写作中....

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

1.1. 第一天PycURL

Pycurl http://pycurl.sourceforge.net/

外部libcurl的接口,C写的,比urllib快,功能强.支持循环rewrite陷阱的安全深度. 用于做网络爬虫,抓网页.

http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.

参考文献1,测试代码

   1 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
   2 import StringIO
   3 
   4 html = StringIO.StringIO()
   5 
   6 import pycurl
   7 c = pycurl.Curl()
   8 
   9 c.setopt(pycurl.URL, 'http://www.baidu.com')
  10 
  11 #写的回调
  12 c.setopt(pycurl.WRITEFUNCTION, html.write)
  13 
  14 c.setopt(pycurl.FOLLOWLOCATION, 1)
  15 
  16 #最大重定向次数,可以预防重定向陷阱
  17 c.setopt(pycurl.MAXREDIRS, 5)
  18 
  19 #访问,阻塞到访问结束
  20 c.perform()
  21 
  22 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
  23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
  24 
  25 #输出百度首页的html
  26 #print html.getvalue()

然后看看多线程的例子

# Fetch several URLs concurrently using pycurl's CurlMulti interface
# (Python 2 example: one easy handle per URL, all driven from one thread).
import os, sys
from cStringIO import StringIO

import pycurl


# Default URL list, used when no file is given on the command line.
urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN",  # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1], "rb").readlines()
except IndexError:
    # No file was specified
    pass

# init: create one Curl handle per URL and register it with the multi handle
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.rstrip()
    c.body = StringIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    m.add_handle(c)

# get data: drive all transfers until no handle has work left
num_handles = len(m.handles)
while num_handles:
     # perform() must be re-called as long as it reports more work pending
     while 1:
         ret, num_handles = m.perform()
         if ret != pycurl.E_CALL_MULTI_PERFORM:
             break
     # currently no more I/O is pending, could do something in the meantime
     # (display a progress bar, etc.)
     m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print "**********", c.url, "**********"
        print data
    else:
        print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data))

1.1.1. 相关文献

2. 反馈

PageComment2

zspy (last edited 2009-12-25 07:15:17 by localhost)