Differences between revisions 3 and 6 (spanning 3 versions)
Revision 3 as of 2008-01-23 08:33:39
Size: 6564
Editor: zuroc
Comment:
Revision 6 as of 2008-01-23 09:00:48
Size: 3300
Editor: zuroc
Comment:
Deletions are marked like this. Additions are marked like this.
Line 17: Line 17:
== 第一天 ==
写作中....

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

== 第一天PycURL ==
Line 20: Line 27:
Line 31: Line 39:
Line 33: Line 42:

#像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
Line 35: Line 46:
#像操作文件一样操作字符串
Line 61: Line 71:
然后看看多线程的例子
{{{
import os, sys
from cStringIO import StringIO

import pycurl


urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN", # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1], "rb").readlines()
except IndexError:
    # No file was specified
    pass

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.rstrip()
    c.body = StringIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    m.add_handle(c)

# get data
num_handles = len(m.handles)
while num_handles:
     while 1:
         ret, num_handles = m.perform()
         if ret != pycurl.E_CALL_MULTI_PERFORM:
             break
     # currently no more I/O is pending, could do something in the meantime
     # (display a progress bar, etc.)
     m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print "**********", c.url, "**********"
        print data
    else:
        print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data))

}}}
Line 65: Line 143:

=== 文献 ===
==== 1. PycURL简单学习 ====
http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
PycURL 是一个C语言写的 libcurl 的 Python 绑定库。libcurl 是一个自由的,并且容易使用的用在客户端的 URL 传输库。它的功能很强大,在 PycURL 的主页上介绍的支持的功能有:

    supporting FTP, FTPS, HTTP, HTTPS, GOPHER, TELNET, DICT, FILE and LDAP. libcurl supports HTTPS certificates, HTTP POST, HTTP PUT, FTP uploading, kerberos, HTTP form based upload, proxies, cookies, user+password authentication, file transfer resume, http proxy tunneling and more!

那一大堆的协议已经让人惊喜了,特别是还有代理服务器和用户认证之类的功能。这个库相对于 urllib2 来说,它不是纯 Python 的,它是一个 C 库,但因此速度更快,但它不是很 pythonic ,学起来有些复杂。它在多种平台下都有移植,象 Linux , Mac, Windows, 和多种Unix。

我安装了一个,并且测试了一小段代码,是有些复杂,代码如下:
{{{
#!python
            import pycurl
            c = pycurl.Curl()
            c.setopt(pycurl.URL, 'http://feeds.feedburner.com/solidot')
            import StringIO
            b = StringIO.StringIO()
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
    # c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
    # c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
            c.perform()
            print b.getvalue()
}}}
上述代码将会把奇客(Solidot)的RSS抓下来。如果有代理服务器,那么修改一下注释的两行即可。在 PycURL 的主页上还有一个多线程抓取的例子,有兴趣的可以看一看。

==== 2. python中的pycurl模块学习 ====
文章作者:[email protected]
信息来源:邪恶八进制信息安全团队(www.eviloctal.com)

1、使用getinfo来获得更多的信息:
{{{
#!python
#! /usr/bin/env python
# vi:ts=4:et
# $Id: test_getinfo.py,v 1.18 2003/05/01 19:35:01 mfx Exp $
# Author BY MSN:[email protected]
import time
import pycurl


## Callback function invoked when progress information is updated
#下面的函数用来显示下载的进度:
def progress(download_t, download_d, upload_t, upload_d):
  print "Total to download %d bytes, have %d bytes so far" % \
      (download_t, download_d)

url = "http://www.sohu.com/index.html"

print "Starting downloading", url
print
f = open("body.html", "wb") #新建一个文件并返回文件描述字,f用来保存返回的网页内容
h = open("header.txt", "wb")#h用来保存返回的包头header信息
i = open("info.txt","wb") #i用来保存getinfo()函数取回的信息
c = pycurl.Curl()
c.setopt(c.URL, url) #设置要访问的网址
c.setopt(c.WRITEDATA, f) #将返回的网页内容写入f文件描述字
c.setopt(c.NOPROGRESS, 0)
c.setopt(c.PROGRESSFUNCTION, progress)#调用过程函数
c.setopt(c.FOLLOWLOCATION, 1)
c.setopt(c.MAXREDIRS, 5)
c.setopt(c.WRITEHEADER, h)#将返回的包头header内容写入h文件描述字
c.setopt(c.OPT_FILETIME, 1)
c.perform() #执行上述访问网址的操作

print
print "HTTP-code:", c.getinfo(c.HTTP_CODE) #Outputs:200
buf=c.getinfo(c.HTTP_CODE)
i.write("HTTP-code:"+str(buf)) #将输出写入到i文件描述字中
print "Total-time:", c.getinfo(c.TOTAL_TIME) #下载总时间:0.795
buf=c.getinfo(c.TOTAL_TIME)
i.write('\r\n')
i.write("Total-time:"+str(buf))
print "Download speed: %.2f bytes/second" % c.getinfo(c.SPEED_DOWNLOAD) #下载速度:261032.00 bytes/second
print "Document size: %d bytes" % c.getinfo(c.SIZE_DOWNLOAD) #下载文档的大小:207521 bytes
print "Effective URL:", c.getinfo(c.EFFECTIVE_URL) #有效网址:http://www.sohu.com/index.html
print "Content-type:", c.getinfo(c.CONTENT_TYPE) #text/html
print "Namelookup-time:", c.getinfo(c.NAMELOOKUP_TIME) #DNS解析速度:0.065
print "Redirect-time:", c.getinfo(c.REDIRECT_TIME) #0.0
print "Redirect-count:", c.getinfo(c.REDIRECT_COUNT) #0
epoch = c.getinfo(c.INFO_FILETIME)
print "Filetime: %d (%s)" % (epoch, time.ctime(epoch)) #文件下载时间:1172361818 (Sun Feb 25 08:03:38 2007)
print
print "Header is in file 'header.txt', body is in file 'body.html'"

c.close()
f.close()
h.close()
}}}

2、简单用法:
{{{
#!python
#!c:\python25\python
# vi:ts=4:et
# $Id: test_cb.py,v 1.14 2003/04/21 18:46:10 mfx Exp $
# Author BY MSN:[email protected]
#
# Minimal pycurl example: stream a page, handing body and header data to
# Python callback functions as they arrive (Python 2 example).

import sys
import pycurl

## Callback function invoked when body data is ready
def body(buf):
  # Print body data to stdout
  sys.stdout.write(buf) # write the received chunk to standard output
 

## Callback function invoked when header data is ready
def header(buf):
  # Print header data to stdout (NOTE: unlike the upstream pycurl
  # example, this version writes headers to stdout, not stderr)
  sys.stdout.write(buf)

c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://www.sohu.com/') # the URL to fetch
c.setopt(pycurl.WRITEFUNCTION, body) # body() receives response-body chunks
c.setopt(pycurl.HEADERFUNCTION, header)# header() receives header lines
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform() # run the transfer (blocks until done)
c.close()
}}}
=== 相关文献 ===
 * PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
 * python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337

含有章节索引的 *PUG 文章通用模板 ::-- ["zuroc"] [DateTime(2008-01-23T08:30:45Z)] TableOfContents

Include(CPUGnav)

1. ZSPY

用python抓取网络

代码见 http://zspy.googlecode.com

写作中....

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

1.1. 第一天PycURL

Pycurl http://pycurl.sourceforge.net/

外部libcurl的接口,C写的,比urllib快,功能强.支持循环rewrite陷阱的安全深度. 用于做网络爬虫,抓网页.

http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.

参考文献1,测试代码

   1 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
   2 import StringIO
   3 
   4 html = StringIO.StringIO()
   5 
   6 import pycurl
   7 c = pycurl.Curl()
   8 
   9 c.setopt(pycurl.URL, 'http://www.baidu.com')
  10 
  11 #写的回调
  12 c.setopt(pycurl.WRITEFUNCTION, html.write)
  13 
  14 c.setopt(pycurl.FOLLOWLOCATION, 1)
  15 
  16 #最大重定向次数,可以预防重定向陷阱
  17 c.setopt(pycurl.MAXREDIRS, 5)
  18 
  19 #访问,阻塞到访问结束
  20 c.perform()
  21 
  22 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
  23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
  24 
  25 #输出百度首页的html
  26 #print html.getvalue()

然后看看多线程的例子

# Fetch several URLs concurrently using pycurl's CurlMulti interface
# (Python 2 example: one easy handle per URL, all driven from one thread).
import os, sys
from cStringIO import StringIO

import pycurl


# Default URL list, used when no file is given on the command line.
urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN",  # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1], "rb").readlines()
except IndexError:
    # No file was specified
    pass

# init: create one Curl handle per URL and register it with the multi handle
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.rstrip()
    c.body = StringIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    m.add_handle(c)

# get data: drive all transfers until no handle has work left
num_handles = len(m.handles)
while num_handles:
     # perform() must be re-called as long as it reports more work pending
     while 1:
         ret, num_handles = m.perform()
         if ret != pycurl.E_CALL_MULTI_PERFORM:
             break
     # currently no more I/O is pending, could do something in the meantime
     # (display a progress bar, etc.)
     m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print "**********", c.url, "**********"
        print data
    else:
        print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data))

1.1.1. 相关文献

2. 反馈

PageComment2

zspy (last edited 2009-12-25 07:15:17 by localhost)