Size: 6564
Comment:
|
Size: 3300
Comment:
|
Deletions are marked like this. | Additions are marked like this. |
Line 17: | Line 17: |
== 第一天 == | 写作中.... 张沈鹏 [email protected] http://zsp.javaeye.com/ 2008-1-23 16:42 == 第一天PycURL == |
Line 20: | Line 27: |
Line 31: | Line 39: |
Line 33: | Line 42: |
#像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些 |
|
Line 35: | Line 46: |
#像操作文件一样操作字符串 | |
Line 61: | Line 71: |
然后看看多线程的例子 {{{ import os, sys from cStringIO import StringIO import pycurl urls = ( "http://curl.haxx.se", "http://www.python.org", "http://pycurl.sourceforge.net", "http://pycurl.sourceforge.net/tests/403_FORBIDDEN", # that actually exists ;-) "http://pycurl.sourceforge.net/tests/404_NOT_FOUND", ) # Read list of URIs from file specified on commandline try: urls = open(sys.argv[1], "rb").readlines() except IndexError: # No file was specified pass # init m = pycurl.CurlMulti() m.handles = [] for url in urls: c = pycurl.Curl() # save info in standard Python attributes c.url = url.rstrip() c.body = StringIO() c.http_code = -1 m.handles.append(c) # pycurl API calls c.setopt(c.URL, c.url) c.setopt(c.WRITEFUNCTION, c.body.write) m.add_handle(c) # get data num_handles = len(m.handles) while num_handles: while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break # currently no more I/O is pending, could do something in the meantime # (display a progress bar, etc.) m.select(1.0) # close handles for c in m.handles: # save info in standard Python attributes c.http_code = c.getinfo(c.HTTP_CODE) # pycurl API calls m.remove_handle(c) c.close() m.close() # print result for c in m.handles: data = c.body.getvalue() if 0: print "**********", c.url, "**********" print data else: print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data)) }}} |
|
Line 65: | Line 143: |
=== 文献 === ==== 1. PycURL简单学习 ==== http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx PycURL 是一个C语言写的 libcurl 的 Python 绑定库。libcurl 是一个自由的,并且容易使用的用在客户端的 URL 传输库。它的功能很强大,在 PycURL 的主页上介绍的支持的功能有: supporting FTP, FTPS, HTTP, HTTPS, GOPHER, TELNET, DICT, FILE and LDAP. libcurl supports HTTPS certificates, HTTP POST, HTTP PUT, FTP uploading, kerberos, HTTP form based upload, proxies, cookies, user+password authentication, file transfer resume, http proxy tunneling and more! 那一大堆的协议已经让人惊喜了,特别是还有代理服务器和用户认证之类的功能。这个库相对于 urllib2 来说,它不是纯 Python 的,它是一个 C 库,但因此速度更快,但它不是很 pythonic ,学起来有些复杂。它在多种平台下都有移植,象 Linux , Mac, Windows, 和多种Unix。 我安装了一个,并且测试了一小段代码,是有些复杂,代码如下: {{{ #!python import pycurl c = pycurl.Curl() c.setopt(pycurl.URL, 'http://feeds.feedburner.com/solidot') import StringIO b = StringIO.StringIO() c.setopt(pycurl.WRITEFUNCTION, b.write) c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) # c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080') # c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa') c.perform() print b.getvalue() }}} 上述代码将会把奇客(Solidot)的RSS抓下来。如果有代理服务器,那么修改一下注释的两行即可。在 PycURL 的主页上还有一个多线程抓取的例子,有兴趣的可以看一看。 ==== 2. python中的pycurl模块学习 ==== 文章作者:[email protected] 信息来源:邪恶八进制信息安全团队(www.eviloctal.com) 1、使用getinfo来获得更多的信息: {{{ #!python #! 
/usr/bin/env python # vi:ts=4:et # $Id: test_getinfo.py,v 1.18 2003/05/01 19:35:01 mfx Exp $ # Author BY MSN:[email protected] import time import pycurl ## Callback function invoked when progress information is updated #下面的函数用来显示下载的进度: def progress(download_t, download_d, upload_t, upload_d): print "Total to download %d bytes, have %d bytes so far" % \ (download_t, download_d) url = "http://www.sohu.com/index.html" print "Starting downloading", url f = open("body.html", "wb") #新建一个文件并返回文件描述字,f用来保存返回的网页内容 h = open("header.txt", "wb")#h用来保存返回的包头header信息 i = open("info.txt","wb") #i用来保存getinfo()函数取回的信息 c = pycurl.Curl() c.setopt(c.URL, url) #设置要访问的网址 c.setopt(c.WRITEDATA, f) #将返回的网页内容写入f文件描述字 c.setopt(c.NOPROGRESS, 0) c.setopt(c.PROGRESSFUNCTION, progress)#调用过程函数 c.setopt(c.FOLLOWLOCATION, 1) c.setopt(c.MAXREDIRS, 5) c.setopt(c.WRITEHEADER, h)#将返回的包头header内容写入h文件描述字 c.setopt(c.OPT_FILETIME, 1) c.perform() #执行上述访问网址的操作 print "HTTP-code:", c.getinfo(c.HTTP_CODE) #Outputs:200 buf=c.getinfo(c.HTTP_CODE) i.write("HTTP-code:"+str(buf)) #将输出写入到i文件描述字中 print "Total-time:", c.getinfo(c.TOTAL_TIME) #下载总时间:0.795 buf=c.getinfo(c.TOTAL_TIME) i.write('\r\n') i.write("Total-time:"+str(buf)) print "Download speed: %.2f bytes/second" % c.getinfo(c.SPEED_DOWNLOAD) #下载速度:261032.00 bytes/second print "Document size: %d bytes" % c.getinfo(c.SIZE_DOWNLOAD) #下载文档的大小:207521 bytes print "Effective URL:", c.getinfo(c.EFFECTIVE_URL) #有效网址:http://www.sohu.com/index.html print "Content-type:", c.getinfo(c.CONTENT_TYPE) #text/html print "Namelookup-time:", c.getinfo(c.NAMELOOKUP_TIME) #DNS解析速度:0.065 print "Redirect-time:", c.getinfo(c.REDIRECT_TIME) #0.0 print "Redirect-count:", c.getinfo(c.REDIRECT_COUNT) #0 epoch = c.getinfo(c.INFO_FILETIME) print "Filetime: %d (%s)" % (epoch, time.ctime(epoch)) #文件下载时间:1172361818 (Sun Feb 25 08:03:38 2007) print "Header is in file 'header.txt', body is in file 'body.html'" c.close() f.close() h.close() }}} 2、简单用法: {{{ #!python #!c:\python25\python # vi:ts=4:et 
# $Id: test_cb.py,v 1.14 2003/04/21 18:46:10 mfx Exp $ # Author BY MSN:[email protected] import sys import pycurl ## Callback function invoked when body data is ready def body(buf): # Print body data to stdout sys.stdout.write(buf) #将buf的内容输出到标准输出 ## Callback function invoked when header data is ready def header(buf): # Print header data to stderr sys.stdout.write(buf) c = pycurl.Curl() c.setopt(pycurl.URL, 'http://www.sohu.com/') #设置要访问的网址 c.setopt(pycurl.WRITEFUNCTION, body) #调用body()函数来输出返回的信息 c.setopt(pycurl.HEADERFUNCTION, header)#调用header()函数来输出返回的信息 c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) c.perform() #执行上述访问网址的操作 c.close() }}} |
=== 相关文献 === * PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx * python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337 |
含有章节索引的 *PUG 文章通用模板 ::-- ["zuroc"] [DateTime(2008-01-23T08:30:45Z)] TableOfContents
1. ZSPY
用python抓取网络
代码见 http://zspy.googlecode.com
写作中....
张沈鹏 [email protected] http://zsp.javaeye.com/
2008-1-23 16:42
1.1. 第一天PycURL
Pycurl http://pycurl.sourceforge.net/
外部 libcurl 的接口,C 写的,比 urllib 快,功能强。支持对循环重定向(rewrite)陷阱设置安全深度限制。适合用来做网络爬虫、抓网页。
从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.
参考文献1中的测试代码如下:
1 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
2 import StringIO
3
4 html = StringIO.StringIO()
5
6 import pycurl
7 c = pycurl.Curl()
8
9 c.setopt(pycurl.URL, 'http://www.baidu.com')
10
11 #写的回调
12 c.setopt(pycurl.WRITEFUNCTION, html.write)
13
14 c.setopt(pycurl.FOLLOWLOCATION, 1)
15
16 #最大重定向次数,可以预防重定向陷阱
17 c.setopt(pycurl.MAXREDIRS, 5)
18
19 #访问,阻塞到访问结束
20 c.perform()
21
22 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
24
25 #输出百度首页的html
26 #print html.getvalue()
然后看看多线程的例子
import os, sys from cStringIO import StringIO import pycurl urls = ( "http://curl.haxx.se", "http://www.python.org", "http://pycurl.sourceforge.net", "http://pycurl.sourceforge.net/tests/403_FORBIDDEN", # that actually exists ;-) "http://pycurl.sourceforge.net/tests/404_NOT_FOUND", ) # Read list of URIs from file specified on commandline try: urls = open(sys.argv[1], "rb").readlines() except IndexError: # No file was specified pass # init m = pycurl.CurlMulti() m.handles = [] for url in urls: c = pycurl.Curl() # save info in standard Python attributes c.url = url.rstrip() c.body = StringIO() c.http_code = -1 m.handles.append(c) # pycurl API calls c.setopt(c.URL, c.url) c.setopt(c.WRITEFUNCTION, c.body.write) m.add_handle(c) # get data num_handles = len(m.handles) while num_handles: while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break # currently no more I/O is pending, could do something in the meantime # (display a progress bar, etc.) m.select(1.0) # close handles for c in m.handles: # save info in standard Python attributes c.http_code = c.getinfo(c.HTTP_CODE) # pycurl API calls m.remove_handle(c) c.close() m.close() # print result for c in m.handles: data = c.body.getvalue() if 0: print "**********", c.url, "**********" print data else: print "%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data))
1.1.1. 相关文献
PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337