##language:zh #pragma section-numbers on <<TableOfContents>> ## 默许导航,请保留 <<Include(CPUGnav)>> = 下载南方周末最新网页 = {{{kergee!z <kergee@gmail.com> hide details 9:22 am (15 minutes ago) reply-to python-chinese@lists.python.cn to python-chinese@lists.python.cn date Jul 6, 2007 9:22 AM subject Re: [python-chinese] 用Python处理下载新浪下载下来的电子书 }}} * '''下载南方周末最新网页并保存成一个txt文件''' 每周四傍晚南方周末会把文章发布在网站上,为了方便拷到手机上看,写了一个小脚本:会自动下载最新一期的南方周末并存为文本文件 {{{#!python # down html from zm and save html to txt # -*- coding:utf-8 -*- import htmllib, formatter, urllib, re website = 'http://www.nanfangdaily.com.cn/zm/' f = urllib.urlopen(website) html = f.read ().lower() i = html.find('url=') j = html.find('/',i+4) date = html[i+4:j] website += date f = urllib.urlopen(website) p = htmllib.HTMLParser(formatter.NullFormatter()) p.feed(f.read()) p.close() seen = set() for url in p.anchorlist: if url[-3::] == 'asp': if url in seen: continue seen.add(url) urls=list(seen) k=len(urls) doc=open(u'南方周末'.encode('gb18030')+date+'.txt','a') for l, url in enumerate(urls): f = urllib.urlopen(website+url[1:]) html = f.read() i = html.find('#ff0000') i = html.find('>',i+7) j = html.find('<',i+1) doc.write(html[i+1:j]) i = html.find('content01',j+1) i = html.find('>',i+9) j = html.find('</font',i+1) content = html[i+1:j] reobj = re.compile(r'</?[a-z][a-z0-9]*[^<>]*>',re.IGNORECASE) doc.write(reobj.sub('\n',content)+'\n------------\n') print l+1,'-->',k doc.close() print u'下载结束' }}}