##language:zh
#pragma section-numbers on

<<TableOfContents>>
## 默认导航,请保留
<<Include(CPUGnav)>>


= 下载南方周末最新网页 =
{{{kergee!z <kergee@gmail.com> 		 hide details	 9:22 am (15 minutes ago) 
	reply-to		python-chinese@lists.python.cn	 
	to		python-chinese@lists.python.cn	 
	date		Jul 6, 2007 9:22 AM	 
	subject		Re: [python-chinese] 用Python处理下载新浪下载下来的电子书
}}}
 * '''下载南方周末最新网页并保存成一个txt文件'''
每周四傍晚南方周末会把文章发布在网站上,为了方便拷到手机上看,写了一个小脚本:会自动下载最新一期的南方周末并存为文本文件
{{{#!python
# down html from zm and save html to txt
#  -*- coding:utf-8 -*-
import htmllib, formatter, urllib, re

website = 'http://www.nanfangdaily.com.cn/zm/'
f = urllib.urlopen(website)
html = f.read ().lower()
i = html.find('url=')
j = html.find('/',i+4)
date = html[i+4:j]
website += date

f = urllib.urlopen(website)
p = htmllib.HTMLParser(formatter.NullFormatter())
p.feed(f.read())
p.close()
seen = set()
for url in p.anchorlist:
 if url[-3::] == 'asp':
  if url in seen: continue
  seen.add(url)
  
urls=list(seen)
k=len(urls)
doc=open(u'南方周末'.encode('gb18030')+date+'.txt','a')
for l, url in enumerate(urls):
 f = urllib.urlopen(website+url[1:])
 html = f.read()
 i = html.find('#ff0000')
 i = html.find('>',i+7)
 j = html.find('<',i+1)
 doc.write(html[i+1:j])
 i = html.find('content01',j+1)
 i = html.find('>',i+9)
 j = html.find('</font',i+1)
 content = html[i+1:j]
 reobj = re.compile(r'</?[a-z][a-z0-9]*[^<>]*>',re.IGNORECASE)
 doc.write(reobj.sub('\n',content)+'\n------------\n')
 print l+1,'-->',k 
doc.close()
print u'下载结束'
}}}