1. 下载南方周末最新网页
{{{kergee!z <kergee@gmail.com> hide details 9:22 am (15 minutes ago)
reply-to python-chinese@lists.python.cn to python-chinese@lists.python.cn date Jul 6, 2007 9:22 AM subject Re: [python-chinese] 用Python处理下载新浪下载下来的电子书
}}}
下载南方周末最新网页并保存成一个txt文件
每周四傍晚，南方周末会把文章发布在网站上。为了方便拷到手机上看，写了一个小脚本：它会自动下载最新一期的南方周末，并保存为文本文件。
Toggle line numbers
# -*- coding: utf-8 -*-
# Download the latest issue of Southern Weekly from nanfangdaily.com.cn/zm/
# and append the text of every article to a single .txt file.
#
# NOTE: this is a Python 2 script; htmllib, formatter and urllib.urlopen
# were removed or relocated in Python 3.
import htmllib
import formatter
import urllib
import re

website = 'http://www.nanfangdaily.com.cn/zm/'

# Matches any HTML tag. Compiled once here instead of once per article.
TAG_RE = re.compile(r'</?[a-z][a-z0-9]*[^<>]*>', re.IGNORECASE)


def fetch(url):
    """Return the raw body of *url*, always closing the connection."""
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()


def latest_issue(index_html):
    """Return the issue directory named by the index page.

    The lower-cased page is searched for ``url=``; the text between that
    marker and the next ``/`` is returned (presumably the issue date taken
    from a redirect -- verify against the live page).

    Raises ValueError when the marker or the terminating slash is missing,
    instead of silently slicing out a garbage value.
    """
    page = index_html.lower()
    start = page.find('url=')
    if start < 0:
        raise ValueError("no 'url=' marker found on the index page")
    start += len('url=')
    end = page.find('/', start)
    if end < 0:
        raise ValueError("no '/' found after the 'url=' marker")
    return page[start:end]


def article_links(issue_html):
    """Return the distinct anchors ending in 'asp', in page order."""
    parser = htmllib.HTMLParser(formatter.NullFormatter())
    parser.feed(issue_html)
    parser.close()
    seen = set()
    links = []
    for link in parser.anchorlist:
        if link.endswith('asp') and link not in seen:
            seen.add(link)
            links.append(link)
    return links


def _span_after(html, marker, closer, start=0):
    """Return (text, next_start) for the text that follows *marker*.

    The text runs from just after the first '>' following *marker* up to
    the next occurrence of *closer*. When any of the three tokens is
    missing, ('', start) is returned so the caller writes nothing rather
    than an arbitrary slice of the page.
    """
    at = html.find(marker, start)
    if at < 0:
        return '', start
    opened = html.find('>', at + len(marker))
    if opened < 0:
        return '', start
    closed = html.find(closer, opened + 1)
    if closed < 0:
        return '', start
    return html[opened + 1:closed], closed + 1


def extract_article(html):
    """Return (title, body_markup) cut out of one article page.

    The title is the text following the '#ff0000' marker; the body is the
    markup between the 'content01' marker and the next '</font'. The body
    is searched for only after the end of the title.
    """
    title, resume = _span_after(html, '#ff0000', '<')
    body, _ = _span_after(html, 'content01', '</font', resume)
    return title, body


def main():
    """Fetch the newest issue and append all of its articles to a file."""
    date = latest_issue(fetch(website))
    issue_url = website + date
    urls = article_links(fetch(issue_url))
    total = len(urls)

    # The file name is GB18030-encoded bytes, as in the original script.
    # Append mode is kept, so re-running adds to an existing file.
    doc = open(u'南方周末'.encode('gb18030') + date + '.txt', 'a')
    try:
        for index, url in enumerate(urls):
            # The first character of each anchor is dropped before joining
            # (presumably a leading '.' of a relative link -- verify).
            title, body = extract_article(fetch(issue_url + url[1:]))
            doc.write(title)
            # Every tag becomes a newline, then a separator ends the article.
            doc.write(TAG_RE.sub('\n', body) + '\n------------\n')
            print('%d --> %d' % (index + 1, total))
    finally:
        # Close the output file even when a download fails midway.
        doc.close()
    print(u'下载结束')


if __name__ == '__main__':
    main()
2. 反馈
PageComment2 ::-- ZoomQuiet [DateTime(2007-07-06T02:15:14Z)]