用Python处理下载新浪下载下来的电子书 ::-- ZoomQuiet [2007-06-01 02:45:05]
Contents
1. Ben Luo.处理下载的电子书
{{{Ben Luo <[email protected]> hide details 9:57 am (16 minutes ago)
reply-to [email protected] to [email protected] date Jun 1, 2007 9:57 AM subject [python-chinese] 用Python处理下载新浪下载下来的电子书
}}}用Firefox的Download All 插件从新浪读书频道下载了文章,想在Treo650里看纯文本,于是写了以下两个小程序:html2txt.py 把下载的HTML转换成纯文本,pickupcontent.py 再从文本里截取正文部分。算是Unix思想的一种体现吧:小工具只做一件事情。大家也把自己电脑里的小宝贝拿出来分享啊,或者在网站上多开一个Wiki页面?
1.1. html2txt.py
1 #####################
2 #html2txt.py
3 #####################
4
5 from formatter import AbstractFormatter, NullWriter
6 from htmllib import HTMLParser
7
8 def _(str, in_encoder="gbk", out_encoder="utf8"):
9 return unicode(str, in_encoder).encode(out_encoder)
10
11
class myWriter(NullWriter):
    """Formatter writer that collects flowing text instead of emitting it.

    Only ``send_flowing_data`` is overridden; every other writer event keeps
    the do-nothing behaviour inherited from ``NullWriter``.
    """

    def __init__(self):
        NullWriter.__init__(self)
        # Chunks of flowing text, in the order the formatter delivered them.
        self._bodyText = []

    def send_flowing_data(self, str):
        """Remember one chunk of flowing text handed over by the formatter."""
        self._bodyText += [str]

    def _get_bodyText(self):
        """Return every collected chunk, joined with newlines."""
        separator = '\n'
        return separator.join(self._bodyText)

    # Read-only view of the collected text.
    bodyText = property(fget=_get_bodyText, doc='plain text from body')
24
25 class myHTMLParser(HTMLParser):
26 def do_meta(self, attrs):
27 self.metas = attrs
28
def convertFile(filename):
    """Convert one HTML file to plain text.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A ``(title, body)`` tuple.  ``title`` is the document title passed
        through ``_()`` (GBK to UTF-8 with its defaults), or an empty string
        when the document has no <title>.  ``body`` is the flowing text
        collected by ``myWriter``, joined with newlines and left in the
        file's original encoding.
    """
    mywriter = myWriter()
    absformatter = AbstractFormatter(mywriter)
    parser = myHTMLParser(absformatter)

    # Close the file explicitly instead of leaving the handle to the garbage
    # collector.  try/finally is used so the code also runs on Python 2.5.
    source = open(filename)
    try:
        parser.feed(source.read())
    finally:
        source.close()
    # Force the parser to process whatever it is still buffering (for
    # example a tag that was cut off at the end of the file).
    parser.close()

    # HTMLParser leaves ``title`` as None when no <title> element was seen,
    # and _() cannot decode None, so fall back to an empty title.
    title = parser.title
    if title is None:
        title = ""
    return (_(title), mywriter.bodyText)
35
36 import os
37 import os.path
38
# Directory the .txt files are written to (created on demand).
OUTPUTDIR = "./txt"
# Directory that is scanned for .htm/.html files.
INPUTDIR = "."
if __name__ == "__main__":
    import sys

    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)

    for name in os.listdir(INPUTDIR):
        if not name.endswith(('.htm', '.html')):
            continue
        # Progress output is written without a newline so that "Done!" ends
        # up on the same line; the message text is kept as it always was.
        sys.stdout.write("Coverting %s " % name)
        sys.stdout.flush()

        # os.listdir() returns bare names, so the input path has to be
        # joined with INPUTDIR; otherwise only INPUTDIR == "." would work.
        title, text = convertFile(os.path.join(INPUTDIR, name))

        outfilename = os.path.splitext(name)[0] + '.txt'
        outfullname = os.path.join(OUTPUTDIR, outfilename)
        # Close the output explicitly so the text is flushed to disk before
        # the next file is processed.
        out = open(outfullname, "wt")
        try:
            out.write(text)
        finally:
            out.close()
        sys.stdout.write("Done!\n")
1.2. pickupcontent.py
1 ################################
2 #pickupcontent.py
3 ################################
4
5 # -*- coding: utf-8 -*-
6
7 import sys
8 import glob
9 import os
10 import re
11
# Line patterns that delimit the article body.  They are spelled with \u
# escapes so they do not depend on the source-file encoding, and encoded to
# GB2312 bytes so they can be matched against the raw lines of the input
# files (which are assumed to be GB2312/GBK text -- confirm for other input).
startstr = u"^\u516b\u5341".encode("gb2312")  # article title (U+516B U+5341, "eighty")
endstr = u"^\\[\u8fd4\u56de".encode("gb2312")  # "[" + U+8FD4 U+56DE, the "[back" link
tmp_start = re.compile(startstr)
tmp_end = re.compile(endstr)


def find_content_span(lines, start_re=tmp_start, end_re=tmp_end):
    """Locate the article body inside ``lines``.

    The scan stops at the first line matching ``end_re``; the body starts at
    the last line matching ``start_re`` seen before that point.

    Args:
        lines: Sequence of raw (byte string) lines.
        start_re: Compiled pattern marking the first line of the body.
        end_re: Compiled pattern marking the line just after the body.

    Returns:
        ``(start, end)`` such that ``lines[start:end]`` is the body, or None
        when no end marker exists or no start marker precedes it.
    """
    start = None
    for index, line in enumerate(lines):
        if start_re.match(line):
            start = index
        if end_re.match(line):
            if start is None:
                return None
            return start, index
    return None


def main():
    """Cut every file named on the command line down to its article body.

    Command-line arguments are expanded with glob (arguments that match
    nothing are dropped).  Each file is rewritten in place; a file whose
    markers cannot be found is reported on stderr and left untouched.
    """
    sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)]
    for infile in sys.argv[1:]:
        # Binary mode: the patterns are bytes and the content is copied
        # through unchanged, so no decoding is needed.
        f = open(infile, 'rb')
        try:
            lines = f.readlines()
        finally:
            f.close()

        # The span is computed per file; reusing indices from a previous
        # file would cut the wrong lines and destroy the content.
        span = find_content_span(lines)
        if span is None:
            sys.stderr.write(
                "%s: start/end markers not found, file left unchanged\n"
                % infile)
            continue
        kstart, kend = span

        # Write next to the original so the final rename never crosses a
        # filesystem boundary after the original has been removed.
        tmpname = infile + '.tmp'
        tmp = open(tmpname, 'wb')
        try:
            tmp.writelines(lines[kstart:kend])
        finally:
            tmp.close()
        # os.rename() does not overwrite on Windows, hence the remove first.
        os.remove(infile)
        os.rename(tmpname, infile)


if __name__ == "__main__":
    main()