##language:zh #pragma section-numbers on ''' 用Python处理下载新浪下载下来的电子书 ''' ::-- ZoomQuiet [<>] <> ## 默许导航,请保留 <> = Ben Luo.处理下载的电子书 = {{{Ben Luo hide details 9:57 am (16 minutes ago) reply-to python-chinese@lists.python.cn to python-chinese@lists.python.cn date Jun 1, 2007 9:57 AM subject [python-chinese] 用Python处理下载新浪下载下来的电子书 }}}用Firefox的Download All 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? == html2txt.py == {{{#!python ##################### #html2txt.py ##################### from formatter import AbstractFormatter, NullWriter from htmllib import HTMLParser def _(str, in_encoder="gbk", out_encoder="utf8"): return unicode(str, in_encoder).encode(out_encoder) class myWriter(NullWriter): def __init__(self): NullWriter.__init__(self) self._bodyText = [] def send_flowing_data(self, str): self._bodyText.append(str) def _get_bodyText(self): return '\n'.join(self._bodyText) bodyText = property(_get_bodyText, None, None, 'plain text from body') class myHTMLParser(HTMLParser): def do_meta(self, attrs): self.metas = attrs def convertFile(filename): mywriter = myWriter() absformatter = AbstractFormatter(mywriter) parser = myHTMLParser(absformatter) parser.feed(open(filename).read()) return ( _(parser.title), parser.formatter.writer.bodyText ) import os import os.path OUTPUTDIR = "./txt" INPUTDIR = "." if __name__ == "__main__": if not os.path.exists(OUTPUTDIR): os.mkdir(OUTPUTDIR) for file in os.listdir(INPUTDIR): if file[-4:] == '.htm' or file[-5:] == '.html': print "Coverting", file, outfilename = os.path.splitext(file)[0] a, text = convertFile(file) outfilename = outfilename + '.txt' outfullname = os.path.join(OUTPUTDIR, outfilename) open(outfullname, "wt").write(text) print "Done!" }}} == pickupcontent.py == {{{#!python ################################ #pickupcontent.py ################################ # -*- coding: utf-8 -*- import sys import glob import os import re sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)] startstr = u"^八十".encode("gb2312") # article title endstr = u"^\[返回".encode("gb2312") # tmp_start = re.compile(startstr) tmp_end = re.compile(endstr) for infile in sys.argv[1:]: # print infile f = open(infile,'r') #print f lines = f.readlines() fout = '' for index, line in enumerate(lines): if tmp_start.match(line): kstart = index if tmp_end.match(line): kend = index break f.close() fout = fout.join(lines[kstart:kend]) tmp = open('tmp','w') tmp.write(fout) tmp.close() os.remove(infile) os.rename('tmp',infile) }}} == 反馈 ==