## NOTE(review): This MoinMoin wiki page embeds a Python 2 script (a Tianya
## forum article downloader posted to the CPyUG list, Feb 2010). The listing
## below has been damaged by a text-extraction pass: original line breaks are
## collapsed, the HTML markup inside the fp.write("""...""") template strings
## was stripped (the bodies are now empty), several re.compile(...) patterns
## are truncated fragments, and at least one statement (the one that builds
## `results`, referenced in the page-writing loop) was lost entirely. The
## code is therefore NOT runnable as-is and is preserved byte-for-byte here;
## review findings are appended as wiki comments at the bottom of the page.
##language:zh #pragma section-numbers off ##含有章节索引导航的 ZPyUG 文章通用模板 <> ## 默许导航,请保留 <> = 天涯文章下载器 = {{{ Wenwei Cai sender-time Sent at 20:37 (GMT+08:00). Current time there: 9:41 PM. ✆ reply-to python-cn@googlegroups.com to python-cn@googlegroups.com cc fengwei yin date Thu, Feb 4, 2010 at 20:37 subject [CPyUG] 天涯文章下载器 }}} 我和朋友正在学习python。在这过程中间做了一个天涯文章的下载器。现在能够下载鬼话版还有贴图版的楼主文章。主要可以节省些泡天涯论坛的时间。 目前还比较原始,大家要是发现问题或者有些改进,请给我们发过来。 ##startInc == 代码 == {{{ #!python #!/usr/bin/env python # -*- coding: utf_8 -*- # # Copyright @ 2010 Stanley Cai # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. __authors__ = ["Stanley Cai", "Fengwei Yin", "Zhifeng Wang"] __emails__ = ["stanley.w.cai@gmail.com"] import re import sys import urllib2 import urllib from urllib2 import Request, urlopen, URLError, HTTPError import os g_img_count = 0 g_img_dwld_failed = 0 def write_xhtml_head(fp): fp.write(""" """) def write_xhtml_style(fp): fp.write(""" """) def write_xhtml_body_head(fp): fp.write("""
""") def dwld_img(img_url, img_dir): global g_img_count global g_img_dwld_failed if g_img_dwld_failed > 0: return '' try: webimg_file = urllib2.urlopen(img_url) except HTTPError,e: g_img_dwld_failed = 1 return '' except URLError,e: g_img_dwld_failed = 1 return '' else: img_name1 = get_name(img_url) img_name2 = img_dir + '/' + img_name1 + str(g_img_count) + '.jpg' g_img_count += 1 local_file = open(img_name2, 'wb') local_file.write(webimg_file.read()) webimg_file.close() local_file.close() return img_name2 def process_img(post, img_dir): local_post = post reg = re.compile(r'http://[a-zA-Z0-9_/.]*.jpg', re.M | re.I | re.S) jpg_url = reg.findall(post) for old_img in jpg_url: new_img = dwld_img(old_img, img_dir) if len(new_img) > 0: print 'Image download OK. Refine htm file' local_post = local_post.replace(old_img, new_img) else: return post return local_post def get_name(url): name = url.split('/')[-1] name = name.split('.')[0] return name def main(url): global g_img_dwld_failed global g_img_dir g_img_dwld_failed = 0 g_img_dir = url data = urllib2.urlopen(url).read() RCharset = re.compile(r']*>', re.M) mo = RCharset.search(data) if mo: charset = mo.groups()[0] if charset != "utf-8": data = data.decode(charset, "ignore").encode("utf-8") RTitle = re.compile(r'.*([^<]+)', re.M | re.I) mo = RTitle.search(data) if not mo: print "Unsupported format" title = mo.groups()[0].decode("utf-8") print title writer = "" RWriter = re.compile(r"var chrAuthorName = [\'|\"][^<]*[\'|\"];", re.M | re.I) mo = RWriter.search(data) if mo: if mo.group().find('\'') < 0: writer = mo.group().split('\"')[1] else: writer = mo.group().split('\'')[1] else: print "No writer found" print writer g_img_dir = get_name(g_img_dir) os.mkdir(get_name(g_img_dir)) fp = open(title + ".htm", 'w') write_xhtml_head(fp) write_xhtml_style(fp) write_xhtml_body_head(fp) count = 0 while 1: RContent = re.compile(r'作者:]+>([^<]+)[^<]*?提交日期:([^<]+)<.*?]*>]*>楼主.*?
(.+?)]+>([^<]+)[^<]*?提交日期:([^<]+)<.*?
(.+?)]+>([^<]+)[^<]*?回复日期:([^<]+)<.*?(.+?)page #%d
\n' % count) for author, date, post in results: if writer == "": writer = author print author.decode("utf-8") if writer == author: fp.write('
日期: %s
\n' % date) wr_data = process_img(post, g_img_dir) fp.write(wr_data) count += 1 RNextPage = re.compile(r']*?href=([^>]+)><[^>]+>下一页', re.M | re.I | re.S) mo = RNextPage.search(data) if not mo: RNextPage = re.compile(r']*?href="([^>]+)">下一页', re.M | re.I | re.S) mo = RNextPage.search(data) if mo: print mo.groups() data = urllib2.urlopen(mo.groups()[0]).read() RCharset = re.compile(r']*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M) mo = RCharset.search(data) if mo: charset = mo.groups()[0] if charset != "utf-8": data = data.decode(charset, "ignore").encode("utf-8") else: # for ghost board only RNextPage = re.compile(r']*?href="javascript:([^"]+)" ><[^>]*>下一页<', re.M | re.I | re.S) mo = RNextPage.search(data) if mo: pages = mo.groups()[0] m = re.search(r"'(\d+)'", pages) if m: page = int(m.groups()[0]) print "page", page s = mo.start() print "start", s pd = {} reg = re.compile(r'') for mo in reg.findall(data[:s]): pd[mo[0]] = mo[1] pd['pID'] = str(page) params = urllib.urlencode(pd) f = urllib2.urlopen(url, params) data = f.read() RCharset = re.compile(r']*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M) mo = RCharset.search(data) if mo: charset = mo.groups()[0] if charset != "utf-8": data = data.decode(charset, "ignore").encode("utf-8") else: print "no support" break fp.write(""" """) fp.close() if __name__=="__main__": map(main, sys.argv[1:]) }}} ##endInc ---- '''反馈''' 创建 by -- ZoomQuiet [<>]
## NOTE(review): Findings on the embedded script, left unmodified above
## because the extraction damage makes any in-place rewrite unverifiable:
##  - Python 2 only: print statements, urllib2/urllib, and the
##    "except HTTPError,e" comma syntax; a port would target
##    urllib.request/urllib.error and parenthesized "except ... as e".
##  - "from urllib2 import Request, urlopen, ..." imports Request and urlopen,
##    but the code only ever calls urllib2.urlopen(...), so those two imported
##    names are unused.
##  - dwld_img() flips the module global g_img_dwld_failed to 1 on the first
##    HTTPError/URLError and then short-circuits every later download; it is
##    only reset at the top of main(), i.e. once per article URL.
##  - In process_img(), the pattern r'http://[a-zA-Z0-9_/.]*.jpg' leaves the
##    dot before "jpg" unescaped, so it matches any character there
##    (e.g. "...Xjpg"); r'\.jpg' appears to be the intent. TODO: confirm
##    against real Tianya image URLs before changing.
##  - main() does g_img_dir = get_name(g_img_dir) and then
##    os.mkdir(get_name(g_img_dir)): get_name() is applied twice. The second
##    application is redundant (get_name strips everything after the first
##    '.', so reapplying it is a no-op) but obscures intent.
##  - main() prints "Unsupported format" when the title regex fails but then
##    still dereferences mo.groups(), which raises AttributeError on None;
##    an early return was presumably intended -- TODO confirm.
##  - The output file handle fp is closed manually with no try/finally, so it
##    leaks if any write or download raises mid-run.
##  - `results` is iterated in the page loop but its defining statement is
##    absent from this listing -- lost in extraction, along with the HTML
##    bodies of the write_xhtml_* template strings and parts of the
##    RCharset/RTitle/RContent patterns. TODO: recover the original listing
##    from the wiki page history before attempting any functional change.