天涯文章下载器
Wenwei Cai <stanley.w.cai@gmail.com> sender-time Sent at 20:37 (GMT+08:00). Current time there: 9:41 PM. ✆ reply-to python-cn@googlegroups.com to python-cn@googlegroups.com cc fengwei yin <nh26223@gmail.com> date Thu, Feb 4, 2010 at 20:37 subject [CPyUG] 天涯文章下载器
我和朋友正在学习python。在这过程中间做了一个天涯文章的下载器。现在能够下载鬼话版还有贴图版的楼主文章。主要可以节省些泡天涯论坛的时间。
目前还比较原始,大家要是发现问题或者有些改进,请给我们发过来。
代码
Toggle line numbers
1 #!/usr/bin/env python
2 # -*- coding: utf_8 -*-
3 #
4 # Copyright @ 2010 Stanley Cai
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15
16 __authors__ = ["Stanley Cai", "Fengwei Yin", "Zhifeng Wang"]
17 __emails__ = ["stanley.w.cai@gmail.com"]
18
19 import re
20 import sys
21 import urllib2
22 import urllib
23 from urllib2 import Request, urlopen, URLError, HTTPError
24 import os
25
# Module-wide download state shared by dwld_img()/process_img()/main().
g_img_count = 0  # sequence number appended to each saved image file name
g_img_dwld_failed = 0  # set to 1 after the first failed download; disables all later fetches
28
def write_xhtml_head(fp):
    """Write the opening DOCTYPE / <html> / <head> boilerplate to *fp*.

    fp: a writable file-like object.
    """
    # Fix: "XHTML 4.0 Transitional" is not a real public identifier; the
    # correct one is "XHTML 1.0 Transitional".
    fp.write("""
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html>
<head>
""")
35
def write_xhtml_style(fp):
    """Emit the inline <STYLE> sheet plus the UTF-8 content-type meta tag."""
    # The CSS mirrors Tianya's own page styling so the saved thread looks
    # familiar; emitted verbatim as a single write.
    style_block = """
<STYLE>
body{font-size:14px;LINE-HEIGHT:26px;MARGIN:0px;background-color: #E7F4FE;}
TABLE{font-size:14px;LINE-HEIGHT:26px;}
.ART{font-size:15px;LINE-HEIGHT:27px;}
.B1{MARGIN:0px;PADDING:0px;LINE-HEIGHT:10px;}
.B2 TABLE {MARGIN:0px;PADDING:0PX;LINE-HEIGHT:16px;text-align: center;}
.Header{BACKGROUND:#511F90;Color:#FFF;PADDING:5px;Line-HEIGHT:21px;PADDING-LEFT:15px;}
.Header A{text-decoration: None;color:#FFF;}
.Header A:hover{text-decoration: None;color:#f00;}
.FL{float: right;Line-HEIGHT:19px;padding-bottom: 3px;}
.AD{padding-top: 0px;padding-bottom: 10px;margin-top: 0px;}
.mr{font-size:14px;color:#511F90;}
</STYLE>
<meta HTTP-EQUIV="Content-Type" content="text/html; charset=utf-8">
"""
    fp.write(style_block)
53
def write_xhtml_body_head(fp):
    """Open the <body> and the outer layout table of the generated page.

    fp: a writable file-like object.
    """
    # Fix: the inner table tag had a stray comma ("WIDTH=90%, ALIGN=CENTER"),
    # which is malformed HTML.  The markup is otherwise kept as-is (the inner
    # table is intentionally left open; post content is appended after it).
    fp.write("""
<body>
<TABLE WIDTH=950 height=90 align='CENTER' cellpadding='3' cellspacing='0' >

<TR><TD>
<TABLE WIDTH=90% ALIGN=CENTER>
</TD>
""")
63
def dwld_img(img_url, img_dir):
    """Download one image from *img_url* into *img_dir*.

    Returns the local file path on success, or '' on failure.  After the
    first failure the module-global g_img_dwld_failed flag stays set and
    every later call bails out immediately, so the page keeps its remote
    image URLs instead of half-localized ones.
    """
    global g_img_count
    global g_img_dwld_failed

    # One failed download disables image fetching for the whole run.
    if g_img_dwld_failed > 0:
        return ''

    try:
        webimg_file = urllib2.urlopen(img_url)
    # Fix: "except E, e" is Python-2-only syntax; neither branch used the
    # bound exception, so both collapse into one tuple clause (HTTPError is
    # a URLError subclass but is kept explicit for clarity).
    except (HTTPError, URLError):
        g_img_dwld_failed = 1
        return ''

    try:
        img_name = get_name(img_url)
        # NOTE(review): every image is saved with a .jpg suffix; acceptable
        # because process_img() only collects *.jpg URLs.
        local_path = img_dir + '/' + img_name + str(g_img_count) + '.jpg'
        g_img_count += 1
        local_file = open(local_path, 'wb')
        try:
            local_file.write(webimg_file.read())
        finally:
            # Fix: the original leaked both handles if the write raised.
            local_file.close()
    finally:
        webimg_file.close()
    return local_path
88
def process_img(post, img_dir):
    """Replace remote .jpg URLs in *post* with locally downloaded copies.

    Returns the rewritten post text, or the original *post* untouched if
    any image download fails (so no half-localized markup is emitted).
    """
    local_post = post
    # Fix: the trailing dot was unescaped (".jpg" matched e.g. "foojpg");
    # the '.' inside the character class is intentional for host/path parts.
    reg = re.compile(r'http://[a-zA-Z0-9_/.]*\.jpg', re.M | re.I | re.S)
    for old_img in reg.findall(post):
        new_img = dwld_img(old_img, img_dir)
        if len(new_img) > 0:
            print('Image download OK. Refine htm file')
            local_post = local_post.replace(old_img, new_img)
        else:
            # A single failure aborts localization for the whole post.
            return post
    return local_post
101
def get_name(url):
    """Return the last path component of *url*, stripped of its extension.

    E.g. 'http://h/a/b/pic.big.jpg' -> 'pic'.
    """
    basename = url.rpartition('/')[2]
    return basename.partition('.')[0]
106
def main(url):
    """Download the Tianya forum thread at *url* into <title>.htm.

    Keeps only the original poster's entries, localizes their .jpg images
    into a directory named after the thread URL, and follows "next page"
    links until pagination runs out.
    """
    global g_img_dwld_failed
    global g_img_dir
    g_img_dwld_failed = 0
    g_img_dir = url

    # Fetch the first page; transcode to UTF-8 if the page declares another
    # charset (presumably GBK on Tianya -- the regex just takes whatever the
    # meta tag says).
    data = urllib2.urlopen(url).read()
    RCharset = re.compile(r'<meta.*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
    mo = RCharset.search(data)
    if mo:
        charset = mo.groups()[0]
        if charset != "utf-8":
            data = data.decode(charset, "ignore").encode("utf-8")

    # Thread title becomes the output file name.
    # NOTE(review): when <TITLE> is missing this only prints a warning and
    # then still dereferences `mo`, which crashes with AttributeError.
    RTitle = re.compile(r'.*<TITLE>([^<]+)</TITLE>', re.M | re.I)
    mo = RTitle.search(data)
    if not mo:
        print "Unsupported format"
    title = mo.groups()[0].decode("utf-8")
    print title

    # Original poster's name, read from the inline JS assignment
    # `var chrAuthorName = '...';` (quote style varies between boards).
    writer = ""
    RWriter = re.compile(r"var chrAuthorName = [\'|\"][^<]*[\'|\"];", re.M | re.I)
    mo = RWriter.search(data)
    if mo:
        # Split on whichever quote character actually delimits the name.
        if mo.group().find('\'') < 0:
            writer = mo.group().split('\"')[1]
        else:
            writer = mo.group().split('\'')[1]
    else:
        print "No writer found"
    print writer

    # Image directory named after the last URL component.
    # NOTE(review): os.mkdir raises OSError if the directory already exists.
    g_img_dir = get_name(g_img_dir)
    os.mkdir(get_name(g_img_dir))

    fp = open(title + ".htm", 'w')
    write_xhtml_head(fp)
    write_xhtml_style(fp)
    write_xhtml_body_head(fp)

    count = 0  # page counter, only used for the "page #N" separators
    while 1:
        # Picture-board layout: the original poster's entries carry a nested
        # 楼主 ("OP") marker; capture (author, date, body) triples.
        RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?提交日期:([^<]+)<.*?<font[^>]*><font[^>]*>楼主</font></font>.*?</table>(.+?)<table', re.M | re.I | re.S)
        results = RContent.findall(data)

        if len(results) == 0:
            # Fallback for boards without the 楼主 marker: collect every
            # submission plus every reply, then filter by author below.
            RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?提交日期:([^<]+)<.*?</table>(.+?)<table', re.M | re.I | re.S)
            results = RContent.findall(data)

            RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?回复日期:([^<]+)<.*?</table>(.+?)<table', re.M | re.I | re.S)
            results += RContent.findall(data)

        fp.write('<BR><B>page #%d </B> <BR>\n' % count)
        for author, date, post in results:
            if writer == "":
                # No chrAuthorName found earlier: assume the first post on
                # the page belongs to the original poster.
                writer = author
                print author.decode("utf-8")
            if writer == author:
                fp.write('<BR><B>日期: %s </B> <BR>\n' % date)
                wr_data = process_img(post, g_img_dir)
                fp.write(wr_data)
        count += 1

        # Follow the "next page" (下一页) link; two markup variants exist.
        RNextPage = re.compile(r'<a[^>]*?href=([^>]+)><[^>]+>下一页</a>', re.M | re.I | re.S)
        mo = RNextPage.search(data)
        if not mo:
            RNextPage = re.compile(r'<a[^>]*?href="([^>]+)">下一页</a>', re.M | re.I | re.S)
            mo = RNextPage.search(data)

        if mo:
            print mo.groups()
            data = urllib2.urlopen(mo.groups()[0]).read()
            RCharset = re.compile(r'<meta[^>]*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
            mo = RCharset.search(data)
            if mo:
                charset = mo.groups()[0]
                if charset != "utf-8":
                    data = data.decode(charset, "ignore").encode("utf-8")
        else: # for ghost board only
            # The ghost-story board paginates via a javascript: link; re-POST
            # the page's hidden form fields with pID set to the target page.
            RNextPage = re.compile(r'<a[^>]*?href="javascript:([^"]+)" ><[^>]*>下一页<', re.M | re.I | re.S)
            mo = RNextPage.search(data)
            if mo:
                pages = mo.groups()[0]
                m = re.search(r"'(\d+)'", pages)
                if m:
                    page = int(m.groups()[0])
                    print "page", page
                    s = mo.start()
                    print "start", s
                    # Collect the hidden <input> fields that precede the link
                    # into the POST payload.
                    pd = {}
                    reg = re.compile(r'<input type="hidden" name="(\w+)" value="([^"]+)">')
                    for mo in reg.findall(data[:s]):
                        pd[mo[0]] = mo[1]
                    pd['pID'] = str(page)
                    params = urllib.urlencode(pd)
                    f = urllib2.urlopen(url, params)
                    data = f.read()
                    RCharset = re.compile(r'<meta[^>]*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
                    mo = RCharset.search(data)
                    if mo:
                        charset = mo.groups()[0]
                        if charset != "utf-8":
                            data = data.decode(charset, "ignore").encode("utf-8")
            else:
                # No pagination of either kind: last page reached.
                print "no support"
                break

    fp.write("""
</body>
</html>
""")
    fp.close()
220
if __name__ == "__main__":
    # Fix: map() was used purely for its side effects; under Python 3 the
    # lazy map would never call main() at all.  An explicit loop is correct
    # on both versions.  Each command-line argument is one thread URL.
    for thread_url in sys.argv[1:]:
        main(thread_url)
反馈
创建 by -- ZoomQuiet [2010-02-04 13:41:43]