天涯文章下载器
Wenwei Cai <stanley.w.cai@gmail.com> sender-time Sent at 20:37 (GMT+08:00). Current time there: 9:41 PM. ✆ reply-to python-cn@googlegroups.com to python-cn@googlegroups.com cc fengwei yin <nh26223@gmail.com> date Thu, Feb 4, 2010 at 20:37 subject [CPyUG] 天涯文章下载器
我和朋友正在学习python。在这过程中间做了一个天涯文章的下载器。现在能够下载鬼话版还有贴图版的楼主文章。主要可以节省些泡天涯论坛的时间。
目前还比较原始,大家要是发现问题或者有些改进,请给我们发过来。
代码
Toggle line numbers
1 #!/usr/bin/env python
2 # -*- coding: utf_8 -*-
3 #
4 # Copyright @ 2010 Stanley Cai
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15
16 __authors__ = ["Stanley Cai", "Fengwei Yin", "Zhifeng Wang"]
17 __emails__ = ["stanley.w.cai@gmail.com"]
18
19 import re
20 import sys
21 import urllib2
22 import urllib
23 from urllib2 import Request, urlopen, URLError, HTTPError
24 import os
25
# Module-wide download state shared by dwld_img()/process_img()/main().
g_img_count = 0  # sequence number appended to each saved image file name
g_img_dwld_failed = 0  # set to 1 after the first failed download; disables all later fetches
28
def write_xhtml_head(fp):
    """Write the opening DOCTYPE / <html> / <head> boilerplate to *fp*.

    fp: a writable file-like object.
    """
    # Fix: "XHTML 4.0 Transitional" is not a real public identifier; the
    # correct one is "XHTML 1.0 Transitional".
    fp.write("""
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html>
<head>
""")
35
def write_xhtml_style(fp):
    """Emit the inline <STYLE> sheet plus the UTF-8 content-type meta tag."""
    # The CSS mirrors Tianya's own page styling so the saved thread looks
    # familiar; emitted verbatim as a single write.
    style_block = """
<STYLE>
body{font-size:14px;LINE-HEIGHT:26px;MARGIN:0px;background-color: #E7F4FE;}
TABLE{font-size:14px;LINE-HEIGHT:26px;}
.ART{font-size:15px;LINE-HEIGHT:27px;}
.B1{MARGIN:0px;PADDING:0px;LINE-HEIGHT:10px;}
.B2 TABLE {MARGIN:0px;PADDING:0PX;LINE-HEIGHT:16px;text-align: center;}
.Header{BACKGROUND:#511F90;Color:#FFF;PADDING:5px;Line-HEIGHT:21px;PADDING-LEFT:15px;}
.Header A{text-decoration: None;color:#FFF;}
.Header A:hover{text-decoration: None;color:#f00;}
.FL{float: right;Line-HEIGHT:19px;padding-bottom: 3px;}
.AD{padding-top: 0px;padding-bottom: 10px;margin-top: 0px;}
.mr{font-size:14px;color:#511F90;}
</STYLE>
<meta HTTP-EQUIV="Content-Type" content="text/html; charset=utf-8">
"""
    fp.write(style_block)
53
def write_xhtml_body_head(fp):
    """Open the <body> and the outer layout table of the generated page.

    fp: a writable file-like object.
    """
    # Fix: the inner table tag had a stray comma ("WIDTH=90%, ALIGN=CENTER"),
    # which is malformed HTML.  The markup is otherwise kept as-is (the inner
    # table is intentionally left open; post content is appended after it).
    fp.write("""
<body>
<TABLE WIDTH=950 height=90 align='CENTER' cellpadding='3' cellspacing='0' >

<TR><TD>
<TABLE WIDTH=90% ALIGN=CENTER>
</TD>
""")
63
def dwld_img(img_url, img_dir):
    """Download one image from *img_url* into *img_dir*.

    Returns the local file path on success, or '' on failure.  After the
    first failure the module-global g_img_dwld_failed flag stays set and
    every later call bails out immediately, so the page keeps its remote
    image URLs instead of half-localized ones.
    """
    global g_img_count
    global g_img_dwld_failed

    # One failed download disables image fetching for the whole run.
    if g_img_dwld_failed > 0:
        return ''

    try:
        webimg_file = urllib2.urlopen(img_url)
    # Fix: "except E, e" is Python-2-only syntax; neither branch used the
    # bound exception, so both collapse into one tuple clause (HTTPError is
    # a URLError subclass but is kept explicit for clarity).
    except (HTTPError, URLError):
        g_img_dwld_failed = 1
        return ''

    try:
        img_name = get_name(img_url)
        # NOTE(review): every image is saved with a .jpg suffix; acceptable
        # because process_img() only collects *.jpg URLs.
        local_path = img_dir + '/' + img_name + str(g_img_count) + '.jpg'
        g_img_count += 1
        local_file = open(local_path, 'wb')
        try:
            local_file.write(webimg_file.read())
        finally:
            # Fix: the original leaked both handles if the write raised.
            local_file.close()
    finally:
        webimg_file.close()
    return local_path
88
def process_img(post, img_dir):
    """Replace remote .jpg URLs in *post* with locally downloaded copies.

    Returns the rewritten post text, or the original *post* untouched if
    any image download fails (so no half-localized markup is emitted).
    """
    local_post = post
    # Fix: the trailing dot was unescaped (".jpg" matched e.g. "foojpg");
    # the '.' inside the character class is intentional for host/path parts.
    reg = re.compile(r'http://[a-zA-Z0-9_/.]*\.jpg', re.M | re.I | re.S)
    for old_img in reg.findall(post):
        new_img = dwld_img(old_img, img_dir)
        if len(new_img) > 0:
            print('Image download OK. Refine htm file')
            local_post = local_post.replace(old_img, new_img)
        else:
            # A single failure aborts localization for the whole post.
            return post
    return local_post
101
def get_name(url):
    """Return the last path component of *url*, stripped of its extension.

    E.g. 'http://h/a/b/pic.big.jpg' -> 'pic'.
    """
    basename = url.rpartition('/')[2]
    return basename.partition('.')[0]
106
def main(url):
    """Download the Tianya forum thread at *url* into <title>.htm.

    Keeps only the original poster's entries, localizes their .jpg images
    into a directory named after the thread URL, and follows "next page"
    links until pagination runs out.
    """
    global g_img_dwld_failed
    global g_img_dir
    g_img_dwld_failed = 0
    g_img_dir = url

    # Fetch the first page; transcode to UTF-8 if the page declares another
    # charset (presumably GBK on Tianya -- the regex just takes whatever the
    # meta tag says).
    data = urllib2.urlopen(url).read()
    RCharset = re.compile(r'<meta.*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
    mo = RCharset.search(data)
    if mo:
        charset = mo.groups()[0]
        if charset != "utf-8":
            data = data.decode(charset, "ignore").encode("utf-8")

    # Thread title becomes the output file name.
    # NOTE(review): when <TITLE> is missing this only prints a warning and
    # then still dereferences `mo`, which crashes with AttributeError.
    RTitle = re.compile(r'.*<TITLE>([^<]+)</TITLE>', re.M | re.I)
    mo = RTitle.search(data)
    if not mo:
        print "Unsupported format"
    title = mo.groups()[0].decode("utf-8")
    print title

    # Original poster's name, read from the inline JS assignment
    # `var chrAuthorName = '...';` (quote style varies between boards).
    writer = ""
    RWriter = re.compile(r"var chrAuthorName = [\'|\"][^<]*[\'|\"];", re.M | re.I)
    mo = RWriter.search(data)
    if mo:
        # Split on whichever quote character actually delimits the name.
        if mo.group().find('\'') < 0:
            writer = mo.group().split('\"')[1]
        else:
            writer = mo.group().split('\'')[1]
    else:
        print "No writer found"
    print writer

    # Image directory named after the last URL component.
    # NOTE(review): os.mkdir raises OSError if the directory already exists.
    g_img_dir = get_name(g_img_dir)
    os.mkdir(get_name(g_img_dir))

    fp = open(title + ".htm", 'w')
    write_xhtml_head(fp)
    write_xhtml_style(fp)
    write_xhtml_body_head(fp)

    count = 0  # page counter, only used for the "page #N" separators
    while 1:
        # Picture-board layout: the original poster's entries carry a nested
        # 楼主 ("OP") marker; capture (author, date, body) triples.
        RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?提交日期:([^<]+)<.*?<font[^>]*><font[^>]*>楼主</font></font>.*?</table>(.+?)<table', re.M | re.I | re.S)
        results = RContent.findall(data)

        if len(results) == 0:
            # Fallback for boards without the 楼主 marker: collect every
            # submission plus every reply, then filter by author below.
            RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?提交日期:([^<]+)<.*?</table>(.+?)<table', re.M | re.I | re.S)
            results = RContent.findall(data)

            RContent = re.compile(r'作者:<a[^>]+>([^<]+)</a>[^<]*?回复日期:([^<]+)<.*?</table>(.+?)<table', re.M | re.I | re.S)
            results += RContent.findall(data)

        fp.write('<BR><B>page #%d </B> <BR>\n' % count)
        for author, date, post in results:
            if writer == "":
                # No chrAuthorName found earlier: assume the first post on
                # the page belongs to the original poster.
                writer = author
                print author.decode("utf-8")
            if writer == author:
                fp.write('<BR><B>日期: %s </B> <BR>\n' % date)
                wr_data = process_img(post, g_img_dir)
                fp.write(wr_data)
        count += 1

        # Follow the "next page" (下一页) link; two markup variants exist.
        RNextPage = re.compile(r'<a[^>]*?href=([^>]+)><[^>]+>下一页</a>', re.M | re.I | re.S)
        mo = RNextPage.search(data)
        if not mo:
            RNextPage = re.compile(r'<a[^>]*?href="([^>]+)">下一页</a>', re.M | re.I | re.S)
            mo = RNextPage.search(data)

        if mo:
            print mo.groups()
            data = urllib2.urlopen(mo.groups()[0]).read()
            RCharset = re.compile(r'<meta[^>]*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
            mo = RCharset.search(data)
            if mo:
                charset = mo.groups()[0]
                if charset != "utf-8":
                    data = data.decode(charset, "ignore").encode("utf-8")
        else: # for ghost board only
            # The ghost-story board paginates via a javascript: link; re-POST
            # the page's hidden form fields with pID set to the target page.
            RNextPage = re.compile(r'<a[^>]*?href="javascript:([^"]+)" ><[^>]*>下一页<', re.M | re.I | re.S)
            mo = RNextPage.search(data)
            if mo:
                pages = mo.groups()[0]
                m = re.search(r"'(\d+)'", pages)
                if m:
                    page = int(m.groups()[0])
                    print "page", page
                    s = mo.start()
                    print "start", s
                    # Collect the hidden <input> fields that precede the link
                    # into the POST payload.
                    pd = {}
                    reg = re.compile(r'<input type="hidden" name="(\w+)" value="([^"]+)">')
                    for mo in reg.findall(data[:s]):
                        pd[mo[0]] = mo[1]
                    pd['pID'] = str(page)
                    params = urllib.urlencode(pd)
                    f = urllib2.urlopen(url, params)
                    data = f.read()
                    RCharset = re.compile(r'<meta[^>]*?content="text/html; charset=([a-zA-Z0-9_]+)"[^>]*>', re.M)
                    mo = RCharset.search(data)
                    if mo:
                        charset = mo.groups()[0]
                        if charset != "utf-8":
                            data = data.decode(charset, "ignore").encode("utf-8")
            else:
                # No pagination of either kind: last page reached.
                print "no support"
                break

    fp.write("""
</body>
</html>
""")
    fp.close()
220
if __name__ == "__main__":
    # Fix: map() was used purely for its side effects; under Python 3 the
    # lazy map would never call main() at all.  An explicit loop is correct
    # on both versions.  Each command-line argument is one thread URL.
    for thread_url in sys.argv[1:]:
        main(thread_url)
反馈
创建 by -- ZoomQuiet [2010-02-04 13:41:43]