Devin Deng <[email protected]>
Reply-To: [email protected]
To: [email protected]
Date: Mar 13, 2007 11:03 PM
Subject: Re: [python-chinese] The spider in the basement (地下室里的爬虫)
This is a quick & dirty spider program I wrote last year for crawling a given site. I have forgotten most of the details by now, but perhaps it can serve as a reference for everyone.
1. Quick & Dirty Spider Program
# -*- coding: utf-8 -*-
from twisted.python import threadable
threadable.init()
from twisted.internet import reactor, threads

import urllib2
import urllib
import urlparse
import time
from sgmllib import SGMLParser

from usearch import USearch  # This module handles the database operations; its source cannot be published.

class URLLister(SGMLParser):

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
class Filter:

    def __init__(self, denys=None, allows=None):
        # Default to empty lists so verify() can iterate safely.
        self.deny_words = denys or []
        self.allow_words = allows or []

    # Check whether a url is acceptable.
    def verify(self, url):
        for k in self.deny_words:
            if url.find(k) != -1:
                return False

        for k in self.allow_words:
            if url.find(k) != -1:
                return True

        # No deny word matched; accept by default.
        return True

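# Example use of Filter (illustrative only; the class is not wired up
# anywhere else in this script):
#   f = Filter(denys=['logout'], allows=['/Eat/'])
#   f.verify('http://www.chilema.cn/Eat/123.html')   # -> True
#   f.verify('http://www.chilema.cn/logout.php')     # -> False
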
class Host:

    def __init__(self, hostname, entry_url=None, description=None,
                 encoding=None, charset=None):
        self.hostname = hostname
        self.entry_url = entry_url
        self.encoding = encoding
        self.charset = charset
        self.description = description

    def configxml(self):
        import elementtree.ElementTree as ET

        root = ET.Element("config")
        en = ET.SubElement(root, "encoding")
        en.text = self.encoding

        ch = ET.SubElement(root, "charset")
        ch.text = self.charset

        entry = ET.SubElement(root, "entry_url")
        entry.text = self.entry_url

        return ET.tostring(root)

    def parse_config(self, configstring):
        import elementtree.ElementTree as ET
        from StringIO import StringIO
        tree = ET.parse(StringIO(configstring))
        self.encoding = tree.findtext(".//encoding")
        self.charset = tree.findtext(".//charset")
        self.entry_url = tree.findtext(".//entry_url")

    def create(self):
        u = USearch()
        self.configs = self.configxml()
        ret = u.CreateDomain(self.hostname, self.description, self.configs)
        #print ret

    # flag: 'A' means all, '0' unvisited, '1' visiting, '2' visited.
    def load(self, flag='A'):
        # Load domain data from the backend database.
        u = USearch()
        try:
            ret = u.ListDomain(flag)['result']
            for d in ret:
                if d.domain == self.hostname:
                    self.parse_config(d.parse_config)
                    self.description = d.description
                    return True
        except:
            pass
        return False

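# Sketch of the config round-trip (the values come from the commented-out
# examples near the bottom of the script):
#   h = Host("www.chilema.cn", "/Eat/", "Shenzhen Local", "gb2312", "gb2312")
#   print h.configxml()
#   # -> something like:
#   # <config><encoding>gb2312</encoding><charset>gb2312</charset><entry_url>/Eat/</entry_url></config>
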
class Page:

    def __init__(self, url, host, description=None):
        self.url = url
        self.description = description
        self.host = host
        self.page_request = None
        self.content = None

        self.status_code = None
        self.encoding = None
        self.charset = None
        self.length = 0
        self.md5 = None
        self.urls = []
    # Read the web page.
    def get_page(self, url=None):
        if not url: url = self.url
        url_type = get_type(self.host.hostname, url)
        if url_type != 0: return None
        try:
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            self.page_request = opener.open(urllib.unquote(url))
            #self.page_request = urllib2.urlopen(url)
            self.content = self.page_request.read()
            self.status_code = self.page_request.code
            return self.status_code
        except:
            self.status_code = 500
            print "ERROR READING: %s" % self.url
            return None
    def get_header(self):
        if not self.page_request:
            self.get_page()
        header = self.page_request.info()
        try:
            self.length = header['Content-Length']
            content_type = header['Content-Type']
            #if content_type.find('charset') == -1:
            self.charset = self.host.charset
            self.encoding = self.host.encoding
        except:
            pass
    def get_urls(self):
        if not self.page_request:
            self.get_page()

        if self.status_code != 200:
            return

        parser = URLLister()

        try:
            parser.feed(self.content)
        except:
            print "ERROR: failed to parse urls!"
            return

        #print "URLS: ", parser.urls
        #self.urls = parser.urls
        if not self.charset: self.charset = "gbk"
        for i in parser.urls:
            try:
                url_type = get_type(self.host.hostname, i)

                if url_type == 4:
                    i = join_url(self.host.hostname, self.url, i)
                if url_type == 0 or url_type == 4:
                    if i:
                        i = urllib.quote(i)
                        self.urls.append(i.decode(self.charset).encode('utf-8'))
            except:
                pass

        parser.close()
        self.page_request.close()
    def save_header(self):
        # Save header info into the db.
        pass

    def save_current_url(self):
        save_url = urllib.quote(self.url)
        usearch = USearch()
        usearch.CreateUrl(domain=self.host.hostname, url=save_url,
                          length=self.length, status_code=self.status_code)

    # Set the url's status flag.
    def flag_url(self, flag):
        usearch = USearch()
        usearch.UpdateUrl(status=flag)

    def save_urls(self):
        # Save all the found urls into the db.
        print "RELATED_URLS:", len(self.urls)
        usearch = USearch()
        usearch.CreateRelateUrl(urllib.quote(self.url), self.urls)

    def save_page(self):
        usearch = USearch()
        try:
            content = self.content.decode(self.charset).encode('utf-8')
            usearch.CreateSearchContent(self.url.decode(self.charset).encode('utf-8'),
                                        content)
        except:
            print "ERROR: failed to save page"
            return -1
        print "SAVE PAGE Done", self.url
        return 0


def get_type(domain, url):
    # Return codes: 0 same-domain http, 1 outside link, 2 javascript,
    # 3 ftp, 4 relative/internal link, 5 mailto or empty.
    if not url: return 5
    tup = urlparse.urlparse(url)
    if tup[0] == "http":
        # Check whether it is the same domain.
        if tup[1] == domain: return 0
        else: return 1  # outside link
    if tup[0] == "javascript":
        return 2
    if tup[0] == "ftp":
        return 3
    if tup[0] == "mailto":
        return 5

    return 4  # internal link

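# get_type quick reference (illustrative values):
#   get_type('www.chilema.cn', 'http://www.chilema.cn/Eat/')  # -> 0 (same host)
#   get_type('www.chilema.cn', 'http://www.google.com/')      # -> 1 (outside link)
#   get_type('www.chilema.cn', 'javascript:void(0)')          # -> 2
#   get_type('www.chilema.cn', '/Eat/123.html')               # -> 4 (internal)
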
def join_url(domain, referral, url):
    if not url or len(url) == 0: return None
    tup = urlparse.urlparse(url)
    if not tup: return None

    if tup[0] == "javascript" or tup[0] == "ftp": return None

    if url[0] == "/":  # a root-relative link
        newurl = "http://%s%s" % (domain, url)
        return newurl
    if url[0] == ".": return None  # ignore relative "./" links at first
    # if referral.rfind("/") != -1:
    #     referral = referral[0:referral.rfind("/")+1]
    #     newurl = "%s%s" % (referral, url)
    newurl = urlparse.urljoin(referral, url)
    return newurl

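# join_url examples (illustrative):
#   join_url('www.chilema.cn', 'http://www.chilema.cn/Eat/', '/fb/1.html')
#       -> 'http://www.chilema.cn/fb/1.html'
#   join_url('www.chilema.cn', 'http://www.chilema.cn/Eat/', 'page2.html')
#       -> 'http://www.chilema.cn/Eat/page2.html'
#   join_url('www.chilema.cn', 'http://www.chilema.cn/Eat/', './x.html')
#       -> None (relative './' links are skipped)
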
if __name__ == '__main__':

    def done(x):
        u = USearch()
        x = urllib.quote(x.decode('gbk').encode('utf-8'))
        u.SetUrlStatus(x, '2')
        time.sleep(2)
        print "DONE: ", x
        url = next_url(h)
        if not url: reactor.stop()
        else: threads.deferToThread(spider, h, url).addCallback(done)

    def next_url(host):
        u = USearch()
        ret = u.GetTaskUrls(host.hostname, '0', 1)['result']
        try:
            url = urllib.unquote(ret[0].url)
        except:
            return None

        # Skip urls that belong to another host and try the next one.
        if urlparse.urlparse(url)[1] != host.hostname: return next_url(host)
        return url

    def spider(host, surf_url):
        #surf_url = surf_url.decode(host.charset).encode('utf-8')
        surf_url = urllib.unquote(surf_url)
        p = Page(surf_url, host)
        #try:
        if not p.get_page():
            print "ERROR: GET %s error!" % surf_url
            return surf_url  # Something went wrong!
        p.get_header()        # Get the page's header
        p.get_urls()          # Get all the urls in the page
        #print p.urls
        p.save_current_url()  # Save the current page's url info into the DB
        p.save_urls()
        p.save_page()
        #except:
        #    pass

        return surf_url


    import sys
    #host = Host("www.chilema.cn", "/Eat/", "Shenzhen Local", "", "gb2312")
    #host.create()

    #~ h = Host("www.chilema.cn")
    #~ h.load()

    #~ #reactor.callInThread(Spider, h, "http://beta.u2m.cn/")
    #~ #reactor.callInThread(Spider, h, "http://beta.u2m.cn/canyin/")
    #~ #reactor.callInThread(Spider, h, "http://beta.u2m.cn/fb/")

    #~ threads.deferToThread(spider, h, "http://www.chilema.cn/Eat/").addCallback(done)

    #host = Host("www.ziye114.com", "", "Beijing Local", "gb2312")
    #host.create()

    hostname = sys.argv[1]
    entry_url = ""
    if len(sys.argv) == 3: entry_url = sys.argv[2]

    h = Host(hostname)
    hostname_url = "http://%s/%s" % (hostname, entry_url)
    h.load()
    threads.deferToThread(spider, h, hostname_url).addCallback(done)
    threads.deferToThread(spider, h, next_url(h)).addCallback(done)
    threads.deferToThread(spider, h, next_url(h)).addCallback(done)
    threads.deferToThread(spider, h, next_url(h)).addCallback(done)
    reactor.run()
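
To try the script, an invocation like the one below matches the sys.argv handling at the bottom (spider.py is a placeholder filename, and the hostname and entry path are taken from the commented-out examples):

    python spider.py www.chilema.cn Eat/

This loads the matching Host record from the backend, seeds the crawl with http://www.chilema.cn/Eat/, and keeps three more worker threads pulling unvisited urls from the task queue; once next_url() returns nothing, the reactor is stopped.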
1.1. Feedback
::-- ZoomQuiet [2007-03-14 03:44:44]