含有章节索引的 *PUG 文章通用模板 ::-- ZoomQuiet [2007-12-02 03:47:31]
Contents
1. GPL词典
Jiahua Huang <[email protected]> reply-to [email protected], to [email protected], date Dec 2, 2007 11:40 AM subject [CPyUG:35998] Re: 请教有哪些好些的开放格式的词典?
http://stardict.sourceforge.net/ 上的许多词典号称是 GPL 的
- 读取 stardict 词典的 python 模块
1.1. 示例:
huahua@huahua:demo$ python
Python 2.5.1 (r251:54863, Oct 5 2007, 13:36:32)
[GCC 4.1.3 20070929 (prerelease) (Ubuntu 4.1.2-16ubuntu2)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import stardict
>>> cdict = stardict.FileDb('/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')
>>> print cdict.match('.','python')
python
*['paiθɒn] n. 大蟒, 巨蟒
1.2. 'dictzip.py'
1 #!/bin/env python
2
3 """Functions that provide transparent read-only access to dictzipped files
4 """
5 # based on gzip.py from python library
6
7 import string, struct, sys, time
8 import zlib
9 import __builtin__
10
# Bit masks tested against the flag byte of the gzip header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Mode markers; DictzipFile only ever records READ.
READ = 1
WRITE = 2
14
def write32(output, value):
    """Write *value* to *output* as a little-endian signed 32-bit integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
17
def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer."""
    packed = struct.pack("<L", value)
    output.write(packed)
20
def read32(input):
    """Read four bytes from *input* and return them as a little-endian signed int."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
23
def open(filename, mode="rb", compresslevel=9):
    """Return a DictzipFile for *filename*.

    The three arguments are handed to the DictzipFile constructor unchanged.
    """
    return DictzipFile(filename=filename, mode=mode,
                       compresslevel=compresslevel)
26
class DictzipFile:
    """Read-only, seekable file-like object over a dictzip (.dz) file.

    A dictzip file is a gzip file whose header carries an 'RA' extra
    sub-field listing the compressed size of every chunk.  Each chunk is
    inflated on its own, so a read only decompresses the chunks it overlaps;
    the most recently inflated chunks are kept in a small cache.

    Constructor arguments:
        filename      -- path to open when no *fileobj* is given.
        mode          -- only modes starting with 'r' are accepted.
        compresslevel -- accepted for gzip-like call compatibility; unused.
        fileobj       -- already-open binary file object to read from.
        cachesize     -- number of inflated chunks kept in memory.

    Raises ValueError for a non-read mode and IOError when the file is not a
    gzip file or has no dictzip extension.
    """

    myfileobj = None

    # Byte-string constants, built so that they have the type returned by
    # binary reads on both Python 2 (str) and Python 3 (bytes).
    _GZIP_MAGIC = struct.pack("BB", 0x1f, 0x8b)
    _DICTZIP_ID = "RA".encode("ascii")
    _NUL = "\0".encode("ascii")
    _NEWLINE = "\n".encode("ascii")
    _EMPTY = "".encode("ascii")

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, cachesize=2):
        if mode is None and fileobj is not None:
            mode = getattr(fileobj, 'mode', None)
        if mode is None:
            mode = 'rb'
        # Validate the mode BEFORE opening anything: opening with a write
        # mode first would truncate the file and only then report the error.
        if mode[0:1] != 'r':
            raise ValueError("Mode " + mode + " not supported")

        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode)
        if filename is None:
            filename = getattr(fileobj, 'name', '')

        self.mode = READ
        self.filename = filename
        self.fileobj = fileobj
        self.pos = 0
        self.cachesize = cachesize
        self.cache = {}
        self.cachekeys = []
        try:
            self._read_gzip_header()
        except:
            # Do not leak a file we opened ourselves; re-raise unchanged.
            if self.myfileobj is not None:
                self.myfileobj.close()
            raise

    def __repr__(self):
        s = repr(self.fileobj)
        return '<dictzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _skip_cstring(self):
        """Read and discard bytes up to and including the next NUL (or EOF)."""
        while True:
            byte = self.fileobj.read(1)
            if not byte or byte == self._NUL:
                break

    def _read_gzip_header(self):
        """Parse the gzip header and build the chunk table.

        Sets self.chlen (uncompressed chunk length), self.chunks (list of
        (compressed offset, compressed length) pairs relative to
        self._firstpos), self._lastpos and self._firstpos.
        """
        read = self.fileobj.read
        # magic(2) + method(1) + flags(1) + mtime/xfl/os(6)
        header = read(10)
        if header[:2] != self._GZIP_MAGIC:
            raise IOError('Not a gzipped file')
        if len(header) < 10:
            raise IOError('Truncated gzip header')
        method, flag = struct.unpack("BB", header[2:4])
        if method != 8:
            raise IOError('Unknown compression method')
        if not flag & FEXTRA:
            raise IOError("Missing dictzip extension")

        raw_xlen = read(2)
        if len(raw_xlen) != 2:
            raise IOError('Truncated gzip header')
        (xlen,) = struct.unpack("<H", raw_xlen)
        extra = read(xlen)

        # The extra field is a sequence of sub-fields (2-byte id, 2-byte
        # little-endian length, payload); skip ahead to the 'RA' one.
        while extra[:2] != self._DICTZIP_ID:
            if len(extra) < 4:
                raise IOError("Missing dictzip extension")
            (sublen,) = struct.unpack("<H", extra[2:4])
            extra = extra[4 + sublen:]
        if len(extra) < 10:
            raise IOError("Truncated dictzip extension")

        # Sub-field length and version are parsed but not needed here.
        _sublen, _version, self.chlen, chcnt = struct.unpack("<HHHH",
                                                             extra[2:10])
        if self.chlen <= 0:
            raise IOError("Invalid dictzip chunk length")
        table = extra[10:10 + 2 * chcnt]
        if len(table) != 2 * chcnt:
            raise IOError("Truncated dictzip extension")
        sizes = struct.unpack("<%dH" % chcnt, table)

        self.chunks = []
        chpos = 0
        for size in sizes:
            self.chunks.append((chpos, size))
            chpos = chpos + size
        self._lastpos = chpos

        if flag & FNAME:
            # Discard the NUL-terminated original file name.
            self._skip_cstring()
        if flag & FCOMMENT:
            # Discard the NUL-terminated comment.
            self._skip_cstring()
        if flag & FHCRC:
            read(2)     # Read & discard the 16-bit header CRC

        # Compressed chunk offsets are relative to this position.
        self._firstpos = self.fileobj.tell()

    def write(self, data):
        raise ValueError("write() not supported on DictzipFile object")

    def writelines(self, lines):
        raise ValueError("writelines() not supported on DictzipFile object")

    def _readchunk(self, n):
        """Return the inflated contents of chunk *n* ('' past the last chunk)."""
        if n >= len(self.chunks):
            return self._EMPTY
        if n in self.cache:
            return self.cache[n]
        start, length = self.chunks[n]
        self.fileobj.seek(self._firstpos + start)
        raw = self.fileobj.read(length)
        # Negative wbits: the chunk is a raw deflate stream without a header.
        output = zlib.decompressobj(-zlib.MAX_WBITS).decompress(raw)
        self.cache[n] = output
        self.cachekeys.append(n)
        # Evict the oldest cached chunk once the cache is over capacity.
        if len(self.cachekeys) > self.cachesize:
            oldest = self.cachekeys.pop(0)
            self.cache.pop(oldest, None)
        return output

    def read(self, size=-1):
        """Return up to *size* bytes from the current position.

        A negative or None *size* reads to the end of the data.  The
        position advances by the number of bytes actually returned.
        """
        first = self.pos // self.chlen
        offset = self.pos - first * self.chlen
        last_chunk = len(self.chunks) - 1
        if size is None or size < 0:
            last = last_chunk
            end = None
        elif size == 0:
            return self._EMPTY
        else:
            # Never iterate past the real chunk table, however large size is.
            last = min((self.pos + size - 1) // self.chlen, last_chunk)
            end = offset + size
        parts = [self._readchunk(i) for i in range(first, last + 1)]
        data = self._EMPTY.join(parts)[offset:end]
        self.pos = self.pos + len(data)
        return data

    def close(self):
        fileobj = getattr(self, 'fileobj', None)
        if fileobj is not None:
            fileobj.close()

    def __del__(self):
        # close() tolerates an instance whose __init__ failed early.
        self.close()

    def flush(self):
        pass

    def seek(self, pos, whence=0):
        """Move the read position; whence 0 is absolute, 1 is relative.

        Raises ValueError for whence 2 (seeking from the end is not
        supported), for an unknown whence, or for a negative target.
        """
        if whence == 0:
            target = pos
        elif whence == 1:
            target = self.pos + pos
        elif whence == 2:
            raise ValueError("Seeking from end of file not supported")
        else:
            raise ValueError("Invalid whence value: %r" % (whence,))
        if target < 0:
            raise ValueError("Negative seek position: %r" % (target,))
        self.pos = target

    def tell(self):
        return self.pos

    def isatty(self):
        return False

    def readline(self, size=-1):
        """Return the next line including its newline, at most *size* bytes.

        A negative or None *size* means no limit.  Returns an empty string
        at end of data.
        """
        unlimited = size is None or size < 0
        remaining = size
        step = 100      # read in small pieces, doubling while no newline
        pieces = []
        while unlimited or remaining > 0:
            if unlimited:
                want = step
            else:
                want = min(step, remaining)
            data = self.read(want)
            if not data:
                break
            cut = data.find(self._NEWLINE)
            if cut >= 0:
                # Hand back everything that was read past the newline.
                self.pos = self.pos - (len(data) - (cut + 1))
                pieces.append(data[:cut + 1])
                break
            pieces.append(data)
            if not unlimited:
                remaining = remaining - len(data)
            step = step * 2
        return self._EMPTY.join(pieces)

    def readlines(self, sizehint=0):
        """Return a list of lines; stop early once *sizehint* bytes are read.

        A sizehint of zero, None or a negative number reads all lines.
        """
        limited = sizehint is not None and sizehint > 0
        lines = []
        total = 0
        while True:
            line = self.readline()
            if not line:
                break
            lines.append(line)
            total = total + len(line)
            if limited and total >= sizehint:
                break
        return lines
236
237
238
def _test():
    """Dump, one repr per line, every line of the dictzip file named by
    sys.argv[1], starting at a fixed offset."""
    import sys
    dz = DictzipFile(sys.argv[1])
    dz.seek(32023449)
    line = dz.readline()
    while line:
        print(repr(line))
        line = dz.readline()
    #for i in range(1):
    #    r = f.read(1400000)
    #    if not r:
    #        break
    #    sys.stdout.write(r)
    #    sys.stdout.flush()


if __name__ == '__main__':
    _test()
1.3. 'stardict.py'
1 #!/usr/bin/python
2
3 # -*- coding: utf-8 -*-
4
5
6
7 import mmap,os,md5,struct
8
9 import dictzip
10
11 from stat import ST_MTIME
12
13 import codecs
14
15
16
# Size in bytes of a native C int as seen by the struct module.
INT_SIZE = struct.calcsize('i')

TAB = '\t'

# (name, description) pairs; a strategy's number is its position here.
_STRATEGY_TABLE = (
    ('exact', 'Match words exactly'),
    ('prefix', 'Match prefixes'),
    ('substring', 'Match substring occurring anywhere in word'),
    ('suffix', 'Match suffixes'),
    ('soundex', 'Match using SOUNDEX algorithm'),
    ('lev', 'Match words within Levenshtein distance one'),
    ('re', 'POSIX 1003.2 regular expressions'),
    ('fnmatch', 'fnmatch-like (* ? as wildcards)'),
    ('metaphone', 'metaphone algorithm'),
)

# Maps strategy name -> (number, description).
strategies = dict(
    (name, (number, description))
    for number, (name, description) in enumerate(_STRATEGY_TABLE)
)
42
43
44
def unique_strings(l):
    """Return the distinct items of *l*, collected as the keys of a dict."""
    return dict.fromkeys(l, 1).keys()
54
55
56
b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

def b64_encode(val):
    """Encode the integer *val* with the base64 scheme used by dict indexes.

    Only the low 36 bits (six 6-bit digits) of *val* are encoded, most
    significant digit first, and leading zero digits are dropped.  A value
    whose six digits are all zero encodes as the single zero digit.
    """
    digits = "".join(b64_list[(val >> shift) & 63]
                     for shift in range(30, -1, -6))
    zero_digit = b64_list[0]
    trimmed = digits.lstrip(zero_digit)
    return trimmed or zero_digit
90
91
92
def b64_decode(str):
    """Decode a string written in the base64 scheme used by dict indexes.

    Each character contributes six bits, most significant character first.
    An empty string decodes to 0; a character outside the alphabet raises
    ValueError.
    """
    result = 0
    for symbol in str:
        result = (result << 6) | b64_list.index(symbol)
    return result
116
117
118
def sort_index(original, sorted):
    """Write a lower-cased, byte-sorted copy of a tab-separated index file.

    original -- path of the index to read; each line is 'entry<TAB>rest'.
    sorted   -- path of the file to write.

    Every entry is decoded as UTF-8 (falling back to ISO-8859-1 when it is
    not valid UTF-8), lower-cased and re-encoded as UTF-8.  Lines without a
    tab are reported on standard output and skipped.  Both files are handled
    in binary mode because the result is ordered byte by byte.
    """
    tab = TAB.encode('ascii')
    newline = "\n".encode('ascii')

    source = open(original, 'rb')
    try:
        lines = source.readlines()
    finally:
        source.close()

    entries = []
    for line in lines:
        fields = line.split(tab, 1)
        if len(fields) != 2:
            print("corrupted index entry %s" % repr(line))
            continue
        entry, rest = fields
        try:
            text = entry.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: assume ISO-8859-1.
            text = entry.decode('ISO-8859-1')
        # An unterminated final line would otherwise be glued to whatever
        # line ends up after it once the entries are reordered.
        if not rest.endswith(newline):
            rest = rest + newline
        entries.append(text.lower().encode('utf-8') + tab + rest)
    entries.sort()

    target = open(sorted, 'wb')
    try:
        target.writelines(entries)
    finally:
        target.close()
164
165
166
def getcachenames(orig_inx):
    """Return the (index, positions) cache file paths for *orig_inx*.

    Both paths live in the current working directory and are named after
    the MD5 hex digest of *orig_inx*, with the suffixes '.index' and
    '.positions'.
    """
    # hashlib replaces the md5 module, deprecated since Python 2.5; it is
    # imported locally so this function does not rely on a new file-level
    # import.
    import hashlib
    try:
        digest = hashlib.md5(orig_inx).hexdigest()
    except TypeError:
        # Text strings must be encoded before they can be hashed.
        digest = hashlib.md5(orig_inx.encode('utf-8')).hexdigest()
    here = os.getcwd()
    inx = os.path.join(here, digest + ".index")
    pos = os.path.join(here, digest + ".positions")
    return inx, pos
176
177
178
def dbargs(name, data=None, inx=None,info_file=None):
    """Open the files of a dictionary and locate every word in its index.

    name      -- base path; missing file names default to name + '.dict',
                 name + '.idx' and name + '.ifo'.
    data      -- data file; a name ending in '.dz' is opened with
                 dictzip.DictzipFile, and data + '.dz' is tried when the
                 plain file cannot be opened.
    inx       -- index file.
    info_file -- info file.

    Index and info names without a directory separator are resolved against
    the current working directory.

    Returns a 5-tuple:
        wordlist    -- start offset of every word in the index contents,
                       followed by one final offset just past the last entry
        len(wordlist)
        inx_file    -- the complete index file contents
        datafile    -- open file object for the data file
        info_file   -- the complete info file contents
    """
    separators = (os.sep, os.altsep or os.sep)

    def anchor(path):
        # A bare file name is taken relative to the working directory.
        if separators[0] in path or separators[1] in path:
            return path
        return os.path.normpath(os.path.join(os.getcwd(), path))

    def slurp(path):
        handle = open(path, 'rb')
        try:
            return handle.read()
        finally:
            handle.close()

    if not data:
        data = name+".dict"
    if not inx:
        inx = name+".idx"
    if not info_file:
        info_file = name+".ifo"
    inx = anchor(inx)
    info_file = anchor(info_file)

    if data[-3:]=='.dz':
        datafile = dictzip.DictzipFile(data)
    else:
        try:
            # Binary mode: offsets taken from the index are byte offsets.
            datafile = open(data, 'rb')
        except IOError:
            datafile = dictzip.DictzipFile(data+".dz")

    try:
        inx_file = slurp(inx)
        info_file = slurp(info_file)
    except:
        # Do not leave the data file open when the other files are unusable.
        datafile.close()
        raise

    # Each index entry is a NUL-terminated word followed by 8 bytes (two
    # 4-byte fields), so the next word starts 9 bytes after the NUL.  The
    # scan starts at offset 0 so that the first entry is included.
    nul = "\0".encode('ascii')
    wordlist = []
    start = 0
    while True:
        wordlist.append(start)
        terminator = inx_file.find(nul, start)
        if terminator == -1:
            break
        start = terminator + 9

    return wordlist, len(wordlist), inx_file, datafile,info_file
262
263
264
class FileDb:
    """Dictionary backed by an index held in memory and a data file.

    Entries in index are in UTF8, sorted byte-after-byte.  Each index entry
    is a NUL-terminated word followed by two big-endian 32-bit fields: the
    offset and the size of its explanation in the data file.

    Attributes set by initialize():
        positions -- word start offsets in the index, plus one trailing
                     offset just past the last entry
        nrwords   -- len(positions)
        index     -- the index file contents
        datafo    -- open data file object
        info_file -- the info file contents
    """

    # Byte-string constants with the same type as the file contents on both
    # Python 2 (str) and Python 3 (bytes).
    _EMPTY = "".encode("ascii")
    _NUL = "\0".encode("ascii")
    _NEWLINE = "\n".encode("ascii")
    _TM_MARKER = "sametypesequence=tm".encode("ascii")
    _TM_OPEN = '<font color=blue face="Lucida Sans Unicode">['.encode("ascii")
    _TM_CLOSE = "]</font>".encode("ascii")

    def __init__(self, name, data=None, inx=None, info=None):
        if not info:
            info = name
        self.name = name
        self.info = info
        self.data = data
        self.datafo = None
        self.index = None
        self.info_file = None
        self.inx = inx
        self.initialized = 0
        self.initialize()

    def initialize(self):
        """Load the dictionary files on first use; later calls do nothing."""
        if not self.initialized:
            (self.positions, self.nrwords, self.index,
             self.datafo, self.info_file) = dbargs(self.name, self.data,
                                                   self.inx)
            self.initialized = 1

    def transformentry(self, s):
        """Turn a raw explanation into the text handed back to the caller.

        When the info file declares 'sametypesequence=tm', the part before
        the first NUL is wrapped in a blue <font> element and square
        brackets.  Meant to be overridden for other output formats.
        """
        if self._TM_MARKER in self.info_file:
            s = self._TM_OPEN + s.replace(self._NUL, self._TM_CLOSE, 1)
        return s

    def readentry(self, arg):
        """Read one explanation; *arg* is an (entry, start, length) triple.

        Returns (self.name, transformed text).
        """
        entry, st, ln = arg
        self.datafo.seek(st)
        r = self.transformentry(self.datafo.read(ln))
        return self.name, r

    def define(self, word):
        """Return the list of definitions of *word*.

        The lookup that used to feed this method (loop_in_C) is not
        available, so no positions are ever produced and the result is
        always an empty list.
        """
        self.initialize()
        return []

    def get_word(self, index):
        """Return word number *index* as stored in the index."""
        left_b = self.positions[index]
        # The following entry starts 9 bytes after this word's NUL.
        right_b = self.positions[index+1] - 9
        return self.index[left_b:right_b]

    def get_explanation(self, index):
        """Return the raw explanation of word number *index* from the data file."""
        nul_at = self.positions[index+1] - 9
        # Offset and size are unsigned: a signed unpack would turn offsets
        # of 2 GiB and above into negative seek positions.
        offset_v, size_v = struct.unpack('!II',
                                         self.index[nul_at+1:nul_at+9])
        self.datafo.seek(offset_v)
        return self.datafo.read(size_v)

    def match(self, strategy, word):
        """Look *word* up with a case-insensitive binary search.

        *strategy* is accepted but not used.  Returns the stored headword,
        a newline and the transformed explanation.  When there is no exact
        match, the entry the search last examined is returned instead; an
        empty index yields an empty string.
        """
        right_i = len(self.positions) - 2
        if right_i < 0:
            return self._EMPTY
        if not isinstance(word, type(self._NUL)):
            # Index entries are UTF-8 byte strings; compare like with like.
            word = word.encode('utf-8')
        wanted = word.lower()
        left_i = 0
        cur_i = 0
        while left_i <= right_i:
            cur_i = (left_i + right_i) // 2
            candidate = self.get_word(cur_i).lower()
            if wanted > candidate:
                left_i = cur_i + 1
            elif wanted < candidate:
                right_i = cur_i - 1
            else:
                break
        return (self.get_word(cur_i) + self._NEWLINE
                + self.transformentry(self.get_explanation(cur_i)))

    def __del__(self):
        # The attribute is missing when __init__ never ran.
        datafo = getattr(self, 'datafo', None)
        if datafo is not None:
            datafo.close()
460
461
462
if __name__ == "__main__":
    import sys

    cdict = FileDb(name='/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')

    # Look up the word given on the command line, or 'python' without one.
    # A missing argument is the only failure handled here, so test for it
    # directly instead of swallowing every exception.
    if len(sys.argv) > 1:
        word = sys.argv[1]
    else:
        word = 'python'

    print(cdict.match('.', word))
478
479 #print jmdict.match('.','がんばる')
480
481 #print jmdict.match('.','カード')
482
483 #print jmdict.match('.','θ理論')
484
485 #jmdict.match('.','but')
486
487 #jmdict = FileDb(name='jmdict-ja-en')