Differences between revisions 1 and 4 (spanning 3 versions)
Revision 1 as of 2007-12-02 03:47:31
Size: 1058
Editor: ZoomQuiet
Comment:
Revision 4 as of 2009-11-28 15:08:13
Size: 17367
Editor: Elias
Comment: Remove the reference to the PageComment2 component
Deletions are marked like this. Additions are marked like this.
Line 26: Line 26:
{{{
Line 38: Line 39:
== 'dictzip.py' ==
{{{#!python
#!/usr/bin/env python

"""Functions that provide transparent read-only access to dictzipped files
"""
# based on gzip.py from python library

import string, struct, sys, time
import zlib
import __builtin__

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def write32(output, value):
    output.write(struct.pack("<l", value))
    
def write32u(output, value):
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<l", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    return DictzipFile(filename, mode, compresslevel)

class DictzipFile:

    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, cachesize=2):
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            self.filename = filename
        else:
            raise ValueError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self._read_gzip_header()
        self.pos = 0
        self.cachesize = cachesize
        self.cache = {}
        self.cachekeys = []


    def __repr__(self):
        s = repr(self.fileobj)
        return '<dictzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'


    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read the extra field
            xlen=ord(self.fileobj.read(1))
            xlen=xlen+256*ord(self.fileobj.read(1))
            extra = self.fileobj.read(xlen)
            while 1:
                l = ord(extra[2])+256*ord(extra[3])
                e = extra[:4+l]
                if e[:2]<>'RA':
                    extra=extra[4+l:]
                    if not extra:
                        raise "Missing dictzip extension"
                    continue
                else:
                    break
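            # Layout of the 'RA' (random access) extra subfield, as parsed
            # below: after the 2-byte 'RA' id and a 2-byte data length come a
            # 2-byte version, the 2-byte uncompressed chunk length, a 2-byte
            # chunk count, and then one 2-byte compressed size per chunk,
            # all stored least-significant byte first.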
            length = ord(extra[2])+256*ord(extra[3])
            ver = ord(extra[4])+256*ord(extra[5])
            self.chlen = ord(extra[6])+256*ord(extra[7])
            chcnt = ord(extra[8])+256*ord(extra[9])
            p = 10
            lens = []
            for i in xrange(chcnt):
                thischlen = ord(extra[p])+256*ord(extra[p+1])
                p = p+2
                lens.append(thischlen)
            chpos = 0
            self.chunks = []
            for i in lens:
                self.chunks.append( (chpos, i) )
                chpos = chpos+i
            self._lastpos = chpos
        else:
            raise "Missing dictzip extension"
            
            
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FHCRC:
            self.fileobj.read(2) # Read & discard the 16-bit header CRC

        self._firstpos = self.fileobj.tell()
        

    def write(self,data):
        raise ValueError, "write() not supported on DictzipFile object"

    def writelines(self,lines):
        raise ValueError, "writelines() not supported on DictzipFile object"

    def _readchunk(self,n):
        if n>=len(self.chunks):
            return ''
        if self.cache.has_key(n):
            return self.cache[n]
        self.fileobj.seek(self._firstpos+self.chunks[n][0])
        s = self.fileobj.read(self.chunks[n][1])
        dobj = zlib.decompressobj(-zlib.MAX_WBITS)
        output = dobj.decompress(s)
        del dobj
        #self.cache = {} # crude hack until proper cache is done
        self.cache[n] = output
        self.cachekeys.append(n)
        # delete the oldest filled up item in cache
        if len(self.cachekeys) > self.cachesize:
            try:
                del self.cache[self.cachekeys[0]]
                del self.cachekeys[0]
            except KeyError:
                pass
        return output

    def read(self, size=-1):
        firstchunk = self.pos/self.chlen
        offset = self.pos - firstchunk*self.chlen
        if size == -1:
            lastchunk = len(self.chunks)+1
            finish = 0
            npos = sys.maxint
        else:
            lastchunk = (self.pos+size)/self.chlen
            finish = offset+size
            npos = self.pos+size
        buf = ""
        for i in range(firstchunk, lastchunk+1):
            buf = buf+self._readchunk(i)
        r = buf[offset:finish]
        self.pos = npos
        return r
          
    def close(self):
        self.fileobj.close()

    def __del__(self):
        self.close()
        
    def flush(self):
        pass

    def seek(self, pos, whence=0):
        if whence == 0:
            self.pos = pos
        elif whence == 1:
            self.pos = self.pos+pos
        elif whence == 2:
            raise "Seeking from end of file not supported"
            # fixme
        
    def tell(self):
        return self.pos

    def isatty(self):
        return 0

    def readline(self, size=-1):
        if size < 0: size = sys.maxint
        bufs = []
        orig_size = size
        oldpos = self.pos
        readsize = min(100, size) # Read from the file in small chunks
        while 1:
            if size == 0:
                return string.join(bufs, '') # Return resulting line

            c = self.read(readsize)
            i = string.find(c, '\n')
            if i>=0:
                self.pos = self.pos-len(c)+i+1
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i==-1 and len(c) > size: i=size-1
                elif size <= i: i = size -1

            if i >= 0 or c == '':
                bufs.append(c[:i+1]) # Add portion of last chunk
                return string.join(bufs, '') # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
            
    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
        if sizehint <= 0: sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "": break
            L.append( line )
            sizehint = sizehint - len(line)
        return L



def _test():
    import sys
    f = DictzipFile(sys.argv[1])
    f.seek(32023449)
    while 1:
        r = f.readline()
        if not r:
            break
        print `r`
    #for i in range(1):
    #    r = f.read(1400000)
    #    if not r:
    #        break
    #    sys.stdout.write(r)
    #    sys.stdout.flush()
    

if __name__ == '__main__':
    _test()
}}}
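A minimal usage sketch for the module above (the file name is only an illustrative placeholder for any file produced by the dictzip tool): seek() just records a position in the uncompressed stream, and read() then decompresses only the chunks covering the requested range, keeping up to cachesize recently used chunks cached.

{{{#!python
# Illustrative only: 'sample.dict.dz' stands for any dictzip-compressed file.
import dictzip

f = dictzip.DictzipFile('sample.dict.dz')
f.seek(1024)            # random access into the uncompressed stream
print `f.read(80)`      # decompresses just the chunks covering this range
f.close()
}}}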
== 'stardict.py' ==
{{{#!python
#!/usr/bin/python
# -*- coding: utf-8 -*-



import mmap,os,md5,struct

import dictzip

from stat import ST_MTIME

import codecs



INT_SIZE = struct.calcsize('i')

TAB = '\t'

strategies = {

        'exact'     : (0, 'Match words exactly'),

        'prefix'    : (1, 'Match prefixes'),

        'substring' : (2, 'Match substring occurring anywhere in word'),

        'suffix'    : (3, 'Match suffixes'),

        'soundex'   : (4, 'Match using SOUNDEX algorithm'),

        'lev'       : (5, 'Match words within Levenshtein distance one'),

        're'        : (6, 'POSIX 1003.2 regular expressions'),

        'fnmatch'   : (7, 'fnmatch-like (* ? as wildcards)'),

        'metaphone' : (8, 'metaphone algorithm')

        }



def unique_strings(l):

    dict = {}

    for s in l:

       dict[s] = 1

    return dict.keys()



b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
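# Round-trip example (illustrative values only): b64_encode(27) == 'b' and
# b64_decode('b') == 27.  define() below uses b64_decode to unpack the
# tab-separated offset/length fields of a dictd-style index line.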

def b64_encode(val):

    """Takes as input an integer val and returns a string of it encoded

    with the base64 algorithm used by dict indexes."""

    startfound = 0

    retval = ""

    for i in range(5, -1, -1):

        thispart = (val >> (6 * i)) & ((2 ** 6) - 1)

        if (not startfound) and (not thispart):

            # Both zero -- keep going.

            continue

        startfound = 1

        retval += b64_list[thispart]

    if len(retval):

        return retval

    else:

        return b64_list[0]

    

def b64_decode(str):

    """Takes as input a string and returns an integer value of it decoded

    with the base64 algorithm used by dict indexes."""

    if not len(str):

        return 0

    retval = 0

    shiftval = 0

    for i in range(len(str) - 1, -1, -1):

        val = b64_list.index(str[i])

        retval = retval | (val << shiftval)

        shiftval += 6

    return retval



def sort_index(original, sorted):

    l = open(original).readlines()

    nl = []

    for i in range(len(l)):

        line = l[i].split(TAB, 1)

        if len(line)<>2:

            print "corrupted index entry", `l[i]`

            continue

        entry, rest = line

        try:

            entry = unicode(entry, 'utf-8')

        except UnicodeDecodeError:

            #print "Invalid UTF-8 sequence %s, assuming ISO-8859-1" % repr(entry)

            entry = unicode(entry, 'ISO-8859-1')

        entry = entry.lower()

        entry = entry.encode('utf-8')

        n = entry+TAB+rest

        nl.append(n)

    nl.sort()

    f = open(sorted, "w")

    for i in nl:

        f.write(i)

    f.close()



def getcachenames(orig_inx):

    digest = md5.new(orig_inx).hexdigest()

    inx = os.path.join(os.getcwd(), digest+".index")

    pos = os.path.join(os.getcwd(), digest+".positions")

    return inx, pos



def dbargs(name, data=None, inx=None,info_file=None):

    """return tuple of: mmap object for posittions, mmap for index file, file object for data file

    """

    if not data:

        data = name+".dict"

    sep = os.sep

    sep1 = os.altsep or os.sep

    if not( (sep in data) or (sep1 in data) ):

        data = "%s" % data



    if not inx:

        inx = name+".idx"

    if not( (sep in inx) or (sep1 in inx) ):

        inx = os.path.normpath(os.path.join(os.getcwd(),inx))



    if not info_file:

        info_file = name+".ifo"

    if not( (sep in info_file) or (sep1 in info_file) ):

        info_file = os.path.normpath(os.path.join(os.getcwd(),info_file))



    if data[-3:]=='.dz':

        datafile = dictzip.DictzipFile(data)

    else:

        try:

            datafile = open(data)

        except IOError:

            datafile = dictzip.DictzipFile(data+".dz")

        

    inx_file = open(inx,'rb').read()

    info_file = open(info_file,'rb').read()



    wordlist = []

    runner = 0

    i=0

    while True:

        wordlist.append(runner+9)

        runner = inx_file.find('\0',runner+9)

        if runner == -1:

            break

        i+=1

    

    return wordlist, len(wordlist), inx_file, datafile,info_file



class FileDb:

    """Entries in index are in UTF8, sorted byte-after-byte

    """

    def __init__(self, name, data=None, inx=None, info=None):

        if not info:

            info = name

        self.name = name

        self.info = info

        self.data = data

        self.datafo = None

        self.index = None

        self.info_file = None

        self.inx = inx

        self.initialized = 0

        self.initialize()



    def initialize(self):

        if not self.initialized:

            self.positions, self.nrwords, self.index, self.datafo, self.info_file = dbargs(self.name, self.data, self.inx)

            self.initialized = 1

            

    def transformentry(self, s):

        if -1 !=self.info_file.find('sametypesequence=tm'):

            s = '<font color=blue face="Lucida Sans Unicode">[' + s.replace('\0',']</font>',1)

        # transforms read entry into plain text

        # or, in the future, into mime/html/sgml/whatever

        # to be overriden

        return s



    def readentry(self, arg):

        entry, st, ln = arg

        self.datafo.seek(st)

        r = self.transformentry(self.datafo.read(ln))

        return self.name, r



    def define(self, word):

        self.initialize()

        r = []

        #poss = loop_in_C(self.index, self.positions, self.nrwords, word, 0, 0)

        poss = []

        for i in poss:

            self.index.seek(struct.unpack('i',self.positions[INT_SIZE*i:INT_SIZE*(i+1)])[0])

            l = self.index.readline().rstrip()

            entry, st, ln = l.split(TAB)

            st, ln = b64_decode(st), b64_decode(ln)

            r.append( (entry,st,ln) )

        r = map(self.readentry, r)

        return r



                

    def get_word(self,index):

        "get word from self.index"

        left_b = self.positions[index]

        right_b = self.positions[index+1] - 9

        #print self.index[left_b:right_b]

        return self.index[left_b:right_b]



    def get_explanation(self,index):

        "get word from self.index"

        right_b = self.positions[index+1] - 9

        offset_v = 0

        size_v = 0

        offset = self.index[right_b+1:right_b+5]

        size = self.index[right_b+5:right_b+9]

        offset_v = struct.unpack('!i',offset)[0]

        size_v = struct.unpack('!i',size)[0]



        self.datafo.seek(offset_v)

        return self.datafo.read(size_v)



    def match(self, strategy, word):

        res = ""

        ind = 0

        left_i = 0

        right_i = len(self.positions)-2

        cur_i = 0

        found = 0

        while left_i<=right_i:

            cur_i = (left_i+right_i)/2

            cmpRes = cmp(word.lower(),self.get_word(cur_i).lower())

            #print cmpRes

            if cmpRes > 0:

                left_i = cur_i+1

            elif cmpRes < 0:

                right_i = cur_i-1

            else:

                found = 1

                break

        #print found



        res += self.get_word(cur_i) + '\n'

        res += self.transformentry(self.get_explanation(cur_i))



        return res



    def __del__(self):

        if self.datafo:

            self.datafo.close()



if __name__ == "__main__":

    import sys

    cdict = FileDb(name='/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')

    try:

        word = sys.argv[1]

    except:

        word = 'python'

    print cdict.match('.', word)

    #print jmdict.match('.','がんばる')

    #print jmdict.match('.','カード')

    #print jmdict.match('.','θ理論')

    #jmdict.match('.','but')

    #jmdict = FileDb(name='jmdict-ja-en')

}}}
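For reference, a short sketch (with made-up values) of the .idx record layout that get_word() and get_explanation() above rely on: each record is a NUL-terminated UTF-8 headword followed by a 4-byte offset and a 4-byte size into the .dict data, both in network byte order.

{{{#!python
# Illustrative record; the headword, offset and size values are made up.
import struct

record = 'python\x00' + struct.pack('!i', 32023449) + struct.pack('!i', 27)
word, rest = record.split('\x00', 1)
offset, size = struct.unpack('!ii', rest[:8])
print word, offset, size    # -> python 32023449 27
}}}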
Line 40: Line 792:
[[PageComment2]]

A general *PUG article template with section index ::-- ZoomQuiet [DateTime(2007-12-02T03:47:31Z)] TableOfContents

Include(CPUGnav)

1. GPL dictionaries

Jiahua Huang <[email protected]>
reply-to        [email protected],
to      [email protected],
date    Dec 2, 2007 11:40 AM
subject [CPyUG:35998] Re: Are there any good open-format dictionaries?

Many of the dictionaries at http://stardict.sourceforge.net/ claim to be GPL-licensed

  • A Python module for reading stardict dictionaries

1.1. Example:

huahua@huahua:demo$ python
Python 2.5.1 (r251:54863, Oct  5 2007, 13:36:32)
[GCC 4.1.3 20070929 (prerelease) (Ubuntu 4.1.2-16ubuntu2)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import stardict
>>> cdict = stardict.FileDb('/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')
>>> print cdict.match('.','python')
python
*['paiθɒn]
n. 大蟒, 巨蟒

MicroProj/2007-12-02 (last edited 2009-12-25 07:17:47 by localhost)