##language:zh
#pragma section-numbers on
''' 含有章节索引的 *PUG 文章通用模板 '''
## NOTE(review): the export collapsed all line breaks and stripped the MoinMoin
## macro calls down to "<>" — restore the real macros (likely <<DateTime(...)>>
## and <<TableOfContents>>) from the original page history.
::-- ZoomQuiet [<>]

<> ## 默许导航,请保留
<>

= GPL词典 =
{{{
Jiahua Huang
reply-to python-cn@googlegroups.com,
to python-cn@googlegroups.com,
date Dec 2, 2007 11:40 AM
subject [CPyUG:35998] Re: 请教有哪些好些的开放格式的词典?
}}}

http://stardict.sourceforge.net/ 上的许多词典号称是 GPL 的

 * 读取 stardict 词典的 python 模块

== 示例: ==
{{{
huahua@huahua:demo$ python
Python 2.5.1 (r251:54863, Oct 5 2007, 13:36:32)
[GCC 4.1.3 20070929 (prerelease) (Ubuntu 4.1.2-16ubuntu2)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import stardict
>>> cdict = stardict.FileDb('/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')
>>> print cdict.match('.','python')
python
*['paiθɒn]
n. 大蟒, 巨蟒
}}}

== 'dictzip.py' ==
{{{#!python
#!/bin/env python
"""Functions that provide transparent read-only access to dictzipped files.

A dictzip file is a gzip file (RFC 1952) whose FEXTRA field carries an 'RA'
(random access) subfield listing the compressed length of every chunk, so
any chunk can be seeked to and inflated independently.
"""

# based on gzip.py from python library

import string, struct, sys, time
import zlib
import __builtin__

# gzip header flag bits (RFC 1952) and open modes, as in gzip.py.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
READ, WRITE = 1, 2

# Default number of decompressed chunks kept in the per-file cache.
CACHESIZE = 30


def write32(output, value):
    """Write *value* as a little-endian 32-bit integer, as gzip.py does."""
    output.write(struct.pack("<l", value))


# NOTE(review): the wiki export ate everything between write32() and
# _read_gzip_header() (a '<' opened a bogus markup tag), so the original
# DictzipFile class header and __init__ were lost.  The reconstruction below
# only restores the attributes the surviving methods rely on (fileobj, pos,
# cache, cachekeys, cachesize); confirm against an upstream dictzip.py.

class DictzipFile:
    """Read-only, seekable, file-like access to a .dz (dictzip) file."""

    def __init__(self, filename=None, mode="rb", fileobj=None,
                 cachesize=CACHESIZE):
        # NOTE(review): reconstructed __init__ — verify against upstream.
        if fileobj is None:
            fileobj = __builtin__.open(filename, mode)
        self.fileobj = fileobj
        self.pos = 0            # logical (uncompressed) read position
        self.cache = {}         # chunk number -> decompressed bytes
        self.cachekeys = []     # chunk numbers in insertion order (FIFO)
        self.cachesize = cachesize
        # Fills self.chlen, self.chunks, self._lastpos, self._firstpos.
        self._read_gzip_header()

    def _read_gzip_header(self):
        """Parse the gzip header and the dictzip 'RA' extra subfield.

        Builds self.chunks as a list of (compressed_offset, compressed_len)
        pairs and records self.chlen, the uncompressed chunk length.
        Raises IOError if the file is not gzipped or lacks the RA subfield.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4), extraflag (1) and os (1) in one read.
        self.fileobj.read(6)
        if flag & FEXTRA:
            # Read the extra field.
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            extra = self.fileobj.read(xlen)
            # Scan the subfields for the 'RA' (random access) one.
            while 1:
                l = ord(extra[2]) + 256 * ord(extra[3])
                e = extra[:4 + l]
                # was: if e[:2]<>'RA' -- '<>' is the archaic form of '!='
                if e[:2] != 'RA':
                    extra = extra[4 + l:]
                    if not extra:
                        # was a string exception; illegal since Python 2.6
                        raise IOError, "Missing dictzip extension"
                    continue
                else:
                    break
            length = ord(extra[2]) + 256 * ord(extra[3])
            ver = ord(extra[4]) + 256 * ord(extra[5])
            self.chlen = ord(extra[6]) + 256 * ord(extra[7])
            chcnt = ord(extra[8]) + 256 * ord(extra[9])
            p = 10
            lens = []
            for i in xrange(chcnt):
                thischlen = ord(extra[p]) + 256 * ord(extra[p + 1])
                p = p + 2
                lens.append(thischlen)
            # Turn the per-chunk lengths into (offset, length) pairs.
            chpos = 0
            self.chunks = []
            for i in lens:
                self.chunks.append((chpos, i))
                chpos = chpos + i
            self._lastpos = chpos
        else:
            # was a string exception; illegal since Python 2.6
            raise IOError, "Missing dictzip extension"
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)    # Read & discard the 16-bit header CRC
        # Compressed data of chunk 0 starts here.
        self._firstpos = self.fileobj.tell()

    def write(self, data):
        raise ValueError, "write() not supported on DictzipFile object"

    def writelines(self, lines):
        raise ValueError, "writelines() not supported on DictzipFile object"

    def _readchunk(self, n):
        """Return the decompressed bytes of chunk *n* ('' if out of range),
        consulting and maintaining a small FIFO cache."""
        if n >= len(self.chunks):
            return ''
        if self.cache.has_key(n):
            return self.cache[n]
        self.fileobj.seek(self._firstpos + self.chunks[n][0])
        s = self.fileobj.read(self.chunks[n][1])
        # Negative wbits: raw deflate stream, no per-chunk gzip header.
        dobj = zlib.decompressobj(-zlib.MAX_WBITS)
        output = dobj.decompress(s)
        del dobj
        self.cache[n] = output
        self.cachekeys.append(n)
        # Evict the oldest cached chunk when the cache is full.
        if len(self.cachekeys) > self.cachesize:
            try:
                del self.cache[self.cachekeys[0]]
                del self.cachekeys[0]
            except KeyError:
                pass
        return output

    def read(self, size=-1):
        """Read *size* uncompressed bytes from the current position
        (all remaining data if size == -1)."""
        firstchunk = self.pos / self.chlen
        offset = self.pos - firstchunk * self.chlen
        if size == -1:
            lastchunk = len(self.chunks) + 1
            # BUGFIX: was finish = 0, which made buf[offset:0] always
            # return '' for read(-1); read to the end of the buffer instead.
            finish = sys.maxint
            npos = sys.maxint
        else:
            lastchunk = (self.pos + size) / self.chlen
            finish = offset + size
            npos = self.pos + size
        buf = ""
        for i in range(firstchunk, lastchunk + 1):
            buf = buf + self._readchunk(i)
        r = buf[offset:finish]
        self.pos = npos
        return r

    def close(self):
        self.fileobj.close()

    def __del__(self):
        self.close()

    def flush(self):
        pass

    def seek(self, pos, whence=0):
        if whence == 0:
            self.pos = pos
        elif whence == 1:
            self.pos = self.pos + pos
        elif whence == 2:
            # was a string exception; illegal since Python 2.6
            raise ValueError, "Seeking from end of file not supported"  # fixme

    def tell(self):
        return self.pos

    def isatty(self):
        return 0

    def readline(self, size=-1):
        """Read one '\\n'-terminated line (at most *size* bytes if size >= 0),
        growing the probe window geometrically as in gzip.py."""
        if size < 0:
            size = sys.maxint
        bufs = []
        orig_size = size
        oldpos = self.pos
        readsize = min(100, size)    # Read from the file in small chunks
        while 1:
            if size == 0:
                return string.join(bufs, '')  # Return resulting line
            c = self.read(readsize)
            i = string.find(c, '\n')
            if i >= 0:
                # Rewind logical position to just past the newline.
                self.pos = self.pos - len(c) + i + 1
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i == -1 and len(c) > size:
                    i = size - 1
                elif size <= i:
                    i = size - 1
            if i >= 0 or c == '':
                bufs.append(c[:i + 1])        # Add portion of last chunk
                return string.join(bufs, '')  # Return resulting line
            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Read lines until roughly *sizehint* bytes (everything if <= 0)."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)
        return L


def _test():
    """Crude manual test: seek into the file named on the command line and
    dump lines from there."""
    import sys
    f = DictzipFile(sys.argv[1])
    f.seek(32023449)
    while 1:
        r = f.readline()
        if not r:
            break
        print repr(r)


if __name__ == '__main__':
    _test()
}}}

== 'stardict.py' ==
{{{#!python
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Minimal reader for StarDict dictionaries (.ifo/.idx/.dict[.dz])."""

import mmap, os, md5, struct
import dictzip
from stat import ST_MTIME
import codecs

INT_SIZE = struct.calcsize('i')
TAB = '\t'

# dictd-style search strategies; only exact lookup is implemented below.
strategies = {
    'exact':     (0, 'Match words exactly'),
    'prefix':    (1, 'Match prefixes'),
    'substring': (2, 'Match substring occurring anywhere in word'),
    'suffix':    (3, 'Match suffixes'),
    'soundex':   (4, 'Match using SOUNDEX algorithm'),
    'lev':       (5, 'Match words within Levenshtein distance one'),
    're':        (6, 'POSIX 1003.2 regular expressions'),
    'fnmatch':   (7, 'fnmatch-like (* ? as wildcards)'),
    'metaphone': (8, 'metaphone algorithm'),
}


def unique_strings(l):
    """Return the distinct strings of *l* (order not preserved)."""
    seen = {}   # renamed from 'dict', which shadowed the builtin
    for s in l:
        seen[s] = 1
    return seen.keys()


b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"


def b64_encode(val):
    """Takes as input an integer val and returns a string of it encoded
    with the base64 algorithm used by dict indexes."""
    startfound = 0
    retval = ""
    for i in range(5, -1, -1):
        thispart = (val >> (6 * i)) & ((2 ** 6) - 1)
        if (not startfound) and (not thispart):
            # Both zero -- keep going.
            continue
        startfound = 1
        retval += b64_list[thispart]
    if len(retval):
        return retval
    else:
        return b64_list[0]


def b64_decode(str):
    """Takes as input a string and returns an integer value of it decoded
    with the base64 algorithm used by dict indexes."""
    if not len(str):
        return 0
    retval = 0
    shiftval = 0
    for i in range(len(str) - 1, -1, -1):
        val = b64_list.index(str[i])
        retval = retval | (val << shiftval)
        shiftval += 6
    return retval


def sort_index(original, sorted):
    """Rewrite a dictd index file with the headword lowercased (UTF-8,
    falling back to ISO-8859-1) and the entries sorted byte-wise."""
    l = open(original).readlines()
    nl = []
    for i in range(len(l)):
        line = l[i].split(TAB, 1)
        if len(line) != 2:   # was '<>'
            print "corrupted index entry", repr(l[i])
            continue
        entry, rest = line
        try:
            entry = unicode(entry, 'utf-8')
        except UnicodeDecodeError:
            entry = unicode(entry, 'ISO-8859-1')
        entry = entry.lower()
        entry = entry.encode('utf-8')
        n = entry + TAB + rest
        nl.append(n)
    nl.sort()
    f = open(sorted, "w")
    for i in nl:
        f.write(i)
    f.close()


def getcachenames(orig_inx):
    """Return the cache-file names (index, positions) derived from the MD5
    of the original index path."""
    digest = md5.new(orig_inx).hexdigest()
    inx = os.path.join(os.getcwd(), digest + ".index")
    pos = os.path.join(os.getcwd(), digest + ".positions")
    return inx, pos


def dbargs(name, data=None, inx=None, info_file=None):
    """Open a StarDict dictionary rooted at *name*.

    Returns (positions, nrwords, index_bytes, datafile, info_bytes) where
    positions[i] is the byte offset of word i in the raw .idx contents.
    Each .idx record is: word '\\0' + 4-byte offset + 4-byte size, so the
    next word starts 9 bytes after each NUL.
    """
    if not data:
        data = name + ".dict"
    sep = os.sep
    sep1 = os.altsep or os.sep
    if not ((sep in data) or (sep1 in data)):
        data = "%s" % data
    if not inx:
        inx = name + ".idx"
    if not ((sep in inx) or (sep1 in inx)):
        inx = os.path.normpath(os.path.join(os.getcwd(), inx))
    if not info_file:
        info_file = name + ".ifo"
    if not ((sep in info_file) or (sep1 in info_file)):
        info_file = os.path.normpath(os.path.join(os.getcwd(), info_file))
    if data[-3:] == '.dz':
        datafile = dictzip.DictzipFile(data)
    else:
        try:
            datafile = open(data)
        except IOError:
            datafile = dictzip.DictzipFile(data + ".dz")
    inx_file = open(inx, 'rb').read()
    info_file = open(info_file, 'rb').read()
    wordlist = []
    # BUGFIX: runner started at 0, which recorded the first word's position
    # as 9 instead of 0 and so corrupted lookups of the very first headword.
    # Starting at -9 makes the first appended position runner+9 == 0.
    runner = -9
    i = 0
    while True:
        wordlist.append(runner + 9)
        runner = inx_file.find('\0', runner + 9)
        if runner == -1:
            break
        i += 1
    return wordlist, len(wordlist), inx_file, datafile, info_file


class FileDb:
    """Entries in index are in UTF8, sorted byte-after-byte."""

    def __init__(self, name, data=None, inx=None, info=None):
        if not info:
            info = name
        self.name = name
        self.info = info
        self.data = data
        self.datafo = None
        self.index = None
        self.info_file = None
        self.inx = inx
        self.initialized = 0
        self.initialize()

    def initialize(self):
        """Lazily open the dictionary files (idempotent)."""
        if not self.initialized:
            (self.positions, self.nrwords, self.index,
             self.datafo, self.info_file) = dbargs(self.name, self.data,
                                                   self.inx)
            self.initialized = 1

    def transformentry(self, s):
        # transforms read entry into plain text
        # or, in the future, into mime/html/sgml/whatever
        # to be overriden
        if -1 != self.info_file.find('sametypesequence=tm'):
            s = '[' + s.replace('\0', ']', 1)
        return s

    def readentry(self, arg):
        """arg is an (entry, start, length) triple; return (dbname, text)."""
        entry, st, ln = arg
        self.datafo.seek(st)
        r = self.transformentry(self.datafo.read(ln))
        return self.name, r

    def define(self, word):
        # NOTE(review): dead code — poss is always empty because the
        # loop_in_C helper was never ported; kept for reference.
        self.initialize()
        r = []
        #poss = loop_in_C(self.index, self.positions, self.nrwords, word, 0, 0)
        poss = []
        for i in poss:
            self.index.seek(struct.unpack(
                'i', self.positions[INT_SIZE * i:INT_SIZE * (i + 1)])[0])
            # BUGFIX: '.rstrip' was missing its call parentheses.
            l = self.index.readline().rstrip()
            entry, st, ln = l.split(TAB)
            st, ln = b64_decode(st), b64_decode(ln)
            r.append((entry, st, ln))
        r = map(self.readentry, r)
        return r

    def get_word(self, index):
        "get word from self.index"
        left_b = self.positions[index]
        # positions[index+1] is 9 bytes past this word's NUL terminator.
        right_b = self.positions[index + 1] - 9
        return self.index[left_b:right_b]

    def get_explanation(self, index):
        "get the explanation text for word *index* from the data file"
        # The 8 bytes after the word's NUL are big-endian offset and size.
        right_b = self.positions[index + 1] - 9
        offset = self.index[right_b + 1:right_b + 5]
        size = self.index[right_b + 5:right_b + 9]
        offset_v = struct.unpack('!i', offset)[0]
        size_v = struct.unpack('!i', size)[0]
        self.datafo.seek(offset_v)
        return self.datafo.read(size_v)

    def match(self, strategy, word):
        """Binary-search the index for *word* (case-insensitive) and return
        'headword\\nexplanation'.  *strategy* is currently ignored."""
        res = ""
        left_i = 0
        right_i = len(self.positions) - 2   # last entry is an end sentinel
        cur_i = 0
        found = 0
        while left_i <= right_i:
            cur_i = (left_i + right_i) / 2
            cmpRes = cmp(word.lower(), self.get_word(cur_i).lower())
            if cmpRes > 0:
                left_i = cur_i + 1
            elif cmpRes < 0:
                right_i = cur_i - 1
            else:
                found = 1
                break
        # NOTE(review): on a miss this still returns the nearest entry.
        res += self.get_word(cur_i) + '\n'
        res += self.transformentry(self.get_explanation(cur_i))
        return res

    def __del__(self):
        if self.datafo:
            self.datafo.close()


if __name__ == "__main__":
    import sys
    cdict = FileDb(name='/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')
    try:
        word = sys.argv[1]
    except IndexError:   # was a bare except
        word = 'python'
    print cdict.match('.', word)
    #print jmdict.match('.','がんばる')
    #print jmdict.match('.','カード')
    #print jmdict.match('.','θ理論')
    #jmdict.match('.','but')
    #jmdict = FileDb(name='jmdict-ja-en')
}}}

##= 反馈 =