Differences between revisions 1 and 4 (spanning 3 versions)
Revision 1 as of 2007-12-02 03:47:31
Size: 1058
Editor: ZoomQuiet
Comment:
Revision 4 as of 2009-11-28 15:08:13
Size: 17367
Editor: Elias
Comment: Remove the reference to the PageComment2 component
Deletions are marked like this. Additions are marked like this.
Line 26: Line 26:
{{{
Line 38: Line 39:
== 'dictzip.py' ==
{{{#!python
#!/usr/bin/env python

"""Functions that provide transparent read-only access to dictzipped files
"""
# based on gzip.py from python library

import string, struct, sys, time
import zlib
import __builtin__

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def write32(output, value):
    output.write(struct.pack("<l", value))
    
def write32u(output, value):
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<l", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    return DictzipFile(filename, mode, compresslevel)

class DictzipFile:

    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, cachesize=2):
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            self.filename = filename
        else:
            raise ValueError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self._read_gzip_header()
        self.pos = 0
        self.cachesize = cachesize
        self.cache = {}
        self.cachekeys = []


    def __repr__(self):
        s = repr(self.fileobj)
        return '<dictzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'


    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read the extra field
            xlen=ord(self.fileobj.read(1))
            xlen=xlen+256*ord(self.fileobj.read(1))
            extra = self.fileobj.read(xlen)
            while 1:
                l = ord(extra[2])+256*ord(extra[3])
                e = extra[:4+l]
                if e[:2]<>'RA':
                    extra=extra[4+l:]
                    if not extra:
                        raise "Missing dictzip extension"
                    continue
                else:
                    break
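            # Layout of the 'RA' (random access) extra subfield, as parsed
            # below: after the 2-byte 'RA' id and a 2-byte data length come a
            # 2-byte version, the 2-byte uncompressed chunk length, a 2-byte
            # chunk count, and then one 2-byte compressed size per chunk,
            # all stored least-significant byte first.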
            length = ord(extra[2])+256*ord(extra[3])
            ver = ord(extra[4])+256*ord(extra[5])
            self.chlen = ord(extra[6])+256*ord(extra[7])
            chcnt = ord(extra[8])+256*ord(extra[9])
            p = 10
            lens = []
            for i in xrange(chcnt):
                thischlen = ord(extra[p])+256*ord(extra[p+1])
                p = p+2
                lens.append(thischlen)
            chpos = 0
            self.chunks = []
            for i in lens:
                self.chunks.append( (chpos, i) )
                chpos = chpos+i
            self._lastpos = chpos
        else:
            raise "Missing dictzip extension"
            
            
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FHCRC:
            self.fileobj.read(2) # Read & discard the 16-bit header CRC

        self._firstpos = self.fileobj.tell()
        

    def write(self,data):
        raise ValueError, "write() not supported on DictzipFile object"

    def writelines(self,lines):
        raise ValueError, "writelines() not supported on DictzipFile object"

    def _readchunk(self,n):
        if n>=len(self.chunks):
            return ''
        if self.cache.has_key(n):
            return self.cache[n]
        self.fileobj.seek(self._firstpos+self.chunks[n][0])
        s = self.fileobj.read(self.chunks[n][1])
        dobj = zlib.decompressobj(-zlib.MAX_WBITS)
        output = dobj.decompress(s)
        del dobj
        #self.cache = {} # crude hack until proper cache is done
        self.cache[n] = output
        self.cachekeys.append(n)
        # delete the oldest filled up item in cache
        if len(self.cachekeys) > self.cachesize:
            try:
                del self.cache[self.cachekeys[0]]
                del self.cachekeys[0]
            except KeyError:
                pass
        return output

    def read(self, size=-1):
        firstchunk = self.pos/self.chlen
        offset = self.pos - firstchunk*self.chlen
        if size == -1:
            lastchunk = len(self.chunks)+1
            finish = 0
            npos = sys.maxint
        else:
            lastchunk = (self.pos+size)/self.chlen
            finish = offset+size
            npos = self.pos+size
        buf = ""
        for i in range(firstchunk, lastchunk+1):
            buf = buf+self._readchunk(i)
        r = buf[offset:finish]
        self.pos = npos
        return r
          
    def close(self):
        self.fileobj.close()

    def __del__(self):
        self.close()
        
    def flush(self):
        pass

    def seek(self, pos, whence=0):
        if whence == 0:
            self.pos = pos
        elif whence == 1:
            self.pos = self.pos+pos
        elif whence == 2:
            raise "Seeking from end of file not supported"
            # fixme
        
    def tell(self):
        return self.pos

    def isatty(self):
        return 0

    def readline(self, size=-1):
        if size < 0: size = sys.maxint
        bufs = []
        orig_size = size
        oldpos = self.pos
        readsize = min(100, size) # Read from the file in small chunks
        while 1:
            if size == 0:
                return string.join(bufs, '') # Return resulting line

            c = self.read(readsize)
            i = string.find(c, '\n')
            if i>=0:
                self.pos = self.pos-len(c)+i+1
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i==-1 and len(c) > size: i=size-1
                elif size <= i: i = size -1

            if i >= 0 or c == '':
                bufs.append(c[:i+1]) # Add portion of last chunk
                return string.join(bufs, '') # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
            
    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
        if sizehint <= 0: sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "": break
            L.append( line )
            sizehint = sizehint - len(line)
        return L



def _test():
    import sys
    f = DictzipFile(sys.argv[1])
    f.seek(32023449)
    while 1:
        r = f.readline()
        if not r:
            break
        print `r`
    #for i in range(1):
    #    r = f.read(1400000)
    #    if not r:
    #        break
    #    sys.stdout.write(r)
    #    sys.stdout.flush()
    

if __name__ == '__main__':
    _test()
}}}
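A minimal usage sketch for the module above (the file name is only an illustrative placeholder for any file produced by the dictzip tool): seek() just records a position in the uncompressed stream, and read() then decompresses only the chunks covering the requested range, keeping up to cachesize recently used chunks cached.

{{{#!python
# Illustrative only: 'sample.dict.dz' stands for any dictzip-compressed file.
import dictzip

f = dictzip.DictzipFile('sample.dict.dz')
f.seek(1024)            # random access into the uncompressed stream
print `f.read(80)`      # decompresses just the chunks covering this range
f.close()
}}}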
== 'stardict.py' ==
{{{#!python
#!/usr/bin/python
# -*- coding: utf-8 -*-



import mmap,os,md5,struct

import dictzip

from stat import ST_MTIME

import codecs



INT_SIZE = struct.calcsize('i')

TAB = '\t'

strategies = {

        'exact'     : (0, 'Match words exactly'),

        'prefix'    : (1, 'Match prefixes'),

        'substring' : (2, 'Match substring occurring anywhere in word'),

        'suffix'    : (3, 'Match suffixes'),

        'soundex'   : (4, 'Match using SOUNDEX algorithm'),

        'lev'       : (5, 'Match words within Levenshtein distance one'),

        're'        : (6, 'POSIX 1003.2 regular expressions'),

        'fnmatch'   : (7, 'fnmatch-like (* ? as wildcards)'),

        'metaphone' : (8, 'metaphone algorithm')

        }



def unique_strings(l):

    dict = {}

    for s in l:

       dict[s] = 1

    return dict.keys()



b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
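# Round-trip example (illustrative values only): b64_encode(27) == 'b' and
# b64_decode('b') == 27.  define() below uses b64_decode to unpack the
# tab-separated offset/length fields of a dictd-style index line.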

def b64_encode(val):

    """Takes as input an integer val and returns a string of it encoded

    with the base64 algorithm used by dict indexes."""

    startfound = 0

    retval = ""

    for i in range(5, -1, -1):

        thispart = (val >> (6 * i)) & ((2 ** 6) - 1)

        if (not startfound) and (not thispart):

            # Both zero -- keep going.

            continue

        startfound = 1

        retval += b64_list[thispart]

    if len(retval):

        return retval

    else:

        return b64_list[0]

    

def b64_decode(str):

    """Takes as input a string and returns an integer value of it decoded

    with the base64 algorithm used by dict indexes."""

    if not len(str):

        return 0

    retval = 0

    shiftval = 0

    for i in range(len(str) - 1, -1, -1):

        val = b64_list.index(str[i])

        retval = retval | (val << shiftval)

        shiftval += 6

    return retval



def sort_index(original, sorted):

    l = open(original).readlines()

    nl = []

    for i in range(len(l)):

        line = l[i].split(TAB, 1)

        if len(line)<>2:

            print "corrupted index entry", `l[i]`

            continue

        entry, rest = line

        try:

            entry = unicode(entry, 'utf-8')

        except UnicodeDecodeError:

            #print "Invalid UTF-8 sequence %s, assuming ISO-8859-1" % repr(entry)

            entry = unicode(entry, 'ISO-8859-1')

        entry = entry.lower()

        entry = entry.encode('utf-8')

        n = entry+TAB+rest

        nl.append(n)

    nl.sort()

    f = open(sorted, "w")

    for i in nl:

        f.write(i)

    f.close()



def getcachenames(orig_inx):

    digest = md5.new(orig_inx).hexdigest()

    inx = os.path.join(os.getcwd(), digest+".index")

    pos = os.path.join(os.getcwd(), digest+".positions")

    return inx, pos



def dbargs(name, data=None, inx=None,info_file=None):

    """return tuple of: mmap object for posittions, mmap for index file, file object for data file

    """

    if not data:

        data = name+".dict"

    sep = os.sep

    sep1 = os.altsep or os.sep

    if not( (sep in data) or (sep1 in data) ):

        data = "%s" % data



    if not inx:

        inx = name+".idx"

    if not( (sep in inx) or (sep1 in inx) ):

        inx = os.path.normpath(os.path.join(os.getcwd(),inx))



    if not info_file:

        info_file = name+".ifo"

    if not( (sep in info_file) or (sep1 in info_file) ):

        info_file = os.path.normpath(os.path.join(os.getcwd(),info_file))



    if data[-3:]=='.dz':

        datafile = dictzip.DictzipFile(data)

    else:

        try:

            datafile = open(data)

        except IOError:

            datafile = dictzip.DictzipFile(data+".dz")

        

    inx_file = open(inx,'rb').read()

    info_file = open(info_file,'rb').read()



    wordlist = []

    runner = 0

    i=0

    while True:

        wordlist.append(runner+9)

        runner = inx_file.find('\0',runner+9)

        if runner == -1:

            break

        i+=1

    

    return wordlist, len(wordlist), inx_file, datafile,info_file



class FileDb:

    """Entries in index are in UTF8, sorted byte-after-byte

    """

    def __init__(self, name, data=None, inx=None, info=None):

        if not info:

            info = name

        self.name = name

        self.info = info

        self.data = data

        self.datafo = None

        self.index = None

        self.info_file = None

        self.inx = inx

        self.initialized = 0

        self.initialize()



    def initialize(self):

        if not self.initialized:

            self.positions, self.nrwords, self.index, self.datafo, self.info_file = dbargs(self.name, self.data, self.inx)

            self.initialized = 1

            

    def transformentry(self, s):

        if -1 !=self.info_file.find('sametypesequence=tm'):

            s = '<font color=blue face="Lucida Sans Unicode">[' + s.replace('\0',']</font>',1)

        # transforms read entry into plain text

        # or, in the future, into mime/html/sgml/whatever

        # to be overriden

        return s



    def readentry(self, arg):

        entry, st, ln = arg

        self.datafo.seek(st)

        r = self.transformentry(self.datafo.read(ln))

        return self.name, r



    def define(self, word):

        self.initialize()

        r = []

        #poss = loop_in_C(self.index, self.positions, self.nrwords, word, 0, 0)

        poss = []

        for i in poss:

            self.index.seek(struct.unpack('i',self.positions[INT_SIZE*i:INT_SIZE*(i+1)])[0])

            l = self.index.readline().rstrip()

            entry, st, ln = l.split(TAB)

            st, ln = b64_decode(st), b64_decode(ln)

            r.append( (entry,st,ln) )

        r = map(self.readentry, r)

        return r



                

    def get_word(self,index):

        "get word from self.index"

        left_b = self.positions[index]

        right_b = self.positions[index+1] - 9

        #print self.index[left_b:right_b]

        return self.index[left_b:right_b]



    def get_explanation(self,index):

        "get word from self.index"

        right_b = self.positions[index+1] - 9

        offset_v = 0

        size_v = 0

        offset = self.index[right_b+1:right_b+5]

        size = self.index[right_b+5:right_b+9]

        offset_v = struct.unpack('!i',offset)[0]

        size_v = struct.unpack('!i',size)[0]



        self.datafo.seek(offset_v)

        return self.datafo.read(size_v)



    def match(self, strategy, word):

        res = ""

        ind = 0

        left_i = 0

        right_i = len(self.positions)-2

        cur_i = 0

        found = 0

        while left_i<=right_i:

            cur_i = (left_i+right_i)/2

            cmpRes = cmp(word.lower(),self.get_word(cur_i).lower())

            #print cmpRes

            if cmpRes > 0:

                left_i = cur_i+1

            elif cmpRes < 0:

                right_i = cur_i-1

            else:

                found = 1

                break

        #print found



        res += self.get_word(cur_i) + '\n'

        res += self.transformentry(self.get_explanation(cur_i))



        return res



    def __del__(self):

        if self.datafo:

            self.datafo.close()



if __name__ == "__main__":

    import sys

    cdict = FileDb(name='/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')

    try:

        word = sys.argv[1]

    except:

        word = 'python'

    print cdict.match('.', word)

    #print jmdict.match('.','がんばる')

    #print jmdict.match('.','カード')

    #print jmdict.match('.','θ理論')

    #jmdict.match('.','but')

    #jmdict = FileDb(name='jmdict-ja-en')

}}}
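For reference, a short sketch (with made-up values) of the .idx record layout that get_word() and get_explanation() above rely on: each record is a NUL-terminated UTF-8 headword followed by a 4-byte offset and a 4-byte size into the .dict data, both in network byte order.

{{{#!python
# Illustrative record; the headword, offset and size values are made up.
import struct

record = 'python\x00' + struct.pack('!i', 32023449) + struct.pack('!i', 27)
word, rest = record.split('\x00', 1)
offset, size = struct.unpack('!ii', rest[:8])
print word, offset, size    # -> python 32023449 27
}}}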
Line 40: Line 792:
[[PageComment2]]

A general *PUG article template with section index ::-- ZoomQuiet [DateTime(2007-12-02T03:47:31Z)] TableOfContents

Include(CPUGnav)

1. GPL dictionaries

Jiahua Huang <[email protected]>
reply-to        [email protected],
to      [email protected],
date    Dec 2, 2007 11:40 AM
subject [CPyUG:35998] Re: Are there any good open-format dictionaries?

Many of the dictionaries at http://stardict.sourceforge.net/ claim to be GPL-licensed

  • A Python module for reading stardict dictionaries

1.1. Example:

huahua@huahua:demo$ python
Python 2.5.1 (r251:54863, Oct  5 2007, 13:36:32)
[GCC 4.1.3 20070929 (prerelease) (Ubuntu 4.1.2-16ubuntu2)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import stardict
>>> cdict = stardict.FileDb('/usr/share/stardict/dic/stardict-dic-langdao-ec-gb/langdao-ec-gb')
>>> print cdict.match('.','python')
python
*['paiθɒn]
n. 大蟒, 巨蟒

MicroProj/2007-12-02 (last edited 2009-12-25 07:17:47 by localhost)