##language:zh
#pragma section-numbers on
'''

'''
::-- ZoomQuiet [<<DateTime(2007-11-06T14:25:17Z)>>]
<<TableOfContents>>
## 默许导航,请保留
<<Include(CPUGnav)>>


= 快速尝试 =
{{{
Jiahua Huang <jhuangjiahua@gmail.com>
reply-to	python-cn@googlegroups.com,
to	"python. cn" <python-cn@googlegroups.com>,
date	Nov 6, 2007 10:11 PM
subject	[CPyUG:34610] 试了下 Durus 对象数据库
mailing list	<python-cn.googlegroups.com> Filter messages from this mailing list
mailed-by	googlegroups.com
}}}

	
	
晚饭前跟大妈[[http://groups.google.com/group/python-cn/t/dbd8bc9dddc016cf|聊了会 Durus]] ,
Durus 是 Quixote 团队的作品, 轻量级的 zodb 对象数据库.

顺便测试了下 Durus 处理大数据库的情况

== 测试用例 ==
{{{#!python
##用的倒排索引全文搜索类
class WordIndex:
   '''Simple inverted index: word -> mapping of uid -> word frequency.

   The tokenizers ``getWords`` / ``getWordFs`` and the hashing helper
   ``_str2hash`` are module-level names that are not defined in this
   class; they must be available when the methods run.
   '''
   def __init__(self, wordDict=None, Dict=dict, commit=None):
       '''Create an index stored in *wordDict*.

       wordDict -- mapping word -> per-word mapping (uid -> frequency);
                   a fresh dict is created when omitted.
       Dict     -- factory for the per-word mappings (``dict``, or a
                   persistent type such as a BTree / OOBTree).
       commit   -- optional zero-argument callable invoked after every
                   addIndex / delIndex.
       '''
       # A literal ``{}`` default is created once and would be shared by
       # every instance constructed without an explicit mapping.
       if wordDict is None:
           wordDict = {}
       self.dWordsIndex = wordDict
       self.Dict = Dict
       self.commit = commit
   def addIndex(self, text, uid=None):
       '''Index *text* under *uid* and return the uid that was used.

       A falsy *uid* is replaced by ``_str2hash(text)``.
       '''
       dWordsIndex = self.dWordsIndex
       Dict = self.Dict
       uid = uid or _str2hash(text)
       # getWordFs tokenizes the text; presumably it returns a mapping
       # word -> frequency (the original code iterated its items).
       for word, f in getWordFs(text).items():
           if word not in dWordsIndex:
               dWordsIndex[word] = Dict()
           dWordsIndex[word][uid] = f
       if self.commit: self.commit()
       return uid
   def delIndex(self, text, uid=None):
       '''Remove the entries that *text* created for *uid*.

       Returns the uid, or False when the text yields no words (in which
       case nothing is committed).
       '''
       dWordsIndex = self.dWordsIndex
       uid = uid or _str2hash(text)
       words = getWords(text)
       if not words: return False
       for word in words:
           if word in dWordsIndex and uid in dWordsIndex[word]:
               del dWordsIndex[word][uid]
       if self.commit: self.commit()
       return uid
   def query(self, QueryString):
       '''Not implemented yet (TODO); always returns None.
       '''
       pass
   def searchIndex(self, text):
       '''Return the uids whose indexed text contains the words of *text*.

       Words that are not present in the index are ignored instead of
       forcing an empty result.  Returns ``[]`` when the query has no
       words or none of them is indexed, otherwise a ``set`` of uids.
       '''
       words = getWords(text)  # tokenize the query
       if not words: return []
       lookup = self.dWordsIndex.get
       # .get() yields None for words that were never indexed
       found = [uids for uids in (lookup(word) for word in words)
                if uids is not None]
       if not found: return []
       return self._intersectUids(found)
   def _intersectUids(self, postings):
       '''Return the set of keys common to every mapping in *postings*.

       *postings* must be a non-empty sequence of uid mappings.
       '''
       # Start from the smallest mapping so the candidate set only shrinks
       # and the larger mappings are probed by membership, not copied.
       ordered = sorted(postings, key=len)
       uids = set(ordered[0])
       for posting in ordered[1:]:
           if not uids:
               break
           uids = set(uid for uid in uids if uid in posting)
       return uids
   def _dumpIndex(self):
       '''Print the whole index in a dict-like layout (debugging aid).
       '''
       index = self.dWordsIndex
       print('{')
       for word in index:
           print("'%s' : %s," % (word, index[word]))
       print('}')


class _TextIndex(WordIndex):
   '''演示用 WordIndex 索引
   '''
   def __init__(self, wordDict={}, textDict={}, Dict=dict, commit=None):
       self.dWordsIndex = wordDict
       self.dTextDict = textDict
       self.commit = commit
       self.Dict = dict
       WordIndex.__init__(self, wordDict, Dict, commit)
   def add(self, text, uid=None):
       dTextDict = self.dTextDict
       uid = uid or _str2hash(text)
       dTextDict[uid] = text
       self.addIndex(text, uid)
       return uid
   def delete(uid):
       dTextDict = self.dTextDict
       if not uid in dTextDict: return False
       text = dTextDict[uid]
       self.delIndex(text, uid)
       del dTextDict[uid]
   def search(self, text):
       rev = self.searchIndex(text)
       return rev
   def _search(self, text):
       dTextDict = self.dTextDict
       rev = self.searchIndex(text)
       if not rev: return False
       word1 = cutword.cutword(text)[0].decode('utf8', 'ignore')
       print rev
       for uid in rev:
           print '=== %s ==='%uid
           text = dTextDict[uid].decode('utf8', 'ignore')
           ord = text.rfind(word1)
           print text[max(0, ord-120):ord+120]
           print
   def _words4uid(self, uid):
       '对比的暴力搜索'
       dWordsIndex = self.dWordsIndex
       for word, uids in dWordsIndex.iteritems():
           if uid in uids: print word,
   def _uids4word(self, word):
       '对比的暴力搜索'
       dTextDict = self.dTextDict
       for uid, text in dTextDict.iteritems():
           if word in text: print uid,


##连接 Durus 数据库用
def getdb_durus():
   '''Open a Durus connection and return a _TextIndex stored in it.

   The texts live in the root entry 'TextDict' and the inverted index in
   'WordsIndex'; both are BTrees that are created (and committed) on first
   use.  The index commits through the connection after each change.
   '''
   # To work on a local file instead of a Durus server, use:
   #   from durus.file_storage import FileStorage
   #   connection = Connection(FileStorage("testIndexWord.durus"))
   from durus.client_storage import ClientStorage
   from durus.connection import Connection
   connection = Connection(ClientStorage())
   root = connection.get_root()
   from durus import persistent, persistent_dict, persistent_list
   from durus import btree
   Tree = btree.BTree
   List = persistent_list.PersistentList  # bound as in the original; not used below
   def open_tree(name):
       # Return the existing root entry, creating and committing it if absent
       if name in root:
           return root[name]
       root[name] = Tree()
       connection.commit()
       return root[name]
   texts = open_tree('TextDict')
   postings = open_tree('WordsIndex')
   return _TextIndex(wordDict=postings, textDict=texts,
                     Dict=Tree, commit=connection.commit)



##用下边的生成随机字符串来填充数据库
import random
def _randomword(n=1, m=7):
   '''Return a random word of lowercase ASCII letters.

   The length is drawn with ``random.randrange(n, m)``, i.e. n <= length < m.
   '''
   letters = 'abcdefghijklmnopqrstuvwxyz'
   length = random.randrange(n, m)
   return ''.join([random.choice(letters) for _ in range(length)])

def _randomtext(n=10, m=100):
   '''Return random space-separated words, each 1 or 2 letters long.

   The number of words is drawn with ``random.randrange(n, m)``.
   '''
   count = random.randrange(n, m)
   return ' '.join([_randomword(1, 3) for _ in range(count)])

def _randzhtext(n=10, m=100):
   '''Return a UTF-8 encoded string of random CJK characters.

   The number of characters is drawn with ``random.randrange(n, m)``; each
   code point is drawn from [19968, 21000), i.e. U+4E00..U+5207, the start
   of the CJK Unified Ideographs block.  Uses the Python 2 builtin unichr.
   '''
   count = random.randrange(n, m)
   pieces = []
   for _ in range(count):
       codepoint = random.randrange(19968, 21000)
       pieces.append(unichr(codepoint).encode('utf8'))
   return ''.join(pieces)


##用这个测试运行时间
import time
def _timeit(_src):
   '''Return the wall-clock seconds taken to run *_src* once.

   _src -- Python source text to execute, or a zero-argument callable.
           Source text is looked up against this module's globals; names
           it assigns land in a throw-away namespace.  Other non-callable
           values are formatted with ``%s`` and executed as source.
   '''
   if callable(_src):
       started = time.time()
       _src()
       return time.time() - started
   # Compile before starting the clock so that only execution is timed.
   code = compile('%s\n' % (_src,), '<_timeit>', 'exec')
   # Run in a separate namespace: the timed code can neither clobber the
   # timer's own variables nor does the result depend on exec creating
   # function locals.
   scope = {}
   started = time.time()
   exec(code, globals(), scope)
   return time.time() - started

## Fill the database with random English text.
## NOTE: the timed source refers to a module-level ``ti``; create it first,
## e.g. ``ti = getdb_durus()``.
_timeit("for i in range(300000): print i, ti.add(_randomtext(10,100), "
        "_randomword(2,11)) ")
## Fill the database with random Chinese text.
_timeit("for i in range(300000): print i, ti.add(_randzhtext(10,100), "
        "_randzhtext(2, 10))  ")
}}}


== 测试行动 ==
插入 59 万条随机字符的文章后, SWAP 占用达到 500M,
然后客户端被 Linux 杀掉

重新连接,测试搜索
{{{

## 注意: 下面两次传给 _timeit 的是 ti.search() 的返回值, 而不是源码字符串;
## 搜索在计时开始之前就已经完成, 所以这两个时间并不包含搜索本身
>>> _timeit(ti.search('东东'))
1.5020370483398438e-05
>>> _timeit(ti.search('东东'))
4.0531158447265625e-06

## 下边去掉了 print
>>> _timeit("ti._words4uid('东东')")
0.074419975280761719
>>> _timeit("ti._words4uid('东东')")
0.074203968048095703

>>> _timeit("ti._uids4word('东东')")
1.1149060726165771
>>> _timeit("ti._uids4word('东东')")
1.1026270389556885
}}}

 * 这时的数据库文件大小为 1.6G
{{{
-rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus
}}}
= 反馈 =