##language:zh #pragma section-numbers on ''' ''' ::-- ZoomQuiet [<>] <> ## 默许导航,请保留 <> = 快速尝试 = {{{ Jiahua Huang reply-to python-cn@googlegroups.com, to "python. cn" , date Nov 6, 2007 10:11 PM subject [CPyUG:34610] 试了下 Durus 对象数据库 mailing list Filter messages from this mailing list mailed-by googlegroups.com }}} 晚饭前跟大妈[[http://groups.google.com/group/python-cn/t/dbd8bc9dddc016cf|聊了会 Durus]] , Durus 是 Quixote 团队的作品, 轻量级的 zodb 对象数据库. 顺便测试了下 Durus 处理大数据库的情况 == 测试用例 == {{{#!python ##用的倒排索引全文搜索类 class WordIndex: '''简单的倒排索引 ''' def __init__(self, wordDict={}, Dict=dict, commit=None): '''dWordsIndex 是索引字典 ''' self.dWordsIndex = wordDict self.Dict = Dict #用 btree 或 OOBTree self.commit = commit def addIndex(self, text, uid=None): '''添加索引 ''' dWordsIndex = self.dWordsIndex Dict = self.Dict uid = uid or _str2hash(text) words = getWordFs(text) #分词,带词频 for word, f in words.iteritems(): if not word in dWordsIndex: dWordsIndex[word]=Dict() #Uids = dWordsIndex[word] #Uids[uid] = f dWordsIndex[word][uid] = f if self.commit: self.commit() return uid def delIndex(self, text, uid=None): '''删除索引 ''' dWordsIndex = self.dWordsIndex uid = uid or _str2hash(text) words = getWords(text) if not words: return False for word in words: if word in dWordsIndex and uid in dWordsIndex[word]: del dWordsIndex[word][uid] if self.commit: self.commit() return uid def query(self, QueryString): ''' @TODO: ''' pass def searchIndex(self, text): '''搜索文章,返回 uid ''' dWordsIndex = self.dWordsIndex words = getWords(text) #分词 if not words: return [] dicts = map(dWordsIndex.get, words) while None in dicts: dicts.remove(None) ## 怎么来的 None? if not dicts : return [] dicts = map(lambda i:i[1], sorted(map(lambda i:(len(i),i), dicts))) ##按字典长度排序,先捅掉较短的 #return reduce(lambda d1,d2: set(d1) & set(d2), dicts) return eval('&'.join(map(lambda i:'set(dicts[%s])'%i, xrange(0, len(dicts))))) ##更快? def _dumpIndex(self): print '{' for i,t in self.dWordsIndex.iteritems(): print "'%s' : %s,"%(i,t) print '}' class _TextIndex(WordIndex): '''演示用 WordIndex 索引 ''' def __init__(self, wordDict={}, textDict={}, Dict=dict, commit=None): self.dWordsIndex = wordDict self.dTextDict = textDict self.commit = commit self.Dict = dict WordIndex.__init__(self, wordDict, Dict, commit) def add(self, text, uid=None): dTextDict = self.dTextDict uid = uid or _str2hash(text) dTextDict[uid] = text self.addIndex(text, uid) return uid def delete(uid): dTextDict = self.dTextDict if not uid in dTextDict: return False text = dTextDict[uid] self.delIndex(text, uid) del dTextDict[uid] def search(self, text): rev = self.searchIndex(text) return rev def _search(self, text): dTextDict = self.dTextDict rev = self.searchIndex(text) if not rev: return False word1 = cutword.cutword(text)[0].decode('utf8', 'ignore') print rev for uid in rev: print '=== %s ==='%uid text = dTextDict[uid].decode('utf8', 'ignore') ord = text.rfind(word1) print text[max(0, ord-120):ord+120] print def _words4uid(self, uid): '对比的暴力搜索' dWordsIndex = self.dWordsIndex for word, uids in dWordsIndex.iteritems(): if uid in uids: print word, def _uids4word(self, word): '对比的暴力搜索' dTextDict = self.dTextDict for uid, text in dTextDict.iteritems(): if word in text: print uid, ##连接 Durus 数据库用 def getdb_durus(): #from durus.file_storage import FileStorage #from durus.connection import Connection #connection = Connection(FileStorage("testIndexWord.durus")) # 用 Durus 服务 from durus.client_storage import ClientStorage from durus.connection import Connection connection = Connection(ClientStorage()) ## root = connection.get_root() from durus import persistent, persistent_dict, persistent_list from durus import btree Tree = btree.BTree List = persistent_list.PersistentList def getdb(name): if not name in root: root[name] = Tree() connection.commit() return root[name] dTextDict = getdb('TextDict') dWordsIndex = getdb('WordsIndex') commit = connection.commit ti = _TextIndex(wordDict=dWordsIndex, textDict=dTextDict, Dict=Tree, commit=commit) return ti ##用下边的生成随机字符串来填充数据库 import random def _randomword(n=1, m=7): return ''.join(map(lambda i:random.choice('abcdefghijklmnopqrstuvwxyz'), range(random.randrange(n, m)))) def _randomtext(n=10, m=100): return ' '.join(map(lambda i:_randomword(1, 3), range(random.randrange(n, m)))) def _randzhtext(n=10, m=100): return ''.join(map(lambda i:unichr(random.randrange(19968, 21000)).encode('utf8'), range(random.randrange(n, m)))) ##用这个测试运行时间 import time def _timeit(_src): '''测试 src 运行时间 ''' exec(''' _t0 = time.time() %s _t1 = time.time() _t3 = _t1 - _t0 '''%_src) return _t3 ## 用这个来随机填充英文 _timeit("for i in range(300000): print i, ti.add(_randomtext(10,100), _randomword(2,11)) ") ## 用这个来随机填充中文 _timeit("for i in range(300000): print i, ti.add(_randzhtext(10,100), _randzhtext(2, 10)) ") }}} == 测试行动 == 插入 59W 条随机字符的文章后 SWAP 占用 500M, 然后客户端被 Linux 杀掉 重新连接,测试搜索 {{{ >>> _timeit(ti.search('东东')) 1.5020370483398438e-05 >>> _timeit(ti.search('东东')) 4.0531158447265625e-06 ## 下边去掉了 print >>> _timeit("ti._words4uid('东东')") 0.074419975280761719 >>> _timeit("ti._words4uid('东东')") 0.074203968048095703 >>> _timeit("ti._uids4word('东东')") 1.1149060726165771 >>> _timeit("ti._uids4word('东东')") 1.1026270389556885 }}} * 这时的数据库文件大小为 1.6G {{{ -rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus }}} = 反馈 =