#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Suffix-array construction, search and splitting for UTF-16 text files.

Created by Lee, Chia-ming (2006 winter) / The Chung Hwa Institute of Buddhist Studies

A suffix array here is a separate binary file of fixed-width byte offsets
into the main text file, sorted by the text that starts at each offset.

NOTE(review): this module is Python 2 only (print statements, `cmp`,
`dict.items()` returning a list, integer `/` division).
NOTE(review): offsets are written with struct format 'L' but read back
exactly 4 bytes at a time; native 'L' is 8 bytes on 64-bit Unix, so this
code assumes a platform with 4-byte native long (32-bit or Windows) --
confirm, or switch to the standard-size format '<L', before porting.
"""
import os, re
from struct import *
from time import time, strftime, gmtime


def optScreen( strings ):
    """Make a unicode string printable on a cp950 console.

    Characters that cannot be encoded in cp950 are replaced by the hex
    form of their code point, e.g. "[0x4f5b]"; encodable characters pass
    through unchanged.  Returns the resulting byte string."""
    line = ""  # accumulated printable result
    for k in strings:
        try:
            line += k.encode('cp950')
        except:  # NOTE(review): bare except; presumably UnicodeEncodeError only
            line += "["+hex(ord(k))+"]"
    return line


class SuffixArray:
    """Build, search and split suffix arrays for a UTF-16 main text file.

    The main file handle (self.F) and suffix-array handle (self.S) are
    shared by all methods and seeked freely; instances are therefore not
    safe for concurrent use."""

    def __init__( self, fname, suffix="" ):
        """[ fname ]  = the main text file (UTF-16 with a 2-byte BOM)
        [ suffix ] = its suffix-array file; required for doSearch() and
                     splitSuffixArray(); leave empty for getSuffixArray()"""
        self.F = open( fname, 'rb' ) ## handle of the main file ##
        if suffix != "":
            self.S = open( suffix, 'rb' ) ## handle of the main file's suffix array ##
            self.S_len = os.stat( suffix )[6] ## size of the suffix-array file in bytes ##
            if self.S_len > 1000000: # build the lookup cache only for suffix arrays over ~1 MB
                self.cache_p = [0]*1024 ## cache of suffix-array positions (implicit binary tree) ##
                self.cache_w = [""]*1024 ## text that each self.cache_p entry points at (via self.S into self.F) ##
                self.__getCache__( 0, self.S_len/4, 1 )
        self.File_main = fname ## name of the main file ##
        self.Suffix_A = suffix ## name of the suffix-array file ##

    def __getCache__( self, i, j, c ):
        """Recursively fill the two search caches with the first ten levels
        of the binary search, stored as an implicit binary tree (children
        of slot c live at 2c and 2c+1).
        [ i ] = start position (suffix-array entry index)
        [ j ] = end position (suffix-array entry index)
        [ c ] = slot in the cache lists"""
        if c < 1024:
            k = ( i+j )/2 ## midpoint ( k ) of the range ( i, j ) ##
            self.cache_p[c] = k
            self.S.seek( k*4 )
            ofst = unpack( 'L', self.S.read(4) )[0] ## main-file offset stored at entry k of self.S ##
            w = self.__getWords__( ofst, 64 ) ## read 64 bytes (32 chars) of self.F at that offset ##
            self.cache_w[c] = w
            self.__getCache__( i, k, c*2 )
            self.__getCache__( k, j, c*2+1)

    def __getWords__( self, ofst, length ):
        """Return a decoded string read from the main file( self.F ).
        [ ofst ]   = byte offset into self.F
        [ length ] = number of bytes to read (UTF-16: 2 bytes per char)"""
        self.F.seek( ofst )
        return self.F.read( length ).decode('utf16')

    ####### Creat Suffix Array #######
    def getSuffixArray( self, cache=1000000 ):
        """Create a suffix array file for self.F (written beside it as
        "<name>_SA").  Scans the file, flushing sorted chunks of offsets
        into a tmp folder, then merge-sorts the chunks pairwise.
        [ cache ] = default is 1mb( for 256 mb memery). Suggest 3mb for 512 or highter memery."""
        Ts = time() ## start time ##
        print self.File_main
        p = os.path.split( self.File_main ) ## split the name: p[0] = directory, p[1] = file name ##
        if os.path.isdir( p[0]+"/tmp" ): # create the tmp folder if it does not exist
            pass
        else:
            os.mkdir( p[0]+"/tmp" )
        tmpath = p[0]+"/tmp/" ## full path of the tmp folder ##
        self.F.seek(2) # utf16 file: skip the 2-byte BOM first
        D = {} ## staging dict of offset -> text; sorted, flushed and cleared once the cache size is reached ##
        i = c = n = 0 ## i = bytes of self.F consumed, c = CJK chars accepted, n = count of sorted tmp files ##
        # fw_ref = open( self.File_main+"_ref", "w" ) ## for Cbeta: builds the offset -> sutra index ##
        while 1:
            i += 2
            offset = self.F.tell() ## current offset into self.F ##
            word = self.F.read(256).decode("utf16") ## read 256 bytes ahead of each offset (a 128-char string) ##
            # try:
            #     word.encode("cp950")
            # except:
            #     word = optScreen( word )
            # print "[%d: %s]\n%s" % ( offset, word[0], word )
            # raw_input()
            # self.F.seek( offset + 2 )
            if word == "": # end of file: flush the final chunk
                n += 1
                self.__saveTmpFiles__( D, n, tmpath )
                D = {}
                print u" Total %d bytes, %d chi-char. tmp files saving down" % ( i, c )
                break
            else:
                if word[0] != "\n" and word[0] > "~": # keep only suffixes starting at a CJK char; ***adjust this test for other file formats
                    c += 1
                    D[offset] = word
                """ #Cbeta 做 concordance 需要每個 offset 屬於哪一部經的索引#
                if word[0] != "\n" and word[0] != "n" and word[0] > "9":
                    if word[0] == "T" or word[0] == "X":
                        sutra_ref = word.split( "\n", 1 )
                        fw_ref.write( sutra_ref[0] + "\t" + str(offset) + "\n" )
                    else:
                        c += 1
                        D[offset] = word"""
                self.F.seek( offset + 2 )
                if i % cache == 0: # every `cache` bytes (default 1mb) sort the current chunk and flush it
                    n += 1
                    self.__saveTmpFiles__( D, n, tmpath )
                    D = {}
                    print u" so far %d bytes, %d chi-chars processed." % ( i, c )
                    # raw_input()
        # fw_ref.close()
        Tm = time() ## time after all sorted chunks are flushed ##
        print strftime('%H:%M:%S', gmtime(Tm-Ts))
        self.__combinTmps__( tmpath, p[1] ) # merge the sorted suffix-array tmp files
        Te = time() ## end time ##
        print strftime('%H:%M:%S', gmtime(Te-Ts))

    def __saveTmpFiles__( self, Dic, no, path ):
        """Sort a staging dict by its text and save the offsets as one tmp file.
        [ Dic ]  = {offset: 128 chi-char, ...} ex: {28: 'chi-string...', ...}
        [ no ]   = sequence number of the tmp file (also its file name)
        [ path ] = folder of the tmp files"""
        L = Dic.items() ## dict -> list of (offset, text) pairs ##
        L.sort( lambda x,y: cmp(x[1], y[1]) )  # sort by the text, not the offset
        fw = open( path+str(no), 'wb' ) ## open the tmp file to write ##
        for k in L:
            fw.write( pack('L', k[0]) )  # store only the offset; the text is re-read from self.F when merging
            # print "offset: %d, string: %s" % ( k[0], k[1][0])
            # raw_input()
        fw.close()
        print "tmp No. %d sorted & saved, include %d chi-chars." % ( no, len(L) )

    def __combinTmps__( self, path, orgfn ):
        """Merge all sorted tmp files, pair by pair, until one remains;
        then rename it to "<orgfn>_SA" beside the main file and remove the
        tmp folder.  Recurses until a single file is left.
        [ orgfn ] = the file name of self.F
        [ path ]  = folder where all tmp files are saved"""
        L = os.listdir( path ) ## names of all remaining tmp files ##
        if len(L) != 1: # two or more files remain in the folder: keep merging
            # print len(L)
            # raw_input()
            i = 0 ## toggles between the odd (first) and even (second) file of each pair ##
            for k in L: # merge the files two by two
                i += 1
                if i == 1:
                    f1n = k ## remember the odd file's name ##
                elif i == 2:
                    f1 = open( path+f1n, "rb" ) ## open the odd file ##
                    f2 = open( path+k, "rb" ) ## open the even file ##
                    print f1n, "working with", k
                    fw = open( path+"tmp", "wb" ) ## merge every pair into the scratch file "tmp" ##
                    f1_ofst = unpack( 'L', f1.read(4) )[0] ## first offset from the odd file ##
                    self.F.seek( f1_ofst )
                    f1w = self.F.read(256).decode("utf16") ## read 256 bytes and decode to a string ##
                    f2_ofst = unpack( 'L', f2.read(4) )[0] ## first offset from the even file ##
                    self.F.seek( f2_ofst )
                    f2w = self.F.read(256).decode("utf16") ## its 128 chars of text ##
                    flag1 = flag2 = "on" ## flag1, flag2 mark whether each of the two files is exhausted ##
                    while flag1 == "on" or flag2 == "on":
                        if flag2 == "off":  # even file done: copy the rest of the odd file
                            fw.write( pack('L', f1_ofst) )
                            fw.write( f1.read() )
                            break
                        elif flag1 == "off":  # odd file done: copy the rest of the even file
                            fw.write( pack('L', f2_ofst) )
                            fw.write(f2.read())
                            break
                        elif f1w <= f2w:
                            # if f1w == f2w: # two equal 256-byte strings are worth checking -- possible duplicated content
                            #     print "***ofst_1: %d, ofst_2: %d following same contents !! file may be duplicated." % ( f1_ofst, f2_ofst )
                            fw.write( pack('L', f1_ofst) ) # 1 <= 2: emit 1
                            try: # unpack raising means nothing was left to read: that tmp file is exhausted
                                f1_ofst = unpack( 'L', f1.read(4) )[0]
                                self.F.seek( f1_ofst )
                                f1w = self.F.read(256).decode("utf16")
                            except:
                                flag1 = "off"
                        elif f1w > f2w:
                            fw.write( pack('L', f2_ofst) ) # 1 > 2: emit 2
                            try:
                                f2_ofst = unpack( 'L', f2.read(4) )[0]
                                self.F.seek( f2_ofst )
                                f2w = self.F.read(256).decode("utf16")
                            except:
                                flag2 = "off"
                    f1.close()
                    f2.close()
                    fw.close()
                    os.remove( path+f1n ) # merged: delete the odd file
                    os.remove( path+k ) # delete the even file
                    os.rename( path+"tmp", path+f1n ) # rename the scratch "tmp" to the odd file's name
                    i = 0
                    # print "check", f1n, k
                    # raw_input()
            self.__combinTmps__( path, orgfn )
        else:  # only one file left: it is the finished suffix array
            self.F.close()
            path2 = path.replace( "tmp/", "" )
            os.rename( path+L[0], path2+orgfn+"_SA" )
            os.rmdir( path[:-1] ) # path = ..../tmp/ << trailing / must go for rmdir
            print L[0], orgfn, "suffix array created"

    ####### Search #######
    def __backwardStr__( self, strg ):
        """Return a backward string of [ strg ]"""
        line = "" ## the string accumulated in reverse ##
        for k in strg:
            line = k + line
        return line

    def doSearch( self, KW, opt="" ):
        """Binary-search the suffix array for a term.
        Return ( counts, start offset ) or ( counts, concordance strings )
        [ KW ]           = the term to search for
        [ opt = "con" ]    = do concordance search in normal files
        [ opt = "con_bk" ] = do concordance search in backward files"""
        kw_len = len(KW)*2 ## byte length of the search term ##
        S_end = self.S_len/4 ## number of entries in the suffix array ##
        i = 0 ## binary-search lower bound (suffix-array entry index) ##
        j = self.S_len/4 ## binary-search upper bound ##
        # print "search for %s, len: %d, sa: %d, sa_s: %d, sa_e: %d" % (KW, kw_len, self.S_len, i, j)
        if self.S_len > 1000000: # cache exists: walk it first to narrow i, j without disk seeks
            ck = 1 ## slot index into the cache lists (implicit binary tree) ##
            while ck < 1024:
                k = self.cache_p[ck] ## cached suffix-array position ##
                w = self.cache_w[ck][0:len(KW)] ## cached text, trimmed to the term's length ##
                """#如果 cache 字串長度小於欲搜尋的字串長度. suffix array > 1mb 才做 cache, 不會遇到這種狀況
                try:
                    w = self.cache_w[ck][0:len(KW)]
                except:
                    w = self.cache_w[ck]"""
                if KW < w:
                    j = k-1
                    ck = ck*2  # descend to the left child
                elif KW > w:
                    i = k+1
                    ck = ck*2+1  # descend to the right child
                else:
                    break # exact hit: stop here; the narrowed i, j feed the next loop
            # print i, j
            # raw_input()
        ans = ( 0, " not found !!" ) ## default search result ##
        while i <= j:
            k = ( i+j )/2
            if k == S_end: # past the last entry: no data beyond it
                break
            self.S.seek( k*4 )
            offset = unpack( "L", self.S.read(4) )[0] ## main-file offset recorded at this suffix-array entry ##
            w = self.__getWords__( offset, kw_len ) ## read as many chars as KW has, starting at that offset ##
            # print w
            # raw_input()
            if KW == w: # first match found; now locate the full run of matches
                sa_start_p = k ## first and last suffix-array positions containing the term ##
                sa_end_p = k
                k_bk = k
                while i <= k_bk: # lower half: binary-search for the first occurrence
                    ks = ( i+k_bk )/2
                    self.S.seek( ks*4 )
                    offset = unpack( "L", self.S.read(4) )[0]
                    w = self.__getWords__( offset, kw_len ) # read len(KW) chars at this offset to compare
                    if KW == w:
                        sa_start_p = ks
                        k_bk = ks-1
                    elif KW != w:
                        i = ks+1
                        sa_start_p = i # ks is definitely not a match; the next slot is the earliest candidate
                while k <= j: # upper half: binary-search for the last occurrence
                    ks = ( k+j )/2
                    if ks == S_end:
                        break
                    self.S.seek( ks*4 )
                    offset = unpack( "L", self.S.read(4) )[0]
                    w = self.__getWords__( offset, kw_len ) # read len(KW) chars at this offset to compare
                    if KW == w:
                        sa_end_p = ks
                        k = ks+1
                    elif KW != w:
                        j = ks-1
                        sa_end_p = j
                c = sa_end_p - sa_start_p + 1 ## total hits of [ KW ] = last position - first position + 1 ##
                ans = ( c, sa_start_p*4 ) ## result: hit count and starting byte position inside the suffix-array file ##
                if opt == "con" or opt == "con_bk":
                    line = "" ## accumulated concordance output ##
                    self.S.seek( sa_start_p*4 )
                    for k in range( c ):
                        offset = unpack( 'L', self.S.read(4) )[0]
                        if offset < 22: # too near start of file: read from just after the BOM, (offset + kw bytes + 20) bytes
                            self.F.seek( 2 )
                            tmp = self.F.read( offset + kw_len + 20 ).decode("utf16").replace( "\n", u"。" )
                            if opt == "con_bk":
                                tmp = self.__backwardStr__( tmp )
                            line += "SOF> " + tmp + "\n"
                        else: # normal case: grab 40 extra bytes (20 before, 20 after = 10 chars each side) around the hit
                            self.F.seek( offset-20 )
                            tmp = self.F.read( 40 + kw_len ).decode("utf16").replace( "\n", u"。" )
                            if opt == "con_bk":
                                tmp = self.__backwardStr__( tmp )
                            line += tmp + "\n"
                    ans = ( c, line ) ## result: hit count and the concordance text ##
                break
            elif KW < w:
                j = k-1
            else:
                i = k+1
        return ans

    ####### Split a Suffix Array File #######
    def splitSuffixArray( self, n ):
        """Split a suffix array file when it's too large to extract terms.
        Writes parts named "<Suffix_A>0", "<Suffix_A>1", ... and the
        remainder as "<Suffix_A>e".  Each split point is advanced until
        the first character changes, so entries starting with the same
        character stay in one part.
        [ n ] = No. of splited file"""
        p = c = self.S_len/4/n ## p = moving split position; c = average entry count per part ##
        print self.S_len, p, n
        # pt = []
        i = 1
        for k in range( n-1):
            self.S.seek( p*4 )
            w_st = cp = ""
            while 1:
                self.F.seek( unpack( 'L', self.S.read(4) )[0] )
                p += 1 # advance one suffix-array entry
                w_st = self.F.read(2).decode('utf16')  # first character of the entry's text
                if cp != "" and cp != w_st:  # first character changed: split here
                    print cp, w_st, p-1
                    # pt.append( (p-1)*4 )
                    fo = open( self.Suffix_A + str(k), 'wb' )
                    self.S.seek( (i-1)*4 )
                    fo.write( self.S.read( (p-i+1)*4 ) ) # reads one extra entry
                    fo.close()
                    i = p
                    break
                cp = w_st
            p += c
        fo = open( self.Suffix_A + "e", 'wb' )  # remainder file
        self.S.seek( (i-1)*4 )
        fo.write( self.S.read() ) # reads one extra entry
        fo.close()


if __name__ == "__main__":
    # print SuffixArray.__doc__
    # print SuffixArray.getSuffixArray.__doc__
    # print SuffixArray.doSearch.__doc__
    # print SuffixArray.splitSuffixArray.__doc__
    # Demo: search the forward file, then the reversed term in the backward file.
    objS1 = SuffixArray( "./Files/test/test_f", "./Files/test/test_f_SA" )
    objS2 = SuffixArray( "./Files/test/test_b", "./Files/test/test_b_SA" )
    # objS1.splitSuffixArray( 4 ) # split the suffix array
    L = [u'佛說', u'T', u'正覺']
    for k in L:
        rst = objS1.doSearch( k, "con" )
        print rst[0]
        try:
            print rst[1]
        except:  # console cannot encode the text: fall back to hex escapes
            print optScreen( rst[1] )
        n = ""  # the term reversed, for the backward-file search
        for ks in k:
            n = ks + n
        rst = objS2.doSearch( n, "con_bk" )
        print rst[0]
        try:
            print rst[1]
        except:
            print optScreen( rst[1] )