#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Suffix-array construction, search and splitting for UTF-16 text files.

Created by Lee, Chia-ming (2006 winter) / The Chung Hwa Institute of Buddhist Studies

A suffix array here is a separate binary file of fixed-width byte offsets
into the main text file, sorted by the text that starts at each offset.

NOTE(review): this module is Python 2 only (print statements, `cmp`,
`dict.items()` returning a list, integer `/` division).
NOTE(review): offsets are written with struct format 'L' but read back
exactly 4 bytes at a time; native 'L' is 8 bytes on 64-bit Unix, so this
code assumes a platform with 4-byte native long (32-bit or Windows) --
confirm, or switch to the standard-size format '<L', before porting.
"""
import os, re
from struct import *
from time import time, strftime, gmtime


def optScreen( strings ):
    """Make a unicode string printable on a cp950 console.

    Characters that cannot be encoded in cp950 are replaced by the hex
    form of their code point, e.g. "[0x4f5b]"; encodable characters pass
    through unchanged.  Returns the resulting byte string."""
    line = ""  # accumulated printable result
    for k in strings:
        try:
            line += k.encode('cp950')
        except:  # NOTE(review): bare except; presumably UnicodeEncodeError only
            line += "["+hex(ord(k))+"]"
    return line


class SuffixArray:
    """Build, search and split suffix arrays for a UTF-16 main text file.

    The main file handle (self.F) and suffix-array handle (self.S) are
    shared by all methods and seeked freely; instances are therefore not
    safe for concurrent use."""

    def __init__( self, fname, suffix="" ):
        """[ fname ]  = the main text file (UTF-16 with a 2-byte BOM)
        [ suffix ] = its suffix-array file; required for doSearch() and
                     splitSuffixArray(); leave empty for getSuffixArray()"""
        self.F = open( fname, 'rb' ) ## handle of the main file ##
        if suffix != "":
            self.S = open( suffix, 'rb' ) ## handle of the main file's suffix array ##
            self.S_len = os.stat( suffix )[6] ## size of the suffix-array file in bytes ##
            if self.S_len > 1000000: # build the lookup cache only for suffix arrays over ~1 MB
                self.cache_p = [0]*1024 ## cache of suffix-array positions (implicit binary tree) ##
                self.cache_w = [""]*1024 ## text that each self.cache_p entry points at (via self.S into self.F) ##
                self.__getCache__( 0, self.S_len/4, 1 )
        self.File_main = fname ## name of the main file ##
        self.Suffix_A = suffix ## name of the suffix-array file ##

    def __getCache__( self, i, j, c ):
        """Recursively fill the two search caches with the first ten levels
        of the binary search, stored as an implicit binary tree (children
        of slot c live at 2c and 2c+1).
        [ i ] = start position (suffix-array entry index)
        [ j ] = end position (suffix-array entry index)
        [ c ] = slot in the cache lists"""
        if c < 1024:
            k = ( i+j )/2 ## midpoint ( k ) of the range ( i, j ) ##
            self.cache_p[c] = k
            self.S.seek( k*4 )
            ofst = unpack( 'L', self.S.read(4) )[0] ## main-file offset stored at entry k of self.S ##
            w = self.__getWords__( ofst, 64 ) ## read 64 bytes (32 chars) of self.F at that offset ##
            self.cache_w[c] = w
            self.__getCache__( i, k, c*2 )
            self.__getCache__( k, j, c*2+1)

    def __getWords__( self, ofst, length ):
        """Return a decoded string read from the main file( self.F ).
        [ ofst ]   = byte offset into self.F
        [ length ] = number of bytes to read (UTF-16: 2 bytes per char)"""
        self.F.seek( ofst )
        return self.F.read( length ).decode('utf16')

    ####### Creat Suffix Array #######
    def getSuffixArray( self, cache=1000000 ):
        """Create a suffix array file for self.F (written beside it as
        "<name>_SA").  Scans the file, flushing sorted chunks of offsets
        into a tmp folder, then merge-sorts the chunks pairwise.
        [ cache ] = default is 1mb( for 256 mb memery). Suggest 3mb for 512 or highter memery."""
        Ts = time() ## start time ##
        print self.File_main
        p = os.path.split( self.File_main ) ## split the name: p[0] = directory, p[1] = file name ##
        if os.path.isdir( p[0]+"/tmp" ): # create the tmp folder if it does not exist
            pass
        else:
            os.mkdir( p[0]+"/tmp" )
        tmpath = p[0]+"/tmp/" ## full path of the tmp folder ##
        self.F.seek(2) # utf16 file: skip the 2-byte BOM first
        D = {} ## staging dict of offset -> text; sorted, flushed and cleared once the cache size is reached ##
        i = c = n = 0 ## i = bytes of self.F consumed, c = CJK chars accepted, n = count of sorted tmp files ##
        # fw_ref = open( self.File_main+"_ref", "w" ) ## for Cbeta: builds the offset -> sutra index ##
        while 1:
            i += 2
            offset = self.F.tell() ## current offset into self.F ##
            word = self.F.read(256).decode("utf16") ## read 256 bytes ahead of each offset (a 128-char string) ##
            # try:
            #     word.encode("cp950")
            # except:
            #     word = optScreen( word )
            # print "[%d: %s]\n%s" % ( offset, word[0], word )
            # raw_input()
            # self.F.seek( offset + 2 )
            if word == "": # end of file: flush the final chunk
                n += 1
                self.__saveTmpFiles__( D, n, tmpath )
                D = {}
                print u" Total %d bytes, %d chi-char. tmp files saving down" % ( i, c )
                break
            else:
                if word[0] != "\n" and word[0] > "~": # keep only suffixes starting at a CJK char; ***adjust this test for other file formats
                    c += 1
                    D[offset] = word
                """ #Cbeta 做 concordance 需要每個 offset 屬於哪一部經的索引#
                if word[0] != "\n" and word[0] != "n" and word[0] > "9":
                    if word[0] == "T" or word[0] == "X":
                        sutra_ref = word.split( "\n", 1 )
                        fw_ref.write( sutra_ref[0] + "\t" + str(offset) + "\n" )
                    else:
                        c += 1
                        D[offset] = word"""
                self.F.seek( offset + 2 )
                if i % cache == 0: # every `cache` bytes (default 1mb) sort the current chunk and flush it
                    n += 1
                    self.__saveTmpFiles__( D, n, tmpath )
                    D = {}
                    print u" so far %d bytes, %d chi-chars processed." % ( i, c )
                    # raw_input()
        # fw_ref.close()
        Tm = time() ## time after all sorted chunks are flushed ##
        print strftime('%H:%M:%S', gmtime(Tm-Ts))
        self.__combinTmps__( tmpath, p[1] ) # merge the sorted suffix-array tmp files
        Te = time() ## end time ##
        print strftime('%H:%M:%S', gmtime(Te-Ts))

    def __saveTmpFiles__( self, Dic, no, path ):
        """Sort a staging dict by its text and save the offsets as one tmp file.
        [ Dic ]  = {offset: 128 chi-char, ...} ex: {28: 'chi-string...', ...}
        [ no ]   = sequence number of the tmp file (also its file name)
        [ path ] = folder of the tmp files"""
        L = Dic.items() ## dict -> list of (offset, text) pairs ##
        L.sort( lambda x,y: cmp(x[1], y[1]) )  # sort by the text, not the offset
        fw = open( path+str(no), 'wb' ) ## open the tmp file to write ##
        for k in L:
            fw.write( pack('L', k[0]) )  # store only the offset; the text is re-read from self.F when merging
            # print "offset: %d, string: %s" % ( k[0], k[1][0])
            # raw_input()
        fw.close()
        print "tmp No. %d sorted & saved, include %d chi-chars." % ( no, len(L) )

    def __combinTmps__( self, path, orgfn ):
        """Merge all sorted tmp files, pair by pair, until one remains;
        then rename it to "<orgfn>_SA" beside the main file and remove the
        tmp folder.  Recurses until a single file is left.
        [ orgfn ] = the file name of self.F
        [ path ]  = folder where all tmp files are saved"""
        L = os.listdir( path ) ## names of all remaining tmp files ##
        if len(L) != 1: # two or more files remain in the folder: keep merging
            # print len(L)
            # raw_input()
            i = 0 ## toggles between the odd (first) and even (second) file of each pair ##
            for k in L: # merge the files two by two
                i += 1
                if i == 1:
                    f1n = k ## remember the odd file's name ##
                elif i == 2:
                    f1 = open( path+f1n, "rb" ) ## open the odd file ##
                    f2 = open( path+k, "rb" ) ## open the even file ##
                    print f1n, "working with", k
                    fw = open( path+"tmp", "wb" ) ## merge every pair into the scratch file "tmp" ##
                    f1_ofst = unpack( 'L', f1.read(4) )[0] ## first offset from the odd file ##
                    self.F.seek( f1_ofst )
                    f1w = self.F.read(256).decode("utf16") ## read 256 bytes and decode to a string ##
                    f2_ofst = unpack( 'L', f2.read(4) )[0] ## first offset from the even file ##
                    self.F.seek( f2_ofst )
                    f2w = self.F.read(256).decode("utf16") ## its 128 chars of text ##
                    flag1 = flag2 = "on" ## flag1, flag2 mark whether each of the two files is exhausted ##
                    while flag1 == "on" or flag2 == "on":
                        if flag2 == "off":  # even file done: copy the rest of the odd file
                            fw.write( pack('L', f1_ofst) )
                            fw.write( f1.read() )
                            break
                        elif flag1 == "off":  # odd file done: copy the rest of the even file
                            fw.write( pack('L', f2_ofst) )
                            fw.write(f2.read())
                            break
                        elif f1w <= f2w:
                            # if f1w == f2w: # two equal 256-byte strings are worth checking -- possible duplicated content
                            #     print "***ofst_1: %d, ofst_2: %d following same contents !! file may be duplicated." % ( f1_ofst, f2_ofst )
                            fw.write( pack('L', f1_ofst) ) # 1 <= 2: emit 1
                            try: # unpack raising means nothing was left to read: that tmp file is exhausted
                                f1_ofst = unpack( 'L', f1.read(4) )[0]
                                self.F.seek( f1_ofst )
                                f1w = self.F.read(256).decode("utf16")
                            except:
                                flag1 = "off"
                        elif f1w > f2w:
                            fw.write( pack('L', f2_ofst) ) # 1 > 2: emit 2
                            try:
                                f2_ofst = unpack( 'L', f2.read(4) )[0]
                                self.F.seek( f2_ofst )
                                f2w = self.F.read(256).decode("utf16")
                            except:
                                flag2 = "off"
                    f1.close()
                    f2.close()
                    fw.close()
                    os.remove( path+f1n ) # merged: delete the odd file
                    os.remove( path+k ) # delete the even file
                    os.rename( path+"tmp", path+f1n ) # rename the scratch "tmp" to the odd file's name
                    i = 0
                    # print "check", f1n, k
                    # raw_input()
            self.__combinTmps__( path, orgfn )
        else:  # only one file left: it is the finished suffix array
            self.F.close()
            path2 = path.replace( "tmp/", "" )
            os.rename( path+L[0], path2+orgfn+"_SA" )
            os.rmdir( path[:-1] ) # path = ..../tmp/ << trailing / must go for rmdir
            print L[0], orgfn, "suffix array created"

    ####### Search #######
    def __backwardStr__( self, strg ):
        """Return a backward string of [ strg ]"""
        line = "" ## the string accumulated in reverse ##
        for k in strg:
            line = k + line
        return line

    def doSearch( self, KW, opt="" ):
        """Binary-search the suffix array for a term.
        Return ( counts, start offset ) or ( counts, concordance strings )
        [ KW ]           = the term to search for
        [ opt = "con" ]    = do concordance search in normal files
        [ opt = "con_bk" ] = do concordance search in backward files"""
        kw_len = len(KW)*2 ## byte length of the search term ##
        S_end = self.S_len/4 ## number of entries in the suffix array ##
        i = 0 ## binary-search lower bound (suffix-array entry index) ##
        j = self.S_len/4 ## binary-search upper bound ##
        # print "search for %s, len: %d, sa: %d, sa_s: %d, sa_e: %d" % (KW, kw_len, self.S_len, i, j)
        if self.S_len > 1000000: # cache exists: walk it first to narrow i, j without disk seeks
            ck = 1 ## slot index into the cache lists (implicit binary tree) ##
            while ck < 1024:
                k = self.cache_p[ck] ## cached suffix-array position ##
                w = self.cache_w[ck][0:len(KW)] ## cached text, trimmed to the term's length ##
                """#如果 cache 字串長度小於欲搜尋的字串長度. suffix array > 1mb 才做 cache, 不會遇到這種狀況
                try:
                    w = self.cache_w[ck][0:len(KW)]
                except:
                    w = self.cache_w[ck]"""
                if KW < w:
                    j = k-1
                    ck = ck*2  # descend to the left child
                elif KW > w:
                    i = k+1
                    ck = ck*2+1  # descend to the right child
                else:
                    break # exact hit: stop here; the narrowed i, j feed the next loop
            # print i, j
            # raw_input()
        ans = ( 0, " not found !!" ) ## default search result ##
        while i <= j:
            k = ( i+j )/2
            if k == S_end: # past the last entry: no data beyond it
                break
            self.S.seek( k*4 )
            offset = unpack( "L", self.S.read(4) )[0] ## main-file offset recorded at this suffix-array entry ##
            w = self.__getWords__( offset, kw_len ) ## read as many chars as KW has, starting at that offset ##
            # print w
            # raw_input()
            if KW == w: # first match found; now locate the full run of matches
                sa_start_p = k ## first and last suffix-array positions containing the term ##
                sa_end_p = k
                k_bk = k
                while i <= k_bk: # lower half: binary-search for the first occurrence
                    ks = ( i+k_bk )/2
                    self.S.seek( ks*4 )
                    offset = unpack( "L", self.S.read(4) )[0]
                    w = self.__getWords__( offset, kw_len ) # read len(KW) chars at this offset to compare
                    if KW == w:
                        sa_start_p = ks
                        k_bk = ks-1
                    elif KW != w:
                        i = ks+1
                        sa_start_p = i # ks is definitely not a match; the next slot is the earliest candidate
                while k <= j: # upper half: binary-search for the last occurrence
                    ks = ( k+j )/2
                    if ks == S_end:
                        break
                    self.S.seek( ks*4 )
                    offset = unpack( "L", self.S.read(4) )[0]
                    w = self.__getWords__( offset, kw_len ) # read len(KW) chars at this offset to compare
                    if KW == w:
                        sa_end_p = ks
                        k = ks+1
                    elif KW != w:
                        j = ks-1
                        sa_end_p = j
                c = sa_end_p - sa_start_p + 1 ## total hits of [ KW ] = last position - first position + 1 ##
                ans = ( c, sa_start_p*4 ) ## result: hit count and starting byte position inside the suffix-array file ##
                if opt == "con" or opt == "con_bk":
                    line = "" ## accumulated concordance output ##
                    self.S.seek( sa_start_p*4 )
                    for k in range( c ):
                        offset = unpack( 'L', self.S.read(4) )[0]
                        if offset < 22: # too near start of file: read from just after the BOM, (offset + kw bytes + 20) bytes
                            self.F.seek( 2 )
                            tmp = self.F.read( offset + kw_len + 20 ).decode("utf16").replace( "\n", u"。" )
                            if opt == "con_bk":
                                tmp = self.__backwardStr__( tmp )
                            line += "SOF> " + tmp + "\n"
                        else: # normal case: grab 40 extra bytes (20 before, 20 after = 10 chars each side) around the hit
                            self.F.seek( offset-20 )
                            tmp = self.F.read( 40 + kw_len ).decode("utf16").replace( "\n", u"。" )
                            if opt == "con_bk":
                                tmp = self.__backwardStr__( tmp )
                            line += tmp + "\n"
                    ans = ( c, line ) ## result: hit count and the concordance text ##
                break
            elif KW < w:
                j = k-1
            else:
                i = k+1
        return ans

    ####### Split a Suffix Array File #######
    def splitSuffixArray( self, n ):
        """Split a suffix array file when it's too large to extract terms.
        Writes parts named "<Suffix_A>0", "<Suffix_A>1", ... and the
        remainder as "<Suffix_A>e".  Each split point is advanced until
        the first character changes, so entries starting with the same
        character stay in one part.
        [ n ] = No. of splited file"""
        p = c = self.S_len/4/n ## p = moving split position; c = average entry count per part ##
        print self.S_len, p, n
        # pt = []
        i = 1
        for k in range( n-1):
            self.S.seek( p*4 )
            w_st = cp = ""
            while 1:
                self.F.seek( unpack( 'L', self.S.read(4) )[0] )
                p += 1 # advance one suffix-array entry
                w_st = self.F.read(2).decode('utf16')  # first character of the entry's text
                if cp != "" and cp != w_st:  # first character changed: split here
                    print cp, w_st, p-1
                    # pt.append( (p-1)*4 )
                    fo = open( self.Suffix_A + str(k), 'wb' )
                    self.S.seek( (i-1)*4 )
                    fo.write( self.S.read( (p-i+1)*4 ) ) # reads one extra entry
                    fo.close()
                    i = p
                    break
                cp = w_st
            p += c
        fo = open( self.Suffix_A + "e", 'wb' )  # remainder file
        self.S.seek( (i-1)*4 )
        fo.write( self.S.read() ) # reads one extra entry
        fo.close()


if __name__ == "__main__":
    # print SuffixArray.__doc__
    # print SuffixArray.getSuffixArray.__doc__
    # print SuffixArray.doSearch.__doc__
    # print SuffixArray.splitSuffixArray.__doc__
    # Demo: search the forward file, then the reversed term in the backward file.
    objS1 = SuffixArray( "./Files/test/test_f", "./Files/test/test_f_SA" )
    objS2 = SuffixArray( "./Files/test/test_b", "./Files/test/test_b_SA" )
    # objS1.splitSuffixArray( 4 ) # split the suffix array
    L = [u'佛說', u'T', u'正覺']
    for k in L:
        rst = objS1.doSearch( k, "con" )
        print rst[0]
        try:
            print rst[1]
        except:  # console cannot encode the text: fall back to hex escapes
            print optScreen( rst[1] )
        n = ""  # the term reversed, for the backward-file search
        for ks in k:
            n = ks + n
        rst = objS2.doSearch( n, "con_bk" )
        print rst[0]
        try:
            print rst[1]
        except:
            print optScreen( rst[1] )