From be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 04:57:58 +0200 Subject: Adding upstream version 1.44.3. Signed-off-by: Daniel Baumann --- web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx | 427 +++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx (limited to 'web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx') diff --git a/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx b/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx new file mode 100644 index 00000000..8109b475 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx @@ -0,0 +1,427 @@ +import "metadata.jsx"; +import "fm-index.jsx"; +import "binary-util.jsx"; +import "query.jsx"; +import "search-result.jsx"; +import "stemmer/stemmer.jsx"; +import "console.jsx"; + + +class Oktavia +{ + var _fmindex : FMIndex; + var _metadatas : Map.; + var _metadataLabels : string[]; + var _stemmer : Nullable.; + var _stemmingResult : Map.; + + // char code remap tables + var _utf162compressCode : string[]; + var _compressCode2utf16 : string[]; + + // sentinels + static const eof = String.fromCharCode(0); + static const eob = String.fromCharCode(1); + static const unknown = String.fromCharCode(3); + + function constructor () + { + this._fmindex = new FMIndex(); + this._metadatas = {} : Map.; + this._metadataLabels = [] : string[]; + this._stemmer = null; + this._stemmingResult = {} : Map.; + this._utf162compressCode = [Oktavia.eof, Oktavia.eob, Oktavia.unknown]; + this._utf162compressCode.length = 65536; + this._compressCode2utf16 = [Oktavia.eof, Oktavia.eob, Oktavia.unknown]; + } + + function setStemmer (stemmer : Stemmer) : void + { + this._stemmer = stemmer; + } + + function getPrimaryMetadata () : Metadata + { + return this._metadatas[this._metadataLabels[0]]; + } + + function addSection (key : string) : Section + { + if (this._metadataLabels.indexOf(key) != -1) + { + throw new Error('Metadata name ' + key + ' is already exists'); + } + this._metadataLabels.push(key); + var section = new Section(this); + this._metadatas[key] = section; + return section; + } + + function getSection (key : string) : Section + { + if (this._metadataLabels.indexOf(key) == -1) + { + throw new Error('Metadata name ' + key + " does't exists"); + } + return this._metadatas[key] as Section; + } + + function addSplitter (key : string) : Splitter + { + if (this._metadataLabels.indexOf(key) != -1) + { + throw new Error('Metadata name ' + key + ' is already exists'); + } + this._metadataLabels.push(key); + var splitter = new Splitter(this); + this._metadatas[key] = splitter; + return splitter; + } + + function getSplitter (key : string) : Splitter + { + if (this._metadataLabels.indexOf(key) == -1) + { + throw new Error('Metadata name ' + key + " does't exists"); + } + return this._metadatas[key] as Splitter; + } + + function addTable (key : string, headers : string[]) : Table + { + if (this._metadataLabels.indexOf(key) != -1) + { + throw new Error('Metadata name ' + key + ' is already exists'); + } + this._metadataLabels.push(key); + var table = new Table(this, headers); + this._metadatas[key] = table; + return table; + } + + function getTable (key : string) : Table + { + if (this._metadataLabels.indexOf(key) == -1) + { + throw new Error('Metadata name ' + key + " does't exists"); + } + return this._metadatas[key] as Table; + } + + function addBlock (key : string) : Block + { + if (this._metadataLabels.indexOf(key) != -1) + { + throw new Error('Metadata name ' + key + ' is already exists'); + } + this._metadataLabels.push(key); + var block = new Block(this); + this._metadatas[key] = block; + return block; + } + + function getBlock (key : string) : Block + { + if (this._metadataLabels.indexOf(key) == -1) + { + throw new Error('Metadata name ' + key + " does't exists"); + } + return this._metadatas[key] as Block; + } + + function addEndOfBlock () : void + { + this._fmindex.push(Oktavia.eob); + } + + function addWord (words : string) : void + { + var str = [] : string[]; + str.length = words.length; + for (var i = 0; i < words.length; i++) + { + var charCode = words.charCodeAt(i); + var newCharCode = this._utf162compressCode[charCode]; + if (newCharCode == null) + { + newCharCode = String.fromCharCode(this._compressCode2utf16.length); + this._utf162compressCode[charCode] = newCharCode; + this._compressCode2utf16.push(String.fromCharCode(charCode)); + } + str.push(newCharCode); + } + this._fmindex.push(str.join('')); + } + + function addWord (words : string, stemming : boolean) : void + { + this.addWord(words); + var wordList = words.split(/\s+/); + for (var i = 0; i < wordList.length; i++) + { + var originalWord = wordList[i]; + var smallWord = originalWord.slice(0, 1).toLowerCase() + originalWord.slice(1); + var registerWord : Nullable. = null; + if (stemming && this._stemmer) + { + var baseWord = this._stemmer.stemWord(originalWord.toLowerCase()); + if (originalWord.indexOf(baseWord) == -1) + { + registerWord = baseWord; + } + } + else if (originalWord != smallWord) + { + registerWord = smallWord; + } + if (registerWord) + { + var compressedCodeWord = this._convertToCompressionCode(originalWord); + var stemmedList = this._stemmingResult[registerWord]; + if (!stemmedList) + { + stemmedList = [compressedCodeWord]; + this._stemmingResult[registerWord] = stemmedList; + } + else if (stemmedList.indexOf(compressedCodeWord) == -1) + { + stemmedList.push(compressedCodeWord); + } + } + } + } + + function _convertToCompressionCode (keyword : string) : string + { + var resultChars = [] : string[]; + for (var i = 0; i < keyword.length; i++) + { + var chr = this._utf162compressCode[keyword.charCodeAt(i)]; + if (chr == null) + { + resultChars.push(Oktavia.unknown); + } + else + { + resultChars.push(chr); + } + } + return resultChars.join(''); + } + + function rawSearch (keyword : string, stemming : boolean) : int[] + { + var result : int[]; + if (stemming) + { + result = [] : int[]; + if (this._stemmer) + { + var baseWord = this._stemmer.stemWord(keyword.toLowerCase()); + var stemmedList = this._stemmingResult[baseWord]; + if (stemmedList) + { + for (var i = 0; i < stemmedList.length; i++) + { + var word = stemmedList[i]; + result = result.concat(this._fmindex.search(word)); + } + } + } + } + else + { + result = this._fmindex.search(this._convertToCompressionCode(keyword)); + } + return result; + } + + function search (queries : Query[]) : SearchSummary + { + var summary = new SearchSummary(this); + for (var i = 0; i < queries.length; i++) + { + summary.addQuery(this._searchQuery(queries[i])); + } + summary.mergeResult(); + return summary; + } + + function _searchQuery (query : Query) : SingleResult + { + var result = new SingleResult(query.word, query.or, query.not); + var positions : int[]; + if (query.raw) + { + positions = this.rawSearch(query.word, false); + } + else + { + positions = this.rawSearch(query.word, false).concat(this.rawSearch(query.word, true)); + } + this.getPrimaryMetadata().grouping(result, positions, query.word, !query.raw); + return result; + } + + function build () : void + { + this.build(5, false); + } + + function build (cacheDensity : int, verbose : boolean) : void + { + for (var key in this._metadatas) + { + this._metadatas[key]._build(); + } + var cacheRange = Math.round(Math.max(1, (100 / Math.min(100, Math.max(0.01, cacheDensity))))); + var maxChar = this._compressCode2utf16.length; + this._fmindex.build(Oktavia.eof, maxChar, cacheRange, verbose); + } + + function dump () : string + { + return this.dump(false); + } + + function dump (verbose : boolean) : string + { + var headerSource = "oktavia-01"; + var header = Binary.dumpString(headerSource).slice(1); + if (verbose) + { + console.log("Source text size: " + (this._fmindex.size() * 2) as string + ' bytes'); + } + var fmdata = this._fmindex.dump(verbose); + var result = [ + header, + fmdata + ]; + + result.push(Binary.dump16bitNumber(this._compressCode2utf16.length)); + for (var i = 3; i < this._compressCode2utf16.length; i++) + { + result.push(this._compressCode2utf16[i]); + } + if (verbose) + { + console.log('Char Code Map: ' + (this._compressCode2utf16.length * 2 - 2) as string + ' bytes'); + } + + var report = new CompressionReport(); + result.push(Binary.dumpStringListMap(this._stemmingResult, report)); + if (verbose) + { + console.log('Stemmed Word Table: ' + (result[result.length - 1].length) as string + ' bytes (' + report.rate() as string + '%)'); + } + + result.push(Binary.dump16bitNumber(this._metadataLabels.length)); + for (var i = 0; i < this._metadataLabels.length; i++) + { + var report = new CompressionReport(); + var name = this._metadataLabels[i]; + var data = this._metadatas[name]._dump(report); + result.push(Binary.dumpString(name, report), data); + if (verbose) + { + console.log('Meta Data ' + name + ': ' + (data.length * 2) as string + ' bytes (' + report.rate() as string + '%)'); + } + } + return result.join(''); + } + + function load (data : string) : void + { + var headerSource = "oktavia-01"; + var header = Binary.dumpString(headerSource).slice(1); + if (data.slice(0, 5) != header) + { + throw new Error('Invalid data file'); + } + this._metadatas = {} : Map.; + this._metadataLabels = [] : string[]; + + var offset = 5; + offset = this._fmindex.load(data, offset); + var charCodeCount = Binary.load16bitNumber(data, offset++); + this._compressCode2utf16 = [Oktavia.eof, Oktavia.eob, Oktavia.unknown]; + this._utf162compressCode = [Oktavia.eof, Oktavia.eob, Oktavia.unknown]; + for (var i = 3; i < charCodeCount; i++) + { + var charCode = Binary.load16bitNumber(data, offset++); + this._compressCode2utf16.push(String.fromCharCode(charCode)); + this._utf162compressCode[charCode] = String.fromCharCode(i); + } + + var stemmedWords = Binary.loadStringListMap(data, offset); + this._stemmingResult = stemmedWords.result; + offset = stemmedWords.offset; + + var metadataCount = Binary.load16bitNumber(data, offset++); + for (var i = 0; i < metadataCount; i++) + { + var nameResult = Binary.loadString(data, offset); + var name = nameResult.result; + var offset = nameResult.offset; + var type = Binary.load16bitNumber(data, offset++); + switch (type) + { + case 0: + offset = Section._load(this, name, data, offset); + break; + case 1: + offset = Splitter._load(this, name, data, offset); + break; + case 2: + offset = Table._load(this, name, data, offset); + break; + case 3: + offset = Block._load(this, name, data, offset); + break; + } + } + } + + function contentSize () : int + { + return this._fmindex.contentSize(); + } + + function wordPositionType (position : int) : int + { + var result = 0; + if (position == 0) + { + result = 4; + } + else + { + var ahead = this._fmindex.getSubstring(position - 1, 1); + if (/\s/.test(ahead)) + { + result = 2; + } + else if (/\W/.test(ahead)) + { + result = 1; + } + else if (Oktavia.eob == ahead) + { + result = 3; + } + } + return result; + } + + function _getSubstring (position : int, length : int) : string + { + var result = this._fmindex.getSubstring(position, length); + var str = [] : string[]; + for (var i = 0; i < result.length; i++) + { + str.push(this._compressCode2utf16[result.charCodeAt(i)]); + } + return str.join(''); + } +} -- cgit v1.2.3