summaryrefslogtreecommitdiffstats
path: root/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx
diff options
context:
space:
mode:
Diffstat (limited to 'web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx')
-rw-r--r--web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx427
1 files changed, 0 insertions, 427 deletions
diff --git a/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx b/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx
deleted file mode 100644
index 8109b4759..000000000
--- a/web/server/h2o/libh2o/misc/oktavia/src/oktavia.jsx
+++ /dev/null
@@ -1,427 +0,0 @@
-import "metadata.jsx";
-import "fm-index.jsx";
-import "binary-util.jsx";
-import "query.jsx";
-import "search-result.jsx";
-import "stemmer/stemmer.jsx";
-import "console.jsx";
-
-
-/**
- * Search front end that owns an FMIndex and a set of named metadata objects,
- * and remaps UTF-16 characters to compact "compress codes" before the text
- * is handed to the index.
- */
-class Oktavia
-{
- // index that receives the text as compress-code strings (see addWord)
- var _fmindex : FMIndex;
- // registered metadata objects, keyed by label
- var _metadatas : Map.<Metadata>;
- // labels in registration order; the label at index 0 is the primary metadata
- var _metadataLabels : string[];
- // optional stemmer; stays null until setStemmer() is called
- var _stemmer : Nullable.<Stemmer>;
- // derived word form -> list of compress-code words it was derived from
- var _stemmingResult : Map.<string[]>;
-
- // char code remap tables
- // indexed by UTF-16 code unit, value is the one-character compress code
- var _utf162compressCode : string[];
- // indexed by compress code, value is the original UTF-16 character
- var _compressCode2utf16 : string[];
-
- // sentinels
- // eof is passed to FMIndex.build(); eob is pushed by addEndOfBlock();
- // unknown is what _convertToCompressionCode() emits for unregistered characters.
- // NOTE(review): both remap tables start with three entries, so the first
- // character registered by addWord() gets code 3, the same value as
- // `unknown` below. Confirm whether that overlap is intended.
- static const eof = String.fromCharCode(0);
- static const eob = String.fromCharCode(1);
- static const unknown = String.fromCharCode(3);
-
- /**
-  * Creates an empty engine: a fresh FMIndex, no metadata, no stemmer, an
-  * empty stemming table, and remap tables seeded with the three sentinels.
-  */
- function constructor ()
- {
-     // compress code -> UTF-16 table, seeded with the sentinels
-     this._compressCode2utf16 = [Oktavia.eof, Oktavia.eob, Oktavia.unknown];
-
-     // UTF-16 -> compress code table: same seed, one slot per 16-bit code unit
-     var forwardTable = [Oktavia.eof, Oktavia.eob, Oktavia.unknown];
-     forwardTable.length = 65536;
-     this._utf162compressCode = forwardTable;
-
-     this._stemmingResult = {} : Map.<string[]>;
-     this._stemmer = null;
-     this._metadataLabels = [] : string[];
-     this._metadatas = {} : Map.<Metadata>;
-     this._fmindex = new FMIndex();
- }
-
- /**
-  * Installs the stemmer used by addWord(words, stemming) and rawSearch().
-  *
-  * @param stemmer stemmer instance to use from now on
-  */
- function setStemmer (stemmer : Stemmer) : void
- {
-     this._stemmer = stemmer;
- }
-
- /**
-  * Returns the metadata object whose label was registered first.
-  *
-  * @return the metadata stored under the first entry of the label list
-  */
- function getPrimaryMetadata () : Metadata
- {
-     var primaryLabel = this._metadataLabels[0];
-     return this._metadatas[primaryLabel];
- }
-
- /**
-  * Creates a Section and registers it under the given label.
-  *
-  * @param key label for the new metadata; must not be registered yet
-  * @return the newly created Section
-  * @throws Error when the label is already registered
-  */
- function addSection (key : string) : Section
- {
-     if (this._metadataLabels.indexOf(key) != -1)
-     {
-         throw new Error('Metadata name ' + key + ' already exists');
-     }
-     // the label list keeps registration order; the map gives lookup by label
-     this._metadataLabels.push(key);
-     var section = new Section(this);
-     this._metadatas[key] = section;
-     return section;
- }
-
- /**
-  * Looks up a previously registered metadata object and returns it as a Section.
-  *
-  * @param key label the metadata was registered under
-  * @return the stored metadata cast to Section
-  * @throws Error when no metadata is registered under the label
-  */
- function getSection (key : string) : Section
- {
-     if (this._metadataLabels.indexOf(key) == -1)
-     {
-         throw new Error('Metadata name ' + key + " doesn't exist");
-     }
-     return this._metadatas[key] as Section;
- }
-
- /**
-  * Creates a Splitter and registers it under the given label.
-  *
-  * @param key label for the new metadata; must not be registered yet
-  * @return the newly created Splitter
-  * @throws Error when the label is already registered
-  */
- function addSplitter (key : string) : Splitter
- {
-     if (this._metadataLabels.indexOf(key) != -1)
-     {
-         throw new Error('Metadata name ' + key + ' already exists');
-     }
-     // the label list keeps registration order; the map gives lookup by label
-     this._metadataLabels.push(key);
-     var splitter = new Splitter(this);
-     this._metadatas[key] = splitter;
-     return splitter;
- }
-
- /**
-  * Looks up a previously registered metadata object and returns it as a Splitter.
-  *
-  * @param key label the metadata was registered under
-  * @return the stored metadata cast to Splitter
-  * @throws Error when no metadata is registered under the label
-  */
- function getSplitter (key : string) : Splitter
- {
-     if (this._metadataLabels.indexOf(key) == -1)
-     {
-         throw new Error('Metadata name ' + key + " doesn't exist");
-     }
-     return this._metadatas[key] as Splitter;
- }
-
- /**
-  * Creates a Table with the given headers and registers it under the given label.
-  *
-  * @param key label for the new metadata; must not be registered yet
-  * @param headers header list passed through to the Table constructor
-  * @return the newly created Table
-  * @throws Error when the label is already registered
-  */
- function addTable (key : string, headers : string[]) : Table
- {
-     if (this._metadataLabels.indexOf(key) != -1)
-     {
-         throw new Error('Metadata name ' + key + ' already exists');
-     }
-     // the label list keeps registration order; the map gives lookup by label
-     this._metadataLabels.push(key);
-     var table = new Table(this, headers);
-     this._metadatas[key] = table;
-     return table;
- }
-
- /**
-  * Looks up a previously registered metadata object and returns it as a Table.
-  *
-  * @param key label the metadata was registered under
-  * @return the stored metadata cast to Table
-  * @throws Error when no metadata is registered under the label
-  */
- function getTable (key : string) : Table
- {
-     if (this._metadataLabels.indexOf(key) == -1)
-     {
-         throw new Error('Metadata name ' + key + " doesn't exist");
-     }
-     return this._metadatas[key] as Table;
- }
-
- /**
-  * Creates a Block and registers it under the given label.
-  *
-  * @param key label for the new metadata; must not be registered yet
-  * @return the newly created Block
-  * @throws Error when the label is already registered
-  */
- function addBlock (key : string) : Block
- {
-     if (this._metadataLabels.indexOf(key) != -1)
-     {
-         throw new Error('Metadata name ' + key + ' already exists');
-     }
-     // the label list keeps registration order; the map gives lookup by label
-     this._metadataLabels.push(key);
-     var block = new Block(this);
-     this._metadatas[key] = block;
-     return block;
- }
-
- /**
-  * Looks up a previously registered metadata object and returns it as a Block.
-  *
-  * @param key label the metadata was registered under
-  * @return the stored metadata cast to Block
-  * @throws Error when no metadata is registered under the label
-  */
- function getBlock (key : string) : Block
- {
-     if (this._metadataLabels.indexOf(key) == -1)
-     {
-         throw new Error('Metadata name ' + key + " doesn't exist");
-     }
-     return this._metadatas[key] as Block;
- }
-
- /**
-  * Appends the end-of-block sentinel to the index.
-  */
- function addEndOfBlock () : void
- {
-     var blockTerminator = Oktavia.eob;
-     this._fmindex.push(blockTerminator);
- }
-
- /**
-  * Converts the text to compress codes, registering every character that has
-  * not been seen before, and appends the converted text to the index.
-  *
-  * @param words text to append
-  */
- function addWord (words : string) : void
- {
-     var length = words.length;
-     // Pre-size once and fill by index. Pushing onto a pre-sized array would
-     // leave `length` empty slots in front of the real codes.
-     var codes = [] : string[];
-     codes.length = length;
-     for (var i = 0; i < length; i++)
-     {
-         var charCode = words.charCodeAt(i);
-         var code = this._utf162compressCode[charCode];
-         if (code == null)
-         {
-             // first occurrence: hand out the next free code and record
-             // the mapping in both directions
-             code = String.fromCharCode(this._compressCode2utf16.length);
-             this._utf162compressCode[charCode] = code;
-             this._compressCode2utf16.push(String.fromCharCode(charCode));
-         }
-         codes[i] = code;
-     }
-     this._fmindex.push(codes.join(''));
- }
-
- /**
-  * Appends the text to the index and records alternative lookup forms for
-  * each whitespace-separated word in the stemming table.
-  *
-  * With stemming enabled and a stemmer installed, the stem of the lowercased
-  * word is recorded when the original word does not already contain it.
-  * Otherwise the word with its first character lowercased is recorded when
-  * that differs from the original.
-  *
-  * @param words text to append
-  * @param stemming whether to use the installed stemmer for alternative forms
-  */
- function addWord (words : string, stemming : boolean) : void
- {
-     this.addWord(words);
-     var wordList = words.split(/\s+/);
-     for (var i = 0; i < wordList.length; i++)
-     {
-         var originalWord = wordList[i];
-         var smallWord = originalWord.slice(0, 1).toLowerCase() + originalWord.slice(1);
-         var registerWord : Nullable.<string> = null;
-         if (stemming && this._stemmer)
-         {
-             var baseWord = this._stemmer.stemWord(originalWord.toLowerCase());
-             if (originalWord.indexOf(baseWord) == -1)
-             {
-                 registerWord = baseWord;
-             }
-         }
-         else if (originalWord != smallWord)
-         {
-             registerWord = smallWord;
-         }
-         if (registerWord)
-         {
-             var compressedCodeWord = this._convertToCompressionCode(originalWord);
-             // Only trust entries stored on the map itself. A plain lookup for
-             // a key such as "constructor" or "toString" would return an
-             // inherited member instead of a word list.
-             var stemmedList : string[] = null;
-             if (this._stemmingResult.hasOwnProperty(registerWord))
-             {
-                 stemmedList = this._stemmingResult[registerWord];
-             }
-             if (!stemmedList)
-             {
-                 stemmedList = [compressedCodeWord];
-                 this._stemmingResult[registerWord] = stemmedList;
-             }
-             else if (stemmedList.indexOf(compressedCodeWord) == -1)
-             {
-                 stemmedList.push(compressedCodeWord);
-             }
-         }
-     }
- }
-
- /**
-  * Translates a keyword to compress codes without registering new characters.
-  * Characters that have no code yet are replaced by the `unknown` sentinel.
-  *
-  * @param keyword text to translate
-  * @return the keyword as a compress-code string of the same length
-  */
- function _convertToCompressionCode (keyword : string) : string
- {
-     var encoded = [] : string[];
-     var index = 0;
-     while (index < keyword.length)
-     {
-         var mapped = this._utf162compressCode[keyword.charCodeAt(index)];
-         encoded.push(mapped == null ? Oktavia.unknown : mapped);
-         index++;
-     }
-     return encoded.join('');
- }
-
- /**
-  * Searches the index for a keyword.
-  *
-  * Without stemming the keyword is converted to compress codes and searched
-  * as is. With stemming the keyword is lowercased and stemmed, and every word
-  * recorded for that stem in the stemming table is searched; the hits are
-  * concatenated. With stemming requested but no stemmer installed the result
-  * is empty.
-  *
-  * @param keyword word to look for
-  * @param stemming whether to search through the stemming table
-  * @return positions reported by the index
-  */
- function rawSearch (keyword : string, stemming : boolean) : int[]
- {
-     var result : int[];
-     if (stemming)
-     {
-         result = [] : int[];
-         if (this._stemmer)
-         {
-             var baseWord = this._stemmer.stemWord(keyword.toLowerCase());
-             // Only trust entries stored on the map itself. A plain lookup for
-             // a stem such as "constructor" or "toString" would return an
-             // inherited member instead of a word list.
-             var stemmedList : string[] = null;
-             if (this._stemmingResult.hasOwnProperty(baseWord))
-             {
-                 stemmedList = this._stemmingResult[baseWord];
-             }
-             if (stemmedList)
-             {
-                 for (var i = 0; i < stemmedList.length; i++)
-                 {
-                     var word = stemmedList[i];
-                     result = result.concat(this._fmindex.search(word));
-                 }
-             }
-         }
-     }
-     else
-     {
-         result = this._fmindex.search(this._convertToCompressionCode(keyword));
-     }
-     return result;
- }
-
- /**
-  * Runs every query against the index and merges the per-query results.
-  *
-  * @param queries queries to evaluate, in order
-  * @return summary holding one result per query, after mergeResult()
-  */
- function search (queries : Query[]) : SearchSummary
- {
-     var summary = new SearchSummary(this);
-     var index = 0;
-     while (index < queries.length)
-     {
-         var singleResult = this._searchQuery(queries[index]);
-         summary.addQuery(singleResult);
-         index++;
-     }
-     summary.mergeResult();
-     return summary;
- }
-
- /**
-  * Evaluates one query: collects the hit positions and lets the primary
-  * metadata group them into the result.
-  *
-  * Raw queries use the exact search only; other queries append the hits of
-  * the stemmed search to the exact hits.
-  *
-  * @param query query to evaluate
-  * @return result object filled in by the primary metadata's grouping()
-  */
- function _searchQuery (query : Query) : SingleResult
- {
-     var result = new SingleResult(query.word, query.or, query.not);
-     // the exact search always runs first
-     var positions = this.rawSearch(query.word, false);
-     if (!query.raw)
-     {
-         positions = positions.concat(this.rawSearch(query.word, true));
-     }
-     var primary = this.getPrimaryMetadata();
-     primary.grouping(result, positions, query.word, !query.raw);
-     return result;
- }
-
- /**
-  * Builds the index with a cache density of 5 and verbose output disabled.
-  */
- function build () : void
- {
-     var defaultCacheDensity = 5;
-     this.build(defaultCacheDensity, false);
- }
-
- /**
-  * Finalizes every registered metadata object and then builds the FM-index.
-  *
-  * @param cacheDensity clamped to the range 0.01..100 and converted to the
-  *        cache range round(max(1, 100 / density)) passed to FMIndex.build()
-  * @param verbose forwarded to FMIndex.build()
-  */
- function build (cacheDensity : int, verbose : boolean) : void
- {
-     for (var label in this._metadatas)
-     {
-         this._metadatas[label]._build();
-     }
-     // clamp first so the division below can never be by zero
-     var density = Math.min(100, Math.max(0.01, cacheDensity));
-     var cacheRange = Math.round(Math.max(1, (100 / density)));
-     // number of compress codes in use, sentinels included
-     var maxChar = this._compressCode2utf16.length;
-     this._fmindex.build(Oktavia.eof, maxChar, cacheRange, verbose);
- }
-
- /**
-  * Serializes the engine with verbose output disabled.
-  *
-  * @return the serialized data, as produced by dump(false)
-  */
- function dump () : string
- {
-     var serialized = this.dump(false);
-     return serialized;
- }
-
- /**
-  * Serializes the engine to a string. The parts are written in this order:
-  * header, FM-index, character code map, stemming table, metadata count, and
-  * then name plus data for each metadata in registration order. load() reads
-  * the parts back in the same order.
-  *
-  * @param verbose when true, logs the size of each part to the console
-  * @return the concatenated serialized parts
-  */
- function dump (verbose : boolean) : string
- {
- var headerSource = "oktavia-01";
- // the first character of the encoded string is dropped
- // (presumably a length prefix -- confirm against Binary.dumpString)
- var header = Binary.dumpString(headerSource).slice(1);
- if (verbose)
- {
- console.log("Source text size: " + (this._fmindex.size() * 2) as string + ' bytes');
- }
- var fmdata = this._fmindex.dump(verbose);
- var result = [
- header,
- fmdata
- ];
-
- // character code map: total count first, then every entry from index 3 on;
- // the three sentinels are not written because load() recreates them
- result.push(Binary.dump16bitNumber(this._compressCode2utf16.length));
- for (var i = 3; i < this._compressCode2utf16.length; i++)
- {
- result.push(this._compressCode2utf16[i]);
- }
- if (verbose)
- {
- console.log('Char Code Map: ' + (this._compressCode2utf16.length * 2 - 2) as string + ' bytes');
- }
-
- var report = new CompressionReport();
- result.push(Binary.dumpStringListMap(this._stemmingResult, report));
- if (verbose)
- {
- // NOTE(review): the other size logs multiply the string length by 2,
- // this one does not -- confirm which unit is intended
- console.log('Stemmed Word Table: ' + (result[result.length - 1].length) as string + ' bytes (' + report.rate() as string + '%)');
- }
-
- // metadata: count first, then (name, data) per label in registration order
- result.push(Binary.dump16bitNumber(this._metadataLabels.length));
- for (var i = 0; i < this._metadataLabels.length; i++)
- {
- // a fresh report per metadata, shared by its data and its name
- var report = new CompressionReport();
- var name = this._metadataLabels[i];
- var data = this._metadatas[name]._dump(report);
- result.push(Binary.dumpString(name, report), data);
- if (verbose)
- {
- console.log('Meta Data ' + name + ': ' + (data.length * 2) as string + ' bytes (' + report.rate() as string + '%)');
- }
- }
- return result.join('');
- }
-
- /**
-  * Restores the engine from a string produced by dump(). All metadata, both
-  * character remap tables and the stemming table are replaced.
-  *
-  * @param data serialized engine
-  * @throws Error when the header does not match, or when a metadata entry
-  *         carries a type code other than 0 (Section), 1 (Splitter),
-  *         2 (Table) or 3 (Block)
-  */
- function load (data : string) : void
- {
-     var headerSource = "oktavia-01";
-     var header = Binary.dumpString(headerSource).slice(1);
-     if (data.slice(0, 5) != header)
-     {
-         throw new Error('Invalid data file');
-     }
-     this._metadatas = {} : Map.<Metadata>;
-     this._metadataLabels = [] : string[];
-
-     var offset = 5;
-     offset = this._fmindex.load(data, offset);
-
-     // character code map: the stored count includes the three sentinels,
-     // which are recreated here instead of being read from the data
-     var charCodeCount = Binary.load16bitNumber(data, offset++);
-     this._compressCode2utf16 = [Oktavia.eof, Oktavia.eob, Oktavia.unknown];
-     this._utf162compressCode = [Oktavia.eof, Oktavia.eob, Oktavia.unknown];
-     for (var i = 3; i < charCodeCount; i++)
-     {
-         var charCode = Binary.load16bitNumber(data, offset++);
-         this._compressCode2utf16.push(String.fromCharCode(charCode));
-         this._utf162compressCode[charCode] = String.fromCharCode(i);
-     }
-
-     var stemmedWords = Binary.loadStringListMap(data, offset);
-     this._stemmingResult = stemmedWords.result;
-     offset = stemmedWords.offset;
-
-     var metadataCount = Binary.load16bitNumber(data, offset++);
-     for (var m = 0; m < metadataCount; m++)
-     {
-         var nameResult = Binary.loadString(data, offset);
-         var name = nameResult.result;
-         offset = nameResult.offset;
-         var type = Binary.load16bitNumber(data, offset++);
-         switch (type)
-         {
-         case 0:
-             offset = Section._load(this, name, data, offset);
-             break;
-         case 1:
-             offset = Splitter._load(this, name, data, offset);
-             break;
-         case 2:
-             offset = Table._load(this, name, data, offset);
-             break;
-         case 3:
-             offset = Block._load(this, name, data, offset);
-             break;
-         default:
-             // Skipping an unknown type would leave `offset` inside that
-             // entry's payload, so everything after it would be misread.
-             throw new Error('Unknown metadata type: ' + (type as string));
-         }
-     }
- }
-
- /**
-  * Reports the content size of the underlying index.
-  *
-  * @return the value of FMIndex.contentSize()
-  */
- function contentSize () : int
- {
-     var size = this._fmindex.contentSize();
-     return size;
- }
-
- /**
-  * Classifies a position by the character that precedes it.
-  *
-  * @param position position in the indexed text
-  * @return 4 when the position is 0,
-  *         3 when the preceding character is the end-of-block sentinel,
-  *         2 when it is whitespace,
-  *         1 when it is any other non-word character,
-  *         0 otherwise
-  */
- function wordPositionType (position : int) : int
- {
-     var result = 0;
-     if (position == 0)
-     {
-         result = 4;
-     }
-     else
-     {
-         // The index stores compress codes, so decode the character before
-         // classifying it; the codes themselves say nothing about whitespace
-         // or word characters.
-         var ahead = this._getSubstring(position - 1, 1);
-         if (Oktavia.eob == ahead)
-         {
-             // checked first: the sentinel is itself a non-word character
-             // and would otherwise be reported as 1
-             result = 3;
-         }
-         else if (/\s/.test(ahead))
-         {
-             result = 2;
-         }
-         else if (/\W/.test(ahead))
-         {
-             result = 1;
-         }
-     }
-     return result;
- }
-
- /**
-  * Reads a substring from the index and converts it from compress codes
-  * back to the original UTF-16 characters.
-  *
-  * @param position start position passed to FMIndex.getSubstring()
-  * @param length length passed to FMIndex.getSubstring()
-  * @return the decoded text
-  */
- function _getSubstring (position : int, length : int) : string
- {
-     var encoded = this._fmindex.getSubstring(position, length);
-     var decoded = [] : string[];
-     var index = 0;
-     while (index < encoded.length)
-     {
-         var code = encoded.charCodeAt(index);
-         decoded.push(this._compressCode2utf16[code]);
-         index++;
-     }
-     return decoded.join('');
- }
-}