diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 02:57:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 02:57:58 +0000 |
commit | be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97 (patch) | |
tree | 9754ff1ca740f6346cf8483ec915d4054bc5da2d /web/server/h2o/libh2o/misc/oktavia/tool | |
parent | Initial commit. (diff) | |
download | netdata-upstream.tar.xz netdata-upstream.zip |
Adding upstream version 1.44.3. (refs: upstream/1.44.3, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'web/server/h2o/libh2o/misc/oktavia/tool')
21 files changed, 1496 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/httpstatus.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/httpstatus.jsx new file mode 100644 index 00000000..a4d7451e --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/httpstatus.jsx @@ -0,0 +1,130 @@ +import "console.jsx"; +import "oktavia.jsx"; +import "metadata.jsx"; +import "query-parser.jsx"; +import "stemmer/english-stemmer.jsx"; + + +class HTTPStatus /* searchable list of HTTP status lines backed by an Oktavia index */ +{ + var oktavia : Oktavia; + var splitter : Splitter; + var httpstatus : string[]; + + function constructor () /* sets up the index with an English stemmer and a 'line break' splitter, then fills it via makeIndex() */ + { + this.oktavia = new Oktavia; + this.oktavia.setStemmer(new EnglishStemmer()); + this.splitter = this.oktavia.addSplitter('line break'); + this.makeIndex(); + } + + function makeIndex () : void /* stores the status lines in httpstatus, adds each one to the index followed by splitter.split(), then builds the index */ + { + this.httpstatus = [ + "100: Continue", + "101: Switching Protocols", + "102: Processing", + "200: OK", + "201: Created", + "202: Accepted", + "203: Non-Authoritative Information", + "204: No Content", + "205: Reset Content", + "206: Partial Content", + "207: Multi-Status", + "208: Already Reported", + "300: Multiple Choices", + "301: Moved Permanently", + "302: Found", + "303: See Other", + "304: Not Modified", + "305: Use Proxy", + "307: Temporary Redirect", + "400: Bad Request", + "401: Unauthorized", + "402: Payment Required", + "403: Forbidden", + "404: Not Found", + "405: Method Not Allowed", + "406: Not Acceptable", + "407: Proxy Authentication Required", + "408: Request Timeout", + "409: Conflict", + "410: Gone", + "411: Length Required", + "412: Precondition Failed", + "413: Request Entity Too Large", + "414: Request-URI Too Large", + "415: Unsupported Media Type", + "416: Request Range Not Satisfiable", + "417: Expectation Failed", + "418: I'm a teapot", + "422: Unprocessable Entity", + "423: Locked", + "424: Failed Dependency", + "425: No code", + "426: Upgrade Required", + "428: Precondition Required", + "429: Too Many Requests", + "431: Request Header Fields Too Large", + "449: Retry with", + "500: Internal Server Error", + 
"501: Not Implemented", + "502: Bad Gateway", + "503: Service Unavailable", + "504: Gateway Timeout", + "505: HTTP Version Not Supported", + "506: Variant Also Negotiates", + "507: Insufficient Storage", + "509: Bandwidth Limit Exceeded", + "510: Not Extended" + ]; + for (var i in this.httpstatus) + { + this.oktavia.addWord(this.httpstatus[i], true); + this.splitter.split(); + } + this.oktavia.build(); + } + + function search (words : string[]) : string /* no parsed query: returns the full list plus a random "Today's status"; otherwise returns the matching lines joined by '\n', or "not found " when there is no hit */ + { + var queryParser = new QueryParser(); + var queries = queryParser.parse(words); + if (queries.length == 0) + { + var result = this.httpstatus.join('\n'); + result = result + "\n\nToday's status: " + this.random(); + return result; + } + else + { + var summary = this.oktavia.search(queries); + if (summary.size() == 0) + { + return "not found "; + } + var resultWords = [] : string[]; + for (var i in summary.result.unitIds) + { + resultWords.push(this.splitter.getContent(summary.result.unitIds[i])); + } + return resultWords.join('\n'); + } + } + + function random () : string /* returns one entry of httpstatus picked at random */ + { + return this.httpstatus[Math.floor(Math.random() * this.httpstatus.length)]; /* floor, not round: Math.random() is < 1, so the index stays within 0 .. length - 1 and every entry is equally likely */ + } +} + +class _Main +{ + static function main (argv : string []) : void + { + var httpstatus = new HTTPStatus(); + console.log(httpstatus.search(argv)); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx new file mode 100644 index 00000000..f2593bc9 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx @@ -0,0 +1,470 @@ +import "console.jsx"; +import "js/nodejs.jsx"; + +import "oktavia.jsx"; +import "getopt.jsx"; +import "htmlparser.jsx"; +import "csvparser.jsx"; +import "textparser.jsx"; +import "binary-util.jsx"; + +import "stemmer/stemmer.jsx"; +import "stemmer/danish-stemmer.jsx"; +import "stemmer/dutch-stemmer.jsx"; +import "stemmer/english-stemmer.jsx"; +import "stemmer/finnish-stemmer.jsx"; +import "stemmer/french-stemmer.jsx"; +import 
"stemmer/german-stemmer.jsx"; +import "stemmer/hungarian-stemmer.jsx"; +import "stemmer/italian-stemmer.jsx"; +import "stemmer/norwegian-stemmer.jsx"; +import "stemmer/porter-stemmer.jsx"; +import "stemmer/portuguese-stemmer.jsx"; +import "stemmer/romanian-stemmer.jsx"; +import "stemmer/russian-stemmer.jsx"; +import "stemmer/spanish-stemmer.jsx"; +import "stemmer/swedish-stemmer.jsx"; +import "stemmer/turkish-stemmer.jsx"; + + +class _Main +{ + static function usage () : void + { + console.log([ + "usage: oktavia_mkindex [options]", + "", + "Common Options:", + " -i, --input [input folder/file ] : Target files to search. .html, .csv, .txt are available.", + " -o, --output [outputfolder] : Directory that will store output files.", + " : This is a relative path from root.", + " : Default value is 'search'. ", + " -t, --type [type] : Export type. 'index'(default), 'base64', 'cmd', 'js',", + " : 'commonjs' are available.", + " : 'index' is a just index file. 'cmd' is a base64 code with search program.", + " : Others are base64 source code style output.", + " -m, --mode [mode] : Mode type. 'html', 'csv', 'text' are available.", + " -c, --cache-density [percent] : Cache data density. It effects file size and search speed.", + " : 100% become four times of base index file size. Default value is 5%.", + " : Valid value is 0.1% - 100%.", + " -n, --name [function] : A variable name for 'js' output or property name", + " : for 'js' and 'commonjs'. Default value is 'searchIndex'.", + " -q, --quiet : Hide detail information.", + " -h, --help : Display this message.", + "", + "HTML Mode Options:", + " -r, --root [document root] : Document root folder. Default is current. ", + " : Indexer creates result file path from this folder.", + " -p, --prefix [directory prefix] : Directory prefix for a document root from a server root.", + " : If your domain is example.com and 'manual' is passed,", + " : document root become http://example.com/manual/.", + " : It effects search result URL. 
Default value is '/'.", + " -u, --unit [search unit] : 'file', 'h1'-'h6'. Default value is 'file'.", + " -f, --filter [target tag] : Only contents inside this tag is indexed.", + " : Default value is \"article,#content,#main,div.body\".", + " -s, --stemmer [algorithm] : Select stemming algorithm.", + " -w, --word-splitter [splitter] : Use optional word splitter.", + " : 'ts' (TinySegmenter for Japanese) is available", + "", + "Text Mode Options:", + " -s, --stemmer [algorithm] : Select stemming algorithm.", + " -w, --word-splitter [splitter] : Use optional word splitter.", + " : 'ts' (TinySegmenter for Japanese) is available", + " -u, --unit [search unit] : file, block, line. Default value is 'file'.", + "", + "Supported Stemmer Algorithms:", + " danish, dutch, english, finnish, french german, hungarian italian", + " norwegian, porter, portuguese, romanian, russian, spanish, swedish, turkish" + ].join('\n')); + } + + static function main(args : string[]) : void + { + console.log("Search Engine Oktavia - Index Generator\n"); + + var inputs = [] : string[]; + var root = process.cwd(); + var prefix = '/'; + var output = "search"; + var showhelp = false; + var notrun = false; + var unit = 'file'; + var type = 'js'; + var mode = ''; + var verbose = true; + var filter = [] : string[]; + var algorithm : Nullable.<string> = null; + var wordsplitter : Nullable.<string> = null; + var cacheDensity : number = 5.0; + var name = null : Nullable.<string>; + var validModes = ['html', 'csv', 'text']; + var validUnitsForHTML = ['file', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']; + var validUnitsForText = ['file', 'block', 'line']; + var validStemmers = [ + 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', + 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', + 'spanish', 'swedish', 'turkish' + ]; + var validTypes = ['index', 'base64', 'cmd', 'js', 'commonjs']; + var validWordSplitters = ['ts']; + + var optstring = 
"n:(name)q(quiet)m:(mode)i:(input)r:(root)p:(prefix)o:(output)h(help)u:(unit)f:(filter)s:(stemmer)w:(word-splitter)t:(type)c:(cache-density)"; + var parser = new BasicParser(optstring, args); + var opt = parser.getopt(); + while (opt) + { + switch (opt.option) + { + case "m": + if (validModes.indexOf(opt.optarg) == -1) + { + console.error("Option m/mode should be 'html', 'csv', 'text'."); + notrun = true; + } + mode = opt.optarg; + break; + case "i": + inputs.push(opt.optarg); + break; + case "r": + root = node.path.resolve(opt.optarg); + break; + case "p": + prefix = opt.optarg; + break; + case "n": + name = opt.optarg; + break; + case "o": + output = opt.optarg; + if (output.slice(0, 1) == '/') + { + output = output.slice(1); + } + break; + case "h": + showhelp = true; + break; + case "q": + verbose = false; + break; + case "u": + unit = opt.optarg; + break; + case "f": + var items = opt.optarg.split(','); + for (var i in items) + { + filter.push(items[i]); + } + break; + case "t": + if (validTypes.indexOf(opt.optarg) == -1) + { + console.error('Option -t/--type is invalid.'); + notrun = true; + } + else + { + type = opt.optarg; + } + break; + case "s": + if (validStemmers.indexOf(opt.optarg) == -1) + { + console.error('Option -s/--stemmer is invalid.'); + notrun = true; + } + else + { + algorithm = opt.optarg; + } + break; + case "w": + + break; + case "c": + var match = /(\d+\.?\d*)/.exec(opt.optarg); + if (match) + { + var tmpValue = match[1] as number; + if (0.1 <= tmpValue && tmpValue <= 100) + { + cacheDensity = tmpValue; + } + else + { + console.error('Option -c/--cache-density should be in 0.1 - 100.'); + notrun = true; + } + } + else + { + console.error('Option -c/--cache-density is invalid.'); + notrun = true; + } + break; + case "?": + notrun = true; + break; + } + opt = parser.getopt(); + } + var inputTextFiles = [] : string[]; + var inputHTMLFiles = [] : string[]; + var inputCSVFiles = [] : string[]; + if (filter.length == 0) + { + filter = 
['article', '#content', '#main', 'div.body']; + } + for (var i in inputs) + { + var input = inputs[i]; + if (!node.fs.existsSync(input)) + { + console.error("Following input folder/file doesn't exist: " + input); + notrun = true; + } + else + { + var stat = node.fs.statSync(input); + if (stat.isFile()) + { + _Main._checkFileType(node.path.resolve(input), inputTextFiles, inputHTMLFiles, inputCSVFiles); + } + else if (stat.isDirectory()) + { + _Main._checkDirectory(input, inputTextFiles, inputHTMLFiles, inputCSVFiles); + } + else + { + console.error("Following input is not folder or file: " + input); + notrun = true; + } + } + } + if (inputTextFiles.length == 0 && inputHTMLFiles.length == 0 && inputCSVFiles.length == 0 || !mode) + { + showhelp = true; + } + if (showhelp) + { + _Main.usage(); + } + else if (!notrun) + { + var stemmer : Nullable.<Stemmer> = null; + if (algorithm) + { + stemmer = _Main._createStemmer(algorithm); + } + var dump = null : Nullable.<string>; + switch (mode) + { + case 'html': + var unitIndex = validUnitsForHTML.indexOf(unit); + if (unitIndex == -1) + { + console.error('Option -u/--unit should be file, h1, h2, h3, h4, h5, h6. But ' + unit); + } + else + { + var htmlParser = new HTMLParser(unitIndex, root, prefix, filter, stemmer); + for (var i = 0; i < inputHTMLFiles.length; i++) + { + htmlParser.parse(inputHTMLFiles[i]); + } + console.log('generating index...'); + if (verbose) + { + console.log(''); + } + dump = htmlParser.dump(cacheDensity, verbose); + } + break; + case 'csv': + var csvParser = new CSVParser(root, stemmer); + for (var i in inputCSVFiles) + { + csvParser.parse(inputCSVFiles[i]); + } + break; + case 'text': + if (validUnitsForText.indexOf(unit) == -1) + { + console.error('Option u/unit should be file, block, line. 
But ' + unit); + } + else + { + var textParser = new TextParser(unit, root, stemmer); + for (var i in inputTextFiles) + { + textParser.parse(inputTextFiles[i]); + } + } + break; + } + if (dump) + { + var indexFilePath = ""; + switch (type) + { + case 'index': + indexFilePath = node.path.resolve(root, output, 'searchindex.okt'); + var dirPath = node.path.dirname(indexFilePath); + _Main._mkdirP(dirPath); + node.fs.writeFileSync(indexFilePath, dump, "utf16le"); + break; + case 'base64': + indexFilePath = node.path.resolve(root, output, 'searchindex.okt.b64'); + var dirPath = node.path.dirname(indexFilePath); + _Main._mkdirP(dirPath); + node.fs.writeFileSync(indexFilePath, Binary.base64encode(dump), "utf8"); + break; + case 'cmd': + break; + case 'js': + indexFilePath = node.path.resolve(root, output, 'searchindex.js'); + var dirPath = node.path.dirname(indexFilePath); + _Main._mkdirP(dirPath); + if (name == null) + { + name = 'searchIndex'; + } + var contents = [ + '// Oktavia Search Index', + 'var ' + name + ' = "' + Binary.base64encode(dump) + '";', '' + ]; + node.fs.writeFileSync(indexFilePath, contents.join('\n'), "utf8"); + break; + case 'commonjs': + indexFilePath = node.path.resolve(root, output, 'searchindex.js'); + var dirPath = node.path.dirname(indexFilePath); + _Main._mkdirP(dirPath); + if (name == null) + { + name = 'searchIndex'; + } + var contents = [ + '// Oktavia Search Index', + 'exports.' 
+ name + ' = "' + Binary.base64encode(dump) + '";', '' + ]; + node.fs.writeFileSync(indexFilePath, contents.join('\n'), "utf8"); + break; + } + if (indexFilePath) + { + console.log("generated: " + indexFilePath); + } + } + } + } + + static function _checkFileType (path : string, texts : string[], HTMLs : string[], CSVs : string[]) : void + { + var match = path.match(/(.*)\.(.*)/); + if (match && match[1]) + { + switch (match[2].toLowerCase()) + { + case 'html': + case 'htm': + HTMLs.push(path); + break; + case 'csv': + CSVs.push(path); + break; + default: + texts.push(path); + } + } + } + + static function _checkDirectory (path : string, texts : string[], HTMLs : string[], CSVs : string[]) : void + { + var files = node.fs.readdirSync(path); + for (var j in files) + { + var filepath = node.path.resolve(path, files[j]); + var stat = node.fs.statSync(filepath); + if (stat.isFile()) + { + _Main._checkFileType(filepath, texts, HTMLs, CSVs); + } + else if (stat.isDirectory()) + { + _Main._checkDirectory(filepath, texts, HTMLs, CSVs); + } + } + } + + static function _mkdirP (path : string) : void + { + if (node.fs.existsSync(path)) + { + return; + } + _Main._mkdirP(node.path.dirname(path)); + node.fs.mkdirSync(path); + } + + static function _createStemmer (algorithm : string) : Stemmer + { + var stemmer : Stemmer; + switch (algorithm.toLowerCase()) + { + case "danish": + stemmer = new DanishStemmer(); + break; + case "dutch": + stemmer = new DutchStemmer(); + break; + case "english": + stemmer = new EnglishStemmer(); + break; + case "finnish": + stemmer = new FinnishStemmer(); + break; + case "french": + stemmer = new FrenchStemmer(); + break; + case "german": + stemmer = new GermanStemmer(); + break; + case "hungarian": + stemmer = new HungarianStemmer(); + break; + case "italian": + stemmer = new ItalianStemmer(); + break; + case "norwegian": + stemmer = new NorwegianStemmer(); + break; + case "porter": + stemmer = new PorterStemmer(); + break; + case "portuguese": + 
stemmer = new PortugueseStemmer(); + break; + case "romanian": + stemmer = new RomanianStemmer(); + break; + case "russian": + stemmer = new RussianStemmer(); + break; + case "spanish": + stemmer = new SpanishStemmer(); + break; + case "swedish": + stemmer = new SwedishStemmer(); + break; + case "turkish": + stemmer = new TurkishStemmer(); + break; + default: + stemmer = new EnglishStemmer(); + break; + } + return stemmer; + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-search.jsx new file mode 100644 index 00000000..719c71b8 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-search.jsx @@ -0,0 +1,370 @@ +import "console.jsx"; +import "js/nodejs.jsx"; + +import "oktavia.jsx"; +import "getopt.jsx"; +import "query-parser.jsx"; +import "search-result.jsx"; +import "style.jsx"; +import "binary-util.jsx"; + +import "stemmer/stemmer.jsx"; +import "stemmer/danish-stemmer.jsx"; +import "stemmer/dutch-stemmer.jsx"; +import "stemmer/english-stemmer.jsx"; +import "stemmer/finnish-stemmer.jsx"; +import "stemmer/french-stemmer.jsx"; +import "stemmer/german-stemmer.jsx"; +import "stemmer/hungarian-stemmer.jsx"; +import "stemmer/italian-stemmer.jsx"; +import "stemmer/norwegian-stemmer.jsx"; +import "stemmer/porter-stemmer.jsx"; +import "stemmer/portuguese-stemmer.jsx"; +import "stemmer/romanian-stemmer.jsx"; +import "stemmer/russian-stemmer.jsx"; +import "stemmer/spanish-stemmer.jsx"; +import "stemmer/swedish-stemmer.jsx"; +import "stemmer/turkish-stemmer.jsx"; + + +class Search +{ + var style : Style; + + function search (indexFile : string, queryStrings : string[], num : int, style : Style, algorithm : Nullable.<string>) : void + { + this.style = style; + var oktavia = new Oktavia(); + if (algorithm != null) + { + oktavia.setStemmer(this.createStemmer(algorithm)); + } + if (!this.loadIndex(oktavia, indexFile)) + { + return; + } + console.time('searching'); + var queryParser = new 
QueryParser(); + queryParser.parse(queryStrings); + var summary = oktavia.search(queryParser.queries); + console.timeEnd('searching'); + if (summary.size() == 0) + { + this.notFound(summary, queryStrings); + } + else + { + this.showResult(oktavia, summary, num); + } + } + + function loadIndex (oktavia : Oktavia, filepath : string) : boolean /* reads filepath according to its extension (.okt as utf16le text, .b64 as base64 text, .js as a double-quoted base64 literal) and loads it into oktavia; returns false for any other extension */ + { + var ext = node.path.extname(filepath); + var content : string; + var result = true; + switch (ext) + { + case ".okt": + content = node.fs.readFileSync(filepath, "utf16le"); + oktavia.load(content); + break; + case ".b64": + content = node.fs.readFileSync(filepath, "utf8"); + oktavia.load(Binary.base64decode(content)); + break; + case ".js": + content = node.fs.readFileSync(filepath, "utf8"); + var index = content.indexOf('"'); + var lastIndex = content.lastIndexOf('"'); + oktavia.load(Binary.base64decode(content.slice(index + 1, lastIndex))); /* index + 1 skips the opening quote so only the text between the first and last quote is decoded */ + break; + default: + console.log("unknown file extension: " + ext); + result = false; + break; + } + return result; + } + + function sortResult (oktavia : Oktavia, summary : SearchSummary) : SearchUnit[] + { + for (var i = 0; i < summary.result.units.length; i++) + { + var score = 0; + var unit = summary.result.units[i]; + for (var pos in unit.positions) + { + var position = unit.positions[pos]; + if (oktavia.wordPositionType(position.position)) + { + score += 10; + } + else + { + score += 1; + } + if (!position.stemmed) + { + score += 2; + } + } + unit.score = score; + } + return summary.getSortedResult(); + } + + function showResult (oktavia : Oktavia, summary : SearchSummary, num : int) : void + { + var results = this.sortResult(oktavia, summary); + var style = this.style; + var metadata = oktavia.getPrimaryMetadata(); + for (var i = 0; i < results.length; i++) + { + var unit = results[i]; + var info = metadata.getInformation(unit.id).split(Oktavia.eob); + /*console.log(info.replace(Oktavia.eob, ' -- ') + '\n'); + + ' ----------------------------------------------- ' + + unit.score as 
string + ' pt');*/ + console.log(style.convert('<title>' + info[0] + '</title>') + ' ' + style.convert('<url>' + info[1] + '</url>')); + var offset = info[0].length + 1; + var content = metadata.getContent(unit.id); + var start = 0; + var positions = unit.getPositions(); + if (content.indexOf(info[0]) == 1) + { + content = content.slice(info[0].length + 2, content.length); + start += (info[0].length + 2); + } + var end = start + num; + var split = false; + if (positions[0].position > end - positions[0].word.length) + { + end = positions[0].position + Math.floor(num / 2); + split = true; + } + for (var j = positions.length - 1; j > -1; j--) + { + var pos = positions[j]; + if (pos.position + pos.word.length < end) + { + /*log('--------------begin : ' + (pos.position - start) as string); + log(content.slice(0, pos.position - start)); + log('--------------match : ' + pos.word.length as string); + .log(content.slice(pos.position - start, pos.position + pos.word.length - start)); + log('--------------match : ' + (content.length - pos.position + pos.word.length - start) as string); + log(content.slice(pos.position + pos.word.length - start, content.length)); + log('--------------end');*/ + content = [ + content.slice(0, pos.position - start), + style.convert('<hit>*</hit>').replace('*', content.slice(pos.position - start, pos.position + pos.word.length - start)), + content.slice(pos.position + pos.word.length - start, content.length) + ].join(''); + } + } + var text : string; + if (split) + { + text = [ + content.slice(0, Math.floor(num / 2)) + ' ...', + content.slice(-Math.floor(num / 2), end - start)].join('\n'); + } + else + { + text = content.slice(0, end - start) + ' ...\n'; + } + text = text.replace(Oktavia.eob, ' ').replace(/\n\n+/, '\n\n'); + console.log(text); + } + console.log(style.convert('<summary>' + (summary.size() as string) + " results.</summary>\n")); + } + + function notFound (summary : SearchSummary, query : string[]) : void + { + var style = 
this.style; + if (query.length > 1) + { + console.log("Suggestions:"); + var proposals = summary.getProposal(); + for (var i = 0; i < proposals.length; i++) + { + var proposal = proposals[i]; + var querywords = [] : string[]; + for (var j = 0; j < query.length; j++) + { + if (j != proposal.omit) + { + querywords.push(style.convert('<hit>' + query[j] + '</hit>')); + } + else + { + //querywords.push(style.convert('<del>' + query[j] + '</del>')); + } + } + console.log("* Expected result: " + querywords.join(" ") + " - " + (proposal.expect as string) + " hit"); + } + } + else + { + console.log(style.convert("Your search - <hit>" + query[0] + "</hit> - didn't match any documents.")); + } + } + + function createStemmer (algorithm : string) : Stemmer + { + var stemmer : Stemmer; + switch (algorithm.toLowerCase()) + { + case "danish": + stemmer = new DanishStemmer(); + break; + case "dutch": + stemmer = new DutchStemmer(); + break; + case "english": + stemmer = new EnglishStemmer(); + break; + case "finnish": + stemmer = new FinnishStemmer(); + break; + case "french": + stemmer = new FrenchStemmer(); + break; + case "german": + stemmer = new GermanStemmer(); + break; + case "hungarian": + stemmer = new HungarianStemmer(); + break; + case "italian": + stemmer = new ItalianStemmer(); + break; + case "norwegian": + stemmer = new NorwegianStemmer(); + break; + case "porter": + stemmer = new PorterStemmer(); + break; + case "portuguese": + stemmer = new PortugueseStemmer(); + break; + case "romanian": + stemmer = new RomanianStemmer(); + break; + case "russian": + stemmer = new RussianStemmer(); + break; + case "spanish": + stemmer = new SpanishStemmer(); + break; + case "swedish": + stemmer = new SwedishStemmer(); + break; + case "turkish": + stemmer = new TurkishStemmer(); + break; + default: + stemmer = new EnglishStemmer(); + break; + } + return stemmer; + } +} + +class _Main { + static function usage () : void + { + console.log([ + "usage: oktavia_search index_file 
[options] query...", + "", + "Options:", + " -m, --mono : Don't use color.", + " -s, --stemmer [algorithm] : Select stemming algorithm.", + " -n, --number [char number] : Result display number. Default value = 250", + " -h, --help : Display this message.", + "", + "Search Query Syntax:", + " word1 word2 : All words.", + ' "word1 word2" : Exact words or phrase.', + " word1 OR word2 : Any of these words.", + " word1 -word2 : None of these words." + ].join('\n')); + } + + static function main(args : string[]) : void + { + console.log("Search Engine Oktavia - Command-line Search Client\n"); + + var indexFile : Nullable.<string> = null; + var showhelp = false; + var notrun = false; + var styleType = 'console'; + var num : int = 250; + var queryStrings = [] : string[]; + var algorithm : Nullable.<string> = null; + + var validStemmers = [ + 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', + 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', + 'spanish', 'swedish', 'turkish' + ]; + + if (args.length == 0) + { + showhelp = true; + } + else if (!node.fs.existsSync(args[0])) + { + console.error("Index file '" + args[0] + "' doesn't exist."); + notrun = true; + } + else + { + indexFile = args[0]; + } + + var optstring = "m(mono)s:(stemmer)n:(number)h(help)"; + var parser = new BasicParser(optstring, args.slice(1)); + var opt = parser.getopt(); + while (opt) + { + switch (opt.option) + { + case "s": + if (validStemmers.indexOf(opt.optarg) == -1) + { + console.error('Option s/stemmer is invalid.'); + notrun = true; + } + else + { + algorithm = opt.optarg; + } + break; + case "m": + styleType = 'ignore'; + break; + case "n": + num = opt.optarg as int; + break; + case "h": + showhelp = true; + break; + default: + queryStrings.push(opt.option); + break; + } + opt = parser.getopt(); + } + if (showhelp || queryStrings.length == 0) + { + _Main.usage(); + } + else if (!notrun) + { + var style = new Style(styleType); + var search = new 
Search(); + search.search(indexFile, queryStrings, num, style, algorithm); + } + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/search_simple.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/search_simple.jsx new file mode 100644 index 00000000..f9b86751 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/search_simple.jsx @@ -0,0 +1,39 @@ +import "nodejs.jsx"; +import "fm_index.jsx"; + +class _Main /* minimal command-line search over a saved FMIndex file */ +{ + static function usage () : void /* logs the expected command line */ + { + log "Simple FM-Index Search Engine: Oktavia"; + log ""; + log "[usage]"; + log " search [input db file name] keyword"; + } + + static function main(args : string[]) : void /* args[0] is the index file; every remaining argument is searched and its hits are logged; with fewer than two arguments only the usage is shown */ + { + if (args.length <2) + { + _Main.usage(); + } + else + { + var indexFileName = args.shift(); + log "index file name: ", indexFileName; + var fm_index = new FMIndex(); + fm_index.load(node.fs.readFileSync(indexFileName, "utf16le")); + for (var i in args) + { + log "[search word]", args[i]; + var results = fm_index.search(args[i]); + for (var j in results) + { + var result = results[j]; + log "[", result[0], "]: ", "(", result[1], ")"; + } + log results.length, " hits"; + } + } + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-danish-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-danish-search.jsx new file mode 100644 index 00000000..98dba01d --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-danish-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/danish-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new DanishStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-dutch-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-dutch-search.jsx new file mode 100644 index 00000000..117f2cce --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-dutch-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/dutch-stemmer.jsx"; 
+ +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new DutchStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-english-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-english-search.jsx new file mode 100644 index 00000000..d30ad2cc --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-english-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/english-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new EnglishStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-finnish-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-finnish-search.jsx new file mode 100644 index 00000000..64006395 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-finnish-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/finnish-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new FinnishStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-french-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-french-search.jsx new file mode 100644 index 00000000..777f5e2a --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-french-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/french-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new FrenchStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-german-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-german-search.jsx new file mode 100644 index 00000000..58831870 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-german-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import 
"stemmer/german-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new GermanStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-hungarian-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-hungarian-search.jsx new file mode 100644 index 00000000..a14fe345 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-hungarian-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/hungarian-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new HungarianStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-italian-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-italian-search.jsx new file mode 100644 index 00000000..30769d1a --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-italian-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/italian-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new ItalianStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-norwegian-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-norwegian-search.jsx new file mode 100644 index 00000000..180e6b04 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-norwegian-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/norwegian-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new NorwegianStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-porter-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-porter-search.jsx new file mode 100644 index 00000000..ba1de086 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-porter-search.jsx @@ -0,0 +1,10 @@ 
+import "oktavia-search.jsx"; +import "stemmer/porter-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new PorterStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-portuguese-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-portuguese-search.jsx new file mode 100644 index 00000000..89ed1a0f --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-portuguese-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/portuguese-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new PortugueseStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-romanian-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-romanian-search.jsx new file mode 100644 index 00000000..ef8b47fc --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-romanian-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/romanian-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new RomanianStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-russian-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-russian-search.jsx new file mode 100644 index 00000000..2a572d71 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-russian-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/russian-stemmer.jsx"; + +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new RussianStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-search.jsx new file mode 100644 index 00000000..22dc3f77 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-search.jsx @@ 
-0,0 +1,327 @@ +import "oktavia.jsx"; +import "binary-util.jsx"; +import "query.jsx"; +import "query-string-parser.jsx"; +import "search-result.jsx"; +import "style.jsx"; +import "stemmer/stemmer.jsx"; +import "console.jsx"; + + /* One search hit as returned by getResult: title, url, excerpt text (content) and numeric score. */ +class _Result +{ + var title : string; + var url : string; + var content : string; + var score : int; + function constructor (title : string, url : string, content : string, score : int) + { + this.title = title; + this.url = url; + this.content = content; + this.score = score; + } +} + /* One alternative-query suggestion as returned by getProposals: options is the reduced query text, label its marked-up form, count the value taken from proposal.expect. */ +class _Proposal +{ + var options : string; + var label : string; + var count : int; + function constructor (options : string, label : string, count : int) + { + this.options = options; + this.label = label; + this.count = count; + } +} + /* Search front end around one Oktavia object: parses query strings, keeps the sorted result list, pages through it and builds marked-up excerpts. The constructor stores the newest instance in the static _instance. */ +class OktaviaSearch +{ + var _oktavia : Oktavia; + static var _stemmer : Nullable.<Stemmer> = null; + static var _instance : Nullable.<OktaviaSearch> = null; + var _queryString : Nullable.<string>; + var _queries : Query[]; + var _highlight : string; + var _callback : Nullable.<function(:int, :int):void>; + var _entriesPerPage : int; + var _currentPage : int; + var _result : SearchUnit[]; + var _proposals : Proposal[]; + var _currentFolderDepth : int; /* not referenced anywhere else in this class */ + /* Creates a fresh Oktavia object, starts on page 1 with no stored query or callback, and registers this object as OktaviaSearch._instance. */ + function constructor (entriesPerPage : int) + { + this._oktavia = new Oktavia(); + this._entriesPerPage = entriesPerPage; + this._currentPage = 1; + this._queryString = null; + this._callback = null; + OktaviaSearch._instance = this; + } + /* Applies the stemmer to the existing instance when there is one; otherwise parks it in _stemmer so that loadIndex installs it later. */ + static function setStemmer(stemmer : Stemmer) : void + { + if (OktaviaSearch._instance) + { + OktaviaSearch._instance._oktavia.setStemmer(stemmer); + } + else + { + OktaviaSearch._stemmer = stemmer; + } + } + /* Installs a parked stemmer if any, loads the base64-decoded index, then runs and clears any query stored in _queryString and _callback. */ + function loadIndex (index : string) : void + { + if (OktaviaSearch._stemmer) + { + this._oktavia.setStemmer(OktaviaSearch._stemmer); + } + this._oktavia.load(Binary.base64decode(index)); + if (this._queryString) + { + this.search(this._queryString, this._callback); + this._queryString = null; + this._callback = null; + } + } + 
+ function search (queryString : string, callback : function(:int, :int):void) : void + { /* Parses queryString, searches, and stores either the sorted hits or, for a multi-term query without hits, the proposals; resets to page 1 and calls callback with resultSize() and totalPages(). NOTE(review): _oktavia is assigned in the constructor, so the else branch that stores the query for later looks unreachable - confirm. */ + if (this._oktavia) + { + var queryParser = new QueryStringParser(); + this._queries = queryParser.parse(queryString); + this._highlight = queryParser.highlight(); + var summary = this._oktavia.search(this._queries); + if (summary.size() > 0) + { + this._result = this._sortResult(summary); + this._proposals = [] : Proposal[]; + this._currentPage = 1; + } + else + { + this._result = [] : SearchUnit[]; + if (this._queries.length > 1) + { + this._proposals = summary.getProposal(); + } + else + { + this._proposals = [] : Proposal[]; + } + this._currentPage = 1; + } + callback(this.resultSize(), this.totalPages()); + } + else + { + this._queryString = queryString; + this._callback = callback; + } + } + /* Length of the stored result list. */ + function resultSize () : int + { + return this._result.length; + } + /* Result count divided by _entriesPerPage, rounded up. */ + function totalPages () : int + { + return Math.ceil(this._result.length / this._entriesPerPage); + } + + function currentPage () : int + { + return this._currentPage; + } + /* Stores page as given; no range check is done here. */ + function setCurrentPage (page : int) : void + { + this._currentPage = page; + } + + function hasPrevPage () : boolean + { + return (this._currentPage != 1); + } + /* NOTE(review): after a search with no hits totalPages() is 0 (for a nonzero _entriesPerPage) while _currentPage is 1, so this returns true - confirm callers check resultSize() first. */ + function hasNextPage () : boolean + { + return (this._currentPage != this.totalPages()); + } + /* Pager labels as strings: every page when there are fewer than 10; otherwise a run of page numbers at the start, at the end, or around the current page, with an ellipsis entry and the first or last page added on the clipped sides. */ + function pageIndexes () : string[] + { + var result = [] : string[]; + var total = this.totalPages(); + if (total < 10) + { + for (var i = 1; i <= total; i++) + { + result.push(i as string); + } + } + else if (this._currentPage <= 5) + { + for (var i = 1; i <= 7; i++) + { + result.push(i as string); + } + result.push('...', total as string); + } + else if (total - 5 <= this._currentPage) + { + result.push('1', '...'); + for (var i = total - 8; i <= total; i++) + { + result.push(i as string); + } + } + else + { + result.push('1', '...'); + for (var i = this._currentPage - 3; i <= this._currentPage + 3; i++) + { + result.push(i as string); + } + result.push('...', total as string); + 
} + return result; + } + /* Builds the _Result list for the current page. For each unit it reads information and content from the primary metadata, wraps hit words that end before the cut-off in the html hit style, and cuts the excerpt using num (250). NOTE(review): positions[0] is read without an emptiness check - presumably every unit has at least one position, confirm. */ + function getResult () : _Result[] + { + var style = new Style('html'); + var start = (this._currentPage - 1) * this._entriesPerPage; + var last = Math.min(this._currentPage * this._entriesPerPage, this._result.length); + var metadata = this._oktavia.getPrimaryMetadata(); + var num = 250; + + var results = [] : _Result[]; + + for (var i = start; i < last; i++) + { + var unit = this._result[i]; + var info = metadata.getInformation(unit.id).split(Oktavia.eob); + + var offset = info[0].length + 1; /* offset is not read again in this method */ + var content = metadata.getContent(unit.id); + var start = 0; /* NOTE(review): same name as the page start declared above; the loop start was already copied into i - confirm the redeclaration is intended */ + var positions = unit.getPositions(); + if (content.indexOf(info[0]) == 1) + { + content = content.slice(info[0].length + 2, content.length); + start += (info[0].length + 2); + } + var end = start + num; + var split = false; + if (positions[0].position > end - positions[0].word.length) + { + end = positions[0].position + Math.floor(num / 2); + split = true; + } + for (var j = positions.length - 1; j > -1; j--) /* walks positions from last to first */ + { + var pos = positions[j]; + if (pos.position + pos.word.length < end) + { + content = [ + content.slice(0, pos.position - start), + style.convert('<hit>*</hit>').replace('*', content.slice(pos.position - start, pos.position + pos.word.length - start)), + content.slice(pos.position + pos.word.length - start, content.length) + ].join(''); + } + } + var text : string; + if (split) + { + text = [ + content.slice(0, Math.floor(num / 2)) + ' ...', + content.slice(-Math.floor(num / 2), end - start)].join('<br/>'); + } + else + { + text = content.slice(0, end - start) + ' ...<br/>'; + } /* NOTE(review): if replace follows JavaScript semantics, a string pattern and a regex without the g flag each replace only the first match - confirm */ + text = text.replace(Oktavia.eob, ' ').replace(/(<br\/>)(<br\/>)+/, '<br/><br/>'); + results.push(new _Result(info[0], info[1], text, unit.score)); + } + return results; + } + /* Highlight value produced by the query parser during the last search. */ + function getHighlight () : string + { + return this._highlight; + } + /* For multi-term queries, turns each stored proposal with expect above 0 into a _Proposal: the term at index proposal.omit is shown with del markup and left out of options, the other terms get hit markup. Returns an empty list otherwise. */ + function getProposals () : _Proposal[] + { + var style = new Style('html'); + var results = [] : _Proposal[]; + + if (this._queries.length > 1) + { + for 
(var i = 0; i < this._proposals.length; i++) + { + var proposal = this._proposals[i]; + if (proposal.expect > 0) + { + var label = [] : string[]; + var option = [] : string[]; + for (var j = 0; j < this._queries.length; j++) + { + if (j != proposal.omit) + { + label.push(style.convert('<hit>' + this._queries[j].toString() + '</hit>')); + option.push(this._queries[j].toString()); + } + else + { + label.push(style.convert('<del>' + this._queries[j].toString() + '</del>')); + } + } + results.push(new _Proposal(option.join(' '), label.join(' '), proposal.expect)); + } + } + } + return results; + } + /* Assigns unit.score for every unit: each position adds 10 when wordPositionType(position.position) is truthy and 1 otherwise, and 2 more when the position is not stemmed; then returns summary.getSortedResult(). */ + function _sortResult (summary : SearchSummary) : SearchUnit[] + { + for (var i = 0; i < summary.result.units.length; i++) + { + var score = 0; + var unit = summary.result.units[i]; + for (var pos in unit.positions) + { + var position = unit.positions[pos]; + if (this._oktavia.wordPositionType(position.position)) + { + score += 10; + } + else + { + score += 1; + } + if (!position.stemmed) + { + score += 2; + } + } + unit.score = score; + } + return summary.getSortedResult(); + } +} + /* Entry point with an empty body. */ +class _Main +{ + static function main(args : string[]) : void + { + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-spanish-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-spanish-search.jsx new file mode 100644 index 00000000..3ad9b8d1 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-spanish-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/spanish-stemmer.jsx"; + /* Entry point: hands a new SpanishStemmer to OktaviaSearch.setStemmer. */ +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new SpanishStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-swedish-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-swedish-search.jsx new file mode 100644 index 00000000..9b900a48 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-swedish-search.jsx @@ -0,0 +1,10 @@ +import 
"oktavia-search.jsx"; +import "stemmer/swedish-stemmer.jsx"; + /* Entry point: hands a new SwedishStemmer to OktaviaSearch.setStemmer. */ +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new SwedishStemmer); + } +} diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-turkish-search.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-turkish-search.jsx new file mode 100644 index 00000000..24d04058 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/tool/web/oktavia-turkish-search.jsx @@ -0,0 +1,10 @@ +import "oktavia-search.jsx"; +import "stemmer/turkish-stemmer.jsx"; + /* Entry point: hands a new TurkishStemmer to OktaviaSearch.setStemmer. */ +class _Main +{ + static function main(args : string[]) : void + { + OktaviaSearch.setStemmer(new TurkishStemmer); + } +} |