diff options
Diffstat (limited to 'web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx')
-rw-r--r-- | web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx | 470 |
1 files changed, 0 insertions, 470 deletions
diff --git a/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx b/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx deleted file mode 100644 index f2593bc9e..000000000 --- a/web/server/h2o/libh2o/misc/oktavia/tool/oktavia-mkindex.jsx +++ /dev/null @@ -1,470 +0,0 @@ -import "console.jsx"; -import "js/nodejs.jsx"; - -import "oktavia.jsx"; -import "getopt.jsx"; -import "htmlparser.jsx"; -import "csvparser.jsx"; -import "textparser.jsx"; -import "binary-util.jsx"; - -import "stemmer/stemmer.jsx"; -import "stemmer/danish-stemmer.jsx"; -import "stemmer/dutch-stemmer.jsx"; -import "stemmer/english-stemmer.jsx"; -import "stemmer/finnish-stemmer.jsx"; -import "stemmer/french-stemmer.jsx"; -import "stemmer/german-stemmer.jsx"; -import "stemmer/hungarian-stemmer.jsx"; -import "stemmer/italian-stemmer.jsx"; -import "stemmer/norwegian-stemmer.jsx"; -import "stemmer/porter-stemmer.jsx"; -import "stemmer/portuguese-stemmer.jsx"; -import "stemmer/romanian-stemmer.jsx"; -import "stemmer/russian-stemmer.jsx"; -import "stemmer/spanish-stemmer.jsx"; -import "stemmer/swedish-stemmer.jsx"; -import "stemmer/turkish-stemmer.jsx"; - - -class _Main -{ - static function usage () : void - { - console.log([ - "usage: oktavia_mkindex [options]", - "", - "Common Options:", - " -i, --input [input folder/file ] : Target files to search. .html, .csv, .txt are available.", - " -o, --output [outputfolder] : Directory that will store output files.", - " : This is a relative path from root.", - " : Default value is 'search'. ", - " -t, --type [type] : Export type. 'index'(default), 'base64', 'cmd', 'js',", - " : 'commonjs' are available.", - " : 'index' is a just index file. 'cmd' is a base64 code with search program.", - " : Others are base64 source code style output.", - " -m, --mode [mode] : Mode type. 'html', 'csv', 'text' are available.", - " -c, --cache-density [percent] : Cache data density. It effects file size and search speed.", - " : 100% become four times of base index file size. Default value is 5%.", - " : Valid value is 0.1% - 100%.", - " -n, --name [function] : A variable name for 'js' output or property name", - " : for 'js' and 'commonjs'. Default value is 'searchIndex'.", - " -q, --quiet : Hide detail information.", - " -h, --help : Display this message.", - "", - "HTML Mode Options:", - " -r, --root [document root] : Document root folder. Default is current. ", - " : Indexer creates result file path from this folder.", - " -p, --prefix [directory prefix] : Directory prefix for a document root from a server root.", - " : If your domain is example.com and 'manual' is passed,", - " : document root become http://example.com/manual/.", - " : It effects search result URL. Default value is '/'.", - " -u, --unit [search unit] : 'file', 'h1'-'h6'. Default value is 'file'.", - " -f, --filter [target tag] : Only contents inside this tag is indexed.", - " : Default value is \"article,#content,#main,div.body\".", - " -s, --stemmer [algorithm] : Select stemming algorithm.", - " -w, --word-splitter [splitter] : Use optional word splitter.", - " : 'ts' (TinySegmenter for Japanese) is available", - "", - "Text Mode Options:", - " -s, --stemmer [algorithm] : Select stemming algorithm.", - " -w, --word-splitter [splitter] : Use optional word splitter.", - " : 'ts' (TinySegmenter for Japanese) is available", - " -u, --unit [search unit] : file, block, line. Default value is 'file'.", - "", - "Supported Stemmer Algorithms:", - " danish, dutch, english, finnish, french german, hungarian italian", - " norwegian, porter, portuguese, romanian, russian, spanish, swedish, turkish" - ].join('\n')); - } - - static function main(args : string[]) : void - { - console.log("Search Engine Oktavia - Index Generator\n"); - - var inputs = [] : string[]; - var root = process.cwd(); - var prefix = '/'; - var output = "search"; - var showhelp = false; - var notrun = false; - var unit = 'file'; - var type = 'js'; - var mode = ''; - var verbose = true; - var filter = [] : string[]; - var algorithm : Nullable.<string> = null; - var wordsplitter : Nullable.<string> = null; - var cacheDensity : number = 5.0; - var name = null : Nullable.<string>; - var validModes = ['html', 'csv', 'text']; - var validUnitsForHTML = ['file', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']; - var validUnitsForText = ['file', 'block', 'line']; - var validStemmers = [ - 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', - 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', - 'spanish', 'swedish', 'turkish' - ]; - var validTypes = ['index', 'base64', 'cmd', 'js', 'commonjs']; - var validWordSplitters = ['ts']; - - var optstring = "n:(name)q(quiet)m:(mode)i:(input)r:(root)p:(prefix)o:(output)h(help)u:(unit)f:(filter)s:(stemmer)w:(word-splitter)t:(type)c:(cache-density)"; - var parser = new BasicParser(optstring, args); - var opt = parser.getopt(); - while (opt) - { - switch (opt.option) - { - case "m": - if (validModes.indexOf(opt.optarg) == -1) - { - console.error("Option m/mode should be 'html', 'csv', 'text'."); - notrun = true; - } - mode = opt.optarg; - break; - case "i": - inputs.push(opt.optarg); - break; - case "r": - root = node.path.resolve(opt.optarg); - break; - case "p": - prefix = opt.optarg; - break; - case "n": - name = opt.optarg; - break; - case "o": - output = opt.optarg; - if (output.slice(0, 1) == '/') - { - output = output.slice(1); - } - break; - case "h": - showhelp = true; - break; - case "q": - verbose = false; - break; - case "u": - unit = opt.optarg; - break; - case "f": - var items = opt.optarg.split(','); - for (var i in items) - { - filter.push(items[i]); - } - break; - case "t": - if (validTypes.indexOf(opt.optarg) == -1) - { - console.error('Option -t/--type is invalid.'); - notrun = true; - } - else - { - type = opt.optarg; - } - break; - case "s": - if (validStemmers.indexOf(opt.optarg) == -1) - { - console.error('Option -s/--stemmer is invalid.'); - notrun = true; - } - else - { - algorithm = opt.optarg; - } - break; - case "w": - - break; - case "c": - var match = /(\d+\.?\d*)/.exec(opt.optarg); - if (match) - { - var tmpValue = match[1] as number; - if (0.1 <= tmpValue && tmpValue <= 100) - { - cacheDensity = tmpValue; - } - else - { - console.error('Option -c/--cache-density should be in 0.1 - 100.'); - notrun = true; - } - } - else - { - console.error('Option -c/--cache-density is invalid.'); - notrun = true; - } - break; - case "?": - notrun = true; - break; - } - opt = parser.getopt(); - } - var inputTextFiles = [] : string[]; - var inputHTMLFiles = [] : string[]; - var inputCSVFiles = [] : string[]; - if (filter.length == 0) - { - filter = ['article', '#content', '#main', 'div.body']; - } - for (var i in inputs) - { - var input = inputs[i]; - if (!node.fs.existsSync(input)) - { - console.error("Following input folder/file doesn't exist: " + input); - notrun = true; - } - else - { - var stat = node.fs.statSync(input); - if (stat.isFile()) - { - _Main._checkFileType(node.path.resolve(input), inputTextFiles, inputHTMLFiles, inputCSVFiles); - } - else if (stat.isDirectory()) - { - _Main._checkDirectory(input, inputTextFiles, inputHTMLFiles, inputCSVFiles); - } - else - { - console.error("Following input is not folder or file: " + input); - notrun = true; - } - } - } - if (inputTextFiles.length == 0 && inputHTMLFiles.length == 0 && inputCSVFiles.length == 0 || !mode) - { - showhelp = true; - } - if (showhelp) - { - _Main.usage(); - } - else if (!notrun) - { - var stemmer : Nullable.<Stemmer> = null; - if (algorithm) - { - stemmer = _Main._createStemmer(algorithm); - } - var dump = null : Nullable.<string>; - switch (mode) - { - case 'html': - var unitIndex = validUnitsForHTML.indexOf(unit); - if (unitIndex == -1) - { - console.error('Option -u/--unit should be file, h1, h2, h3, h4, h5, h6. But ' + unit); - } - else - { - var htmlParser = new HTMLParser(unitIndex, root, prefix, filter, stemmer); - for (var i = 0; i < inputHTMLFiles.length; i++) - { - htmlParser.parse(inputHTMLFiles[i]); - } - console.log('generating index...'); - if (verbose) - { - console.log(''); - } - dump = htmlParser.dump(cacheDensity, verbose); - } - break; - case 'csv': - var csvParser = new CSVParser(root, stemmer); - for (var i in inputCSVFiles) - { - csvParser.parse(inputCSVFiles[i]); - } - break; - case 'text': - if (validUnitsForText.indexOf(unit) == -1) - { - console.error('Option u/unit should be file, block, line. But ' + unit); - } - else - { - var textParser = new TextParser(unit, root, stemmer); - for (var i in inputTextFiles) - { - textParser.parse(inputTextFiles[i]); - } - } - break; - } - if (dump) - { - var indexFilePath = ""; - switch (type) - { - case 'index': - indexFilePath = node.path.resolve(root, output, 'searchindex.okt'); - var dirPath = node.path.dirname(indexFilePath); - _Main._mkdirP(dirPath); - node.fs.writeFileSync(indexFilePath, dump, "utf16le"); - break; - case 'base64': - indexFilePath = node.path.resolve(root, output, 'searchindex.okt.b64'); - var dirPath = node.path.dirname(indexFilePath); - _Main._mkdirP(dirPath); - node.fs.writeFileSync(indexFilePath, Binary.base64encode(dump), "utf8"); - break; - case 'cmd': - break; - case 'js': - indexFilePath = node.path.resolve(root, output, 'searchindex.js'); - var dirPath = node.path.dirname(indexFilePath); - _Main._mkdirP(dirPath); - if (name == null) - { - name = 'searchIndex'; - } - var contents = [ - '// Oktavia Search Index', - 'var ' + name + ' = "' + Binary.base64encode(dump) + '";', '' - ]; - node.fs.writeFileSync(indexFilePath, contents.join('\n'), "utf8"); - break; - case 'commonjs': - indexFilePath = node.path.resolve(root, output, 'searchindex.js'); - var dirPath = node.path.dirname(indexFilePath); - _Main._mkdirP(dirPath); - if (name == null) - { - name = 'searchIndex'; - } - var contents = [ - '// Oktavia Search Index', - 'exports.' + name + ' = "' + Binary.base64encode(dump) + '";', '' - ]; - node.fs.writeFileSync(indexFilePath, contents.join('\n'), "utf8"); - break; - } - if (indexFilePath) - { - console.log("generated: " + indexFilePath); - } - } - } - } - - static function _checkFileType (path : string, texts : string[], HTMLs : string[], CSVs : string[]) : void - { - var match = path.match(/(.*)\.(.*)/); - if (match && match[1]) - { - switch (match[2].toLowerCase()) - { - case 'html': - case 'htm': - HTMLs.push(path); - break; - case 'csv': - CSVs.push(path); - break; - default: - texts.push(path); - } - } - } - - static function _checkDirectory (path : string, texts : string[], HTMLs : string[], CSVs : string[]) : void - { - var files = node.fs.readdirSync(path); - for (var j in files) - { - var filepath = node.path.resolve(path, files[j]); - var stat = node.fs.statSync(filepath); - if (stat.isFile()) - { - _Main._checkFileType(filepath, texts, HTMLs, CSVs); - } - else if (stat.isDirectory()) - { - _Main._checkDirectory(filepath, texts, HTMLs, CSVs); - } - } - } - - static function _mkdirP (path : string) : void - { - if (node.fs.existsSync(path)) - { - return; - } - _Main._mkdirP(node.path.dirname(path)); - node.fs.mkdirSync(path); - } - - static function _createStemmer (algorithm : string) : Stemmer - { - var stemmer : Stemmer; - switch (algorithm.toLowerCase()) - { - case "danish": - stemmer = new DanishStemmer(); - break; - case "dutch": - stemmer = new DutchStemmer(); - break; - case "english": - stemmer = new EnglishStemmer(); - break; - case "finnish": - stemmer = new FinnishStemmer(); - break; - case "french": - stemmer = new FrenchStemmer(); - break; - case "german": - stemmer = new GermanStemmer(); - break; - case "hungarian": - stemmer = new HungarianStemmer(); - break; - case "italian": - stemmer = new ItalianStemmer(); - break; - case "norwegian": - stemmer = new NorwegianStemmer(); - break; - case "porter": - stemmer = new PorterStemmer(); - break; - case "portuguese": - stemmer = new PortugueseStemmer(); - break; - case "romanian": - stemmer = new RomanianStemmer(); - break; - case "russian": - stemmer = new RussianStemmer(); - break; - case "spanish": - stemmer = new SpanishStemmer(); - break; - case "swedish": - stemmer = new SwedishStemmer(); - break; - case "turkish": - stemmer = new TurkishStemmer(); - break; - default: - stemmer = new EnglishStemmer(); - break; - } - return stemmer; - } -} |