From be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 04:57:58 +0200 Subject: Adding upstream version 1.44.3. Signed-off-by: Daniel Baumann --- .../h2o/libh2o/misc/oktavia/src/htmlparser.jsx | 280 +++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 web/server/h2o/libh2o/misc/oktavia/src/htmlparser.jsx (limited to 'web/server/h2o/libh2o/misc/oktavia/src/htmlparser.jsx') diff --git a/web/server/h2o/libh2o/misc/oktavia/src/htmlparser.jsx b/web/server/h2o/libh2o/misc/oktavia/src/htmlparser.jsx new file mode 100644 index 00000000..3d0ace78 --- /dev/null +++ b/web/server/h2o/libh2o/misc/oktavia/src/htmlparser.jsx @@ -0,0 +1,280 @@ +import "console.jsx"; +import "js/nodejs.jsx"; +import "oktavia.jsx"; +import "metadata.jsx"; +import "sax.jsx"; +import "stemmer/stemmer.jsx"; + + +class _HTMLHandler extends SAXHandler +{ + var startParse : boolean; + var startTag : string; + var stack : string []; + var oktavia : Oktavia; + var section : Section; + var tag : Block; + var filter : TagFilter; + var filepath : string; + var unit : int; + var currentLink : string; + var currentTitle : string; + var lastId : string; + var waitTitle : boolean; + var sectionCount : int; + var inCode : boolean; + var addText : boolean; + + function constructor (oktavia : Oktavia, filepath : string, unit : int, filter : TagFilter) + { + super(); + this.startParse = false; + this.stack = [] : string[]; + this.oktavia = oktavia; + this.section = this.oktavia.getSection('section'); + this.tag = this.oktavia.getBlock('tag'); + this.unit = unit; + this.filter = filter; + this.filepath = filepath; + this.currentTitle = ''; + this.lastId = ''; + this.waitTitle = false; + this.addText = false; + } + + override function onready () : void + { + this.currentLink = this.filepath; + this.inCode = false; + } + + override function onopentag (tagname : string, attributes : Map.) : void + { + var headingId = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']; + if (this.startParse) + { + this.stack.push(tagname); + if ('id' in attributes) + { + this.lastId = attributes['id']; + } + switch (tagname) + { + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + if (headingId.indexOf(tagname) < this.unit) + { + if (this.oktavia.contentSize() > 0) + { + this.section.setTail(this.currentTitle + Oktavia.eob + this.currentLink); + } + this.currentLink = this.filepath + '#' + this.lastId; + this.currentTitle = ''; + this.waitTitle = true; + } + this.oktavia.addEndOfBlock(); + this.tag.startBlock(tagname); + break; + case 'pre': + this.tag.startBlock('pre'); + this.oktavia.addEndOfBlock(); + this.inCode = true; + break; + case 'p': + case 'div': + case 'blockquote': + this.oktavia.addEndOfBlock(); + break; + } + } + else + { + if (this.filter.match(tagname, attributes)) + { + this.startParse = true; + this.startTag = tagname; + this.stack.push(tagname); + } + } + if (tagname == 'title') + { + this.waitTitle = true; + this.currentTitle = ''; + } + } + + override function onclosetag (tagname : string) : void + { + if (this.startParse) + { + switch (tagname) + { + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + this.tag.endBlock(); + if (this.addText) + { + this.oktavia.addWord('\n'); + this.addText = false; + } + this.waitTitle = false; + break; + case 'pre': + this.inCode = false; + this.tag.endBlock(); + if (this.addText) + { + this.oktavia.addWord('\n'); + this.addText = false; + } + break; + case 'div': + case 'p': + case 'blockquote': + if (this.addText) + { + this.oktavia.addWord('\n'); + this.addText = false; + } + break; + } + if (this.stack.length == 0) + { + this.startParse = false; + } + } + if (tagname == 'title') + { + this.waitTitle = false; + } + } + + override function ontext (text : string) : void + { + if (this.startParse) + { + this.oktavia.addWord(text, !this.inCode); + this.addText = true; + } + if (this.waitTitle) + { + this.currentTitle += text; + } + } + + override function onend () : void + { + this.section.setTail(this.currentTitle + Oktavia.eob + this.currentLink); + } +} + +class TagFilter +{ + var tags : string[]; + var ids : string[]; + var classes : string[]; + var tagAndClasses : string[]; + + function constructor (filters : string[]) + { + this.tags = [] : string[]; + this.ids = [] : string[]; + this.classes = [] : string[]; + this.tagAndClasses = [] : string[]; + + for (var i = 0; i < filters.length; i++) + { + var filter = filters[i]; + switch (filter.charAt(0)) + { + case '#': + this.ids.push(filter.slice(1)); + break; + case '.': + this.classes.push(filter.slice(1)); + break; + default: + if (filter.indexOf('.') != -1) + { + this.tags.push(filter); + } + else + { + this.tagAndClasses.push(filter); + } + } + } + } + + function match (tagname : string, attributes : Map.) : boolean + { + var result = false; + if (this.tags.indexOf(tagname) != -1) + { + result = true; + } + else if (attributes['id'] && this.ids.indexOf(attributes['id']) != -1) + { + result = true; + } + else if (attributes['class']) + { + var classname = attributes['class']; + if (this.classes.indexOf(classname) != -1 || + this.tagAndClasses.indexOf(tagname + '.' + classname) != -1) + { + result = true; + } + } + return result; + } +} + +class HTMLParser +{ + var oktavia : Oktavia; + var unit : int; + var root : string; + var prefix : string; + var filter : TagFilter; + + function constructor (unit : int, root : string, prefix : string, filter : string[], stemmer : Nullable.) + { + this.unit = unit; + this.root = root; + this.prefix = prefix; + this.filter = new TagFilter(filter); + this.oktavia = new Oktavia(); + this.oktavia.addSection('section'); + this.oktavia.addBlock('tag'); + if (stemmer) + { + this.oktavia.setStemmer(stemmer); + } + } + + function parse (filepath : string) : void + { + var relative = this.prefix + node.path.relative(this.root, filepath); + console.log('reading: ' + relative); + var lines = node.fs.readFileSync(filepath, 'utf8'); + var handler = new _HTMLHandler(this.oktavia, relative, this.unit, this.filter); + var parser = new SAXParser(handler); + parser.parse(lines); + } + + function dump (cacheDensity : int, verbose : boolean) : string + { + console.log('\nbuilding...\n'); + this.oktavia.build(cacheDensity, verbose); + return this.oktavia.dump(verbose); + } +} -- cgit v1.2.3