summaryrefslogtreecommitdiffstats
path: root/src/HtmlParser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/HtmlParser.cc')
-rw-r--r--src/HtmlParser.cc217
1 files changed, 217 insertions, 0 deletions
diff --git a/src/HtmlParser.cc b/src/HtmlParser.cc
new file mode 100644
index 0000000..591c4c7
--- /dev/null
+++ b/src/HtmlParser.cc
@@ -0,0 +1,217 @@
+/*
+ * nghttp2 - HTTP/2 C Library
+ *
+ * Copyright (c) 2012 Tatsuhiro Tsujikawa
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "HtmlParser.h"
+
+#include <libxml/uri.h>
+
+#include "util.h"
+
+namespace nghttp2 {
+
+ParserData::ParserData(const std::string &base_uri)
+ : base_uri(base_uri), inside_head(0) {}
+
+HtmlParser::HtmlParser(const std::string &base_uri)
+ : base_uri_(base_uri), parser_ctx_(nullptr), parser_data_(base_uri) {}
+
+HtmlParser::~HtmlParser() { htmlFreeParserCtxt(parser_ctx_); }
+
+namespace {
+StringRef get_attr(const xmlChar **attrs, const StringRef &name) {
+ if (attrs == nullptr) {
+ return StringRef{};
+ }
+ for (; *attrs; attrs += 2) {
+ if (util::strieq(StringRef{attrs[0], strlen(reinterpret_cast<const char *>(
+ attrs[0]))},
+ name)) {
+ return StringRef{attrs[1],
+ strlen(reinterpret_cast<const char *>(attrs[1]))};
+ }
+ }
+ return StringRef{};
+}
+} // namespace
+
+namespace {
+ResourceType
+get_resource_type_for_preload_as(const StringRef &attribute_value) {
+ if (util::strieq_l("image", attribute_value)) {
+ return REQ_IMG;
+ } else if (util::strieq_l("style", attribute_value)) {
+ return REQ_CSS;
+ } else if (util::strieq_l("script", attribute_value)) {
+ return REQ_UNBLOCK_JS;
+ } else {
+ return REQ_OTHERS;
+ }
+}
+} // namespace
+
+namespace {
+void add_link(ParserData *parser_data, const StringRef &uri,
+ ResourceType res_type) {
+ auto u = xmlBuildURI(
+ reinterpret_cast<const xmlChar *>(uri.c_str()),
+ reinterpret_cast<const xmlChar *>(parser_data->base_uri.c_str()));
+ if (u) {
+ parser_data->links.push_back(
+ std::make_pair(reinterpret_cast<char *>(u), res_type));
+ free(u);
+ }
+}
+} // namespace
+
+namespace {
+void start_element_func(void *user_data, const xmlChar *src_name,
+ const xmlChar **attrs) {
+ auto parser_data = static_cast<ParserData *>(user_data);
+ auto name =
+ StringRef{src_name, strlen(reinterpret_cast<const char *>(src_name))};
+ if (util::strieq_l("head", name)) {
+ ++parser_data->inside_head;
+ }
+ if (util::strieq_l("link", name)) {
+ auto rel_attr = get_attr(attrs, StringRef::from_lit("rel"));
+ auto href_attr = get_attr(attrs, StringRef::from_lit("href"));
+ if (rel_attr.empty() || href_attr.empty()) {
+ return;
+ }
+ if (util::strieq_l("shortcut icon", rel_attr)) {
+ add_link(parser_data, href_attr, REQ_OTHERS);
+ } else if (util::strieq_l("stylesheet", rel_attr)) {
+ add_link(parser_data, href_attr, REQ_CSS);
+ } else if (util::strieq_l("preload", rel_attr)) {
+ auto as_attr = get_attr(attrs, StringRef::from_lit("as"));
+ if (as_attr.empty()) {
+ return;
+ }
+ add_link(parser_data, href_attr,
+ get_resource_type_for_preload_as(as_attr));
+ }
+ } else if (util::strieq_l("img", name)) {
+ auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
+ if (src_attr.empty()) {
+ return;
+ }
+ add_link(parser_data, src_attr, REQ_IMG);
+ } else if (util::strieq_l("script", name)) {
+ auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
+ if (src_attr.empty()) {
+ return;
+ }
+ if (parser_data->inside_head) {
+ add_link(parser_data, src_attr, REQ_JS);
+ } else {
+ add_link(parser_data, src_attr, REQ_UNBLOCK_JS);
+ }
+ }
+}
+} // namespace
+
+namespace {
+void end_element_func(void *user_data, const xmlChar *name) {
+ auto parser_data = static_cast<ParserData *>(user_data);
+ if (util::strieq_l(
+ "head",
+ StringRef{name, strlen(reinterpret_cast<const char *>(name))})) {
+ --parser_data->inside_head;
+ }
+}
+} // namespace
+
+namespace {
+xmlSAXHandler saxHandler = {
+ nullptr, // internalSubsetSAXFunc
+ nullptr, // isStandaloneSAXFunc
+ nullptr, // hasInternalSubsetSAXFunc
+ nullptr, // hasExternalSubsetSAXFunc
+ nullptr, // resolveEntitySAXFunc
+ nullptr, // getEntitySAXFunc
+ nullptr, // entityDeclSAXFunc
+ nullptr, // notationDeclSAXFunc
+ nullptr, // attributeDeclSAXFunc
+ nullptr, // elementDeclSAXFunc
+ nullptr, // unparsedEntityDeclSAXFunc
+ nullptr, // setDocumentLocatorSAXFunc
+ nullptr, // startDocumentSAXFunc
+ nullptr, // endDocumentSAXFunc
+ &start_element_func, // startElementSAXFunc
+ &end_element_func, // endElementSAXFunc
+ nullptr, // referenceSAXFunc
+ nullptr, // charactersSAXFunc
+ nullptr, // ignorableWhitespaceSAXFunc
+ nullptr, // processingInstructionSAXFunc
+ nullptr, // commentSAXFunc
+ nullptr, // warningSAXFunc
+ nullptr, // errorSAXFunc
+ nullptr, // fatalErrorSAXFunc
+ nullptr, // getParameterEntitySAXFunc
+ nullptr, // cdataBlockSAXFunc
+ nullptr, // externalSubsetSAXFunc
+ 0, // unsigned int initialized
+ nullptr, // void * _private
+ nullptr, // startElementNsSAX2Func
+ nullptr, // endElementNsSAX2Func
+ nullptr, // xmlStructuredErrorFunc
+};
+} // namespace
+
+int HtmlParser::parse_chunk(const char *chunk, size_t size, int fin) {
+ if (!parser_ctx_) {
+ parser_ctx_ =
+ htmlCreatePushParserCtxt(&saxHandler, &parser_data_, chunk, size,
+ base_uri_.c_str(), XML_CHAR_ENCODING_NONE);
+ if (!parser_ctx_) {
+ return -1;
+ } else {
+ if (fin) {
+ return parse_chunk_internal(nullptr, 0, fin);
+ } else {
+ return 0;
+ }
+ }
+ } else {
+ return parse_chunk_internal(chunk, size, fin);
+ }
+}
+
+int HtmlParser::parse_chunk_internal(const char *chunk, size_t size, int fin) {
+ int rv = htmlParseChunk(parser_ctx_, chunk, size, fin);
+ if (rv == 0) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+const std::vector<std::pair<std::string, ResourceType>> &
+HtmlParser::get_links() const {
+ return parser_data_.links;
+}
+
+void HtmlParser::clear_links() { parser_data_.links.clear(); }
+
+} // namespace nghttp2