summaryrefslogtreecommitdiffstats
path: root/xmlreader/source/xmlreader.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'xmlreader/source/xmlreader.cxx')
-rw-r--r--xmlreader/source/xmlreader.cxx941
1 files changed, 941 insertions, 0 deletions
diff --git a/xmlreader/source/xmlreader.cxx b/xmlreader/source/xmlreader.cxx
new file mode 100644
index 000000000..5153db2fb
--- /dev/null
+++ b/xmlreader/source/xmlreader.cxx
@@ -0,0 +1,941 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+#include <climits>
+
+#include <com/sun/star/container/NoSuchElementException.hpp>
+#include <com/sun/star/uno/RuntimeException.hpp>
+#include <osl/file.h>
+#include <rtl/character.hxx>
+#include <rtl/string.h>
+#include <rtl/ustring.hxx>
+#include <sal/log.hxx>
+#include <sal/types.h>
+#include <xmlreader/pad.hxx>
+#include <xmlreader/span.hxx>
+#include <xmlreader/xmlreader.hxx>
+
+namespace xmlreader {
+
+namespace {
+
+bool isSpace(char c) {
+ switch (c) {
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ case ' ':
+ return true;
+ default:
+ return false;
+ }
+}
+
+}
+
+XmlReader::XmlReader(OUString const & fileUrl)
+ : fileUrl_(fileUrl)
+ , fileHandle_(nullptr)
+{
+ oslFileError e = osl_openFile(
+ fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
+ switch (e)
+ {
+ case osl_File_E_None:
+ break;
+ case osl_File_E_NOENT:
+ throw css::container::NoSuchElementException( fileUrl_ );
+ default:
+ throw css::uno::RuntimeException(
+ "cannot open " + fileUrl_ + ": " + OUString::number(e));
+ }
+ e = osl_getFileSize(fileHandle_, &fileSize_);
+ if (e == osl_File_E_None) {
+ e = osl_mapFile(
+ fileHandle_, &fileAddress_, fileSize_, 0,
+ osl_File_MapFlag_WillNeed);
+ }
+ if (e != osl_File_E_None) {
+ oslFileError e2 = osl_closeFile(fileHandle_);
+ if (e2 != osl_File_E_None) {
+ SAL_WARN(
+ "xmlreader",
+ "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
+ }
+ throw css::uno::RuntimeException(
+ "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
+ }
+ namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
+ namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
+ pos_ = static_cast< char * >(fileAddress_);
+ end_ = pos_ + fileSize_;
+ state_ = State::Content;
+ firstAttribute_ = true;
+}
+
+XmlReader::~XmlReader() {
+ if (!fileHandle_)
+ return;
+ oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
+ if (e != osl_File_E_None) {
+ SAL_WARN(
+ "xmlreader",
+ "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
+ }
+ e = osl_closeFile(fileHandle_);
+ if (e != osl_File_E_None) {
+ SAL_WARN(
+ "xmlreader",
+ "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
+ }
+}
+
+int XmlReader::registerNamespaceIri(Span const & iri) {
+ int id = toNamespaceId(namespaceIris_.size());
+ namespaceIris_.push_back(iri);
+ if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
+ // Old user layer .xcu files used the xsi namespace prefix without
+ // declaring a corresponding namespace binding, see issue 77174; reading
+ // those files during migration would fail without this hack that can be
+ // removed once migration is no longer relevant (see
+ // configmgr::Components::parseModificationLayer):
+ namespaces_.emplace_back(Span("xsi"), id);
+ }
+ return id;
+}
+
+XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
+{
+ switch (state_) {
+ case State::Content:
+ switch (reportText) {
+ case Text::NONE:
+ return handleSkippedText(data, nsId);
+ case Text::Raw:
+ return handleRawText(data);
+ default: // Text::Normalized
+ return handleNormalizedText(data);
+ }
+ case State::StartTag:
+ return handleStartTag(nsId, data);
+ case State::EndTag:
+ return handleEndTag();
+ case State::EmptyElementTag:
+ handleElementEnd();
+ return Result::End;
+ default: // State::Done
+ return Result::Done;
+ }
+}
+
+bool XmlReader::nextAttribute(int * nsId, Span * localName) {
+ assert(nsId != nullptr && localName != nullptr);
+ if (firstAttribute_) {
+ currentAttribute_ = attributes_.begin();
+ firstAttribute_ = false;
+ } else {
+ ++currentAttribute_;
+ }
+ if (currentAttribute_ == attributes_.end()) {
+ return false;
+ }
+ if (currentAttribute_->nameColon == nullptr) {
+ *nsId = NAMESPACE_NONE;
+ *localName = Span(
+ currentAttribute_->nameBegin,
+ currentAttribute_->nameEnd - currentAttribute_->nameBegin);
+ } else {
+ *nsId = getNamespaceId(
+ Span(
+ currentAttribute_->nameBegin,
+ currentAttribute_->nameColon - currentAttribute_->nameBegin));
+ *localName = Span(
+ currentAttribute_->nameColon + 1,
+ currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
+ }
+ return true;
+}
+
+Span XmlReader::getAttributeValue(bool fullyNormalize) {
+ return handleAttributeValue(
+ currentAttribute_->valueBegin, currentAttribute_->valueEnd,
+ fullyNormalize);
+}
+
+int XmlReader::getNamespaceId(Span const & prefix) const {
+ auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
+ [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
+
+ if (i != namespaces_.rend())
+ return i->nsId;
+
+ return NAMESPACE_UNKNOWN;
+}
+
+
+void XmlReader::normalizeLineEnds(Span const & text) {
+ char const * p = text.begin;
+ sal_Int32 n = text.length;
+ for (;;) {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
+ if (i < 0) {
+ break;
+ }
+ pad_.add(p, i);
+ p += i + 1;
+ n -= i + 1;
+ if (n == 0 || *p != '\x0A') {
+ pad_.add("\x0A");
+ }
+ }
+ pad_.add(p, n);
+}
+
+void XmlReader::skipSpace() {
+ while (isSpace(peek())) {
+ ++pos_;
+ }
+}
+
+bool XmlReader::skipComment() {
+ if (rtl_str_shortenedCompare_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
+ RTL_CONSTASCII_LENGTH("--")) !=
+ 0)
+ {
+ return false;
+ }
+ pos_ += RTL_CONSTASCII_LENGTH("--");
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "premature end (within comment) of " + fileUrl_ );
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("--");
+ if (read() != '>') {
+ throw css::uno::RuntimeException(
+ "illegal \"--\" within comment in " + fileUrl_ );
+ }
+ return true;
+}
+
+void XmlReader::skipProcessingInstruction() {
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "bad '<?' in " + fileUrl_ );
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("?>");
+}
+
+void XmlReader::skipDocumentTypeDeclaration() {
+ // Neither is it checked that the doctypedecl is at the correct position in
+ // the document, nor that it is well-formed:
+ for (;;) {
+ char c = read();
+ switch (c) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ "premature end (within DTD) of " + fileUrl_ );
+ case '"':
+ case '\'':
+ {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(
+ pos_, end_ - pos_, c);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "premature end (within DTD) of " + fileUrl_ );
+ }
+ pos_ += i + 1;
+ }
+ break;
+ case '>':
+ return;
+ case '[':
+ for (;;) {
+ c = read();
+ switch (c) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ "premature end (within DTD) of " + fileUrl_ );
+ case '"':
+ case '\'':
+ {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(
+ pos_, end_ - pos_, c);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "premature end (within DTD) of " + fileUrl_ );
+ }
+ pos_ += i + 1;
+ }
+ break;
+ case '<':
+ switch (read()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ "premature end (within DTD) of " + fileUrl_ );
+ case '!':
+ skipComment();
+ break;
+ case '?':
+ skipProcessingInstruction();
+ break;
+ default:
+ break;
+ }
+ break;
+ case ']':
+ skipSpace();
+ if (read() != '>') {
+ throw css::uno::RuntimeException(
+ "missing \">\" of DTD in " + fileUrl_ );
+ }
+ return;
+ default:
+ break;
+ }
+ }
+ default:
+ break;
+ }
+ }
+}
+
+Span XmlReader::scanCdataSection() {
+ if (rtl_str_shortenedCompare_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
+ RTL_CONSTASCII_LENGTH("[CDATA[")) !=
+ 0)
+ {
+ return Span();
+ }
+ pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
+ char const * begin = pos_;
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "premature end (within CDATA section) of " + fileUrl_ );
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
+ return Span(begin, i);
+}
+
+bool XmlReader::scanName(char const ** nameColon) {
+ assert(nameColon != nullptr && *nameColon == nullptr);
+ for (char const * begin = pos_;; ++pos_) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ case ' ':
+ case '/':
+ case '=':
+ case '>':
+ return pos_ != begin;
+ case ':':
+ *nameColon = pos_;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
+ assert(begin != nullptr && begin <= end);
+ Span iri(handleAttributeValue(begin, end, false));
+ for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
+ if (namespaceIris_[i] == iri) {
+ return toNamespaceId(i);
+ }
+ }
+ return XmlReader::NAMESPACE_UNKNOWN;
+}
+
+char const * XmlReader::handleReference(char const * position, char const * end)
+{
+ assert(position != nullptr && *position == '&' && position < end);
+ ++position;
+ if (*position == '#') {
+ ++position;
+ sal_uInt32 val = 0;
+ char const * p;
+ if (*position == 'x') {
+ ++position;
+ p = position;
+ for (;; ++position) {
+ char c = *position;
+ if (c >= '0' && c <= '9') {
+ val = 16 * val + (c - '0');
+ } else if (c >= 'A' && c <= 'F') {
+ val = 16 * val + (c - 'A') + 10;
+ } else if (c >= 'a' && c <= 'f') {
+ val = 16 * val + (c - 'a') + 10;
+ } else {
+ break;
+ }
+ if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
+ throw css::uno::RuntimeException(
+ "'&#x...' too large in " + fileUrl_ );
+ }
+ }
+ } else {
+ p = position;
+ for (;; ++position) {
+ char c = *position;
+ if (c >= '0' && c <= '9') {
+ val = 10 * val + (c - '0');
+ } else {
+ break;
+ }
+ if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
+ throw css::uno::RuntimeException(
+ "'&#...' too large in " + fileUrl_ );
+ }
+ }
+ }
+ if (position == p || *position++ != ';') {
+ throw css::uno::RuntimeException(
+ "'&#...' missing ';' in " + fileUrl_ );
+ }
+ assert(rtl::isUnicodeCodePoint(val));
+ if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
+ (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
+ {
+ throw css::uno::RuntimeException(
+ "character reference denoting invalid character in " + fileUrl_ );
+ }
+ char buf[4];
+ sal_Int32 len;
+ if (val < 0x80) {
+ buf[0] = static_cast< char >(val);
+ len = 1;
+ } else if (val < 0x800) {
+ buf[0] = static_cast< char >((val >> 6) | 0xC0);
+ buf[1] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 2;
+ } else if (val < 0x10000) {
+ buf[0] = static_cast< char >((val >> 12) | 0xE0);
+ buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+ buf[2] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 3;
+ } else {
+ buf[0] = static_cast< char >((val >> 18) | 0xF0);
+ buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
+ buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+ buf[3] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 4;
+ }
+ pad_.addEphemeral(buf, len);
+ return position;
+ } else {
+ struct EntityRef {
+ char const * inBegin;
+ sal_Int32 const inLength;
+ char const * outBegin;
+ sal_Int32 const outLength;
+ };
+ static EntityRef const refs[] = {
+ { RTL_CONSTASCII_STRINGPARAM("amp;"),
+ RTL_CONSTASCII_STRINGPARAM("&") },
+ { RTL_CONSTASCII_STRINGPARAM("lt;"),
+ RTL_CONSTASCII_STRINGPARAM("<") },
+ { RTL_CONSTASCII_STRINGPARAM("gt;"),
+ RTL_CONSTASCII_STRINGPARAM(">") },
+ { RTL_CONSTASCII_STRINGPARAM("apos;"),
+ RTL_CONSTASCII_STRINGPARAM("'") },
+ { RTL_CONSTASCII_STRINGPARAM("quot;"),
+ RTL_CONSTASCII_STRINGPARAM("\"") } };
+ for (const auto & ref : refs) {
+ if (rtl_str_shortenedCompare_WithLength(
+ position, end - position, ref.inBegin, ref.inLength,
+ ref.inLength) ==
+ 0)
+ {
+ position += ref.inLength;
+ pad_.add(ref.outBegin, ref.outLength);
+ return position;
+ }
+ }
+ throw css::uno::RuntimeException(
+ "unknown entity reference in " + fileUrl_ );
+ }
+}
+
+Span XmlReader::handleAttributeValue(
+ char const * begin, char const * end, bool fullyNormalize)
+{
+ pad_.clear();
+ if (fullyNormalize) {
+ while (begin != end && isSpace(*begin)) {
+ ++begin;
+ }
+ while (end != begin && isSpace(end[-1])) {
+ --end;
+ }
+ char const * p = begin;
+ enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+ // a single true space character can go into the current span,
+ // everything else breaks the span
+ Space space = SPACE_NONE;
+ while (p != end) {
+ switch (*p) {
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ switch (space) {
+ case SPACE_NONE:
+ pad_.add(begin, p - begin);
+ pad_.add(" ");
+ space = SPACE_BREAK;
+ break;
+ case SPACE_SPAN:
+ pad_.add(begin, p - begin);
+ space = SPACE_BREAK;
+ break;
+ case SPACE_BREAK:
+ break;
+ }
+ begin = ++p;
+ break;
+ case ' ':
+ switch (space) {
+ case SPACE_NONE:
+ ++p;
+ space = SPACE_SPAN;
+ break;
+ case SPACE_SPAN:
+ pad_.add(begin, p - begin);
+ begin = ++p;
+ space = SPACE_BREAK;
+ break;
+ case SPACE_BREAK:
+ begin = ++p;
+ break;
+ }
+ break;
+ case '&':
+ pad_.add(begin, p - begin);
+ p = handleReference(p, end);
+ begin = p;
+ space = SPACE_NONE;
+ break;
+ default:
+ ++p;
+ space = SPACE_NONE;
+ break;
+ }
+ }
+ pad_.add(begin, p - begin);
+ } else {
+ char const * p = begin;
+ while (p != end) {
+ switch (*p) {
+ case '\x09':
+ case '\x0A':
+ pad_.add(begin, p - begin);
+ begin = ++p;
+ pad_.add(" ");
+ break;
+ case '\x0D':
+ pad_.add(begin, p - begin);
+ ++p;
+ if (peek() == '\x0A') {
+ ++p;
+ }
+ begin = p;
+ pad_.add(" ");
+ break;
+ case '&':
+ pad_.add(begin, p - begin);
+ p = handleReference(p, end);
+ begin = p;
+ break;
+ default:
+ ++p;
+ break;
+ }
+ }
+ pad_.add(begin, p - begin);
+ }
+ return pad_.get();
+}
+
+XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
+ assert(nsId != nullptr && localName);
+ char const * nameBegin = pos_;
+ char const * nameColon = nullptr;
+ if (!scanName(&nameColon)) {
+ throw css::uno::RuntimeException(
+ "bad tag name in " + fileUrl_ );
+ }
+ char const * nameEnd = pos_;
+ NamespaceList::size_type inheritedNamespaces = namespaces_.size();
+ bool hasDefaultNs = false;
+ int defaultNsId = NAMESPACE_NONE;
+ attributes_.clear();
+ for (;;) {
+ char const * p = pos_;
+ skipSpace();
+ if (peek() == '/' || peek() == '>') {
+ break;
+ }
+ if (pos_ == p) {
+ throw css::uno::RuntimeException(
+ "missing whitespace before attribute in " + fileUrl_ );
+ }
+ char const * attrNameBegin = pos_;
+ char const * attrNameColon = nullptr;
+ if (!scanName(&attrNameColon)) {
+ throw css::uno::RuntimeException(
+ "bad attribute name in " + fileUrl_ );
+ }
+ char const * attrNameEnd = pos_;
+ skipSpace();
+ if (read() != '=') {
+ throw css::uno::RuntimeException(
+ "missing '=' in " + fileUrl_ );
+ }
+ skipSpace();
+ char del = read();
+ if (del != '\'' && del != '"') {
+ throw css::uno::RuntimeException(
+ "bad attribute value in " + fileUrl_ );
+ }
+ char const * valueBegin = pos_;
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ "unterminated attribute value in " + fileUrl_ );
+ }
+ char const * valueEnd = pos_ + i;
+ pos_ += i + 1;
+ if (attrNameColon == nullptr &&
+ Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
+ {
+ hasDefaultNs = true;
+ defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
+ } else if (attrNameColon != nullptr &&
+ Span(attrNameBegin, attrNameColon - attrNameBegin) ==
+ "xmlns")
+ {
+ namespaces_.emplace_back(
+ Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
+ scanNamespaceIri(valueBegin, valueEnd));
+ } else {
+ attributes_.emplace_back(
+ attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
+ valueEnd);
+ }
+ }
+ if (!hasDefaultNs && !elements_.empty()) {
+ defaultNsId = elements_.top().defaultNamespaceId;
+ }
+ firstAttribute_ = true;
+ if (peek() == '/') {
+ state_ = State::EmptyElementTag;
+ ++pos_;
+ } else {
+ state_ = State::Content;
+ }
+ if (peek() != '>') {
+ throw css::uno::RuntimeException(
+ "missing '>' in " + fileUrl_ );
+ }
+ ++pos_;
+ elements_.push(
+ ElementData(
+ Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
+ defaultNsId));
+ if (nameColon == nullptr) {
+ *nsId = defaultNsId;
+ *localName = Span(nameBegin, nameEnd - nameBegin);
+ } else {
+ *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
+ *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
+ }
+ return Result::Begin;
+}
+
+XmlReader::Result XmlReader::handleEndTag() {
+ if (elements_.empty()) {
+ throw css::uno::RuntimeException(
+ "spurious end tag in " + fileUrl_ );
+ }
+ char const * nameBegin = pos_;
+ char const * nameColon = nullptr;
+ if (!scanName(&nameColon) ||
+ !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
+ {
+ throw css::uno::RuntimeException(
+ "tag mismatch in " + fileUrl_ );
+ }
+ handleElementEnd();
+ skipSpace();
+ if (peek() != '>') {
+ throw css::uno::RuntimeException(
+ "missing '>' in " + fileUrl_ );
+ }
+ ++pos_;
+ return Result::End;
+}
+
+void XmlReader::handleElementEnd() {
+ assert(!elements_.empty());
+ auto end = elements_.top().inheritedNamespaces;
+ namespaces_.resize(end);
+ elements_.pop();
+ state_ = elements_.empty() ? State::Done : State::Content;
+}
+
+XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
+ for (;;) {
+ auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
+ if (!i) {
+ throw css::uno::RuntimeException(
+ "premature end of " + fileUrl_ );
+ }
+ pos_ = i + 1;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (!skipComment() && !scanCdataSection().is()) {
+ skipDocumentTypeDeclaration();
+ }
+ break;
+ case '/':
+ ++pos_;
+ return handleEndTag();
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ break;
+ default:
+ return handleStartTag(nsId, data);
+ }
+ }
+}
+
+XmlReader::Result XmlReader::handleRawText(Span * text) {
+ pad_.clear();
+ for (char const * begin = pos_;;) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ "premature end of " + fileUrl_ );
+ case '\x0D':
+ pad_.add(begin, pos_ - begin);
+ ++pos_;
+ if (peek() != '\x0A') {
+ pad_.add("\x0A");
+ }
+ begin = pos_;
+ break;
+ case '&':
+ pad_.add(begin, pos_ - begin);
+ pos_ = handleReference(pos_, end_);
+ begin = pos_;
+ break;
+ case '<':
+ pad_.add(begin, pos_ - begin);
+ ++pos_;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (!skipComment()) {
+ Span cdata(scanCdataSection());
+ if (cdata.is()) {
+ normalizeLineEnds(cdata);
+ } else {
+ skipDocumentTypeDeclaration();
+ }
+ }
+ begin = pos_;
+ break;
+ case '/':
+ *text = pad_.get();
+ ++pos_;
+ state_ = State::EndTag;
+ return Result::Text;
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ begin = pos_;
+ break;
+ default:
+ *text = pad_.get();
+ state_ = State::StartTag;
+ return Result::Text;
+ }
+ break;
+ default:
+ ++pos_;
+ break;
+ }
+ }
+}
+
+XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
+ pad_.clear();
+ char const * flowBegin = pos_;
+ char const * flowEnd = pos_;
+ enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+ // a single true space character can go into the current flow,
+ // everything else breaks the flow
+ Space space = SPACE_START;
+ for (;;) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ "premature end of " + fileUrl_ );
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ switch (space) {
+ case SPACE_START:
+ case SPACE_BREAK:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ space = SPACE_BREAK;
+ break;
+ }
+ ++pos_;
+ break;
+ case ' ':
+ switch (space) {
+ case SPACE_START:
+ case SPACE_BREAK:
+ break;
+ case SPACE_NONE:
+ space = SPACE_SPAN;
+ break;
+ case SPACE_SPAN:
+ space = SPACE_BREAK;
+ break;
+ }
+ ++pos_;
+ break;
+ case '&':
+ switch (space) {
+ case SPACE_START:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ pad_.add(flowBegin, pos_ - flowBegin);
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(" ");
+ break;
+ }
+ pos_ = handleReference(pos_, end_);
+ flowBegin = pos_;
+ flowEnd = pos_;
+ space = SPACE_NONE;
+ break;
+ case '<':
+ ++pos_;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (skipComment()) {
+ space = SPACE_BREAK;
+ } else {
+ Span cdata(scanCdataSection());
+ if (cdata.is()) {
+ // CDATA is not normalized (similar to character
+ // references; it keeps the code simple), but it might
+ // arguably be better to normalize it:
+ switch (space) {
+ case SPACE_START:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ pad_.add(flowBegin, pos_ - flowBegin);
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(" ");
+ break;
+ }
+ normalizeLineEnds(cdata);
+ flowBegin = pos_;
+ flowEnd = pos_;
+ space = SPACE_NONE;
+ } else {
+ skipDocumentTypeDeclaration();
+ }
+ }
+ break;
+ case '/':
+ ++pos_;
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ *text = pad_.get();
+ state_ = State::EndTag;
+ return Result::Text;
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ space = SPACE_BREAK;
+ break;
+ default:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ *text = pad_.get();
+ state_ = State::StartTag;
+ return Result::Text;
+ }
+ break;
+ default:
+ switch (space) {
+ case SPACE_START:
+ flowBegin = pos_;
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(" ");
+ flowBegin = pos_;
+ break;
+ }
+ flowEnd = ++pos_;
+ space = SPACE_NONE;
+ break;
+ }
+ }
+}
+
+int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
+ assert(pos <= INT_MAX);
+ return static_cast< int >(pos);
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */