8 files changed, 1155 insertions, 0 deletions
diff --git a/xmlreader/IwyuFilter_xmlreader.yaml b/xmlreader/IwyuFilter_xmlreader.yaml
new file mode 100644
index 000000000..bb9f26a1e
--- /dev/null
+++ b/xmlreader/IwyuFilter_xmlreader.yaml
@@ -0,0 +1,2 @@
+---
+assumeFilename: xmlreader/source/xmlreader.cxx
diff --git a/xmlreader/Library_xmlreader.mk b/xmlreader/Library_xmlreader.mk
new file mode 100644
index 000000000..f99d58474
--- /dev/null
+++ b/xmlreader/Library_xmlreader.mk
@@ -0,0 +1,44 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This file incorporates work covered by the following license notice:
+#
+#   Licensed to the Apache Software Foundation (ASF) under one or more
+#   contributor license agreements. See the NOTICE file distributed
+#   with this work for additional information regarding copyright
+#   ownership. The ASF licenses this file to you under the Apache
+#   License, Version 2.0 (the "License"); you may not use this file
+#   except in compliance with the License. You may obtain a copy of
+#   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+#
+
+# Declare the xmlreader library target.
+$(eval $(call gb_Library_Library,xmlreader))
+
+# Preprocessor define added for the objects of this library.
+$(eval $(call gb_Library_add_defs,xmlreader,\
+	-DOOO_DLLIMPLEMENTATION_XMLREADER \
+))
+
+$(eval $(call gb_Library_set_is_ure_library_or_dependency,xmlreader))
+
+$(eval $(call gb_Library_use_external,xmlreader,boost_headers))
+
+# cppu is only needed due to the cppumaker -C hack
+$(eval $(call gb_Library_use_libraries,xmlreader,\
+	cppu \
+	sal \
+))
+
+# Source files, given relative to the repository root and without extension.
+$(eval $(call gb_Library_add_exception_objects,xmlreader,\
+	xmlreader/source/pad \
+	xmlreader/source/span \
+	xmlreader/source/xmlreader \
+))
+
+$(eval $(call gb_Library_use_udk_api,xmlreader))
+
+# vim: set noet sw=4 ts=4:
diff --git a/xmlreader/Makefile b/xmlreader/Makefile
new file mode 100644
index 000000000..ccb1c85a0
--- /dev/null
+++ b/xmlreader/Makefile
@@ -0,0 +1,7 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+
+# Directory of this makefile: the first entry of MAKEFILE_LIST, resolved with
+# realpath and reduced to its directory part.
+module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST))))
+
+# Pull in partial_build.mk from solenv/gbuild, one level above this module.
+include $(module_directory)/../solenv/gbuild/partial_build.mk
+
+# vim: set noet sw=4 ts=4:
diff --git a/xmlreader/Module_xmlreader.mk b/xmlreader/Module_xmlreader.mk
new file mode 100644
index 000000000..2d2d91805
--- /dev/null
+++ b/xmlreader/Module_xmlreader.mk
@@ -0,0 +1,26 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This file incorporates work covered by the following license notice:
+#
+#   Licensed to the Apache Software Foundation (ASF) under one or more
+#   contributor license agreements. See the NOTICE file distributed
+#   with this work for additional information regarding copyright
+#   ownership. The ASF licenses this file to you under the Apache
+#   License, Version 2.0 (the "License"); you may not use this file
+#   except in compliance with the License. You may obtain a copy of
+#   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+#
+
+# Declare the xmlreader module.
+$(eval $(call gb_Module_Module,xmlreader))
+
+# Targets that belong to this module.
+$(eval $(call gb_Module_add_targets,xmlreader,\
+	Library_xmlreader \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/xmlreader/README.md b/xmlreader/README.md
new file mode 100644
index 000000000..8d76e2052
--- /dev/null
+++ b/xmlreader/README.md
@@ -0,0 +1,6 @@
+# Fast and Small XML Pull Parser
+
+Implements a simple, fast pull parser, currently used by `configmgr` and
+`stoc`'s simpleregistry code (used to register UNO components in
+services.rdb files). It supports a subset of XML features, but is fast
+and small.
diff --git a/xmlreader/source/pad.cxx b/xmlreader/source/pad.cxx
new file mode 100644
index 000000000..ce45f805f
--- /dev/null
+++ b/xmlreader/source/pad.cxx
@@ -0,0 +1,78 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+
+#include <sal/types.h>
+#include <xmlreader/pad.hxx>
+#include <xmlreader/span.hxx>
+
+namespace xmlreader {
+
+// Adds the character range [begin, begin + length) to the pad.  As long as
+// the pad holds nothing else, the range is only remembered as a span (no
+// copy is made); otherwise it is appended to the buffer.  Empty ranges are
+// ignored.
+void Pad::add(char const * begin, sal_Int32 length) {
+    assert(begin != nullptr);
+    assert(length >= 0);
+    assert(!span_.is() || buffer_.getLength() == 0);
+    if (length == 0) {
+        return;
+    }
+    // A previously remembered span has to move into the buffer first.
+    flushSpan();
+    if (!buffer_.isEmpty()) {
+        buffer_.append(begin, length);
+        return;
+    }
+    span_ = Span(begin, length);
+}
+
+// Appends a copy of the character range [begin, begin + length) to the
+// buffer, after moving any remembered span into the buffer.  Empty ranges
+// are ignored.
+void Pad::addEphemeral(char const * begin, sal_Int32 length) {
+    assert(begin != nullptr);
+    assert(length >= 0);
+    assert(!span_.is() || buffer_.getLength() == 0);
+    if (length == 0) {
+        return;
+    }
+    flushSpan();
+    buffer_.append(begin, length);
+}
+
+// Empties the pad: truncates the buffer to length zero and drops the span.
+void Pad::clear() {
+    assert(!span_.is() || buffer_.isEmpty());
+    buffer_.setLength(0);
+    span_.clear();
+}
+
+// Returns the pad's content: the remembered span if there is one, else a
+// span over the buffer, else a span over the empty string literal.
+Span Pad::get() const {
+    assert(!span_.is() || buffer_.getLength() == 0);
+    if (span_.is()) {
+        return span_;
+    }
+    if (!buffer_.isEmpty()) {
+        return Span(buffer_.getStr(), buffer_.getLength());
+    }
+    return Span("");
+}
+
+// Copies a remembered span, if any, to the end of the buffer and then
+// forgets the span.
+void Pad::flushSpan() {
+    if (!span_.is()) {
+        return;
+    }
+    buffer_.append(span_.begin, span_.length);
+    span_.clear();
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/xmlreader/source/span.cxx b/xmlreader/source/span.cxx
new file mode 100644
index 000000000..29433c561
--- /dev/null
+++ b/xmlreader/source/span.cxx
@@ -0,0 +1,51 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+
+#include <com/sun/star/uno/RuntimeException.hpp>
+#include <rtl/textcvt.h>
+#include <rtl/textenc.h>
+#include <rtl/ustring.h>
+#include <rtl/ustring.hxx>
+#include <sal/types.h>
+#include <xmlreader/span.hxx>
+
+namespace xmlreader {
+
+OUString Span::convertFromUtf8() const {
+    assert(is());
+    rtl_uString * s = nullptr;
+    if (!rtl_convertStringToUString(
+            &s, begin, length, RTL_TEXTENCODING_UTF8,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR |
+             RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR |
+             RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR)))
+    {
+        throw css::uno::RuntimeException(
+            "cannot convert from UTF-8");
+    }
+    return OUString(s, SAL_NO_ACQUIRE);
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/xmlreader/source/xmlreader.cxx b/xmlreader/source/xmlreader.cxx
new file mode 100644
index 000000000..5153db2fb
--- /dev/null
+++ b/xmlreader/source/xmlreader.cxx
@@ -0,0 +1,941 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+#include <climits>
+
+#include <com/sun/star/container/NoSuchElementException.hpp>
+#include <com/sun/star/uno/RuntimeException.hpp>
+#include <osl/file.h>
+#include <rtl/character.hxx>
+#include <rtl/string.h>
+#include <rtl/ustring.hxx>
+#include <sal/log.hxx>
+#include <sal/types.h>
+#include <xmlreader/pad.hxx>
+#include <xmlreader/span.hxx>
+#include <xmlreader/xmlreader.hxx>
+
+namespace xmlreader {
+
+namespace {
+
+// Returns true iff c is a tab, line feed, carriage return, or space.
+bool isSpace(char c) {
+    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
+}
+
+}
+
+// Opens the file at fileUrl for reading, maps it into memory and sets up the
+// initial parser state (including the predefined "xml" namespace prefix).
+//
+// Throws css::container::NoSuchElementException if the file does not exist
+// and css::uno::RuntimeException if it cannot be opened or mapped.
+XmlReader::XmlReader(OUString const & fileUrl)
+    : fileUrl_(fileUrl)
+    , fileHandle_(nullptr)
+{
+    oslFileError rc = osl_openFile(
+        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
+    if (rc == osl_File_E_NOENT) {
+        throw css::container::NoSuchElementException( fileUrl_ );
+    }
+    if (rc != osl_File_E_None) {
+        throw css::uno::RuntimeException(
+            "cannot open " + fileUrl_ + ": " + OUString::number(rc));
+    }
+    rc = osl_getFileSize(fileHandle_, &fileSize_);
+    if (rc == osl_File_E_None) {
+        rc = osl_mapFile(
+            fileHandle_, &fileAddress_, fileSize_, 0,
+            osl_File_MapFlag_WillNeed);
+    }
+    if (rc != osl_File_E_None) {
+        // Close the handle again before reporting the failure; a failing
+        // close is only logged.
+        oslFileError const closeRc = osl_closeFile(fileHandle_);
+        if (closeRc != osl_File_E_None) {
+            SAL_WARN(
+                "xmlreader",
+                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +closeRc);
+        }
+        throw css::uno::RuntimeException(
+            "cannot mmap " + fileUrl_ + " (" + OUString::number(rc) + ")" );
+    }
+    namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
+    namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
+    pos_ = static_cast< char * >(fileAddress_);
+    end_ = pos_ + fileSize_;
+    state_ = State::Content;
+    firstAttribute_ = true;
+}
+
+// Unmaps and closes the file if a handle is held.  Failures are only logged.
+XmlReader::~XmlReader() {
+    if (fileHandle_ != nullptr) {
+        oslFileError rc =
+            osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
+        if (rc != osl_File_E_None) {
+            SAL_WARN(
+                "xmlreader",
+                "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +rc);
+        }
+        rc = osl_closeFile(fileHandle_);
+        if (rc != osl_File_E_None) {
+            SAL_WARN(
+                "xmlreader",
+                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +rc);
+        }
+    }
+}
+
+// Appends iri to the list of known namespace IRIs and returns the namespace
+// id derived from its position in that list.
+int XmlReader::registerNamespaceIri(Span const & iri) {
+    int const newId = toNamespaceId(namespaceIris_.size());
+    namespaceIris_.push_back(iri);
+    bool const isXsi = iri == "http://www.w3.org/2001/XMLSchema-instance";
+    if (isXsi) {
+        // Old user layer .xcu files used the xsi namespace prefix without
+        // declaring a corresponding namespace binding, see issue 77174; reading
+        // those files during migration would fail without this hack that can be
+        // removed once migration is no longer relevant (see
+        // configmgr::Components::parseModificationLayer):
+        namespaces_.emplace_back(Span("xsi"), newId);
+    }
+    return newId;
+}
+
+// Advances to the next item, dispatching on the current parser state.  In
+// content state, reportText selects whether text is skipped, reported raw,
+// or reported normalized.
+XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
+{
+    if (state_ == State::Content) {
+        if (reportText == Text::NONE) {
+            return handleSkippedText(data, nsId);
+        }
+        if (reportText == Text::Raw) {
+            return handleRawText(data);
+        }
+        return handleNormalizedText(data); // Text::Normalized
+    }
+    if (state_ == State::StartTag) {
+        return handleStartTag(nsId, data);
+    }
+    if (state_ == State::EndTag) {
+        return handleEndTag();
+    }
+    if (state_ == State::EmptyElementTag) {
+        handleElementEnd();
+        return Result::End;
+    }
+    return Result::Done; // State::Done
+}
+
+// Moves to the next attribute of the current start tag (the first one on the
+// first call) and reports its namespace id and local name.  Returns false
+// once all attributes have been visited.
+bool XmlReader::nextAttribute(int * nsId, Span * localName) {
+    assert(nsId != nullptr && localName != nullptr);
+    if (!firstAttribute_) {
+        ++currentAttribute_;
+    } else {
+        firstAttribute_ = false;
+        currentAttribute_ = attributes_.begin();
+    }
+    if (currentAttribute_ == attributes_.end()) {
+        return false;
+    }
+    auto const & attr = *currentAttribute_;
+    if (attr.nameColon != nullptr) {
+        // Prefixed name: resolve the part before the colon.
+        *nsId = getNamespaceId(
+            Span(attr.nameBegin, attr.nameColon - attr.nameBegin));
+        *localName = Span(
+            attr.nameColon + 1, attr.nameEnd - (attr.nameColon + 1));
+    } else {
+        *nsId = NAMESPACE_NONE;
+        *localName = Span(attr.nameBegin, attr.nameEnd - attr.nameBegin);
+    }
+    return true;
+}
+
+// Returns the normalized value of the attribute currently selected by
+// nextAttribute.
+Span XmlReader::getAttributeValue(bool fullyNormalize) {
+    auto const & attr = *currentAttribute_;
+    return handleAttributeValue(
+        attr.valueBegin, attr.valueEnd, fullyNormalize);
+}
+
+// Looks up prefix among the namespace bindings, most recently added first,
+// and returns the bound namespace id, or NAMESPACE_UNKNOWN if there is none.
+int XmlReader::getNamespaceId(Span const & prefix) const {
+    for (auto it = namespaces_.crbegin(); it != namespaces_.crend(); ++it) {
+        if (prefix == it->prefix) {
+            return it->nsId;
+        }
+    }
+    return NAMESPACE_UNKNOWN;
+}
+
+
+// Adds text to pad_ with line ends normalized: a carriage return that is
+// followed by a line feed is dropped, any other carriage return is replaced
+// by a line feed.
+void XmlReader::normalizeLineEnds(Span const & text) {
+    char const * cur = text.begin;
+    sal_Int32 remaining = text.length;
+    sal_Int32 cr = rtl_str_indexOfChar_WithLength(cur, remaining, '\x0D');
+    while (cr >= 0) {
+        pad_.add(cur, cr);
+        cur += cr + 1;
+        remaining -= cr + 1;
+        if (remaining == 0 || *cur != '\x0A') {
+            pad_.add("\x0A");
+        }
+        cr = rtl_str_indexOfChar_WithLength(cur, remaining, '\x0D');
+    }
+    pad_.add(cur, remaining);
+}
+
+// Advances pos_ past any whitespace characters.
+void XmlReader::skipSpace() {
+    for (; isSpace(peek()); ++pos_) {
+    }
+}
+
+// If the input at pos_ starts with "--", skips up to and including the
+// closing "-->" and returns true; otherwise leaves pos_ alone and returns
+// false.  Throws css::uno::RuntimeException if the comment is unterminated
+// or contains "--" that is not followed by '>'.
+bool XmlReader::skipComment() {
+    bool const startsWithDashes =
+        rtl_str_shortenedCompare_WithLength(
+            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
+            RTL_CONSTASCII_LENGTH("--")) ==
+        0;
+    if (!startsWithDashes) {
+        return false;
+    }
+    pos_ += RTL_CONSTASCII_LENGTH("--");
+    sal_Int32 const closing = rtl_str_indexOfStr_WithLength(
+        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
+    if (closing < 0) {
+        throw css::uno::RuntimeException(
+            "premature end (within comment) of " + fileUrl_ );
+    }
+    pos_ += closing + RTL_CONSTASCII_LENGTH("--");
+    if (read() == '>') {
+        return true;
+    }
+    throw css::uno::RuntimeException(
+        "illegal \"--\" within comment in " + fileUrl_ );
+}
+
+// Advances pos_ just past the next "?>".  Throws css::uno::RuntimeException
+// if there is none in the remaining input.
+void XmlReader::skipProcessingInstruction() {
+    sal_Int32 const closing = rtl_str_indexOfStr_WithLength(
+        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
+    if (closing >= 0) {
+        pos_ += closing + RTL_CONSTASCII_LENGTH("?>");
+        return;
+    }
+    throw css::uno::RuntimeException(
+        "bad '<?' in " + fileUrl_ );
+}
+
+// Skips a document type declaration, up to and including its closing '>'.
+// Quoted literals are skipped as a whole, and a bracketed internal subset
+// ("[...]") is scanned separately so that comments, processing instructions
+// and literals inside it are skipped too.  Throws css::uno::RuntimeException
+// if the input ends first or the ']' is not followed by '>'.
+void XmlReader::skipDocumentTypeDeclaration() {
+    // Neither is it checked that the doctypedecl is at the correct position in
+    // the document, nor that it is well-formed:
+    for (;;) {
+        char c = read();
+        switch (c) {
+        case '\0': // i.e., EOF
+            throw css::uno::RuntimeException(
+                "premature end (within DTD) of " + fileUrl_ );
+        case '"':
+        case '\'':
+            {
+                // Jump behind the matching closing quote.
+                sal_Int32 i = rtl_str_indexOfChar_WithLength(
+                    pos_, end_ - pos_, c);
+                if (i < 0) {
+                    throw css::uno::RuntimeException(
+                        "premature end (within DTD) of " + fileUrl_ );
+                }
+                pos_ += i + 1;
+            }
+            break;
+        case '>':
+            return;
+        case '[':
+            // Internal subset: scan until the closing ']'.
+            for (;;) {
+                c = read();
+                switch (c) {
+                case '\0': // i.e., EOF
+                    throw css::uno::RuntimeException(
+                        "premature end (within DTD) of " + fileUrl_ );
+                case '"':
+                case '\'':
+                    {
+                        // Jump behind the matching closing quote.
+                        sal_Int32 i = rtl_str_indexOfChar_WithLength(
+                            pos_, end_ - pos_, c);
+                        if (i < 0) {
+                            throw css::uno::RuntimeException(
+                                "premature end (within DTD) of " + fileUrl_ );
+                        }
+                        pos_ += i + 1;
+                    }
+                    break;
+                case '<':
+                    switch (read()) {
+                    case '\0': // i.e., EOF
+                        throw css::uno::RuntimeException(
+                            "premature end (within DTD) of " + fileUrl_ );
+                    case '!':
+                        // The result is ignored: anything after "<!" that is
+                        // not a comment is simply scanned on by the loop.
+                        skipComment();
+                        break;
+                    case '?':
+                        skipProcessingInstruction();
+                        break;
+                    default:
+                        break;
+                    }
+                    break;
+                case ']':
+                    // Only whitespace may separate ']' from the final '>'.
+                    skipSpace();
+                    if (read() != '>') {
+                        throw css::uno::RuntimeException(
+                            "missing \">\" of DTD in " + fileUrl_ );
+                    }
+                    return;
+                default:
+                    break;
+                }
+            }
+        default:
+            break;
+        }
+    }
+}
+
+// If the input at pos_ starts with "[CDATA[", returns the section's content
+// as a Span and moves pos_ behind the closing "]]>"; otherwise returns a
+// default-constructed Span and leaves pos_ alone.  Throws
+// css::uno::RuntimeException if the section is unterminated.
+Span XmlReader::scanCdataSection() {
+    bool const isCdata =
+        rtl_str_shortenedCompare_WithLength(
+            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
+            RTL_CONSTASCII_LENGTH("[CDATA[")) ==
+        0;
+    if (!isCdata) {
+        return Span();
+    }
+    pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
+    char const * const contentBegin = pos_;
+    sal_Int32 const contentLength = rtl_str_indexOfStr_WithLength(
+        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
+    if (contentLength < 0) {
+        throw css::uno::RuntimeException(
+            "premature end (within CDATA section) of " + fileUrl_ );
+    }
+    pos_ += contentLength + RTL_CONSTASCII_LENGTH("]]>");
+    return Span(contentBegin, contentLength);
+}
+
+// Advances pos_ over a name, stopping at EOF, whitespace, '/', '=' or '>'.
+// The position of the last ':' seen is stored in *nameColon.  Returns
+// whether at least one character was consumed.
+bool XmlReader::scanName(char const ** nameColon) {
+    assert(nameColon != nullptr && *nameColon == nullptr);
+    char const * const start = pos_;
+    for (;;) {
+        char const c = peek();
+        bool const terminator =
+            c == '\0' /* i.e., EOF */ || c == '\x09' || c == '\x0A' ||
+            c == '\x0D' || c == ' ' || c == '/' || c == '=' || c == '>';
+        if (terminator) {
+            return pos_ != start;
+        }
+        if (c == ':') {
+            *nameColon = pos_;
+        }
+        ++pos_;
+    }
+}
+
+// Normalizes the attribute value in [begin, end) and returns the id of the
+// registered namespace IRI equal to it, or NAMESPACE_UNKNOWN if none matches.
+int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
+    assert(begin != nullptr && begin <= end);
+    Span iri(handleAttributeValue(begin, end, false));
+    NamespaceIris::size_type idx = 0;
+    while (idx < namespaceIris_.size()) {
+        if (namespaceIris_[idx] == iri) {
+            return toNamespaceId(idx);
+        }
+        ++idx;
+    }
+    return XmlReader::NAMESPACE_UNKNOWN;
+}
+
+// Expands the reference that starts with the '&' at position and adds its
+// replacement text to pad_.
+//
+// Character references ("&#...;" decimal, "&#x...;" hexadecimal) are encoded
+// as UTF-8; the five entity references amp, lt, gt, apos and quot are
+// replaced by the character they name.  Returns the position just behind the
+// reference.  Throws css::uno::RuntimeException for an overlong, unterminated
+// or invalid character reference and for any other entity reference.
+char const * XmlReader::handleReference(char const * position, char const * end)
+{
+    assert(position != nullptr && *position == '&' && position < end);
+    ++position;
+    // NOTE(review): the character-reference path below dereferences position
+    // without comparing it against end; presumably the input is always
+    // followed by a non-digit character -- confirm.
+    if (*position == '#') {
+        ++position;
+        sal_uInt32 val = 0;
+        char const * p; // start of the digits, to detect an empty digit run
+        if (*position == 'x') {
+            ++position;
+            p = position;
+            for (;; ++position) {
+                char c = *position;
+                if (c >= '0' && c <= '9') {
+                    val = 16 * val + (c - '0');
+                } else if (c >= 'A' && c <= 'F') {
+                    val = 16 * val + (c - 'A') + 10;
+                } else if (c >= 'a' && c <= 'f') {
+                    val = 16 * val + (c - 'a') + 10;
+                } else {
+                    break;
+                }
+                if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
+                    throw css::uno::RuntimeException(
+                        "'&#x...' too large in " + fileUrl_ );
+                }
+            }
+        } else {
+            p = position;
+            for (;; ++position) {
+                char c = *position;
+                if (c >= '0' && c <= '9') {
+                    val = 10 * val + (c - '0');
+                } else {
+                    break;
+                }
+                if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
+                    throw css::uno::RuntimeException(
+                        "'&#...' too large in " + fileUrl_ );
+                }
+            }
+        }
+        // Reject an empty digit run as well as a missing ';'.
+        if (position == p || *position++ != ';') {
+            throw css::uno::RuntimeException(
+                "'&#...' missing ';' in " + fileUrl_ );
+        }
+        assert(rtl::isUnicodeCodePoint(val));
+        // Control characters other than tab/LF/CR, surrogates, and
+        // U+FFFE/U+FFFF are rejected.
+        if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
+            (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
+        {
+            throw css::uno::RuntimeException(
+                "character reference denoting invalid character in " + fileUrl_ );
+        }
+        // Encode val as UTF-8 (one to four bytes).
+        char buf[4];
+        sal_Int32 len;
+        if (val < 0x80) {
+            buf[0] = static_cast< char >(val);
+            len = 1;
+        } else if (val < 0x800) {
+            buf[0] = static_cast< char >((val >> 6) | 0xC0);
+            buf[1] = static_cast< char >((val & 0x3F) | 0x80);
+            len = 2;
+        } else if (val < 0x10000) {
+            buf[0] = static_cast< char >((val >> 12) | 0xE0);
+            buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+            buf[2] = static_cast< char >((val & 0x3F) | 0x80);
+            len = 3;
+        } else {
+            buf[0] = static_cast< char >((val >> 18) | 0xF0);
+            buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
+            buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+            buf[3] = static_cast< char >((val & 0x3F) | 0x80);
+            len = 4;
+        }
+        // buf is a local array, so its content is added via addEphemeral
+        // rather than add.
+        pad_.addEphemeral(buf, len);
+        return position;
+    } else {
+        // Entity reference: each table entry pairs the text expected after
+        // the '&' (including the ';') with its replacement.
+        struct EntityRef {
+            char const * inBegin;
+            sal_Int32 const inLength;
+            char const * outBegin;
+            sal_Int32 const outLength;
+        };
+        static EntityRef const refs[] = {
+            { RTL_CONSTASCII_STRINGPARAM("amp;"),
+              RTL_CONSTASCII_STRINGPARAM("&") },
+            { RTL_CONSTASCII_STRINGPARAM("lt;"),
+              RTL_CONSTASCII_STRINGPARAM("<") },
+            { RTL_CONSTASCII_STRINGPARAM("gt;"),
+              RTL_CONSTASCII_STRINGPARAM(">") },
+            { RTL_CONSTASCII_STRINGPARAM("apos;"),
+              RTL_CONSTASCII_STRINGPARAM("'") },
+            { RTL_CONSTASCII_STRINGPARAM("quot;"),
+              RTL_CONSTASCII_STRINGPARAM("\"") } };
+        for (const auto & ref : refs) {
+            if (rtl_str_shortenedCompare_WithLength(
+                    position, end - position, ref.inBegin, ref.inLength,
+                    ref.inLength) ==
+                0)
+            {
+                position += ref.inLength;
+                pad_.add(ref.outBegin, ref.outLength);
+                return position;
+            }
+        }
+        throw css::uno::RuntimeException(
+            "unknown entity reference in " + fileUrl_ );
+    }
+}
+
+// Normalizes the attribute value in [begin, end) and returns it as the Span
+// obtained from pad_, which is cleared first.
+//
+// With fullyNormalize, leading and trailing whitespace is dropped and every
+// run of literal whitespace inside the value collapses to a single space.
+// Without it, each tab, line feed, carriage return, or carriage return/line
+// feed pair is replaced by exactly one space.  In both modes references
+// ("&...;") are expanded through handleReference, which may throw
+// css::uno::RuntimeException.
+Span XmlReader::handleAttributeValue(
+    char const * begin, char const * end, bool fullyNormalize)
+{
+    pad_.clear();
+    if (fullyNormalize) {
+        while (begin != end && isSpace(*begin)) {
+            ++begin;
+        }
+        while (end != begin && isSpace(end[-1])) {
+            --end;
+        }
+        char const * p = begin;
+        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+            // a single true space character can go into the current span,
+            // everything else breaks the span
+        Space space = SPACE_NONE;
+        while (p != end) {
+            switch (*p) {
+            case '\x09':
+            case '\x0A':
+            case '\x0D':
+                switch (space) {
+                case SPACE_NONE:
+                    pad_.add(begin, p - begin);
+                    pad_.add(" ");
+                    space = SPACE_BREAK;
+                    break;
+                case SPACE_SPAN:
+                    pad_.add(begin, p - begin);
+                    space = SPACE_BREAK;
+                    break;
+                case SPACE_BREAK:
+                    break;
+                }
+                begin = ++p;
+                break;
+            case ' ':
+                switch (space) {
+                case SPACE_NONE:
+                    ++p;
+                    space = SPACE_SPAN;
+                    break;
+                case SPACE_SPAN:
+                    pad_.add(begin, p - begin);
+                    begin = ++p;
+                    space = SPACE_BREAK;
+                    break;
+                case SPACE_BREAK:
+                    begin = ++p;
+                    break;
+                }
+                break;
+            case '&':
+                pad_.add(begin, p - begin);
+                p = handleReference(p, end);
+                begin = p;
+                space = SPACE_NONE;
+                break;
+            default:
+                ++p;
+                space = SPACE_NONE;
+                break;
+            }
+        }
+        pad_.add(begin, p - begin);
+    } else {
+        char const * p = begin;
+        while (p != end) {
+            switch (*p) {
+            case '\x09':
+            case '\x0A':
+                pad_.add(begin, p - begin);
+                begin = ++p;
+                pad_.add(" ");
+                break;
+            case '\x0D':
+                pad_.add(begin, p - begin);
+                ++p;
+                // A line feed directly following the carriage return belongs
+                // to the same line end.  Inspect the value itself (bounded by
+                // end) rather than the reader position pos_, which is
+                // unrelated to p and could move p past end.
+                if (p != end && *p == '\x0A') {
+                    ++p;
+                }
+                begin = p;
+                pad_.add(" ");
+                break;
+            case '&':
+                pad_.add(begin, p - begin);
+                p = handleReference(p, end);
+                begin = p;
+                break;
+            default:
+                ++p;
+                break;
+            }
+        }
+        pad_.add(begin, p - begin);
+    }
+    return pad_.get();
+}
+
+// Parses a start tag or empty-element tag whose name begins at pos_: the
+// element name, all attributes (including namespace declarations) and the
+// closing '>' or "/>".
+//
+// On return *nsId and *localName describe the element, the element has been
+// pushed onto elements_, the ordinary attributes are available in
+// attributes_, and state_ is EmptyElementTag or Content.  Always returns
+// Result::Begin; throws css::uno::RuntimeException on malformed input.
+XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
+    assert(nsId != nullptr && localName);
+    char const * nameBegin = pos_;
+    char const * nameColon = nullptr;
+    if (!scanName(&nameColon)) {
+        throw css::uno::RuntimeException(
+            "bad tag name in " + fileUrl_ );
+    }
+    char const * nameEnd = pos_;
+    // Number of namespace bindings in effect before this element; stored
+    // with the element so that handleElementEnd can shrink namespaces_ back.
+    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
+    bool hasDefaultNs = false;
+    int defaultNsId = NAMESPACE_NONE;
+    attributes_.clear();
+    for (;;) {
+        char const * p = pos_;
+        skipSpace();
+        if (peek() == '/' || peek() == '>') {
+            break;
+        }
+        // An attribute must be separated from what precedes it by whitespace.
+        if (pos_ == p) {
+            throw css::uno::RuntimeException(
+                "missing whitespace before attribute in " + fileUrl_ );
+        }
+        char const * attrNameBegin = pos_;
+        char const * attrNameColon = nullptr;
+        if (!scanName(&attrNameColon)) {
+            throw css::uno::RuntimeException(
+                "bad attribute name in " + fileUrl_ );
+        }
+        char const * attrNameEnd = pos_;
+        skipSpace();
+        if (read() != '=') {
+            throw css::uno::RuntimeException(
+                "missing '=' in " + fileUrl_ );
+        }
+        skipSpace();
+        // The value is delimited by a pair of identical quote characters.
+        char del = read();
+        if (del != '\'' && del != '"') {
+            throw css::uno::RuntimeException(
+                "bad attribute value in " + fileUrl_ );
+        }
+        char const * valueBegin = pos_;
+        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
+        if (i < 0) {
+            throw css::uno::RuntimeException(
+                "unterminated attribute value in " + fileUrl_ );
+        }
+        char const * valueEnd = pos_ + i;
+        pos_ += i + 1;
+        if (attrNameColon == nullptr &&
+            Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
+        {
+            // xmlns="...": default namespace declaration.
+            hasDefaultNs = true;
+            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
+        } else if (attrNameColon != nullptr &&
+                   Span(attrNameBegin, attrNameColon - attrNameBegin) ==
+                       "xmlns")
+        {
+            // xmlns:prefix="...": bind the prefix to the namespace id.
+            namespaces_.emplace_back(
+                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
+                    scanNamespaceIri(valueBegin, valueEnd));
+        } else {
+            // Ordinary attribute: only its name and value positions are
+            // recorded here.
+            attributes_.emplace_back(
+                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
+                    valueEnd);
+        }
+    }
+    // Without a declaration of its own, the element inherits the default
+    // namespace of the enclosing element.
+    if (!hasDefaultNs && !elements_.empty()) {
+        defaultNsId = elements_.top().defaultNamespaceId;
+    }
+    firstAttribute_ = true;
+    if (peek() == '/') {
+        state_ = State::EmptyElementTag;
+        ++pos_;
+    } else {
+        state_ = State::Content;
+    }
+    if (peek() != '>') {
+        throw css::uno::RuntimeException(
+            "missing '>' in " + fileUrl_ );
+    }
+    ++pos_;
+    elements_.push(
+        ElementData(
+            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
+            defaultNsId));
+    // The element name is resolved only now, after all namespace
+    // declarations of this tag have been processed.
+    if (nameColon == nullptr) {
+        *nsId = defaultNsId;
+        *localName = Span(nameBegin, nameEnd - nameBegin);
+    } else {
+        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
+        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
+    }
+    return Result::Begin;
+}
+
+// Parses an end tag whose name begins at pos_, checks that it matches the
+// innermost open element, and closes that element.  Returns Result::End;
+// throws css::uno::RuntimeException on a spurious or mismatched tag or a
+// missing '>'.
+XmlReader::Result XmlReader::handleEndTag() {
+    if (elements_.empty()) {
+        throw css::uno::RuntimeException(
+            "spurious end tag in " + fileUrl_ );
+    }
+    char const * const tagBegin = pos_;
+    char const * colon = nullptr;
+    bool const matches =
+        scanName(&colon) &&
+        elements_.top().name.equals(tagBegin, pos_ - tagBegin);
+    if (!matches) {
+        throw css::uno::RuntimeException(
+            "tag mismatch in " + fileUrl_ );
+    }
+    handleElementEnd();
+    skipSpace();
+    if (peek() != '>') {
+        throw css::uno::RuntimeException(
+            "missing '>' in " + fileUrl_ );
+    }
+    ++pos_;
+    return Result::End;
+}
+
+// Pops the innermost element and resizes namespaces_ to what it inherited.
+void XmlReader::handleElementEnd() {
+    assert(!elements_.empty());
+    namespaces_.resize(elements_.top().inheritedNamespaces);
+    elements_.pop();
+    state_ = !elements_.empty() ? State::Content : State::Done;
+}
+
+// Ignores character data up to the next start or end tag and handles that tag
+// (data and nsId go to handleStartTag); other "<!..." / "<?..." markup is skipped.
+XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
+    for (;;) {
+        void const * const hit = std::memchr(pos_, '<', end_ - pos_);
+        if (hit == nullptr) {
+            throw css::uno::RuntimeException("premature end of " + fileUrl_);
+        }
+        pos_ = static_cast<char const *>(hit) + 1;
+        auto const marker = peek();
+        if (marker == '/') {
+            ++pos_;
+            return handleEndTag();
+        }
+        if (marker == '!') {
+            ++pos_;
+            if (!skipComment() && !scanCdataSection().is()) {
+                skipDocumentTypeDeclaration();
+            }
+        } else if (marker == '?') {
+            ++pos_;
+            skipProcessingInstruction();
+        } else {
+            return handleStartTag(nsId, data);
+        }
+    }
+}
+
+// Gathers element text into pad_ without whitespace normalization.  A CR is
+// dropped when an LF follows and replaced by an LF otherwise; references go
+// through handleReference, CDATA content through normalizeLineEnds; comments,
+// DOCTYPE declarations and PIs are skipped.  Ends at the next tag with *text
+// set from pad_ and state_ set to EndTag or StartTag; throws at EOF.
+XmlReader::Result XmlReader::handleRawText(Span * text) {
+    pad_.clear();
+    char const * runStart = pos_; // start of the literal run not yet copied
+    for (;;) {
+        auto const c = peek();
+        if (c == '\0') { // i.e., EOF
+            throw css::uno::RuntimeException("premature end of " + fileUrl_);
+        }
+        if (c != '\x0D' && c != '&' && c != '<') {
+            ++pos_;
+            continue;
+        }
+        // Copy the literal run in front of the special character first.
+        pad_.add(runStart, pos_ - runStart);
+        if (c == '\x0D') {
+            ++pos_;
+            if (peek() != '\x0A') {
+                pad_.add("\x0A");
+            }
+        } else if (c == '&') {
+            pos_ = handleReference(pos_, end_);
+        } else {
+            ++pos_;
+            switch (peek()) {
+            case '!':
+                ++pos_;
+                if (!skipComment()) {
+                    Span cdata(scanCdataSection());
+                    if (cdata.is()) {
+                        normalizeLineEnds(cdata);
+                    } else {
+                        skipDocumentTypeDeclaration();
+                    }
+                }
+                break;
+            case '/':
+                *text = pad_.get();
+                ++pos_;
+                state_ = State::EndTag;
+                return Result::Text;
+            case '?':
+                ++pos_;
+                skipProcessingInstruction();
+                break;
+            default:
+                *text = pad_.get();
+                state_ = State::StartTag;
+                return Result::Text;
+            }
+        }
+        runStart = pos_;
+    }
+}
+
+// Reads element text with whitespace normalization: leading and trailing
+// white space is dropped, and each inner run of white space is replaced by a
+// single ' ' (one lone ' ' simply stays in place).  Comments and processing
+// instructions count as white space.  Text from character references and
+// CDATA sections is appended unnormalized.  Reading stops at the next tag:
+// *text is set from pad_ and state_ becomes EndTag or StartTag.
+// Throws css::uno::RuntimeException when the input ends before a tag.
+XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
+    pad_.clear();
+    char const * flowBegin = pos_;
+    char const * flowEnd = pos_;
+    char const * markupBegin = pos_; // position of the most recent '<'
+    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+        // a single true space character can go into the current flow,
+        // everything else breaks the flow
+    Space space = SPACE_START;
+    // Emits whatever is pending in front of unnormalized text (a reference
+    // or a CDATA section) whose markup starts at upTo.
+    auto const flushBefore = [&](char const * upTo) {
+        switch (space) {
+        case SPACE_START:
+            break;
+        case SPACE_NONE:
+        case SPACE_SPAN:
+            pad_.add(flowBegin, upTo - flowBegin);
+            break;
+        case SPACE_BREAK:
+            pad_.add(flowBegin, flowEnd - flowBegin);
+            pad_.add(" ");
+            break;
+        }
+    };
+    for (;;) {
+        switch (peek()) {
+        case '\0': // i.e., EOF
+            throw css::uno::RuntimeException("premature end of " + fileUrl_);
+        case '\x09':
+        case '\x0A':
+        case '\x0D':
+            // tab, LF and CR always break a flow that has already started
+            if (space == SPACE_NONE || space == SPACE_SPAN) {
+                space = SPACE_BREAK;
+            }
+            ++pos_;
+            break;
+        case ' ':
+            switch (space) {
+            case SPACE_START:
+            case SPACE_BREAK:
+                break;
+            case SPACE_NONE:
+                space = SPACE_SPAN;
+                break;
+            case SPACE_SPAN:
+                space = SPACE_BREAK;
+                break;
+            }
+            ++pos_;
+            break;
+        case '&':
+            flushBefore(pos_);
+            pos_ = handleReference(pos_, end_);
+            flowBegin = pos_;
+            flowEnd = pos_;
+            space = SPACE_NONE;
+            break;
+        case '<':
+            markupBegin = pos_++;
+            switch (peek()) {
+            case '!':
+                ++pos_;
+                if (skipComment()) {
+                    // Breaks the flow like white space, but must not turn
+                    // into a leading ' ' when no text has been seen yet:
+                    space = space == SPACE_START ? SPACE_START : SPACE_BREAK;
+                } else {
+                    Span cdata(scanCdataSection());
+                    if (cdata.is()) {
+                        // CDATA is not normalized (similar to character
+                        // references; it keeps the code simple), but it might
+                        // arguably be better to normalize it.  Flush only up
+                        // to the '<', as pos_ has already moved beyond it:
+                        flushBefore(markupBegin);
+                        normalizeLineEnds(cdata);
+                        flowBegin = pos_;
+                        flowEnd = pos_;
+                        space = SPACE_NONE;
+                    } else {
+                        skipDocumentTypeDeclaration();
+                    }
+                }
+                break;
+            case '/':
+                ++pos_;
+                pad_.add(flowBegin, flowEnd - flowBegin);
+                *text = pad_.get();
+                state_ = State::EndTag;
+                return Result::Text;
+            case '?':
+                ++pos_;
+                skipProcessingInstruction();
+                // same rule as for comments: no leading ' ' before any text
+                space = space == SPACE_START ? SPACE_START : SPACE_BREAK;
+                break;
+            default:
+                pad_.add(flowBegin, flowEnd - flowBegin);
+                *text = pad_.get();
+                state_ = State::StartTag;
+                return Result::Text;
+            }
+            break;
+        default:
+            switch (space) {
+            case SPACE_START:
+                flowBegin = pos_;
+                break;
+            case SPACE_NONE:
+            case SPACE_SPAN:
+                break;
+            case SPACE_BREAK:
+                pad_.add(flowBegin, flowEnd - flowBegin);
+                pad_.add(" ");
+                flowBegin = pos_;
+                break;
+            }
+            flowEnd = ++pos_;
+            space = SPACE_NONE;
+            break;
+        }
+    }
+}
+
+int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
+    assert(pos <= static_cast< NamespaceIris::size_type >(INT_MAX));
+    return static_cast< int >(pos); // the id is the index itself, as an int
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */