diff options
Diffstat (limited to 'xmlreader')
-rw-r--r-- | xmlreader/IwyuFilter_xmlreader.yaml | 2 | ||||
-rw-r--r-- | xmlreader/Library_xmlreader.mk | 42 | ||||
-rw-r--r-- | xmlreader/Makefile | 7 | ||||
-rw-r--r-- | xmlreader/Module_xmlreader.mk | 26 | ||||
-rw-r--r-- | xmlreader/README | 6 | ||||
-rw-r--r-- | xmlreader/source/pad.cxx | 78 | ||||
-rw-r--r-- | xmlreader/source/span.cxx | 51 | ||||
-rw-r--r-- | xmlreader/source/xmlreader.cxx | 941 |
8 files changed, 1153 insertions, 0 deletions
diff --git a/xmlreader/IwyuFilter_xmlreader.yaml b/xmlreader/IwyuFilter_xmlreader.yaml new file mode 100644 index 000000000..bb9f26a1e --- /dev/null +++ b/xmlreader/IwyuFilter_xmlreader.yaml @@ -0,0 +1,2 @@ +--- +assumeFilename: xmlreader/source/xmlreader.cxx diff --git a/xmlreader/Library_xmlreader.mk b/xmlreader/Library_xmlreader.mk new file mode 100644 index 000000000..fa4a6711c --- /dev/null +++ b/xmlreader/Library_xmlreader.mk @@ -0,0 +1,42 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This file incorporates work covered by the following license notice: +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to you under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 . +# + +$(eval $(call gb_Library_Library,xmlreader)) + +$(eval $(call gb_Library_add_defs,xmlreader,\ + -DOOO_DLLIMPLEMENTATION_XMLREADER \ +)) + +$(eval $(call gb_Library_use_external,xmlreader,boost_headers)) + +$(eval $(call gb_Library_use_libraries,xmlreader,\ + cppu \ + sal \ +)) + # cppu is only needed due to the cppumaker -C hack + +$(eval $(call gb_Library_add_exception_objects,xmlreader,\ + xmlreader/source/pad \ + xmlreader/source/span \ + xmlreader/source/xmlreader \ +)) + +$(eval $(call gb_Library_use_udk_api,xmlreader)) + +# vim: set noet sw=4 ts=4: diff --git a/xmlreader/Makefile b/xmlreader/Makefile new file mode 100644 index 000000000..ccb1c85a0 --- /dev/null +++ b/xmlreader/Makefile @@ -0,0 +1,7 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- + +module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +include $(module_directory)/../solenv/gbuild/partial_build.mk + +# vim: set noet sw=4 ts=4: diff --git a/xmlreader/Module_xmlreader.mk b/xmlreader/Module_xmlreader.mk new file mode 100644 index 000000000..2d2d91805 --- /dev/null +++ b/xmlreader/Module_xmlreader.mk @@ -0,0 +1,26 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This file incorporates work covered by the following license notice: +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to you under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 . +# + +$(eval $(call gb_Module_Module,xmlreader)) + +$(eval $(call gb_Module_add_targets,xmlreader,\ + Library_xmlreader \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/xmlreader/README b/xmlreader/README new file mode 100644 index 000000000..0c6bae91b --- /dev/null +++ b/xmlreader/README @@ -0,0 +1,6 @@ +fast/small XML pull parser. + +Implements a simple, fast pull parser, currently used by [[configmgr]] and +[[stoc]]'s simpleregistry code (used to register UNO components in +services.rdb files). It supports a subset of XML features, but is fast +and small. diff --git a/xmlreader/source/pad.cxx b/xmlreader/source/pad.cxx new file mode 100644 index 000000000..ce45f805f --- /dev/null +++ b/xmlreader/source/pad.cxx @@ -0,0 +1,78 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <sal/config.h> + +#include <cassert> + +#include <sal/types.h> +#include <xmlreader/pad.hxx> +#include <xmlreader/span.hxx> + +namespace xmlreader { + +void Pad::add(char const * begin, sal_Int32 length) { + assert( + begin != nullptr && length >= 0 && !(span_.is() && buffer_.getLength() != 0)); + if (length != 0) { + flushSpan(); + if (buffer_.isEmpty()) { + span_ = Span(begin, length); + } else { + buffer_.append(begin, length); + } + } +} + +void Pad::addEphemeral(char const * begin, sal_Int32 length) { + assert( + begin != nullptr && length >= 0 && !(span_.is() && buffer_.getLength() != 0)); + if (length != 0) { + flushSpan(); + buffer_.append(begin, length); + } +} + +void Pad::clear() { + assert(!(span_.is() && buffer_.getLength() != 0)); + span_.clear(); + buffer_.setLength(0); +} + +Span Pad::get() const { + assert(!(span_.is() && buffer_.getLength() != 0)); + if (span_.is()) { + return span_; + } else if (buffer_.isEmpty()) { + return Span(""); + } else { + return Span(buffer_.getStr(), buffer_.getLength()); + } +} + +void Pad::flushSpan() { + if (span_.is()) { + buffer_.append(span_.begin, span_.length); + span_.clear(); + } +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/xmlreader/source/span.cxx b/xmlreader/source/span.cxx new file mode 100644 index 000000000..29433c561 --- /dev/null +++ b/xmlreader/source/span.cxx @@ -0,0 +1,51 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <sal/config.h> + +#include <cassert> + +#include <com/sun/star/uno/RuntimeException.hpp> +#include <rtl/textcvt.h> +#include <rtl/textenc.h> +#include <rtl/ustring.h> +#include <rtl/ustring.hxx> +#include <sal/types.h> +#include <xmlreader/span.hxx> + +namespace xmlreader { + +OUString Span::convertFromUtf8() const { + assert(is()); + rtl_uString * s = nullptr; + if (!rtl_convertStringToUString( + &s, begin, length, RTL_TEXTENCODING_UTF8, + (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR | + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR | + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR))) + { + throw css::uno::RuntimeException( + "cannot convert from UTF-8"); + } + return OUString(s, SAL_NO_ACQUIRE); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/xmlreader/source/xmlreader.cxx b/xmlreader/source/xmlreader.cxx new file mode 100644 index 000000000..85027b66e --- /dev/null +++ b/xmlreader/source/xmlreader.cxx @@ -0,0 +1,941 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <sal/config.h> + +#include <cassert> +#include <climits> + +#include <com/sun/star/container/NoSuchElementException.hpp> +#include <com/sun/star/uno/RuntimeException.hpp> +#include <osl/file.h> +#include <rtl/character.hxx> +#include <rtl/string.h> +#include <rtl/ustring.hxx> +#include <sal/log.hxx> +#include <sal/types.h> +#include <xmlreader/pad.hxx> +#include <xmlreader/span.hxx> +#include <xmlreader/xmlreader.hxx> + +namespace xmlreader { + +namespace { + +bool isSpace(char c) { + switch (c) { + case '\x09': + case '\x0A': + case '\x0D': + case ' ': + return true; + default: + return false; + } +} + +} + +XmlReader::XmlReader(OUString const & fileUrl) + : fileUrl_(fileUrl) + , fileHandle_(nullptr) +{ + oslFileError e = osl_openFile( + fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read); + switch (e) + { + case osl_File_E_None: + break; + case osl_File_E_NOENT: + throw css::container::NoSuchElementException( fileUrl_ ); + default: + throw css::uno::RuntimeException( + "cannot open " + fileUrl_ + ": " + OUString::number(e)); + } + e = osl_getFileSize(fileHandle_, &fileSize_); + if (e == osl_File_E_None) { + e = osl_mapFile( + fileHandle_, &fileAddress_, fileSize_, 0, + osl_File_MapFlag_WillNeed); + } + if (e != osl_File_E_None) { + oslFileError e2 = osl_closeFile(fileHandle_); + if (e2 != osl_File_E_None) { + SAL_WARN( + "xmlreader", + "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2); + } + throw css::uno::RuntimeException( + "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" ); + } + namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace"); + namespaces_.emplace_back(Span("xml"), NAMESPACE_XML); + pos_ = static_cast< char * >(fileAddress_); + end_ = pos_ + fileSize_; + state_ = State::Content; + firstAttribute_ = true; +} + +XmlReader::~XmlReader() { + if (!fileHandle_) + return; + oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_); + if (e != osl_File_E_None) { + SAL_WARN( + "xmlreader", + "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e); + } + e = osl_closeFile(fileHandle_); + if (e != osl_File_E_None) { + SAL_WARN( + "xmlreader", + "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); + } +} + +int XmlReader::registerNamespaceIri(Span const & iri) { + int id = toNamespaceId(namespaceIris_.size()); + namespaceIris_.push_back(iri); + if (iri == "http://www.w3.org/2001/XMLSchema-instance") { + // Old user layer .xcu files used the xsi namespace prefix without + // declaring a corresponding namespace binding, see issue 77174; reading + // those files during migration would fail without this hack that can be + // removed once migration is no longer relevant (see + // configmgr::Components::parseModificationLayer): + namespaces_.emplace_back(Span("xsi"), id); + } + return id; +} + +XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) +{ + switch (state_) { + case State::Content: + switch (reportText) { + case Text::NONE: + return handleSkippedText(data, nsId); + case Text::Raw: + return handleRawText(data); + default: // Text::Normalized + return handleNormalizedText(data); + } + case State::StartTag: + return handleStartTag(nsId, data); + case State::EndTag: + return handleEndTag(); + case State::EmptyElementTag: + handleElementEnd(); + return Result::End; + default: // State::Done + return Result::Done; + } +} + +bool XmlReader::nextAttribute(int * nsId, Span * localName) { + assert(nsId != nullptr && localName != nullptr); + if (firstAttribute_) { + currentAttribute_ = attributes_.begin(); + firstAttribute_ = false; + } else { + ++currentAttribute_; + } + if (currentAttribute_ == attributes_.end()) { + return false; + } + if (currentAttribute_->nameColon == nullptr) { + *nsId = NAMESPACE_NONE; + *localName = Span( + currentAttribute_->nameBegin, + currentAttribute_->nameEnd - currentAttribute_->nameBegin); + } else { + *nsId = getNamespaceId( + Span( + currentAttribute_->nameBegin, + currentAttribute_->nameColon - currentAttribute_->nameBegin)); + *localName = Span( + currentAttribute_->nameColon + 1, + currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); + } + return true; +} + +Span XmlReader::getAttributeValue(bool fullyNormalize) { + return handleAttributeValue( + currentAttribute_->valueBegin, currentAttribute_->valueEnd, + fullyNormalize); +} + +int XmlReader::getNamespaceId(Span const & prefix) const { + auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(), + [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; }); + + if (i != namespaces_.rend()) + return i->nsId; + + return NAMESPACE_UNKNOWN; +} + + +void XmlReader::normalizeLineEnds(Span const & text) { + char const * p = text.begin; + sal_Int32 n = text.length; + for (;;) { + sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); + if (i < 0) { + break; + } + pad_.add(p, i); + p += i + 1; + n -= i + 1; + if (n == 0 || *p != '\x0A') { + pad_.add("\x0A"); + } + } + pad_.add(p, n); +} + +void XmlReader::skipSpace() { + while (isSpace(peek())) { + ++pos_; + } +} + +bool XmlReader::skipComment() { + if (rtl_str_shortenedCompare_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), + RTL_CONSTASCII_LENGTH("--")) != + 0) + { + return false; + } + pos_ += RTL_CONSTASCII_LENGTH("--"); + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); + if (i < 0) { + throw css::uno::RuntimeException( + "premature end (within comment) of " + fileUrl_ ); + } + pos_ += i + RTL_CONSTASCII_LENGTH("--"); + if (read() != '>') { + throw css::uno::RuntimeException( + "illegal \"--\" within comment in " + fileUrl_ ); + } + return true; +} + +void XmlReader::skipProcessingInstruction() { + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); + if (i < 0) { + throw css::uno::RuntimeException( + "bad '<?' in " + fileUrl_ ); + } + pos_ += i + RTL_CONSTASCII_LENGTH("?>"); +} + +void XmlReader::skipDocumentTypeDeclaration() { + // Neither is it checked that the doctypedecl is at the correct position in + // the document, nor that it is well-formed: + for (;;) { + char c = read(); + switch (c) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + "premature end (within DTD) of " + fileUrl_ ); + case '"': + case '\'': + { + sal_Int32 i = rtl_str_indexOfChar_WithLength( + pos_, end_ - pos_, c); + if (i < 0) { + throw css::uno::RuntimeException( + "premature end (within DTD) of " + fileUrl_ ); + } + pos_ += i + 1; + } + break; + case '>': + return; + case '[': + for (;;) { + c = read(); + switch (c) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + "premature end (within DTD) of " + fileUrl_ ); + case '"': + case '\'': + { + sal_Int32 i = rtl_str_indexOfChar_WithLength( + pos_, end_ - pos_, c); + if (i < 0) { + throw css::uno::RuntimeException( + "premature end (within DTD) of " + fileUrl_ ); + } + pos_ += i + 1; + } + break; + case '<': + switch (read()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + "premature end (within DTD) of " + fileUrl_ ); + case '!': + skipComment(); + break; + case '?': + skipProcessingInstruction(); + break; + default: + break; + } + break; + case ']': + skipSpace(); + if (read() != '>') { + throw css::uno::RuntimeException( + "missing \">\" of DTD in " + fileUrl_ ); + } + return; + default: + break; + } + } + default: + break; + } + } +} + +Span XmlReader::scanCdataSection() { + if (rtl_str_shortenedCompare_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), + RTL_CONSTASCII_LENGTH("[CDATA[")) != + 0) + { + return Span(); + } + pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); + char const * begin = pos_; + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); + if (i < 0) { + throw css::uno::RuntimeException( + "premature end (within CDATA section) of " + fileUrl_ ); + } + pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); + return Span(begin, i); +} + +bool XmlReader::scanName(char const ** nameColon) { + assert(nameColon != nullptr && *nameColon == nullptr); + for (char const * begin = pos_;; ++pos_) { + switch (peek()) { + case '\0': // i.e., EOF + case '\x09': + case '\x0A': + case '\x0D': + case ' ': + case '/': + case '=': + case '>': + return pos_ != begin; + case ':': + *nameColon = pos_; + break; + default: + break; + } + } +} + +int XmlReader::scanNamespaceIri(char const * begin, char const * end) { + assert(begin != nullptr && begin <= end); + Span iri(handleAttributeValue(begin, end, false)); + for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { + if (namespaceIris_[i] == iri) { + return toNamespaceId(i); + } + } + return XmlReader::NAMESPACE_UNKNOWN; +} + +char const * XmlReader::handleReference(char const * position, char const * end) +{ + assert(position != nullptr && *position == '&' && position < end); + ++position; + if (*position == '#') { + ++position; + sal_uInt32 val = 0; + char const * p; + if (*position == 'x') { + ++position; + p = position; + for (;; ++position) { + char c = *position; + if (c >= '0' && c <= '9') { + val = 16 * val + (c - '0'); + } else if (c >= 'A' && c <= 'F') { + val = 16 * val + (c - 'A') + 10; + } else if (c >= 'a' && c <= 'f') { + val = 16 * val + (c - 'a') + 10; + } else { + break; + } + if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow + throw css::uno::RuntimeException( + "'&#x...' too large in " + fileUrl_ ); + } + } + } else { + p = position; + for (;; ++position) { + char c = *position; + if (c >= '0' && c <= '9') { + val = 10 * val + (c - '0'); + } else { + break; + } + if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow + throw css::uno::RuntimeException( + "'&#...' too large in " + fileUrl_ ); + } + } + } + if (position == p || *position++ != ';') { + throw css::uno::RuntimeException( + "'&#...' missing ';' in " + fileUrl_ ); + } + assert(rtl::isUnicodeCodePoint(val)); + if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || + (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) + { + throw css::uno::RuntimeException( + "character reference denoting invalid character in " + fileUrl_ ); + } + char buf[4]; + sal_Int32 len; + if (val < 0x80) { + buf[0] = static_cast< char >(val); + len = 1; + } else if (val < 0x800) { + buf[0] = static_cast< char >((val >> 6) | 0xC0); + buf[1] = static_cast< char >((val & 0x3F) | 0x80); + len = 2; + } else if (val < 0x10000) { + buf[0] = static_cast< char >((val >> 12) | 0xE0); + buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); + buf[2] = static_cast< char >((val & 0x3F) | 0x80); + len = 3; + } else { + buf[0] = static_cast< char >((val >> 18) | 0xF0); + buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); + buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); + buf[3] = static_cast< char >((val & 0x3F) | 0x80); + len = 4; + } + pad_.addEphemeral(buf, len); + return position; + } else { + struct EntityRef { + char const * inBegin; + sal_Int32 const inLength; + char const * outBegin; + sal_Int32 const outLength; + }; + static EntityRef const refs[] = { + { RTL_CONSTASCII_STRINGPARAM("amp;"), + RTL_CONSTASCII_STRINGPARAM("&") }, + { RTL_CONSTASCII_STRINGPARAM("lt;"), + RTL_CONSTASCII_STRINGPARAM("<") }, + { RTL_CONSTASCII_STRINGPARAM("gt;"), + RTL_CONSTASCII_STRINGPARAM(">") }, + { RTL_CONSTASCII_STRINGPARAM("apos;"), + RTL_CONSTASCII_STRINGPARAM("'") }, + { RTL_CONSTASCII_STRINGPARAM("quot;"), + RTL_CONSTASCII_STRINGPARAM("\"") } }; + for (const auto & ref : refs) { + if (rtl_str_shortenedCompare_WithLength( + position, end - position, ref.inBegin, ref.inLength, + ref.inLength) == + 0) + { + position += ref.inLength; + pad_.add(ref.outBegin, ref.outLength); + return position; + } + } + throw css::uno::RuntimeException( + "unknown entity reference in " + fileUrl_ ); + } +} + +Span XmlReader::handleAttributeValue( + char const * begin, char const * end, bool fullyNormalize) +{ + pad_.clear(); + if (fullyNormalize) { + while (begin != end && isSpace(*begin)) { + ++begin; + } + while (end != begin && isSpace(end[-1])) { + --end; + } + char const * p = begin; + enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; + // a single true space character can go into the current span, + // everything else breaks the span + Space space = SPACE_NONE; + while (p != end) { + switch (*p) { + case '\x09': + case '\x0A': + case '\x0D': + switch (space) { + case SPACE_NONE: + pad_.add(begin, p - begin); + pad_.add(" "); + space = SPACE_BREAK; + break; + case SPACE_SPAN: + pad_.add(begin, p - begin); + space = SPACE_BREAK; + break; + case SPACE_BREAK: + break; + } + begin = ++p; + break; + case ' ': + switch (space) { + case SPACE_NONE: + ++p; + space = SPACE_SPAN; + break; + case SPACE_SPAN: + pad_.add(begin, p - begin); + begin = ++p; + space = SPACE_BREAK; + break; + case SPACE_BREAK: + begin = ++p; + break; + } + break; + case '&': + pad_.add(begin, p - begin); + p = handleReference(p, end); + begin = p; + space = SPACE_NONE; + break; + default: + ++p; + space = SPACE_NONE; + break; + } + } + pad_.add(begin, p - begin); + } else { + char const * p = begin; + while (p != end) { + switch (*p) { + case '\x09': + case '\x0A': + pad_.add(begin, p - begin); + begin = ++p; + pad_.add(" "); + break; + case '\x0D': + pad_.add(begin, p - begin); + ++p; + if (peek() == '\x0A') { + ++p; + } + begin = p; + pad_.add(" "); + break; + case '&': + pad_.add(begin, p - begin); + p = handleReference(p, end); + begin = p; + break; + default: + ++p; + break; + } + } + pad_.add(begin, p - begin); + } + return pad_.get(); +} + +XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { + assert(nsId != nullptr && localName); + char const * nameBegin = pos_; + char const * nameColon = nullptr; + if (!scanName(&nameColon)) { + throw css::uno::RuntimeException( + "bad tag name in " + fileUrl_ ); + } + char const * nameEnd = pos_; + NamespaceList::size_type inheritedNamespaces = namespaces_.size(); + bool hasDefaultNs = false; + int defaultNsId = NAMESPACE_NONE; + attributes_.clear(); + for (;;) { + char const * p = pos_; + skipSpace(); + if (peek() == '/' || peek() == '>') { + break; + } + if (pos_ == p) { + throw css::uno::RuntimeException( + "missing whitespace before attribute in " + fileUrl_ ); + } + char const * attrNameBegin = pos_; + char const * attrNameColon = nullptr; + if (!scanName(&attrNameColon)) { + throw css::uno::RuntimeException( + "bad attribute name in " + fileUrl_ ); + } + char const * attrNameEnd = pos_; + skipSpace(); + if (read() != '=') { + throw css::uno::RuntimeException( + "missing '=' in " + fileUrl_ ); + } + skipSpace(); + char del = read(); + if (del != '\'' && del != '"') { + throw css::uno::RuntimeException( + "bad attribute value in " + fileUrl_ ); + } + char const * valueBegin = pos_; + sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); + if (i < 0) { + throw css::uno::RuntimeException( + "unterminated attribute value in " + fileUrl_ ); + } + char const * valueEnd = pos_ + i; + pos_ += i + 1; + if (attrNameColon == nullptr && + Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns") + { + hasDefaultNs = true; + defaultNsId = scanNamespaceIri(valueBegin, valueEnd); + } else if (attrNameColon != nullptr && + Span(attrNameBegin, attrNameColon - attrNameBegin) == + "xmlns") + { + namespaces_.emplace_back( + Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), + scanNamespaceIri(valueBegin, valueEnd)); + } else { + attributes_.emplace_back( + attrNameBegin, attrNameEnd, attrNameColon, valueBegin, + valueEnd); + } + } + if (!hasDefaultNs && !elements_.empty()) { + defaultNsId = elements_.top().defaultNamespaceId; + } + firstAttribute_ = true; + if (peek() == '/') { + state_ = State::EmptyElementTag; + ++pos_; + } else { + state_ = State::Content; + } + if (peek() != '>') { + throw css::uno::RuntimeException( + "missing '>' in " + fileUrl_ ); + } + ++pos_; + elements_.push( + ElementData( + Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, + defaultNsId)); + if (nameColon == nullptr) { + *nsId = defaultNsId; + *localName = Span(nameBegin, nameEnd - nameBegin); + } else { + *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); + *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); + } + return Result::Begin; +} + +XmlReader::Result XmlReader::handleEndTag() { + if (elements_.empty()) { + throw css::uno::RuntimeException( + "spurious end tag in " + fileUrl_ ); + } + char const * nameBegin = pos_; + char const * nameColon = nullptr; + if (!scanName(&nameColon) || + !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) + { + throw css::uno::RuntimeException( + "tag mismatch in " + fileUrl_ ); + } + handleElementEnd(); + skipSpace(); + if (peek() != '>') { + throw css::uno::RuntimeException( + "missing '>' in " + fileUrl_ ); + } + ++pos_; + return Result::End; +} + +void XmlReader::handleElementEnd() { + assert(!elements_.empty()); + auto end = elements_.top().inheritedNamespaces; + namespaces_.resize(end); + elements_.pop(); + state_ = elements_.empty() ? State::Done : State::Content; +} + +XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { + for (;;) { + sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); + if (i < 0) { + throw css::uno::RuntimeException( + "premature end of " + fileUrl_ ); + } + pos_ += i + 1; + switch (peek()) { + case '!': + ++pos_; + if (!skipComment() && !scanCdataSection().is()) { + skipDocumentTypeDeclaration(); + } + break; + case '/': + ++pos_; + return handleEndTag(); + case '?': + ++pos_; + skipProcessingInstruction(); + break; + default: + return handleStartTag(nsId, data); + } + } +} + +XmlReader::Result XmlReader::handleRawText(Span * text) { + pad_.clear(); + for (char const * begin = pos_;;) { + switch (peek()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + "premature end of " + fileUrl_ ); + case '\x0D': + pad_.add(begin, pos_ - begin); + ++pos_; + if (peek() != '\x0A') { + pad_.add("\x0A"); + } + begin = pos_; + break; + case '&': + pad_.add(begin, pos_ - begin); + pos_ = handleReference(pos_, end_); + begin = pos_; + break; + case '<': + pad_.add(begin, pos_ - begin); + ++pos_; + switch (peek()) { + case '!': + ++pos_; + if (!skipComment()) { + Span cdata(scanCdataSection()); + if (cdata.is()) { + normalizeLineEnds(cdata); + } else { + skipDocumentTypeDeclaration(); + } + } + begin = pos_; + break; + case '/': + *text = pad_.get(); + ++pos_; + state_ = State::EndTag; + return Result::Text; + case '?': + ++pos_; + skipProcessingInstruction(); + begin = pos_; + break; + default: + *text = pad_.get(); + state_ = State::StartTag; + return Result::Text; + } + break; + default: + ++pos_; + break; + } + } +} + +XmlReader::Result XmlReader::handleNormalizedText(Span * text) { + pad_.clear(); + char const * flowBegin = pos_; + char const * flowEnd = pos_; + enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; + // a single true space character can go into the current flow, + // everything else breaks the flow + Space space = SPACE_START; + for (;;) { + switch (peek()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + "premature end of " + fileUrl_ ); + case '\x09': + case '\x0A': + case '\x0D': + switch (space) { + case SPACE_START: + case SPACE_BREAK: + break; + case SPACE_NONE: + case SPACE_SPAN: + space = SPACE_BREAK; + break; + } + ++pos_; + break; + case ' ': + switch (space) { + case SPACE_START: + case SPACE_BREAK: + break; + case SPACE_NONE: + space = SPACE_SPAN; + break; + case SPACE_SPAN: + space = SPACE_BREAK; + break; + } + ++pos_; + break; + case '&': + switch (space) { + case SPACE_START: + break; + case SPACE_NONE: + case SPACE_SPAN: + pad_.add(flowBegin, pos_ - flowBegin); + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(" "); + break; + } + pos_ = handleReference(pos_, end_); + flowBegin = pos_; + flowEnd = pos_; + space = SPACE_NONE; + break; + case '<': + ++pos_; + switch (peek()) { + case '!': + ++pos_; + if (skipComment()) { + space = SPACE_BREAK; + } else { + Span cdata(scanCdataSection()); + if (cdata.is()) { + // CDATA is not normalized (similar to character + // references; it keeps the code simple), but it might + // arguably be better to normalize it: + switch (space) { + case SPACE_START: + break; + case SPACE_NONE: + case SPACE_SPAN: + pad_.add(flowBegin, pos_ - flowBegin); + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(" "); + break; + } + normalizeLineEnds(cdata); + flowBegin = pos_; + flowEnd = pos_; + space = SPACE_NONE; + } else { + skipDocumentTypeDeclaration(); + } + } + break; + case '/': + ++pos_; + pad_.add(flowBegin, flowEnd - flowBegin); + *text = pad_.get(); + state_ = State::EndTag; + return Result::Text; + case '?': + ++pos_; + skipProcessingInstruction(); + space = SPACE_BREAK; + break; + default: + pad_.add(flowBegin, flowEnd - flowBegin); + *text = pad_.get(); + state_ = State::StartTag; + return Result::Text; + } + break; + default: + switch (space) { + case SPACE_START: + flowBegin = pos_; + break; + case SPACE_NONE: + case SPACE_SPAN: + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(" "); + flowBegin = pos_; + break; + } + flowEnd = ++pos_; + space = SPACE_NONE; + break; + } + } +} + +int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { + assert(pos <= INT_MAX); + return static_cast< int >(pos); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |