/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include <sal/config.h> #include <cassert> #include <climits> #include <com/sun/star/container/NoSuchElementException.hpp> #include <com/sun/star/uno/RuntimeException.hpp> #include <osl/file.h> #include <rtl/character.hxx> #include <rtl/string.h> #include <rtl/ustring.hxx> #include <sal/log.hxx> #include <sal/types.h> #include <xmlreader/pad.hxx> #include <xmlreader/span.hxx> #include <xmlreader/xmlreader.hxx> namespace xmlreader { namespace { bool isSpace(char c) { switch (c) { case '\x09': case '\x0A': case '\x0D': case ' ': return true; default: return false; } } } XmlReader::XmlReader(OUString const & fileUrl) : fileUrl_(fileUrl) , fileHandle_(nullptr) { oslFileError e = osl_openFile( fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read); switch (e) { case osl_File_E_None: break; case osl_File_E_NOENT: throw css::container::NoSuchElementException( fileUrl_ ); default: throw css::uno::RuntimeException( "cannot open " + fileUrl_ + ": " + OUString::number(e)); } e = osl_getFileSize(fileHandle_, &fileSize_); if (e == osl_File_E_None) { e = osl_mapFile( fileHandle_, &fileAddress_, fileSize_, 0, osl_File_MapFlag_WillNeed); } if (e != osl_File_E_None) { oslFileError e2 = osl_closeFile(fileHandle_); if (e2 != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2); } throw css::uno::RuntimeException( "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" ); } namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace"); namespaces_.emplace_back(Span("xml"), NAMESPACE_XML); pos_ = static_cast< char * >(fileAddress_); end_ = pos_ + fileSize_; state_ = State::Content; firstAttribute_ = true; } XmlReader::~XmlReader() { if (!fileHandle_) return; oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_); if (e != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e); } e = osl_closeFile(fileHandle_); if (e != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); } } int XmlReader::registerNamespaceIri(Span const & iri) { int id = toNamespaceId(namespaceIris_.size()); namespaceIris_.push_back(iri); if (iri == "http://www.w3.org/2001/XMLSchema-instance") { // Old user layer .xcu files used the xsi namespace prefix without // declaring a corresponding namespace binding, see issue 77174; reading // those files during migration would fail without this hack that can be // removed once migration is no longer relevant (see // configmgr::Components::parseModificationLayer): namespaces_.emplace_back(Span("xsi"), id); } return id; } XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) { switch (state_) { case State::Content: switch (reportText) { case Text::NONE: return handleSkippedText(data, nsId); case Text::Raw: return handleRawText(data); default: // Text::Normalized return handleNormalizedText(data); } case State::StartTag: return handleStartTag(nsId, data); case State::EndTag: return handleEndTag(); case State::EmptyElementTag: handleElementEnd(); return Result::End; default: // State::Done return Result::Done; } } bool XmlReader::nextAttribute(int * nsId, Span * localName) { assert(nsId != nullptr && localName != nullptr); if (firstAttribute_) { currentAttribute_ = attributes_.begin(); firstAttribute_ = false; } else { ++currentAttribute_; } if (currentAttribute_ == attributes_.end()) { return false; } if (currentAttribute_->nameColon == nullptr) { *nsId = NAMESPACE_NONE; *localName = Span( currentAttribute_->nameBegin, currentAttribute_->nameEnd - currentAttribute_->nameBegin); } else { *nsId = getNamespaceId( Span( currentAttribute_->nameBegin, currentAttribute_->nameColon - currentAttribute_->nameBegin)); *localName = Span( currentAttribute_->nameColon + 1, currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); } return true; } Span XmlReader::getAttributeValue(bool fullyNormalize) { return handleAttributeValue( currentAttribute_->valueBegin, currentAttribute_->valueEnd, fullyNormalize); } int XmlReader::getNamespaceId(Span const & prefix) const { auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(), [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; }); if (i != namespaces_.rend()) return i->nsId; return NAMESPACE_UNKNOWN; } void XmlReader::normalizeLineEnds(Span const & text) { char const * p = text.begin; sal_Int32 n = text.length; for (;;) { sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); if (i < 0) { break; } pad_.add(p, i); p += i + 1; n -= i + 1; if (n == 0 || *p != '\x0A') { pad_.add("\x0A"); } } pad_.add(p, n); } void XmlReader::skipSpace() { while (isSpace(peek())) { ++pos_; } } bool XmlReader::skipComment() { if (rtl_str_shortenedCompare_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), RTL_CONSTASCII_LENGTH("--")) != 0) { return false; } pos_ += RTL_CONSTASCII_LENGTH("--"); sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); if (i < 0) { throw css::uno::RuntimeException( "premature end (within comment) of " + fileUrl_ ); } pos_ += i + RTL_CONSTASCII_LENGTH("--"); if (read() != '>') { throw css::uno::RuntimeException( "illegal \"--\" within comment in " + fileUrl_ ); } return true; } void XmlReader::skipProcessingInstruction() { sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); if (i < 0) { throw css::uno::RuntimeException( "bad '<?' in " + fileUrl_ ); } pos_ += i + RTL_CONSTASCII_LENGTH("?>"); } void XmlReader::skipDocumentTypeDeclaration() { // Neither is it checked that the doctypedecl is at the correct position in // the document, nor that it is well-formed: for (;;) { char c = read(); switch (c) { case '\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case '"': case '\'': { sal_Int32 i = rtl_str_indexOfChar_WithLength( pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); } pos_ += i + 1; } break; case '>': return; case '[': for (;;) { c = read(); switch (c) { case '\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case '"': case '\'': { sal_Int32 i = rtl_str_indexOfChar_WithLength( pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); } pos_ += i + 1; } break; case '<': switch (read()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case '!': skipComment(); break; case '?': skipProcessingInstruction(); break; default: break; } break; case ']': skipSpace(); if (read() != '>') { throw css::uno::RuntimeException( "missing \">\" of DTD in " + fileUrl_ ); } return; default: break; } } default: break; } } } Span XmlReader::scanCdataSection() { if (rtl_str_shortenedCompare_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), RTL_CONSTASCII_LENGTH("[CDATA[")) != 0) { return Span(); } pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); char const * begin = pos_; sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); if (i < 0) { throw css::uno::RuntimeException( "premature end (within CDATA section) of " + fileUrl_ ); } pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); return Span(begin, i); } bool XmlReader::scanName(char const ** nameColon) { assert(nameColon != nullptr && *nameColon == nullptr); for (char const * begin = pos_;; ++pos_) { switch (peek()) { case '\0': // i.e., EOF case '\x09': case '\x0A': case '\x0D': case ' ': case '/': case '=': case '>': return pos_ != begin; case ':': *nameColon = pos_; break; default: break; } } } int XmlReader::scanNamespaceIri(char const * begin, char const * end) { assert(begin != nullptr && begin <= end); Span iri(handleAttributeValue(begin, end, false)); for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { if (namespaceIris_[i] == iri) { return toNamespaceId(i); } } return XmlReader::NAMESPACE_UNKNOWN; } char const * XmlReader::handleReference(char const * position, char const * end) { assert(position != nullptr && *position == '&' && position < end); ++position; if (*position == '#') { ++position; sal_uInt32 val = 0; char const * p; if (*position == 'x') { ++position; p = position; for (;; ++position) { char c = *position; if (c >= '0' && c <= '9') { val = 16 * val + (c - '0'); } else if (c >= 'A' && c <= 'F') { val = 16 * val + (c - 'A') + 10; } else if (c >= 'a' && c <= 'f') { val = 16 * val + (c - 'a') + 10; } else { break; } if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow throw css::uno::RuntimeException( "'&#x...' too large in " + fileUrl_ ); } } } else { p = position; for (;; ++position) { char c = *position; if (c >= '0' && c <= '9') { val = 10 * val + (c - '0'); } else { break; } if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow throw css::uno::RuntimeException( "'&#...' too large in " + fileUrl_ ); } } } if (position == p || *position++ != ';') { throw css::uno::RuntimeException( "'&#...' missing ';' in " + fileUrl_ ); } assert(rtl::isUnicodeCodePoint(val)); if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) { throw css::uno::RuntimeException( "character reference denoting invalid character in " + fileUrl_ ); } char buf[4]; sal_Int32 len; if (val < 0x80) { buf[0] = static_cast< char >(val); len = 1; } else if (val < 0x800) { buf[0] = static_cast< char >((val >> 6) | 0xC0); buf[1] = static_cast< char >((val & 0x3F) | 0x80); len = 2; } else if (val < 0x10000) { buf[0] = static_cast< char >((val >> 12) | 0xE0); buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); buf[2] = static_cast< char >((val & 0x3F) | 0x80); len = 3; } else { buf[0] = static_cast< char >((val >> 18) | 0xF0); buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); buf[3] = static_cast< char >((val & 0x3F) | 0x80); len = 4; } pad_.addEphemeral(buf, len); return position; } else { struct EntityRef { char const * inBegin; sal_Int32 const inLength; char const * outBegin; sal_Int32 const outLength; }; static EntityRef const refs[] = { { RTL_CONSTASCII_STRINGPARAM("amp;"), RTL_CONSTASCII_STRINGPARAM("&") }, { RTL_CONSTASCII_STRINGPARAM("lt;"), RTL_CONSTASCII_STRINGPARAM("<") }, { RTL_CONSTASCII_STRINGPARAM("gt;"), RTL_CONSTASCII_STRINGPARAM(">") }, { RTL_CONSTASCII_STRINGPARAM("apos;"), RTL_CONSTASCII_STRINGPARAM("'") }, { RTL_CONSTASCII_STRINGPARAM("quot;"), RTL_CONSTASCII_STRINGPARAM("\"") } }; for (const auto & ref : refs) { if (rtl_str_shortenedCompare_WithLength( position, end - position, ref.inBegin, ref.inLength, ref.inLength) == 0) { position += ref.inLength; pad_.add(ref.outBegin, ref.outLength); return position; } } throw css::uno::RuntimeException( "unknown entity reference in " + fileUrl_ ); } } Span XmlReader::handleAttributeValue( char const * begin, char const * end, bool fullyNormalize) { pad_.clear(); if (fullyNormalize) { while (begin != end && isSpace(*begin)) { ++begin; } while (end != begin && isSpace(end[-1])) { --end; } char const * p = begin; enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; // a single true space character can go into the current span, // everything else breaks the span Space space = SPACE_NONE; while (p != end) { switch (*p) { case '\x09': case '\x0A': case '\x0D': switch (space) { case SPACE_NONE: pad_.add(begin, p - begin); pad_.add(" "); space = SPACE_BREAK; break; case SPACE_SPAN: pad_.add(begin, p - begin); space = SPACE_BREAK; break; case SPACE_BREAK: break; } begin = ++p; break; case ' ': switch (space) { case SPACE_NONE: ++p; space = SPACE_SPAN; break; case SPACE_SPAN: pad_.add(begin, p - begin); begin = ++p; space = SPACE_BREAK; break; case SPACE_BREAK: begin = ++p; break; } break; case '&': pad_.add(begin, p - begin); p = handleReference(p, end); begin = p; space = SPACE_NONE; break; default: ++p; space = SPACE_NONE; break; } } pad_.add(begin, p - begin); } else { char const * p = begin; while (p != end) { switch (*p) { case '\x09': case '\x0A': pad_.add(begin, p - begin); begin = ++p; pad_.add(" "); break; case '\x0D': pad_.add(begin, p - begin); ++p; if (peek() == '\x0A') { ++p; } begin = p; pad_.add(" "); break; case '&': pad_.add(begin, p - begin); p = handleReference(p, end); begin = p; break; default: ++p; break; } } pad_.add(begin, p - begin); } return pad_.get(); } XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { assert(nsId != nullptr && localName); char const * nameBegin = pos_; char const * nameColon = nullptr; if (!scanName(&nameColon)) { throw css::uno::RuntimeException( "bad tag name in " + fileUrl_ ); } char const * nameEnd = pos_; NamespaceList::size_type inheritedNamespaces = namespaces_.size(); bool hasDefaultNs = false; int defaultNsId = NAMESPACE_NONE; attributes_.clear(); for (;;) { char const * p = pos_; skipSpace(); if (peek() == '/' || peek() == '>') { break; } if (pos_ == p) { throw css::uno::RuntimeException( "missing whitespace before attribute in " + fileUrl_ ); } char const * attrNameBegin = pos_; char const * attrNameColon = nullptr; if (!scanName(&attrNameColon)) { throw css::uno::RuntimeException( "bad attribute name in " + fileUrl_ ); } char const * attrNameEnd = pos_; skipSpace(); if (read() != '=') { throw css::uno::RuntimeException( "missing '=' in " + fileUrl_ ); } skipSpace(); char del = read(); if (del != '\'' && del != '"') { throw css::uno::RuntimeException( "bad attribute value in " + fileUrl_ ); } char const * valueBegin = pos_; sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); if (i < 0) { throw css::uno::RuntimeException( "unterminated attribute value in " + fileUrl_ ); } char const * valueEnd = pos_ + i; pos_ += i + 1; if (attrNameColon == nullptr && Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns") { hasDefaultNs = true; defaultNsId = scanNamespaceIri(valueBegin, valueEnd); } else if (attrNameColon != nullptr && Span(attrNameBegin, attrNameColon - attrNameBegin) == "xmlns") { namespaces_.emplace_back( Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), scanNamespaceIri(valueBegin, valueEnd)); } else { attributes_.emplace_back( attrNameBegin, attrNameEnd, attrNameColon, valueBegin, valueEnd); } } if (!hasDefaultNs && !elements_.empty()) { defaultNsId = elements_.top().defaultNamespaceId; } firstAttribute_ = true; if (peek() == '/') { state_ = State::EmptyElementTag; ++pos_; } else { state_ = State::Content; } if (peek() != '>') { throw css::uno::RuntimeException( "missing '>' in " + fileUrl_ ); } ++pos_; elements_.push( ElementData( Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, defaultNsId)); if (nameColon == nullptr) { *nsId = defaultNsId; *localName = Span(nameBegin, nameEnd - nameBegin); } else { *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); } return Result::Begin; } XmlReader::Result XmlReader::handleEndTag() { if (elements_.empty()) { throw css::uno::RuntimeException( "spurious end tag in " + fileUrl_ ); } char const * nameBegin = pos_; char const * nameColon = nullptr; if (!scanName(&nameColon) || !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) { throw css::uno::RuntimeException( "tag mismatch in " + fileUrl_ ); } handleElementEnd(); skipSpace(); if (peek() != '>') { throw css::uno::RuntimeException( "missing '>' in " + fileUrl_ ); } ++pos_; return Result::End; } void XmlReader::handleElementEnd() { assert(!elements_.empty()); auto end = elements_.top().inheritedNamespaces; namespaces_.resize(end); elements_.pop(); state_ = elements_.empty() ? State::Done : State::Content; } XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { for (;;) { auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_)); if (!i) { throw css::uno::RuntimeException( "premature end of " + fileUrl_ ); } pos_ = i + 1; switch (peek()) { case '!': ++pos_; if (!skipComment() && !scanCdataSection().is()) { skipDocumentTypeDeclaration(); } break; case '/': ++pos_; return handleEndTag(); case '?': ++pos_; skipProcessingInstruction(); break; default: return handleStartTag(nsId, data); } } } XmlReader::Result XmlReader::handleRawText(Span * text) { pad_.clear(); for (char const * begin = pos_;;) { switch (peek()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( "premature end of " + fileUrl_ ); case '\x0D': pad_.add(begin, pos_ - begin); ++pos_; if (peek() != '\x0A') { pad_.add("\x0A"); } begin = pos_; break; case '&': pad_.add(begin, pos_ - begin); pos_ = handleReference(pos_, end_); begin = pos_; break; case '<': pad_.add(begin, pos_ - begin); ++pos_; switch (peek()) { case '!': ++pos_; if (!skipComment()) { Span cdata(scanCdataSection()); if (cdata.is()) { normalizeLineEnds(cdata); } else { skipDocumentTypeDeclaration(); } } begin = pos_; break; case '/': *text = pad_.get(); ++pos_; state_ = State::EndTag; return Result::Text; case '?': ++pos_; skipProcessingInstruction(); begin = pos_; break; default: *text = pad_.get(); state_ = State::StartTag; return Result::Text; } break; default: ++pos_; break; } } } XmlReader::Result XmlReader::handleNormalizedText(Span * text) { pad_.clear(); char const * flowBegin = pos_; char const * flowEnd = pos_; enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; // a single true space character can go into the current flow, // everything else breaks the flow Space space = SPACE_START; for (;;) { switch (peek()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( "premature end of " + fileUrl_ ); case '\x09': case '\x0A': case '\x0D': switch (space) { case SPACE_START: case SPACE_BREAK: break; case SPACE_NONE: case SPACE_SPAN: space = SPACE_BREAK; break; } ++pos_; break; case ' ': switch (space) { case SPACE_START: case SPACE_BREAK: break; case SPACE_NONE: space = SPACE_SPAN; break; case SPACE_SPAN: space = SPACE_BREAK; break; } ++pos_; break; case '&': switch (space) { case SPACE_START: break; case SPACE_NONE: case SPACE_SPAN: pad_.add(flowBegin, pos_ - flowBegin); break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(" "); break; } pos_ = handleReference(pos_, end_); flowBegin = pos_; flowEnd = pos_; space = SPACE_NONE; break; case '<': ++pos_; switch (peek()) { case '!': ++pos_; if (skipComment()) { space = SPACE_BREAK; } else { Span cdata(scanCdataSection()); if (cdata.is()) { // CDATA is not normalized (similar to character // references; it keeps the code simple), but it might // arguably be better to normalize it: switch (space) { case SPACE_START: break; case SPACE_NONE: case SPACE_SPAN: pad_.add(flowBegin, pos_ - flowBegin); break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(" "); break; } normalizeLineEnds(cdata); flowBegin = pos_; flowEnd = pos_; space = SPACE_NONE; } else { skipDocumentTypeDeclaration(); } } break; case '/': ++pos_; pad_.add(flowBegin, flowEnd - flowBegin); *text = pad_.get(); state_ = State::EndTag; return Result::Text; case '?': ++pos_; skipProcessingInstruction(); space = SPACE_BREAK; break; default: pad_.add(flowBegin, flowEnd - flowBegin); *text = pad_.get(); state_ = State::StartTag; return Result::Text; } break; default: switch (space) { case SPACE_START: flowBegin = pos_; break; case SPACE_NONE: case SPACE_SPAN: break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(" "); flowBegin = pos_; break; } flowEnd = ++pos_; space = SPACE_NONE; break; } } } int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { assert(pos <= INT_MAX); return static_cast< int >(pos); } } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */