summaryrefslogtreecommitdiffstats
path: root/svl/source/misc/urihelper.cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
commit267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree358c9467650e1d0a1d7227a21dac2e3d08b622b2 /svl/source/misc/urihelper.cxx
parentInitial commit. (diff)
downloadlibreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'svl/source/misc/urihelper.cxx')
-rw-r--r--svl/source/misc/urihelper.cxx884
1 files changed, 884 insertions, 0 deletions
diff --git a/svl/source/misc/urihelper.cxx b/svl/source/misc/urihelper.cxx
new file mode 100644
index 0000000000..6488edb5bb
--- /dev/null
+++ b/svl/source/misc/urihelper.cxx
@@ -0,0 +1,884 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <memory>
+#include <string_view>
+
+#include <sal/config.h>
+
+#include <unicode/idna.h>
+
+#include <svl/urihelper.hxx>
+#include <com/sun/star/ucb/Command.hpp>
+#include <com/sun/star/ucb/IllegalIdentifierException.hpp>
+#include <com/sun/star/ucb/UniversalContentBroker.hpp>
+#include <com/sun/star/ucb/UnsupportedCommandException.hpp>
+#include <com/sun/star/ucb/XCommandEnvironment.hpp>
+#include <com/sun/star/ucb/XCommandProcessor.hpp>
+#include <com/sun/star/ucb/XContent.hpp>
+#include <com/sun/star/ucb/XUniversalContentBroker.hpp>
+#include <com/sun/star/uno/Any.hxx>
+#include <com/sun/star/uno/Exception.hpp>
+#include <com/sun/star/uno/Reference.hxx>
+#include <com/sun/star/uno/RuntimeException.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+#include <com/sun/star/uri/UriReferenceFactory.hpp>
+#include <com/sun/star/uri/XUriReference.hpp>
+#include <com/sun/star/uri/XUriReferenceFactory.hpp>
+#include <comphelper/processfactory.hxx>
+#include <osl/diagnose.h>
+#include <rtl/character.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <rtl/ustring.hxx>
+#include <sal/types.h>
+#include <sal/log.hxx>
+#include <tools/inetmime.hxx>
+#include <unotools/charclass.hxx>
+
+using namespace com::sun::star;
+
+OUString URIHelper::SmartRel2Abs(INetURLObject const & rTheBaseURIRef,
+ OUString const & rTheRelURIRef,
+ Link<OUString *, bool> const & rMaybeFileHdl,
+ bool bCheckFileExists,
+ bool bIgnoreFragment,
+ INetURLObject::EncodeMechanism eEncodeMechanism,
+ INetURLObject::DecodeMechanism eDecodeMechanism,
+ rtl_TextEncoding eCharset,
+ FSysStyle eStyle)
+{
+ // Backwards compatibility:
+ if( rTheRelURIRef.startsWith("#") )
+ return rTheRelURIRef;
+
+ INetURLObject aAbsURIRef;
+ if (rTheBaseURIRef.HasError())
+ aAbsURIRef. SetSmartURL(rTheRelURIRef, eEncodeMechanism, eCharset, eStyle);
+ else
+ {
+ bool bWasAbsolute;
+ aAbsURIRef = rTheBaseURIRef.smartRel2Abs(rTheRelURIRef,
+ bWasAbsolute,
+ bIgnoreFragment,
+ eEncodeMechanism,
+ eCharset,
+ false/*bRelativeNonURIs*/,
+ eStyle);
+ if (bCheckFileExists
+ && !bWasAbsolute
+ && (aAbsURIRef.GetProtocol() == INetProtocol::File))
+ {
+ INetURLObject aNonFileURIRef;
+ aNonFileURIRef.SetSmartURL(rTheRelURIRef,
+ eEncodeMechanism,
+ eCharset,
+ eStyle);
+ if (!aNonFileURIRef.HasError()
+ && aNonFileURIRef.GetProtocol() != INetProtocol::File)
+ {
+ bool bMaybeFile = false;
+ if (rMaybeFileHdl.IsSet())
+ {
+ OUString aFilePath(rTheRelURIRef);
+ bMaybeFile = rMaybeFileHdl.Call(&aFilePath);
+ }
+ if (!bMaybeFile)
+ aAbsURIRef = aNonFileURIRef;
+ }
+ }
+ }
+ return aAbsURIRef.GetMainURL(eDecodeMechanism, eCharset);
+}
+
+namespace { Link<OUString *, bool> gMaybeFileHdl; }
+
+void URIHelper::SetMaybeFileHdl(Link<OUString *, bool> const & rTheMaybeFileHdl)
+{
+ gMaybeFileHdl = rTheMaybeFileHdl;
+}
+
+Link<OUString *, bool> const & URIHelper::GetMaybeFileHdl()
+{
+ return gMaybeFileHdl;
+}
+
+namespace {
+
+bool isAbsoluteHierarchicalUriReference(
+ css::uno::Reference< css::uri::XUriReference > const & uriReference)
+{
+ return uriReference.is() && uriReference->isAbsolute()
+ && !uriReference->hasRelativePath();
+}
+
+// To improve performance, assume that if for any prefix URL of a given
+// hierarchical URL either a UCB content cannot be created, or the UCB content
+// does not support the getCasePreservingURL command, then this will hold for
+// any other prefix URL of the given URL, too:
+enum Result { Success, GeneralFailure, SpecificFailure };
+
+Result normalizePrefix( css::uno::Reference< css::ucb::XUniversalContentBroker > const & broker,
+ OUString const & uri, OUString * normalized)
+{
+ OSL_ASSERT(broker.is() && normalized != nullptr);
+ css::uno::Reference< css::ucb::XContent > content;
+ try {
+ content = broker->queryContent(broker->createContentIdentifier(uri));
+ } catch (css::ucb::IllegalIdentifierException &) {}
+ if (!content.is()) {
+ return GeneralFailure;
+ }
+ try {
+ bool ok =
+ (css::uno::Reference< css::ucb::XCommandProcessor >(
+ content, css::uno::UNO_QUERY_THROW)->execute(
+ css::ucb::Command("getCasePreservingURL",
+ -1, css::uno::Any()),
+ 0,
+ css::uno::Reference< css::ucb::XCommandEnvironment >())
+ >>= *normalized);
+ OSL_ASSERT(ok);
+ } catch (css::uno::RuntimeException &) {
+ throw;
+ } catch (css::ucb::UnsupportedCommandException &) {
+ return GeneralFailure;
+ } catch (css::uno::Exception &) {
+ return SpecificFailure;
+ }
+ return Success;
+}
+
+OUString normalize(
+ css::uno::Reference< css::ucb::XUniversalContentBroker > const & broker,
+ css::uno::Reference< css::uri::XUriReferenceFactory > const & uriFactory,
+ OUString const & uriReference)
+{
+ // normalizePrefix can potentially fail (a typically example being a file
+ // URL that denotes a non-existing resource); in such a case, try to
+ // normalize as long a prefix of the given URL as possible (i.e., normalize
+ // all the existing directories within the path):
+ OUString normalized;
+ sal_Int32 n = uriReference.indexOf('#');
+ normalized = n == -1 ? uriReference : uriReference.copy(0, n);
+ switch (normalizePrefix(broker, normalized, &normalized)) {
+ case Success:
+ return n == -1 ? normalized : normalized + uriReference.subView(n);
+ case GeneralFailure:
+ return uriReference;
+ case SpecificFailure:
+ default:
+ break;
+ }
+ css::uno::Reference< css::uri::XUriReference > ref(
+ uriFactory->parse(uriReference));
+ if (!isAbsoluteHierarchicalUriReference(ref)) {
+ return uriReference;
+ }
+ sal_Int32 count = ref->getPathSegmentCount();
+ if (count < 2) {
+ return uriReference;
+ }
+ OUStringBuffer head(ref->getScheme());
+ head.append(':');
+ if (ref->hasAuthority()) {
+ head.append("//" + ref->getAuthority());
+ }
+ for (sal_Int32 i = count - 1; i > 0; --i) {
+ OUStringBuffer buf(head);
+ for (sal_Int32 j = 0; j < i; ++j) {
+ buf.append('/');
+ buf.append(ref->getPathSegment(j));
+ }
+ normalized = buf.makeStringAndClear();
+ if (normalizePrefix(broker, normalized, &normalized) != SpecificFailure)
+ {
+ buf.append(normalized);
+ css::uno::Reference< css::uri::XUriReference > preRef(
+ uriFactory->parse(normalized));
+ if (!isAbsoluteHierarchicalUriReference(preRef)) {
+ // This could only happen if something is inconsistent:
+ break;
+ }
+ sal_Int32 preCount = preRef->getPathSegmentCount();
+ // normalizePrefix may have added or removed a final slash:
+ if (preCount != i) {
+ if (preCount == i - 1) {
+ buf.append('/');
+ } else if (preCount - 1 == i && !buf.isEmpty()
+ && buf[buf.getLength() - 1] == '/')
+ {
+ buf.setLength(buf.getLength() - 1);
+ } else {
+ // This could only happen if something is inconsistent:
+ break;
+ }
+ }
+ for (sal_Int32 j = i; j < count; ++j) {
+ buf.append('/');
+ buf.append(ref->getPathSegment(j));
+ }
+ if (ref->hasQuery()) {
+ buf.append('?');
+ buf.append(ref->getQuery());
+ }
+ if (ref->hasFragment()) {
+ buf.append('#');
+ buf.append(ref->getFragment());
+ }
+ return buf.makeStringAndClear();
+ }
+ }
+ return uriReference;
+}
+
+}
+
+css::uno::Reference< css::uri::XUriReference >
+URIHelper::normalizedMakeRelative(
+ css::uno::Reference< css::uno::XComponentContext > const & context,
+ OUString const & baseUriReference, OUString const & uriReference)
+{
+ OSL_ASSERT(context.is());
+ css::uno::Reference< css::ucb::XUniversalContentBroker > broker(
+ css::ucb::UniversalContentBroker::create(context));
+ css::uno::Reference< css::uri::XUriReferenceFactory > uriFactory(
+ css::uri::UriReferenceFactory::create(context));
+ return uriFactory->makeRelative(
+ uriFactory->parse(normalize(broker, uriFactory, baseUriReference)),
+ uriFactory->parse(normalize(broker, uriFactory, uriReference)), true,
+ true, false);
+}
+
+OUString URIHelper::simpleNormalizedMakeRelative(
+ OUString const & baseUriReference, OUString const & uriReference)
+{
+ css::uno::Reference< css::uri::XUriReference > rel(
+ URIHelper::normalizedMakeRelative(
+ comphelper::getProcessComponentContext(), baseUriReference,
+ uriReference));
+ return rel.is() ? rel->getUriReference() : uriReference;
+}
+
+
+// FindFirstURLInText
+
+
+namespace {
+
+sal_Int32 nextChar(std::u16string_view rStr, sal_Int32 nPos)
+{
+ return rtl::isHighSurrogate(rStr[nPos])
+ && rStr.size() - nPos >= 2
+ && rtl::isLowSurrogate(rStr[nPos + 1]) ?
+ nPos + 2 : nPos + 1;
+}
+
+bool isBoundary1(CharClass const & rCharClass, OUString const & rStr,
+ sal_Int32 nPos, sal_Int32 nEnd)
+{
+ if (nPos == nEnd)
+ return true;
+ if (rCharClass.isLetterNumeric(rStr, nPos))
+ return false;
+ switch (rStr[nPos])
+ {
+ case '$':
+ case '%':
+ case '&':
+ case '-':
+ case '/':
+ case '@':
+ case '\\':
+ return false;
+ default:
+ return true;
+ }
+}
+
+bool isBoundary2(CharClass const & rCharClass, OUString const & rStr,
+ sal_Int32 nPos, sal_Int32 nEnd)
+{
+ if (nPos == nEnd)
+ return true;
+ if (rCharClass.isLetterNumeric(rStr, nPos))
+ return false;
+ switch (rStr[nPos])
+ {
+ case '!':
+ case '#':
+ case '$':
+ case '%':
+ case '&':
+ case '\'':
+ case '*':
+ case '+':
+ case '-':
+ case '/':
+ case '=':
+ case '?':
+ case '@':
+ case '^':
+ case '_':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ case '~':
+ return false;
+ default:
+ return true;
+ }
+}
+
+// tdf#145381 Added MatchingBracketDepth counter to detect matching closing
+// brackets that are part of the uri
+bool checkWChar(CharClass const & rCharClass, OUString const & rStr,
+ sal_Int32 * pPos, sal_Int32 * pEnd,
+ sal_Int32 * pMatchingBracketDepth = nullptr,
+ bool bBackslash = false, bool bPipe = false)
+{
+ sal_Unicode c = rStr[*pPos];
+ if (rtl::isAscii(c))
+ {
+ static sal_uInt8 const aMap[128]
+ = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 0, 4, 4, 4, 1, // !"#$%&'
+ 5, 6, 1, 1, 1, 4, 1, 4, // ()*+,-./
+ 4, 4, 4, 4, 4, 4, 4, 4, // 01234567
+ 4, 4, 1, 1, 0, 1, 0, 1, // 89:;<=>?
+ 4, 4, 4, 4, 4, 4, 4, 4, // @ABCDEFG
+ 4, 4, 4, 4, 4, 4, 4, 4, // HIJKLMNO
+ 4, 4, 4, 4, 4, 4, 4, 4, // PQRSTUVW
+ 4, 4, 4, 1, 2, 1, 0, 1, // XYZ[\]^_
+ 0, 4, 4, 4, 4, 4, 4, 4, // `abcdefg
+ 4, 4, 4, 4, 4, 4, 4, 4, // hijklmno
+ 4, 4, 4, 4, 4, 4, 4, 4, // pqrstuvw
+ 4, 4, 4, 0, 3, 0, 1, 0 }; // xyz{|}~
+ switch (aMap[c])
+ {
+ default: // not uric
+ return false;
+
+ case 1: // uric
+ ++(*pPos);
+ return true;
+
+ case 2: // "\"
+ if (bBackslash)
+ {
+ *pEnd = ++(*pPos);
+ return true;
+ }
+ else
+ return false;
+
+ case 3: // "|"
+ if (bPipe)
+ {
+ *pEnd = ++(*pPos);
+ return true;
+ }
+ else
+ return false;
+
+ case 4: // alpha, digit, "$", "%", "&", "-", "/", "@" (see
+ // isBoundary1)
+ *pEnd = ++(*pPos);
+ return true;
+
+ case 5: // opening bracket
+ ++(*pPos);
+ if(nullptr != pMatchingBracketDepth)
+ ++(*pMatchingBracketDepth);
+ return true;
+
+ case 6: // closing bracket
+ ++(*pPos);
+ if(nullptr != pMatchingBracketDepth && *pMatchingBracketDepth > 0)
+ {
+ --(*pMatchingBracketDepth);
+ // tdf#145381 When there was an opening bracket, detect this closing bracket
+ // as part of the uri
+ *pEnd = *pPos;
+ }
+ return true;
+
+ }
+ }
+ else if (rCharClass.isLetterNumeric(rStr, *pPos))
+ {
+ *pEnd = *pPos = nextChar(rStr, *pPos);
+ return true;
+ }
+ else
+ return false;
+}
+
+sal_uInt32 scanDomain(OUString const & rStr, sal_Int32 * pPos,
+ sal_Int32 nEnd)
+{
+ sal_Unicode const * pBuffer = rStr.getStr();
+ sal_Unicode const * p = pBuffer + *pPos;
+ sal_uInt32 nLabels = INetURLObject::scanDomain(p, pBuffer + nEnd, false);
+ *pPos = sal::static_int_cast< sal_Int32 >(p - pBuffer);
+ return nLabels;
+}
+
+}
+
+OUString URIHelper::FindFirstURLInText(OUString const & rText,
+ sal_Int32 & rBegin,
+ sal_Int32 & rEnd,
+ CharClass const & rCharClass,
+ INetURLObject::EncodeMechanism eMechanism,
+ rtl_TextEncoding eCharset)
+{
+ if (rBegin > rEnd || rEnd > rText.getLength())
+ return OUString();
+
+ // Search for the first substring of [rBegin..rEnd[ that matches any of the
+ // following productions (for which the appropriate style bit is set in
+ // eStyle, if applicable).
+
+ // 1st Production (known scheme):
+ // \B1 <one of the known schemes, except file> ":" 1*wchar ["#" 1*wchar]
+ // \B1
+
+ // 2nd Production (file):
+ // \B1 "FILE:" 1*(wchar / "\" / "|") ["#" 1*wchar] \B1
+
+ // 3rd Production (ftp):
+ // \B1 "FTP" 2*("." label) ["/" *wchar] ["#" 1*wchar] \B1
+
+ // 4th Production (http):
+ // \B1 "WWW" 2*("." label) ["/" *wchar] ["#" 1*wchar] \B1
+
+ // 5th Production (mailto):
+ // \B2 local-part "@" domain \B1
+
+ // 6th Production (UNC file):
+ // \B1 "\\" domain "\" *(wchar / "\") \B1
+
+ // 7th Production (DOS file):
+ // \B1 ALPHA ":\" *(wchar / "\") \B1
+
+ // 8th Production (Unix-like DOS file):
+ // \B1 ALPHA ":/" *(wchar / "\") \B1
+
+ // The productions use the following auxiliary rules.
+
+ // local-part = atom *("." atom)
+ // atom = 1*(alphanum / "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+"
+ // / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}"
+ // / "~")
+ // domain = label *("." label)
+ // label = alphanum [*(alphanum / "-") alphanum]
+ // alphanum = ALPHA / DIGIT
+ // wchar = <any uric character (ignoring the escaped rule), or "%", or
+ // a letter or digit (according to rCharClass)>
+
+ // "\B1" (boundary 1) stands for the beginning or end of the block of text,
+ // or a character that is neither (a) a letter or digit (according to
+ // rCharClass), nor (b) any of "$", "%", "&", "-", "/", "@", or "\".
+ // (FIXME: What was the rationale for this set of punctuation characters?)
+
+ // "\B2" (boundary 2) stands for the beginning or end of the block of text,
+ // or a character that is neither (a) a letter or digit (according to
+ // rCharClass), nor (b) any of "!", "#", "$", "%", "&", "'", "*", "+", "-",
+ // "/", "=", "?", "@", "^", "_", "`", "{", "|", "}", or "~" (i.e., an RFC
+ // 822 <atom> character, or "@" from \B1's set above).
+
+ // Productions 1--4, and 6--8 try to find a maximum-length match, but they
+ // stop at the first <wchar> character that is a "\B1" character which is
+ // only followed by "\B1" characters (taking "\" and "|" characters into
+ // account appropriately). Production 5 simply tries to find a maximum-
+ // length match.
+
+ // Productions 1--4 use the given eMechanism and eCharset. Productions 5--9
+ // use EncodeMechanism::All.
+
+ // Productions 6--9 are only applicable if the FSysStyle::Dos bit is set in
+ // eStyle.
+
+ // tdf#145381: In addition to the productions I added a mechanism to detect
+ // matching brackets. The task presents the case of an url that ends on a
+ // closing bracket. This needs to be detected as part of the uri in the case
+ // that a matching opening bracket exists.
+
+ bool bBoundary1 = true;
+ bool bBoundary2 = true;
+ for (sal_Int32 nPos = rBegin; nPos != rEnd; nPos = nextChar(rText, nPos))
+ {
+ sal_Unicode c = rText[nPos];
+ if (bBoundary1)
+ {
+ if (rtl::isAsciiAlpha(c))
+ {
+ sal_Int32 i = nPos;
+ INetProtocol eScheme = INetURLObject::CompareProtocolScheme(rText.subView(i, rEnd - i));
+ if (eScheme == INetProtocol::File) // 2nd
+ {
+ while (rText[i++] != ':') ;
+ sal_Int32 nPrefixEnd = i;
+ sal_Int32 nUriEnd = i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd, nullptr, true,
+ true)) ;
+ if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
+ {
+ ++i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
+ }
+ if (nUriEnd != nPrefixEnd
+ && isBoundary1(rCharClass, rText, nUriEnd, rEnd))
+ {
+ INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
+ INetProtocol::File, eMechanism, eCharset,
+ FSysStyle::Detect);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = nUriEnd;
+ return
+ aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+ else if (eScheme != INetProtocol::NotValid) // 1st
+ {
+ while (rText[i++] != ':') ;
+ sal_Int32 nPrefixEnd = i;
+ sal_Int32 nUriEnd = i;
+ sal_Int32 nMatchingBracketDepth = 0;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd,
+ &nMatchingBracketDepth)) ;
+ if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
+ {
+ ++i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
+ }
+ if (nUriEnd != nPrefixEnd
+ && (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
+ || rText[nUriEnd] == '\\'))
+ {
+ INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
+ INetProtocol::Http, eMechanism,
+ eCharset);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = nUriEnd;
+ return
+ aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+
+ // 3rd, 4th:
+ i = nPos;
+ sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
+ if (nLabels >= 3
+ && rText[nPos + 3] == '.'
+ && (((rText[nPos] == 'w'
+ || rText[nPos] == 'W')
+ && (rText[nPos + 1] == 'w'
+ || rText[nPos + 1] == 'W')
+ && (rText[nPos + 2] == 'w'
+ || rText[nPos + 2] == 'W'))
+ || ((rText[nPos] == 'f'
+ || rText[nPos] == 'F')
+ && (rText[nPos + 1] == 't'
+ || rText[nPos + 1] == 'T')
+ && (rText[nPos + 2] == 'p'
+ || rText[nPos + 2] == 'P'))))
+ // (note that rText.GetChar(nPos + 3) is guaranteed to be
+ // valid)
+ {
+ sal_Int32 nUriEnd = i;
+ if (i != rEnd && rText[i] == '/')
+ {
+ nUriEnd = ++i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
+ }
+ if (i != rEnd && rText[i] == '#')
+ {
+ ++i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
+ }
+ if (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
+ || rText[nUriEnd] == '\\')
+ {
+ INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
+ INetProtocol::Http, eMechanism,
+ eCharset);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = nUriEnd;
+ return
+ aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+
+ if (rEnd - nPos >= 3
+ && rText[nPos + 1] == ':'
+ && (rText[nPos + 2] == '/'
+ || rText[nPos + 2] == '\\')) // 7th, 8th
+ {
+ i = nPos + 3;
+ sal_Int32 nUriEnd = i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
+ if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
+ {
+ INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
+ INetProtocol::File,
+ INetURLObject::EncodeMechanism::All,
+ RTL_TEXTENCODING_UTF8,
+ FSysStyle::Dos);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = nUriEnd;
+ return
+ aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+ }
+ else if (rEnd - nPos >= 2
+ && rText[nPos] == '\\'
+ && rText[nPos + 1] == '\\') // 6th
+ {
+ sal_Int32 i = nPos + 2;
+ sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
+ if (nLabels >= 1 && i != rEnd && rText[i] == '\\')
+ {
+ sal_Int32 nUriEnd = ++i;
+ while (i != rEnd
+ && checkWChar(rCharClass, rText, &i, &nUriEnd,
+ nullptr, true)) ;
+ if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
+ {
+ INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
+ INetProtocol::File,
+ INetURLObject::EncodeMechanism::All,
+ RTL_TEXTENCODING_UTF8,
+ FSysStyle::Dos);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = nUriEnd;
+ return
+ aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+ }
+ }
+ if (bBoundary2 && INetMIME::isAtomChar(c)) // 5th
+ {
+ bool bDot = false;
+ for (sal_Int32 i = nPos + 1; i != rEnd; ++i)
+ {
+ sal_Unicode c2 = rText[i];
+ if (INetMIME::isAtomChar(c2))
+ bDot = false;
+ else if (bDot)
+ break;
+ else if (c2 == '.')
+ bDot = true;
+ else
+ {
+ if (c2 == '@')
+ {
+ ++i;
+ sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
+ if (nLabels >= 1
+ && isBoundary1(rCharClass, rText, i, rEnd))
+ {
+ INetURLObject aUri(rText.subView(nPos, i - nPos),
+ INetProtocol::Mailto,
+ INetURLObject::EncodeMechanism::All);
+ if (!aUri.HasError())
+ {
+ rBegin = nPos;
+ rEnd = i;
+ return aUri.GetMainURL(
+ INetURLObject::DecodeMechanism::ToIUri);
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ bBoundary1 = isBoundary1(rCharClass, rText, nPos, rEnd);
+ bBoundary2 = isBoundary2(rCharClass, rText, nPos, rEnd);
+ }
+ rBegin = rEnd;
+ return OUString();
+}
+
+OUString URIHelper::FindFirstDOIInText(OUString const & rText,
+ sal_Int32 & rBegin,
+ sal_Int32 & rEnd,
+ CharClass const & rCharClass)
+{
+ if (rBegin > rEnd || rEnd > rText.getLength())
+ return OUString();
+
+ sal_Int32 start = 7;
+ sal_Int32 count = rEnd-rBegin;
+ OUString candidate(rText.subView(rBegin, count));
+ // Match with regex "doi:10\.\d{4,9}\/[-._;()\/:a-zA-Z0-9]+"
+ if (candidate.startsWithIgnoreAsciiCase("doi:10."))
+ {
+ bool flag = true;
+ sal_Int32 digit = 0;
+ for (sal_Int32 i=start; i<count; i++)
+ {
+ sal_Unicode c = candidate[i];
+ // Match 4 to 9 digits before slash
+ if (digit >= 0)
+ {
+ if (digit>9)
+ {
+ flag = false;
+ break;
+ }
+
+ if ( rCharClass.isDigit(candidate,i) )
+ {
+ digit++;
+ }
+ else if (c=='/' && digit>=4 && i<count-1)
+ {
+ digit=-1;
+ }
+ else
+ {
+ flag = false;
+ break;
+ }
+ }
+ // Match [-._;()\/:a-zA-Z0-9] after slash
+ else if (!( rCharClass.isAlphaNumeric(candidate, i) || c == '.' || c == '-' || c=='_' ||
+ c==';' || c=='(' || c==')' || c=='\\' || (c=='/' && i<count-1) || c==':'))
+ {
+ flag = false;
+ break;
+ }
+ }
+ if (flag && digit==-1)
+ {
+ return OUString::Concat("https://doi.org/")+candidate.subView(4);
+ }
+ }
+ rBegin = rEnd;
+ return OUString();
+}
+
+OUString URIHelper::removePassword(OUString const & rURI,
+ INetURLObject::EncodeMechanism eEncodeMechanism,
+ INetURLObject::DecodeMechanism eDecodeMechanism,
+ rtl_TextEncoding eCharset)
+{
+ INetURLObject aObj(rURI, eEncodeMechanism, eCharset);
+ return aObj.HasError() ?
+ rURI :
+ aObj.GetURLNoPass(eDecodeMechanism, eCharset);
+}
+
+OUString URIHelper::resolveIdnaHost(OUString const & url) {
+ css::uno::Reference<css::uri::XUriReference> uri(
+ css::uri::UriReferenceFactory::create(
+ comphelper::getProcessComponentContext())
+ ->parse(url));
+ if (!(uri.is() && uri->hasAuthority())) {
+ return url;
+ }
+ auto auth(uri->getAuthority());
+ if (auth.isEmpty())
+ return url;
+ sal_Int32 hostStart = auth.indexOf('@') + 1;
+ sal_Int32 hostEnd = auth.getLength();
+ while (hostEnd > hostStart && rtl::isAsciiDigit(auth[hostEnd - 1])) {
+ --hostEnd;
+ }
+ if (hostEnd > hostStart && auth[hostEnd - 1] == ':') {
+ --hostEnd;
+ } else {
+ hostEnd = auth.getLength();
+ }
+ auto asciiOnly = true;
+ for (auto i = hostStart; i != hostEnd; ++i) {
+ if (!rtl::isAscii(auth[i])) {
+ asciiOnly = false;
+ break;
+ }
+ }
+ if (asciiOnly) {
+ // Avoid icu::IDNA case normalization in purely non-IDNA domain names:
+ return url;
+ }
+ UErrorCode e = U_ZERO_ERROR;
+ std::unique_ptr<icu::IDNA> idna(
+ icu::IDNA::createUTS46Instance(
+ (UIDNA_USE_STD3_RULES | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_CHECK_CONTEXTO),
+ e));
+ if (U_FAILURE(e)) {
+ SAL_WARN("vcl.gdi", "icu::IDNA::createUTS46Instance " << e);
+ return url;
+ }
+ icu::UnicodeString ascii;
+ icu::IDNAInfo info;
+ idna->nameToASCII(
+ icu::UnicodeString(
+ reinterpret_cast<UChar const *>(auth.getStr() + hostStart),
+ hostEnd - hostStart),
+ ascii, info, e);
+ if (U_FAILURE(e) || info.hasErrors()) {
+ return url;
+ }
+ OUStringBuffer buf(uri->getScheme());
+ buf.append(OUString::Concat("://") + auth.subView(0, hostStart));
+ buf.append(
+ reinterpret_cast<sal_Unicode const *>(ascii.getBuffer()),
+ ascii.length());
+ buf.append(auth.subView(hostEnd) + uri->getPath());
+ if (uri->hasQuery()) {
+ buf.append("?" + uri->getQuery());
+ }
+ if (uri->hasFragment()) {
+ buf.append("#" + uri->getFragment());
+ }
+ return buf.makeStringAndClear();
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */