diff options
Diffstat (limited to '')
-rw-r--r-- | include/rtl/uri.h | 362 | ||||
-rw-r--r-- | include/rtl/uri.hxx | 173 |
2 files changed, 535 insertions, 0 deletions
diff --git a/include/rtl/uri.h b/include/rtl/uri.h new file mode 100644 index 0000000000..28975060c0 --- /dev/null +++ b/include/rtl/uri.h @@ -0,0 +1,362 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +/* + * This file is part of LibreOffice published API. + */ + +#ifndef INCLUDED_RTL_URI_H +#define INCLUDED_RTL_URI_H + +#include "sal/config.h" + +#include "rtl/textenc.h" +#include "rtl/ustring.h" +#include "sal/saldllapi.h" +#include "sal/types.h" + +#if defined __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** Various predefined URI 'char classes.' + + A 'char class' defines which (ASCII) characters can be written 'as they + are' in a part of a Uri, and which characters have to be written using + escape sequences ('%' followed by two hex digits). Characters outside + the ASCII range are always written using escape sequences. + + If there are other frequently used char classes, they can be added to + this enumeration; the function rtl_getUriCharClass() has to be adapted + then, too. + */ +typedef enum +{ + /** The empty char class. + + All characters are written using escape sequences. 
+ */ + rtl_UriCharClassNone, + + /** The RFC 2732 @<uric> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and + letters. + + This differs from RFC 3986 @<fragment> in additionally allowing [] + @endverbatim + */ + rtl_UriCharClassUric, + + /** The RFC 2396 @<uric_no_slash> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters. + + This differs from RFC 3986 @<fragment> in additionally encoding / + This differs from RFC 3986 @<pchar> in additionally allowing ? + @endverbatim + */ + rtl_UriCharClassUricNoSlash, + + /** The RFC 2396 @<rel_segment> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters. + + This is the same as RFC 3986 @<segment-nz-nc> + @endverbatim + */ + rtl_UriCharClassRelSegment, + + /** The RFC 2396 @<reg_name> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters. + + This differs from RFC 3986 @<reg_name> in additionally allowing @ + @endverbatim + */ + rtl_UriCharClassRegName, + + /** The RFC 2396 @<userinfo> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters. + + This is the same as RFC 3986 @<userinfo> + @endverbatim + */ + rtl_UriCharClassUserinfo, + + /** The RFC 2396 @<pchar> char class. + + @verbatim + The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters. + + This differs from RFC 3986 @<pchar> in additionally encoding ; + @endverbatim + */ + rtl_UriCharClassPchar, + + /** The char class for the values of uno URL parameters. + + @verbatim + The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters. + @endverbatim + */ + rtl_UriCharClassUnoParamValue, + + rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM +} +rtl_UriCharClass; + +/** The mechanism describing how escape sequences in the input of + rtl_uriEncode() are handled. 
+ */ +typedef enum +{ + /** The special meaning of '%' is ignored (i.e., there are by definition + no escape sequences in the input). + + This mechanism is useful to encode user input as part of a URI (e.g., + the user-supplied password in an ftp URL---'%20abcde' is a valid + password, so do not assume that the '%20' is an escaped space). + */ + rtl_UriEncodeIgnoreEscapes, + + /** All escape sequences ('%' followed by two hex digits) are kept intact, + even if they represent characters that need not be escaped or if they + do not even map to characters in the given charset. + + This mechanism is useful when passing on complete URIs more or less + unmodified (e.g., within an HTTP proxy): missing escape sequences are + added, but existing escape sequences are not touched (except that any + lower case hex digits are replaced by upper case hex digits). + */ + rtl_UriEncodeKeepEscapes, + + /** All escape sequences ('%' followed by two hex digits) are resolved in + a first step; only those that represent characters that need to be + escaped are kept intact. + + This mechanism is useful to properly encode complete URIs entered by + the user: the URI is brought into a 'canonic form,' but care is taken + not to damage (valid) escape sequences the (careful) user already + entered as such. + */ + rtl_UriEncodeCheckEscapes, + + /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting + unmappable characters. + + @since UDK 3.2.0 + */ + rtl_UriEncodeStrict, + + /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting + unmappable characters. + + Also, any escape sequences that are present are always considered to be (potentially broken) + UTF-8. This mechanism is meant to be used on the result of a rtl_UriDecodeToIuri decoding, + which will thus only contain escape sequences representing either ASCII characters or broken + UTF-8 sequences, and which will all be kept as-is. 
+ + @since UDK 3.2.7 + */ + rtl_UriEncodeStrictKeepEscapes, + + rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM +} +rtl_UriEncodeMechanism; + +/** The mechanism describing how rtl_uriDecode() translates (part of) a URI + into a Unicode string. + */ +typedef enum +{ + /** The text is returned completely unmodified. + */ + rtl_UriDecodeNone, + + /** The text is returned in the form of an IURI (cf. + draft-masinter-url-i18n-05.txt). + + All escape sequences representing ASCII characters (%00--%7F) are + kept, all other escape sequences are interpreted as UTF-8 characters + and translated to Unicode, if possible. + */ + rtl_UriDecodeToIuri, + + /** The text is decoded. + + All escape sequences representing characters from the given charset + are decoded and translated to Unicode, if possible. + */ + rtl_UriDecodeWithCharset, + + /** Like rtl_UriDecodeWithCharset, but indicating failure when converting + unmappable characters. + + @since UDK 3.2.0 + */ + rtl_UriDecodeStrict, + + rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM +} +rtl_UriDecodeMechanism; + +/** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode(). + + The function rtl_uriEncode() expects an array of 128 booleans, and this + function maps rtl_UriCharClass enumeration members to such arrays. + + @param eCharClass + Any valid member of rtl_UriCharClass. + + @return + An array of 128 booleans, to be used in calls to rtl_uriEncode(). + */ +SAL_DLLPUBLIC sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass) + SAL_THROW_EXTERN_C(); + +/** Encode a text as (part of) a URI. + + @param pText + Any Unicode string. Must not be null. + + @param pCharClass + A char class, represented as an array of 128 booleans (true means keep the + corresponding ASCII character unencoded, false means encode it). Must not + be null, and the boolean corresponding to the percent sign (0x25) must be + false. (See rtl_getUriCharClass() for a function mapping from + rtl_UriCharClass to such arrays.) 
+ + @param eMechanism + The mechanism describing how escape sequences in the input text are + handled. + + @param eCharset + When Unicode characters from the input text have to be written using + escape sequences (because they are either outside the ASCII range or do + not belong to the given char class), they are first translated into this + charset before being encoded using escape sequences. + + Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape + sequences already present in the input text are interpreted as characters + from this charset. + + @param pResult + Returns an encoded representation of the input text. Must itself not be + null, and must point to either null or a valid string. + + If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be + converted to eCharset because it contains unmappable characters (which + implies that pText is not empty), then an empty string is returned. + */ +SAL_DLLPUBLIC void SAL_CALL rtl_uriEncode( + rtl_uString * pText, + sal_Bool const * pCharClass, + rtl_UriEncodeMechanism eMechanism, + rtl_TextEncoding eCharset, + rtl_uString ** pResult) + SAL_THROW_EXTERN_C(); + +/** Decode (a part of) a URI. + + @param pText + Any Unicode string. Must not be null. (If the input is indeed part of a + valid URI, this string will only contain a subset of the ASCII characters, + but this function also handles other Unicode characters properly.) + + @param eMechanism + The mechanism describing how the input text is translated into a Unicode + string. + + @param eCharset + When the decode mechanism is rtl_UriDecodeWithCharset, all escape + sequences in the input text are interpreted as characters from this + charset. Those characters are translated to Unicode characters in the + resulting output, if possible. + + When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri, + this parameter is ignored (and is best specified as + RTL_TEXTENCODING_UTF8). 
+ + @param pResult + Returns a decoded representation of the input text. Must itself not be + null, and must point to either null or a valid string. + + If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be + converted to eCharset because it contains (encodings of) unmappable + characters (which implies that pText is not empty), then an empty string is + returned. + */ +SAL_DLLPUBLIC void SAL_CALL rtl_uriDecode( + rtl_uString * pText, + rtl_UriDecodeMechanism eMechanism, + rtl_TextEncoding eCharset, + rtl_uString ** pResult) + SAL_THROW_EXTERN_C(); + +/** Convert a relative URI reference into an absolute URI. + + This function uses the strict parser algorithm described in RFC 3986, + section 5.2. + + This function signals exceptions by returning false and letting pException + point to a message explaining the exception. + + @param pBaseUriRef + An absolute URI that serves as the base URI. If it has to be inspected + (i.e., pRelUriRef is not an absolute URI already), and it is not an absolute + URI (i.e., does not begin with a @<scheme ":"> part), an exception will be + signalled. + + @param pRelUriRef + A URI reference that may be either absolute or relative. If it is + absolute, it will be returned unmodified. + + @param pResult + Returns an absolute URI. Must itself not be null, and must point to either + null or a valid string. If an exception is signalled, it is left unchanged. + + @param pException + Returns an explanatory message in case an exception is signalled. Must + itself not be null, and must point to either null or a valid string. If no + exception is signalled, it is left unchanged. + + @return + True if no exception is signalled, otherwise false. 
+ */ +SAL_DLLPUBLIC sal_Bool SAL_CALL rtl_uriConvertRelToAbs( + rtl_uString * pBaseUriRef, + rtl_uString * pRelUriRef, + rtl_uString ** pResult, + rtl_uString ** pException) + SAL_THROW_EXTERN_C(); + +#if defined __cplusplus +} +#endif /* __cplusplus */ + +#endif // INCLUDED_RTL_URI_H + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/rtl/uri.hxx b/include/rtl/uri.hxx new file mode 100644 index 0000000000..b0b3e9ad81 --- /dev/null +++ b/include/rtl/uri.hxx @@ -0,0 +1,173 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +/* + * This file is part of LibreOffice published API. + */ + +#ifndef INCLUDED_RTL_URI_HXX +#define INCLUDED_RTL_URI_HXX + +#include "rtl/malformeduriexception.hxx" +#include "rtl/uri.h" +#include "rtl/textenc.h" +#include "rtl/ustring.hxx" +#include "sal/types.h" + +#if defined LIBO_INTERNAL_ONLY +#include <array> +#include <cassert> +#include <cstddef> +#include <string_view> +#include "config_global.h" +#endif + +namespace rtl { + +/** A wrapper around the C functions from <rtl/uri.h>. 
+ */ +class Uri +{ +public: + /** A wrapper around rtl_uriEncode() from <rtl/uri.h> (see there), using + an array of 128 booleans as char class. + */ + static inline rtl::OUString encode(rtl::OUString const & rText, + sal_Bool const * pCharClass, + rtl_UriEncodeMechanism eMechanism, + rtl_TextEncoding eCharset); + + /** A wrapper around rtl_uriEncode() from <rtl/uri.h> (see there), using + a predefined rtl_UriCharClass enumeration member. + */ + static inline rtl::OUString encode(rtl::OUString const & rText, + rtl_UriCharClass eCharClass, + rtl_UriEncodeMechanism eMechanism, + rtl_TextEncoding eCharset); + + /** A wrapper around rtl_uriDecode() from <rtl/uri.h> (see there). + */ + static inline rtl::OUString decode(rtl::OUString const & rText, + rtl_UriDecodeMechanism eMechanism, + rtl_TextEncoding eCharset); + + /** A wrapper around rtl_uriConvertRelToAbs() from <rtl/uri.h> (see there). + + @exception MalformedUriException + Thrown in case rtl_uriConvertRelToAbs() signals an exception due to a + malformed base URI. 
+ */ + static inline rtl::OUString convertRelToAbs( + rtl::OUString const & rBaseUriRef, rtl::OUString const & rRelUriRef); + +private: + Uri() SAL_DELETED_FUNCTION; + + Uri(Uri &) SAL_DELETED_FUNCTION; + + ~Uri() SAL_DELETED_FUNCTION; + + void operator =(Uri) SAL_DELETED_FUNCTION; +}; + +inline rtl::OUString Uri::encode(rtl::OUString const & rText, + sal_Bool const * pCharClass, + rtl_UriEncodeMechanism eMechanism, + rtl_TextEncoding eCharset) +{ + rtl::OUString aResult; + rtl_uriEncode(rText.pData, + pCharClass, + eMechanism, + eCharset, + &aResult.pData); + return aResult; +} + +inline rtl::OUString Uri::encode(rtl::OUString const & rText, + rtl_UriCharClass eCharClass, + rtl_UriEncodeMechanism eMechanism, + rtl_TextEncoding eCharset) +{ + rtl::OUString aResult; + rtl_uriEncode(rText.pData, + rtl_getUriCharClass(eCharClass), + eMechanism, + eCharset, + &aResult.pData); + return aResult; +} + +inline rtl::OUString Uri::decode(rtl::OUString const & rText, + rtl_UriDecodeMechanism eMechanism, + rtl_TextEncoding eCharset) +{ + rtl::OUString aResult; + rtl_uriDecode(rText.pData, + eMechanism, + eCharset, + &aResult.pData); + return aResult; +} + +inline rtl::OUString Uri::convertRelToAbs(rtl::OUString const & rBaseUriRef, + rtl::OUString const & rRelUriRef) +{ + rtl::OUString aResult; + rtl::OUString aException; + if (!rtl_uriConvertRelToAbs( + rBaseUriRef.pData, + rRelUriRef.pData, &aResult.pData, + &aException.pData)) + throw MalformedUriException(aException); + return aResult; +} + +#if defined LIBO_INTERNAL_ONLY + +constexpr std::size_t UriCharClassSize = 128; + +// Create a char class (for use with rtl_uriEncode and rtl::Uri::encode), represented as a +// compile-time std::array, from a UTF-8 string literal. +// +// The given `unencoded` lists each ASCII character once that shall not be encoded. (It uses a +// UTF-8 string type to emphasize that its characters' values are always interpreted as ASCII +// values.) 
+#if HAVE_CPP_CONSTEVAL
+consteval
+#else
+constexpr
+#endif
+auto createUriCharClass(std::u8string_view unencoded)
+{
+    std::array<sal_Bool, UriCharClassSize> a = {};
+    for (auto c: unencoded) {
+        assert(c < UriCharClassSize && !a[c]); // non-ASCII or duplicate: typo in `unencoded`?
+        a[c] = true;
+    }
+    return a;
+}
+
+#endif
+
+}
+
+#endif // INCLUDED_RTL_URI_HXX
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |