diff options
Diffstat (limited to '')
-rw-r--r-- | include/tools/inetmime.hxx | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/include/tools/inetmime.hxx b/include/tools/inetmime.hxx new file mode 100644 index 000000000..3c0fe30ff --- /dev/null +++ b/include/tools/inetmime.hxx @@ -0,0 +1,241 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ +#ifndef INCLUDED_TOOLS_INETMIME_HXX +#define INCLUDED_TOOLS_INETMIME_HXX + +#include <tools/toolsdllapi.h> +#include <rtl/character.hxx> +#include <rtl/string.hxx> +#include <rtl/ustring.hxx> +#include <tools/debug.hxx> + +#include <unordered_map> + +struct INetContentTypeParameter +{ + /** The optional character set specification (see RFC 2231), in US-ASCII + encoding and converted to lower case. + */ + OString m_sCharset; + + /** The optional language specification (see RFC 2231), in US-ASCII + encoding and converted to lower case. + */ + OString m_sLanguage; + + /** The attribute value. If the value is a quoted-string, it is + 'unpacked.' If a character set is specified, and the value can be + converted to Unicode, this is done. Also, if no character set is + specified, it is first tried to convert the value from UTF-8 encoding + to Unicode, and if that doesn't work (because the value is not in + UTF-8 encoding), it is converted from ISO-8859-1 encoding to Unicode + (which will always work). But if a character set is specified and the + value cannot be converted from that character set to Unicode, special + action is taken to produce a value that can possibly be transformed + back into its original form: Any 8-bit character from a non-encoded + part of the original value is directly converted to Unicode + (effectively handling it as if it was ISO-8859-1 encoded), and any + 8-bit character from an encoded part of the original value is mapped + to the range U+F800..U+F8FF at the top of the Corporate Use Subarea + within Unicode's Private Use Area (effectively adding 0xF800 to the + character's numeric value). + */ + OUString m_sValue; + + /** This is true if the value is successfully converted to Unicode, and + false if the value is a special mixture of ISO-LATIN-1 characters and + characters from Unicode's Private Use Area. + */ + bool m_bConverted; +}; + +/** The key is the name of the attribute, in US-ASCII encoding and converted + to lower case. If a parameter value is split as described in RFC 2231, + there will only be one item for the complete parameter, with the attribute + name lacking any section suffix. + */ +typedef std::unordered_map<OString, INetContentTypeParameter> + INetContentTypeParameterList; + + +class SAL_WARN_UNUSED TOOLS_DLLPUBLIC INetMIME +{ +public: + /** Check for US-ASCII visible character. + + @param nChar Some UCS-4 character. + + @return True if nChar is a US-ASCII visible character (US-ASCII + 0x21--0x7E). + */ + static inline bool isVisible(sal_uInt32 nChar); + + /** Check whether some character is valid within an RFC 822 <atom>. + + @param nChar Some UCS-4 character. + + @return True if nChar is valid within an RFC 822 <atom> (US-ASCII + 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+', + '-', '/', '=', '?', '^', '_', '`', '{', '|', '}', or '~'). + */ + static bool isAtomChar(sal_uInt32 nChar); + + /** Check whether some character is valid within an RFC 2060 <atom>. + + @param nChar Some UCS-4 character. + + @return True if nChar is valid within an RFC 2060 <atom> (US-ASCII + 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '&', ''', '+', ',', '-', + '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', + '|', '}', or '~'). + */ + static bool isIMAPAtomChar(sal_uInt32 nChar); + + /** Get the digit weight of a US-ASCII character. + + @param nChar Some UCS-4 character. + + @return If nChar is a US-ASCII (decimal) digit character (US-ASCII + '0'--'9'), return the corresponding weight (0--9); otherwise, + return -1. + */ + static inline int getWeight(sal_uInt32 nChar); + + /** Get the hexadecimal digit weight of a US-ASCII character. + + @param nChar Some UCS-4 character. + + @return If nChar is a US-ASCII hexadecimal digit character (US-ASCII + '0'--'9', 'A'--'F', or 'a'--'f'), return the corresponding weight + (0--15); otherwise, return -1. + */ + static inline int getHexWeight(sal_uInt32 nChar); + + /** Check two US-ASCII strings for equality, ignoring case. + + @param pBegin1 Points to the start of the first string, must not be + null. + + @param pEnd1 Points past the end of the first string, must be >= + pBegin1. + + @param pString2 Points to the start of the null terminated second + string, must not be null. + + @return True if the two strings are equal, ignoring the case of US- + ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z'). + */ + static bool equalIgnoreCase(const sal_Unicode * pBegin1, + const sal_Unicode * pEnd1, + const char * pString2); + + static bool scanUnsigned(const sal_Unicode *& rBegin, + const sal_Unicode * pEnd, bool bLeadingZeroes, + sal_uInt32 & rValue); + + /** Parse the body of an RFC 2045 Content-Type header field. + + @param pBegin The range (that must be valid) from non-null pBegin, + inclusive. to non-null pEnd, exclusive, forms the body of the + Content-Type header field. It must be of the form + + token "/" token *(";" token "=" (token / quoted-string)) + + with intervening linear white space and comments (cf. RFCs 822, 2045). + The RFC 2231 extensions are supported. The encoding of rMediaType + should be US-ASCII, but any Unicode values in the range U+0080..U+FFFF + are interpreted 'as appropriate.' + + @param pType If not null, returns the type (the first of the above + tokens), in US-ASCII encoding and converted to lower case. + + @param pSubType If not null, returns the sub-type (the second of the + above tokens), in US-ASCII encoding and converted to lower case. + + @param pParameters If not null, returns the parameters as a list of + INetContentTypeParameters (the attributes are in US-ASCII encoding and + converted to lower case, the values are in Unicode encoding). If + null, only the syntax of the parameters is checked, but they are not + returned. + + @return Null if the syntax of the field body is incorrect (i.e., does + not start with type and sub-type tokens). Otherwise, a pointer past the + longest valid input prefix. If null is returned, none of the output + parameters will be modified. + */ + static sal_Unicode const * scanContentType( + std::u16string_view rStr, + OUString * pType = nullptr, OUString * pSubType = nullptr, + INetContentTypeParameterList * pParameters = nullptr); + + static OUString decodeHeaderFieldBody(const OString& rBody); + + /** Get the UTF-32 character at the head of a UTF-16 encoded string. + + @param rBegin Points to the start of the UTF-16 encoded string, must + not be null. On exit, it points past the first UTF-32 character's + encoding. + + @param pEnd Points past the end of the UTF-16 encoded string, must be + strictly greater than rBegin. + + @return The UCS-4 character at the head of the UTF-16 encoded string. + If the string does not start with the UTF-16 encoding of a UCS-32 + character, the first UTF-16 value is returned. + */ + static inline sal_uInt32 getUTF32Character(const sal_Unicode *& rBegin, + const sal_Unicode * pEnd); +}; + +// static +inline bool INetMIME::isVisible(sal_uInt32 nChar) +{ + return nChar >= '!' && nChar <= '~'; +} + +// static +inline int INetMIME::getWeight(sal_uInt32 nChar) +{ + return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : -1; +} + +// static +inline int INetMIME::getHexWeight(sal_uInt32 nChar) +{ + return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : + nChar >= 'A' && nChar <= 'F' ? int(nChar - 'A' + 10) : + nChar >= 'a' && nChar <= 'f' ? int(nChar - 'a' + 10) : -1; +} + +// static +inline sal_uInt32 INetMIME::getUTF32Character(const sal_Unicode *& rBegin, + const sal_Unicode * pEnd) +{ + assert(rBegin && rBegin < pEnd && + "INetMIME::getUTF32Character(): Bad sequence"); + sal_uInt32 nUTF32 = *rBegin++; + if (rBegin < pEnd && rtl::isHighSurrogate(nUTF32) && rtl::isLowSurrogate(rBegin[0])) + nUTF32 = rtl::combineSurrogates(nUTF32, *rBegin++); + return nUTF32; +} + + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |