diff options
Diffstat (limited to '')
-rw-r--r-- | svl/source/misc/adrparse.cxx | 725 |
1 files changed, 725 insertions, 0 deletions
diff --git a/svl/source/misc/adrparse.cxx b/svl/source/misc/adrparse.cxx new file mode 100644 index 000000000..16b3aa4c7 --- /dev/null +++ b/svl/source/misc/adrparse.cxx @@ -0,0 +1,725 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <rtl/ustrbuf.hxx> +#include <svl/adrparse.hxx> + +namespace +{ + +enum ElementType { ELEMENT_START, ELEMENT_DELIM, ELEMENT_ITEM, ELEMENT_END }; + +struct ParsedAddrSpec +{ + sal_Unicode const * m_pBegin; + sal_Unicode const * m_pEnd; + ElementType m_eLastElem; + bool m_bAtFound; + bool m_bReparse; + + ParsedAddrSpec() { reset(); } + + bool isPoorlyValid() const { return m_eLastElem >= ELEMENT_ITEM; } + + bool isValid() const { return isPoorlyValid() && m_bAtFound; } + + void reset(); + + void finish(); +}; + +void ParsedAddrSpec::reset() +{ + m_pBegin = nullptr; + m_pEnd = nullptr; + m_eLastElem = ELEMENT_START; + m_bAtFound = false; + m_bReparse = false; +} + +void ParsedAddrSpec::finish() +{ + if (isPoorlyValid()) + m_eLastElem = ELEMENT_END; + else + reset(); +} + +} + +class SvAddressParser_Impl +{ + enum State { BEFORE_COLON, BEFORE_LESS, AFTER_LESS, AFTER_GREATER }; + + enum TokenType: sal_uInt32 { + TOKEN_QUOTED = 0x80000000, TOKEN_DOMAIN, TOKEN_COMMENT, TOKEN_ATOM }; + + sal_Unicode const * m_pInputPos; + sal_Unicode const * m_pInputEnd; + sal_uInt32 m_nCurToken; + sal_Unicode const * m_pCurTokenBegin; + sal_Unicode const * m_pCurTokenEnd; + sal_Unicode const * m_pCurTokenContentBegin; + sal_Unicode const * m_pCurTokenContentEnd; + bool m_bCurTokenReparse; + ParsedAddrSpec m_aOuterAddrSpec; + ParsedAddrSpec m_aInnerAddrSpec; + ParsedAddrSpec * m_pAddrSpec; + sal_Unicode const * m_pRealNameBegin; + sal_Unicode const * m_pRealNameEnd; + sal_Unicode const * m_pRealNameContentBegin; + sal_Unicode const * m_pRealNameContentEnd; + bool m_bRealNameReparse; + bool m_bRealNameFinished; + sal_Unicode const * m_pFirstCommentBegin; + sal_Unicode const * m_pFirstCommentEnd; + bool m_bFirstCommentReparse; + State m_eState; + TokenType m_eType; + + inline void resetRealNameAndFirstComment(); + + inline void reset(); + + void addTokenToAddrSpec(ElementType eTokenElem); + + inline void addTokenToRealName(); + + bool readToken(); + + static OUString reparse(sal_Unicode const * pBegin, + sal_Unicode const * pEnd, bool bAddrSpec); + + static OUString reparseComment(sal_Unicode const * pBegin, + sal_Unicode const * pEnd); + +public: + SvAddressParser_Impl(SvAddressParser * pParser, const OUString& rIn); +}; + +inline void SvAddressParser_Impl::resetRealNameAndFirstComment() +{ + m_pRealNameBegin = nullptr; + m_pRealNameEnd = nullptr; + m_pRealNameContentBegin = nullptr; + m_pRealNameContentEnd = nullptr; + m_bRealNameReparse = false; + m_bRealNameFinished = false; + m_pFirstCommentBegin = nullptr; + m_pFirstCommentEnd = nullptr; + m_bFirstCommentReparse = false; +} + +inline void SvAddressParser_Impl::reset() +{ + m_aOuterAddrSpec.reset(); + m_aInnerAddrSpec.reset(); + m_pAddrSpec = &m_aOuterAddrSpec; + resetRealNameAndFirstComment(); + m_eState = BEFORE_COLON; + m_eType = TOKEN_ATOM; +} + +void SvAddressParser_Impl::addTokenToAddrSpec(ElementType eTokenElem) +{ + if (!m_pAddrSpec->m_pBegin) + m_pAddrSpec->m_pBegin = m_pCurTokenBegin; + else if (m_pAddrSpec->m_pEnd < m_pCurTokenBegin) + m_pAddrSpec->m_bReparse = true; + m_pAddrSpec->m_pEnd = m_pCurTokenEnd; + m_pAddrSpec->m_eLastElem = eTokenElem; +} + +inline void SvAddressParser_Impl::addTokenToRealName() +{ + if (!m_bRealNameFinished && m_eState != AFTER_LESS) + { + if (!m_pRealNameBegin) + m_pRealNameBegin = m_pRealNameContentBegin = m_pCurTokenBegin; + else if (m_pRealNameEnd < m_pCurTokenBegin - 1 + || (m_pRealNameEnd == m_pCurTokenBegin - 1 + && *m_pRealNameEnd != ' ')) + m_bRealNameReparse = true; + m_pRealNameEnd = m_pRealNameContentEnd = m_pCurTokenEnd; + } +} + + +// SvAddressParser_Impl + + +bool SvAddressParser_Impl::readToken() +{ + m_nCurToken = m_eType; + m_bCurTokenReparse = false; + switch (m_eType) + { + case TOKEN_QUOTED: + { + m_pCurTokenBegin = m_pInputPos - 1; + m_pCurTokenContentBegin = m_pInputPos; + bool bEscaped = false; + for (;;) + { + if (m_pInputPos >= m_pInputEnd) + return false; + sal_Unicode cChar = *m_pInputPos++; + if (bEscaped) + { + m_bCurTokenReparse = true; + bEscaped = false; + } + else if (cChar == '"') + { + m_pCurTokenEnd = m_pInputPos; + m_pCurTokenContentEnd = m_pInputPos - 1; + return true; + } + else if (cChar == '\\') + bEscaped = true; + } + } + + case TOKEN_DOMAIN: + { + m_pCurTokenBegin = m_pInputPos - 1; + m_pCurTokenContentBegin = m_pInputPos; + bool bEscaped = false; + for (;;) + { + if (m_pInputPos >= m_pInputEnd) + return false; + sal_Unicode cChar = *m_pInputPos++; + if (bEscaped) + bEscaped = false; + else if (cChar == ']') + { + m_pCurTokenEnd = m_pInputPos; + return true; + } + else if (cChar == '\\') + bEscaped = true; + } + } + + case TOKEN_COMMENT: + { + m_pCurTokenBegin = m_pInputPos - 1; + m_pCurTokenContentBegin = nullptr; + m_pCurTokenContentEnd = nullptr; + bool bEscaped = false; + int nLevel = 0; + for (;;) + { + if (m_pInputPos >= m_pInputEnd) + return false; + sal_Unicode cChar = *m_pInputPos++; + if (bEscaped) + { + m_bCurTokenReparse = true; + m_pCurTokenContentEnd = m_pInputPos; + bEscaped = false; + } + else if (cChar == '(') + { + if (!m_pCurTokenContentBegin) + m_pCurTokenContentBegin = m_pInputPos - 1; + m_pCurTokenContentEnd = m_pInputPos; + ++nLevel; + } + else if (cChar == ')') + if (nLevel) + { + m_pCurTokenContentEnd = m_pInputPos; + --nLevel; + } + else + return true; + else if (cChar == '\\') + { + if (!m_pCurTokenContentBegin) + m_pCurTokenContentBegin = m_pInputPos - 1; + bEscaped = true; + } + else if (cChar > ' ' && cChar != 0x7F) // DEL + { + if (!m_pCurTokenContentBegin) + m_pCurTokenContentBegin = m_pInputPos - 1; + m_pCurTokenContentEnd = m_pInputPos; + } + } + } + + default: + { + sal_Unicode cChar; + for (;;) + { + if (m_pInputPos >= m_pInputEnd) + return false; + cChar = *m_pInputPos++; + if (cChar > ' ' && cChar != 0x7F) // DEL + break; + } + m_pCurTokenBegin = m_pInputPos - 1; + if (cChar == '"' || cChar == '(' || cChar == ')' || cChar == ',' + || cChar == '.' || cChar == ':' || cChar == ';' + || cChar == '<' || cChar == '>' || cChar == '@' + || cChar == '[' || cChar == '\\' || cChar == ']') + { + m_nCurToken = cChar; + m_pCurTokenEnd = m_pInputPos; + return true; + } + else + for (;;) + { + if (m_pInputPos >= m_pInputEnd) + { + m_pCurTokenEnd = m_pInputPos; + return true; + } + cChar = *m_pInputPos++; + if (cChar <= ' ' || cChar == '"' || cChar == '(' + || cChar == ')' || cChar == ',' || cChar == '.' + || cChar == ':' || cChar == ';' || cChar == '<' + || cChar == '>' || cChar == '@' || cChar == '[' + || cChar == '\\' || cChar == ']' + || cChar == 0x7F) // DEL + { + m_pCurTokenEnd = --m_pInputPos; + return true; + } + } + } + } +} + +// static +OUString SvAddressParser_Impl::reparse(sal_Unicode const * pBegin, + sal_Unicode const * pEnd, bool bAddrSpec) +{ + OUStringBuffer aResult; + TokenType eMode = TOKEN_ATOM; + bool bEscaped = false; + bool bEndsWithSpace = false; + int nLevel = 0; + while (pBegin < pEnd) + { + sal_Unicode cChar = *pBegin++; + switch (eMode) + { + case TOKEN_QUOTED: + if (bEscaped) + { + aResult.append(cChar); + bEscaped = false; + } + else if (cChar == '"') + { + if (bAddrSpec) + aResult.append(cChar); + eMode = TOKEN_ATOM; + } + else if (cChar == '\\') + { + if (bAddrSpec) + aResult.append(cChar); + bEscaped = true; + } + else + aResult.append(cChar); + break; + + case TOKEN_DOMAIN: + if (bEscaped) + { + aResult.append(cChar); + bEscaped = false; + } + else if (cChar == ']') + { + aResult.append(cChar); + eMode = TOKEN_ATOM; + } + else if (cChar == '\\') + { + if (bAddrSpec) + aResult.append(cChar); + bEscaped = true; + } + else + aResult.append(cChar); + break; + + case TOKEN_COMMENT: + if (bEscaped) + bEscaped = false; + else if (cChar == '(') + ++nLevel; + else if (cChar == ')') + if (nLevel) + --nLevel; + else + eMode = TOKEN_ATOM; + else if (cChar == '\\') + bEscaped = true; + break; + + case TOKEN_ATOM: + if (cChar <= ' ' || cChar == 0x7F) // DEL + { + if (!bAddrSpec && !bEndsWithSpace) + { + aResult.append(' '); + bEndsWithSpace = true; + } + } + else if (cChar == '(') + { + if (!bAddrSpec && !bEndsWithSpace) + { + aResult.append(' '); + bEndsWithSpace = true; + } + eMode = TOKEN_COMMENT; + } + else + { + bEndsWithSpace = false; + if (cChar == '"') + { + if (bAddrSpec) + aResult.append(cChar); + eMode = TOKEN_QUOTED; + } + else if (cChar == '[') + { + aResult.append(cChar); + eMode = TOKEN_QUOTED; + } + else + aResult.append(cChar); + } + break; + } + } + return aResult.makeStringAndClear(); +} + +// static +OUString SvAddressParser_Impl::reparseComment(sal_Unicode const * pBegin, + sal_Unicode const * pEnd) +{ + OUStringBuffer aResult; + while (pBegin < pEnd) + { + sal_Unicode cChar = *pBegin++; + if (cChar == '\\') + cChar = *pBegin++; + aResult.append(cChar); + } + return aResult.makeStringAndClear(); +} + +SvAddressParser_Impl::SvAddressParser_Impl(SvAddressParser * pParser, + const OUString& rInput) + : m_pCurTokenBegin(nullptr) + , m_pCurTokenEnd(nullptr) + , m_pCurTokenContentBegin(nullptr) + , m_pCurTokenContentEnd(nullptr) +{ + m_pInputPos = rInput.getStr(); + m_pInputEnd = m_pInputPos + rInput.getLength(); + + reset(); + bool bDone = false; + for (;;) + { + if (!readToken()) + { + m_bRealNameFinished = true; + if (m_eState == AFTER_LESS) + m_nCurToken = '>'; + else + { + m_nCurToken = ','; + bDone = true; + } + } + switch (m_nCurToken) + { + case TOKEN_QUOTED: + if (m_pAddrSpec->m_eLastElem != ELEMENT_END) + { + if (m_pAddrSpec->m_bAtFound + || m_pAddrSpec->m_eLastElem <= ELEMENT_DELIM) + m_pAddrSpec->reset(); + addTokenToAddrSpec(ELEMENT_ITEM); + } + if (!m_bRealNameFinished && m_eState != AFTER_LESS) + { + if (m_bCurTokenReparse) + { + if (!m_pRealNameBegin) + m_pRealNameBegin = m_pCurTokenBegin; + m_pRealNameEnd = m_pCurTokenEnd; + m_bRealNameReparse = true; + } + else if (m_bRealNameReparse) + m_pRealNameEnd = m_pCurTokenEnd; + else if (!m_pRealNameBegin) + { + m_pRealNameBegin = m_pCurTokenBegin; + m_pRealNameContentBegin = m_pCurTokenContentBegin; + m_pRealNameEnd = m_pRealNameContentEnd = m_pCurTokenContentEnd; + } + else + { + m_pRealNameEnd = m_pCurTokenEnd; + m_bRealNameReparse = true; + } + } + m_eType = TOKEN_ATOM; + break; + + case TOKEN_DOMAIN: + if (m_pAddrSpec->m_eLastElem != ELEMENT_END) + { + if (m_pAddrSpec->m_bAtFound && m_pAddrSpec->m_eLastElem == ELEMENT_DELIM) + addTokenToAddrSpec(ELEMENT_ITEM); + else + m_pAddrSpec->reset(); + } + addTokenToRealName(); + m_eType = TOKEN_ATOM; + break; + + case TOKEN_COMMENT: + if (!m_bRealNameFinished && m_eState != AFTER_LESS + && !m_pFirstCommentBegin && m_pCurTokenContentBegin) + { + m_pFirstCommentBegin = m_pCurTokenContentBegin; + m_pFirstCommentEnd = m_pCurTokenContentEnd; + m_bFirstCommentReparse = m_bCurTokenReparse; + } + m_eType = TOKEN_ATOM; + break; + + case TOKEN_ATOM: + if (m_pAddrSpec->m_eLastElem != ELEMENT_END) + { + if (m_pAddrSpec->m_eLastElem != ELEMENT_DELIM) + m_pAddrSpec->reset(); + addTokenToAddrSpec(ELEMENT_ITEM); + } + addTokenToRealName(); + break; + + case '(': + m_eType = TOKEN_COMMENT; + break; + + case ')': + case '\\': + case ']': + m_pAddrSpec->finish(); + addTokenToRealName(); + break; + + case '<': + switch (m_eState) + { + case BEFORE_COLON: + case BEFORE_LESS: + m_aOuterAddrSpec.finish(); + if (m_pRealNameBegin) + m_bRealNameFinished = true; + m_pAddrSpec = &m_aInnerAddrSpec; + m_eState = AFTER_LESS; + break; + + case AFTER_LESS: + m_aInnerAddrSpec.finish(); + break; + + case AFTER_GREATER: + m_aOuterAddrSpec.finish(); + addTokenToRealName(); + break; + } + break; + + case '>': + if (m_eState == AFTER_LESS) + { + m_aInnerAddrSpec.finish(); + if (m_aInnerAddrSpec.isValid()) + m_aOuterAddrSpec.m_eLastElem = ELEMENT_END; + m_pAddrSpec = &m_aOuterAddrSpec; + m_eState = AFTER_GREATER; + } + else + { + m_aOuterAddrSpec.finish(); + addTokenToRealName(); + } + break; + + case '@': + if (m_pAddrSpec->m_eLastElem != ELEMENT_END) + { + if (!m_pAddrSpec->m_bAtFound + && m_pAddrSpec->m_eLastElem == ELEMENT_ITEM) + { + addTokenToAddrSpec(ELEMENT_DELIM); + m_pAddrSpec->m_bAtFound = true; + } + else + m_pAddrSpec->reset(); + } + addTokenToRealName(); + break; + + case ',': + case ';': + if (m_eState == AFTER_LESS) + if (m_nCurToken == ',') + { + if (m_aInnerAddrSpec.m_eLastElem != ELEMENT_END) + m_aInnerAddrSpec.reset(); + } + else + m_aInnerAddrSpec.finish(); + else + { + if(m_aInnerAddrSpec.isValid() || (!m_aOuterAddrSpec.isValid() && m_aInnerAddrSpec.isPoorlyValid())) + { + m_pAddrSpec = &m_aInnerAddrSpec; + } + else if(m_aOuterAddrSpec.isPoorlyValid()) + { + m_pAddrSpec = &m_aOuterAddrSpec; + } + else + { + m_pAddrSpec = nullptr; + } + + if (m_pAddrSpec) + { + OUString aTheAddrSpec; + if (m_pAddrSpec->m_bReparse) + aTheAddrSpec = reparse(m_pAddrSpec->m_pBegin, m_pAddrSpec->m_pEnd, true); + else + { + sal_Int32 nLen = m_pAddrSpec->m_pEnd - m_pAddrSpec->m_pBegin; + if (nLen == rInput.getLength()) + aTheAddrSpec = rInput; + else + aTheAddrSpec = rInput.copy( (m_pAddrSpec->m_pBegin - rInput.getStr()), + nLen); + } + OUString aTheRealName; + if (!m_pRealNameBegin || + (m_pAddrSpec == &m_aOuterAddrSpec && + m_pRealNameBegin == m_aOuterAddrSpec.m_pBegin && + m_pRealNameEnd == m_aOuterAddrSpec.m_pEnd && + m_pFirstCommentBegin)) + { + if (!m_pFirstCommentBegin) + aTheRealName = aTheAddrSpec; + else if (m_bFirstCommentReparse) + aTheRealName = reparseComment(m_pFirstCommentBegin, + m_pFirstCommentEnd); + else + aTheRealName = rInput.copy( (m_pFirstCommentBegin - rInput.getStr()), + (m_pFirstCommentEnd - m_pFirstCommentBegin)); + } + else if (m_bRealNameReparse) + aTheRealName = reparse(m_pRealNameBegin, m_pRealNameEnd, false); + else + { + sal_Int32 nLen = m_pRealNameContentEnd - m_pRealNameContentBegin; + if (nLen == rInput.getLength()) + aTheRealName = rInput; + else + aTheRealName = rInput.copy( (m_pRealNameContentBegin - rInput.getStr()), nLen); + } + pParser->m_vAddresses.emplace_back( aTheAddrSpec ); + } + if (bDone) + return; + reset(); + } + break; + + case ':': + switch (m_eState) + { + case BEFORE_COLON: + m_aOuterAddrSpec.reset(); + resetRealNameAndFirstComment(); + m_eState = BEFORE_LESS; + break; + + case BEFORE_LESS: + case AFTER_GREATER: + m_aOuterAddrSpec.finish(); + addTokenToRealName(); + break; + + case AFTER_LESS: + m_aInnerAddrSpec.reset(); + break; + } + break; + + case '"': + m_eType = TOKEN_QUOTED; + break; + + case '.': + if (m_pAddrSpec->m_eLastElem != ELEMENT_END) + { + if (m_pAddrSpec->m_eLastElem != ELEMENT_DELIM) + addTokenToAddrSpec(ELEMENT_DELIM); + else + m_pAddrSpec->reset(); + } + addTokenToRealName(); + break; + + case '[': + m_eType = TOKEN_DOMAIN; + break; + } + } +} + +SvAddressParser::SvAddressParser(const OUString& rInput) +{ + SvAddressParser_Impl aDoParse(this, rInput); +} + +SvAddressParser::~SvAddressParser() +{ +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |