1 files changed, 740 insertions, 0 deletions
diff --git a/comphelper/source/misc/syntaxhighlight.cxx b/comphelper/source/misc/syntaxhighlight.cxx
new file mode 100644
index 000000000..3ce8086e6
--- /dev/null
+++ b/comphelper/source/misc/syntaxhighlight.cxx
@@ -0,0 +1,740 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+
+#include <rtl/character.hxx>
+#include <unicode/uchar.h>
+#include <comphelper/syntaxhighlight.hxx>
+#include <o3tl/typed_flags_set.hxx>
+
+namespace {
+
+// Bit flags classifying a character for the tokenizer; one character may carry several of them
+enum class CharFlags {
+    StartIdentifier   = 0x0001,   // may begin an identifier
+    InIdentifier      = 0x0002,   // may continue an identifier
+    StartNumber       = 0x0004,   // may begin a numeric literal
+    InNumber          = 0x0008,   // may continue a decimal literal
+    InHexNumber       = 0x0010,   // hexadecimal digit
+    InOctNumber       = 0x0020,   // octal digit
+    StartString       = 0x0040,   // opens a quoted or bracketed literal
+    Operator          = 0x0080,   // operator or punctuation character
+    Space             = 0x0100,   // blank or tab
+    EOL               = 0x0200    // carriage return or line feed
+};
+
+}
+
+namespace o3tl {
+    template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {}; // 0x03ff == all ten CharFlags bits or-ed together
+}
+
+// ATTENTION: all these words need to be in lower case (a token is lower-cased
+// before lookup) and must stay sorted in ascending strcmp order (bsearch is used).
+// NOTE(review): entries containing a space cannot equal a single identifier token - confirm they are needed.
+static const char* strListBasicKeyWords[] = {
+    "access",
+    "alias",
+    "and",
+    "any",
+    "append",
+    "as",
+    "attribute",
+    "base",
+    "binary",
+    "boolean",
+    "byref",
+    "byte",
+    "byval",
+    "call",
+    "case",
+    "cdecl",
+    "classmodule",
+    "close",
+    "compare",
+    "compatible",
+    "const",
+    "currency",
+    "date",
+    "declare",
+    "defbool",
+    "defcur",
+    "defdate",
+    "defdbl",
+    "deferr",
+    "defint",
+    "deflng",
+    "defobj",
+    "defsng",
+    "defstr",
+    "defvar",
+    "dim",
+    "do",
+    "doevents",
+    "double",
+    "each",
+    "else",
+    "elseif",
+    "end",
+    "end enum",
+    "end function",
+    "end if",
+    "end property",
+    "end select",
+    "end sub",
+    "end type",
+    "endif",
+    "enum",
+    "eqv",
+    "erase",
+    "error",
+    "exit",
+    "explicit",
+    "for",
+    "function",
+    "get",
+    "global",
+    "gosub",
+    "goto",
+    "if",
+    "imp",
+    "implements",
+    "in",
+    "input",
+    "integer",
+    "is",
+    "let",
+    "lib",
+    "like",
+    "line",
+    "line input",
+    "local",
+    "lock",
+    "long",
+    "loop",
+    "lprint",
+    "lset",
+    "mod",
+    "name",
+    "new",
+    "next",
+    "not",
+    "object",
+    "on",
+    "open",
+    "option",
+    "optional",
+    "or",
+    "output",
+    "paramarray",
+    "preserve",
+    "print",
+    "private",
+    "property",
+    "public",
+    "random",
+    "read",
+    "redim",
+    "rem",
+    "resume",
+    "return",
+    "rset",
+    "select",
+    "set",
+    "shared",
+    "single",
+    "static",
+    "step",
+    "stop",
+    "string",
+    "sub",
+    "system",
+    "text",
+    "then",
+    "to",
+    "type",
+    "typeof",
+    "until",
+    "variant",
+    "vbasupport",
+    "wend",
+    "while",
+    "with",
+    "withevents",
+    "write",
+    "xor"
+};
+
+// SQL keyword table; it is searched with bsearch after lower-casing the token, so keep it lower case and in ascending strcmp order
+static const char* strListSqlKeyWords[] = {
+    "all",
+    "and",
+    "any",
+    "as",
+    "asc",
+    "avg",
+    "between",
+    "by",
+    "cast",
+    "corresponding",
+    "count",
+    "create",
+    "cross",
+    "delete",
+    "desc",
+    "distinct",
+    "drop",
+    "escape",
+    "except",
+    "exists",
+    "false",
+    "from",
+    "full",
+    "global",
+    "group",
+    "having",
+    "in",
+    "inner",
+    "insert",
+    "intersect",
+    "into",
+    "is",
+    "join",
+    "left",
+    "like",
+    "limit",
+    "local",
+    "match",
+    "max",
+    "min",
+    "natural",
+    "not",
+    "null",
+    "on",
+    "or",
+    "order",
+    "outer",
+    "right",
+    "select",
+    "set",
+    "some",
+    "sum",
+    "table",
+    "temporary",
+    "true",
+    "union",
+    "unique",
+    "unknown",
+    "update",
+    "using",
+    "values",
+    "where"
+};
+
+
+extern "C" {
+// bsearch comparator: arg1 is the search key (a C string), arg2 points at one table entry (a char*)
+static int compare_strings( const void *arg1, const void *arg2 )
+{
+    return strcmp( static_cast<char const *>(arg1), *static_cast<char * const *>(arg2) );
+}
+
+}
+
+namespace
+{
+    // Letter test: an ASCII letter is accepted right away, any other
+    // character is handed to u_isalpha for the decision.
+    bool isAlpha(sal_Unicode c)
+    {
+        return rtl::isAsciiAlpha(c) || u_isalpha(c);
+    }
+}
+
+class SyntaxHighlighter::Tokenizer
+{
+    // Character flags for the characters 0..255, indexed by character code (filled in the constructor)
+    CharFlags aCharTypeTab[256] = {};
+
+    // Auxiliary function: true if c carries at least one of the flags in nTestFlags
+    bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
+
+    // Get the next token starting at pos (pos is advanced); returns false when pos has already reached end
+    bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
+        /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;
+    // Keyword table searched with bsearch and its entry count; both are installed by setKeyWords
+    const char** ppListKeyWords;
+    sal_uInt16 nKeyWordCount;
+
+public:
+    HighlighterLanguage const aLanguage; // language selected at construction; switches the Basic/SQL specific rules
+    // Builds the character table; the keyword table stays empty (nullptr) until setKeyWords is called
+    explicit Tokenizer( HighlighterLanguage aLang );
+    // Tokenizes rLine and appends one portion (start offset, end offset, type) per token to portions
+    void getHighlightPortions(std::u16string_view rLine,
+                               /*out*/std::vector<HighlightPortion>& portions) const;
+    void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount ); // stores the pointer and count, the table is not copied
+};
+
+// Reports whether character c carries at least one of the flags in nTestFlags.
+bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
+{
+    // NUL never matches any flag
+    if( c == 0 )
+        return false;
+
+    // Characters 1..255 are classified by the lookup table
+    if( c <= 255 )
+        return bool(aCharTypeTab[c] & nTestFlags);
+
+    // Beyond the table only the identifier flags can match, and only for letters
+    const CharFlags nIdentifierFlags = CharFlags::StartIdentifier | CharFlags::InIdentifier;
+    return bool(nIdentifierFlags & nTestFlags) && isAlpha(c);
+}
+
+void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
+{
+    nKeyWordCount = nCount;       // number of entries in the table
+    ppListKeyWords = ppKeyWords;  // pointer is stored, not copied; getNextToken searches it with bsearch
+}
+// Scans one token at pos and advances pos behind it; returns false (rpEndPos is then left untouched) if pos == end.
+bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
+    /*out*/TokenType& reType,
+    /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
+{
+    reType = TokenType::Unknown;
+
+    rpStartPos = pos;
+
+    if( pos == end )
+        return false;
+
+    sal_Unicode c = *pos;
+    ++pos;
+    // From here on c is the first character of the token and pos points behind it
+    //*** Go through all possibilities ***
+    // Space? A run of space characters becomes a single whitespace token
+    if ( testCharFlags( c, CharFlags::Space ) )
+    {
+        while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
+            ++pos;
+
+        reType = TokenType::Whitespace;
+    }
+
+    // Identifier?
+    else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
+    {
+        bool bIdentifierChar;
+        do
+        {
+            if (pos == end)
+                break;
+            // Fetch next character
+            c = *pos;
+            bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
+            if( bIdentifierChar )
+                ++pos;
+        }
+        while( bIdentifierChar );
+
+        reType = TokenType::Identifier;
+
+        // Keyword lookup (only if a keyword table has been set)
+        if (ppListKeyWords != nullptr)
+        {
+            int nCount = pos - rpStartPos;
+
+            // No keyword if string contains char > 255
+            bool bCanBeKeyword = true;
+            for( int i = 0 ; i < nCount ; i++ )
+            {
+                if( rpStartPos[i] > 255 )
+                {
+                    bCanBeKeyword = false;
+                    break;
+                }
+            }
+
+            if( bCanBeKeyword )
+            {
+                std::u16string_view aKWString(&*rpStartPos, nCount);
+                OString aByteStr = OUStringToOString(aKWString,
+                    RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
+                if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
+                                                                        compare_strings ) )
+                {
+                    reType = TokenType::Keywords;
+
+                    if( aByteStr == "rem" )
+                    {
+                        // "rem" starts a comment: consume everything up to (not including) EOL or end of input
+                        for (;;)
+                        {
+                            if (pos == end)
+                                break;
+                            sal_Unicode cPeek = *pos;
+                            if ( testCharFlags( cPeek, CharFlags::EOL ) )
+                                break;
+                            ++pos;
+                        }
+
+                        reType = TokenType::Comment;
+                    }
+                }
+            }
+        }
+    }
+
+    // Operator?
+    // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
+    else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
+    {
+        // SQL parameters: a lone '?', or ':' followed by any number of letters
+        if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
+        {
+            if (c!='?')
+            {
+                bool bIdentifierChar;
+                do
+                {
+                    // Get next character
+                    if (pos == end)
+                        break;
+                    c = *pos;
+                    bIdentifierChar = isAlpha(c);
+                    if( bIdentifierChar )
+                        ++pos;
+                }
+                while( bIdentifierChar );
+            }
+            reType = TokenType::Parameter;
+        }
+        else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
+        {
+            if (pos != end && *pos=='-')
+            {
+                // "--" starts an SQL comment: consume up to (not including) EOL or end of input
+                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
+                {
+                    ++pos;
+                }
+                reType = TokenType::Comment;
+            }
+            else
+                reType = TokenType::Operator;
+        }
+        else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
+        {
+            if (pos != end && *pos=='/')
+            {
+                // "//" is treated as an SQL comment, too: consume up to (not including) EOL or end of input
+                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
+                {
+                    ++pos;
+                }
+                reType = TokenType::Comment;
+            }
+            else
+                reType = TokenType::Operator;
+        }
+        else
+        {
+            // Apostrophe is Basic comment
+            if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
+            {
+                // Skip all characters until end of input or end of line:
+                for (;;) {
+                    if (pos == end)
+                        break;
+                    c = *pos;
+                    if (testCharFlags(c, CharFlags::EOL)) {
+                        break;
+                    }
+                    ++pos;
+                }
+
+                reType = TokenType::Comment;
+            }
+
+            // The real operator; can be easily used since not the actual
+            // operator (e.g. +=) is concerned, but the fact that it is one
+            if( reType != TokenType::Comment )
+            {
+                reType = TokenType::Operator;
+            }
+
+        }
+    }
+
+    // Object separator? Must be handled before Number, since '.' is flagged StartNumber as well
+    else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
+    {
+        reType = TokenType::Operator;
+    }
+
+    // Number?
+    else if( testCharFlags( c, CharFlags::StartNumber ) )
+    {
+        reType = TokenType::Number;
+
+        // Number system, 10 = normal, it is changed for Oct/Hex
+        int nRadix = 10;
+
+        // Is it an Oct or a Hex number?
+        if( c == '&' )
+        {
+            // Octal?
+            if( pos != end && (*pos == 'o' || *pos == 'O' ))
+            {
+                // remove o
+                ++pos;
+                nRadix = 8;     // Octal base
+
+                // Read all octal digits
+                while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
+                    ++pos;
+            }
+            // Hexadecimal?
+            else if( pos != end && (*pos == 'h' || *pos == 'H' ))
+            {
+                // remove h
+                ++pos;
+                nRadix = 16;     // Hexadecimal base
+
+                // Read all hexadecimal digits
+                while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
+                    ++pos;
+            }
+            else
+            {
+                reType = TokenType::Operator; // a '&' without o/O/h/H behind it is an operator
+            }
+        }
+
+        // When it is not Oct or Hex, then it is double
+        if( reType == TokenType::Number && nRadix == 10 )
+        {
+            // Flag if the last character is an exponent
+            bool bAfterExpChar = false;
+
+            // Read all numbers
+            while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
+                    (bAfterExpChar && *pos == '+' ) ||
+                    (bAfterExpChar && *pos == '-' ) ))
+                    // After exponent +/- are OK, too
+            {
+                c = *pos++;
+                bAfterExpChar = ( c == 'e' || c == 'E' );
+            }
+        }
+    }
+
+    // String?
+    else if( testCharFlags( c, CharFlags::StartString ) )
+    {
+        // Remember which character has opened the string
+        sal_Unicode cEndString = c;
+        if( c == '[' )
+            cEndString = ']';
+
+        // Read all characters; the loop is also entered at end of input so that the check below can flag the error
+        while( pos == end || *pos != cEndString )
+        {
+            // Detect EOF before reading next char, so we do not lose EOF
+            if( pos == end )
+            {
+                // ERROR: unterminated string literal
+                reType = TokenType::Error;
+                break;
+            }
+            c = *pos++; // note: an EOL character found here has already been consumed into the token
+            if( testCharFlags( c, CharFlags::EOL ) )
+            {
+                // ERROR: unterminated string literal
+                reType = TokenType::Error;
+                break;
+            }
+        }
+
+        if( reType != TokenType::Error )
+        {
+            ++pos; // step over the closing delimiter
+            if( cEndString == ']' )
+                reType = TokenType::Identifier; // text in [brackets] is reported as an identifier
+            else
+                reType = TokenType::String;
+        }
+    }
+
+    // End of line?
+    else if( testCharFlags( c, CharFlags::EOL ) )
+    {
+        // If a different EOL character follows (CR LF or LF CR), it belongs to the same token
+        if (pos != end)
+        {
+            sal_Unicode cNext = *pos;
+            if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
+                ++pos;
+        }
+
+        reType = TokenType::EOL;
+    }
+
+    // All other will remain TokenType::Unknown
+
+    // Save end position
+    rpEndPos = pos;
+    return true;
+}
+// Builds the character classification table for characters 0..255; no keyword table is set here (see setKeyWords).
+SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
+{
+    // Fill character table
+    sal_uInt16 i;
+
+    // Characters that may start and continue an identifier: ASCII letters, '_' and '$'
+    CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
+    for( i = 'a' ; i <= 'z' ; i++ )
+        aCharTypeTab[i] |= nHelpMask;
+    for( i = 'A' ; i <= 'Z' ; i++ )
+        aCharTypeTab[i] |= nHelpMask;
+    aCharTypeTab[int('_')] |= nHelpMask;
+    aCharTypeTab[int('$')] |= nHelpMask;
+
+    // Digit (can continue, but not start, an identifier; starts and continues a number; is a hex digit)
+    nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
+                         CharFlags::InNumber | CharFlags::InHexNumber;
+    for( i = '0' ; i <= '9' ; i++ )
+        aCharTypeTab[i] |= nHelpMask;
+
+    // Add e, E, . and & here manually
+    aCharTypeTab[int('e')] |= CharFlags::InNumber;   // exponent marker inside a number
+    aCharTypeTab[int('E')] |= CharFlags::InNumber;   // exponent marker inside a number
+    aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;   // decimal point, may also start a number
+    aCharTypeTab[int('&')] |= CharFlags::StartNumber;   // prefix of the &O... / &H... literals
+
+    // Hexadecimal digit
+    for( i = 'a' ; i <= 'f' ; i++ )
+        aCharTypeTab[i] |= CharFlags::InHexNumber;
+    for( i = 'A' ; i <= 'F' ; i++ )
+        aCharTypeTab[i] |= CharFlags::InHexNumber;
+
+    // Octal digit
+    for( i = '0' ; i <= '7' ; i++ )
+        aCharTypeTab[i] |= CharFlags::InOctNumber;
+
+    // String literal start/end characters
+    aCharTypeTab[int('\'')] |= CharFlags::StartString;   // for Basic, getNextToken handles ' as a comment before this flag is tested
+    aCharTypeTab[int('\"')] |= CharFlags::StartString;
+    aCharTypeTab[int('[')]  |= CharFlags::StartString;   // closed by ']'; getNextToken reports the result as an identifier
+    aCharTypeTab[int('`')]  |= CharFlags::StartString;
+
+    // Operator characters
+    aCharTypeTab[int('!')] |= CharFlags::Operator;
+    aCharTypeTab[int('%')] |= CharFlags::Operator;
+    // aCharTypeTab[(int)'&'] |= CharFlags::Operator;     Removed because of #i14140
+    aCharTypeTab[int('(')] |= CharFlags::Operator;
+    aCharTypeTab[int(')')] |= CharFlags::Operator;
+    aCharTypeTab[int('*')] |= CharFlags::Operator;
+    aCharTypeTab[int('+')] |= CharFlags::Operator;
+    aCharTypeTab[int(',')] |= CharFlags::Operator;
+    aCharTypeTab[int('-')] |= CharFlags::Operator;
+    aCharTypeTab[int('/')] |= CharFlags::Operator;
+    aCharTypeTab[int(':')] |= CharFlags::Operator;
+    aCharTypeTab[int('<')] |= CharFlags::Operator;
+    aCharTypeTab[int('=')] |= CharFlags::Operator;
+    aCharTypeTab[int('>')] |= CharFlags::Operator;
+    aCharTypeTab[int('?')] |= CharFlags::Operator;
+    aCharTypeTab[int('^')] |= CharFlags::Operator;
+    aCharTypeTab[int('|')] |= CharFlags::Operator;
+    aCharTypeTab[int('~')] |= CharFlags::Operator;
+    aCharTypeTab[int('{')] |= CharFlags::Operator;
+    aCharTypeTab[int('}')] |= CharFlags::Operator;
+    // aCharTypeTab[(int)'['] |= CharFlags::Operator;     Removed because of #i17826
+    aCharTypeTab[int(']')] |= CharFlags::Operator;
+    aCharTypeTab[int(';')] |= CharFlags::Operator;
+
+    // Space
+    aCharTypeTab[int(' ') ] |= CharFlags::Space;
+    aCharTypeTab[int('\t')] |= CharFlags::Space;
+
+    // End of line characters
+    aCharTypeTab[int('\r')] |= CharFlags::EOL;
+    aCharTypeTab[int('\n')] |= CharFlags::EOL;
+
+    ppListKeyWords = nullptr;   // no keyword table yet: identifiers are not checked against keywords
+    nKeyWordCount = 0;
+}
+
+void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine,
+                                                 /*out*/std::vector<HighlightPortion>& portions) const
+{
+    // Tokenize rLine front to back; each token is appended to portions as
+    // (start offset, end offset, type), with offsets counted from the start
+    // of rLine. Entries already present in portions are kept.
+    const auto itLineBegin = rLine.begin();
+    const auto itLineEnd = rLine.end();
+
+    TokenType eTokenType;
+    std::u16string_view::const_iterator itTokenBegin;
+    std::u16string_view::const_iterator itTokenEnd;
+
+    // getNextToken advances itCursor and returns false once it has reached the end
+    for( auto itCursor = itLineBegin;
+         getNextToken( itCursor, itLineEnd, eTokenType, itTokenBegin, itTokenEnd ); )
+        portions.emplace_back( itTokenBegin - itLineBegin, itTokenEnd - itLineBegin, eTokenType );
+}
+
+
+SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
+    m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
+{
+    // Hand the freshly created tokenizer the keyword table of its language.
+    if (language == HighlighterLanguage::Basic)
+    {
+        m_tokenizer->setKeyWords( strListBasicKeyWords, std::size( strListBasicKeyWords ));
+    }
+    else if (language == HighlighterLanguage::SQL)
+    {
+        m_tokenizer->setKeyWords( strListSqlKeyWords, std::size( strListSqlKeyWords ));
+    }
+    else
+    {
+        assert(false); // this cannot happen
+    }
+}
+
+SyntaxHighlighter::~SyntaxHighlighter() = default; // nothing to do beyond destroying the members
+// Tokenizes rLine by forwarding to the tokenizer, which appends one HighlightPortion per token to portions.
+void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
+                                              /*out*/std::vector<HighlightPortion>& portions) const
+{
+    m_tokenizer->getHighlightPortions( rLine, portions );
+}
+// Returns the language that was passed to the constructor (stored in the tokenizer's const aLanguage member).
+HighlighterLanguage SyntaxHighlighter::GetLanguage() const
+{
+    return m_tokenizer->aLanguage;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */