summary | refs | log | tree | commit | diff | stats
path: root/comphelper/source/misc/syntaxhighlight.cxx
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- comphelper/source/misc/syntaxhighlight.cxx 740
1 files changed, 740 insertions, 0 deletions
diff --git a/comphelper/source/misc/syntaxhighlight.cxx b/comphelper/source/misc/syntaxhighlight.cxx
new file mode 100644
index 000000000..3ce8086e6
--- /dev/null
+++ b/comphelper/source/misc/syntaxhighlight.cxx
@@ -0,0 +1,740 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+
+#include <rtl/character.hxx>
+#include <unicode/uchar.h>
+#include <comphelper/syntaxhighlight.hxx>
+#include <o3tl/typed_flags_set.hxx>
+
+namespace {
+
+// Classification bits for a single character; one character may carry several of them
+enum class CharFlags {
+    StartIdentifier = 1 << 0,   // may begin an identifier
+    InIdentifier    = 1 << 1,   // may continue an identifier
+    StartNumber     = 1 << 2,   // may begin a numeric literal
+    InNumber        = 1 << 3,   // may continue a decimal literal
+    InHexNumber     = 1 << 4,   // hexadecimal digit
+    InOctNumber     = 1 << 5,   // octal digit
+    StartString     = 1 << 6,   // opens a quoted or bracketed literal
+    Operator        = 1 << 7,   // operator / punctuation character
+    Space           = 1 << 8,   // blank or tab
+    EOL             = 1 << 9    // line terminator
+};
+
+}
+
+namespace o3tl {
+    // Enables the typed bit operators for CharFlags; the mask 0x03ff covers all ten bits above
+    template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
+}
+
+// ##########################################################################
+// ATTENTION: all these words need to be in lower case and kept in strcmp() order, because identifiers are lower-cased and looked up with bsearch()
+// ##########################################################################
+static const char* strListBasicKeyWords[] = {
+ "access",
+ "alias",
+ "and",
+ "any",
+ "append",
+ "as",
+ "attribute",
+ "base",
+ "binary",
+ "boolean",
+ "byref",
+ "byte",
+ "byval",
+ "call",
+ "case",
+ "cdecl",
+ "classmodule",
+ "close",
+ "compare",
+ "compatible",
+ "const",
+ "currency",
+ "date",
+ "declare",
+ "defbool",
+ "defcur",
+ "defdate",
+ "defdbl",
+ "deferr",
+ "defint",
+ "deflng",
+ "defobj",
+ "defsng",
+ "defstr",
+ "defvar",
+ "dim",
+ "do",
+ "doevents",
+ "double",
+ "each",
+ "else",
+ "elseif",
+ "end",
+ "end enum", // NOTE(review): entries containing a space cannot match a scanned identifier, since space is not an identifier character
+ "end function",
+ "end if",
+ "end property",
+ "end select",
+ "end sub",
+ "end type",
+ "endif",
+ "enum",
+ "eqv",
+ "erase",
+ "error",
+ "exit",
+ "explicit",
+ "for",
+ "function",
+ "get",
+ "global",
+ "gosub",
+ "goto",
+ "if",
+ "imp",
+ "implements",
+ "in",
+ "input",
+ "integer",
+ "is",
+ "let",
+ "lib",
+ "like",
+ "line",
+ "line input",
+ "local",
+ "lock",
+ "long",
+ "loop",
+ "lprint",
+ "lset",
+ "mod",
+ "name",
+ "new",
+ "next",
+ "not",
+ "object",
+ "on",
+ "open",
+ "option",
+ "optional",
+ "or",
+ "output",
+ "paramarray",
+ "preserve",
+ "print",
+ "private",
+ "property",
+ "public",
+ "random",
+ "read",
+ "redim",
+ "rem",
+ "resume",
+ "return",
+ "rset",
+ "select",
+ "set",
+ "shared",
+ "single",
+ "static",
+ "step",
+ "stop",
+ "string",
+ "sub",
+ "system",
+ "text",
+ "then",
+ "to",
+ "type",
+ "typeof",
+ "until",
+ "variant",
+ "vbasupport",
+ "wend",
+ "while",
+ "with",
+ "withevents",
+ "write",
+ "xor"
+};
+
+
+static const char* strListSqlKeyWords[] = { // SQL keywords: lower case and in strcmp() order, as needed for the bsearch() lookup
+ "all",
+ "and",
+ "any",
+ "as",
+ "asc",
+ "avg",
+ "between",
+ "by",
+ "cast",
+ "corresponding",
+ "count",
+ "create",
+ "cross",
+ "delete",
+ "desc",
+ "distinct",
+ "drop",
+ "escape",
+ "except",
+ "exists",
+ "false",
+ "from",
+ "full",
+ "global",
+ "group",
+ "having",
+ "in",
+ "inner",
+ "insert",
+ "intersect",
+ "into",
+ "is",
+ "join",
+ "left",
+ "like",
+ "limit",
+ "local",
+ "match",
+ "max",
+ "min",
+ "natural",
+ "not",
+ "null",
+ "on",
+ "or",
+ "order",
+ "outer",
+ "right",
+ "select",
+ "set",
+ "some",
+ "sum",
+ "table",
+ "temporary",
+ "true",
+ "union",
+ "unique",
+ "unknown",
+ "update",
+ "using",
+ "values",
+ "where"
+};
+
+
+extern "C" {
+
+// Comparator for bsearch() over a table of C strings:
+// arg1 is the search key itself, arg2 points at one table entry (a char pointer).
+// Returns the strcmp() ordering of the key relative to that entry.
+static int compare_strings( const void *arg1, const void *arg2 )
+{
+    char const * const pKey = static_cast<char const *>(arg1);
+    char const * const pTableEntry = *static_cast<char const * const *>(arg2);
+    return strcmp( pKey, pTableEntry );
+}
+
+}
+
+namespace
+{
+    // True if c is a letter: the ASCII test runs first, u_isalpha() is only asked for the rest
+    bool isAlpha(sal_Unicode c)
+    {
+        return rtl::isAsciiAlpha(c) || u_isalpha(c);
+    }
+}
+
+class SyntaxHighlighter::Tokenizer // splits a line of text into typed tokens on behalf of SyntaxHighlighter
+{
+ // Character information table: the CharFlags of each character code 0..255, filled in by the constructor
+ CharFlags aCharTypeTab[256] = {};
+
+ // Auxiliary function: tests whether character c carries any of the flags in nTestFlags
+ bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
+
+ // Get next token; returns false when pos has reached end, i.e. nothing more is left to read
+ bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
+ /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;
+
+ const char** ppListKeyWords; // keyword table searched with bsearch(); nullptr until setKeyWords() is called
+ sal_uInt16 nKeyWordCount; // number of entries in ppListKeyWords
+
+public:
+ HighlighterLanguage const aLanguage; // language given to the constructor; selects the Basic/SQL specific rules
+
+ explicit Tokenizer( HighlighterLanguage aLang );
+
+ void getHighlightPortions(std::u16string_view rLine,
+ /*out*/std::vector<HighlightPortion>& portions) const; // appends one portion per token found in rLine
+ void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount ); // stores the pointer and count; the table is not copied
+};
+
+// Helper function: does character c carry at least one of the flags in nTestFlags?
+// - NUL never matches
+// - codes 1..255 are answered from the character table
+// - codes above 255 only count as identifier characters, and only if they are letters
+bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
+{
+    if( c == 0 )
+        return false;
+
+    if( c <= 255 )
+        return bool(aCharTypeTab[c] & nTestFlags);
+
+    // Check the cheap flag test first so isAlpha() is only called when an identifier flag is asked for
+    const bool bIdentifierFlagWanted
+        = bool(( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags);
+    return bIdentifierFlagWanted && isAlpha(c);
+}
+
+// Installs the keyword table used by getNextToken(); only the pointer and the
+// entry count are stored, the table itself is not copied
+void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
+{
+    nKeyWordCount = nCount;
+    ppListKeyWords = ppKeyWords;
+}
+/* Scan a single token from the range [pos, end).
+ *
+ * pos        in/out: where scanning starts; on return it points just past the token
+ * end        end of the text being scanned
+ * reType     out: classification of the token; TokenType::Unknown if no branch below matched
+ * rpStartPos out: start of the token (the value pos had on entry)
+ * rpEndPos   out: end of the token; only written when true is returned
+ *
+ * Returns false, without consuming anything, when pos == end; true otherwise.
+ */
+bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
+ /*out*/TokenType& reType,
+ /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
+{
+ reType = TokenType::Unknown;
+
+ rpStartPos = pos;
+
+ if( pos == end )
+ return false;
+
+ sal_Unicode c = *pos; // first character of the token; it decides which branch below is taken
+ ++pos;
+
+ //*** Go through all possibilities ***
+ // Space?
+ if ( testCharFlags( c, CharFlags::Space ) )
+ {
+ while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
+ ++pos;
+
+ reType = TokenType::Whitespace;
+ }
+
+ // Identifier?
+ else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
+ {
+ bool bIdentifierChar; // always assigned before it is read: the end-of-input break skips the loop condition
+ do
+ {
+ if (pos == end)
+ break;
+ // Fetch next character
+ c = *pos;
+ bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
+ if( bIdentifierChar )
+ ++pos;
+ }
+ while( bIdentifierChar );
+
+ reType = TokenType::Identifier;
+
+ // Keyword table
+ if (ppListKeyWords != nullptr)
+ {
+ int nCount = pos - rpStartPos;
+
+ // No keyword if string contains char > 255
+ bool bCanBeKeyword = true;
+ for( int i = 0 ; i < nCount ; i++ )
+ {
+ if( rpStartPos[i] > 255 )
+ {
+ bCanBeKeyword = false;
+ break;
+ }
+ }
+
+ if( bCanBeKeyword )
+ {
+ std::u16string_view aKWString(&*rpStartPos, nCount);
+ OString aByteStr = OUStringToOString(aKWString,
+ RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase(); // lower-cased so the keyword lookup ignores case
+ if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
+ compare_strings ) )
+ {
+ reType = TokenType::Keywords;
+
+ if( aByteStr == "rem" )
+ {
+ // Consume all characters until end of line or end of input; they become part of the comment token
+ for (;;)
+ {
+ if (pos == end)
+ break;
+ sal_Unicode cPeek = *pos;
+ if ( testCharFlags( cPeek, CharFlags::EOL ) )
+ break;
+ ++pos;
+ }
+
+ reType = TokenType::Comment;
+ }
+ }
+ }
+ }
+ }
+
+ // Operator?
+ // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
+ else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
+ {
+ // parameters for SQL view
+ if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
+ {
+ if (c!='?') // only ':' is followed by a parameter name; '?' is a token on its own
+ {
+ bool bIdentifierChar;
+ do
+ {
+ // Get next character
+ if (pos == end)
+ break;
+ c = *pos;
+ bIdentifierChar = isAlpha(c); // letters only: any other character ends the parameter name
+ if( bIdentifierChar )
+ ++pos;
+ }
+ while( bIdentifierChar );
+ }
+ reType = TokenType::Parameter;
+ }
+ else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
+ {
+ if (pos != end && *pos=='-') // "--" starts an SQL comment
+ {
+ // Consume all characters until end of line or end of input
+ while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
+ {
+ ++pos;
+ }
+ reType = TokenType::Comment;
+ }
+ else
+ reType = TokenType::Operator;
+ }
+ else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
+ {
+ if (pos != end && *pos=='/') // "//" is treated as a comment start, too
+ {
+ // Consume all characters until end of line or end of input
+ while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
+ {
+ ++pos;
+ }
+ reType = TokenType::Comment;
+ }
+ else
+ reType = TokenType::Operator;
+ }
+ else
+ {
+ // Apostrophe is Basic comment
+ if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
+ {
+ // Skip all characters until end of input or end of line:
+ for (;;) {
+ if (pos == end)
+ break;
+ c = *pos;
+ if (testCharFlags(c, CharFlags::EOL)) {
+ break;
+ }
+ ++pos;
+ }
+
+ reType = TokenType::Comment;
+ }
+
+ // A real operator: every operator character is reported as its own token, because
+ // only the fact that it is an operator matters, not which one (e.g. +=) it is
+ if( reType != TokenType::Comment )
+ {
+ reType = TokenType::Operator;
+ }
+
+ }
+ }
+
+ // Object separator? Must be handled before Number
+ else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
+ {
+ reType = TokenType::Operator;
+ }
+
+ // Number?
+ else if( testCharFlags( c, CharFlags::StartNumber ) )
+ {
+ reType = TokenType::Number;
+
+ // Number system, 10 = normal, it is changed for Oct/Hex
+ int nRadix = 10;
+
+ // Is it an Oct or a Hex number?
+ if( c == '&' )
+ {
+ // Octal?
+ if( pos != end && (*pos == 'o' || *pos == 'O' ))
+ {
+ // consume the o
+ ++pos;
+ nRadix = 8; // Octal base
+
+ // Read all octal digits
+ while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
+ ++pos;
+ }
+ // Hexadecimal?
+ else if( pos != end && (*pos == 'h' || *pos == 'H' ))
+ {
+ // consume the h
+ ++pos;
+ nRadix = 16; // Hexadecimal base
+
+ // Read all hexadecimal digits
+ while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
+ ++pos;
+ }
+ else
+ {
+ reType = TokenType::Operator; // a '&' without o/h prefix letter is an operator, not a number
+ }
+ }
+
+ // When it is not Oct or Hex, then it is a decimal (possibly floating point) number
+ if( reType == TokenType::Number && nRadix == 10 )
+ {
+ // Flag if the last character is an exponent
+ bool bAfterExpChar = false;
+
+ // Read all characters that may appear in a decimal number
+ while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
+ (bAfterExpChar && *pos == '+' ) ||
+ (bAfterExpChar && *pos == '-' ) ))
+ // After exponent +/- are OK, too
+ {
+ c = *pos++;
+ bAfterExpChar = ( c == 'e' || c == 'E' );
+ }
+ }
+ }
+
+ // String?
+ else if( testCharFlags( c, CharFlags::StartString ) )
+ {
+ // Remember which character has opened the string
+ sal_Unicode cEndString = c;
+ if( c == '[' )
+ cEndString = ']';
+
+ // Read all characters
+ while( pos == end || *pos != cEndString ) // pos == end also enters the loop so that the check below reports the error
+ {
+ // Detect EOF before reading next char, so we do not lose EOF
+ if( pos == end )
+ {
+ // ERROR: unterminated string literal
+ reType = TokenType::Error;
+ break;
+ }
+ c = *pos++;
+ if( testCharFlags( c, CharFlags::EOL ) )
+ {
+ // ERROR: unterminated string literal
+ reType = TokenType::Error;
+ break;
+ }
+ }
+
+ if( reType != TokenType::Error )
+ {
+ ++pos; // step over the closing delimiter
+ if( cEndString == ']' )
+ reType = TokenType::Identifier; // a [bracketed] name is reported as an identifier
+ else
+ reType = TokenType::String;
+ }
+ }
+
+ // End of line?
+ else if( testCharFlags( c, CharFlags::EOL ) )
+ {
+ // If another EOL character comes, read it
+ if (pos != end)
+ {
+ sal_Unicode cNext = *pos;
+ if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) ) // only a different EOL character ("\r\n" or "\n\r") is merged
+ ++pos;
+ }
+
+ reType = TokenType::EOL;
+ }
+
+ // All other will remain TokenType::Unknown
+
+ // Save end position
+ rpEndPos = pos;
+ return true;
+}
+
+// Builds the character classification table for codes 0..255 and starts out
+// without a keyword table
+SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
+{
+    // No keywords until setKeyWords() is called
+    ppListKeyWords = nullptr;
+    nKeyWordCount = 0;
+
+    // ORs nFlags into the table entry of every character in the NUL-terminated list pChars
+    const auto addFlags = [this]( const char* pChars, CharFlags nFlags )
+    {
+        for( ; *pChars != '\0' ; ++pChars )
+            aCharTypeTab[static_cast<unsigned char>(*pChars)] |= nFlags;
+    };
+
+    // Letters, '_' and '$' may start and continue an identifier
+    addFlags( "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_$",
+              CharFlags::StartIdentifier | CharFlags::InIdentifier );
+
+    // Decimal digits may continue an identifier and may start or continue a number
+    addFlags( "0123456789",
+              CharFlags::InIdentifier | CharFlags::StartNumber |
+              CharFlags::InNumber | CharFlags::InHexNumber );
+
+    // Exponent letters, decimal point and the '&' prefix of &O.. / &H.. literals
+    addFlags( "eE", CharFlags::InNumber );
+    addFlags( ".", CharFlags::InNumber | CharFlags::StartNumber );
+    addFlags( "&", CharFlags::StartNumber );
+
+    // Hexadecimal letters (the digits were flagged above) and octal digits
+    addFlags( "abcdefABCDEF", CharFlags::InHexNumber );
+    addFlags( "01234567", CharFlags::InOctNumber );
+
+    // Characters that open a string literal
+    addFlags( "'\"[`", CharFlags::StartString );
+
+    // Operator characters.
+    // '&' is deliberately not an operator (removed because of #i14140),
+    // and neither is '[' (removed because of #i17826).
+    addFlags( "!%()*+,-/:<=>?^|~{}];", CharFlags::Operator );
+
+    // Blank and tab
+    addFlags( " \t", CharFlags::Space );
+
+    // End of line characters
+    addFlags( "\r\n", CharFlags::EOL );
+}
+
+// Tokenizes rLine from front to back and appends one HighlightPortion per token,
+// with begin/end given as offsets from the start of rLine
+void SyntaxHighlighter::Tokenizer::getHighlightPortions(std::u16string_view rLine,
+    /*out*/std::vector<HighlightPortion>& portions) const
+{
+    const auto lineBegin = rLine.begin();
+    const auto lineEnd = rLine.end();
+
+    // Scan position; advanced by getNextToken()
+    auto cursor = lineBegin;
+
+    // Filled in by getNextToken() for every token
+    TokenType eTokenType;
+    std::u16string_view::const_iterator tokenBegin;
+    std::u16string_view::const_iterator tokenEnd;
+
+    for (;;)
+    {
+        if( !getNextToken( cursor, lineEnd, eTokenType, tokenBegin, tokenEnd ) )
+            break; // nothing left in the line
+
+        portions.emplace_back( tokenBegin - lineBegin, tokenEnd - lineBegin, eTokenType );
+    }
+}
+
+
+// Creates the tokenizer for the given language and hands it the matching keyword table
+SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
+    m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
+{
+    if (language == HighlighterLanguage::Basic)
+    {
+        m_tokenizer->setKeyWords( strListBasicKeyWords,
+                                  std::size( strListBasicKeyWords ));
+    }
+    else if (language == HighlighterLanguage::SQL)
+    {
+        m_tokenizer->setKeyWords( strListSqlKeyWords,
+                                  std::size( strListSqlKeyWords ));
+    }
+    else
+    {
+        assert(false); // this cannot happen
+    }
+}
+
+SyntaxHighlighter::~SyntaxHighlighter() = default; // defined here, where Tokenizer is a complete type
+
+// Public entry point: forwards to the tokenizer, which appends the portions found in rLine
+void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
+    /*out*/std::vector<HighlightPortion>& portions) const
+{
+    Tokenizer const & rTokenizer = *m_tokenizer;
+    rTokenizer.getHighlightPortions( rLine, portions );
+}
+
+// Returns the language this highlighter was constructed for
+HighlighterLanguage SyntaxHighlighter::GetLanguage() const
+{
+    Tokenizer const & rTokenizer = *m_tokenizer;
+    return rTokenizer.aLanguage;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */