1 files changed, 717 insertions, 0 deletions
diff --git a/basic/source/comp/scanner.cxx b/basic/source/comp/scanner.cxx
new file mode 100644
index 000000000..b5a923969
--- /dev/null
+++ b/basic/source/comp/scanner.cxx
@@ -0,0 +1,717 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <basiccharclass.hxx>
+#include <scanner.hxx>
+#include <sbintern.hxx>
+#include <runtime.hxx>
+
+#include <basic/sberrors.hxx>
+#include <i18nlangtag/lang.h>
+#include <svl/numformat.hxx>
+#include <svl/zforlist.hxx>
+#include <rtl/character.hxx>
+#include <o3tl/string_view.hxx>
+#include <utility>
+
+SbiScanner::SbiScanner(OUString _aBuf, StarBASIC* p) :
+    aBuf(std::move(_aBuf)),           // source text; readLine() cuts it into lines
+    nLineIdx(-1),                     // -1: NextSym() has to read a line first
+    nSaveLineIdx(-1),
+    pBasic(p),                        // receives the errors from GenError(), which tolerates null
+    eScanType(SbxVARIANT),
+    nVal(0),
+    nSavedCol1(0),                    // nCol1 as saved by LockColumn()
+    nCol(0),
+    nErrors(0),                       // incremented by GenError()
+    nColLock(0),                      // nesting depth of LockColumn()
+    nBufPos(0),                       // position in aBuf where the next line starts
+    nLine(0),                         // incremented by readLine() for every line read
+    nCol1(0),
+    nCol2(0),
+    bSymbol(false),
+    bNumber(false),
+    bSpaces(false),
+    bAbort(false),
+    bHash(true),
+    bError(false),
+    bCompatible(false),
+    bVBASupportOn(false),
+    bPrevLineExtentsComment(false),
+    bClosingUnderscore(false),
+    bLineEndsWithWhitespace(false),
+    bInStatement(false)
+{
+}
+
+void SbiScanner::LockColumn()
+{
+    // locks nest; only the outermost one saves the start column (nCol1) in nSavedCol1
+    if( nColLock++ == 0 ) nSavedCol1 = nCol1;
+}
+
+void SbiScanner::UnlockColumn()
+{
+    // release one nesting level; a call without a matching LockColumn() does nothing
+    if( nColLock != 0 ) --nColLock;
+}
+
+void SbiScanner::GenError( ErrCode code )
+{
+    if( GetSbData()->bBlockCompilerError )
+    {
+        // compiler errors are blocked: request an abort, report and count nothing
+        bAbort = true;
+        return;
+    }
+    // only the first error (while bError is unset) is passed on; every call is counted
+    if( !bError )
+    {
+        bError = true;
+        bool bOk = true;
+        if( pBasic )
+        {
+            const bool bRefersToLastToken = code.anyOf(
+                    ERRCODE_BASIC_EXPECTED,
+                    ERRCODE_BASIC_UNEXPECTED,
+                    ERRCODE_BASIC_SYMBOL_EXPECTED,
+                    ERRCODE_BASIC_LABEL_EXPECTED);
+            // a locked column wins, unless the error always refers to the last token
+            const sal_Int32 nStartCol = (nColLock && !bRefersToLastToken) ? nSavedCol1 : nCol1;
+            if( bRefersToLastToken && nStartCol > nCol2 )
+                nCol2 = nStartCol;
+            bOk = pBasic->CError( code, aError, nLine, nStartCol, nCol2 );
+        }
+        const bool bFatal = ( code == ERRCODE_BASIC_NO_MEMORY || code == ERRCODE_BASIC_PROG_TOO_LARGE );
+        if( !bOk || bFatal )
+            bAbort = true;
+    }
+    ++nErrors;
+}
+
+
+// used by SbiTokenizer::MayBeLabel() to detect a label: consumes a ':' at the current column
+bool SbiScanner::DoesColonFollow()
+{
+    const bool bColon = nCol < aLine.getLength() && aLine[nCol] == ':';
+    if( bColon )
+    {
+        // step over the colon, keeping both positions in sync
+        ++nCol;
+        ++nLineIdx;
+    }
+    return bColon;
+}
+
+// map a type suffix character to its data type; SbxVARIANT means c is not a suffix
+static SbxDataType GetSuffixType( sal_Unicode c )
+{
+    struct SuffixEntry { sal_Unicode cSuffix; SbxDataType eType; };
+    static constexpr SuffixEntry aSuffixTable[] =
+    {
+        { '%', SbxINTEGER },
+        { '&', SbxLONG },
+        { '!', SbxSINGLE },
+        { '#', SbxDOUBLE },
+        { '@', SbxCURRENCY },
+        { '$', SbxSTRING },
+    };
+    for (const SuffixEntry& rEntry : aSuffixTable)
+    {
+        if (rEntry.cSuffix == c)
+            return rEntry.eType;
+    }
+    // no legal suffix
+    return SbxVARIANT;
+}
+
+// NextSym() reads the next symbol into the variables aSym, nVal and eScanType;
+// its return value is false at EOF or when no symbol could be scanned
+#define BUF_SIZE 80
+
+void SbiScanner::scanAlphanumeric()
+{
+    // aSym := the run of alphanumeric characters and underscores that starts at nCol
+    const sal_Int32 nStart = nCol;
+    for (; nCol < aLine.getLength(); ++nCol, ++nLineIdx)
+    {
+        if (!BasicCharClass::isAlphaNumeric(aLine[nCol], bCompatible) && aLine[nCol] != '_') break;
+    }
+    aSym = aLine.copy(nStart, nCol - nStart);
+}
+
+void SbiScanner::scanGoto() // NextSym() calls this after scanning "go": folds "go to" into the symbol "goto"
+{
+    sal_Int32 n = nCol;
+    while(n < aLine.getLength() && BasicCharClass::isWhitespace(aLine[n]))
+        ++n;
+
+    // "to" has to be a word of its own, otherwise "go total" would be scanned as "goto" + "tal"
+    const sal_Int32 nEnd = n + 2;
+    if(nEnd <= aLine.getLength() && o3tl::equalsIgnoreAsciiCase(aLine.subView(n, 2), u"to")
+       && (nEnd == aLine.getLength()
+           || !(BasicCharClass::isAlphaNumeric(aLine[nEnd], bCompatible) || aLine[nEnd] == '_')))
+    {
+        aSym = "goto";
+        nLineIdx += nEnd - nCol;    // consume the whitespace and the "to"
+        nCol = nEnd;
+    }
+}
+
+bool SbiScanner::readLine()
+{
+    const sal_Int32 nBufLen = aBuf.getLength();
+    if(nBufPos >= nBufLen)
+        return false;   // nothing left in the buffer
+
+    // find the line terminator (or the end of the buffer)
+    sal_Int32 nEol = nBufPos;
+    while(nEol < nBufLen && aBuf[nEol] != '\r' && aBuf[nEol] != '\n')
+        ++nEol;
+
+    // strip trailing whitespace from the line ...
+    sal_Int32 nTextEnd = nEol;
+    while(nTextEnd > nBufPos && BasicCharClass::isWhitespace(aBuf[nTextEnd - 1]))
+        --nTextEnd;
+
+    // ... but remember whether there was any (tdf#149402)
+    bLineEndsWithWhitespace = (nTextEnd < nEol);
+    aLine = aBuf.copy(nBufPos, nTextEnd - nBufPos);
+
+    // step over the terminator: "\r\n" is skipped as a pair, anything else as one character
+    const bool bCrLf = nEol + 1 < nBufLen && aBuf[nEol] == '\r' && aBuf[nEol + 1] == '\n';
+    if(nEol < nBufLen)
+        nEol += bCrLf ? 2 : 1;
+
+    nBufPos = nEol;
+
+    // a new line begins: count it and reset the positions, columns and the column lock
+    ++nLine;
+    nLineIdx = 0;
+    nCol = nCol1 = nCol2 = 0;
+    nColLock = 0;
+
+    return true;
+}
+
+bool SbiScanner::NextSym() // scans the next symbol into aSym, nVal and eScanType
+{   // returns false when the buffer is exhausted or when no character could be consumed
+    // memorize for the EOLN-case
+    sal_Int32 nOldLine = nLine;
+    sal_Int32 nOldCol1 = nCol1;
+    sal_Int32 nOldCol2 = nCol2;
+    sal_Unicode buf[ BUF_SIZE ], *p = buf;
+
+    eScanType = SbxVARIANT;
+    aSym.clear();
+    bHash = bSymbol = bNumber = bSpaces = false;
+    bool bCompilerDirective = false;
+
+    // no current line (nLineIdx == -1): read the next one
+    if (nLineIdx == -1)
+    {
+        if(!readLine())
+            return false;
+
+        nOldLine = nLine;
+        nOldCol1 = nOldCol2 = 0;
+    }
+
+    const sal_Int32 nLineIdxScanStart = nLineIdx;
+
+    if(nCol < aLine.getLength() && BasicCharClass::isWhitespace(aLine[nCol]))
+    {
+        bSpaces = true;
+        while(nCol < aLine.getLength() && BasicCharClass::isWhitespace(aLine[nCol]))
+        {
+            ++nLineIdx;
+            ++nCol;
+        }
+    }
+
+    nCol1 = nCol;
+
+    // nothing (left) on this line?
+    if(nCol >= aLine.getLength())
+        goto eoln;
+
+    if( bPrevLineExtentsComment )
+        goto PrevLineCommentLbl;
+
+    if(nCol < aLine.getLength() && aLine[nCol] == '#')
+    {
+        sal_Int32 nLineTempIdx = nLineIdx;
+        do
+        {
+            nLineTempIdx++;
+        } while (nLineTempIdx < aLine.getLength() && !BasicCharClass::isWhitespace(aLine[nLineTempIdx])
+            && aLine[nLineTempIdx] != '#' && aLine[nLineTempIdx] != ',');
+        // leave it if it is a date literal - it will be handled later
+        if (nLineTempIdx >= aLine.getLength() || aLine[nLineTempIdx] != '#')
+        {
+            ++nLineIdx;
+            ++nCol;
+            //ignore compiler directives (# is first non-space character)
+            if (nOldCol2 == 0)
+                bCompilerDirective = true;
+            else
+                bHash = true;
+        }
+    }
+
+    // copy character if symbol
+    if(nCol < aLine.getLength() && (BasicCharClass::isAlpha(aLine[nCol], bCompatible) || aLine[nCol] == '_'))
+    {
+        // if there's nothing behind '_' , it's the end of a line!
+        if(nCol + 1 == aLine.getLength() && aLine[nCol] == '_')
+        {
+            // Note that nCol is not incremented here...
+            ++nLineIdx;
+            goto eoln;
+        }
+
+        bSymbol = true;
+
+        scanAlphanumeric();
+
+        // Special handling for "go to"
+        if(nCol < aLine.getLength() && bCompatible && aSym.equalsIgnoreAsciiCase("go"))
+            scanGoto();
+
+        // tdf#125637 - check for closing underscore
+        if (nCol == aLine.getLength() && aLine[nCol - 1] == '_')
+        {
+            bClosingUnderscore = true;
+        }
+        // type recognition?
+        // don't test the exclamation mark
+        // if there's a symbol behind it
+        else if((nCol >= aLine.getLength() || aLine[nCol] != '!') ||
+                (nCol + 1 >= aLine.getLength() || !BasicCharClass::isAlpha(aLine[nCol + 1], bCompatible)))
+        {
+            if(nCol < aLine.getLength())
+            {
+                SbxDataType t(GetSuffixType(aLine[nCol]));
+                if( t != SbxVARIANT )
+                {
+                    eScanType = t;
+                    ++nLineIdx;
+                    ++nCol;
+                }
+            }
+        }
+    }
+
+    // read in and convert if number
+    else if((nCol < aLine.getLength() && rtl::isAsciiDigit(aLine[nCol])) ||
+            (nCol + 1 < aLine.getLength() && aLine[nCol] == '.' && rtl::isAsciiDigit(aLine[nCol + 1])))
+    {
+        short exp = 0;
+        short dec = 0;
+        eScanType = SbxDOUBLE;
+        bool bScanError = false;
+        bool bBufOverflow = false;
+        // Hand-scanned because of the 'D'/'d' exponent; strchr() converts to char, so only offer it non-NUL ASCII
+        while(!bScanError && nCol < aLine.getLength() && aLine[nCol] != 0 && aLine[nCol] < 0x80 && strchr("0123456789.DEde", aLine[nCol]))
+        {
+            // buffer full? -> keep consuming the number without storing it
+            if( (p-buf) == (BUF_SIZE-1) )
+            {
+                bBufOverflow = true;
+                ++nLineIdx;
+                ++nCol;
+                continue;
+            }
+            // point or exponent?
+            if(aLine[nCol] == '.')
+            {
+                if( ++dec > 1 )
+                    bScanError = true;
+                else
+                    *p++ = '.';
+            }
+            else if(strchr("DdEe", aLine[nCol]))
+            {
+                if (++exp > 1)
+                    bScanError = true;
+                else
+                {
+                    *p++ = 'E';
+                    if (nCol + 1 < aLine.getLength() && (aLine[nCol+1] == '+' || aLine[nCol+1] == '-'))
+                    {
+                        ++nLineIdx;
+                        ++nCol;
+                        if( (p-buf) == (BUF_SIZE-1) )
+                        {
+                            bBufOverflow = true;
+                            continue;
+                        }
+                        *p++ = aLine[nCol];
+                    }
+                }
+            }
+            else
+            {
+                *p++ = aLine[nCol];
+            }
+            ++nLineIdx;
+            ++nCol;
+        }
+        *p = 0;
+        aSym = p; bNumber = true;
+
+        // For bad characters, scan and parse errors generate only one error.
+        ErrCode nError = ERRCODE_NONE;
+        if (bScanError)
+        {
+            --nLineIdx;
+            --nCol;
+            aError = OUString( aLine[nCol]);
+            nError = ERRCODE_BASIC_BAD_CHAR_IN_NUMBER;
+        }
+
+        rtl_math_ConversionStatus eStatus = rtl_math_ConversionStatus_Ok;
+        const sal_Unicode* pParseEnd = buf;
+        nVal = rtl_math_uStringToDouble( buf, buf+(p-buf), '.', ',', &eStatus, &pParseEnd );
+        if (pParseEnd != buf+(p-buf))
+        {
+            // e.g. "12e" or "12e+", or with bScanError "12d"+"E".
+            sal_Int32 nChars = buf+(p-buf) - pParseEnd;
+            nLineIdx -= nChars;
+            nCol -= nChars;
+            // For bScanError, nLineIdx and nCol were already decremented, just
+            // add that character to the parse end.
+            if (bScanError)
+                ++nChars;
+            // Copy error position from original string, not the buffer
+            // replacement where "12dE" => "12EE".
+            aError = aLine.copy( nCol, nChars);
+            nError = ERRCODE_BASIC_BAD_CHAR_IN_NUMBER;
+        }
+        else if (eStatus != rtl_math_ConversionStatus_Ok)
+        {
+            // Keep the scan error and character at position, if any.
+            if (!nError)
+                nError = ERRCODE_BASIC_MATH_OVERFLOW;
+        }
+
+        if (nError)
+            GenError( nError );
+
+        if( !dec && !exp )
+        {
+            if( nVal >= SbxMININT && nVal <= SbxMAXINT )
+                eScanType = SbxINTEGER;
+            else if( nVal >= SbxMINLNG && nVal <= SbxMAXLNG )
+                    eScanType = SbxLONG;
+        }
+
+        if( bBufOverflow )
+            GenError( ERRCODE_BASIC_MATH_OVERFLOW );
+
+        // type recognition?
+        if( nCol < aLine.getLength() )
+        {
+            SbxDataType t(GetSuffixType(aLine[nCol]));
+            if( t != SbxVARIANT )
+            {
+                eScanType = t;
+                ++nLineIdx;
+                ++nCol;
+            }
+            // tdf#130476 - don't allow String trailing data type character with numbers
+            if ( t == SbxSTRING )
+            {
+                GenError( ERRCODE_BASIC_SYNTAX );
+            }
+        }
+    }
+
+    // Hex/octal number? Read in and convert:
+    else if(aLine.getLength() - nCol > 1 && aLine[nCol] == '&')
+    {
+        ++nLineIdx; ++nCol;
+        sal_Unicode base = 16;
+        sal_Unicode xch  = aLine[nCol];
+        ++nLineIdx; ++nCol;
+        switch( rtl::toAsciiUpperCase( xch ) )
+        {
+            case 'O':
+                base = 8;
+                break;
+            case 'H':
+                break;
+            default :
+                // treated as an operator
+                --nLineIdx; --nCol; nCol1 = nCol-1;
+                aSym = "&";
+                return true;
+        }
+        bNumber = true;
+        // Hex literals are signed Integers (as defined by basic),
+        // e.g. -2,147,483,648 through 2,147,483,647
+        sal_uInt64 lu = 0;
+        bool bOverflow = false;
+        while(nCol < aLine.getLength() && BasicCharClass::isAlphaNumeric(aLine[nCol], false))
+        {
+            sal_Unicode ch = rtl::toAsciiUpperCase(aLine[nCol]);
+            ++nLineIdx; ++nCol;
+            if( ((base == 16 ) && rtl::isAsciiHexDigit( ch ) ) ||
+                     ((base == 8) && rtl::isAsciiOctalDigit( ch )))
+            {
+                int i = ch  - '0';
+                if( i > 9 ) i -= 7; // 'A'..'F' follow '9' after a gap of 7 characters
+                lu = ( lu * base ) + i;
+                if( lu > SAL_MAX_UINT32 )
+                {
+                    bOverflow = true;
+                }
+            }
+            else
+            {
+                aError = OUString(ch);
+                GenError( ERRCODE_BASIC_BAD_CHAR_IN_NUMBER );
+            }
+        }
+
+        // tdf#130476 - take into account trailing data type characters
+        if( nCol < aLine.getLength() )
+        {
+            SbxDataType t(GetSuffixType(aLine[nCol]));
+            if( t != SbxVARIANT )
+            {
+                eScanType = t;
+                ++nLineIdx;
+                ++nCol;
+            }
+            // tdf#130476 - don't allow String trailing data type character with numbers
+            if ( t == SbxSTRING )
+            {
+                GenError( ERRCODE_BASIC_SYNTAX );
+            }
+        }
+
+        // tdf#130476 - take into account trailing data type characters
+        switch ( eScanType )
+        {
+            case SbxINTEGER:
+                nVal = static_cast<double>( static_cast<sal_Int16>(lu) );
+                if ( lu > SbxMAXUINT )
+                {
+                    bOverflow = true;
+                }
+                break;
+            case SbxLONG: nVal = static_cast<double>( static_cast<sal_Int32>(lu) ); break;
+            case SbxVARIANT:
+            {
+                // tdf#62326 - If the value of the hex string without explicit type character lies within
+                // the range of 0x8000 (SbxMAXINT + 1) and 0xFFFF (SbxMAXUINT) inclusive, cast the value
+                // to 16 bit in order to get signed integers, e.g., SbxMININT through SbxMAXINT
+                sal_Int32 ls = (lu > SbxMAXINT && lu <= SbxMAXUINT) ? static_cast<sal_Int16>(lu) : static_cast<sal_Int32>(lu);
+                eScanType = ( ls >= SbxMININT && ls <= SbxMAXINT ) ? SbxINTEGER : SbxLONG;
+                nVal = static_cast<double>(ls);
+                break;
+            }
+            default:
+                nVal = static_cast<double>(lu);
+                break;
+        }
+        if( bOverflow )
+            GenError( ERRCODE_BASIC_MATH_OVERFLOW );
+    }
+
+    // Strings ("...") and bracketed symbols ([...]):
+    else if (nLineIdx < aLine.getLength() && (aLine[nLineIdx] == '"' || aLine[nLineIdx] == '['))
+    {
+        sal_Unicode cSep = aLine[nLineIdx];
+        if( cSep == '[' )
+        {
+            bSymbol = true;
+            cSep = ']';
+        }
+        sal_Int32 n = nCol + 1;
+        while (nLineIdx < aLine.getLength())
+        {
+            do
+            {
+                nLineIdx++;
+                nCol++;
+            }
+            while (nLineIdx < aLine.getLength() && (aLine[nLineIdx] != cSep));
+            if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == cSep)
+            {
+                nLineIdx++; nCol++;
+                if (nLineIdx >= aLine.getLength() || aLine[nLineIdx] != cSep || cSep == ']')
+                {
+                    // If VBA Interop then doesn't eat the [] chars
+                    if ( cSep == ']' && bVBASupportOn )
+                        aSym = aLine.copy( n - 1, nCol - n  + 1);
+                    else
+                        aSym = aLine.copy( n, nCol - n - 1 );
+                    // get out duplicate string delimiters
+                    OUStringBuffer aSymBuf(aSym.getLength());
+                    for ( sal_Int32 i = 0, len = aSym.getLength(); i < len; ++i )
+                    {
+                        aSymBuf.append( aSym[i] );
+                        if ( aSym[i] == cSep && ( i+1 < len ) && aSym[i+1] == cSep )
+                            ++i;
+                    }
+                    aSym = aSymBuf.makeStringAndClear();
+                    if( cSep != ']' )
+                        eScanType = SbxSTRING;
+                    break;
+                }
+            }
+            else
+            {
+                aError = OUString(cSep);
+                GenError( ERRCODE_BASIC_EXPECTED );
+            }
+        }
+    }
+
+    // Date:
+    else if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == '#')
+    {
+        sal_Int32 n = nCol + 1;
+        do
+        {
+            nLineIdx++;
+            nCol++;
+        }
+        while (nLineIdx < aLine.getLength() && (aLine[nLineIdx] != '#'));
+        if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == '#')
+        {
+            nLineIdx++; nCol++;
+            aSym = aLine.copy( n, nCol - n - 1 );
+
+            // parse date literal
+            std::shared_ptr<SvNumberFormatter> pFormatter;
+            if (GetSbData()->pInst)
+            {
+                pFormatter = GetSbData()->pInst->GetNumberFormatter();
+            }
+            else
+            {
+                sal_uInt32 nDummy;
+                pFormatter = SbiInstance::PrepareNumberFormatter( nDummy, nDummy, nDummy );
+            }
+            sal_uInt32 nIndex = pFormatter->GetStandardIndex( LANGUAGE_ENGLISH_US);
+            bool bSuccess = pFormatter->IsNumberFormat(aSym, nIndex, nVal);
+            if( bSuccess )
+            {
+                SvNumFormatType nType_ = pFormatter->GetType(nIndex);
+                if( !(nType_ & SvNumFormatType::DATE) )
+                    bSuccess = false;
+            }
+
+            if (!bSuccess)
+                GenError( ERRCODE_BASIC_CONVERSION );
+
+            bNumber = true;
+            eScanType = SbxDOUBLE;
+        }
+        else
+        {
+            aError = OUString('#');
+            GenError( ERRCODE_BASIC_EXPECTED );
+        }
+    }
+    // invalid characters:
+    else if (nLineIdx < aLine.getLength() && aLine[nLineIdx] >= 0x7F)
+    {
+        GenError( ERRCODE_BASIC_SYNTAX ); nLineIdx++; nCol++;
+    }
+    // other groups: a single character, or one of the pairs "<>", "<=", ">=", ":="
+    else
+    {
+        sal_Int32 n = 1;
+        auto nChar = nLineIdx < aLine.getLength() ? aLine[nLineIdx] : 0;
+        ++nLineIdx;
+        if (nLineIdx < aLine.getLength())
+        {
+            switch (nChar)
+            {
+                case '<': if( aLine[nLineIdx] == '>' || aLine[nLineIdx] == '=' ) n = 2; break;
+                case '>': if( aLine[nLineIdx] == '=' ) n = 2; break;
+                case ':': if( aLine[nLineIdx] == '=' ) n = 2; break;
+            }
+        }
+        aSym = aLine.copy(nCol, std::min(n, aLine.getLength() - nCol));
+        nLineIdx += n-1; nCol = nCol + n;
+    }
+
+    nCol2 = nCol-1;
+
+PrevLineCommentLbl:
+
+    if( bPrevLineExtentsComment || (eScanType != SbxSTRING &&
+                                    ( bCompilerDirective ||
+                                      aSym.startsWith("'") ||
+                                      aSym.equalsIgnoreAsciiCase( "REM" ) ) ) )
+    {
+        bPrevLineExtentsComment = false;
+        aSym = "REM";
+        sal_Int32 nLen = aLine.getLength() - nLineIdx;
+        // tdf#149402 - don't extend comment if line ends in a whitespace (BasicCharClass::isWhitespace)
+        if (bCompatible && !bLineEndsWithWhitespace && aLine.getLength() >= 2
+            && aLine[nLineIdx + nLen - 1] == '_' && aLine[nLineIdx + nLen - 2] == ' ')
+            bPrevLineExtentsComment = true;
+        nCol2 = nCol2 + nLen;
+        nLineIdx = -1;
+    }
+
+    if (nLineIdx == nLineIdxScanStart)
+    {
+        GenError( ERRCODE_BASIC_SYMBOL_EXPECTED );
+        return false;
+    }
+
+    return true;
+
+
+eoln: // end of line: continue with the next line after a trailing '_', otherwise deliver "\n"
+    if (nCol && nLineIdx > 0 && nLineIdx <= aLine.getLength() && aLine[nLineIdx - 1] == '_' && !bClosingUnderscore)
+    {
+        nLineIdx = -1;
+        bool bRes = NextSym();
+        if( aSym.startsWith(".") )
+        {
+            // object _
+            //    .Method
+            // ^^^  <- spaces is legal in MSO VBA
+            bSpaces = false;
+        }
+        return bRes;
+    }
+    else
+    {
+        nLineIdx = -1;
+        nLine = nOldLine;
+        nCol1 = nOldCol1;
+        nCol2 = nOldCol2;
+        aSym = "\n";
+        nColLock = 0;
+        bClosingUnderscore = false;
+        // tdf#149157 - break multiline continuation in a comment after a new line
+        bPrevLineExtentsComment = false;
+        return true;
+    }
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */