1 files changed, 345 insertions, 0 deletions
diff --git a/src/libixion/formula_lexer.cpp b/src/libixion/formula_lexer.cpp
new file mode 100644
index 0000000..7928d82
--- /dev/null
+++ b/src/libixion/formula_lexer.cpp
@@ -0,0 +1,345 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "formula_lexer.hpp"
+#include "debug.hpp"
+#include "ixion/global.hpp"
+
+#include <cassert>
+#include <iostream>
+#include <sstream>
+#include <cctype>
+#include <unordered_map>
+
+namespace ixion {
+
+namespace {
+
+const std::unordered_map<char, lexer_opcode_t> ops_map = {
+    { '&', lexer_opcode_t::concat },
+    { '(', lexer_opcode_t::open },
+    { ')', lexer_opcode_t::close },
+    { '*', lexer_opcode_t::multiply },
+    { '+', lexer_opcode_t::plus },
+    { '-', lexer_opcode_t::minus },
+    { '/', lexer_opcode_t::divide },
+    { '<', lexer_opcode_t::less },
+    { '=', lexer_opcode_t::equal },
+    { '>', lexer_opcode_t::greater },
+    { '^', lexer_opcode_t::exponent },
+    { '{', lexer_opcode_t::array_open },
+    { '}', lexer_opcode_t::array_close },
+};
+
+} // anonymous namespace
+
+class tokenizer
+{
+    enum buffer_type {
+        buf_numeral,
+        buf_name
+    };
+
+public:
+    tokenizer() = delete;
+    tokenizer(const tokenizer&) = delete;
+    tokenizer& operator= (tokenizer) = delete;
+
+    explicit tokenizer(lexer_tokens_t& tokens, const char* p, size_t n) :
+        m_tokens(tokens),
+        m_sep_arg(','),
+        m_sep_array_row(';'),
+        m_sep_decimal('.'),
+        mp_first(p),
+        mp_char(NULL),
+        m_size(n),
+        m_pos(0),
+        mp_char_stored(nullptr),
+        m_pos_stored(0)
+    {
+    }
+
+    void run();
+
+    void set_sep_arg(char c);
+
+private:
+    bool is_arg_sep(char c) const;
+    bool is_array_row_sep(char c) const;
+    bool is_decimal_sep(char c) const;
+    bool is_op(char c) const;
+
+    void init();
+
+    void numeral();
+    void space();
+    void name();
+    void op(lexer_opcode_t oc);
+    void string();
+
+    bool has_char() const;
+    void next();
+    void push_pos();
+    void pop_pos();
+
+private:
+    lexer_tokens_t& m_tokens;
+
+    char m_sep_arg;
+    char m_sep_array_row;
+    char m_sep_decimal;
+
+    const char* mp_first;
+    const char* mp_char;
+    const size_t m_size;
+    size_t m_pos;
+
+    const char* mp_char_stored;
+    size_t m_pos_stored;
+};
+
+void tokenizer::init()
+{
+    m_tokens.clear();
+    mp_char = mp_first;
+    m_pos = 0;
+}
+
+void tokenizer::run()
+{
+    if (!m_size)
+        // Nothing to do.
+        return;
+
+    init();
+
+    while (has_char())
+    {
+        if (std::isdigit(*mp_char))
+        {
+            numeral();
+            continue;
+        }
+
+        if (auto it = ops_map.find(*mp_char); it != ops_map.end())
+        {
+            op(it->second);
+            continue;
+        }
+
+        switch (*mp_char)
+        {
+            case ' ':
+                space();
+                continue;
+            case '"':
+                string();
+                continue;
+        }
+
+        if (is_arg_sep(*mp_char))
+        {
+            op(lexer_opcode_t::sep);
+            continue;
+        }
+
+        if (is_array_row_sep(*mp_char))
+        {
+            op(lexer_opcode_t::array_row_sep);
+            continue;
+        }
+
+        name();
+    }
+}
+
+void tokenizer::set_sep_arg(char c)
+{
+    m_sep_arg = c;
+}
+
+bool tokenizer::is_arg_sep(char c) const
+{
+    return c == m_sep_arg;
+}
+
+bool tokenizer::is_array_row_sep(char c) const
+{
+    return c == m_sep_array_row;
+}
+
+bool tokenizer::is_decimal_sep(char c) const
+{
+    return c == m_sep_decimal;
+}
+
+bool tokenizer::is_op(char c) const
+{
+    if (is_arg_sep(c))
+        return true;
+
+    if (ops_map.count(c) > 0)
+        return true;
+
+    switch (*mp_char)
+    {
+        case ' ':
+        case '"':
+            return true;
+    }
+    return false;
+}
+
+void tokenizer::numeral()
+{
+    const char* p = mp_char;
+    push_pos();
+
+    size_t len = 1;
+    size_t sep_count = 0;
+    for (next(); has_char(); next(), ++len)
+    {
+        if (*mp_char == ':')
+        {
+            // Treat this as a name.  This may be a part of a row-only range (e.g. 3:3).
+            pop_pos();
+            name();
+            return;
+        }
+
+        if (std::isdigit(*mp_char))
+            continue;
+        if (is_decimal_sep(*mp_char) && ++sep_count <= 1)
+            continue;
+
+        break;
+    }
+
+    if (sep_count > 1)
+    {
+        // failed to parse this as a numeral. Treat this as a name.
+        IXION_TRACE("error parsing '" << std::string(p, len) << "' as a numeral, treating it as a name.");
+        pop_pos();
+        name();
+        return;
+    }
+    double val = to_double({p, len});
+    m_tokens.emplace_back(val);
+}
+
+void tokenizer::space()
+{
+    // space is ignored for now.
+    next();
+}
+
+void tokenizer::name()
+{
+    std::vector<char> scopes;
+
+    const char* p = mp_char;
+    size_t len = 0;
+
+    for (; has_char(); next(), ++len)
+    {
+        char c = *mp_char;
+
+        if (!scopes.empty() && scopes.back() == c)
+        {
+            scopes.pop_back();
+            continue;
+        }
+
+        switch (c)
+        {
+            case '[':
+                scopes.push_back(']');
+                continue;
+            case '\'':
+                scopes.push_back('\'');
+                continue;
+        }
+
+        if (!scopes.empty())
+            continue;
+
+        if (is_op(c))
+            break;
+    }
+
+    m_tokens.emplace_back(lexer_opcode_t::name, std::string_view{p, len});
+}
+
+void tokenizer::op(lexer_opcode_t oc)
+{
+    m_tokens.emplace_back(oc);
+    next();
+}
+
+void tokenizer::string()
+{
+    next();
+    const char* p = mp_char;
+    size_t len = 0;
+    for (; *mp_char != '"' && has_char(); ++len)
+        next();
+
+    m_tokens.emplace_back(lexer_opcode_t::string, std::string_view{p, len});
+
+    if (*mp_char == '"')
+        next();
+}
+
+void tokenizer::next()
+{
+    ++mp_char;
+    ++m_pos;
+}
+
+void tokenizer::push_pos()
+{
+    mp_char_stored = mp_char;
+    m_pos_stored = m_pos;
+}
+
+void tokenizer::pop_pos()
+{
+    mp_char = mp_char_stored;
+    m_pos = m_pos_stored;
+
+    mp_char_stored = NULL;
+    m_pos_stored = 0;
+}
+
+bool tokenizer::has_char() const
+{
+    return m_pos < m_size;
+}
+
+// ============================================================================
+
+formula_lexer::tokenize_error::tokenize_error(const std::string& msg) : general_error(msg) {}
+
+formula_lexer::formula_lexer(const config& config, const char* p, size_t n) :
+    m_config(config), mp_first(p), m_size(n) {}
+
+formula_lexer::~formula_lexer() {}
+
+void formula_lexer::tokenize()
+{
+    tokenizer tkr(m_tokens, mp_first, m_size);
+    tkr.set_sep_arg(m_config.sep_function_arg);
+    tkr.run();
+}
+
+void formula_lexer::swap_tokens(lexer_tokens_t& tokens)
+{
+    m_tokens.swap(tokens);
+}
+
+}
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */