summaryrefslogtreecommitdiffstats
path: root/src/parser/stream.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/stream.cpp')
-rw-r--r--src/parser/stream.cpp447
1 files changed, 447 insertions, 0 deletions
diff --git a/src/parser/stream.cpp b/src/parser/stream.cpp
new file mode 100644
index 0000000..c0bbb28
--- /dev/null
+++ b/src/parser/stream.cpp
@@ -0,0 +1,447 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <orcus/stream.hpp>
+#include <orcus/exception.hpp>
+
+#include "utf8.hpp"
+
+#include <sstream>
+#include <fstream>
+#include <tuple>
+#include <cassert>
+#include <algorithm>
+#include <locale>
+#include <codecvt>
+#include <iostream>
+
+#include "filesystem_env.hpp"
+
+#include <boost/interprocess/file_mapping.hpp>
+#include <boost/interprocess/mapped_region.hpp>
+
+namespace bip = boost::interprocess;
+
+namespace orcus {
+
+namespace {
+
+enum class unicode_t
+{
+ unknown,
+ utf16_be,
+ utf16_le
+};
+
+unicode_t check_unicode_type(const char* p, size_t n)
+{
+ if (n > 2)
+ {
+ if (p[0] == '\xFE' && p[1] == '\xFF')
+ return unicode_t::utf16_be;
+
+ if (p[0] == '\xFF' && p[1] == '\xFE')
+ return unicode_t::utf16_le;
+ }
+
+ return unicode_t::unknown;
+}
+
+std::string convert_utf16_to_utf8(const char* p, size_t n, unicode_t ut)
+{
+ assert(ut == unicode_t::utf16_be || ut == unicode_t::utf16_le);
+
+ if (n & 0x01)
+ throw std::invalid_argument("size of a UTF-16 string must be divisible by 2.");
+
+ p += 2; // skip the BOM.
+
+ size_t n_buf = n / 2u - 1;
+ std::u16string buf(n_buf, 0);
+
+ switch (ut)
+ {
+ case unicode_t::utf16_be:
+ {
+ for (size_t i = 0; i < n_buf; ++i)
+ {
+ size_t offset = i * 2;
+ buf[i] = static_cast<char16_t>(p[offset+1] | p[offset] << 8);
+ }
+ break;
+ }
+ case unicode_t::utf16_le:
+ {
+ for (size_t i = 0; i < n_buf; ++i)
+ {
+ size_t offset = i * 2;
+ buf[i] = static_cast<char16_t>(p[offset] | p[offset+1]);
+ }
+ break;
+ }
+ default:
+ ;
+ }
+
+#if defined(_MSC_VER)
+ // char16_t does not work with MSVC just yet. This is a workaround. c.f.
+ // https://stackoverflow.com/questions/32055357/visual-studio-c-2015-stdcodecvt-with-char16-t-or-char32-t
+ const int16_t* pi16 = reinterpret_cast<const int16_t*>(buf.data());
+ const int16_t* pi16_end = pi16 + buf.size();
+ std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> conversion;
+ return conversion.to_bytes(pi16, pi16_end);
+#else
+ std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conversion;
+ return conversion.to_bytes(buf);
+#endif
+}
+
+std::tuple<std::string_view, size_t, size_t> find_line_with_offset(std::string_view strm, std::ptrdiff_t offset)
+{
+ const char* p0 = strm.data();
+ const char* p_end = p0 + strm.size();
+ const char* p_offset = p0 + offset;
+
+ if (p_offset >= p_end)
+ {
+ std::ostringstream os;
+ os << "offset value of " << offset << " is out-of-bound for a stream of length " << strm.size();
+ throw std::invalid_argument(os.str());
+ }
+
+ // Determine the line number.
+ std::size_t line_num = 0;
+ for (const char* p = p0; p != p_offset; ++p)
+ {
+ if (*p == '\n')
+ ++line_num;
+ }
+
+ // Determine the beginning of the line.
+ const char* p_line_start = p_offset;
+
+ // if the error points at the new line character
+ // we have most likely an unterminated quote.
+ // Report the line with the actual error.
+ if (*p_offset == '\n' && offset > 0)
+ --p_line_start;
+
+ for (; p0 <= p_line_start; --p_line_start)
+ {
+ if (*p_line_start == '\n')
+ break;
+ }
+
+ ++p_line_start;
+ assert(p0 <= p_line_start);
+
+ // Determine the end of the line.
+ const char* p_line_end = p_offset;
+ for (; p_line_end < p_end; ++p_line_end)
+ {
+ if (*p_line_end == '\n')
+ // one character after the last character of the line.
+ break;
+ }
+
+ assert(p_line_start <= p_offset);
+ std::size_t offset_on_line = std::distance(p_line_start, p_offset);
+ std::string_view line(p_line_start, p_line_end - p_line_start);
+
+ return std::make_tuple(line, line_num, offset_on_line);
+}
+
+} // anonymous namespace
+
+struct file_content::impl
+{
+ boost::uintmax_t content_size;
+ bip::file_mapping mapped_file;
+ bip::mapped_region mapped_region;
+
+ std::string buffer; // its own buffer in case of stream conversion.
+
+ const char* content;
+
+ impl() : content_size(0), content(nullptr) {}
+
+ impl(std::string_view filepath) :
+ content_size(fs::file_size(std::string{filepath}.c_str())),
+ mapped_file(std::string{filepath}.c_str(), bip::read_only),
+ mapped_region(mapped_file, bip::read_only, 0, content_size),
+ content(nullptr)
+ {
+ content = static_cast<const char*>(mapped_region.get_address());
+ }
+};
+
+file_content::file_content() :
+ mp_impl(std::make_unique<impl>()) {}
+
+file_content::file_content(file_content&& other) = default;
+
+file_content::file_content(std::string_view filepath) :
+ mp_impl(std::make_unique<impl>(filepath)) {}
+
+file_content::~file_content() = default;
+
+const char* file_content::data() const
+{
+ return mp_impl->content;
+}
+
+size_t file_content::size() const
+{
+ return mp_impl->content_size;
+}
+
+bool file_content::empty() const
+{
+ return mp_impl->content_size == 0;
+}
+
+void file_content::swap(file_content& other)
+{
+ std::swap(mp_impl, other.mp_impl);
+}
+
+void file_content::load(std::string_view filepath)
+{
+ file_content tmp(filepath);
+ swap(tmp);
+}
+
+void file_content::convert_to_utf8()
+{
+ unicode_t ut = check_unicode_type(mp_impl->content, mp_impl->content_size);
+
+ switch (ut)
+ {
+ case unicode_t::utf16_be:
+ case unicode_t::utf16_le:
+ {
+ // Convert to utf-8 stream, and reset the content pointer and size.
+ mp_impl->buffer = convert_utf16_to_utf8(mp_impl->content, mp_impl->content_size, ut);
+ mp_impl->content = mp_impl->buffer.data();
+ mp_impl->content_size = mp_impl->buffer.size();
+ break;
+ }
+ default:
+ ;
+ }
+}
+
+std::string_view file_content::str() const
+{
+ return std::string_view(mp_impl->content, mp_impl->content_size);
+}
+
+struct memory_content::impl
+{
+ std::string_view content;
+ std::string buffer; // its own buffer in case of stream conversion.
+
+ impl() {}
+ impl(std::string_view s) : content(s) {}
+};
+
+memory_content::memory_content() : mp_impl(std::make_unique<impl>()) {}
+
+memory_content::memory_content(std::string_view s) :
+ mp_impl(std::make_unique<impl>(s)) {}
+
+memory_content::memory_content(memory_content&& other) = default;
+memory_content::~memory_content() = default;
+
+const char* memory_content::data() const
+{
+ return mp_impl->content.data();
+}
+
+size_t memory_content::size() const
+{
+ return mp_impl->content.size();
+}
+
+bool memory_content::empty() const
+{
+ return mp_impl->content.empty();
+}
+
+void memory_content::swap(memory_content& other)
+{
+ std::swap(mp_impl, other.mp_impl);
+}
+
+void memory_content::convert_to_utf8()
+{
+ unicode_t ut = check_unicode_type(mp_impl->content.data(), mp_impl->content.size());
+
+ switch (ut)
+ {
+ case unicode_t::utf16_be:
+ case unicode_t::utf16_le:
+ {
+ // Convert to utf-8 stream, and reset the content pointer and size.
+ mp_impl->buffer = convert_utf16_to_utf8(mp_impl->content.data(), mp_impl->content.size(), ut);
+ mp_impl->content = mp_impl->buffer;
+ break;
+ }
+ default:
+ ;
+ }
+}
+
+std::string_view memory_content::str() const
+{
+ return mp_impl->content;
+}
+
+line_with_offset::line_with_offset(std::string _line, std::size_t _line_number, std::size_t _offset_on_line) :
+ line(std::move(_line)),
+ line_number(_line_number),
+ offset_on_line(_offset_on_line)
+{}
+
+line_with_offset::line_with_offset(const line_with_offset& other) = default;
+line_with_offset::line_with_offset(line_with_offset&& other) = default;
+line_with_offset::~line_with_offset() = default;
+
+bool line_with_offset::operator== (const line_with_offset& other) const
+{
+ return line == other.line && line_number == other.line_number && offset_on_line == other.offset_on_line;
+}
+
+bool line_with_offset::operator!= (const line_with_offset& other) const
+{
+ return !operator==(other);
+}
+
+std::string create_parse_error_output(std::string_view strm, std::ptrdiff_t offset)
+{
+ if (strm.empty() || offset < 0)
+ return std::string();
+
+ const size_t max_line_length = 60;
+ offset = std::min<std::ptrdiff_t>(strm.size() - 1, offset);
+
+ auto line_info = find_line_with_offset(strm, offset);
+ std::string_view line = std::get<0>(line_info);
+ size_t line_num = std::get<1>(line_info);
+ size_t offset_on_line = std::get<2>(line_info);
+
+ if (offset_on_line < 30)
+ {
+ std::ostringstream os;
+ os << (line_num+1) << ":" << (offset_on_line+1) << ": ";
+ size_t line_num_width = os.str().size();
+
+ // Truncate line if it's too long.
+ if (line.size() > max_line_length)
+ line = std::string_view(line.data(), max_line_length);
+
+ os << line << std::endl;
+
+ for (size_t i = 0; i < (offset_on_line+line_num_width); ++i)
+ os << ' ';
+ os << '^';
+ return os.str();
+ }
+
+ // The error line is too long. Only show a segment of the line where the
+ // error occurred.
+
+ const size_t fixed_offset = 20;
+
+ size_t line_start = offset_on_line - fixed_offset;
+ size_t line_end = line_start + max_line_length;
+ if (line_end > line.size())
+ line_end = line.size();
+
+ size_t line_length = line_end - line_start;
+
+ line = std::string_view(line.data()+line_start, line_length);
+
+ std::ostringstream os;
+ os << line_num << ":" << (line_start+1) << ": ";
+ size_t line_num_width = os.str().size();
+
+ os << line << std::endl;
+
+ for (size_t i = 0; i < (fixed_offset+line_num_width); ++i)
+ os << ' ';
+ os << '^';
+
+ return os.str();
+}
+
+line_with_offset locate_line_with_offset(std::string_view strm, std::ptrdiff_t offset)
+{
+ auto line_info = find_line_with_offset(strm, offset);
+ std::string_view line = std::get<0>(line_info);
+ size_t line_num = std::get<1>(line_info);
+ size_t offset_on_line = std::get<2>(line_info);
+
+ return line_with_offset(std::string{line}, line_num, offset_on_line);
+}
+
+size_t locate_first_different_char(std::string_view left, std::string_view right)
+{
+ if (left.empty() || right.empty())
+ // If one of them is empty, then the first characters are considered
+ // different.
+ return 0;
+
+ size_t n = std::min(left.size(), right.size());
+ const char* p1 = left.data();
+ const char* p2 = right.data();
+ const char* p1_end = p1 + n;
+
+ for (; p1 != p1_end; ++p1, ++p2)
+ {
+ if (*p1 != *p2)
+ return std::distance(left.data(), p1);
+ }
+
+ return n;
+}
+
+std::size_t calc_logical_string_length(std::string_view s)
+{
+ std::size_t length = 0;
+
+ const char* p = s.data();
+ const char* p_end = p + s.size();
+
+ while (p < p_end)
+ {
+ ++length;
+
+ auto n_bytes = calc_utf8_byte_length(*p);
+ if (!n_bytes || n_bytes > 4)
+ {
+ std::ostringstream os;
+ os << "'" << s << "' contains invalid character at position " << std::distance(s.data(), p);
+ throw std::invalid_argument(os.str());
+ }
+
+ p += n_bytes;
+ }
+
+ if (p != p_end)
+ {
+ std::ostringstream os;
+ os << "last character of '" << s << "' ended prematurely";
+ throw std::invalid_argument(os.str());
+ }
+
+ return length;
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */