diff options
Diffstat (limited to 'src/parser/stream.cpp')
-rw-r--r-- | src/parser/stream.cpp | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/src/parser/stream.cpp b/src/parser/stream.cpp new file mode 100644 index 0000000..c0bbb28 --- /dev/null +++ b/src/parser/stream.cpp @@ -0,0 +1,447 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <orcus/stream.hpp> +#include <orcus/exception.hpp> + +#include "utf8.hpp" + +#include <sstream> +#include <fstream> +#include <tuple> +#include <cassert> +#include <algorithm> +#include <locale> +#include <codecvt> +#include <iostream> + +#include "filesystem_env.hpp" + +#include <boost/interprocess/file_mapping.hpp> +#include <boost/interprocess/mapped_region.hpp> + +namespace bip = boost::interprocess; + +namespace orcus { + +namespace { + +enum class unicode_t +{ + unknown, + utf16_be, + utf16_le +}; + +unicode_t check_unicode_type(const char* p, size_t n) +{ + if (n > 2) + { + if (p[0] == '\xFE' && p[1] == '\xFF') + return unicode_t::utf16_be; + + if (p[0] == '\xFF' && p[1] == '\xFE') + return unicode_t::utf16_le; + } + + return unicode_t::unknown; +} + +std::string convert_utf16_to_utf8(const char* p, size_t n, unicode_t ut) +{ + assert(ut == unicode_t::utf16_be || ut == unicode_t::utf16_le); + + if (n & 0x01) + throw std::invalid_argument("size of a UTF-16 string must be divisible by 2."); + + p += 2; // skip the BOM. + + size_t n_buf = n / 2u - 1; + std::u16string buf(n_buf, 0); + + switch (ut) + { + case unicode_t::utf16_be: + { + for (size_t i = 0; i < n_buf; ++i) + { + size_t offset = i * 2; + buf[i] = static_cast<char16_t>(p[offset+1] | p[offset] << 8); + } + break; + } + case unicode_t::utf16_le: + { + for (size_t i = 0; i < n_buf; ++i) + { + size_t offset = i * 2; + buf[i] = static_cast<char16_t>(p[offset] | p[offset+1]); + } + break; + } + default: + ; + } + +#if defined(_MSC_VER) + // char16_t does not work with MSVC just yet. This is a workaround. c.f. + // https://stackoverflow.com/questions/32055357/visual-studio-c-2015-stdcodecvt-with-char16-t-or-char32-t + const int16_t* pi16 = reinterpret_cast<const int16_t*>(buf.data()); + const int16_t* pi16_end = pi16 + buf.size(); + std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> conversion; + return conversion.to_bytes(pi16, pi16_end); +#else + std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conversion; + return conversion.to_bytes(buf); +#endif +} + +std::tuple<std::string_view, size_t, size_t> find_line_with_offset(std::string_view strm, std::ptrdiff_t offset) +{ + const char* p0 = strm.data(); + const char* p_end = p0 + strm.size(); + const char* p_offset = p0 + offset; + + if (p_offset >= p_end) + { + std::ostringstream os; + os << "offset value of " << offset << " is out-of-bound for a stream of length " << strm.size(); + throw std::invalid_argument(os.str()); + } + + // Determine the line number. + std::size_t line_num = 0; + for (const char* p = p0; p != p_offset; ++p) + { + if (*p == '\n') + ++line_num; + } + + // Determine the beginning of the line. + const char* p_line_start = p_offset; + + // if the error points at the new line character + // we have most likely an unterminated quote. + // Report the line with the actual error. + if (*p_offset == '\n' && offset > 0) + --p_line_start; + + for (; p0 <= p_line_start; --p_line_start) + { + if (*p_line_start == '\n') + break; + } + + ++p_line_start; + assert(p0 <= p_line_start); + + // Determine the end of the line. + const char* p_line_end = p_offset; + for (; p_line_end < p_end; ++p_line_end) + { + if (*p_line_end == '\n') + // one character after the last character of the line. + break; + } + + assert(p_line_start <= p_offset); + std::size_t offset_on_line = std::distance(p_line_start, p_offset); + std::string_view line(p_line_start, p_line_end - p_line_start); + + return std::make_tuple(line, line_num, offset_on_line); +} + +} // anonymous namespace + +struct file_content::impl +{ + boost::uintmax_t content_size; + bip::file_mapping mapped_file; + bip::mapped_region mapped_region; + + std::string buffer; // its own buffer in case of stream conversion. + + const char* content; + + impl() : content_size(0), content(nullptr) {} + + impl(std::string_view filepath) : + content_size(fs::file_size(std::string{filepath}.c_str())), + mapped_file(std::string{filepath}.c_str(), bip::read_only), + mapped_region(mapped_file, bip::read_only, 0, content_size), + content(nullptr) + { + content = static_cast<const char*>(mapped_region.get_address()); + } +}; + +file_content::file_content() : + mp_impl(std::make_unique<impl>()) {} + +file_content::file_content(file_content&& other) = default; + +file_content::file_content(std::string_view filepath) : + mp_impl(std::make_unique<impl>(filepath)) {} + +file_content::~file_content() = default; + +const char* file_content::data() const +{ + return mp_impl->content; +} + +size_t file_content::size() const +{ + return mp_impl->content_size; +} + +bool file_content::empty() const +{ + return mp_impl->content_size == 0; +} + +void file_content::swap(file_content& other) +{ + std::swap(mp_impl, other.mp_impl); +} + +void file_content::load(std::string_view filepath) +{ + file_content tmp(filepath); + swap(tmp); +} + +void file_content::convert_to_utf8() +{ + unicode_t ut = check_unicode_type(mp_impl->content, mp_impl->content_size); + + switch (ut) + { + case unicode_t::utf16_be: + case unicode_t::utf16_le: + { + // Convert to utf-8 stream, and reset the content pointer and size. + mp_impl->buffer = convert_utf16_to_utf8(mp_impl->content, mp_impl->content_size, ut); + mp_impl->content = mp_impl->buffer.data(); + mp_impl->content_size = mp_impl->buffer.size(); + break; + } + default: + ; + } +} + +std::string_view file_content::str() const +{ + return std::string_view(mp_impl->content, mp_impl->content_size); +} + +struct memory_content::impl +{ + std::string_view content; + std::string buffer; // its own buffer in case of stream conversion. + + impl() {} + impl(std::string_view s) : content(s) {} +}; + +memory_content::memory_content() : mp_impl(std::make_unique<impl>()) {} + +memory_content::memory_content(std::string_view s) : + mp_impl(std::make_unique<impl>(s)) {} + +memory_content::memory_content(memory_content&& other) = default; +memory_content::~memory_content() = default; + +const char* memory_content::data() const +{ + return mp_impl->content.data(); +} + +size_t memory_content::size() const +{ + return mp_impl->content.size(); +} + +bool memory_content::empty() const +{ + return mp_impl->content.empty(); +} + +void memory_content::swap(memory_content& other) +{ + std::swap(mp_impl, other.mp_impl); +} + +void memory_content::convert_to_utf8() +{ + unicode_t ut = check_unicode_type(mp_impl->content.data(), mp_impl->content.size()); + + switch (ut) + { + case unicode_t::utf16_be: + case unicode_t::utf16_le: + { + // Convert to utf-8 stream, and reset the content pointer and size. + mp_impl->buffer = convert_utf16_to_utf8(mp_impl->content.data(), mp_impl->content.size(), ut); + mp_impl->content = mp_impl->buffer; + break; + } + default: + ; + } +} + +std::string_view memory_content::str() const +{ + return mp_impl->content; +} + +line_with_offset::line_with_offset(std::string _line, std::size_t _line_number, std::size_t _offset_on_line) : + line(std::move(_line)), + line_number(_line_number), + offset_on_line(_offset_on_line) +{} + +line_with_offset::line_with_offset(const line_with_offset& other) = default; +line_with_offset::line_with_offset(line_with_offset&& other) = default; +line_with_offset::~line_with_offset() = default; + +bool line_with_offset::operator== (const line_with_offset& other) const +{ + return line == other.line && line_number == other.line_number && offset_on_line == other.offset_on_line; +} + +bool line_with_offset::operator!= (const line_with_offset& other) const +{ + return !operator==(other); +} + +std::string create_parse_error_output(std::string_view strm, std::ptrdiff_t offset) +{ + if (strm.empty() || offset < 0) + return std::string(); + + const size_t max_line_length = 60; + offset = std::min<std::ptrdiff_t>(strm.size() - 1, offset); + + auto line_info = find_line_with_offset(strm, offset); + std::string_view line = std::get<0>(line_info); + size_t line_num = std::get<1>(line_info); + size_t offset_on_line = std::get<2>(line_info); + + if (offset_on_line < 30) + { + std::ostringstream os; + os << (line_num+1) << ":" << (offset_on_line+1) << ": "; + size_t line_num_width = os.str().size(); + + // Truncate line if it's too long. + if (line.size() > max_line_length) + line = std::string_view(line.data(), max_line_length); + + os << line << std::endl; + + for (size_t i = 0; i < (offset_on_line+line_num_width); ++i) + os << ' '; + os << '^'; + return os.str(); + } + + // The error line is too long. Only show a segment of the line where the + // error occurred. + + const size_t fixed_offset = 20; + + size_t line_start = offset_on_line - fixed_offset; + size_t line_end = line_start + max_line_length; + if (line_end > line.size()) + line_end = line.size(); + + size_t line_length = line_end - line_start; + + line = std::string_view(line.data()+line_start, line_length); + + std::ostringstream os; + os << line_num << ":" << (line_start+1) << ": "; + size_t line_num_width = os.str().size(); + + os << line << std::endl; + + for (size_t i = 0; i < (fixed_offset+line_num_width); ++i) + os << ' '; + os << '^'; + + return os.str(); +} + +line_with_offset locate_line_with_offset(std::string_view strm, std::ptrdiff_t offset) +{ + auto line_info = find_line_with_offset(strm, offset); + std::string_view line = std::get<0>(line_info); + size_t line_num = std::get<1>(line_info); + size_t offset_on_line = std::get<2>(line_info); + + return line_with_offset(std::string{line}, line_num, offset_on_line); +} + +size_t locate_first_different_char(std::string_view left, std::string_view right) +{ + if (left.empty() || right.empty()) + // If one of them is empty, then the first characters are considered + // different. + return 0; + + size_t n = std::min(left.size(), right.size()); + const char* p1 = left.data(); + const char* p2 = right.data(); + const char* p1_end = p1 + n; + + for (; p1 != p1_end; ++p1, ++p2) + { + if (*p1 != *p2) + return std::distance(left.data(), p1); + } + + return n; +} + +std::size_t calc_logical_string_length(std::string_view s) +{ + std::size_t length = 0; + + const char* p = s.data(); + const char* p_end = p + s.size(); + + while (p < p_end) + { + ++length; + + auto n_bytes = calc_utf8_byte_length(*p); + if (!n_bytes || n_bytes > 4) + { + std::ostringstream os; + os << "'" << s << "' contains invalid character at position " << std::distance(s.data(), p); + throw std::invalid_argument(os.str()); + } + + p += n_bytes; + } + + if (p != p_end) + { + std::ostringstream os; + os << "last character of '" << s << "' ended prematurely"; + throw std::invalid_argument(os.str()); + } + + return length; +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |