summaryrefslogtreecommitdiffstats
path: root/src/base/intern_string.hh
diff options
context:
space:
mode:
Diffstat (limited to 'src/base/intern_string.hh')
-rw-r--r--src/base/intern_string.hh857
1 files changed, 857 insertions, 0 deletions
diff --git a/src/base/intern_string.hh b/src/base/intern_string.hh
new file mode 100644
index 0000000..37ccc36
--- /dev/null
+++ b/src/base/intern_string.hh
@@ -0,0 +1,857 @@
+/**
+ * Copyright (c) 2014, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @file intern_string.hh
+ */
+
+#ifndef intern_string_hh
+#define intern_string_hh
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include <assert.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "fmt/format.h"
+#include "optional.hpp"
+#include "scn/util/string_view.h"
+#include "strnatcmp.h"
+#include "ww898/cp_utf8.hpp"
+
+struct string_fragment {
+ using iterator = const char*;
+
+ static string_fragment invalid()
+ {
+ string_fragment retval;
+
+ retval.invalidate();
+ return retval;
+ }
+
+ static string_fragment from_c_str(const char* str)
+ {
+ return string_fragment{str, 0, str != nullptr ? (int) strlen(str) : 0};
+ }
+
+ static string_fragment from_c_str(const unsigned char* str)
+ {
+ return string_fragment{
+ str, 0, str != nullptr ? (int) strlen((char*) str) : 0};
+ }
+
+ template<typename T, std::size_t N>
+ static string_fragment from_const(const T (&str)[N])
+ {
+ return string_fragment{str, 0, (int) N - 1};
+ }
+
+ static string_fragment from_str(const std::string& str)
+ {
+ return string_fragment{str.c_str(), 0, (int) str.size()};
+ }
+
+ static string_fragment from_substr(const std::string& str,
+ size_t offset,
+ size_t length)
+ {
+ return string_fragment{
+ str.c_str(), (int) offset, (int) (offset + length)};
+ }
+
+ static string_fragment from_str_range(const std::string& str,
+ size_t begin,
+ size_t end)
+ {
+ return string_fragment{str.c_str(), (int) begin, (int) end};
+ }
+
+ static string_fragment from_bytes(const char* bytes, size_t len)
+ {
+ return string_fragment{bytes, 0, (int) len};
+ }
+
+ static string_fragment from_bytes(const unsigned char* bytes, size_t len)
+ {
+ return string_fragment{(const char*) bytes, 0, (int) len};
+ }
+
+ static string_fragment from_memory_buffer(const fmt::memory_buffer& buf)
+ {
+ return string_fragment{buf.data(), 0, (int) buf.size()};
+ }
+
+ static string_fragment from_byte_range(const char* bytes,
+ size_t begin,
+ size_t end)
+ {
+ return string_fragment{bytes, (int) begin, (int) end};
+ }
+
+ explicit string_fragment(const char* str = "", int begin = 0, int end = -1)
+ : sf_string(str), sf_begin(begin), sf_end(end == -1 ? strlen(str) : end)
+ {
+ }
+
+ explicit string_fragment(const unsigned char* str,
+ int begin = 0,
+ int end = -1)
+ : sf_string((const char*) str), sf_begin(begin),
+ sf_end(end == -1 ? strlen((const char*) str) : end)
+ {
+ }
+
+ string_fragment(const std::string& str)
+ : sf_string(str.c_str()), sf_begin(0), sf_end(str.length())
+ {
+ }
+
+ bool is_valid() const
+ {
+ return this->sf_begin != -1 && this->sf_begin <= this->sf_end;
+ }
+
+ int length() const { return this->sf_end - this->sf_begin; }
+
+ Result<ssize_t, const char*> utf8_length() const;
+
+ const char* data() const { return &this->sf_string[this->sf_begin]; }
+
+ const unsigned char* udata() const
+ {
+ return (const unsigned char*) &this->sf_string[this->sf_begin];
+ }
+
+ char* writable_data(int offset = 0)
+ {
+ return (char*) &this->sf_string[this->sf_begin + offset];
+ }
+
+ char front() const { return this->sf_string[this->sf_begin]; }
+
+ uint32_t front_codepoint() const
+ {
+ size_t index = 0;
+ try {
+ return ww898::utf::utf8::read(
+ [this, &index]() { return this->data()[index++]; });
+ } catch (const std::runtime_error& e) {
+ return this->data()[0];
+ }
+ }
+
+ char back() const { return this->sf_string[this->sf_end - 1]; }
+
+ iterator begin() const { return &this->sf_string[this->sf_begin]; }
+
+ iterator end() const { return &this->sf_string[this->sf_end]; }
+
+ bool empty() const { return !this->is_valid() || length() == 0; }
+
+ Result<ssize_t, const char*> codepoint_to_byte_index(ssize_t cp_index) const
+ {
+ ssize_t retval = 0;
+
+ while (cp_index > 0) {
+ if (retval >= this->length()) {
+ return Err("index is beyond the end of the string");
+ }
+ auto ch_len = TRY(ww898::utf::utf8::char_size([this, retval]() {
+ return std::make_pair(this->data()[retval],
+ this->length() - retval - 1);
+ }));
+
+ retval += ch_len;
+ cp_index -= 1;
+ }
+
+ return Ok(retval);
+ }
+
+ char operator[](int index) const
+ {
+ return this->sf_string[sf_begin + index];
+ }
+
+ bool operator==(const std::string& str) const
+ {
+ if (this->length() != (int) str.length()) {
+ return false;
+ }
+
+ return memcmp(
+ &this->sf_string[this->sf_begin], str.c_str(), str.length())
+ == 0;
+ }
+
+ bool operator==(const string_fragment& sf) const
+ {
+ if (this->length() != sf.length()) {
+ return false;
+ }
+
+ return memcmp(this->data(), sf.data(), sf.length()) == 0;
+ }
+
+ bool iequal(const string_fragment& sf) const
+ {
+ if (this->length() != sf.length()) {
+ return false;
+ }
+
+ return strnatcasecmp(
+ this->length(), this->data(), sf.length(), sf.data())
+ == 0;
+ }
+
+ bool operator==(const char* str) const
+ {
+ size_t len = strlen(str);
+
+ return len == (size_t) this->length()
+ && strncmp(this->data(), str, this->length()) == 0;
+ }
+
+ bool operator!=(const char* str) const { return !(*this == str); }
+
+ bool startswith(const char* prefix) const
+ {
+ const auto* iter = this->begin();
+
+ while (*prefix != '\0' && iter < this->end() && *prefix == *iter) {
+ prefix += 1;
+ iter += 1;
+ }
+
+ return *prefix == '\0';
+ }
+
+ bool endswith(const char* suffix) const
+ {
+ int suffix_len = strlen(suffix);
+
+ if (suffix_len > this->length()) {
+ return false;
+ }
+
+ const auto* curr = this->end() - suffix_len;
+ while (*suffix != '\0' && *curr == *suffix) {
+ suffix += 1;
+ curr += 1;
+ }
+
+ return *suffix == '\0';
+ }
+
+ string_fragment substr(int begin) const
+ {
+ return string_fragment{
+ this->sf_string, this->sf_begin + begin, this->sf_end};
+ }
+
+ string_fragment sub_range(int begin, int end) const
+ {
+ return string_fragment{
+ this->sf_string, this->sf_begin + begin, this->sf_begin + end};
+ }
+
+ size_t count(char ch) const {
+ size_t retval = 0;
+
+ for (int lpc = this->sf_begin; lpc < this->sf_end; lpc++) {
+ if (this->sf_string[lpc] == ch) {
+ retval += 1;
+ }
+ }
+
+ return retval;
+ }
+
+ nonstd::optional<size_t> find(char ch) const
+ {
+ for (int lpc = this->sf_begin; lpc < this->sf_end; lpc++) {
+ if (this->sf_string[lpc] == ch) {
+ return lpc - this->sf_begin;
+ }
+ }
+
+ return nonstd::nullopt;
+ }
+
+ template<typename P>
+ string_fragment find_left_boundary(size_t start, P&& predicate) const
+ {
+ assert((int) start <= this->length());
+
+ if (start > 0 && start == this->length()) {
+ start -= 1;
+ }
+ while (start > 0) {
+ if (predicate(this->data()[start])) {
+ start += 1;
+ break;
+ }
+ start -= 1;
+ }
+
+ return string_fragment{
+ this->sf_string,
+ (int) start,
+ this->sf_end,
+ };
+ }
+
+ template<typename P>
+ string_fragment find_right_boundary(size_t start, P&& predicate) const
+ {
+ while ((int) start < this->length()) {
+ if (predicate(this->data()[start])) {
+ break;
+ }
+ start += 1;
+ }
+
+ return string_fragment{
+ this->sf_string,
+ this->sf_begin,
+ this->sf_begin + (int) start,
+ };
+ }
+
+ template<typename P>
+ string_fragment find_boundaries_around(size_t start, P&& predicate) const
+ {
+ return this->template find_left_boundary(start, predicate)
+ .find_right_boundary(0, predicate);
+ }
+
+ nonstd::optional<std::pair<uint32_t, string_fragment>> consume_codepoint()
+ const
+ {
+ auto cp = this->front_codepoint();
+ auto index_res = this->codepoint_to_byte_index(1);
+
+ if (index_res.isErr()) {
+ return nonstd::nullopt;
+ }
+
+ return std::make_pair(cp, this->substr(index_res.unwrap()));
+ }
+
+ template<typename P>
+ nonstd::optional<string_fragment> consume(P predicate) const
+ {
+ int consumed = 0;
+ while (consumed < this->length()) {
+ if (!predicate(this->data()[consumed])) {
+ break;
+ }
+
+ consumed += 1;
+ }
+
+ if (consumed == 0) {
+ return nonstd::nullopt;
+ }
+
+ return string_fragment{
+ this->sf_string,
+ this->sf_begin + consumed,
+ this->sf_end,
+ };
+ }
+
+ nonstd::optional<string_fragment> consume_n(int amount) const;
+
+ template<typename P>
+ string_fragment skip(P predicate) const
+ {
+ int offset = 0;
+ while (offset < this->length() && predicate(this->data()[offset])) {
+ offset += 1;
+ }
+
+ return string_fragment{
+ this->sf_string,
+ this->sf_begin + offset,
+ this->sf_end,
+ };
+ }
+
+ using split_result
+ = nonstd::optional<std::pair<string_fragment, string_fragment>>;
+
+ template<typename P>
+ split_result split_while(P&& predicate) const
+ {
+ int consumed = 0;
+ while (consumed < this->length()) {
+ if (!predicate(this->data()[consumed])) {
+ break;
+ }
+
+ consumed += 1;
+ }
+
+ if (consumed == 0) {
+ return nonstd::nullopt;
+ }
+
+ return std::make_pair(
+ string_fragment{
+ this->sf_string,
+ this->sf_begin,
+ this->sf_begin + consumed,
+ },
+ string_fragment{
+ this->sf_string,
+ this->sf_begin + consumed,
+ this->sf_end,
+ });
+ }
+
+ template<typename P>
+ split_result split_when(P&& predicate) const
+ {
+ int consumed = 0;
+ while (consumed < this->length()) {
+ if (predicate(this->data()[consumed])) {
+ break;
+ }
+
+ consumed += 1;
+ }
+
+ if (consumed == 0) {
+ return nonstd::nullopt;
+ }
+
+ return std::make_pair(
+ string_fragment{
+ this->sf_string,
+ this->sf_begin,
+ this->sf_begin + consumed,
+ },
+ string_fragment{
+ this->sf_string,
+ this->sf_begin + consumed + 1,
+ this->sf_end,
+ });
+ }
+
+ split_result split_n(int amount) const;
+
+ std::vector<string_fragment> split_lines() const;
+
+ struct tag1 {
+ const char t_value;
+
+ bool operator()(char ch) const { return this->t_value == ch; }
+ };
+
+ struct quoted_string_body {
+ bool qs_in_escape{false};
+
+ bool operator()(char ch)
+ {
+ if (this->qs_in_escape) {
+ this->qs_in_escape = false;
+ return true;
+ } else if (ch == '\\') {
+ this->qs_in_escape = true;
+ return true;
+ } else if (ch == '"') {
+ return false;
+ } else {
+ return true;
+ }
+ }
+ };
+
+ const char* to_string(char* buf) const
+ {
+ memcpy(buf, this->data(), this->length());
+ buf[this->length()] = '\0';
+
+ return buf;
+ }
+
+ std::string to_string() const
+ {
+ return {this->data(), (size_t) this->length()};
+ }
+
+ void clear()
+ {
+ this->sf_begin = 0;
+ this->sf_end = 0;
+ }
+
+ void invalidate()
+ {
+ this->sf_begin = -1;
+ this->sf_end = -1;
+ }
+
+ string_fragment trim(const char* tokens) const;
+ string_fragment trim() const;
+
+ string_fragment prepend(const char* str, int amount) const
+ {
+ return string_fragment{
+ str,
+ this->sf_begin + amount,
+ this->sf_end + amount,
+ };
+ }
+
+ string_fragment erase_before(const char* str, int amount) const
+ {
+ return string_fragment{
+ str,
+ this->sf_begin - amount,
+ this->sf_end - amount,
+ };
+ }
+
+ string_fragment erase(const char* str, int amount) const
+ {
+ return string_fragment{
+ str,
+ this->sf_begin,
+ this->sf_end - amount,
+ };
+ }
+
+ template<typename A>
+ const char* to_c_str(A allocator) const
+ {
+ auto* retval = allocator.allocate(this->length() + 1);
+ memcpy(retval, this->data(), this->length());
+ retval[this->length()] = '\0';
+ return retval;
+ }
+
+ template<typename A>
+ string_fragment to_owned(A allocator) const
+ {
+ return string_fragment{
+ this->template to_c_str(allocator),
+ 0,
+ this->length(),
+ };
+ }
+
+ scn::string_view to_string_view() const
+ {
+ return scn::string_view{this->begin(), this->end()};
+ }
+
+ enum class case_style {
+ lower,
+ upper,
+ camel,
+ mixed,
+ };
+
+ case_style detect_text_case_style() const;
+
+ std::string to_string_with_case_style(case_style style) const;
+
+ const char* sf_string;
+ int sf_begin;
+ int sf_end;
+};
+
+inline bool
+operator==(const std::string& left, const string_fragment& right)
+{
+ return right == left;
+}
+
+inline bool
+operator<(const char* left, const string_fragment& right)
+{
+ int rc = strncmp(left, right.data(), right.length());
+ return rc < 0;
+}
+
+inline void
+operator+=(std::string& left, const string_fragment& right)
+{
+ left.append(right.data(), right.length());
+}
+
+inline bool
+operator<(const string_fragment& left, const char* right)
+{
+ return strncmp(left.data(), right, left.length()) < 0;
+}
+
+inline std::ostream&
+operator<<(std::ostream& os, const string_fragment& sf)
+{
+ os.write(sf.data(), sf.length());
+ return os;
+}
+
+class intern_string {
+public:
+ static const intern_string* lookup(const char* str, ssize_t len) noexcept;
+
+ static const intern_string* lookup(const string_fragment& sf) noexcept;
+
+ static const intern_string* lookup(const std::string& str) noexcept;
+
+ const char* get() const { return this->is_str.c_str(); };
+
+ size_t size() const { return this->is_str.size(); }
+
+ std::string to_string() const { return this->is_str; }
+
+ string_fragment to_string_fragment() const
+ {
+ return string_fragment{this->is_str};
+ }
+
+ bool startswith(const char* prefix) const;
+
+ struct intern_table;
+ static std::shared_ptr<intern_table> get_table_lifetime();
+
+private:
+ friend intern_table;
+
+ intern_string(const char* str, ssize_t len)
+ : is_next(nullptr), is_str(str, (size_t) len)
+ {
+ }
+
+ intern_string* is_next;
+ std::string is_str;
+};
+
+using intern_table_lifetime = std::shared_ptr<intern_string::intern_table>;
+
+class intern_string_t {
+public:
+ using iterator = const char*;
+
+ intern_string_t(const intern_string* is = nullptr) : ist_interned_string(is)
+ {
+ }
+
+ const intern_string* unwrap() const { return this->ist_interned_string; }
+
+ void clear() { this->ist_interned_string = nullptr; };
+
+ bool empty() const { return this->ist_interned_string == nullptr; }
+
+ const char* get() const
+ {
+ if (this->empty()) {
+ return "";
+ }
+ return this->ist_interned_string->get();
+ }
+
+ const char* c_str() const { return this->get(); }
+
+ iterator begin() const { return this->get(); }
+
+ iterator end() const { return this->get() + this->size(); }
+
+ size_t size() const
+ {
+ if (this->ist_interned_string == nullptr) {
+ return 0;
+ }
+ return this->ist_interned_string->size();
+ }
+
+ size_t hash() const
+ {
+ auto ptr = (uintptr_t) this->ist_interned_string;
+
+ return ptr;
+ }
+
+ std::string to_string() const
+ {
+ if (this->ist_interned_string == nullptr) {
+ return "";
+ }
+ return this->ist_interned_string->to_string();
+ }
+
+ string_fragment to_string_fragment() const
+ {
+ if (this->ist_interned_string == nullptr) {
+ return string_fragment{"", 0, 0};
+ }
+ return this->ist_interned_string->to_string_fragment();
+ }
+
+ bool operator<(const intern_string_t& rhs) const
+ {
+ return strcmp(this->get(), rhs.get()) < 0;
+ }
+
+ bool operator==(const intern_string_t& rhs) const
+ {
+ return this->ist_interned_string == rhs.ist_interned_string;
+ }
+
+ bool operator!=(const intern_string_t& rhs) const
+ {
+ return !(*this == rhs);
+ }
+
+ bool operator==(const char* rhs) const
+ {
+ return strcmp(this->get(), rhs) == 0;
+ }
+
+ bool operator!=(const char* rhs) const
+ {
+ return strcmp(this->get(), rhs) != 0;
+ }
+
+ static bool case_lt(const intern_string_t& lhs, const intern_string_t& rhs)
+ {
+ return strnatcasecmp(lhs.size(), lhs.get(), rhs.size(), rhs.get()) < 0;
+ }
+
+private:
+ const intern_string* ist_interned_string;
+};
+
+unsigned long hash_str(const char* str, size_t len);
+
+namespace fmt {
+template<>
+struct formatter<string_fragment> : formatter<string_view> {
+ template<typename FormatContext>
+ auto format(const string_fragment& sf, FormatContext& ctx)
+ {
+ return formatter<string_view>::format(
+ string_view{sf.data(), (size_t) sf.length()}, ctx);
+ }
+};
+
+template<>
+struct formatter<intern_string_t> : formatter<string_view> {
+ template<typename FormatContext>
+ auto format(const intern_string_t& is, FormatContext& ctx)
+ {
+ return formatter<string_view>::format(
+ string_view{is.get(), (size_t) is.size()}, ctx);
+ }
+};
+} // namespace fmt
+
+namespace std {
+template<>
+struct hash<const intern_string_t> {
+ std::size_t operator()(const intern_string_t& ist) const
+ {
+ return ist.hash();
+ }
+};
+} // namespace std
+
+inline bool
+operator<(const char* left, const intern_string_t& right)
+{
+ int rc = strncmp(left, right.get(), right.size());
+ return rc < 0;
+}
+
+inline bool
+operator<(const intern_string_t& left, const char* right)
+{
+ return strncmp(left.get(), right, left.size()) < 0;
+}
+
+inline bool
+operator==(const intern_string_t& left, const string_fragment& sf)
+{
+ return ((int) left.size() == sf.length())
+ && (memcmp(left.get(), sf.data(), left.size()) == 0);
+}
+
+inline bool
+operator==(const string_fragment& left, const intern_string_t& right)
+{
+ return (left.length() == (int) right.size())
+ && (memcmp(left.data(), right.get(), left.length()) == 0);
+}
+
+namespace std {
+inline string
+to_string(const string_fragment& s)
+{
+ return {s.data(), (size_t) s.length()};
+}
+
+inline string
+to_string(const intern_string_t& s)
+{
+ return s.to_string();
+}
+} // namespace std
+
+inline string_fragment
+to_string_fragment(const string_fragment& s)
+{
+ return s;
+}
+
+inline string_fragment
+to_string_fragment(const intern_string_t& s)
+{
+ return string_fragment(s.get(), 0, s.size());
+}
+
+inline string_fragment
+to_string_fragment(const std::string& s)
+{
+ return string_fragment(s.c_str(), 0, s.length());
+}
+
+struct frag_hasher {
+ size_t operator()(const string_fragment& sf) const
+ {
+ return hash_str(sf.data(), sf.length());
+ }
+};
+
+#endif