diff options
Diffstat (limited to '')
-rw-r--r-- | src/text_format.cc | 145 |
1 files changed, 126 insertions, 19 deletions
diff --git a/src/text_format.cc b/src/text_format.cc
index 0b72786..dc63cf6 100644
--- a/src/text_format.cc
+++ b/src/text_format.cc
@@ -29,8 +29,11 @@
  * @file text_format.cc
  */
 
+#include <set>
+
 #include "text_format.hh"
 
+#include "base/lnav_log.hh"
 #include "config.h"
 #include "pcrepp/pcre2pp.hh"
 #include "yajl/api/yajl_parse.h"
@@ -39,37 +42,66 @@ text_format_t
 detect_text_format(string_fragment sf,
                    nonstd::optional<ghc::filesystem::path> path)
 {
-    static const auto GZ_EXT = ghc::filesystem::path(".gz");
-    static const auto BZ2_EXT = ghc::filesystem::path(".bz2");
+    static const std::set<ghc::filesystem::path> FILTER_EXTS = {
+        ".bz2",
+        ".gz",
+        ".lzma",
+        ".xz",
+        ".zst",
+    };
+    static const auto C_EXTS = std::set<ghc::filesystem::path>{
+        ".h",
+        ".hh",
+        ".hpp",
+        ".c",
+        ".cc",
+        ".cpp",
+        ".tpp",
+    };
+    static const auto PY_EXT = ghc::filesystem::path(".py");
+    static const auto RS_EXT = ghc::filesystem::path(".rs");
+    static const auto JAVA_EXT = ghc::filesystem::path(".java");
+    static const auto TOML_EXT = ghc::filesystem::path(".toml");
+    static const auto XML_EXT = ghc::filesystem::path(".xml");
+    static const auto YAML_EXT = ghc::filesystem::path(".yaml");
+    static const auto YML_EXT = ghc::filesystem::path(".yml");
+    static const auto MAKEFILE_STEM = ghc::filesystem::path("Makefile");
     static const auto MD_EXT = ghc::filesystem::path(".md");
     static const auto MARKDOWN_EXT = ghc::filesystem::path(".markdown");
+    static const auto SH_EXT = ghc::filesystem::path(".sh");
+
+    static const auto DIFF_MATCHERS = lnav::pcre2pp::code::from_const(
+        R"(^--- .*\n\+\+\+ .*\n)", PCRE2_MULTILINE);
 
     static const auto MAN_MATCHERS = lnav::pcre2pp::code::from_const(
         R"(^[A-Za-z][A-Za-z\-_\+0-9]+\(\d\)\s+)", PCRE2_MULTILINE);
 
-    // XXX This is a pretty crude way of detecting format...
+    // XXX This is a pretty crude way of
+    // detecting format...
     static const auto PYTHON_MATCHERS = lnav::pcre2pp::code::from_const(
         "(?:"
-        "^\\s*def\\s+\\w+\\([^)]*\\):[^\\n]*$|"
+        "^\\s*def\\s+\\w+\\([^)]*\\):"
+        "[^\\n]*$|"
         "^\\s*try:[^\\n]*$"
         ")",
         PCRE2_MULTILINE);
 
-    static const auto RUST_MATCHERS
-        = lnav::pcre2pp::code::from_const(R"(
+    static const auto RUST_MATCHERS = lnav::pcre2pp::code::from_const(
+        R"(
 (?:
 ^\s*use\s+[\w+:\{\}]+;$|
-^\s*(?:pub)?\s+(?:const|enum|fn)\s+\w+.*$|
+^\s*(?:pub enum|pub const|(?:pub )?fn)\s+\w+.*$|
 ^\s*impl\s+\w+.*$
 )
 )",
-                                          PCRE2_MULTILINE);
+        PCRE2_MULTILINE);
 
     static const auto JAVA_MATCHERS = lnav::pcre2pp::code::from_const(
         "(?:"
         "^package\\s+|"
         "^import\\s+|"
-        "^\\s*(?:public)?\\s*class\\s*(\\w+\\s+)*\\s*{"
+        "^\\s*(?:public)?\\s*"
+        "class\\s*(\\w+\\s+)*\\s*{"
         ")",
         PCRE2_MULTILINE);
 
@@ -77,15 +109,18 @@ detect_text_format(string_fragment sf,
         "(?:"
         "^#\\s*include\\s+|"
         "^#\\s*define\\s+|"
-        "^\\s*if\\s+\\([^)]+\\)[^\\n]*$|"
-        "^\\s*(?:\\w+\\s+)*class \\w+ {"
+        "^\\s*if\\s+\\([^)]+\\)[^\\n]"
+        "*$|"
+        "^\\s*(?:\\w+\\s+)*class "
+        "\\w+ {"
         ")",
         PCRE2_MULTILINE);
 
     static const auto SQL_MATCHERS = lnav::pcre2pp::code::from_const(
         "(?:"
         "select\\s+.+\\s+from\\s+|"
-        "insert\\s+into\\s+.+\\s+values"
+        "insert\\s+into\\s+.+\\s+"
+        "values"
         ")",
         PCRE2_MULTILINE | PCRE2_CASELESS);
 
@@ -96,19 +131,55 @@ detect_text_format(string_fragment sf,
         ")",
         PCRE2_MULTILINE | PCRE2_CASELESS);
 
-    text_format_t retval = text_format_t::TF_UNKNOWN;
+    static const auto SH_MATCHERS
+        = lnav::pcre2pp::code::from_const("^#!.+sh\\b", PCRE2_MULTILINE);
 
     if (path) {
-        if (path->extension() == GZ_EXT) {
-            path = path->stem();
-        }
-        if (path->extension() == BZ2_EXT) {
+        while (FILTER_EXTS.count(path->extension()) > 0) {
             path = path->stem();
         }
 
-        if (path->extension() == MD_EXT || path->extension() == MARKDOWN_EXT) {
+        auto stem = path->stem();
+        auto ext = path->extension();
+        if (ext == MD_EXT || ext == MARKDOWN_EXT) {
             return text_format_t::TF_MARKDOWN;
         }
+
+        if (C_EXTS.count(ext) > 0) {
+            return text_format_t::TF_C_LIKE;
+        }
+
+        if (ext == PY_EXT) {
+            return text_format_t::TF_PYTHON;
+        }
+
+        if (ext == RS_EXT) {
+            return text_format_t::TF_RUST;
+        }
+
+        if (ext == TOML_EXT) {
+            return text_format_t::TF_TOML;
+        }
+
+        if (ext == JAVA_EXT) {
+            return text_format_t::TF_JAVA;
+        }
+
+        if (ext == YAML_EXT || ext == YML_EXT) {
+            return text_format_t::TF_YAML;
+        }
+
+        if (ext == XML_EXT) {
+            return text_format_t::TF_XML;
+        }
+
+        if (stem == MAKEFILE_STEM) {
+            return text_format_t::TF_MAKEFILE;
+        }
+
+        if (ext == SH_EXT) {  // ".sh" is an extension; compare ext, not stem
+            return text_format_t::TF_SHELL_SCRIPT;
+        }
     }
 
     {
@@ -120,6 +191,14 @@ detect_text_format(string_fragment sf,
         }
     }
 
+    if (DIFF_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_DIFF;
+    }
+
+    if (SH_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_SHELL_SCRIPT;
+    }
+
     if (MAN_MATCHERS.find_in(sf).ignore_error()) {
         return text_format_t::TF_MAN;
     }
@@ -148,5 +227,33 @@ detect_text_format(string_fragment sf,
         return text_format_t::TF_XML;
     }
 
-    return retval;
+    return text_format_t::TF_UNKNOWN;
+}
+
+nonstd::optional<text_format_meta_t>
+extract_text_meta(string_fragment sf, text_format_t tf)
+{
+    static const auto MAN_NAME = lnav::pcre2pp::code::from_const(
+        R"(^([A-Za-z][A-Za-z\-_\+0-9]+\(\d\))\s+)", PCRE2_MULTILINE);
+
+    switch (tf) {
+        case text_format_t::TF_MAN: {
+            static thread_local auto md
+                = lnav::pcre2pp::match_data::unitialized();
+
+            auto find_res
+                = MAN_NAME.capture_from(sf).into(md).matches().ignore_error();
+
+            if (find_res) {
+                return text_format_meta_t{
+                    md.to_string(),  // NOTE(review): capture 1 unused - confirm
+                };
+            }
+            break;
+        }
+        default:
+            break;
+    }
+
+    return nonstd::nullopt;
 }