summaryrefslogtreecommitdiffstats
path: root/src/text_format.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/text_format.cc')
-rw-r--r--src/text_format.cc152
1 files changed, 152 insertions, 0 deletions
diff --git a/src/text_format.cc b/src/text_format.cc
new file mode 100644
index 0000000..0b72786
--- /dev/null
+++ b/src/text_format.cc
@@ -0,0 +1,152 @@
+/**
+ * Copyright (c) 2017, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @file text_format.cc
+ */
+
+#include "text_format.hh"
+
+#include "config.h"
+#include "pcrepp/pcre2pp.hh"
+#include "yajl/api/yajl_parse.h"
+
+/**
+ * Check whether a fragment contains at least one byte that is not JSON
+ * insignificant whitespace (space, tab, line feed, carriage return).
+ *
+ * @param sf The fragment to scan.
+ * @return True if any non-whitespace byte is present.
+ */
+static bool
+has_non_whitespace(string_fragment sf)
+{
+    const auto* cur = sf.udata();
+    const auto* const end = cur + sf.length();
+
+    for (; cur != end; ++cur) {
+        switch (*cur) {
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+                break;
+            default:
+                return true;
+        }
+    }
+
+    return false;
+}
+
+/**
+ * Guess the format of a piece of text from its file name and content.
+ *
+ * The checks run in a fixed order and the first one that succeeds wins:
+ * Markdown (by file extension only), JSON, man page, Python, Rust, Java,
+ * C-like, SQL, and finally XML.
+ *
+ * @param sf The text to examine.
+ * @param path Optional path the text came from.  A trailing ".gz" and/or
+ *   ".bz2" is stripped before the extension is compared against ".md" and
+ *   ".markdown".
+ * @return The detected format or TF_UNKNOWN if none of the checks matched.
+ */
+text_format_t
+detect_text_format(string_fragment sf,
+                   nonstd::optional<ghc::filesystem::path> path)
+{
+    static const auto GZ_EXT = ghc::filesystem::path(".gz");
+    static const auto BZ2_EXT = ghc::filesystem::path(".bz2");
+    static const auto MD_EXT = ghc::filesystem::path(".md");
+    static const auto MARKDOWN_EXT = ghc::filesystem::path(".markdown");
+
+    static const auto MAN_MATCHERS = lnav::pcre2pp::code::from_const(
+        R"(^[A-Za-z][A-Za-z\-_\+0-9]+\(\d\)\s+)", PCRE2_MULTILINE);
+
+    // XXX This is a pretty crude way of detecting format...
+    static const auto PYTHON_MATCHERS = lnav::pcre2pp::code::from_const(
+        "(?:"
+        "^\\s*def\\s+\\w+\\([^)]*\\):[^\\n]*$|"
+        "^\\s*try:[^\\n]*$"
+        ")",
+        PCRE2_MULTILINE);
+
+    // NOTE(review): only PCRE2_MULTILINE is passed here, so the line breaks
+    // inside this raw string are part of the pattern rather than layout.
+    // Confirm that is intended before reformatting the literal.
+    static const auto RUST_MATCHERS
+        = lnav::pcre2pp::code::from_const(R"(
+(?:
+^\s*use\s+[\w+:\{\}]+;$|
+^\s*(?:pub)?\s+(?:const|enum|fn)\s+\w+.*$|
+^\s*impl\s+\w+.*$
+)
+)",
+                                          PCRE2_MULTILINE);
+
+    static const auto JAVA_MATCHERS = lnav::pcre2pp::code::from_const(
+        "(?:"
+        "^package\\s+|"
+        "^import\\s+|"
+        "^\\s*(?:public)?\\s*class\\s*(\\w+\\s+)*\\s*{"
+        ")",
+        PCRE2_MULTILINE);
+
+    static const auto C_LIKE_MATCHERS = lnav::pcre2pp::code::from_const(
+        "(?:"
+        "^#\\s*include\\s+|"
+        "^#\\s*define\\s+|"
+        "^\\s*if\\s+\\([^)]+\\)[^\\n]*$|"
+        "^\\s*(?:\\w+\\s+)*class \\w+ {"
+        ")",
+        PCRE2_MULTILINE);
+
+    static const auto SQL_MATCHERS = lnav::pcre2pp::code::from_const(
+        "(?:"
+        "select\\s+.+\\s+from\\s+|"
+        "insert\\s+into\\s+.+\\s+values"
+        ")",
+        PCRE2_MULTILINE | PCRE2_CASELESS);
+
+    static const auto XML_MATCHERS = lnav::pcre2pp::code::from_const(
+        "(?:"
+        R"(<\?xml(\s+\w+\s*=\s*"[^"]*")*\?>|)"
+        R"(</?\w+(\s+\w+\s*=\s*"[^"]*")*\s*>)"
+        ")",
+        PCRE2_MULTILINE | PCRE2_CASELESS);
+
+    if (path) {
+        // Look through compression suffixes so "notes.md.gz" is still
+        // recognized by its inner extension.
+        if (path->extension() == GZ_EXT) {
+            path = path->stem();
+        }
+        if (path->extension() == BZ2_EXT) {
+            path = path->stem();
+        }
+
+        if (path->extension() == MD_EXT || path->extension() == MARKDOWN_EXT) {
+            return text_format_t::TF_MARKDOWN;
+        }
+    }
+
+    // yajl is a streaming parser and yajl_complete_parse() is not called
+    // here, so an "ok" status only means no error was seen in the bytes
+    // given; presumably this lets a truncated sample of a JSON document
+    // still be accepted.  A buffer that is empty or all whitespace gives
+    // the parser nothing to reject, so it must be excluded up front or it
+    // would be reported as JSON.
+    if (has_non_whitespace(sf)) {
+        auto_mem<yajl_handle_t> jhandle(yajl_free);
+
+        jhandle = yajl_alloc(nullptr, nullptr, nullptr);
+        if (yajl_parse(jhandle, sf.udata(), sf.length()) == yajl_status_ok) {
+            return text_format_t::TF_JSON;
+        }
+    }
+
+    if (MAN_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_MAN;
+    }
+
+    if (PYTHON_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_PYTHON;
+    }
+
+    if (RUST_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_RUST;
+    }
+
+    if (JAVA_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_JAVA;
+    }
+
+    if (C_LIKE_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_C_LIKE;
+    }
+
+    if (SQL_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_SQL;
+    }
+
+    if (XML_MATCHERS.find_in(sf).ignore_error()) {
+        return text_format_t::TF_XML;
+    }
+
+    return text_format_t::TF_UNKNOWN;
+}