summaryrefslogtreecommitdiffstats
path: root/src/data_scanner_re.re
diff options
context:
space:
mode:
Diffstat (limited to 'src/data_scanner_re.re')
-rw-r--r--src/data_scanner_re.re277
1 files changed, 277 insertions, 0 deletions
diff --git a/src/data_scanner_re.re b/src/data_scanner_re.re
new file mode 100644
index 0000000..904aa9f
--- /dev/null
+++ b/src/data_scanner_re.re
@@ -0,0 +1,277 @@
+/**
+ * Copyright (c) 2015, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+
+#include "base/date_time_scanner.hh"
+#include "config.h"
+#include "data_scanner.hh"
+
+nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
+{
+ data_token_t token_out = DT_INVALID;
+ capture_t cap_all;
+ capture_t cap_inner;
+# define YYCTYPE unsigned char
+# define CAPTURE(tok) { \
+ if (YYCURSOR.val == EMPTY) { \
+ this->ds_next_offset = this->ds_input.length(); \
+ } else { \
+ this->ds_next_offset = YYCURSOR.val - this->ds_input.udata(); \
+ } \
+ cap_all.c_end = this->ds_next_offset; \
+ cap_inner.c_end = this->ds_next_offset; \
+ token_out = tok; \
+ }
+
+# define RET(tok) { \
+ CAPTURE(tok); \
+ return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()}; \
+ }
+ static const unsigned char *EMPTY = (const unsigned char *) "";
+
+ struct _YYCURSOR {
+ YYCTYPE operator*() const {
+ if (this->val < this->lim) {
+ return *val;
+ }
+ return '\0';
+ }
+
+ operator const YYCTYPE *() const {
+ if (this->val < this->lim) {
+ return this->val;
+ }
+ return EMPTY;
+ }
+
+ const YYCTYPE *operator=(const YYCTYPE *rhs) {
+ this->val = rhs;
+ return rhs;
+ }
+
+ const YYCTYPE *operator+(int rhs) {
+ return this->val + rhs;
+ }
+
+ const _YYCURSOR *operator-=(int rhs) {
+ this->val -= rhs;
+ return this;
+ }
+
+ _YYCURSOR& operator++() {
+ this->val += 1;
+ return *this;
+ }
+
+ const YYCTYPE *val{nullptr};
+ const YYCTYPE *lim{nullptr};
+ } YYCURSOR;
+ YYCURSOR = (const unsigned char *) this->ds_input.udata() + this->ds_next_offset;
+ _YYCURSOR yyt1;
+ _YYCURSOR yyt2;
+ _YYCURSOR yyt3;
+ _YYCURSOR yyt4;
+ const YYCTYPE *YYLIMIT = (const unsigned char *) this->ds_input.end();
+ const YYCTYPE *YYMARKER = YYCURSOR;
+
+ YYCURSOR.lim = YYLIMIT;
+
+ cap_all.c_begin = this->ds_next_offset;
+ cap_all.c_end = this->ds_next_offset;
+ cap_inner.c_begin = this->ds_next_offset;
+ cap_inner.c_end = this->ds_next_offset;
+
+ /*!re2c
+ re2c:yyfill:enable = 0;
+ re2c:flags:tags = 1;
+
+ SPACE = [ \t\r];
+ ALPHA = [a-zA-Z];
+ ESC = "\x1b";
+ NUM = [0-9];
+ ALPHANUM = [a-zA-Z0-9_];
+ EOF = "\x00";
+ IPV4SEG = ("25"[0-5]|("2"[0-4]|"1"{0,1}[0-9]){0,1}[0-9]);
+ IPV4ADDR = (IPV4SEG"."){3,3}IPV4SEG;
+ IPV6SEG = [0-9a-fA-F]{1,4};
+ IPV6ADDR = (
+ (IPV6SEG":"){7,7}IPV6SEG|
+ (IPV6SEG":"){1,7}":"|
+ (IPV6SEG":"){1,6}":"IPV6SEG|
+ (IPV6SEG":"){1,5}(":"IPV6SEG){1,2}|
+ (IPV6SEG":"){1,4}(":"IPV6SEG){1,3}|
+ (IPV6SEG":"){1,3}(":"IPV6SEG){1,4}|
+ (IPV6SEG":"){1,2}(":"IPV6SEG){1,5}|
+ IPV6SEG":"((":"IPV6SEG){1,6})|
+ ":"((":"IPV6SEG){1,7}|":")|
+ [a-fA-F0-9]{4}":"(":"IPV6SEG){0,4}"%"[0-9a-zA-Z]{1,}|
+ "::"('ffff'(":0"{1,4}){0,1}":"){0,1}IPV4ADDR|
+ (IPV6SEG":"){1,4}":"IPV4ADDR
+ );
+
+ EOF { return nonstd::nullopt; }
+
+ ("u"|"r")?'"'('\\'.|[^\x00\x1b"\\]|'""')*'"' {
+ CAPTURE(DT_QUOTED_STRING);
+ switch (this->ds_input[cap_inner.c_begin]) {
+ case 'u':
+ case 'r':
+ cap_inner.c_begin += 1;
+ break;
+ }
+ cap_inner.c_begin += 1;
+ cap_inner.c_end -= 1;
+ return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()};
+ }
+ [a-qstv-zA-QSTV-Z]"'" {
+ CAPTURE(DT_WORD);
+ }
+ ("u"|"r")?"'"('\\'.|"''"|[^\x00\x1b'\\])*"'"/[^sS] {
+ CAPTURE(DT_QUOTED_STRING);
+ switch (this->ds_input[cap_inner.c_begin]) {
+ case 'u':
+ case 'r':
+ cap_inner.c_begin += 1;
+ break;
+ }
+ cap_inner.c_begin += 1;
+ cap_inner.c_end -= 1;
+ return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()};
+ }
+ [a-zA-Z0-9]+":/""/"?[^\x00\x1b\r\n\t '"[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
+ ("/"|"./"|"../"|[A-Z]":\\"|"\\\\")("Program Files"(" (x86)")?)?[a-zA-Z0-9_\.\-\~/\\!@#$%^&*()]* { RET(DT_PATH); }
+ (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); }
+ (SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); }
+ [0-9a-fA-F][0-9a-fA-F]((":"|"-")[0-9a-fA-F][0-9a-fA-F])+ {
+ if ((YYCURSOR.val - (this->ds_input.udata() + this->ds_next_offset)) == 17) {
+ RET(DT_MAC_ADDRESS);
+ } else {
+ RET(DT_HEX_DUMP);
+ }
+ }
+ (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4})("T"|" ")NUM{2}":"NUM{2}(":"NUM{2}("."NUM{3,6})?)? {
+ RET(DT_DATE_TIME);
+ }
+ ALPHA{3}(" "NUM|" "NUM{2})" "NUM{2}":"NUM{2}(":"NUM{2}("."NUM{3,6})?)? {
+ RET(DT_DATE_TIME);
+ }
+ (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4}) {
+ RET(DT_DATE);
+ }
+ IPV6ADDR/[^:a-zA-Z0-9] { RET(DT_IPV6_ADDRESS); }
+
+ "<!"[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+))?|SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"))*SPACE*">" {
+ RET(DT_XML_DECL_TAG);
+ }
+
+ "<""?"?[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+))?)*SPACE*("/"|"?")">" {
+ RET(DT_XML_EMPTY_TAG);
+ }
+
+ "<"[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*"="SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+))?)*SPACE*">" {
+ RET(DT_XML_OPEN_TAG);
+ }
+
+ "</"[a-zA-Z0-9_:\-]+SPACE*">" {
+ RET(DT_XML_CLOSE_TAG);
+ }
+
+ "\n"[A-Z][A-Z _\-0-9]+"\n" {
+ RET(DT_H1);
+ }
+
+ ESC"["[0-9=;?]*[a-zA-Z] {
+ RET(DT_CSI);
+ }
+
+ ":" { RET(DT_COLON); }
+ "=" { RET(DT_EQUALS); }
+ "," { RET(DT_COMMA); }
+ ";" { RET(DT_SEMI); }
+ "()" | "{}" | "[]" { RET(DT_EMPTY_CONTAINER); }
+ "{" { RET(DT_LCURLY); }
+ "}" { RET(DT_RCURLY); }
+ "[" { RET(DT_LSQUARE); }
+ "]" { RET(DT_RSQUARE); }
+ "(" { RET(DT_LPAREN); }
+ ")" { RET(DT_RPAREN); }
+ "<" { RET(DT_LANGLE); }
+ ">" { RET(DT_RANGLE); }
+
+ IPV4ADDR/[^0-9] {
+ RET(DT_IPV4_ADDRESS);
+ }
+
+ [0-9a-fA-F]{8}("-"[0-9a-fA-F]{4}){3}"-"[0-9a-fA-F]{12} { RET(DT_UUID); }
+
+ (NUM{4}" "NUM{4}" "NUM{4}" "NUM{4}|NUM{16})/[^0-9] {
+ CAPTURE(DT_CREDIT_CARD_NUMBER);
+ if (!this->is_credit_card(this->to_string_fragment(cap_all))) {
+ if (cap_all.length() > 16) {
+ cap_all.c_end = cap_all.c_begin + 4;
+ cap_inner.c_end = cap_inner.c_begin + 4;
+ }
+ this->ds_next_offset = cap_all.c_end;
+ token_out = DT_NUMBER;
+ }
+ return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()};
+ }
+
+ [0-9]"."[0-9]+'e'[\-\+][0-9]+ { RET(DT_NUMBER); }
+
+ [0-9]+("."[0-9]+[a-zA-Z0-9_]*){2,}("-"[a-zA-Z0-9_]+)?|[0-9]+("."[0-9]+[a-zA-Z0-9_]*)+"-"[a-zA-Z0-9_]+ {
+ RET(DT_VERSION_NUMBER);
+ }
+
+ "-"?"0"[0-7]+ { RET(DT_OCTAL_NUMBER); }
+ "-"?[0-9]+("."[0-9]+)?[ ]*"%" { RET(DT_PERCENTAGE); }
+ "-"?[0-9]+("."[0-9]+)?([eE][\-+][0-9]+)? { RET(DT_NUMBER); }
+ "-"?("0x"|[0-9])[0-9a-fA-F]+ { RET(DT_HEX_NUMBER); }
+
+ [a-zA-Z0-9\._%+-]+"@"[a-zA-Z0-9\.-]+"."[a-zA-Z]+ { RET(DT_EMAIL); }
+
+ "true"|"True"|"TRUE"|"false"|"False"|"FALSE"|"None"|"null"|"NULL"/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_CONSTANT); }
+
+ ("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); }
+
+ [^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\.\\][^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]*("::"[^\x00\x1b"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
+ RET(DT_SYMBOL);
+ }
+
+ ("\r"?"\n"|"\\n") { RET(DT_LINE); }
+ SPACE+ { RET(DT_WHITE); }
+ "." { RET(DT_DOT); }
+ "\\". { RET(DT_ESCAPED_CHAR); }
+ . { RET(DT_GARBAGE); }
+
+ */
+}