diff options
Diffstat (limited to '')
-rw-r--r-- | collectors/log2journal/log2journal-json.c | 630 |
1 files changed, 630 insertions, 0 deletions
diff --git a/collectors/log2journal/log2journal-json.c b/collectors/log2journal/log2journal-json.c new file mode 100644 index 00000000..2ca294e4 --- /dev/null +++ b/collectors/log2journal/log2journal-json.c @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "log2journal.h" + +#define JSON_ERROR_LINE_MAX 1024 +#define JSON_KEY_MAX 1024 +#define JSON_DEPTH_MAX 100 + +struct log_json_state { + LOG_JOB *jb; + + const char *line; + uint32_t pos; + uint32_t depth; + char *stack[JSON_DEPTH_MAX]; + + char key[JSON_KEY_MAX]; + char msg[JSON_ERROR_LINE_MAX]; +}; + +static inline bool json_parse_object(LOG_JSON_STATE *js); +static inline bool json_parse_array(LOG_JSON_STATE *js); + +#define json_current_pos(js) &(js)->line[(js)->pos] +#define json_consume_char(js) ++(js)->pos + +static inline void json_process_key_value(LOG_JSON_STATE *js, const char *value, size_t len) { + log_job_send_extracted_key_value(js->jb, js->key, value, len); +} + +static inline void json_skip_spaces(LOG_JSON_STATE *js) { + const char *s = json_current_pos(js); + const char *start = s; + + while(isspace(*s)) s++; + + js->pos += s - start; +} + +static inline bool json_expect_char_after_white_space(LOG_JSON_STATE *js, const char *expected) { + json_skip_spaces(js); + + const char *s = json_current_pos(js); + for(const char *e = expected; *e ;e++) { + if (*s == *e) + return true; + } + + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: character '%c' is not one of the expected characters (%s), at pos %zu", + *s ? *s : '?', expected, js->pos); + + return false; +} + +static inline bool json_parse_null(LOG_JSON_STATE *js) { + const char *s = json_current_pos(js); + if (strncmp(s, "null", 4) == 0) { + json_process_key_value(js, "null", 4); + js->pos += 4; + return true; + } + else { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: expected 'null', found '%.4s' at position %zu", s, js->pos); + return false; + } +} + +static inline bool json_parse_true(LOG_JSON_STATE *js) { + const char *s = json_current_pos(js); + if (strncmp(s, "true", 4) == 0) { + json_process_key_value(js, "true", 4); + js->pos += 4; + return true; + } + else { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: expected 'true', found '%.4s' at position %zu", s, js->pos); + return false; + } +} + +static inline bool json_parse_false(LOG_JSON_STATE *js) { + const char *s = json_current_pos(js); + if (strncmp(s, "false", 5) == 0) { + json_process_key_value(js, "false", 5); + js->pos += 5; + return true; + } + else { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: expected 'false', found '%.4s' at position %zu", s, js->pos); + return false; + } +} + +static inline bool json_parse_number(LOG_JSON_STATE *js) { + static __thread char value[8192]; + + value[0] = '\0'; + char *d = value; + const char *s = json_current_pos(js); + size_t remaining = sizeof(value) - 1; // Reserve space for null terminator + + // Optional minus sign + if (*s == '-') { + *d++ = *s++; + remaining--; + } + + // Digits before decimal point + while (*s >= '0' && *s <= '9') { + if (remaining < 2) { + snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated number value at pos %zu", js->pos); + return false; + } + *d++ = *s++; + remaining--; + } + + // Decimal point and fractional part + if (*s == '.') { + *d++ = *s++; + remaining--; + + while (*s >= '0' && *s <= '9') { + if (remaining < 2) { + snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated fractional part at pos %zu", js->pos); + return false; + } + *d++ = *s++; + remaining--; + } + } + + // Exponent part + if (*s == 'e' || *s == 'E') { + *d++ = *s++; + remaining--; + + // Optional sign in exponent + if (*s == '+' || *s == '-') { + *d++ = *s++; + remaining--; + } + + while (*s >= '0' && *s <= '9') { + if (remaining < 2) { + snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated exponent at pos %zu", js->pos); + return false; + } + *d++ = *s++; + remaining--; + } + } + + *d = '\0'; + js->pos += d - value; + + if (d > value) { + json_process_key_value(js, value, d - value); + return true; + } else { + snprintf(js->msg, sizeof(js->msg), "JSON PARSER: invalid number format at pos %zu", js->pos); + return false; + } +} + +static inline bool encode_utf8(unsigned codepoint, char **d, size_t *remaining) { + if (codepoint <= 0x7F) { + // 1-byte sequence + if (*remaining < 2) return false; // +1 for the null + *(*d)++ = (char)codepoint; + (*remaining)--; + } + else if (codepoint <= 0x7FF) { + // 2-byte sequence + if (*remaining < 3) return false; // +1 for the null + *(*d)++ = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); + *(*d)++ = (char)(0x80 | (codepoint & 0x3F)); + (*remaining) -= 2; + } + else if (codepoint <= 0xFFFF) { + // 3-byte sequence + if (*remaining < 4) return false; // +1 for the null + *(*d)++ = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); + *(*d)++ = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + *(*d)++ = (char)(0x80 | (codepoint & 0x3F)); + (*remaining) -= 3; + } + else if (codepoint <= 0x10FFFF) { + // 4-byte sequence + if (*remaining < 5) return false; // +1 for the null + *(*d)++ = (char)(0xF0 | ((codepoint >> 18) & 0x07)); + *(*d)++ = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + *(*d)++ = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + *(*d)++ = (char)(0x80 | (codepoint & 0x3F)); + (*remaining) -= 4; + } + else + // Invalid code point + return false; + + return true; +} + +size_t parse_surrogate(const char *s, char *d, size_t *remaining) { + if (s[0] != '\\' || (s[1] != 'u' && s[1] != 'U')) { + return 0; // Not a valid Unicode escape sequence + } + + char hex[9] = {0}; // Buffer for the hexadecimal value + unsigned codepoint; + + if (s[1] == 'u') { + // Handle \uXXXX + if (!isxdigit(s[2]) || !isxdigit(s[3]) || !isxdigit(s[4]) || !isxdigit(s[5])) { + return 0; // Not a valid \uXXXX sequence + } + + hex[0] = s[2]; + hex[1] = s[3]; + hex[2] = s[4]; + hex[3] = s[5]; + codepoint = (unsigned)strtoul(hex, NULL, 16); + + if (codepoint >= 0xD800 && codepoint <= 0xDBFF) { + // Possible start of surrogate pair + if (s[6] == '\\' && s[7] == 'u' && isxdigit(s[8]) && isxdigit(s[9]) && + isxdigit(s[10]) && isxdigit(s[11])) { + // Valid low surrogate + unsigned low_surrogate = strtoul(&s[8], NULL, 16); + if (low_surrogate < 0xDC00 || low_surrogate > 0xDFFF) { + return 0; // Invalid low surrogate + } + codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low_surrogate - 0xDC00); + return encode_utf8(codepoint, &d, remaining) ? 12 : 0; // \uXXXX\uXXXX + } + } + + // Single \uXXXX + return encode_utf8(codepoint, &d, remaining) ? 6 : 0; + } + else { + // Handle \UXXXXXXXX + for (int i = 2; i < 10; i++) { + if (!isxdigit(s[i])) { + return 0; // Not a valid \UXXXXXXXX sequence + } + hex[i - 2] = s[i]; + } + codepoint = (unsigned)strtoul(hex, NULL, 16); + return encode_utf8(codepoint, &d, remaining) ? 10 : 0; // \UXXXXXXXX + } +} + +static inline void copy_newline(LOG_JSON_STATE *js __maybe_unused, char **d, size_t *remaining) { + if(*remaining > 3) { + *(*d)++ = '\\'; + *(*d)++ = 'n'; + (*remaining) -= 2; + } +} + +static inline void copy_tab(LOG_JSON_STATE *js __maybe_unused, char **d, size_t *remaining) { + if(*remaining > 3) { + *(*d)++ = '\\'; + *(*d)++ = 't'; + (*remaining) -= 2; + } +} + +static inline bool json_parse_string(LOG_JSON_STATE *js) { + static __thread char value[JOURNAL_MAX_VALUE_LEN]; + + if(!json_expect_char_after_white_space(js, "\"")) + return false; + + json_consume_char(js); + + value[0] = '\0'; + char *d = value; + const char *s = json_current_pos(js); + size_t remaining = sizeof(value); + + while (*s && *s != '"') { + char c; + + if (*s == '\\') { + s++; + + switch (*s) { + case 'n': + copy_newline(js, &d, &remaining); + s++; + continue; + + case 't': + copy_tab(js, &d, &remaining); + s++; + continue; + + case 'f': + case 'b': + case 'r': + c = ' '; + s++; + break; + + case 'u': { + size_t old_remaining = remaining; + size_t consumed = parse_surrogate(s - 1, d, &remaining); + if (consumed > 0) { + s += consumed - 1; // -1 because we already incremented s after '\\' + d += old_remaining - remaining; + continue; + } + else { + *d++ = '\\'; + remaining--; + c = *s++; + } + } + break; + + default: + c = *s++; + break; + } + } + else + c = *s++; + + if(remaining < 2) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: truncated string value at pos %zu", js->pos); + return false; + } + else { + *d++ = c; + remaining--; + } + } + *d = '\0'; + js->pos += s - json_current_pos(js); + + if(!json_expect_char_after_white_space(js, "\"")) + return false; + + json_consume_char(js); + + if(d > value) + json_process_key_value(js, value, d - value); + + return true; +} + +static inline bool json_parse_key_and_push(LOG_JSON_STATE *js) { + if (!json_expect_char_after_white_space(js, "\"")) + return false; + + if(js->depth >= JSON_DEPTH_MAX - 1) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: object too deep, at pos %zu", js->pos); + return false; + } + + json_consume_char(js); + + char *d = js->stack[js->depth]; + if(js->depth) + *d++ = '_'; + + size_t remaining = sizeof(js->key) - (d - js->key); + + const char *s = json_current_pos(js); + char last_c = '\0'; + while(*s && *s != '\"') { + char c; + + if (*s == '\\') { + s++; + c = (char)((*s == 'u') ? '_' : journal_key_characters_map[(unsigned char)*s]); + s += (*s == 'u') ? 5 : 1; + } + else + c = journal_key_characters_map[(unsigned char)*s++]; + + if(c == '_' && last_c == '_') + continue; + else { + if(remaining < 2) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: key buffer full - keys are too long, at pos %zu", js->pos); + return false; + } + *d++ = c; + remaining--; + } + + last_c = c; + } + *d = '\0'; + js->pos += s - json_current_pos(js); + + if (!json_expect_char_after_white_space(js, "\"")) + return false; + + json_consume_char(js); + + js->stack[++js->depth] = d; + + return true; +} + +static inline bool json_key_pop(LOG_JSON_STATE *js) { + if(js->depth <= 0) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: cannot pop a key at depth %zu, at pos %zu", js->depth, js->pos); + return false; + } + + char *k = js->stack[js->depth--]; + *k = '\0'; + return true; +} + +static inline bool json_parse_value(LOG_JSON_STATE *js) { + if(!json_expect_char_after_white_space(js, "-.0123456789tfn\"{[")) + return false; + + const char *s = json_current_pos(js); + switch(*s) { + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return json_parse_number(js); + + case 't': + return json_parse_true(js); + + case 'f': + return json_parse_false(js); + + case 'n': + return json_parse_null(js); + + case '"': + return json_parse_string(js); + + case '{': + return json_parse_object(js); + + case '[': + return json_parse_array(js); + } + + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: unexpected character at pos %zu", js->pos); + return false; +} + +static inline bool json_key_index_and_push(LOG_JSON_STATE *js, size_t index) { + char *d = js->stack[js->depth]; + if(js->depth > 0) { + *d++ = '_'; + } + + // Convert index to string manually + char temp[32]; + char *t = temp + sizeof(temp) - 1; // Start at the end of the buffer + *t = '\0'; + + do { + *--t = (char)((index % 10) + '0'); + index /= 10; + } while (index > 0); + + size_t remaining = sizeof(js->key) - (d - js->key); + + // Append the index to the key + while (*t) { + if(remaining < 2) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: key buffer full - keys are too long, at pos %zu", js->pos); + return false; + } + + *d++ = *t++; + remaining--; + } + + *d = '\0'; // Null-terminate the key + js->stack[++js->depth] = d; + + return true; +} + +static inline bool json_parse_array(LOG_JSON_STATE *js) { + if(!json_expect_char_after_white_space(js, "[")) + return false; + + json_consume_char(js); + + size_t index = 0; + do { + if(!json_key_index_and_push(js, index)) + return false; + + if(!json_parse_value(js)) + return false; + + json_key_pop(js); + + if(!json_expect_char_after_white_space(js, ",]")) + return false; + + const char *s = json_current_pos(js); + json_consume_char(js); + if(*s == ',') { + index++; + continue; + } + else // } + break; + + } while(true); + + return true; +} + +static inline bool json_parse_object(LOG_JSON_STATE *js) { + if(!json_expect_char_after_white_space(js, "{")) + return false; + + json_consume_char(js); + + do { + if (!json_expect_char_after_white_space(js, "\"")) + return false; + + if(!json_parse_key_and_push(js)) + return false; + + if(!json_expect_char_after_white_space(js, ":")) + return false; + + json_consume_char(js); + + if(!json_parse_value(js)) + return false; + + json_key_pop(js); + + if(!json_expect_char_after_white_space(js, ",}")) + return false; + + const char *s = json_current_pos(js); + json_consume_char(js); + if(*s == ',') + continue; + else // } + break; + + } while(true); + + return true; +} + +LOG_JSON_STATE *json_parser_create(LOG_JOB *jb) { + LOG_JSON_STATE *js = mallocz(sizeof(LOG_JSON_STATE)); + memset(js, 0, sizeof(LOG_JSON_STATE)); + js->jb = jb; + + if(jb->prefix) + copy_to_buffer(js->key, sizeof(js->key), js->jb->prefix, strlen(js->jb->prefix)); + + js->stack[0] = &js->key[strlen(js->key)]; + + return js; +} + +void json_parser_destroy(LOG_JSON_STATE *js) { + if(js) + freez(js); +} + +const char *json_parser_error(LOG_JSON_STATE *js) { + return js->msg; +} + +bool json_parse_document(LOG_JSON_STATE *js, const char *txt) { + js->line = txt; + js->pos = 0; + js->msg[0] = '\0'; + js->stack[0][0] = '\0'; + js->depth = 0; + + if(!json_parse_object(js)) + return false; + + json_skip_spaces(js); + const char *s = json_current_pos(js); + + if(*s) { + snprintf(js->msg, sizeof(js->msg), + "JSON PARSER: excess characters found after document is finished, at pos %zu", js->pos); + return false; + } + + return true; +} + +void json_test(void) { + LOG_JOB jb = { .prefix = "NIGNX_" }; + LOG_JSON_STATE *json = json_parser_create(&jb); + + json_parse_document(json, "{\"value\":\"\\u\\u039A\\u03B1\\u03BB\\u03B7\\u03BC\\u03AD\\u03C1\\u03B1\"}"); + + json_parser_destroy(json); +} |