diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
commit | 2cb7e0aaedad73b076ea18c6900b0e86c5760d79 (patch) | |
tree | da68ca54bb79f4080079bf0828acda937593a4e1 /src/journal | |
parent | Initial commit. (diff) | |
download | systemd-upstream.tar.xz systemd-upstream.zip |
Adding upstream version 247.3.upstream/247.3upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/journal')
70 files changed, 27907 insertions, 0 deletions
diff --git a/src/journal/audit-type.c b/src/journal/audit-type.c new file mode 100644 index 0000000..122cdf5 --- /dev/null +++ b/src/journal/audit-type.c @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "audit-type.h" +#include "missing_audit.h" + +#include "audit_type-to-name.h" diff --git a/src/journal/audit-type.h b/src/journal/audit-type.h new file mode 100644 index 0000000..f2c4898 --- /dev/null +++ b/src/journal/audit-type.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdio.h> + +#include "alloc-util.h" +#include "macro.h" + +const char *audit_type_to_string(int type); +int audit_type_from_string(const char *s); + +/* This is inspired by DNS TYPEnnn formatting */ +#define audit_type_name_alloca(type) \ + ({ \ + const char *_s_; \ + _s_ = audit_type_to_string(type); \ + if (!_s_) { \ + _s_ = newa(char, STRLEN("AUDIT") + DECIMAL_STR_MAX(int)); \ + sprintf((char*) _s_, "AUDIT%04i", type); \ + } \ + _s_; \ + }) diff --git a/src/journal/audit_type-to-name.awk b/src/journal/audit_type-to-name.awk new file mode 100644 index 0000000..44fc702 --- /dev/null +++ b/src/journal/audit_type-to-name.awk @@ -0,0 +1,9 @@ +BEGIN{ + print "const char *audit_type_to_string(int type) {\n\tswitch(type) {" +} +{ + printf " case AUDIT_%s: return \"%s\";\n", $1, $1 +} +END{ + print " default: return NULL;\n\t}\n}\n" +} diff --git a/src/journal/cat.c b/src/journal/cat.c new file mode 100644 index 0000000..bccf615 --- /dev/null +++ b/src/journal/cat.c @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "main-func.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "string-util.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "util.h" + +static const char *arg_identifier = NULL; +static int arg_priority = LOG_INFO; +static int arg_stderr_priority = -1; +static bool arg_level_prefix = true; + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-cat", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n" + "\n%sExecute process with stdout/stderr connected to the journal.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -t --identifier=STRING Set syslog identifier\n" + " -p --priority=PRIORITY Set priority value (0..7)\n" + " --stderr-priority=PRIORITY Set priority value (0..7) used for stderr\n" + " --level-prefix=BOOL Control whether level prefix shall be parsed\n" + "\nSee the %s for details.\n" + , program_invocation_short_name + , ansi_highlight(), ansi_normal() + , link + ); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_STDERR_PRIORITY, + ARG_LEVEL_PREFIX + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "identifier", required_argument, NULL, 't' }, + { "priority", required_argument, NULL, 'p' }, + { "stderr-priority", required_argument, NULL, ARG_STDERR_PRIORITY }, + { "level-prefix", required_argument, NULL, ARG_LEVEL_PREFIX }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "+ht:p:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + help(); + return 0; + + case ARG_VERSION: + return version(); + + case 't': + if (isempty(optarg)) + arg_identifier = NULL; + else + arg_identifier = optarg; + break; + + case 'p': + arg_priority = log_level_from_string(optarg); + if (arg_priority < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse priority value."); + break; + + case ARG_STDERR_PRIORITY: + arg_stderr_priority = log_level_from_string(optarg); + if (arg_stderr_priority < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse stderr priority value."); + break; + + case ARG_LEVEL_PREFIX: { + int k; + + k = parse_boolean(optarg); + if (k < 0) + return log_error_errno(k, "Failed to parse level prefix value."); + + arg_level_prefix = k; + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached("Unhandled option"); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_close_ int outfd = -1, errfd = -1, saved_stderr = -1; + int r; + + log_setup_cli(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + outfd = sd_journal_stream_fd(arg_identifier, arg_priority, arg_level_prefix); + if (outfd < 0) + return log_error_errno(outfd, "Failed to create stream fd: %m"); + + if (arg_stderr_priority >= 0 && arg_stderr_priority != arg_priority) { + errfd = sd_journal_stream_fd(arg_identifier, arg_stderr_priority, arg_level_prefix); + if (errfd < 0) + return log_error_errno(errfd, "Failed to create stream fd: %m"); + } + + saved_stderr = fcntl(STDERR_FILENO, F_DUPFD_CLOEXEC, 3); + + r = rearrange_stdio(STDIN_FILENO, outfd, errfd < 0 ? outfd : errfd); /* Invalidates fd on success + error! */ + TAKE_FD(outfd); + TAKE_FD(errfd); + if (r < 0) + return log_error_errno(r, "Failed to rearrange stdout/stderr: %m"); + + if (argc <= optind) + (void) execl("/bin/cat", "/bin/cat", NULL); + else + (void) execvp(argv[optind], argv + optind); + r = -errno; + + /* Let's try to restore a working stderr, so we can print the error message */ + if (saved_stderr >= 0) + (void) dup3(saved_stderr, STDERR_FILENO, 0); + + return log_error_errno(r, "Failed to execute process: %m"); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal/catalog.c b/src/journal/catalog.c new file mode 100644 index 0000000..0f6ad8a --- /dev/null +++ b/src/journal/catalog.c @@ -0,0 +1,742 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <locale.h> +#include <stdio.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "conf-files.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "log.h" +#include "memory-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "siphash24.h" +#include "sort-util.h" +#include "sparse-endian.h" +#include "strbuf.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" + +const char * const catalog_file_dirs[] = { + "/usr/local/lib/systemd/catalog/", + "/usr/lib/systemd/catalog/", + NULL +}; + +#define CATALOG_SIGNATURE { 'R', 'H', 'H', 'H', 'K', 'S', 'L', 'P' } + +typedef struct CatalogHeader { + uint8_t signature[8]; /* "RHHHKSLP" */ + le32_t compatible_flags; + le32_t incompatible_flags; + le64_t header_size; + le64_t n_items; + le64_t catalog_item_size; +} CatalogHeader; + +typedef struct CatalogItem { + sd_id128_t id; + char language[32]; /* One byte is used for termination, so the maximum allowed + * length of the string is actually 31 bytes. */ + le64_t offset; +} CatalogItem; + +static void catalog_hash_func(const CatalogItem *i, struct siphash *state) { + siphash24_compress(&i->id, sizeof(i->id), state); + siphash24_compress_string(i->language, state); +} + +static int catalog_compare_func(const CatalogItem *a, const CatalogItem *b) { + unsigned k; + int r; + + for (k = 0; k < ELEMENTSOF(b->id.bytes); k++) { + r = CMP(a->id.bytes[k], b->id.bytes[k]); + if (r != 0) + return r; + } + + return strcmp(a->language, b->language); +} + +DEFINE_HASH_OPS(catalog_hash_ops, CatalogItem, catalog_hash_func, catalog_compare_func); + +static bool next_header(const char **s) { + const char *e; + + e = strchr(*s, '\n'); + + /* Unexpected end */ + if (!e) + return false; + + /* End of headers */ + if (e == *s) + return false; + + *s = e + 1; + return true; +} + +static const char *skip_header(const char *s) { + while (next_header(&s)) + ; + return s; +} + +static char *combine_entries(const char *one, const char *two) { + const char *b1, *b2; + size_t l1, l2, n; + char *dest, *p; + + /* Find split point of headers to body */ + b1 = skip_header(one); + b2 = skip_header(two); + + l1 = strlen(one); + l2 = strlen(two); + dest = new(char, l1 + l2 + 1); + if (!dest) { + log_oom(); + return NULL; + } + + p = dest; + + /* Headers from @one */ + n = b1 - one; + p = mempcpy(p, one, n); + + /* Headers from @two, these will only be found if not present above */ + n = b2 - two; + p = mempcpy(p, two, n); + + /* Body from @one */ + n = l1 - (b1 - one); + if (n > 0) { + memcpy(p, b1, n); + p += n; + + /* Body from @two */ + } else { + n = l2 - (b2 - two); + memcpy(p, b2, n); + p += n; + } + + assert(p - dest <= (ptrdiff_t)(l1 + l2)); + p[0] = '\0'; + return dest; +} + +static int finish_item( + OrderedHashmap *h, + sd_id128_t id, + const char *language, + char *payload, size_t payload_size) { + + _cleanup_free_ CatalogItem *i = NULL; + _cleanup_free_ char *prev = NULL, *combined = NULL; + + assert(h); + assert(payload); + assert(payload_size > 0); + + i = new0(CatalogItem, 1); + if (!i) + return log_oom(); + + i->id = id; + if (language) { + assert(strlen(language) > 1 && strlen(language) < 32); + strcpy(i->language, language); + } + + prev = ordered_hashmap_get(h, i); + if (prev) { + /* Already have such an item, combine them */ + combined = combine_entries(payload, prev); + if (!combined) + return log_oom(); + + if (ordered_hashmap_update(h, i, combined) < 0) + return log_oom(); + combined = NULL; + } else { + /* A new item */ + combined = memdup(payload, payload_size + 1); + if (!combined) + return log_oom(); + + if (ordered_hashmap_put(h, i, combined) < 0) + return log_oom(); + i = NULL; + combined = NULL; + } + + return 0; +} + +int catalog_file_lang(const char* filename, char **lang) { + char *beg, *end, *_lang; + + end = endswith(filename, ".catalog"); + if (!end) + return 0; + + beg = end - 1; + while (beg > filename && !IN_SET(*beg, '.', '/') && end - beg < 32) + beg--; + + if (*beg != '.' || end <= beg + 1) + return 0; + + _lang = strndup(beg + 1, end - beg - 1); + if (!_lang) + return -ENOMEM; + + *lang = _lang; + return 1; +} + +static int catalog_entry_lang( + const char* filename, + unsigned line, + const char* t, + const char* deflang, + char **ret) { + + size_t c; + char *z; + + c = strlen(t); + if (c < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] Language too short.", filename, line); + if (c > 31) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] language too long.", filename, line); + + if (deflang) { + if (streq(t, deflang)) { + log_warning("[%s:%u] language specified unnecessarily", filename, line); + return 0; + } + + log_warning("[%s:%u] language differs from default for file", filename, line); + } + + z = strdup(t); + if (!z) + return -ENOMEM; + + *ret = z; + return 0; +} + +int catalog_import_file(OrderedHashmap *h, const char *path) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *payload = NULL; + size_t payload_size = 0, payload_allocated = 0; + unsigned n = 0; + sd_id128_t id; + _cleanup_free_ char *deflang = NULL, *lang = NULL; + bool got_id = false, empty_line = true; + int r; + + assert(h); + assert(path); + + f = fopen(path, "re"); + if (!f) + return log_error_errno(errno, "Failed to open file %s: %m", path); + + r = catalog_file_lang(path, &deflang); + if (r < 0) + log_error_errno(r, "Failed to determine language for file %s: %m", path); + if (r == 1) + log_debug("File %s has language %s.", path, deflang); + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t line_len; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read file %s: %m", path); + if (r == 0) + break; + + n++; + + if (isempty(line)) { + empty_line = true; + continue; + } + + if (strchr(COMMENTS, line[0])) + continue; + + if (empty_line && + strlen(line) >= 2+1+32 && + line[0] == '-' && + line[1] == '-' && + line[2] == ' ' && + IN_SET(line[2+1+32], ' ', '\0')) { + + bool with_language; + sd_id128_t jd; + + /* New entry */ + + with_language = line[2+1+32] != '\0'; + line[2+1+32] = '\0'; + + if (sd_id128_from_string(line + 2 + 1, &jd) >= 0) { + + if (got_id) { + if (payload_size == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] No payload text.", + path, + n); + + r = finish_item(h, id, lang ?: deflang, payload, payload_size); + if (r < 0) + return r; + + lang = mfree(lang); + payload_size = 0; + } + + if (with_language) { + char *t; + + t = strstrip(line + 2 + 1 + 32 + 1); + r = catalog_entry_lang(path, n, t, deflang, &lang); + if (r < 0) + return r; + } + + got_id = true; + empty_line = false; + id = jd; + + continue; + } + } + + /* Payload */ + if (!got_id) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] Got payload before ID.", + path, n); + + line_len = strlen(line); + if (!GREEDY_REALLOC(payload, payload_allocated, + payload_size + (empty_line ? 1 : 0) + line_len + 1 + 1)) + return log_oom(); + + if (empty_line) + payload[payload_size++] = '\n'; + memcpy(payload + payload_size, line, line_len); + payload_size += line_len; + payload[payload_size++] = '\n'; + payload[payload_size] = '\0'; + + empty_line = false; + } + + if (got_id) { + if (payload_size == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] No payload text.", + path, n); + + r = finish_item(h, id, lang ?: deflang, payload, payload_size); + if (r < 0) + return r; + } + + return 0; +} + +static int64_t write_catalog( + const char *database, + struct strbuf *sb, + CatalogItem *items, + size_t n) { + + _cleanup_fclose_ FILE *w = NULL; + _cleanup_free_ char *p = NULL; + CatalogHeader header; + size_t k; + int r; + + r = mkdir_parents(database, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create parent directories of %s: %m", database); + + r = fopen_temporary(database, &w, &p); + if (r < 0) + return log_error_errno(r, "Failed to open database for writing: %s: %m", + database); + + header = (CatalogHeader) { + .signature = CATALOG_SIGNATURE, + .header_size = htole64(ALIGN_TO(sizeof(CatalogHeader), 8)), + .catalog_item_size = htole64(sizeof(CatalogItem)), + .n_items = htole64(n), + }; + + r = -EIO; + + k = fwrite(&header, 1, sizeof(header), w); + if (k != sizeof(header)) { + log_error("%s: failed to write header.", p); + goto error; + } + + k = fwrite(items, 1, n * sizeof(CatalogItem), w); + if (k != n * sizeof(CatalogItem)) { + log_error("%s: failed to write database.", p); + goto error; + } + + k = fwrite(sb->buf, 1, sb->len, w); + if (k != sb->len) { + log_error("%s: failed to write strings.", p); + goto error; + } + + r = fflush_and_check(w); + if (r < 0) { + log_error_errno(r, "%s: failed to write database: %m", p); + goto error; + } + + (void) fchmod(fileno(w), 0644); + + if (rename(p, database) < 0) { + r = log_error_errno(errno, "rename (%s -> %s) failed: %m", p, database); + goto error; + } + + return ftello(w); + +error: + (void) unlink(p); + return r; +} + +int catalog_update(const char* database, const char* root, const char* const* dirs) { + _cleanup_strv_free_ char **files = NULL; + char **f; + _cleanup_(strbuf_cleanupp) struct strbuf *sb = NULL; + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + _cleanup_free_ CatalogItem *items = NULL; + ssize_t offset; + char *payload; + CatalogItem *i; + unsigned n; + int r; + int64_t sz; + + h = ordered_hashmap_new(&catalog_hash_ops); + sb = strbuf_new(); + if (!h || !sb) + return log_oom(); + + r = conf_files_list_strv(&files, ".catalog", root, 0, dirs); + if (r < 0) + return log_error_errno(r, "Failed to get catalog files: %m"); + + STRV_FOREACH(f, files) { + log_debug("Reading file '%s'", *f); + r = catalog_import_file(h, *f); + if (r < 0) + return log_error_errno(r, "Failed to import file '%s': %m", *f); + } + + if (ordered_hashmap_size(h) <= 0) { + log_info("No items in catalog."); + return 0; + } else + log_debug("Found %u items in catalog.", ordered_hashmap_size(h)); + + items = new(CatalogItem, ordered_hashmap_size(h)); + if (!items) + return log_oom(); + + n = 0; + ORDERED_HASHMAP_FOREACH_KEY(payload, i, h) { + log_debug("Found " SD_ID128_FORMAT_STR ", language %s", + SD_ID128_FORMAT_VAL(i->id), + isempty(i->language) ? "C" : i->language); + + offset = strbuf_add_string(sb, payload, strlen(payload)); + if (offset < 0) + return log_oom(); + + i->offset = htole64((uint64_t) offset); + items[n++] = *i; + } + + assert(n == ordered_hashmap_size(h)); + typesafe_qsort(items, n, catalog_compare_func); + + strbuf_complete(sb); + + sz = write_catalog(database, sb, items, n); + if (sz < 0) + return log_error_errno(sz, "Failed to write %s: %m", database); + + log_debug("%s: wrote %u items, with %zu bytes of strings, %"PRIi64" total size.", + database, n, sb->len, sz); + return 0; +} + +static int open_mmap(const char *database, int *_fd, struct stat *_st, void **_p) { + _cleanup_close_ int fd = -1; + const CatalogHeader *h; + struct stat st; + void *p; + + assert(_fd); + assert(_st); + assert(_p); + + fd = open(database, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (st.st_size < (off_t) sizeof(CatalogHeader)) + return -EINVAL; + + p = mmap(NULL, PAGE_ALIGN(st.st_size), PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + return -errno; + + h = p; + if (memcmp(h->signature, (const uint8_t[]) CATALOG_SIGNATURE, sizeof(h->signature)) != 0 || + le64toh(h->header_size) < sizeof(CatalogHeader) || + le64toh(h->catalog_item_size) < sizeof(CatalogItem) || + h->incompatible_flags != 0 || + le64toh(h->n_items) <= 0 || + st.st_size < (off_t) (le64toh(h->header_size) + le64toh(h->catalog_item_size) * le64toh(h->n_items))) { + munmap(p, st.st_size); + return -EBADMSG; + } + + *_fd = TAKE_FD(fd); + *_st = st; + *_p = p; + + return 0; +} + +static const char *find_id(void *p, sd_id128_t id) { + CatalogItem *f = NULL, key = { .id = id }; + const CatalogHeader *h = p; + const char *loc; + + loc = setlocale(LC_MESSAGES, NULL); + if (!isempty(loc) && !STR_IN_SET(loc, "C", "POSIX")) { + size_t len; + + len = strcspn(loc, ".@"); + if (len > sizeof(key.language) - 1) + log_debug("LC_MESSAGES value too long, ignoring: \"%.*s\"", (int) len, loc); + else { + strncpy(key.language, loc, len); + key.language[len] = '\0'; + + f = bsearch(&key, + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + if (!f) { + char *e; + + e = strchr(key.language, '_'); + if (e) { + *e = 0; + f = bsearch(&key, + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + } + } + } + } + + if (!f) { + zero(key.language); + f = bsearch(&key, + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + } + + if (!f) + return NULL; + + return (const char*) p + + le64toh(h->header_size) + + le64toh(h->n_items) * le64toh(h->catalog_item_size) + + le64toh(f->offset); +} + +int catalog_get(const char* database, sd_id128_t id, char **_text) { + _cleanup_close_ int fd = -1; + void *p = NULL; + struct stat st = {}; + char *text = NULL; + int r; + const char *s; + + assert(_text); + + r = open_mmap(database, &fd, &st, &p); + if (r < 0) + return r; + + s = find_id(p, id); + if (!s) { + r = -ENOENT; + goto finish; + } + + text = strdup(s); + if (!text) { + r = -ENOMEM; + goto finish; + } + + *_text = text; + r = 0; + +finish: + if (p) + munmap(p, st.st_size); + + return r; +} + +static char *find_header(const char *s, const char *header) { + + for (;;) { + const char *v; + + v = startswith(s, header); + if (v) { + v += strspn(v, WHITESPACE); + return strndup(v, strcspn(v, NEWLINE)); + } + + if (!next_header(&s)) + return NULL; + } +} + +static void dump_catalog_entry(FILE *f, sd_id128_t id, const char *s, bool oneline) { + if (oneline) { + _cleanup_free_ char *subject = NULL, *defined_by = NULL; + + subject = find_header(s, "Subject:"); + defined_by = find_header(s, "Defined-By:"); + + fprintf(f, SD_ID128_FORMAT_STR " %s: %s\n", + SD_ID128_FORMAT_VAL(id), + strna(defined_by), strna(subject)); + } else + fprintf(f, "-- " SD_ID128_FORMAT_STR "\n%s\n", + SD_ID128_FORMAT_VAL(id), s); +} + +int catalog_list(FILE *f, const char *database, bool oneline) { + _cleanup_close_ int fd = -1; + void *p = NULL; + struct stat st; + const CatalogHeader *h; + const CatalogItem *items; + int r; + unsigned n; + sd_id128_t last_id; + bool last_id_set = false; + + r = open_mmap(database, &fd, &st, &p); + if (r < 0) + return r; + + h = p; + items = (const CatalogItem*) ((const uint8_t*) p + le64toh(h->header_size)); + + for (n = 0; n < le64toh(h->n_items); n++) { + const char *s; + + if (last_id_set && sd_id128_equal(last_id, items[n].id)) + continue; + + assert_se(s = find_id(p, items[n].id)); + + dump_catalog_entry(f, items[n].id, s, oneline); + + last_id_set = true; + last_id = items[n].id; + } + + munmap(p, st.st_size); + + return 0; +} + +int catalog_list_items(FILE *f, const char *database, bool oneline, char **items) { + char **item; + int r = 0; + + STRV_FOREACH(item, items) { + sd_id128_t id; + int k; + _cleanup_free_ char *msg = NULL; + + k = sd_id128_from_string(*item, &id); + if (k < 0) { + log_error_errno(k, "Failed to parse id128 '%s': %m", *item); + if (r == 0) + r = k; + continue; + } + + k = catalog_get(database, id, &msg); + if (k < 0) { + log_full_errno(k == -ENOENT ? LOG_NOTICE : LOG_ERR, k, + "Failed to retrieve catalog entry for '%s': %m", *item); + if (r == 0) + r = k; + continue; + } + + dump_catalog_entry(f, id, msg, oneline); + } + + return r; +} diff --git a/src/journal/catalog.h b/src/journal/catalog.h new file mode 100644 index 0000000..df27869 --- /dev/null +++ b/src/journal/catalog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <stdio.h> + +#include "sd-id128.h" + +#include "hashmap.h" +#include "strbuf.h" + +int catalog_import_file(OrderedHashmap *h, const char *path); +int catalog_update(const char* database, const char* root, const char* const* dirs); +int catalog_get(const char* database, sd_id128_t id, char **data); +int catalog_list(FILE *f, const char* database, bool oneline); +int catalog_list_items(FILE *f, const char* database, bool oneline, char **items); +int catalog_file_lang(const char *filename, char **lang); +extern const char * const catalog_file_dirs[]; +extern const struct hash_ops catalog_hash_ops; diff --git a/src/journal/compress.c b/src/journal/compress.c new file mode 100644 index 0000000..aaf186b --- /dev/null +++ b/src/journal/compress.c @@ -0,0 +1,1061 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <inttypes.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#if HAVE_XZ +#include <lzma.h> +#endif + +#if HAVE_LZ4 +#include <lz4.h> +#include <lz4frame.h> +#endif + +#if HAVE_ZSTD +#include <zstd.h> +#include <zstd_errors.h> +#endif + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "io-util.h" +#include "journal-def.h" +#include "macro.h" +#include "sparse-endian.h" +#include "string-table.h" +#include "string-util.h" +#include "unaligned.h" +#include "util.h" + +#if HAVE_LZ4 +DEFINE_TRIVIAL_CLEANUP_FUNC(LZ4F_compressionContext_t, LZ4F_freeCompressionContext); +DEFINE_TRIVIAL_CLEANUP_FUNC(LZ4F_decompressionContext_t, LZ4F_freeDecompressionContext); +#endif + +#if HAVE_ZSTD +DEFINE_TRIVIAL_CLEANUP_FUNC(ZSTD_CCtx *, ZSTD_freeCCtx); +DEFINE_TRIVIAL_CLEANUP_FUNC(ZSTD_DCtx *, ZSTD_freeDCtx); + +static int zstd_ret_to_errno(size_t ret) { + switch (ZSTD_getErrorCode(ret)) { + case ZSTD_error_dstSize_tooSmall: + return -ENOBUFS; + case ZSTD_error_memory_allocation: + return -ENOMEM; + default: + return -EBADMSG; + } +} +#endif + +#define ALIGN_8(l) ALIGN_TO(l, sizeof(size_t)) + +static const char* const object_compressed_table[_OBJECT_COMPRESSED_MAX] = { + [OBJECT_COMPRESSED_XZ] = "XZ", + [OBJECT_COMPRESSED_LZ4] = "LZ4", + [OBJECT_COMPRESSED_ZSTD] = "ZSTD", + /* If we add too many more entries here, it's going to grow quite large (and be mostly sparse), since + * the array key is actually a bitmask, not a plain enum */ +}; + +DEFINE_STRING_TABLE_LOOKUP(object_compressed, int); + +int compress_blob_xz(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { +#if HAVE_XZ + static const lzma_options_lzma opt = { + 1u << 20u, NULL, 0, LZMA_LC_DEFAULT, LZMA_LP_DEFAULT, + LZMA_PB_DEFAULT, LZMA_MODE_FAST, 128, LZMA_MF_HC3, 4 + }; + static const lzma_filter filters[] = { + { LZMA_FILTER_LZMA2, (lzma_options_lzma*) &opt }, + { LZMA_VLI_UNKNOWN, NULL } + }; + lzma_ret ret; + size_t out_pos = 0; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + /* Returns < 0 if we couldn't compress the data or the + * compressed result is longer than the original */ + + if (src_size < 80) + return -ENOBUFS; + + ret = lzma_stream_buffer_encode((lzma_filter*) filters, LZMA_CHECK_NONE, NULL, + src, src_size, dst, &out_pos, dst_alloc_size); + if (ret != LZMA_OK) + return -ENOBUFS; + + *dst_size = out_pos; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int compress_blob_lz4(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { +#if HAVE_LZ4 + int r; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + /* Returns < 0 if we couldn't compress the data or the + * compressed result is longer than the original */ + + if (src_size < 9) + return -ENOBUFS; + + r = LZ4_compress_default(src, (char*)dst + 8, src_size, (int) dst_alloc_size - 8); + if (r <= 0) + return -ENOBUFS; + + unaligned_write_le64(dst, src_size); + *dst_size = r + 8; + + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int compress_blob_zstd( + const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { +#if HAVE_ZSTD + size_t k; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + k = ZSTD_compress(dst, dst_alloc_size, src, src_size, 0); + if (ZSTD_isError(k)) + return zstd_ret_to_errno(k); + + *dst_size = k; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_xz(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max) { + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + size_t space; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size); + assert(dst_size); + assert(*dst_alloc_size == 0 || *dst); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return -ENOMEM; + + space = MIN(src_size * 2, dst_max ?: (size_t) -1); + if (!greedy_realloc(dst, dst_alloc_size, space, 1)) + return -ENOMEM; + + s.next_in = src; + s.avail_in = src_size; + + s.next_out = *dst; + s.avail_out = space; + + for (;;) { + size_t used; + + ret = lzma_code(&s, LZMA_FINISH); + + if (ret == LZMA_STREAM_END) + break; + else if (ret != LZMA_OK) + return -ENOMEM; + + if (dst_max > 0 && (space - s.avail_out) >= dst_max) + break; + else if (dst_max > 0 && space == dst_max) + return -ENOBUFS; + + used = space - s.avail_out; + space = MIN(2 * space, dst_max ?: (size_t) -1); + if (!greedy_realloc(dst, dst_alloc_size, space, 1)) + return -ENOMEM; + + s.avail_out = space - used; + s.next_out = *(uint8_t**)dst + used; + } + + *dst_size = space - s.avail_out; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_lz4(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max) { + +#if HAVE_LZ4 + char* out; + int r, size; /* LZ4 uses int for size */ + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size); + assert(dst_size); + assert(*dst_alloc_size == 0 || *dst); + + if (src_size <= 8) + return -EBADMSG; + + size = unaligned_read_le64(src); + if (size < 0 || (unsigned) size != unaligned_read_le64(src)) + return -EFBIG; + if ((size_t) size > *dst_alloc_size) { + out = realloc(*dst, size); + if (!out) + return -ENOMEM; + *dst = out; + *dst_alloc_size = size; + } else + out = *dst; + + r = LZ4_decompress_safe((char*)src + 8, out, src_size - 8, size); + if (r < 0 || r != size) + return -EBADMSG; + + *dst_size = size; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_zstd( + const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t *dst_size, size_t dst_max) { + +#if HAVE_ZSTD + uint64_t size; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size); + assert(dst_size); + assert(*dst_alloc_size == 0 || *dst); + + size = ZSTD_getFrameContentSize(src, src_size); + if (IN_SET(size, ZSTD_CONTENTSIZE_ERROR, ZSTD_CONTENTSIZE_UNKNOWN)) + return -EBADMSG; + + if (dst_max > 0 && size > dst_max) + size = dst_max; + if (size > SIZE_MAX) + return -E2BIG; + + if (!(greedy_realloc(dst, dst_alloc_size, MAX(ZSTD_DStreamOutSize(), size), 1))) + return -ENOMEM; + + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) + return -ENOMEM; + + ZSTD_inBuffer input = { + .src = src, + .size = src_size, + }; + ZSTD_outBuffer output = { + .dst = *dst, + .size = *dst_alloc_size, + }; + + size_t k = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(k)) { + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(k)); + return zstd_ret_to_errno(k); + } + assert(output.pos >= size); + + *dst_size = size; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob( + int compression, + const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max) { + + if (compression == OBJECT_COMPRESSED_XZ) + return decompress_blob_xz( + src, src_size, + dst, dst_alloc_size, dst_size, dst_max); + else if (compression == OBJECT_COMPRESSED_LZ4) + return decompress_blob_lz4( + src, src_size, + dst, dst_alloc_size, dst_size, dst_max); + else if (compression == OBJECT_COMPRESSED_ZSTD) + return decompress_blob_zstd( + src, src_size, + dst, dst_alloc_size, dst_size, dst_max); + else + return -EPROTONOSUPPORT; +} + +int decompress_startswith_xz(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra) { + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + + /* Checks whether the decompressed blob starts with the + * mentioned prefix. The byte extra needs to follow the + * prefix */ + + assert(src); + assert(src_size > 0); + assert(buffer); + assert(buffer_size); + assert(prefix); + assert(*buffer_size == 0 || *buffer); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return -EBADMSG; + + if (!(greedy_realloc(buffer, buffer_size, ALIGN_8(prefix_len + 1), 1))) + return -ENOMEM; + + s.next_in = src; + s.avail_in = src_size; + + s.next_out = *buffer; + s.avail_out = *buffer_size; + + for (;;) { + ret = lzma_code(&s, LZMA_FINISH); + + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return -EBADMSG; + + if (*buffer_size - s.avail_out >= prefix_len + 1) + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; + + if (ret == LZMA_STREAM_END) + return 0; + + s.avail_out += *buffer_size; + + if (!(greedy_realloc(buffer, buffer_size, *buffer_size * 2, 1))) + return -ENOMEM; + + s.next_out = *(uint8_t**)buffer + *buffer_size - s.avail_out; + } + +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith_lz4(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra) { +#if HAVE_LZ4 + /* Checks whether the decompressed blob starts with the + * mentioned prefix. The byte extra needs to follow the + * prefix */ + + int r; + + assert(src); + assert(src_size > 0); + assert(buffer); + assert(buffer_size); + assert(prefix); + assert(*buffer_size == 0 || *buffer); + + if (src_size <= 8) + return -EBADMSG; + + if (!(greedy_realloc(buffer, buffer_size, ALIGN_8(prefix_len + 1), 1))) + return -ENOMEM; + + r = LZ4_decompress_safe_partial((char*)src + 8, *buffer, src_size - 8, + prefix_len + 1, *buffer_size); + /* One lz4 < 1.8.3, we might get "failure" (r < 0), or "success" where + * just a part of the buffer is decompressed. But if we get a smaller + * amount of bytes than requested, we don't know whether there isn't enough + * data to fill the requested size or whether we just got a partial answer. + */ + if (r < 0 || (size_t) r < prefix_len + 1) { + size_t size; + + if (LZ4_versionNumber() >= 10803) + /* We trust that the newer lz4 decompresses the number of bytes we + * requested if available in the compressed string. */ + return 0; + + if (r > 0) + /* Compare what we have first, in case of mismatch we can + * shortcut the full comparison. */ + if (memcmp(*buffer, prefix, r) != 0) + return 0; + + /* Before version 1.8.3, lz4 always tries to decode full a "sequence", + * so in pathological cases might need to decompress the full field. */ + r = decompress_blob_lz4(src, src_size, buffer, buffer_size, &size, 0); + if (r < 0) + return r; + + if (size < prefix_len + 1) + return 0; + } + + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith_zstd( + const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra) { +#if HAVE_ZSTD + assert(src); + assert(src_size > 0); + assert(buffer); + assert(buffer_size); + assert(prefix); + assert(*buffer_size == 0 || *buffer); + + uint64_t size = ZSTD_getFrameContentSize(src, src_size); + if (IN_SET(size, ZSTD_CONTENTSIZE_ERROR, ZSTD_CONTENTSIZE_UNKNOWN)) + return -EBADMSG; + + if (size < prefix_len + 1) + return 0; /* Decompressed text too short to match the prefix and extra */ + + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) + return -ENOMEM; + + if (!(greedy_realloc(buffer, buffer_size, MAX(ZSTD_DStreamOutSize(), prefix_len + 1), 1))) + return -ENOMEM; + + ZSTD_inBuffer input = { + .src = src, + .size = src_size, + }; + ZSTD_outBuffer output = { + .dst = *buffer, + .size = *buffer_size, + }; + size_t k; + + k = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(k)) { + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(k)); + return zstd_ret_to_errno(k); + } + assert(output.pos >= prefix_len + 1); + + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith( + int compression, + const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra) { + + if (compression == OBJECT_COMPRESSED_XZ) + return decompress_startswith_xz( + src, src_size, + buffer, buffer_size, + prefix, prefix_len, + extra); + + else if (compression == OBJECT_COMPRESSED_LZ4) + return decompress_startswith_lz4( + src, src_size, + buffer, buffer_size, + prefix, prefix_len, + extra); + else if (compression == OBJECT_COMPRESSED_ZSTD) + return decompress_startswith_zstd( + src, src_size, + buffer, buffer_size, + prefix, prefix_len, + extra); + else + return -EBADMSG; +} + +int compress_stream_xz(int fdf, int fdt, uint64_t max_bytes) { +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + uint8_t buf[BUFSIZ], out[BUFSIZ]; + lzma_action action = LZMA_RUN; + + assert(fdf >= 0); + assert(fdt >= 0); + + ret = lzma_easy_encoder(&s, LZMA_PRESET_DEFAULT, LZMA_CHECK_CRC64); + if (ret != LZMA_OK) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to initialize XZ encoder: code %u", + ret); + + for (;;) { + if (s.avail_in == 0 && action == LZMA_RUN) { + size_t m = sizeof(buf); + ssize_t n; + + if (max_bytes != (uint64_t) -1 && (uint64_t) m > max_bytes) + m = (size_t) max_bytes; + + n = read(fdf, buf, m); + if (n < 0) + return -errno; + if (n == 0) + action = LZMA_FINISH; + else { + s.next_in = buf; + s.avail_in = n; + + if (max_bytes != (uint64_t) -1) { + assert(max_bytes >= (uint64_t) n); + max_bytes -= n; + } + } + } + + if (s.avail_out == 0) { + s.next_out = out; + s.avail_out = sizeof(out); + } + + ret = lzma_code(&s, action); + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Compression failed: code %u", + ret); + + if (s.avail_out == 0 || ret == LZMA_STREAM_END) { + ssize_t n, k; + + n = sizeof(out) - s.avail_out; + + k = loop_write(fdt, out, n, false); + if (k < 0) + return k; + + if (ret == LZMA_STREAM_END) { + log_debug("XZ compression finished (%"PRIu64" -> %"PRIu64" bytes, %.1f%%)", + s.total_in, s.total_out, + (double) s.total_out / s.total_in * 100); + + return 0; + } + } + } +#else + return -EPROTONOSUPPORT; +#endif +} + +#define LZ4_BUFSIZE (512*1024u) + +int compress_stream_lz4(int fdf, int fdt, uint64_t max_bytes) { + +#if HAVE_LZ4 + LZ4F_errorCode_t c; + _cleanup_(LZ4F_freeCompressionContextp) LZ4F_compressionContext_t ctx = NULL; + _cleanup_free_ char *buf = NULL; + char *src = NULL; + size_t size, n, total_in = 0, total_out, offset = 0, frame_size; + struct stat st; + int r; + static const LZ4F_compressOptions_t options = { + .stableSrc = 1, + }; + static const LZ4F_preferences_t preferences = { + .frameInfo.blockSizeID = 5, + }; + + c = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION); + if (LZ4F_isError(c)) + return -ENOMEM; + + if (fstat(fdf, &st) < 0) + return log_debug_errno(errno, "fstat() failed: %m"); + + frame_size = LZ4F_compressBound(LZ4_BUFSIZE, &preferences); + size = frame_size + 64*1024; /* add some space for header and trailer */ + buf = malloc(size); + if (!buf) + return -ENOMEM; + + n = offset = total_out = LZ4F_compressBegin(ctx, buf, size, &preferences); + if (LZ4F_isError(n)) + return -EINVAL; + + src = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fdf, 0); + if (src == MAP_FAILED) + return -errno; + + log_debug("Buffer size is %zu bytes, header size %zu bytes.", size, n); + + while (total_in < (size_t) st.st_size) { + ssize_t k; + + k = MIN(LZ4_BUFSIZE, st.st_size - total_in); + n = LZ4F_compressUpdate(ctx, buf + offset, size - offset, + src + total_in, k, &options); + if (LZ4F_isError(n)) { + r = -ENOTRECOVERABLE; + goto cleanup; + } + + total_in += k; + offset += n; + total_out += n; + + if (max_bytes != (uint64_t) -1 && total_out > (size_t) max_bytes) + return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), + "Compressed stream longer than %" PRIu64 " bytes", + max_bytes); + + if (size - offset < frame_size + 4) { + k = loop_write(fdt, buf, offset, false); + if (k < 0) { + r = k; + goto cleanup; + } + offset = 0; + } + } + + n = LZ4F_compressEnd(ctx, buf + offset, size - offset, &options); + if (LZ4F_isError(n)) { + r = -ENOTRECOVERABLE; + goto cleanup; + } + + offset += n; + total_out += n; + r = loop_write(fdt, buf, offset, false); + if (r < 0) + goto cleanup; + + log_debug("LZ4 compression finished (%zu -> %zu bytes, %.1f%%)", + total_in, total_out, + (double) total_out / total_in * 100); + cleanup: + munmap(src, st.st_size); + return r; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_stream_xz(int fdf, int fdt, uint64_t max_bytes) { + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + + uint8_t buf[BUFSIZ], out[BUFSIZ]; + lzma_action action = LZMA_RUN; + + assert(fdf >= 0); + assert(fdt >= 0); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEM), + "Failed to initialize XZ decoder: code %u", + ret); + + for (;;) { + if (s.avail_in == 0 && action == LZMA_RUN) { + ssize_t n; + + n = read(fdf, buf, sizeof(buf)); + if (n < 0) + return -errno; + if (n == 0) + action = LZMA_FINISH; + else { + s.next_in = buf; + s.avail_in = n; + } + } + + if (s.avail_out == 0) { + s.next_out = out; + s.avail_out = sizeof(out); + } + + ret = lzma_code(&s, action); + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Decompression failed: code %u", + ret); + + if (s.avail_out == 0 || ret == LZMA_STREAM_END) { + ssize_t n, k; + + n = sizeof(out) - s.avail_out; + + if (max_bytes != (uint64_t) -1) { + if (max_bytes < (uint64_t) n) + return -EFBIG; + + max_bytes -= n; + } + + k = loop_write(fdt, out, n, false); + if (k < 0) + return k; + + if (ret == LZMA_STREAM_END) { + log_debug("XZ decompression finished (%"PRIu64" -> %"PRIu64" bytes, %.1f%%)", + s.total_in, s.total_out, + (double) s.total_out / s.total_in * 100); + + return 0; + } + } + } +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file. Compiled without XZ support."); +#endif +} + +int decompress_stream_lz4(int in, int out, uint64_t max_bytes) { +#if HAVE_LZ4 + size_t c; + _cleanup_(LZ4F_freeDecompressionContextp) LZ4F_decompressionContext_t ctx = NULL; + _cleanup_free_ char *buf = NULL; + char *src; + struct stat st; + int r = 0; + size_t total_in = 0, total_out = 0; + + c = LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION); + if (LZ4F_isError(c)) + return -ENOMEM; + + if (fstat(in, &st) < 0) + return log_debug_errno(errno, "fstat() failed: %m"); + + buf = malloc(LZ4_BUFSIZE); + if (!buf) + return -ENOMEM; + + src = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, in, 0); + if (src == MAP_FAILED) + return -errno; + + while (total_in < (size_t) st.st_size) { + size_t produced = LZ4_BUFSIZE; + size_t used = st.st_size - total_in; + + c = LZ4F_decompress(ctx, buf, &produced, src + total_in, &used, NULL); + if (LZ4F_isError(c)) { + r = -EBADMSG; + goto cleanup; + } + + total_in += used; + total_out += produced; + + if (max_bytes != (uint64_t) -1 && total_out > (size_t) max_bytes) { + log_debug("Decompressed stream longer than %"PRIu64" bytes", max_bytes); + r = -EFBIG; + goto cleanup; + } + + r = loop_write(out, buf, produced, false); + if (r < 0) + goto cleanup; + } + + log_debug("LZ4 decompression finished (%zu -> %zu bytes, %.1f%%)", + total_in, total_out, + total_in > 0 ? (double) total_out / total_in * 100 : 0.0); + cleanup: + munmap(src, st.st_size); + return r; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file. Compiled without LZ4 support."); +#endif +} + +int compress_stream_zstd(int fdf, int fdt, uint64_t max_bytes) { +#if HAVE_ZSTD + _cleanup_(ZSTD_freeCCtxp) ZSTD_CCtx *cctx = NULL; + _cleanup_free_ void *in_buff = NULL, *out_buff = NULL; + size_t in_allocsize, out_allocsize; + size_t z; + uint64_t left = max_bytes, in_bytes = 0; + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Create the context and buffers */ + in_allocsize = ZSTD_CStreamInSize(); + out_allocsize = ZSTD_CStreamOutSize(); + in_buff = malloc(in_allocsize); + out_buff = malloc(out_allocsize); + cctx = ZSTD_createCCtx(); + if (!cctx || !out_buff || !in_buff) + return -ENOMEM; + + z = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); + if (ZSTD_isError(z)) + log_debug("Failed to enable ZSTD checksum, ignoring: %s", ZSTD_getErrorName(z)); + + /* This loop read from the input file, compresses that entire chunk, + * and writes all output produced to the output file. + */ + for (;;) { + bool is_last_chunk; + ZSTD_inBuffer input = { + .src = in_buff, + .size = 0, + .pos = 0 + }; + ssize_t red; + + red = loop_read(fdf, in_buff, in_allocsize, true); + if (red < 0) + return red; + is_last_chunk = red == 0; + + in_bytes += (size_t) red; + input.size = (size_t) red; + + for (bool finished = false; !finished;) { + ZSTD_outBuffer output = { + .dst = out_buff, + .size = out_allocsize, + .pos = 0 + }; + size_t remaining; + ssize_t wrote; + + /* Compress into the output buffer and write all of the + * output to the file so we can reuse the buffer next + * iteration. + */ + remaining = ZSTD_compressStream2( + cctx, &output, &input, + is_last_chunk ? ZSTD_e_end : ZSTD_e_continue); + + if (ZSTD_isError(remaining)) { + log_debug("ZSTD encoder failed: %s", ZSTD_getErrorName(remaining)); + return zstd_ret_to_errno(remaining); + } + + if (left < output.pos) + return -EFBIG; + + wrote = loop_write(fdt, output.dst, output.pos, 1); + if (wrote < 0) + return wrote; + + left -= output.pos; + + /* If we're on the last chunk we're finished when zstd + * returns 0, which means its consumed all the input AND + * finished the frame. Otherwise, we're finished when + * we've consumed all the input. + */ + finished = is_last_chunk ? (remaining == 0) : (input.pos == input.size); + } + + /* zstd only returns 0 when the input is completely consumed */ + assert(input.pos == input.size); + if (is_last_chunk) + break; + } + + if (in_bytes > 0) + log_debug("ZSTD compression finished (%" PRIu64 " -> %" PRIu64 " bytes, %.1f%%)", + in_bytes, max_bytes - left, (double) (max_bytes - left) / in_bytes * 100); + else + log_debug("ZSTD compression finished (%" PRIu64 " -> %" PRIu64 " bytes)", + in_bytes, max_bytes - left); + + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_stream_zstd(int fdf, int fdt, uint64_t max_bytes) { +#if HAVE_ZSTD + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = NULL; + _cleanup_free_ void *in_buff = NULL, *out_buff = NULL; + size_t in_allocsize, out_allocsize; + size_t last_result = 0; + uint64_t left = max_bytes, in_bytes = 0; + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Create the context and buffers */ + in_allocsize = ZSTD_DStreamInSize(); + out_allocsize = ZSTD_DStreamOutSize(); + in_buff = malloc(in_allocsize); + out_buff = malloc(out_allocsize); + dctx = ZSTD_createDCtx(); + if (!dctx || !out_buff || !in_buff) + return -ENOMEM; + + /* This loop assumes that the input file is one or more concatenated + * zstd streams. This example won't work if there is trailing non-zstd + * data at the end, but streaming decompression in general handles this + * case. ZSTD_decompressStream() returns 0 exactly when the frame is + * completed, and doesn't consume input after the frame. + */ + for (;;) { + bool has_error = false; + ZSTD_inBuffer input = { + .src = in_buff, + .size = 0, + .pos = 0 + }; + ssize_t red; + + red = loop_read(fdf, in_buff, in_allocsize, true); + if (red < 0) + return red; + if (red == 0) + break; + + in_bytes += (size_t) red; + input.size = (size_t) red; + input.pos = 0; + + /* Given a valid frame, zstd won't consume the last byte of the + * frame until it has flushed all of the decompressed data of + * the frame. So input.pos < input.size means frame is not done + * or there is still output available. + */ + while (input.pos < input.size) { + ZSTD_outBuffer output = { + .dst = out_buff, + .size = out_allocsize, + .pos = 0 + }; + ssize_t wrote; + /* The return code is zero if the frame is complete, but + * there may be multiple frames concatenated together. + * Zstd will automatically reset the context when a + * frame is complete. Still, calling ZSTD_DCtx_reset() + * can be useful to reset the context to a clean state, + * for instance if the last decompression call returned + * an error. + */ + last_result = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(last_result)) { + has_error = true; + break; + } + + if (left < output.pos) + return -EFBIG; + + wrote = loop_write(fdt, output.dst, output.pos, 1); + if (wrote < 0) + return wrote; + + left -= output.pos; + } + if (has_error) + break; + } + + if (in_bytes == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "ZSTD decoder failed: no data read"); + + if (last_result != 0) { + /* The last return value from ZSTD_decompressStream did not end + * on a frame, but we reached the end of the file! We assume + * this is an error, and the input was truncated. + */ + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(last_result)); + return zstd_ret_to_errno(last_result); + } + + log_debug( + "ZSTD decompression finished (%" PRIu64 " -> %" PRIu64 " bytes, %.1f%%)", + in_bytes, + max_bytes - left, + (double) (max_bytes - left) / in_bytes * 100); + return 0; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file. Compiled without ZSTD support."); +#endif +} + +int decompress_stream(const char *filename, int fdf, int fdt, uint64_t max_bytes) { + + if (endswith(filename, ".lz4")) + return decompress_stream_lz4(fdf, fdt, max_bytes); + else if (endswith(filename, ".xz")) + return decompress_stream_xz(fdf, fdt, max_bytes); + else if (endswith(filename, ".zst")) + return decompress_stream_zstd(fdf, fdt, max_bytes); + else + return -EPROTONOSUPPORT; +} diff --git a/src/journal/compress.h b/src/journal/compress.h new file mode 100644 index 0000000..db7f399 --- /dev/null +++ b/src/journal/compress.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <unistd.h> + +#include "journal-def.h" + +const char* object_compressed_to_string(int compression); +int object_compressed_from_string(const char *compression); + +int compress_blob_xz(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +int compress_blob_lz4(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +int compress_blob_zstd(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); + +static inline int compress_blob(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { + int r; +#if HAVE_ZSTD + r = compress_blob_zstd(src, src_size, dst, dst_alloc_size, dst_size); + if (r == 0) + return OBJECT_COMPRESSED_ZSTD; +#elif HAVE_LZ4 + r = compress_blob_lz4(src, src_size, dst, dst_alloc_size, dst_size); + if (r == 0) + return OBJECT_COMPRESSED_LZ4; +#elif HAVE_XZ + r = compress_blob_xz(src, src_size, dst, dst_alloc_size, dst_size); + if (r == 0) + return OBJECT_COMPRESSED_XZ; +#else + r = -EOPNOTSUPP; +#endif + return r; +} + +int decompress_blob_xz(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max); +int decompress_blob_lz4(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max); +int decompress_blob_zstd(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max); +int decompress_blob(int compression, + const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max); + +int decompress_startswith_xz(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith_lz4(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith_zstd(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith(int compression, + const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra); + +int compress_stream_xz(int fdf, int fdt, uint64_t max_bytes); +int compress_stream_lz4(int fdf, int fdt, uint64_t max_bytes); +int compress_stream_zstd(int fdf, int fdt, uint64_t max_bytes); + +int decompress_stream_xz(int fdf, int fdt, uint64_t max_size); +int decompress_stream_lz4(int fdf, int fdt, uint64_t max_size); +int decompress_stream_zstd(int fdf, int fdt, uint64_t max_size); + +#if HAVE_ZSTD +# define compress_stream compress_stream_zstd +# define COMPRESSED_EXT ".zst" +#elif HAVE_LZ4 +# define compress_stream compress_stream_lz4 +# define COMPRESSED_EXT ".lz4" +#elif HAVE_XZ +# define compress_stream compress_stream_xz +# define COMPRESSED_EXT ".xz" +#else +static inline int compress_stream(int fdf, int fdt, uint64_t max_size) { + return -EOPNOTSUPP; +} +# define COMPRESSED_EXT "" +#endif + +int decompress_stream(const char *filename, int fdf, int fdt, uint64_t max_bytes); diff --git a/src/journal/fsprg.c b/src/journal/fsprg.c new file mode 100644 index 0000000..7ea7249 --- /dev/null +++ b/src/journal/fsprg.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * + * fsprg v0.1 - (seekable) forward-secure pseudorandom generator + * Copyright © 2012 B. Poettering + * Contact: fsprg@point-at-infinity.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +/* + * See "Practical Secure Logging: Seekable Sequential Key Generators" + * by G. A. Marson, B. Poettering for details: + * + * http://eprint.iacr.org/2013/397 + */ + +#include <string.h> + +#include "fsprg.h" +#include "gcrypt-util.h" +#include "memory-util.h" + +#define ISVALID_SECPAR(secpar) (((secpar) % 16 == 0) && ((secpar) >= 16) && ((secpar) <= 16384)) +#define VALIDATE_SECPAR(secpar) assert(ISVALID_SECPAR(secpar)); + +#define RND_HASH GCRY_MD_SHA256 +#define RND_GEN_P 0x01 +#define RND_GEN_Q 0x02 +#define RND_GEN_X 0x03 + +#pragma GCC diagnostic ignored "-Wpointer-arith" +/* TODO: remove void* arithmetic and this work-around */ + +/******************************************************************************/ + +static void mpi_export(void *buf, size_t buflen, const gcry_mpi_t x) { + unsigned len; + size_t nwritten; + + assert(gcry_mpi_cmp_ui(x, 0) >= 0); + len = (gcry_mpi_get_nbits(x) + 7) / 8; + assert(len <= buflen); + memzero(buf, buflen); + gcry_mpi_print(GCRYMPI_FMT_USG, buf + (buflen - len), len, &nwritten, x); + assert(nwritten == len); +} + +static gcry_mpi_t mpi_import(const void *buf, size_t buflen) { + gcry_mpi_t h; + _unused_ unsigned len; + + assert_se(gcry_mpi_scan(&h, GCRYMPI_FMT_USG, buf, buflen, NULL) == 0); + len = (gcry_mpi_get_nbits(h) + 7) / 8; + assert(len <= buflen); + assert(gcry_mpi_cmp_ui(h, 0) >= 0); + + return h; +} + +static void uint64_export(void *buf, size_t buflen, uint64_t x) { + assert(buflen == 8); + ((uint8_t*) buf)[0] = (x >> 56) & 0xff; + ((uint8_t*) buf)[1] = (x >> 48) & 0xff; + ((uint8_t*) buf)[2] = (x >> 40) & 0xff; + ((uint8_t*) buf)[3] = (x >> 32) & 0xff; + ((uint8_t*) buf)[4] = (x >> 24) & 0xff; + ((uint8_t*) buf)[5] = (x >> 16) & 0xff; + ((uint8_t*) buf)[6] = (x >> 8) & 0xff; + ((uint8_t*) buf)[7] = (x >> 0) & 0xff; +} + +_pure_ static uint64_t uint64_import(const void *buf, size_t buflen) { + assert(buflen == 8); + return + (uint64_t)(((uint8_t*) buf)[0]) << 56 | + (uint64_t)(((uint8_t*) buf)[1]) << 48 | + (uint64_t)(((uint8_t*) buf)[2]) << 40 | + (uint64_t)(((uint8_t*) buf)[3]) << 32 | + (uint64_t)(((uint8_t*) buf)[4]) << 24 | + (uint64_t)(((uint8_t*) buf)[5]) << 16 | + (uint64_t)(((uint8_t*) buf)[6]) << 8 | + (uint64_t)(((uint8_t*) buf)[7]) << 0; +} + +/* deterministically generate from seed/idx a string of buflen pseudorandom bytes */ +static void det_randomize(void *buf, size_t buflen, const void *seed, size_t seedlen, uint32_t idx) { + gcry_md_hd_t hd, hd2; + size_t olen, cpylen; + uint32_t ctr; + + olen = gcry_md_get_algo_dlen(RND_HASH); + gcry_md_open(&hd, RND_HASH, 0); + gcry_md_write(hd, seed, seedlen); + gcry_md_putc(hd, (idx >> 24) & 0xff); + gcry_md_putc(hd, (idx >> 16) & 0xff); + gcry_md_putc(hd, (idx >> 8) & 0xff); + gcry_md_putc(hd, (idx >> 0) & 0xff); + + for (ctr = 0; buflen; ctr++) { + gcry_md_copy(&hd2, hd); + gcry_md_putc(hd2, (ctr >> 24) & 0xff); + gcry_md_putc(hd2, (ctr >> 16) & 0xff); + gcry_md_putc(hd2, (ctr >> 8) & 0xff); + gcry_md_putc(hd2, (ctr >> 0) & 0xff); + gcry_md_final(hd2); + cpylen = (buflen < olen) ? buflen : olen; + memcpy(buf, gcry_md_read(hd2, RND_HASH), cpylen); + gcry_md_close(hd2); + buf += cpylen; + buflen -= cpylen; + } + gcry_md_close(hd); +} + +/* deterministically generate from seed/idx a prime of length `bits' that is 3 (mod 4) */ +static gcry_mpi_t genprime3mod4(int bits, const void *seed, size_t seedlen, uint32_t idx) { + size_t buflen = bits / 8; + uint8_t buf[buflen]; + gcry_mpi_t p; + + assert(bits % 8 == 0); + assert(buflen > 0); + + det_randomize(buf, buflen, seed, seedlen, idx); + buf[0] |= 0xc0; /* set upper two bits, so that n=pq has maximum size */ + buf[buflen - 1] |= 0x03; /* set lower two bits, to have result 3 (mod 4) */ + + p = mpi_import(buf, buflen); + while (gcry_prime_check(p, 0)) + gcry_mpi_add_ui(p, p, 4); + + return p; +} + +/* deterministically generate from seed/idx a quadratic residue (mod n) */ +static gcry_mpi_t gensquare(const gcry_mpi_t n, const void *seed, size_t seedlen, uint32_t idx, unsigned secpar) { + size_t buflen = secpar / 8; + uint8_t buf[buflen]; + gcry_mpi_t x; + + det_randomize(buf, buflen, seed, seedlen, idx); + buf[0] &= 0x7f; /* clear upper bit, so that we have x < n */ + x = mpi_import(buf, buflen); + assert(gcry_mpi_cmp(x, n) < 0); + gcry_mpi_mulm(x, x, x, n); + return x; +} + +/* compute 2^m (mod phi(p)), for a prime p */ +static gcry_mpi_t twopowmodphi(uint64_t m, const gcry_mpi_t p) { + gcry_mpi_t phi, r; + int n; + + phi = gcry_mpi_new(0); + gcry_mpi_sub_ui(phi, p, 1); + + /* count number of used bits in m */ + for (n = 0; (1ULL << n) <= m; n++) + ; + + r = gcry_mpi_new(0); + gcry_mpi_set_ui(r, 1); + while (n) { /* square and multiply algorithm for fast exponentiation */ + n--; + gcry_mpi_mulm(r, r, r, phi); + if (m & ((uint64_t)1 << n)) { + gcry_mpi_add(r, r, r); + if (gcry_mpi_cmp(r, phi) >= 0) + gcry_mpi_sub(r, r, phi); + } + } + + gcry_mpi_release(phi); + return r; +} + +/* Decompose $x \in Z_n$ into $(xp,xq) \in Z_p \times Z_q$ using Chinese Remainder Theorem */ +static void CRT_decompose(gcry_mpi_t *xp, gcry_mpi_t *xq, const gcry_mpi_t x, const gcry_mpi_t p, const gcry_mpi_t q) { + *xp = gcry_mpi_new(0); + *xq = gcry_mpi_new(0); + gcry_mpi_mod(*xp, x, p); + gcry_mpi_mod(*xq, x, q); +} + +/* Compose $(xp,xq) \in Z_p \times Z_q$ into $x \in Z_n$ using Chinese Remainder Theorem */ +static void CRT_compose(gcry_mpi_t *x, const gcry_mpi_t xp, const gcry_mpi_t xq, const gcry_mpi_t p, const gcry_mpi_t q) { + gcry_mpi_t a, u; + + a = gcry_mpi_new(0); + u = gcry_mpi_new(0); + *x = gcry_mpi_new(0); + gcry_mpi_subm(a, xq, xp, q); + gcry_mpi_invm(u, p, q); + gcry_mpi_mulm(a, a, u, q); /* a = (xq - xp) / p (mod q) */ + gcry_mpi_mul(*x, p, a); + gcry_mpi_add(*x, *x, xp); /* x = p * ((xq - xp) / p mod q) + xp */ + gcry_mpi_release(a); + gcry_mpi_release(u); +} + +/******************************************************************************/ + +size_t FSPRG_mskinbytes(unsigned _secpar) { + VALIDATE_SECPAR(_secpar); + return 2 + 2 * (_secpar / 2) / 8; /* to store header,p,q */ +} + +size_t FSPRG_mpkinbytes(unsigned _secpar) { + VALIDATE_SECPAR(_secpar); + return 2 + _secpar / 8; /* to store header,n */ +} + +size_t FSPRG_stateinbytes(unsigned _secpar) { + VALIDATE_SECPAR(_secpar); + return 2 + 2 * _secpar / 8 + 8; /* to store header,n,x,epoch */ +} + +static void store_secpar(void *buf, uint16_t secpar) { + secpar = secpar / 16 - 1; + ((uint8_t*) buf)[0] = (secpar >> 8) & 0xff; + ((uint8_t*) buf)[1] = (secpar >> 0) & 0xff; +} + +static uint16_t read_secpar(const void *buf) { + uint16_t secpar; + secpar = + (uint16_t)(((uint8_t*) buf)[0]) << 8 | + (uint16_t)(((uint8_t*) buf)[1]) << 0; + return 16 * (secpar + 1); +} + +void FSPRG_GenMK(void *msk, void *mpk, const void *seed, size_t seedlen, unsigned _secpar) { + uint8_t iseed[FSPRG_RECOMMENDED_SEEDLEN]; + gcry_mpi_t n, p, q; + uint16_t secpar; + + VALIDATE_SECPAR(_secpar); + secpar = _secpar; + + initialize_libgcrypt(false); + + if (!seed) { + gcry_randomize(iseed, FSPRG_RECOMMENDED_SEEDLEN, GCRY_STRONG_RANDOM); + seed = iseed; + seedlen = FSPRG_RECOMMENDED_SEEDLEN; + } + + p = genprime3mod4(secpar / 2, seed, seedlen, RND_GEN_P); + q = genprime3mod4(secpar / 2, seed, seedlen, RND_GEN_Q); + + if (msk) { + store_secpar(msk + 0, secpar); + mpi_export(msk + 2 + 0 * (secpar / 2) / 8, (secpar / 2) / 8, p); + mpi_export(msk + 2 + 1 * (secpar / 2) / 8, (secpar / 2) / 8, q); + } + + if (mpk) { + n = gcry_mpi_new(0); + gcry_mpi_mul(n, p, q); + assert(gcry_mpi_get_nbits(n) == secpar); + + store_secpar(mpk + 0, secpar); + mpi_export(mpk + 2, secpar / 8, n); + + gcry_mpi_release(n); + } + + gcry_mpi_release(p); + gcry_mpi_release(q); +} + +void FSPRG_GenState0(void *state, const void *mpk, const void *seed, size_t seedlen) { + gcry_mpi_t n, x; + uint16_t secpar; + + initialize_libgcrypt(false); + + secpar = read_secpar(mpk + 0); + n = mpi_import(mpk + 2, secpar / 8); + x = gensquare(n, seed, seedlen, RND_GEN_X, secpar); + + memcpy(state, mpk, 2 + secpar / 8); + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, x); + memzero(state + 2 + 2 * secpar / 8, 8); + + gcry_mpi_release(n); + gcry_mpi_release(x); +} + +void FSPRG_Evolve(void *state) { + gcry_mpi_t n, x; + uint16_t secpar; + uint64_t epoch; + + initialize_libgcrypt(false); + + secpar = read_secpar(state + 0); + n = mpi_import(state + 2 + 0 * secpar / 8, secpar / 8); + x = mpi_import(state + 2 + 1 * secpar / 8, secpar / 8); + epoch = uint64_import(state + 2 + 2 * secpar / 8, 8); + + gcry_mpi_mulm(x, x, x, n); + epoch++; + + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, x); + uint64_export(state + 2 + 2 * secpar / 8, 8, epoch); + + gcry_mpi_release(n); + gcry_mpi_release(x); +} + +uint64_t FSPRG_GetEpoch(const void *state) { + uint16_t secpar; + secpar = read_secpar(state + 0); + return uint64_import(state + 2 + 2 * secpar / 8, 8); +} + +void FSPRG_Seek(void *state, uint64_t epoch, const void *msk, const void *seed, size_t seedlen) { + gcry_mpi_t p, q, n, x, xp, xq, kp, kq, xm; + uint16_t secpar; + + initialize_libgcrypt(false); + + secpar = read_secpar(msk + 0); + p = mpi_import(msk + 2 + 0 * (secpar / 2) / 8, (secpar / 2) / 8); + q = mpi_import(msk + 2 + 1 * (secpar / 2) / 8, (secpar / 2) / 8); + + n = gcry_mpi_new(0); + gcry_mpi_mul(n, p, q); + + x = gensquare(n, seed, seedlen, RND_GEN_X, secpar); + CRT_decompose(&xp, &xq, x, p, q); /* split (mod n) into (mod p) and (mod q) using CRT */ + + kp = twopowmodphi(epoch, p); /* compute 2^epoch (mod phi(p)) */ + kq = twopowmodphi(epoch, q); /* compute 2^epoch (mod phi(q)) */ + + gcry_mpi_powm(xp, xp, kp, p); /* compute x^(2^epoch) (mod p) */ + gcry_mpi_powm(xq, xq, kq, q); /* compute x^(2^epoch) (mod q) */ + + CRT_compose(&xm, xp, xq, p, q); /* combine (mod p) and (mod q) to (mod n) using CRT */ + + store_secpar(state + 0, secpar); + mpi_export(state + 2 + 0 * secpar / 8, secpar / 8, n); + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, xm); + uint64_export(state + 2 + 2 * secpar / 8, 8, epoch); + + gcry_mpi_release(p); + gcry_mpi_release(q); + gcry_mpi_release(n); + gcry_mpi_release(x); + gcry_mpi_release(xp); + gcry_mpi_release(xq); + gcry_mpi_release(kp); + gcry_mpi_release(kq); + gcry_mpi_release(xm); +} + +void FSPRG_GetKey(const void *state, void *key, size_t keylen, uint32_t idx) { + uint16_t secpar; + + initialize_libgcrypt(false); + + secpar = read_secpar(state + 0); + det_randomize(key, keylen, state + 2, 2 * secpar / 8 + 8, idx); +} diff --git a/src/journal/fsprg.h b/src/journal/fsprg.h new file mode 100644 index 0000000..dfe2d79 --- /dev/null +++ b/src/journal/fsprg.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* + * fsprg v0.1 - (seekable) forward-secure pseudorandom generator + * Copyright © 2012 B. Poettering + * Contact: fsprg@point-at-infinity.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include <inttypes.h> +#include <sys/types.h> + +#include "macro.h" +#include "util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FSPRG_RECOMMENDED_SECPAR 1536 +#define FSPRG_RECOMMENDED_SEEDLEN (96/8) + +size_t FSPRG_mskinbytes(unsigned secpar) _const_; +size_t FSPRG_mpkinbytes(unsigned secpar) _const_; +size_t FSPRG_stateinbytes(unsigned secpar) _const_; + +/* Setup msk and mpk. Providing seed != NULL makes this algorithm deterministic. */ +void FSPRG_GenMK(void *msk, void *mpk, const void *seed, size_t seedlen, unsigned secpar); + +/* Initialize state deterministically in dependence on seed. */ +/* Note: in case one wants to run only one GenState0 per GenMK it is safe to use + the same seed for both GenMK and GenState0. +*/ +void FSPRG_GenState0(void *state, const void *mpk, const void *seed, size_t seedlen); + +void FSPRG_Evolve(void *state); + +uint64_t FSPRG_GetEpoch(const void *state) _pure_; + +/* Seek to any arbitrary state (by providing msk together with seed from GenState0). */ +void FSPRG_Seek(void *state, uint64_t epoch, const void *msk, const void *seed, size_t seedlen); + +void FSPRG_GetKey(const void *state, void *key, size_t keylen, uint32_t idx); + +#ifdef __cplusplus +} +#endif diff --git a/src/journal/generate-audit_type-list.sh b/src/journal/generate-audit_type-list.sh new file mode 100755 index 0000000..912d0c9 --- /dev/null +++ b/src/journal/generate-audit_type-list.sh @@ -0,0 +1,15 @@ +#!/bin/sh +set -eu + +cpp="$1" +shift + +includes="" +for i in "$@"; do + includes="$includes -include $i" +done + +$cpp -dM $includes - </dev/null | \ + grep -vE 'AUDIT_.*(FIRST|LAST)_' | \ + sed -r -n 's/^#define\s+AUDIT_(\w+)\s+([0-9]{4})\s*$$/\1\t\2/p' | \ + sort -k2 diff --git a/src/journal/journal-authenticate.c b/src/journal/journal-authenticate.c new file mode 100644 index 0000000..a5ff987 --- /dev/null +++ b/src/journal/journal-authenticate.c @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/mman.h> + +#include "fd-util.h" +#include "fsprg.h" +#include "gcrypt-util.h" +#include "hexdecoct.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "memory-util.h" +#include "time-util.h" + +static uint64_t journal_file_tag_seqnum(JournalFile *f) { + uint64_t r; + + assert(f); + + r = le64toh(f->header->n_tags) + 1; + f->header->n_tags = htole64(r); + + return r; +} + +int journal_file_append_tag(JournalFile *f) { + Object *o; + uint64_t p; + int r; + + assert(f); + + if (!f->seal) + return 0; + + if (!f->hmac_running) + return 0; + + assert(f->hmac); + + r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p); + if (r < 0) + return r; + + o->tag.seqnum = htole64(journal_file_tag_seqnum(f)); + o->tag.epoch = htole64(FSPRG_GetEpoch(f->fsprg_state)); + + log_debug("Writing tag %"PRIu64" for epoch %"PRIu64"", + le64toh(o->tag.seqnum), + FSPRG_GetEpoch(f->fsprg_state)); + + /* Add the tag object itself, so that we can protect its + * header. This will exclude the actual hash value in it */ + r = journal_file_hmac_put_object(f, OBJECT_TAG, o, p); + if (r < 0) + return r; + + /* Get the HMAC tag and store it in the object */ + memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH); + f->hmac_running = false; + + return 0; +} + +int journal_file_hmac_start(JournalFile *f) { + uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */ + assert(f); + + if (!f->seal) + return 0; + + if (f->hmac_running) + return 0; + + /* Prepare HMAC for next cycle */ + gcry_md_reset(f->hmac); + FSPRG_GetKey(f->fsprg_state, key, sizeof(key), 0); + gcry_md_setkey(f->hmac, key, sizeof(key)); + + f->hmac_running = true; + + return 0; +} + +static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) { + uint64_t t; + + assert(f); + assert(epoch); + assert(f->seal); + + if (f->fss_start_usec == 0 || + f->fss_interval_usec == 0) + return -EOPNOTSUPP; + + if (realtime < f->fss_start_usec) + return -ESTALE; + + t = realtime - f->fss_start_usec; + t = t / f->fss_interval_usec; + + *epoch = t; + return 0; +} + +static int journal_file_fsprg_need_evolve(JournalFile *f, uint64_t realtime) { + uint64_t goal, epoch; + int r; + assert(f); + + if (!f->seal) + return 0; + + r = journal_file_get_epoch(f, realtime, &goal); + if (r < 0) + return r; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (epoch > goal) + return -ESTALE; + + return epoch != goal; +} + +int journal_file_fsprg_evolve(JournalFile *f, uint64_t realtime) { + uint64_t goal, epoch; + int r; + + assert(f); + + if (!f->seal) + return 0; + + r = journal_file_get_epoch(f, realtime, &goal); + if (r < 0) + return r; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (epoch < goal) + log_debug("Evolving FSPRG key from epoch %"PRIu64" to %"PRIu64".", epoch, goal); + + for (;;) { + if (epoch > goal) + return -ESTALE; + if (epoch == goal) + return 0; + + FSPRG_Evolve(f->fsprg_state); + epoch = FSPRG_GetEpoch(f->fsprg_state); + } +} + +int journal_file_fsprg_seek(JournalFile *f, uint64_t goal) { + void *msk; + uint64_t epoch; + + assert(f); + + if (!f->seal) + return 0; + + assert(f->fsprg_seed); + + if (f->fsprg_state) { + /* Cheaper... */ + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (goal == epoch) + return 0; + + if (goal == epoch+1) { + FSPRG_Evolve(f->fsprg_state); + return 0; + } + } else { + f->fsprg_state_size = FSPRG_stateinbytes(FSPRG_RECOMMENDED_SECPAR); + f->fsprg_state = malloc(f->fsprg_state_size); + + if (!f->fsprg_state) + return -ENOMEM; + } + + log_debug("Seeking FSPRG key to %"PRIu64".", goal); + + msk = alloca(FSPRG_mskinbytes(FSPRG_RECOMMENDED_SECPAR)); + FSPRG_GenMK(msk, NULL, f->fsprg_seed, f->fsprg_seed_size, FSPRG_RECOMMENDED_SECPAR); + FSPRG_Seek(f->fsprg_state, goal, msk, f->fsprg_seed, f->fsprg_seed_size); + return 0; +} + +int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) { + int r; + + assert(f); + + if (!f->seal) + return 0; + + if (realtime <= 0) + realtime = now(CLOCK_REALTIME); + + r = journal_file_fsprg_need_evolve(f, realtime); + if (r <= 0) + return 0; + + r = journal_file_append_tag(f); + if (r < 0) + return r; + + r = journal_file_fsprg_evolve(f, realtime); + if (r < 0) + return r; + + return 0; +} + +int journal_file_hmac_put_object(JournalFile *f, ObjectType type, Object *o, uint64_t p) { + int r; + + assert(f); + + if (!f->seal) + return 0; + + r = journal_file_hmac_start(f); + if (r < 0) + return r; + + if (!o) { + r = journal_file_move_to_object(f, type, p, &o); + if (r < 0) + return r; + } else { + if (type > OBJECT_UNUSED && o->object.type != type) + return -EBADMSG; + } + + gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload)); + + switch (o->object.type) { + + case OBJECT_DATA: + /* All but hash and payload are mutable */ + gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash)); + gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload)); + break; + + case OBJECT_FIELD: + /* Same here */ + gcry_md_write(f->hmac, &o->field.hash, sizeof(o->field.hash)); + gcry_md_write(f->hmac, o->field.payload, le64toh(o->object.size) - offsetof(FieldObject, payload)); + break; + + case OBJECT_ENTRY: + /* All */ + gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum)); + break; + + case OBJECT_FIELD_HASH_TABLE: + case OBJECT_DATA_HASH_TABLE: + case OBJECT_ENTRY_ARRAY: + /* Nothing: everything is mutable */ + break; + + case OBJECT_TAG: + /* All but the tag itself */ + gcry_md_write(f->hmac, &o->tag.seqnum, sizeof(o->tag.seqnum)); + gcry_md_write(f->hmac, &o->tag.epoch, sizeof(o->tag.epoch)); + break; + default: + return -EINVAL; + } + + return 0; +} + +int journal_file_hmac_put_header(JournalFile *f) { + int r; + + assert(f); + + if (!f->seal) + return 0; + + r = journal_file_hmac_start(f); + if (r < 0) + return r; + + /* All but state+reserved, boot_id, arena_size, + * tail_object_offset, n_objects, n_entries, + * tail_entry_seqnum, head_entry_seqnum, entry_array_offset, + * head_entry_realtime, tail_entry_realtime, + * tail_entry_monotonic, n_data, n_fields, n_tags, + * n_entry_arrays. */ + + gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature)); + gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id)); + gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id)); + gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset)); + + return 0; +} + +int journal_file_fss_load(JournalFile *f) { + int r, fd = -1; + char *p = NULL; + struct stat st; + FSSHeader *m = NULL; + sd_id128_t machine; + + assert(f); + + if (!f->seal) + return 0; + + r = sd_id128_get_machine(&machine); + if (r < 0) + return r; + + if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fss", + SD_ID128_FORMAT_VAL(machine)) < 0) + return -ENOMEM; + + fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600); + if (fd < 0) { + if (errno != ENOENT) + log_error_errno(errno, "Failed to open %s: %m", p); + + r = -errno; + goto finish; + } + + if (fstat(fd, &st) < 0) { + r = -errno; + goto finish; + } + + if (st.st_size < (off_t) sizeof(FSSHeader)) { + r = -ENODATA; + goto finish; + } + + m = mmap(NULL, PAGE_ALIGN(sizeof(FSSHeader)), PROT_READ, MAP_SHARED, fd, 0); + if (m == MAP_FAILED) { + m = NULL; + r = -errno; + goto finish; + } + + if (memcmp(m->signature, FSS_HEADER_SIGNATURE, 8) != 0) { + r = -EBADMSG; + goto finish; + } + + if (m->incompatible_flags != 0) { + r = -EPROTONOSUPPORT; + goto finish; + } + + if (le64toh(m->header_size) < sizeof(FSSHeader)) { + r = -EBADMSG; + goto finish; + } + + if (le64toh(m->fsprg_state_size) != FSPRG_stateinbytes(le16toh(m->fsprg_secpar))) { + r = -EBADMSG; + goto finish; + } + + f->fss_file_size = le64toh(m->header_size) + le64toh(m->fsprg_state_size); + if ((uint64_t) st.st_size < f->fss_file_size) { + r = -ENODATA; + goto finish; + } + + if (!sd_id128_equal(machine, m->machine_id)) { + r = -EHOSTDOWN; + goto finish; + } + + if (le64toh(m->start_usec) <= 0 || + le64toh(m->interval_usec) <= 0) { + r = -EBADMSG; + goto finish; + } + + f->fss_file = mmap(NULL, PAGE_ALIGN(f->fss_file_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (f->fss_file == MAP_FAILED) { + f->fss_file = NULL; + r = -errno; + goto finish; + } + + f->fss_start_usec = le64toh(f->fss_file->start_usec); + f->fss_interval_usec = le64toh(f->fss_file->interval_usec); + + f->fsprg_state = (uint8_t*) f->fss_file + le64toh(f->fss_file->header_size); + f->fsprg_state_size = le64toh(f->fss_file->fsprg_state_size); + + r = 0; + +finish: + if (m) + munmap(m, PAGE_ALIGN(sizeof(FSSHeader))); + + safe_close(fd); + free(p); + + return r; +} + +int journal_file_hmac_setup(JournalFile *f) { + gcry_error_t e; + + if (!f->seal) + return 0; + + initialize_libgcrypt(true); + + e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC); + if (e != 0) + return -EOPNOTSUPP; + + return 0; +} + +int journal_file_append_first_tag(JournalFile *f) { + int r; + uint64_t p; + + if (!f->seal) + return 0; + + log_debug("Calculating first tag..."); + + r = journal_file_hmac_put_header(f); + if (r < 0) + return r; + + p = le64toh(f->header->field_hash_table_offset); + if (p < offsetof(Object, hash_table.items)) + return -EINVAL; + p -= offsetof(Object, hash_table.items); + + r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, NULL, p); + if (r < 0) + return r; + + p = le64toh(f->header->data_hash_table_offset); + if (p < offsetof(Object, hash_table.items)) + return -EINVAL; + p -= offsetof(Object, hash_table.items); + + r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, NULL, p); + if (r < 0) + return r; + + r = journal_file_append_tag(f); + if (r < 0) + return r; + + return 0; +} + +int journal_file_parse_verification_key(JournalFile *f, const char *key) { + uint8_t *seed; + size_t seed_size, c; + const char *k; + int r; + unsigned long long start, interval; + + seed_size = FSPRG_RECOMMENDED_SEEDLEN; + seed = malloc(seed_size); + if (!seed) + return -ENOMEM; + + k = key; + for (c = 0; c < seed_size; c++) { + int x, y; + + while (*k == '-') + k++; + + x = unhexchar(*k); + if (x < 0) { + free(seed); + return -EINVAL; + } + k++; + y = unhexchar(*k); + if (y < 0) { + free(seed); + return -EINVAL; + } + k++; + + seed[c] = (uint8_t) (x * 16 + y); + } + + if (*k != '/') { + free(seed); + return -EINVAL; + } + k++; + + r = sscanf(k, "%llx-%llx", &start, &interval); + if (r != 2) { + free(seed); + return -EINVAL; + } + + f->fsprg_seed = seed; + f->fsprg_seed_size = seed_size; + + f->fss_start_usec = start * interval; + f->fss_interval_usec = interval; + + return 0; +} + +bool journal_file_next_evolve_usec(JournalFile *f, usec_t *u) { + uint64_t epoch; + + assert(f); + assert(u); + + if (!f->seal) + return false; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + + *u = (usec_t) (f->fss_start_usec + f->fss_interval_usec * epoch + f->fss_interval_usec); + + return true; +} diff --git a/src/journal/journal-authenticate.h b/src/journal/journal-authenticate.h new file mode 100644 index 0000000..e895722 --- /dev/null +++ b/src/journal/journal-authenticate.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "journal-file.h" + +int journal_file_append_tag(JournalFile *f); +int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime); +int journal_file_append_first_tag(JournalFile *f); + +int journal_file_hmac_setup(JournalFile *f); +int journal_file_hmac_start(JournalFile *f); +int journal_file_hmac_put_header(JournalFile *f); +int journal_file_hmac_put_object(JournalFile *f, ObjectType type, Object *o, uint64_t p); + +int journal_file_fss_load(JournalFile *f); +int journal_file_parse_verification_key(JournalFile *f, const char *key); + +int journal_file_fsprg_evolve(JournalFile *f, uint64_t realtime); +int journal_file_fsprg_seek(JournalFile *f, uint64_t epoch); + +bool journal_file_next_evolve_usec(JournalFile *f, usec_t *u); diff --git a/src/journal/journal-def.h b/src/journal/journal-def.h new file mode 100644 index 0000000..bd924bd --- /dev/null +++ b/src/journal/journal-def.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-id128.h" + +#include "macro.h" +#include "sparse-endian.h" + +/* + * If you change this file you probably should also change its documentation: + * + * https://systemd.io/JOURNAL_FILE_FORMAT + */ + +typedef struct Header Header; + +typedef struct ObjectHeader ObjectHeader; +typedef union Object Object; + +typedef struct DataObject DataObject; +typedef struct FieldObject FieldObject; +typedef struct EntryObject EntryObject; +typedef struct HashTableObject HashTableObject; +typedef struct EntryArrayObject EntryArrayObject; +typedef struct TagObject TagObject; + +typedef struct EntryItem EntryItem; +typedef struct HashItem HashItem; + +typedef struct FSSHeader FSSHeader; + +/* Object types */ +typedef enum ObjectType { + OBJECT_UNUSED, /* also serves as "any type" or "additional context" */ + OBJECT_DATA, + OBJECT_FIELD, + OBJECT_ENTRY, + OBJECT_DATA_HASH_TABLE, + OBJECT_FIELD_HASH_TABLE, + OBJECT_ENTRY_ARRAY, + OBJECT_TAG, + _OBJECT_TYPE_MAX +} ObjectType; + +/* Object flags */ +enum { + OBJECT_COMPRESSED_XZ = 1 << 0, + OBJECT_COMPRESSED_LZ4 = 1 << 1, + OBJECT_COMPRESSED_ZSTD = 1 << 2, + OBJECT_COMPRESSION_MASK = (OBJECT_COMPRESSED_XZ | OBJECT_COMPRESSED_LZ4 | OBJECT_COMPRESSED_ZSTD), + _OBJECT_COMPRESSED_MAX = OBJECT_COMPRESSION_MASK, +}; + +struct ObjectHeader { + uint8_t type; + uint8_t flags; + uint8_t reserved[6]; + le64_t size; + uint8_t payload[]; +} _packed_; + +#define DataObject__contents { \ + ObjectHeader object; \ + le64_t hash; \ + le64_t next_hash_offset; \ + le64_t next_field_offset; \ + le64_t entry_offset; /* the first array entry we store inline */ \ + le64_t entry_array_offset; \ + le64_t n_entries; \ + uint8_t payload[]; \ + } + +struct DataObject DataObject__contents; +struct DataObject__packed DataObject__contents _packed_; +assert_cc(sizeof(struct DataObject) == sizeof(struct DataObject__packed)); + +#define FieldObject__contents { \ + ObjectHeader object; \ + le64_t hash; \ + le64_t next_hash_offset; \ + le64_t head_data_offset; \ + uint8_t payload[]; \ +} + +struct FieldObject FieldObject__contents; +struct FieldObject__packed FieldObject__contents _packed_; +assert_cc(sizeof(struct FieldObject) == sizeof(struct FieldObject__packed)); + +struct EntryItem { + le64_t object_offset; + le64_t hash; +} _packed_; + +#define EntryObject__contents { \ + ObjectHeader object; \ + le64_t seqnum; \ + le64_t realtime; \ + le64_t monotonic; \ + sd_id128_t boot_id; \ + le64_t xor_hash; \ + EntryItem items[]; \ + } + +struct EntryObject EntryObject__contents; +struct EntryObject__packed EntryObject__contents _packed_; +assert_cc(sizeof(struct EntryObject) == sizeof(struct EntryObject__packed)); + +struct HashItem { + le64_t head_hash_offset; + le64_t tail_hash_offset; +} _packed_; + +struct HashTableObject { + ObjectHeader object; + HashItem items[]; +} _packed_; + +struct EntryArrayObject { + ObjectHeader object; + le64_t next_entry_array_offset; + le64_t items[]; +} _packed_; + +#define TAG_LENGTH (256/8) + +struct TagObject { + ObjectHeader object; + le64_t seqnum; + le64_t epoch; + uint8_t tag[TAG_LENGTH]; /* SHA-256 HMAC */ +} _packed_; + +union Object { + ObjectHeader object; + DataObject data; + FieldObject field; + EntryObject entry; + HashTableObject hash_table; + EntryArrayObject entry_array; + TagObject tag; +}; + +enum { + STATE_OFFLINE = 0, + STATE_ONLINE = 1, + STATE_ARCHIVED = 2, + _STATE_MAX +}; + +/* Header flags */ +enum { + HEADER_INCOMPATIBLE_COMPRESSED_XZ = 1 << 0, + HEADER_INCOMPATIBLE_COMPRESSED_LZ4 = 1 << 1, + HEADER_INCOMPATIBLE_KEYED_HASH = 1 << 2, + HEADER_INCOMPATIBLE_COMPRESSED_ZSTD = 1 << 3, +}; + +#define HEADER_INCOMPATIBLE_ANY \ + (HEADER_INCOMPATIBLE_COMPRESSED_XZ | \ + HEADER_INCOMPATIBLE_COMPRESSED_LZ4 | \ + HEADER_INCOMPATIBLE_KEYED_HASH | \ + HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) + +#if HAVE_XZ && HAVE_LZ4 && HAVE_ZSTD +# define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_ANY +#elif HAVE_XZ && HAVE_LZ4 +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_XZ|HEADER_INCOMPATIBLE_COMPRESSED_LZ4|HEADER_INCOMPATIBLE_KEYED_HASH) +#elif HAVE_XZ && HAVE_ZSTD +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_XZ|HEADER_INCOMPATIBLE_COMPRESSED_ZSTD|HEADER_INCOMPATIBLE_KEYED_HASH) +#elif HAVE_LZ4 && HAVE_ZSTD +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_LZ4|HEADER_INCOMPATIBLE_COMPRESSED_ZSTD|HEADER_INCOMPATIBLE_KEYED_HASH) +#elif HAVE_XZ +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_XZ|HEADER_INCOMPATIBLE_KEYED_HASH) +#elif HAVE_LZ4 +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_LZ4|HEADER_INCOMPATIBLE_KEYED_HASH) +#elif HAVE_ZSTD +# define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_ZSTD|HEADER_INCOMPATIBLE_KEYED_HASH) +#else +# define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_KEYED_HASH +#endif + +enum { + HEADER_COMPATIBLE_SEALED = 1 << 0, +}; + +#define HEADER_COMPATIBLE_ANY HEADER_COMPATIBLE_SEALED +#if HAVE_GCRYPT +# define HEADER_COMPATIBLE_SUPPORTED HEADER_COMPATIBLE_SEALED +#else +# define HEADER_COMPATIBLE_SUPPORTED 0 +#endif + +#define HEADER_SIGNATURE \ + ((const char[]) { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' }) + +#define struct_Header__contents { \ + uint8_t signature[8]; /* "LPKSHHRH" */ \ + le32_t compatible_flags; \ + le32_t incompatible_flags; \ + uint8_t state; \ + uint8_t reserved[7]; \ + sd_id128_t file_id; \ + sd_id128_t machine_id; \ + sd_id128_t boot_id; /* last writer */ \ + sd_id128_t seqnum_id; \ + le64_t header_size; \ + le64_t arena_size; \ + le64_t data_hash_table_offset; \ + le64_t data_hash_table_size; \ + le64_t field_hash_table_offset; \ + le64_t field_hash_table_size; \ + le64_t tail_object_offset; \ + le64_t n_objects; \ + le64_t n_entries; \ + le64_t tail_entry_seqnum; \ + le64_t head_entry_seqnum; \ + le64_t entry_array_offset; \ + le64_t head_entry_realtime; \ + le64_t tail_entry_realtime; \ + le64_t tail_entry_monotonic; \ + /* Added in 187 */ \ + le64_t n_data; \ + le64_t n_fields; \ + /* Added in 189 */ \ + le64_t n_tags; \ + le64_t n_entry_arrays; \ + /* Added in 246 */ \ + le64_t data_hash_chain_depth; \ + le64_t field_hash_chain_depth; \ + } + +struct Header struct_Header__contents; +struct Header__packed struct_Header__contents _packed_; +assert_cc(sizeof(struct Header) == sizeof(struct Header__packed)); +assert_cc(sizeof(struct Header) == 256); + +#define FSS_HEADER_SIGNATURE \ + ((const char[]) { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' }) + +struct FSSHeader { + uint8_t signature[8]; /* "KSHHRHLP" */ + le32_t compatible_flags; + le32_t incompatible_flags; + sd_id128_t machine_id; + sd_id128_t boot_id; /* last writer */ + le64_t header_size; + le64_t start_usec; + le64_t interval_usec; + le16_t fsprg_secpar; + le16_t reserved[3]; + le64_t fsprg_state_size; +} _packed_; diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c new file mode 100644 index 0000000..15336be --- /dev/null +++ b/src/journal/journal-file.c @@ -0,0 +1,4162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <linux/fs.h> +#include <pthread.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/statvfs.h> +#include <sys/uio.h> +#include <unistd.h> + +#include "sd-event.h" + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "chattr-util.h" +#include "compress.h" +#include "env-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "lookup3.h" +#include "memory-util.h" +#include "path-util.h" +#include "random-util.h" +#include "set.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "xattr-util.h" + +#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) +#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) + +#define DEFAULT_COMPRESS_THRESHOLD (512ULL) +#define MIN_COMPRESS_THRESHOLD (8ULL) + +/* This is the minimum journal file size */ +#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */ + +/* These are the lower and upper bounds if we deduce the max_use value + * from the file system size */ +#define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */ +#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */ + +/* Those are the lower and upper bounds for the minimal use limit, + * i.e. how much we'll use even if keep_free suggests otherwise. */ +#define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */ +#define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */ + +/* This is the upper bound if we deduce max_size from max_use */ +#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */ + +/* This is the upper bound if we deduce the keep_free value from the + * file system size */ +#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */ + +/* This is the keep_free value when we can't determine the system + * size */ +#define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */ + +/* This is the default maximum number of journal files to keep around. */ +#define DEFAULT_N_MAX_FILES 100 + +/* n_data was the first entry we added after the initial file format design */ +#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data)) + +/* How many entries to keep in the entry array chain cache at max */ +#define CHAIN_CACHE_MAX 20 + +/* How much to increase the journal file size at once each time we allocate something new. */ +#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */ + +/* Reread fstat() of the file for detecting deletions at least this often */ +#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC) + +/* The mmap context to use for the header we pick as one above the last defined typed */ +#define CONTEXT_HEADER _OBJECT_TYPE_MAX + +/* Longest hash chain to rotate after */ +#define HASH_CHAIN_DEPTH_MAX 100 + +#ifdef __clang__ +# pragma GCC diagnostic ignored "-Waddress-of-packed-member" +#endif + +/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync(). + * As a result we use atomic operations on f->offline_state for inter-thread communications with + * journal_file_set_offline() and journal_file_set_online(). */ +static void journal_file_set_offline_internal(JournalFile *f) { + assert(f); + assert(f->fd >= 0); + assert(f->header); + + for (;;) { + switch (f->offline_state) { + case OFFLINE_CANCEL: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE)) + continue; + return; + + case OFFLINE_AGAIN_FROM_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING)) + continue; + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING)) + continue; + break; + + case OFFLINE_SYNCING: + (void) fsync(f->fd); + + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING)) + continue; + + f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE; + (void) fsync(f->fd); + break; + + case OFFLINE_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE)) + continue; + _fallthrough_; + case OFFLINE_DONE: + return; + + case OFFLINE_JOINED: + log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()"); + return; + } + } +} + +static void * journal_file_set_offline_thread(void *arg) { + JournalFile *f = arg; + + (void) pthread_setname_np(pthread_self(), "journal-offline"); + + journal_file_set_offline_internal(f); + + return NULL; +} + +static int journal_file_set_offline_thread_join(JournalFile *f) { + int r; + + assert(f); + + if (f->offline_state == OFFLINE_JOINED) + return 0; + + r = pthread_join(f->offline_thread, NULL); + if (r) + return -r; + + f->offline_state = OFFLINE_JOINED; + + if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) + return -EIO; + + return 0; +} + +/* Trigger a restart if the offline thread is mid-flight in a restartable state. */ +static bool journal_file_set_offline_try_restart(JournalFile *f) { + for (;;) { + switch (f->offline_state) { + case OFFLINE_AGAIN_FROM_SYNCING: + case OFFLINE_AGAIN_FROM_OFFLINING: + return true; + + case OFFLINE_CANCEL: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING)) + continue; + return true; + + case OFFLINE_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING)) + continue; + return true; + + case OFFLINE_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING)) + continue; + return true; + + default: + return false; + } + } +} + +/* Sets a journal offline. + * + * If wait is false then an offline is dispatched in a separate thread for a + * subsequent journal_file_set_offline() or journal_file_set_online() of the + * same journal to synchronize with. + * + * If wait is true, then either an existing offline thread will be restarted + * and joined, or if none exists the offline is simply performed in this + * context without involving another thread. + */ +int journal_file_set_offline(JournalFile *f, bool wait) { + bool restarted; + int r; + + assert(f); + + if (!f->writable) + return -EPERM; + + if (f->fd < 0 || !f->header) + return -EINVAL; + + /* An offlining journal is implicitly online and may modify f->header->state, + * we must also join any potentially lingering offline thread when not online. */ + if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE) + return journal_file_set_offline_thread_join(f); + + /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */ + restarted = journal_file_set_offline_try_restart(f); + if ((restarted && wait) || !restarted) { + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + } + + if (restarted) + return 0; + + /* Initiate a new offline. */ + f->offline_state = OFFLINE_SYNCING; + + if (wait) /* Without using a thread if waiting. */ + journal_file_set_offline_internal(f); + else { + sigset_t ss, saved_ss; + int k; + + assert_se(sigfillset(&ss) >= 0); + /* Don't block SIGBUS since the offlining thread accesses a memory mapped file. + * Asynchronous SIGBUS signals can safely be handled by either thread. */ + assert_se(sigdelset(&ss, SIGBUS) >= 0); + + r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss); + if (r > 0) + return -r; + + r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f); + + k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL); + if (r > 0) { + f->offline_state = OFFLINE_JOINED; + return -r; + } + if (k > 0) + return -k; + } + + return 0; +} + +static int journal_file_set_online(JournalFile *f) { + bool wait = true; + + assert(f); + + if (!f->writable) + return -EPERM; + + if (f->fd < 0 || !f->header) + return -EINVAL; + + while (wait) { + switch (f->offline_state) { + case OFFLINE_JOINED: + /* No offline thread, no need to wait. */ + wait = false; + break; + + case OFFLINE_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL)) + continue; + /* Canceled syncing prior to offlining, no need to wait. */ + wait = false; + break; + + case OFFLINE_AGAIN_FROM_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL)) + continue; + /* Canceled restart from syncing, no need to wait. */ + wait = false; + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL)) + continue; + /* Canceled restart from offlining, must wait for offlining to complete however. */ + _fallthrough_; + default: { + int r; + + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + + wait = false; + break; + } + } + } + + if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) + return -EIO; + + switch (f->header->state) { + case STATE_ONLINE: + return 0; + + case STATE_OFFLINE: + f->header->state = STATE_ONLINE; + (void) fsync(f->fd); + return 0; + + default: + return -EINVAL; + } +} + +bool journal_file_is_offlining(JournalFile *f) { + assert(f); + + __sync_synchronize(); + + if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED)) + return false; + + return true; +} + +JournalFile* journal_file_close(JournalFile *f) { + if (!f) + return NULL; + +#if HAVE_GCRYPT + /* Write the final tag */ + if (f->seal && f->writable) { + int r; + + r = journal_file_append_tag(f); + if (r < 0) + log_error_errno(r, "Failed to append tag when closing journal: %m"); + } +#endif + + if (f->post_change_timer) { + if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0) + journal_file_post_change(f); + + sd_event_source_disable_unref(f->post_change_timer); + } + + journal_file_set_offline(f, true); + + if (f->mmap && f->cache_fd) + mmap_cache_free_fd(f->mmap, f->cache_fd); + + if (f->fd >= 0 && f->defrag_on_close) { + + /* Be friendly to btrfs: turn COW back on again now, + * and defragment the file. We won't write to the file + * ever again, hence remove all fragmentation, and + * reenable all the good bits COW usually provides + * (such as data checksumming). */ + + (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL); + (void) btrfs_defrag_fd(f->fd); + } + + if (f->close_fd) + safe_close(f->fd); + free(f->path); + + mmap_cache_unref(f->mmap); + + ordered_hashmap_free_free(f->chain_cache); + +#if HAVE_COMPRESSION + free(f->compress_buffer); +#endif + +#if HAVE_GCRYPT + if (f->fss_file) + munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size)); + else + free(f->fsprg_state); + + free(f->fsprg_seed); + + if (f->hmac) + gcry_md_close(f->hmac); +#endif + + return mfree(f); +} + +static int journal_file_init_header(JournalFile *f, JournalFile *template) { + Header h = {}; + ssize_t k; + int r; + + assert(f); + + memcpy(h.signature, HEADER_SIGNATURE, 8); + h.header_size = htole64(ALIGN64(sizeof(h))); + + h.incompatible_flags |= htole32( + f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ | + f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 | + f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD | + f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH); + + h.compatible_flags = htole32( + f->seal * HEADER_COMPATIBLE_SEALED); + + r = sd_id128_randomize(&h.file_id); + if (r < 0) + return r; + + if (template) { + h.seqnum_id = template->header->seqnum_id; + h.tail_entry_seqnum = template->header->tail_entry_seqnum; + } else + h.seqnum_id = h.file_id; + + k = pwrite(f->fd, &h, sizeof(h), 0); + if (k < 0) + return -errno; + + if (k != sizeof(h)) + return -EIO; + + return 0; +} + +static int journal_file_refresh_header(JournalFile *f) { + int r; + + assert(f); + assert(f->header); + + r = sd_id128_get_machine(&f->header->machine_id); + if (IN_SET(r, -ENOENT, -ENOMEDIUM)) + /* We don't have a machine-id, let's continue without */ + zero(f->header->machine_id); + else if (r < 0) + return r; + + r = sd_id128_get_boot(&f->header->boot_id); + if (r < 0) + return r; + + r = journal_file_set_online(f); + + /* Sync the online state to disk */ + (void) fsync(f->fd); + + /* We likely just created a new file, also sync the directory this file is located in. */ + (void) fsync_directory_of_file(f->fd); + + return r; +} + +static bool warn_wrong_flags(const JournalFile *f, bool compatible) { + const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY, + supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED; + const char *type = compatible ? "compatible" : "incompatible"; + uint32_t flags; + + flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags); + + if (flags & ~supported) { + if (flags & ~any) + log_debug("Journal file %s has unknown %s flags 0x%"PRIx32, + f->path, type, flags & ~any); + flags = (flags & any) & ~supported; + if (flags) { + const char* strv[5]; + unsigned n = 0; + _cleanup_free_ char *t = NULL; + + if (compatible) { + if (flags & HEADER_COMPATIBLE_SEALED) + strv[n++] = "sealed"; + } else { + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ) + strv[n++] = "xz-compressed"; + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4) + strv[n++] = "lz4-compressed"; + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) + strv[n++] = "zstd-compressed"; + if (flags & HEADER_INCOMPATIBLE_KEYED_HASH) + strv[n++] = "keyed-hash"; + } + strv[n] = NULL; + assert(n < ELEMENTSOF(strv)); + + t = strv_join((char**) strv, ", "); + log_debug("Journal file %s uses %s %s %s disabled at compilation time.", + f->path, type, n > 1 ? "flags" : "flag", strnull(t)); + } + return true; + } + + return false; +} + +static int journal_file_verify_header(JournalFile *f) { + uint64_t arena_size, header_size; + + assert(f); + assert(f->header); + + if (memcmp(f->header->signature, HEADER_SIGNATURE, 8)) + return -EBADMSG; + + /* In both read and write mode we refuse to open files with incompatible + * flags we don't know. */ + if (warn_wrong_flags(f, false)) + return -EPROTONOSUPPORT; + + /* When open for writing we refuse to open files with compatible flags, too. */ + if (f->writable && warn_wrong_flags(f, true)) + return -EPROTONOSUPPORT; + + if (f->header->state >= _STATE_MAX) + return -EBADMSG; + + header_size = le64toh(READ_NOW(f->header->header_size)); + + /* The first addition was n_data, so check that we are at least this large */ + if (header_size < HEADER_SIZE_MIN) + return -EBADMSG; + + if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + return -EBADMSG; + + arena_size = le64toh(READ_NOW(f->header->arena_size)); + + if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size) + return -ENODATA; + + if (le64toh(f->header->tail_object_offset) > header_size + arena_size) + return -ENODATA; + + if (!VALID64(le64toh(f->header->data_hash_table_offset)) || + !VALID64(le64toh(f->header->field_hash_table_offset)) || + !VALID64(le64toh(f->header->tail_object_offset)) || + !VALID64(le64toh(f->header->entry_array_offset))) + return -ENODATA; + + if (f->writable) { + sd_id128_t machine_id; + uint8_t state; + int r; + + r = sd_id128_get_machine(&machine_id); + if (r < 0) + return r; + + if (!sd_id128_equal(machine_id, f->header->machine_id)) + return -EHOSTDOWN; + + state = f->header->state; + + if (state == STATE_ARCHIVED) + return -ESHUTDOWN; /* Already archived */ + else if (state == STATE_ONLINE) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), + "Journal file %s is already online. Assuming unclean closing.", + f->path); + else if (state != STATE_OFFLINE) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), + "Journal file %s has unknown state %i.", + f->path, state); + + if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0) + return -EBADMSG; + + /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't + * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks + * bisection. */ + if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) + return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY), + "Journal file %s is from the future, refusing to append new data to it that'd be older.", + f->path); + } + + f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header); + f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header); + f->compress_zstd = JOURNAL_HEADER_COMPRESSED_ZSTD(f->header); + + f->seal = JOURNAL_HEADER_SEALED(f->header); + + f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header); + + return 0; +} + +int journal_file_fstat(JournalFile *f) { + int r; + + assert(f); + assert(f->fd >= 0); + + if (fstat(f->fd, &f->last_stat) < 0) + return -errno; + + f->last_stat_usec = now(CLOCK_MONOTONIC); + + /* Refuse dealing with files that aren't regular */ + r = stat_verify_regular(&f->last_stat); + if (r < 0) + return r; + + /* Refuse appending to files that are already deleted */ + if (f->last_stat.st_nlink <= 0) + return -EIDRM; + + return 0; +} + +static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) { + uint64_t old_size, new_size, old_header_size, old_arena_size; + int r; + + assert(f); + assert(f->header); + + /* We assume that this file is not sparse, and we know that for sure, since we always call + * posix_fallocate() ourselves */ + + if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset) + return -EINVAL; + + if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) + return -EIO; + + old_header_size = le64toh(READ_NOW(f->header->header_size)); + old_arena_size = le64toh(READ_NOW(f->header->arena_size)); + if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size) + return -EBADMSG; + + old_size = old_header_size + old_arena_size; + + new_size = MAX(PAGE_ALIGN(offset + size), old_header_size); + + if (new_size <= old_size) { + + /* We already pre-allocated enough space, but before + * we write to it, let's check with fstat() if the + * file got deleted, in order make sure we don't throw + * away the data immediately. Don't check fstat() for + * all writes though, but only once ever 10s. */ + + if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC)) + return 0; + + return journal_file_fstat(f); + } + + /* Allocate more space. */ + + if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) + return -E2BIG; + + if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) { + struct statvfs svfs; + + if (fstatvfs(f->fd, &svfs) >= 0) { + uint64_t available; + + available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free); + + if (new_size - old_size > available) + return -E2BIG; + } + } + + /* Increase by larger blocks at once */ + new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE; + if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) + new_size = f->metrics.max_size; + + /* Note that the glibc fallocate() fallback is very + inefficient, hence we try to minimize the allocation area + as we can. */ + r = posix_fallocate(f->fd, old_size, new_size - old_size); + if (r != 0) + return -r; + + f->header->arena_size = htole64(new_size - old_header_size); + + return journal_file_fstat(f); +} + +static unsigned type_to_context(ObjectType type) { + /* One context for each type, plus one catch-all for the rest */ + assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS); + assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS); + return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0; +} + +static int journal_file_move_to( + JournalFile *f, + ObjectType type, + bool keep_always, + uint64_t offset, + uint64_t size, + void **ret, + size_t *ret_size) { + + int r; + + assert(f); + assert(ret); + + if (size <= 0) + return -EINVAL; + + if (size > UINT64_MAX - offset) + return -EBADMSG; + + /* Avoid SIGBUS on invalid accesses */ + if (offset + size > (uint64_t) f->last_stat.st_size) { + /* Hmm, out of range? Let's refresh the fstat() data + * first, before we trust that check. */ + + r = journal_file_fstat(f); + if (r < 0) + return r; + + if (offset + size > (uint64_t) f->last_stat.st_size) + return -EADDRNOTAVAIL; + } + + return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size); +} + +static uint64_t minimum_header_size(Object *o) { + + static const uint64_t table[] = { + [OBJECT_DATA] = sizeof(DataObject), + [OBJECT_FIELD] = sizeof(FieldObject), + [OBJECT_ENTRY] = sizeof(EntryObject), + [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject), + [OBJECT_TAG] = sizeof(TagObject), + }; + + if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0) + return sizeof(ObjectHeader); + + return table[o->object.type]; +} + +/* Lightweight object checks. We want this to be fast, so that we won't + * slowdown every journal_file_move_to_object() call too much. */ +static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) { + assert(f); + assert(o); + + switch (o->object.type) { + + case OBJECT_DATA: + if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad n_entries: %" PRIu64 ": %" PRIu64, + le64toh(o->data.n_entries), + offset); + + if (le64toh(o->object.size) <= offsetof(DataObject, payload)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64, + offsetof(DataObject, payload), + le64toh(o->object.size), + offset); + + if (!VALID64(le64toh(o->data.next_hash_offset)) || + !VALID64(le64toh(o->data.next_field_offset)) || + !VALID64(le64toh(o->data.entry_offset)) || + !VALID64(le64toh(o->data.entry_array_offset))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64, + le64toh(o->data.next_hash_offset), + le64toh(o->data.next_field_offset), + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + offset); + + break; + + case OBJECT_FIELD: + if (le64toh(o->object.size) <= offsetof(FieldObject, payload)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64, + offsetof(FieldObject, payload), + le64toh(o->object.size), + offset); + + if (!VALID64(le64toh(o->field.next_hash_offset)) || + !VALID64(le64toh(o->field.head_data_offset))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64, + le64toh(o->field.next_hash_offset), + le64toh(o->field.head_data_offset), + offset); + break; + + case OBJECT_ENTRY: { + uint64_t sz; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(EntryObject, items) || + (sz - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64, + offsetof(EntryObject, items), + sz, + offset); + + if ((sz - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid number items in entry: %" PRIu64 ": %" PRIu64, + (sz - offsetof(EntryObject, items)) / sizeof(EntryItem), + offset); + + if (le64toh(o->entry.seqnum) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry seqnum: %" PRIx64 ": %" PRIu64, + le64toh(o->entry.seqnum), + offset); + + if (!VALID_REALTIME(le64toh(o->entry.realtime))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64, + le64toh(o->entry.realtime), + offset); + + if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64, + le64toh(o->entry.monotonic), + offset); + + break; + } + + case OBJECT_DATA_HASH_TABLE: + case OBJECT_FIELD_HASH_TABLE: { + uint64_t sz; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(HashTableObject, items) || + (sz - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 || + (sz - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid %s hash table size: %" PRIu64 ": %" PRIu64, + o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field", + sz, + offset); + + break; + } + + case OBJECT_ENTRY_ARRAY: { + uint64_t sz; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(EntryArrayObject, items) || + (sz - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 || + (sz - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object entry array size: %" PRIu64 ": %" PRIu64, + sz, + offset); + + if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64, + le64toh(o->entry_array.next_entry_array_offset), + offset); + + break; + } + + case OBJECT_TAG: + if (le64toh(o->object.size) != sizeof(TagObject)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object tag size: %" PRIu64 ": %" PRIu64, + le64toh(o->object.size), + offset); + + if (!VALID_EPOCH(le64toh(o->tag.epoch))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object tag epoch: %" PRIu64 ": %" PRIu64, + le64toh(o->tag.epoch), offset); + + break; + } + + return 0; +} + +int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) { + int r; + void *t; + size_t tsize; + Object *o; + uint64_t s; + + assert(f); + assert(ret); + + /* Objects may only be located at multiple of 64 bit */ + if (!VALID64(offset)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to object at non-64bit boundary: %" PRIu64, + offset); + + /* Object may not be located in the file header */ + if (offset < le64toh(f->header->header_size)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to object located in file header: %" PRIu64, + offset); + + r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize); + if (r < 0) + return r; + + o = (Object*) t; + s = le64toh(READ_NOW(o->object.size)); + + if (s == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to uninitialized object: %" PRIu64, + offset); + if (s < sizeof(ObjectHeader)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to overly short object: %" PRIu64, + offset); + + if (o->object.type <= OBJECT_UNUSED) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to object with invalid type: %" PRIu64, + offset); + + if (s < minimum_header_size(o)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to truncated object: %" PRIu64, + offset); + + if (type > OBJECT_UNUSED && o->object.type != type) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to object of unexpected type: %" PRIu64, + offset); + + if (s > tsize) { + r = journal_file_move_to(f, type, false, offset, s, &t, NULL); + if (r < 0) + return r; + + o = (Object*) t; + } + + r = journal_file_check_object(f, offset, o); + if (r < 0) + return r; + + *ret = o; + return 0; +} + +static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) { + uint64_t r; + + assert(f); + assert(f->header); + + r = le64toh(f->header->tail_entry_seqnum) + 1; + + if (seqnum) { + /* If an external seqnum counter was passed, we update + * both the local and the external one, and set it to + * the maximum of both */ + + if (*seqnum + 1 > r) + r = *seqnum + 1; + + *seqnum = r; + } + + f->header->tail_entry_seqnum = htole64(r); + + if (f->header->head_entry_seqnum == 0) + f->header->head_entry_seqnum = htole64(r); + + return r; +} + +int journal_file_append_object( + JournalFile *f, + ObjectType type, + uint64_t size, + Object **ret, + uint64_t *ret_offset) { + + int r; + uint64_t p; + Object *tail, *o; + void *t; + + assert(f); + assert(f->header); + assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX); + assert(size >= sizeof(ObjectHeader)); + + r = journal_file_set_online(f); + if (r < 0) + return r; + + p = le64toh(f->header->tail_object_offset); + if (p == 0) + p = le64toh(f->header->header_size); + else { + uint64_t sz; + + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail); + if (r < 0) + return r; + + sz = le64toh(READ_NOW(tail->object.size)); + if (sz > UINT64_MAX - sizeof(uint64_t) + 1) + return -EBADMSG; + + sz = ALIGN64(sz); + if (p > UINT64_MAX - sz) + return -EBADMSG; + + p += sz; + } + + r = journal_file_allocate(f, p, size); + if (r < 0) + return r; + + r = journal_file_move_to(f, type, false, p, size, &t, NULL); + if (r < 0) + return r; + + o = (Object*) t; + o->object = (ObjectHeader) { + .type = type, + .size = htole64(size), + }; + + f->header->tail_object_offset = htole64(p); + f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1); + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +static int journal_file_setup_data_hash_table(JournalFile *f) { + uint64_t s, p; + Object *o; + int r; + + assert(f); + assert(f->header); + + /* We estimate that we need 1 hash table entry per 768 bytes + of journal file and we want to make sure we never get + beyond 75% fill level. Calculate the hash table size for + the maximum file size based on these metrics. */ + + s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem); + if (s < DEFAULT_DATA_HASH_TABLE_SIZE) + s = DEFAULT_DATA_HASH_TABLE_SIZE; + + log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem)); + + r = journal_file_append_object(f, + OBJECT_DATA_HASH_TABLE, + offsetof(Object, hash_table.items) + s, + &o, &p); + if (r < 0) + return r; + + memzero(o->hash_table.items, s); + + f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); + f->header->data_hash_table_size = htole64(s); + + return 0; +} + +static int journal_file_setup_field_hash_table(JournalFile *f) { + uint64_t s, p; + Object *o; + int r; + + assert(f); + assert(f->header); + + /* We use a fixed size hash table for the fields as this + * number should grow very slowly only */ + + s = DEFAULT_FIELD_HASH_TABLE_SIZE; + log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem)); + + r = journal_file_append_object(f, + OBJECT_FIELD_HASH_TABLE, + offsetof(Object, hash_table.items) + s, + &o, &p); + if (r < 0) + return r; + + memzero(o->hash_table.items, s); + + f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); + f->header->field_hash_table_size = htole64(s); + + return 0; +} + +int journal_file_map_data_hash_table(JournalFile *f) { + uint64_t s, p; + void *t; + int r; + + assert(f); + assert(f->header); + + if (f->data_hash_table) + return 0; + + p = le64toh(f->header->data_hash_table_offset); + s = le64toh(f->header->data_hash_table_size); + + r = journal_file_move_to(f, + OBJECT_DATA_HASH_TABLE, + true, + p, s, + &t, NULL); + if (r < 0) + return r; + + f->data_hash_table = t; + return 0; +} + +int journal_file_map_field_hash_table(JournalFile *f) { + uint64_t s, p; + void *t; + int r; + + assert(f); + assert(f->header); + + if (f->field_hash_table) + return 0; + + p = le64toh(f->header->field_hash_table_offset); + s = le64toh(f->header->field_hash_table_size); + + r = journal_file_move_to(f, + OBJECT_FIELD_HASH_TABLE, + true, + p, s, + &t, NULL); + if (r < 0) + return r; + + f->field_hash_table = t; + return 0; +} + +static int journal_file_link_field( + JournalFile *f, + Object *o, + uint64_t offset, + uint64_t hash) { + + uint64_t p, h, m; + int r; + + assert(f); + assert(f->header); + assert(f->field_hash_table); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_FIELD) + return -EINVAL; + + m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + /* This might alter the window we are looking at */ + o->field.next_hash_offset = o->field.head_data_offset = 0; + + h = hash % m; + p = le64toh(f->field_hash_table[h].tail_hash_offset); + if (p == 0) + f->field_hash_table[h].head_hash_offset = htole64(offset); + else { + r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); + if (r < 0) + return r; + + o->field.next_hash_offset = htole64(offset); + } + + f->field_hash_table[h].tail_hash_offset = htole64(offset); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1); + + return 0; +} + +static int journal_file_link_data( + JournalFile *f, + Object *o, + uint64_t offset, + uint64_t hash) { + + uint64_t p, h, m; + int r; + + assert(f); + assert(f->header); + assert(f->data_hash_table); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_DATA) + return -EINVAL; + + m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + /* This might alter the window we are looking at */ + o->data.next_hash_offset = o->data.next_field_offset = 0; + o->data.entry_offset = o->data.entry_array_offset = 0; + o->data.n_entries = 0; + + h = hash % m; + p = le64toh(f->data_hash_table[h].tail_hash_offset); + if (p == 0) + /* Only entry in the hash table is easy */ + f->data_hash_table[h].head_hash_offset = htole64(offset); + else { + /* Move back to the previous data object, to patch in + * pointer */ + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + o->data.next_hash_offset = htole64(offset); + } + + f->data_hash_table[h].tail_hash_offset = htole64(offset); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + f->header->n_data = htole64(le64toh(f->header->n_data) + 1); + + return 0; +} + +static int next_hash_offset( + JournalFile *f, + uint64_t *p, + le64_t *next_hash_offset, + uint64_t *depth, + le64_t *header_max_depth) { + + uint64_t nextp; + + nextp = le64toh(READ_NOW(*next_hash_offset)); + if (nextp > 0) { + if (nextp <= *p) /* Refuse going in loops */ + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Detected hash item loop in %s, refusing.", f->path); + + (*depth)++; + + /* If the depth of this hash chain is larger than all others we have seen so far, record it */ + if (header_max_depth && f->writable) + *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth))); + } + + *p = nextp; + return 0; +} + +int journal_file_find_field_object_with_hash( + JournalFile *f, + const void *field, uint64_t size, uint64_t hash, + Object **ret, uint64_t *ret_offset) { + + uint64_t p, osize, h, m, depth = 0; + int r; + + assert(f); + assert(f->header); + assert(field && size > 0); + + /* If the field hash table is empty, we can't find anything */ + if (le64toh(f->header->field_hash_table_size) <= 0) + return 0; + + /* Map the field hash table, if it isn't mapped yet. */ + r = journal_file_map_field_hash_table(f); + if (r < 0) + return r; + + osize = offsetof(Object, field.payload) + size; + + m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + h = hash % m; + p = le64toh(f->field_hash_table[h].head_hash_offset); + while (p > 0) { + Object *o; + + r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); + if (r < 0) + return r; + + if (le64toh(o->field.hash) == hash && + le64toh(o->object.size) == osize && + memcmp(o->field.payload, field, size) == 0) { + + if (ret) + *ret = o; + if (ret_offset) + *ret_offset = p; + + return 1; + } + + r = next_hash_offset( + f, + &p, + &o->field.next_hash_offset, + &depth, + JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL); + if (r < 0) + return r; + } + + return 0; +} + +uint64_t journal_file_hash_data( + JournalFile *f, + const void *data, + size_t sz) { + + assert(f); + assert(data || sz == 0); + + /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash + * function use siphash. Old journal files use the Jenkins hash. */ + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + return siphash24(data, sz, f->header->file_id.bytes); + + return jenkins_hash64(data, sz); +} + +int journal_file_find_field_object( + JournalFile *f, + const void *field, uint64_t size, + Object **ret, uint64_t *ret_offset) { + + assert(f); + assert(field && size > 0); + + return journal_file_find_field_object_with_hash( + f, + field, size, + journal_file_hash_data(f, field, size), + ret, ret_offset); +} + +int journal_file_find_data_object_with_hash( + JournalFile *f, + const void *data, uint64_t size, uint64_t hash, + Object **ret, uint64_t *ret_offset) { + + uint64_t p, osize, h, m, depth = 0; + int r; + + assert(f); + assert(f->header); + assert(data || size == 0); + + /* If there's no data hash table, then there's no entry. */ + if (le64toh(f->header->data_hash_table_size) <= 0) + return 0; + + /* Map the data hash table, if it isn't mapped yet. */ + r = journal_file_map_data_hash_table(f); + if (r < 0) + return r; + + osize = offsetof(Object, data.payload) + size; + + m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + h = hash % m; + p = le64toh(f->data_hash_table[h].head_hash_offset); + + while (p > 0) { + Object *o; + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + if (le64toh(o->data.hash) != hash) + goto next; + + if (o->object.flags & OBJECT_COMPRESSION_MASK) { +#if HAVE_COMPRESSION + uint64_t l; + size_t rsize = 0; + + l = le64toh(READ_NOW(o->object.size)); + if (l <= offsetof(Object, data.payload)) + return -EBADMSG; + + l -= offsetof(Object, data.payload); + + r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK, + o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0); + if (r < 0) + return r; + + if (rsize == size && + memcmp(f->compress_buffer, data, size) == 0) { + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 1; + } +#else + return -EPROTONOSUPPORT; +#endif + } else if (le64toh(o->object.size) == osize && + memcmp(o->data.payload, data, size) == 0) { + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 1; + } + + next: + r = next_hash_offset( + f, + &p, + &o->data.next_hash_offset, + &depth, + JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL); + if (r < 0) + return r; + } + + return 0; +} + +int journal_file_find_data_object( + JournalFile *f, + const void *data, uint64_t size, + Object **ret, uint64_t *ret_offset) { + + assert(f); + assert(data || size == 0); + + return journal_file_find_data_object_with_hash( + f, + data, size, + journal_file_hash_data(f, data, size), + ret, ret_offset); +} + +bool journal_field_valid(const char *p, size_t l, bool allow_protected) { + const char *a; + + /* We kinda enforce POSIX syntax recommendations for + environment variables here, but make a couple of additional + requirements. + + http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */ + + if (l == (size_t) -1) + l = strlen(p); + + /* No empty field names */ + if (l <= 0) + return false; + + /* Don't allow names longer than 64 chars */ + if (l > 64) + return false; + + /* Variables starting with an underscore are protected */ + if (!allow_protected && p[0] == '_') + return false; + + /* Don't allow digits as first character */ + if (p[0] >= '0' && p[0] <= '9') + return false; + + /* Only allow A-Z0-9 and '_' */ + for (a = p; a < p + l; a++) + if ((*a < 'A' || *a > 'Z') && + (*a < '0' || *a > '9') && + *a != '_') + return false; + + return true; +} + +static int journal_file_append_field( + JournalFile *f, + const void *field, uint64_t size, + Object **ret, uint64_t *ret_offset) { + + uint64_t hash, p; + uint64_t osize; + Object *o; + int r; + + assert(f); + assert(field && size > 0); + + if (!journal_field_valid(field, size, true)) + return -EBADMSG; + + hash = journal_file_hash_data(f, field, size); + + r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p); + if (r < 0) + return r; + else if (r > 0) { + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 0; + } + + osize = offsetof(Object, field.payload) + size; + r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p); + if (r < 0) + return r; + + o->field.hash = htole64(hash); + memcpy(o->field.payload, field, size); + + r = journal_file_link_field(f, o, p, hash); + if (r < 0) + return r; + + /* The linking might have altered the window, so let's + * refresh our pointer */ + r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); + if (r < 0) + return r; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p); + if (r < 0) + return r; +#endif + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +static int journal_file_append_data( + JournalFile *f, + const void *data, uint64_t size, + Object **ret, uint64_t *ret_offset) { + + uint64_t hash, p; + uint64_t osize; + Object *o; + int r, compression = 0; + const void *eq; + + assert(f); + assert(data || size == 0); + + hash = journal_file_hash_data(f, data, size); + + r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p); + if (r < 0) + return r; + if (r > 0) { + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 0; + } + + osize = offsetof(Object, data.payload) + size; + r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p); + if (r < 0) + return r; + + o->data.hash = htole64(hash); + +#if HAVE_COMPRESSION + if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) { + size_t rsize = 0; + + compression = compress_blob(data, size, o->data.payload, size - 1, &rsize); + + if (compression >= 0) { + o->object.size = htole64(offsetof(Object, data.payload) + rsize); + o->object.flags |= compression; + + log_debug("Compressed data object %"PRIu64" -> %zu using %s", + size, rsize, object_compressed_to_string(compression)); + } else + /* Compression didn't work, we don't really care why, let's continue without compression */ + compression = 0; + } +#endif + + if (compression == 0) + memcpy_safe(o->data.payload, data, size); + + r = journal_file_link_data(f, o, p, hash); + if (r < 0) + return r; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); + if (r < 0) + return r; +#endif + + /* The linking might have altered the window, so let's + * refresh our pointer */ + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + if (!data) + eq = NULL; + else + eq = memchr(data, '=', size); + if (eq && eq > data) { + Object *fo = NULL; + uint64_t fp; + + /* Create field object ... */ + r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp); + if (r < 0) + return r; + + /* ... and link it in. */ + o->data.next_field_offset = fo->field.head_data_offset; + fo->field.head_data_offset = le64toh(p); + } + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +uint64_t journal_file_entry_n_items(Object *o) { + uint64_t sz; + assert(o); + + if (o->object.type != OBJECT_ENTRY) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, entry.items)) + return 0; + + return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem); +} + +uint64_t journal_file_entry_array_n_items(Object *o) { + uint64_t sz; + + assert(o); + + if (o->object.type != OBJECT_ENTRY_ARRAY) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, entry_array.items)) + return 0; + + return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t); +} + +uint64_t journal_file_hash_table_n_items(Object *o) { + uint64_t sz; + + assert(o); + + if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE)) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, hash_table.items)) + return 0; + + return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem); +} + +static int link_entry_into_array(JournalFile *f, + le64_t *first, + le64_t *idx, + uint64_t p) { + int r; + uint64_t n = 0, ap = 0, q, i, a, hidx; + Object *o; + + assert(f); + assert(f->header); + assert(first); + assert(idx); + assert(p > 0); + + a = le64toh(*first); + i = hidx = le64toh(READ_NOW(*idx)); + while (a > 0) { + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + n = journal_file_entry_array_n_items(o); + if (i < n) { + o->entry_array.items[i] = htole64(p); + *idx = htole64(hidx + 1); + return 0; + } + + i -= n; + ap = a; + a = le64toh(o->entry_array.next_entry_array_offset); + } + + if (hidx > n) + n = (hidx+1) * 2; + else + n = n * 2; + + if (n < 4) + n = 4; + + r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY, + offsetof(Object, entry_array.items) + n * sizeof(uint64_t), + &o, &q); + if (r < 0) + return r; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q); + if (r < 0) + return r; +#endif + + o->entry_array.items[i] = htole64(p); + + if (ap == 0) + *first = htole64(q); + else { + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o); + if (r < 0) + return r; + + o->entry_array.next_entry_array_offset = htole64(q); + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1); + + *idx = htole64(hidx + 1); + + return 0; +} + +static int link_entry_into_array_plus_one(JournalFile *f, + le64_t *extra, + le64_t *first, + le64_t *idx, + uint64_t p) { + + uint64_t hidx; + int r; + + assert(f); + assert(extra); + assert(first); + assert(idx); + assert(p > 0); + + hidx = le64toh(READ_NOW(*idx)); + if (hidx == UINT64_MAX) + return -EBADMSG; + if (hidx == 0) + *extra = htole64(p); + else { + le64_t i; + + i = htole64(hidx - 1); + r = link_entry_into_array(f, first, &i, p); + if (r < 0) + return r; + } + + *idx = htole64(hidx + 1); + return 0; +} + +static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) { + uint64_t p; + int r; + + assert(f); + assert(o); + assert(offset > 0); + + p = le64toh(o->entry.items[i].object_offset); + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + return link_entry_into_array_plus_one(f, + &o->data.entry_offset, + &o->data.entry_array_offset, + &o->data.n_entries, + offset); +} + +static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) { + uint64_t n, i; + int r; + + assert(f); + assert(f->header); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_ENTRY) + return -EINVAL; + + __sync_synchronize(); + + /* Link up the entry itself */ + r = link_entry_into_array(f, + &f->header->entry_array_offset, + &f->header->n_entries, + offset); + if (r < 0) + return r; + + /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */ + + if (f->header->head_entry_realtime == 0) + f->header->head_entry_realtime = o->entry.realtime; + + f->header->tail_entry_realtime = o->entry.realtime; + f->header->tail_entry_monotonic = o->entry.monotonic; + + /* Link up the items */ + n = journal_file_entry_n_items(o); + for (i = 0; i < n; i++) { + r = journal_file_link_entry_item(f, o, offset, i); + if (r < 0) + return r; + } + + return 0; +} + +static int journal_file_append_entry_internal( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + uint64_t xor_hash, + const EntryItem items[], unsigned n_items, + uint64_t *seqnum, + Object **ret, uint64_t *ret_offset) { + uint64_t np; + uint64_t osize; + Object *o; + int r; + + assert(f); + assert(f->header); + assert(items || n_items == 0); + assert(ts); + + osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem)); + + r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np); + if (r < 0) + return r; + + o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum)); + memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem)); + o->entry.realtime = htole64(ts->realtime); + o->entry.monotonic = htole64(ts->monotonic); + o->entry.xor_hash = htole64(xor_hash); + if (boot_id) + f->header->boot_id = *boot_id; + o->entry.boot_id = f->header->boot_id; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np); + if (r < 0) + return r; +#endif + + r = journal_file_link_entry(f, o, np); + if (r < 0) + return r; + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = np; + + return 0; +} + +void journal_file_post_change(JournalFile *f) { + assert(f); + + if (f->fd < 0) + return; + + /* inotify() does not receive IN_MODIFY events from file + * accesses done via mmap(). After each access we hence + * trigger IN_MODIFY by truncating the journal file to its + * current size which triggers IN_MODIFY. */ + + __sync_synchronize(); + + if (ftruncate(f->fd, f->last_stat.st_size) < 0) + log_debug_errno(errno, "Failed to truncate file to its own size: %m"); +} + +static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) { + assert(userdata); + + journal_file_post_change(userdata); + + return 1; +} + +static void schedule_post_change(JournalFile *f) { + int r; + + assert(f); + assert(f->post_change_timer); + + r = sd_event_source_get_enabled(f->post_change_timer, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to get ftruncate timer state: %m"); + goto fail; + } + if (r > 0) + return; + + r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period); + if (r < 0) { + log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT); + if (r < 0) { + log_debug_errno(r, "Failed to enable scheduled ftruncate: %m"); + goto fail; + } + + return; + +fail: + /* On failure, let's simply post the change immediately. */ + journal_file_post_change(f); +} + +/* Enable coalesced change posting in a timer on the provided sd_event instance */ +int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) { + _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL; + int r; + + assert(f); + assert_return(!f->post_change_timer, -EINVAL); + assert(e); + assert(t); + + r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(timer, SD_EVENT_OFF); + if (r < 0) + return r; + + f->post_change_timer = TAKE_PTR(timer); + f->post_change_timer_period = t; + + return r; +} + +static int entry_item_cmp(const EntryItem *a, const EntryItem *b) { + return CMP(le64toh(a->object_offset), le64toh(b->object_offset)); +} + +int journal_file_append_entry( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + const struct iovec iovec[], unsigned n_iovec, + uint64_t *seqnum, + Object **ret, uint64_t *ret_offset) { + + unsigned i; + EntryItem *items; + int r; + uint64_t xor_hash = 0; + struct dual_timestamp _ts; + + assert(f); + assert(f->header); + assert(iovec || n_iovec == 0); + + if (ts) { + if (!VALID_REALTIME(ts->realtime)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid realtime timestamp %" PRIu64 ", refusing entry.", + ts->realtime); + if (!VALID_MONOTONIC(ts->monotonic)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid monotomic timestamp %" PRIu64 ", refusing entry.", + ts->monotonic); + } else { + dual_timestamp_get(&_ts); + ts = &_ts; + } + +#if HAVE_GCRYPT + r = journal_file_maybe_append_tag(f, ts->realtime); + if (r < 0) + return r; +#endif + + /* alloca() can't take 0, hence let's allocate at least one */ + items = newa(EntryItem, MAX(1u, n_iovec)); + + for (i = 0; i < n_iovec; i++) { + uint64_t p; + Object *o; + + r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p); + if (r < 0) + return r; + + /* When calculating the XOR hash field, we need to take special care if the "keyed-hash" + * journal file flag is on. We use the XOR hash field to quickly determine the identity of a + * specific record, and give records with otherwise identical position (i.e. match in seqno, + * timestamp, …) a stable ordering. But for that we can't have it that the hash of the + * objects in each file is different since they are keyed. Hence let's calculate the Jenkins + * hash here for that. This also has the benefit that cursors for old and new journal files + * are completely identical (they include the XOR hash after all). For classic Jenkins-hash + * files things are easier, we can just take the value from the stored record directly. */ + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len); + else + xor_hash ^= le64toh(o->data.hash); + + items[i].object_offset = htole64(p); + items[i].hash = o->data.hash; + } + + /* Order by the position on disk, in order to improve seek + * times for rotating media. */ + typesafe_qsort(items, n_iovec, entry_item_cmp); + + r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset); + + /* If the memory mapping triggered a SIGBUS then we return an + * IO error and ignore the error code passed down to us, since + * it is very likely just an effect of a nullified replacement + * mapping page */ + + if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) + r = -EIO; + + if (f->post_change_timer) + schedule_post_change(f); + else + journal_file_post_change(f); + + return r; +} + +typedef struct ChainCacheItem { + uint64_t first; /* the array at the beginning of the chain */ + uint64_t array; /* the cached array */ + uint64_t begin; /* the first item in the cached array */ + uint64_t total; /* the total number of items in all arrays before this one in the chain */ + uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */ +} ChainCacheItem; + +static void chain_cache_put( + OrderedHashmap *h, + ChainCacheItem *ci, + uint64_t first, + uint64_t array, + uint64_t begin, + uint64_t total, + uint64_t last_index) { + + if (!ci) { + /* If the chain item to cache for this chain is the + * first one it's not worth caching anything */ + if (array == first) + return; + + if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) { + ci = ordered_hashmap_steal_first(h); + assert(ci); + } else { + ci = new(ChainCacheItem, 1); + if (!ci) + return; + } + + ci->first = first; + + if (ordered_hashmap_put(h, &ci->first, ci) < 0) { + free(ci); + return; + } + } else + assert(ci->first == first); + + ci->array = array; + ci->begin = begin; + ci->total = total; + ci->last_index = last_index; +} + +static int generic_array_get( + JournalFile *f, + uint64_t first, + uint64_t i, + Object **ret, uint64_t *ret_offset) { + + Object *o; + uint64_t p = 0, a, t = 0; + int r; + ChainCacheItem *ci; + + assert(f); + + a = first; + + /* Try the chain cache first */ + ci = ordered_hashmap_get(f->chain_cache, &first); + if (ci && i > ci->total) { + a = ci->array; + i -= ci->total; + t = ci->total; + } + + while (a > 0) { + uint64_t k; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + k = journal_file_entry_array_n_items(o); + if (i < k) { + p = le64toh(o->entry_array.items[i]); + goto found; + } + + i -= k; + t += k; + a = le64toh(o->entry_array.next_entry_array_offset); + } + + return 0; + +found: + /* Let's cache this item for the next invocation */ + chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + return 1; +} + +static int generic_array_get_plus_one( + JournalFile *f, + uint64_t extra, + uint64_t first, + uint64_t i, + Object **ret, uint64_t *ret_offset) { + + Object *o; + + assert(f); + + if (i == 0) { + int r; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o); + if (r < 0) + return r; + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = extra; + + return 1; + } + + return generic_array_get(f, first, i-1, ret, ret_offset); +} + +enum { + TEST_FOUND, + TEST_LEFT, + TEST_RIGHT +}; + +static int generic_array_bisect( + JournalFile *f, + uint64_t first, + uint64_t n, + uint64_t needle, + int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), + direction_t direction, + Object **ret, + uint64_t *ret_offset, + uint64_t *ret_idx) { + + uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1; + bool subtract_one = false; + Object *o, *array = NULL; + int r; + ChainCacheItem *ci; + + assert(f); + assert(test_object); + + /* Start with the first array in the chain */ + a = first; + + ci = ordered_hashmap_get(f->chain_cache, &first); + if (ci && n > ci->total && ci->begin != 0) { + /* Ah, we have iterated this bisection array chain + * previously! Let's see if we can skip ahead in the + * chain, as far as the last time. But we can't jump + * backwards in the chain, so let's check that + * first. */ + + r = test_object(f, ci->begin, needle); + if (r < 0) + return r; + + if (r == TEST_LEFT) { + /* OK, what we are looking for is right of the + * begin of this EntryArray, so let's jump + * straight to previously cached array in the + * chain */ + + a = ci->array; + n -= ci->total; + t = ci->total; + last_index = ci->last_index; + } + } + + while (a > 0) { + uint64_t left, right, k, lp; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array); + if (r < 0) + return r; + + k = journal_file_entry_array_n_items(array); + right = MIN(k, n); + if (right <= 0) + return 0; + + i = right - 1; + lp = p = le64toh(array->entry_array.items[i]); + if (p <= 0) + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)"); + n = i; + continue; + } + if (r < 0) + return r; + + if (r == TEST_FOUND) + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + if (r == TEST_RIGHT) { + left = 0; + right -= 1; + + if (last_index != (uint64_t) -1) { + assert(last_index <= right); + + /* If we cached the last index we + * looked at, let's try to not to jump + * too wildly around and see if we can + * limit the range to look at early to + * the immediate neighbors of the last + * index we looked at. */ + + if (last_index > 0) { + uint64_t x = last_index - 1; + + p = le64toh(array->entry_array.items[x]); + if (p <= 0) + return -EBADMSG; + + r = test_object(f, p, needle); + if (r < 0) + return r; + + if (r == TEST_FOUND) + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + if (r == TEST_RIGHT) + right = x; + else + left = x + 1; + } + + if (last_index < right) { + uint64_t y = last_index + 1; + + p = le64toh(array->entry_array.items[y]); + if (p <= 0) + return -EBADMSG; + + r = test_object(f, p, needle); + if (r < 0) + return r; + + if (r == TEST_FOUND) + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + if (r == TEST_RIGHT) + right = y; + else + left = y + 1; + } + } + + for (;;) { + if (left == right) { + if (direction == DIRECTION_UP) + subtract_one = true; + + i = left; + goto found; + } + + assert(left < right); + i = (left + right) / 2; + + p = le64toh(array->entry_array.items[i]); + if (p <= 0) + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)"); + right = n = i; + continue; + } + if (r < 0) + return r; + + if (r == TEST_FOUND) + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + if (r == TEST_RIGHT) + right = i; + else + left = i + 1; + } + } + + if (k >= n) { + if (direction == DIRECTION_UP) { + i = n; + subtract_one = true; + goto found; + } + + return 0; + } + + last_p = lp; + + n -= k; + t += k; + last_index = (uint64_t) -1; + a = le64toh(array->entry_array.next_entry_array_offset); + } + + return 0; + +found: + if (subtract_one && t == 0 && i == 0) + return 0; + + /* Let's cache this item for the next invocation */ + chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i); + + if (subtract_one && i == 0) + p = last_p; + else if (subtract_one) + p = le64toh(array->entry_array.items[i-1]); + else + p = le64toh(array->entry_array.items[i]); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = p; + + if (ret_idx) + *ret_idx = t + i + (subtract_one ? -1 : 0); + + return 1; +} + +static int generic_array_bisect_plus_one( + JournalFile *f, + uint64_t extra, + uint64_t first, + uint64_t n, + uint64_t needle, + int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), + direction_t direction, + Object **ret, + uint64_t *ret_offset, + uint64_t *ret_idx) { + + int r; + bool step_back = false; + Object *o; + + assert(f); + assert(test_object); + + if (n <= 0) + return 0; + + /* This bisects the array in object 'first', but first checks + * an extra */ + r = test_object(f, extra, needle); + if (r < 0) + return r; + + if (r == TEST_FOUND) + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + /* if we are looking with DIRECTION_UP then we need to first + see if in the actual array there is a matching entry, and + return the last one of that. But if there isn't any we need + to return this one. Hence remember this, and return it + below. */ + if (r == TEST_LEFT) + step_back = direction == DIRECTION_UP; + + if (r == TEST_RIGHT) { + if (direction == DIRECTION_DOWN) + goto found; + else + return 0; + } + + r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx); + + if (r == 0 && step_back) + goto found; + + if (r > 0 && ret_idx) + (*ret_idx)++; + + return r; + +found: + r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o); + if (r < 0) + return r; + + if (ret) + *ret = o; + + if (ret_offset) + *ret_offset = extra; + + if (ret_idx) + *ret_idx = 0; + + return 1; +} + +_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) { + assert(f); + assert(p > 0); + + if (p == needle) + return TEST_FOUND; + else if (p < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) { + uint64_t sq; + Object *o; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + sq = le64toh(READ_NOW(o->entry.seqnum)); + if (sq == needle) + return TEST_FOUND; + else if (sq < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +int journal_file_move_to_entry_by_seqnum( + JournalFile *f, + uint64_t seqnum, + direction_t direction, + Object **ret, + uint64_t *ret_offset) { + assert(f); + assert(f->header); + + return generic_array_bisect( + f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + seqnum, + test_object_seqnum, + direction, + ret, ret_offset, NULL); +} + +static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) { + Object *o; + uint64_t rt; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + rt = le64toh(READ_NOW(o->entry.realtime)); + if (rt == needle) + return TEST_FOUND; + else if (rt < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +int journal_file_move_to_entry_by_realtime( + JournalFile *f, + uint64_t realtime, + direction_t direction, + Object **ret, + uint64_t *ret_offset) { + assert(f); + assert(f->header); + + return generic_array_bisect( + f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + realtime, + test_object_realtime, + direction, + ret, ret_offset, NULL); +} + +static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) { + Object *o; + uint64_t m; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + m = le64toh(READ_NOW(o->entry.monotonic)); + if (m == needle) + return TEST_FOUND; + else if (m < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +static int find_data_object_by_boot_id( + JournalFile *f, + sd_id128_t boot_id, + Object **o, + uint64_t *b) { + + char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID="; + + sd_id128_to_string(boot_id, t + 9); + return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b); +} + +int journal_file_move_to_entry_by_monotonic( + JournalFile *f, + sd_id128_t boot_id, + uint64_t monotonic, + direction_t direction, + Object **ret, + uint64_t *ret_offset) { + + Object *o; + int r; + + assert(f); + + r = find_data_object_by_boot_id(f, boot_id, &o, NULL); + if (r < 0) + return r; + if (r == 0) + return -ENOENT; + + return generic_array_bisect_plus_one( + f, + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + le64toh(o->data.n_entries), + monotonic, + test_object_monotonic, + direction, + ret, ret_offset, NULL); +} + +void journal_file_reset_location(JournalFile *f) { + f->location_type = LOCATION_HEAD; + f->current_offset = 0; + f->current_seqnum = 0; + f->current_realtime = 0; + f->current_monotonic = 0; + zero(f->current_boot_id); + f->current_xor_hash = 0; +} + +void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) { + f->location_type = LOCATION_SEEK; + f->current_offset = offset; + f->current_seqnum = le64toh(o->entry.seqnum); + f->current_realtime = le64toh(o->entry.realtime); + f->current_monotonic = le64toh(o->entry.monotonic); + f->current_boot_id = o->entry.boot_id; + f->current_xor_hash = le64toh(o->entry.xor_hash); +} + +int journal_file_compare_locations(JournalFile *af, JournalFile *bf) { + int r; + + assert(af); + assert(af->header); + assert(bf); + assert(bf->header); + assert(af->location_type == LOCATION_SEEK); + assert(bf->location_type == LOCATION_SEEK); + + /* If contents, timestamps and seqnum match, these entries are + * identical*/ + if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) && + af->current_monotonic == bf->current_monotonic && + af->current_realtime == bf->current_realtime && + af->current_xor_hash == bf->current_xor_hash && + sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) && + af->current_seqnum == bf->current_seqnum) + return 0; + + if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) { + + /* If this is from the same seqnum source, compare + * seqnums */ + r = CMP(af->current_seqnum, bf->current_seqnum); + if (r != 0) + return r; + + /* Wow! This is weird, different data but the same + * seqnums? Something is borked, but let's make the + * best of it and compare by time. */ + } + + if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) { + + /* If the boot id matches, compare monotonic time */ + r = CMP(af->current_monotonic, bf->current_monotonic); + if (r != 0) + return r; + } + + /* Otherwise, compare UTC time */ + r = CMP(af->current_realtime, bf->current_realtime); + if (r != 0) + return r; + + /* Finally, compare by contents */ + return CMP(af->current_xor_hash, bf->current_xor_hash); +} + +static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) { + + /* Increase or decrease the specified index, in the right direction. */ + + if (direction == DIRECTION_DOWN) { + if (*i >= n - 1) + return 0; + + (*i) ++; + } else { + if (*i <= 0) + return 0; + + (*i) --; + } + + return 1; +} + +static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) { + + /* Consider it an error if any of the two offsets is uninitialized */ + if (old_offset == 0 || new_offset == 0) + return false; + + /* If we go down, the new offset must be larger than the old one. */ + return direction == DIRECTION_DOWN ? + new_offset > old_offset : + new_offset < old_offset; +} + +int journal_file_next_entry( + JournalFile *f, + uint64_t p, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + uint64_t i, n, ofs; + int r; + + assert(f); + assert(f->header); + + n = le64toh(READ_NOW(f->header->n_entries)); + if (n <= 0) + return 0; + + if (p == 0) + i = direction == DIRECTION_DOWN ? 0 : n - 1; + else { + r = generic_array_bisect(f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + p, + test_object_offset, + DIRECTION_DOWN, + NULL, NULL, + &i); + if (r <= 0) + return r; + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + /* And jump to it */ + for (;;) { + r = generic_array_get(f, + le64toh(f->header->entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; + + /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if + * the next one might work for us instead. */ + log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i); + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + /* Ensure our array is properly ordered. */ + if (p > 0 && !check_properly_ordered(ofs, p, direction)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s: entry array not properly ordered at entry %" PRIu64, + f->path, i); + + if (ret_offset) + *ret_offset = ofs; + + return 1; +} + +int journal_file_next_entry_for_data( + JournalFile *f, + Object *o, uint64_t p, + uint64_t data_offset, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + uint64_t i, n, ofs; + Object *d; + int r; + + assert(f); + assert(p > 0 || !o); + + r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d); + if (r < 0) + return r; + + n = le64toh(READ_NOW(d->data.n_entries)); + if (n <= 0) + return n; + + if (!o) + i = direction == DIRECTION_DOWN ? 0 : n - 1; + else { + if (o->object.type != OBJECT_ENTRY) + return -EINVAL; + + r = generic_array_bisect_plus_one(f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + le64toh(d->data.n_entries), + p, + test_object_offset, + DIRECTION_DOWN, + NULL, NULL, + &i); + + if (r <= 0) + return r; + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + for (;;) { + r = generic_array_get_plus_one(f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; + + log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i); + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + /* Ensure our array is properly ordered. */ + if (p > 0 && check_properly_ordered(ofs, p, direction)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s data entry array not properly ordered at entry %" PRIu64, + f->path, i); + + if (ret_offset) + *ret_offset = ofs; + + return 1; +} + +int journal_file_move_to_entry_by_offset_for_data( + JournalFile *f, + uint64_t data_offset, + uint64_t p, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + int r; + Object *d; + + assert(f); + + r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d); + if (r < 0) + return r; + + return generic_array_bisect_plus_one( + f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + le64toh(d->data.n_entries), + p, + test_object_offset, + direction, + ret, ret_offset, NULL); +} + +int journal_file_move_to_entry_by_monotonic_for_data( + JournalFile *f, + uint64_t data_offset, + sd_id128_t boot_id, + uint64_t monotonic, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + Object *o, *d; + int r; + uint64_t b, z; + + assert(f); + + /* First, seek by time */ + r = find_data_object_by_boot_id(f, boot_id, &o, &b); + if (r < 0) + return r; + if (r == 0) + return -ENOENT; + + r = generic_array_bisect_plus_one(f, + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + le64toh(o->data.n_entries), + monotonic, + test_object_monotonic, + direction, + NULL, &z, NULL); + if (r <= 0) + return r; + + /* And now, continue seeking until we find an entry that + * exists in both bisection arrays */ + + for (;;) { + Object *qo; + uint64_t p, q; + + r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d); + if (r < 0) + return r; + + r = generic_array_bisect_plus_one(f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + le64toh(d->data.n_entries), + z, + test_object_offset, + direction, + NULL, &p, NULL); + if (r <= 0) + return r; + + r = journal_file_move_to_object(f, OBJECT_DATA, b, &o); + if (r < 0) + return r; + + r = generic_array_bisect_plus_one(f, + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + le64toh(o->data.n_entries), + p, + test_object_offset, + direction, + &qo, &q, NULL); + + if (r <= 0) + return r; + + if (p == q) { + if (ret) + *ret = qo; + if (ret_offset) + *ret_offset = q; + + return 1; + } + + z = q; + } +} + +int journal_file_move_to_entry_by_seqnum_for_data( + JournalFile *f, + uint64_t data_offset, + uint64_t seqnum, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + Object *d; + int r; + + assert(f); + + r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d); + if (r < 0) + return r; + + return generic_array_bisect_plus_one( + f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + le64toh(d->data.n_entries), + seqnum, + test_object_seqnum, + direction, + ret, ret_offset, NULL); +} + +int journal_file_move_to_entry_by_realtime_for_data( + JournalFile *f, + uint64_t data_offset, + uint64_t realtime, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + Object *d; + int r; + + assert(f); + + r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d); + if (r < 0) + return r; + + return generic_array_bisect_plus_one( + f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + le64toh(d->data.n_entries), + realtime, + test_object_realtime, + direction, + ret, ret_offset, NULL); +} + +void journal_file_dump(JournalFile *f) { + Object *o; + int r; + uint64_t p; + + assert(f); + assert(f->header); + + journal_file_print_header(f); + + p = le64toh(READ_NOW(f->header->header_size)); + while (p != 0) { + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) + goto fail; + + switch (o->object.type) { + + case OBJECT_UNUSED: + printf("Type: OBJECT_UNUSED\n"); + break; + + case OBJECT_DATA: + printf("Type: OBJECT_DATA\n"); + break; + + case OBJECT_FIELD: + printf("Type: OBJECT_FIELD\n"); + break; + + case OBJECT_ENTRY: + printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n", + le64toh(o->entry.seqnum), + le64toh(o->entry.monotonic), + le64toh(o->entry.realtime)); + break; + + case OBJECT_FIELD_HASH_TABLE: + printf("Type: OBJECT_FIELD_HASH_TABLE\n"); + break; + + case OBJECT_DATA_HASH_TABLE: + printf("Type: OBJECT_DATA_HASH_TABLE\n"); + break; + + case OBJECT_ENTRY_ARRAY: + printf("Type: OBJECT_ENTRY_ARRAY\n"); + break; + + case OBJECT_TAG: + printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n", + le64toh(o->tag.seqnum), + le64toh(o->tag.epoch)); + break; + + default: + printf("Type: unknown (%i)\n", o->object.type); + break; + } + + if (o->object.flags & OBJECT_COMPRESSION_MASK) + printf("Flags: %s\n", + object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK)); + + if (p == le64toh(f->header->tail_object_offset)) + p = 0; + else + p += ALIGN64(le64toh(o->object.size)); + } + + return; +fail: + log_error("File corrupt"); +} + +static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) { + const char *x; + + x = format_timestamp(buf, l, t); + if (x) + return x; + return " --- "; +} + +void journal_file_print_header(JournalFile *f) { + char a[SD_ID128_STRING_MAX], b[SD_ID128_STRING_MAX], c[SD_ID128_STRING_MAX], d[SD_ID128_STRING_MAX]; + char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX]; + struct stat st; + char bytes[FORMAT_BYTES_MAX]; + + assert(f); + assert(f->header); + + printf("File path: %s\n" + "File ID: %s\n" + "Machine ID: %s\n" + "Boot ID: %s\n" + "Sequential number ID: %s\n" + "State: %s\n" + "Compatible flags:%s%s\n" + "Incompatible flags:%s%s%s%s%s\n" + "Header size: %"PRIu64"\n" + "Arena size: %"PRIu64"\n" + "Data hash table size: %"PRIu64"\n" + "Field hash table size: %"PRIu64"\n" + "Rotate suggested: %s\n" + "Head sequential number: %"PRIu64" (%"PRIx64")\n" + "Tail sequential number: %"PRIu64" (%"PRIx64")\n" + "Head realtime timestamp: %s (%"PRIx64")\n" + "Tail realtime timestamp: %s (%"PRIx64")\n" + "Tail monotonic timestamp: %s (%"PRIx64")\n" + "Objects: %"PRIu64"\n" + "Entry objects: %"PRIu64"\n", + f->path, + sd_id128_to_string(f->header->file_id, a), + sd_id128_to_string(f->header->machine_id, b), + sd_id128_to_string(f->header->boot_id, c), + sd_id128_to_string(f->header->seqnum_id, d), + f->header->state == STATE_OFFLINE ? "OFFLINE" : + f->header->state == STATE_ONLINE ? "ONLINE" : + f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN", + JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "", + (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "", + JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "", + JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "", + JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "", + JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "", + (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "", + le64toh(f->header->header_size), + le64toh(f->header->arena_size), + le64toh(f->header->data_hash_table_size) / sizeof(HashItem), + le64toh(f->header->field_hash_table_size) / sizeof(HashItem), + yes_no(journal_file_rotate_suggested(f, 0)), + le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum), + le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum), + format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime), + format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime), + format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic), + le64toh(f->header->n_objects), + le64toh(f->header->n_entries)); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + printf("Data objects: %"PRIu64"\n" + "Data hash table fill: %.1f%%\n", + le64toh(f->header->n_data), + 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)))); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + printf("Field objects: %"PRIu64"\n" + "Field hash table fill: %.1f%%\n", + le64toh(f->header->n_fields), + 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)))); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_tags)) + printf("Tag objects: %"PRIu64"\n", + le64toh(f->header->n_tags)); + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + printf("Entry array objects: %"PRIu64"\n", + le64toh(f->header->n_entry_arrays)); + + if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth)) + printf("Deepest field hash chain: %" PRIu64"\n", + f->header->field_hash_chain_depth); + + if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth)) + printf("Deepest data hash chain: %" PRIu64"\n", + f->header->data_hash_chain_depth); + + if (fstat(f->fd, &st) >= 0) + printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL)); +} + +static int journal_file_warn_btrfs(JournalFile *f) { + unsigned attrs; + int r; + + assert(f); + + /* Before we write anything, check if the COW logic is turned + * off on btrfs. Given our write pattern that is quite + * unfriendly to COW file systems this should greatly improve + * performance on COW file systems, such as btrfs, at the + * expense of data integrity features (which shouldn't be too + * bad, given that we do our own checksumming). */ + + r = btrfs_is_filesystem(f->fd); + if (r < 0) + return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m"); + if (!r) + return 0; + + r = read_attr_fd(f->fd, &attrs); + if (r < 0) + return log_warning_errno(r, "Failed to read file attributes: %m"); + + if (attrs & FS_NOCOW_FL) { + log_debug("Detected btrfs file system with copy-on-write disabled, all is good."); + return 0; + } + + log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. " + "This is likely to slow down journal access substantially, please consider turning " + "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path); + + return 1; +} + +int journal_file_open( + int fd, + const char *fname, + int flags, + mode_t mode, + bool compress, + uint64_t compress_threshold_bytes, + bool seal, + JournalMetrics *metrics, + MMapCache *mmap_cache, + Set *deferred_closes, + JournalFile *template, + JournalFile **ret) { + + bool newly_created = false; + JournalFile *f; + void *h; + int r; + + assert(ret); + assert(fd >= 0 || fname); + + if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR)) + return -EINVAL; + + if (fname && (flags & O_CREAT) && !endswith(fname, ".journal")) + return -EINVAL; + + f = new(JournalFile, 1); + if (!f) + return -ENOMEM; + + *f = (JournalFile) { + .fd = fd, + .mode = mode, + + .flags = flags, + .prot = prot_from_flags(flags), + .writable = (flags & O_ACCMODE) != O_RDONLY, + +#if HAVE_ZSTD + .compress_zstd = compress, +#elif HAVE_LZ4 + .compress_lz4 = compress, +#elif HAVE_XZ + .compress_xz = compress, +#endif + .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ? + DEFAULT_COMPRESS_THRESHOLD : + MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes), +#if HAVE_GCRYPT + .seal = seal, +#endif + }; + + /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if + * people really want that */ + r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH"); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring."); + f->keyed_hash = true; + } else + f->keyed_hash = r; + + if (DEBUG_LOGGING) { + static int last_seal = -1, last_compress = -1, last_keyed_hash = -1; + static uint64_t last_bytes = UINT64_MAX; + char bytes[FORMAT_BYTES_MAX]; + + if (last_seal != f->seal || + last_keyed_hash != f->keyed_hash || + last_compress != JOURNAL_FILE_COMPRESS(f) || + last_bytes != f->compress_threshold_bytes) { + + log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s", + yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)), + format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes)); + last_seal = f->seal; + last_keyed_hash = f->keyed_hash; + last_compress = JOURNAL_FILE_COMPRESS(f); + last_bytes = f->compress_threshold_bytes; + } + } + + if (mmap_cache) + f->mmap = mmap_cache_ref(mmap_cache); + else { + f->mmap = mmap_cache_new(); + if (!f->mmap) { + r = -ENOMEM; + goto fail; + } + } + + if (fname) { + f->path = strdup(fname); + if (!f->path) { + r = -ENOMEM; + goto fail; + } + } else { + assert(fd >= 0); + + /* If we don't know the path, fill in something explanatory and vaguely useful */ + if (asprintf(&f->path, "/proc/self/%i", fd) < 0) { + r = -ENOMEM; + goto fail; + } + } + + f->chain_cache = ordered_hashmap_new(&uint64_hash_ops); + if (!f->chain_cache) { + r = -ENOMEM; + goto fail; + } + + if (f->fd < 0) { + /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO + * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence + * it doesn't hurt in that case. */ + + f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode); + if (f->fd < 0) { + r = -errno; + goto fail; + } + + /* fds we opened here by us should also be closed by us. */ + f->close_fd = true; + + r = fd_nonblock(f->fd, false); + if (r < 0) + goto fail; + } + + f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd); + if (!f->cache_fd) { + r = -ENOMEM; + goto fail; + } + + r = journal_file_fstat(f); + if (r < 0) + goto fail; + + if (f->last_stat.st_size == 0 && f->writable) { + + (void) journal_file_warn_btrfs(f); + + /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this + * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many + * file systems maintain for each file, but the API to query this is very new, hence let's emulate this + * via extended attributes. If extended attributes are not supported we'll just skip this, and rely + * solely on mtime/atime/ctime of the file. */ + (void) fd_setcrtime(f->fd, 0); + +#if HAVE_GCRYPT + /* Try to load the FSPRG state, and if we can't, then + * just don't do sealing */ + if (f->seal) { + r = journal_file_fss_load(f); + if (r < 0) + f->seal = false; + } +#endif + + r = journal_file_init_header(f, template); + if (r < 0) + goto fail; + + r = journal_file_fstat(f); + if (r < 0) + goto fail; + + newly_created = true; + } + + if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) { + r = -ENODATA; + goto fail; + } + + r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL); + if (r == -EINVAL) { + /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only + * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error + * code. */ + r = -EAFNOSUPPORT; + goto fail; + } + if (r < 0) + goto fail; + + f->header = h; + + if (!newly_created) { + set_clear_with_destructor(deferred_closes, journal_file_close); + + r = journal_file_verify_header(f); + if (r < 0) + goto fail; + } + +#if HAVE_GCRYPT + if (!newly_created && f->writable) { + r = journal_file_fss_load(f); + if (r < 0) + goto fail; + } +#endif + + if (f->writable) { + if (metrics) { + journal_default_metrics(metrics, f->fd); + f->metrics = *metrics; + } else if (template) + f->metrics = template->metrics; + + r = journal_file_refresh_header(f); + if (r < 0) + goto fail; + } + +#if HAVE_GCRYPT + r = journal_file_hmac_setup(f); + if (r < 0) + goto fail; +#endif + + if (newly_created) { + r = journal_file_setup_field_hash_table(f); + if (r < 0) + goto fail; + + r = journal_file_setup_data_hash_table(f); + if (r < 0) + goto fail; + +#if HAVE_GCRYPT + r = journal_file_append_first_tag(f); + if (r < 0) + goto fail; +#endif + } + + if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) { + r = -EIO; + goto fail; + } + + if (template && template->post_change_timer) { + r = journal_file_enable_post_change_timer( + f, + sd_event_source_get_event(template->post_change_timer), + template->post_change_timer_period); + + if (r < 0) + goto fail; + } + + /* The file is opened now successfully, thus we take possession of any passed in fd. */ + f->close_fd = true; + + *ret = f; + return 0; + +fail: + if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd)) + r = -EIO; + + (void) journal_file_close(f); + + return r; +} + +int journal_file_archive(JournalFile *f) { + _cleanup_free_ char *p = NULL; + + assert(f); + + if (!f->writable) + return -EINVAL; + + /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse + * rotation, since we don't know the actual path, and couldn't rename the file hence. */ + if (path_startswith(f->path, "/proc/self/fd")) + return -EINVAL; + + if (!endswith(f->path, ".journal")) + return -EINVAL; + + if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal", + (int) strlen(f->path) - 8, f->path, + SD_ID128_FORMAT_VAL(f->header->seqnum_id), + le64toh(f->header->head_entry_seqnum), + le64toh(f->header->head_entry_realtime)) < 0) + return -ENOMEM; + + /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's + * ignore that case. */ + if (rename(f->path, p) < 0 && errno != ENOENT) + return -errno; + + /* Sync the rename to disk */ + (void) fsync_directory_of_file(f->fd); + + /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state + * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE, + * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue + * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining + * occurs. */ + f->archive = true; + + /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal + * files when we archive them */ + f->defrag_on_close = true; + + return 0; +} + +JournalFile* journal_initiate_close( + JournalFile *f, + Set *deferred_closes) { + + int r; + + assert(f); + + if (deferred_closes) { + + r = set_put(deferred_closes, f); + if (r < 0) + log_debug_errno(r, "Failed to add file to deferred close set, closing immediately."); + else { + (void) journal_file_set_offline(f, false); + return NULL; + } + } + + return journal_file_close(f); +} + +int journal_file_rotate( + JournalFile **f, + bool compress, + uint64_t compress_threshold_bytes, + bool seal, + Set *deferred_closes) { + + JournalFile *new_file = NULL; + int r; + + assert(f); + assert(*f); + + r = journal_file_archive(*f); + if (r < 0) + return r; + + r = journal_file_open( + -1, + (*f)->path, + (*f)->flags, + (*f)->mode, + compress, + compress_threshold_bytes, + seal, + NULL, /* metrics */ + (*f)->mmap, + deferred_closes, + *f, /* template */ + &new_file); + + journal_initiate_close(*f, deferred_closes); + *f = new_file; + + return r; +} + +int journal_file_dispose(int dir_fd, const char *fname) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -1; + + assert(fname); + + /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that + * this is done without looking into the file or changing any of its contents. The idea is that this is called + * whenever something is suspicious and we want to move the file away and make clear that it is not accessed + * for writing anymore. */ + + if (!endswith(fname, ".journal")) + return -EINVAL; + + if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~", + (int) strlen(fname) - 8, fname, + now(CLOCK_REALTIME), + random_u64()) < 0) + return -ENOMEM; + + if (renameat(dir_fd, fname, dir_fd, p) < 0) + return -errno; + + /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */ + fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m"); + else { + (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL); + (void) btrfs_defrag_fd(fd); + } + + return 0; +} + +int journal_file_open_reliably( + const char *fname, + int flags, + mode_t mode, + bool compress, + uint64_t compress_threshold_bytes, + bool seal, + JournalMetrics *metrics, + MMapCache *mmap_cache, + Set *deferred_closes, + JournalFile *template, + JournalFile **ret) { + + int r; + + r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache, + deferred_closes, template, ret); + if (!IN_SET(r, + -EBADMSG, /* Corrupted */ + -ENODATA, /* Truncated */ + -EHOSTDOWN, /* Other machine */ + -EPROTONOSUPPORT, /* Incompatible feature */ + -EBUSY, /* Unclean shutdown */ + -ESHUTDOWN, /* Already archived */ + -EIO, /* IO error, including SIGBUS on mmap */ + -EIDRM, /* File has been deleted */ + -ETXTBSY)) /* File is from the future */ + return r; + + if ((flags & O_ACCMODE) == O_RDONLY) + return r; + + if (!(flags & O_CREAT)) + return r; + + if (!endswith(fname, ".journal")) + return r; + + /* The file is corrupted. Rotate it away and try it again (but only once) */ + log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname); + + r = journal_file_dispose(AT_FDCWD, fname); + if (r < 0) + return r; + + return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache, + deferred_closes, template, ret); +} + +int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) { + uint64_t i, n; + uint64_t q, xor_hash = 0; + int r; + EntryItem *items; + dual_timestamp ts; + const sd_id128_t *boot_id; + + assert(from); + assert(to); + assert(o); + assert(p); + + if (!to->writable) + return -EPERM; + + ts.monotonic = le64toh(o->entry.monotonic); + ts.realtime = le64toh(o->entry.realtime); + boot_id = &o->entry.boot_id; + + n = journal_file_entry_n_items(o); + /* alloca() can't take 0, hence let's allocate at least one */ + items = newa(EntryItem, MAX(1u, n)); + + for (i = 0; i < n; i++) { + uint64_t l, h; + le64_t le_hash; + size_t t; + void *data; + Object *u; + + q = le64toh(o->entry.items[i].object_offset); + le_hash = o->entry.items[i].hash; + + r = journal_file_move_to_object(from, OBJECT_DATA, q, &o); + if (r < 0) + return r; + + if (le_hash != o->data.hash) + return -EBADMSG; + + l = le64toh(READ_NOW(o->object.size)); + if (l < offsetof(Object, data.payload)) + return -EBADMSG; + + l -= offsetof(Object, data.payload); + t = (size_t) l; + + /* We hit the limit on 32bit machines */ + if ((uint64_t) t != l) + return -E2BIG; + + if (o->object.flags & OBJECT_COMPRESSION_MASK) { +#if HAVE_COMPRESSION + size_t rsize = 0; + + r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK, + o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0); + if (r < 0) + return r; + + data = from->compress_buffer; + l = rsize; +#else + return -EPROTONOSUPPORT; +#endif + } else + data = o->data.payload; + + r = journal_file_append_data(to, data, l, &u, &h); + if (r < 0) + return r; + + if (JOURNAL_HEADER_KEYED_HASH(to->header)) + xor_hash ^= jenkins_hash64(data, l); + else + xor_hash ^= le64toh(u->data.hash); + + items[i].object_offset = htole64(h); + items[i].hash = u->data.hash; + + r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + } + + r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, + NULL, NULL, NULL); + + if (mmap_cache_got_sigbus(to->mmap, to->cache_fd)) + return -EIO; + + return r; +} + +void journal_reset_metrics(JournalMetrics *m) { + assert(m); + + /* Set everything to "pick automatic values". */ + + *m = (JournalMetrics) { + .min_use = (uint64_t) -1, + .max_use = (uint64_t) -1, + .min_size = (uint64_t) -1, + .max_size = (uint64_t) -1, + .keep_free = (uint64_t) -1, + .n_max_files = (uint64_t) -1, + }; +} + +void journal_default_metrics(JournalMetrics *m, int fd) { + char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX]; + struct statvfs ss; + uint64_t fs_size = 0; + + assert(m); + assert(fd >= 0); + + if (fstatvfs(fd, &ss) >= 0) + fs_size = ss.f_frsize * ss.f_blocks; + else + log_debug_errno(errno, "Failed to determine disk size: %m"); + + if (m->max_use == (uint64_t) -1) { + + if (fs_size > 0) + m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */ + MAX_USE_LOWER, MAX_USE_UPPER); + else + m->max_use = MAX_USE_LOWER; + } else { + m->max_use = PAGE_ALIGN(m->max_use); + + if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2) + m->max_use = JOURNAL_FILE_SIZE_MIN*2; + } + + if (m->min_use == (uint64_t) -1) { + if (fs_size > 0) + m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */ + MIN_USE_LOW, MIN_USE_HIGH); + else + m->min_use = MIN_USE_LOW; + } + + if (m->min_use > m->max_use) + m->min_use = m->max_use; + + if (m->max_size == (uint64_t) -1) + m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */ + MAX_SIZE_UPPER); + else + m->max_size = PAGE_ALIGN(m->max_size); + + if (m->max_size != 0) { + if (m->max_size < JOURNAL_FILE_SIZE_MIN) + m->max_size = JOURNAL_FILE_SIZE_MIN; + + if (m->max_use != 0 && m->max_size*2 > m->max_use) + m->max_use = m->max_size*2; + } + + if (m->min_size == (uint64_t) -1) + m->min_size = JOURNAL_FILE_SIZE_MIN; + else + m->min_size = CLAMP(PAGE_ALIGN(m->min_size), + JOURNAL_FILE_SIZE_MIN, + m->max_size ?: UINT64_MAX); + + if (m->keep_free == (uint64_t) -1) { + if (fs_size > 0) + m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */ + KEEP_FREE_UPPER); + else + m->keep_free = DEFAULT_KEEP_FREE; + } + + if (m->n_max_files == (uint64_t) -1) + m->n_max_files = DEFAULT_N_MAX_FILES; + + log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64, + format_bytes(a, sizeof(a), m->min_use), + format_bytes(b, sizeof(b), m->max_use), + format_bytes(c, sizeof(c), m->max_size), + format_bytes(d, sizeof(d), m->min_size), + format_bytes(e, sizeof(e), m->keep_free), + m->n_max_files); +} + +int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) { + assert(f); + assert(f->header); + assert(from || to); + + if (from) { + if (f->header->head_entry_realtime == 0) + return -ENOENT; + + *from = le64toh(f->header->head_entry_realtime); + } + + if (to) { + if (f->header->tail_entry_realtime == 0) + return -ENOENT; + + *to = le64toh(f->header->tail_entry_realtime); + } + + return 1; +} + +int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) { + Object *o; + uint64_t p; + int r; + + assert(f); + assert(from || to); + + r = find_data_object_by_boot_id(f, boot_id, &o, &p); + if (r <= 0) + return r; + + if (le64toh(o->data.n_entries) <= 0) + return 0; + + if (from) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o); + if (r < 0) + return r; + + *from = le64toh(o->entry.monotonic); + } + + if (to) { + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + r = generic_array_get_plus_one(f, + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + le64toh(o->data.n_entries)-1, + &o, NULL); + if (r <= 0) + return r; + + *to = le64toh(o->entry.monotonic); + } + + return 1; +} + +bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) { + assert(f); + assert(f->header); + + /* If we gained new header fields we gained new features, + * hence suggest a rotation */ + if (le64toh(f->header->header_size) < sizeof(Header)) { + log_debug("%s uses an outdated header, suggesting rotation.", f->path); + return true; + } + + /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from + * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we + * need the n_data field, which only exists in newer versions. */ + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) { + log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.", + f->path, + 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))), + le64toh(f->header->n_data), + le64toh(f->header->data_hash_table_size) / sizeof(HashItem), + (unsigned long long) f->last_stat.st_size, + f->last_stat.st_size / le64toh(f->header->n_data)); + return true; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) { + log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.", + f->path, + 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))), + le64toh(f->header->n_fields), + le64toh(f->header->field_hash_table_size) / sizeof(HashItem)); + return true; + } + + /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our + * longest chain is longer than some threshold, let's suggest rotation. */ + if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) && + le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { + log_debug("Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.", + f->path, le64toh(f->header->data_hash_chain_depth)); + return true; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) && + le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { + log_debug("Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.", + f->path, le64toh(f->header->field_hash_chain_depth)); + return true; + } + + /* Are the data objects properly indexed by field objects? */ + if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && + JOURNAL_HEADER_CONTAINS(f->header, n_fields) && + le64toh(f->header->n_data) > 0 && + le64toh(f->header->n_fields) == 0) + return true; + + if (max_file_usec > 0) { + usec_t t, h; + + h = le64toh(f->header->head_entry_realtime); + t = now(CLOCK_REALTIME); + + if (h > 0 && t > h + max_file_usec) + return true; + } + + return false; +} diff --git a/src/journal/journal-file.h b/src/journal/journal-file.h new file mode 100644 index 0000000..c48d95f --- /dev/null +++ b/src/journal/journal-file.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> +#include <sys/uio.h> + +#if HAVE_GCRYPT +# include <gcrypt.h> +#endif + +#include "sd-event.h" +#include "sd-id128.h" + +#include "hashmap.h" +#include "journal-def.h" +#include "mmap-cache.h" +#include "sparse-endian.h" +#include "time-util.h" + +typedef struct JournalMetrics { + /* For all these: -1 means "pick automatically", and 0 means "no limit enforced" */ + uint64_t max_size; /* how large journal files grow at max */ + uint64_t min_size; /* how large journal files grow at least */ + uint64_t max_use; /* how much disk space to use in total at max, keep_free permitting */ + uint64_t min_use; /* how much disk space to use in total at least, even if keep_free says not to */ + uint64_t keep_free; /* how much to keep free on disk */ + uint64_t n_max_files; /* how many files to keep around at max */ +} JournalMetrics; + +typedef enum direction { + DIRECTION_UP, + DIRECTION_DOWN +} direction_t; + +typedef enum LocationType { + /* The first and last entries, resp. */ + LOCATION_HEAD, + LOCATION_TAIL, + + /* We already read the entry we currently point to, and the + * next one to read should probably not be this one again. */ + LOCATION_DISCRETE, + + /* We should seek to the precise location specified, and + * return it, as we haven't read it yet. */ + LOCATION_SEEK +} LocationType; + +typedef enum OfflineState { + OFFLINE_JOINED, + OFFLINE_SYNCING, + OFFLINE_OFFLINING, + OFFLINE_CANCEL, + OFFLINE_AGAIN_FROM_SYNCING, + OFFLINE_AGAIN_FROM_OFFLINING, + OFFLINE_DONE +} OfflineState; + +typedef struct JournalFile { + int fd; + MMapFileDescriptor *cache_fd; + + mode_t mode; + + int flags; + int prot; + bool writable:1; + bool compress_xz:1; + bool compress_lz4:1; + bool compress_zstd:1; + bool seal:1; + bool defrag_on_close:1; + bool close_fd:1; + bool archive:1; + bool keyed_hash:1; + + direction_t last_direction; + LocationType location_type; + uint64_t last_n_entries; + + char *path; + struct stat last_stat; + usec_t last_stat_usec; + + Header *header; + HashItem *data_hash_table; + HashItem *field_hash_table; + + uint64_t current_offset; + uint64_t current_seqnum; + uint64_t current_realtime; + uint64_t current_monotonic; + sd_id128_t current_boot_id; + uint64_t current_xor_hash; + + JournalMetrics metrics; + MMapCache *mmap; + + sd_event_source *post_change_timer; + usec_t post_change_timer_period; + + OrderedHashmap *chain_cache; + + pthread_t offline_thread; + volatile OfflineState offline_state; + + unsigned last_seen_generation; + + uint64_t compress_threshold_bytes; +#if HAVE_COMPRESSION + void *compress_buffer; + size_t compress_buffer_size; +#endif + +#if HAVE_GCRYPT + gcry_md_hd_t hmac; + bool hmac_running; + + FSSHeader *fss_file; + size_t fss_file_size; + + uint64_t fss_start_usec; + uint64_t fss_interval_usec; + + void *fsprg_state; + size_t fsprg_state_size; + + void *fsprg_seed; + size_t fsprg_seed_size; +#endif +} JournalFile; + +int journal_file_open( + int fd, + const char *fname, + int flags, + mode_t mode, + bool compress, + uint64_t compress_threshold_bytes, + bool seal, + JournalMetrics *metrics, + MMapCache *mmap_cache, + Set *deferred_closes, + JournalFile *template, + JournalFile **ret); + +int journal_file_set_offline(JournalFile *f, bool wait); +bool journal_file_is_offlining(JournalFile *f); +JournalFile* journal_file_close(JournalFile *j); +int journal_file_fstat(JournalFile *f); +DEFINE_TRIVIAL_CLEANUP_FUNC(JournalFile*, journal_file_close); + +int journal_file_open_reliably( + const char *fname, + int flags, + mode_t mode, + bool compress, + uint64_t compress_threshold_bytes, + bool seal, + JournalMetrics *metrics, + MMapCache *mmap_cache, + Set *deferred_closes, + JournalFile *template, + JournalFile **ret); + +#define ALIGN64(x) (((x) + 7ULL) & ~7ULL) +#define VALID64(x) (((x) & 7ULL) == 0ULL) + +/* Use six characters to cover the offsets common in smallish journal + * files without adding too many zeros. */ +#define OFSfmt "%06"PRIx64 + +static inline bool VALID_REALTIME(uint64_t u) { + /* This considers timestamps until the year 3112 valid. That should be plenty room... */ + return u > 0 && u < (1ULL << 55); +} + +static inline bool VALID_MONOTONIC(uint64_t u) { + /* This considers timestamps until 1142 years of runtime valid. */ + return u < (1ULL << 55); +} + +static inline bool VALID_EPOCH(uint64_t u) { + /* This allows changing the key for 1142 years, every usec. */ + return u < (1ULL << 55); +} + +#define JOURNAL_HEADER_CONTAINS(h, field) \ + (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field)) + +#define JOURNAL_HEADER_SEALED(h) \ + FLAGS_SET(le32toh((h)->compatible_flags), HEADER_COMPATIBLE_SEALED) + +#define JOURNAL_HEADER_COMPRESSED_XZ(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_XZ) + +#define JOURNAL_HEADER_COMPRESSED_LZ4(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_LZ4) + +#define JOURNAL_HEADER_COMPRESSED_ZSTD(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) + +#define JOURNAL_HEADER_KEYED_HASH(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_KEYED_HASH) + +int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret); + +uint64_t journal_file_entry_n_items(Object *o) _pure_; +uint64_t journal_file_entry_array_n_items(Object *o) _pure_; +uint64_t journal_file_hash_table_n_items(Object *o) _pure_; + +int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset); +int journal_file_append_entry( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + const struct iovec iovec[], unsigned n_iovec, + uint64_t *seqno, + Object **ret, + uint64_t *offset); + +int journal_file_find_data_object(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset); +int journal_file_find_data_object_with_hash(JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset); + +int journal_file_find_field_object(JournalFile *f, const void *field, uint64_t size, Object **ret, uint64_t *offset); +int journal_file_find_field_object_with_hash(JournalFile *f, const void *field, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset); + +void journal_file_reset_location(JournalFile *f); +void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset); +int journal_file_compare_locations(JournalFile *af, JournalFile *bf); +int journal_file_next_entry(JournalFile *f, uint64_t p, direction_t direction, Object **ret, uint64_t *offset); + +int journal_file_next_entry_for_data(JournalFile *f, Object *o, uint64_t p, uint64_t data_offset, direction_t direction, Object **ret, uint64_t *offset); + +int journal_file_move_to_entry_by_seqnum(JournalFile *f, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset); +int journal_file_move_to_entry_by_realtime(JournalFile *f, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset); +int journal_file_move_to_entry_by_monotonic(JournalFile *f, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset); + +int journal_file_move_to_entry_by_offset_for_data(JournalFile *f, uint64_t data_offset, uint64_t p, direction_t direction, Object **ret, uint64_t *offset); +int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, uint64_t data_offset, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset); +int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, uint64_t data_offset, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset); +int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, uint64_t data_offset, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset); + +int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p); + +void journal_file_dump(JournalFile *f); +void journal_file_print_header(JournalFile *f); + +int journal_file_archive(JournalFile *f); +JournalFile* journal_initiate_close(JournalFile *f, Set *deferred_closes); +int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes); + +int journal_file_dispose(int dir_fd, const char *fname); + +void journal_file_post_change(JournalFile *f); +int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t); + +void journal_reset_metrics(JournalMetrics *m); +void journal_default_metrics(JournalMetrics *m, int fd); + +int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to); +int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot, usec_t *from, usec_t *to); + +bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec); + +int journal_file_map_data_hash_table(JournalFile *f); +int journal_file_map_field_hash_table(JournalFile *f); + +static inline bool JOURNAL_FILE_COMPRESS(JournalFile *f) { + assert(f); + return f->compress_xz || f->compress_lz4 || f->compress_zstd; +} + +uint64_t journal_file_hash_data(JournalFile *f, const void *data, size_t sz); + +bool journal_field_valid(const char *p, size_t l, bool allow_protected); diff --git a/src/journal/journal-internal.h b/src/journal/journal-internal.h new file mode 100644 index 0000000..c2d29aa --- /dev/null +++ b/src/journal/journal-internal.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> +#include <stdbool.h> +#include <sys/types.h> + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "hashmap.h" +#include "journal-def.h" +#include "journal-file.h" +#include "list.h" +#include "set.h" + +typedef struct Match Match; +typedef struct Location Location; +typedef struct Directory Directory; + +typedef enum MatchType { + MATCH_DISCRETE, + MATCH_OR_TERM, + MATCH_AND_TERM +} MatchType; + +struct Match { + MatchType type; + Match *parent; + LIST_FIELDS(Match, matches); + + /* For concrete matches */ + char *data; + size_t size; + uint64_t hash; /* old-style jenkins hash. New-style siphash is different per file, hence won't be cached here */ + + /* For terms */ + LIST_HEAD(Match, matches); +}; + +struct Location { + LocationType type; + + bool seqnum_set:1; + bool realtime_set:1; + bool monotonic_set:1; + bool xor_hash_set:1; + + uint64_t seqnum; + sd_id128_t seqnum_id; + + uint64_t realtime; + + uint64_t monotonic; + sd_id128_t boot_id; + + uint64_t xor_hash; +}; + +struct Directory { + char *path; + int wd; + bool is_root; + unsigned last_seen_generation; +}; + +struct sd_journal { + int toplevel_fd; + + char *path; + char *prefix; + char *namespace; + + OrderedHashmap *files; + IteratedCache *files_cache; + MMapCache *mmap; + + Location current_location; + + JournalFile *current_file; + uint64_t current_field; + + Match *level0, *level1, *level2; + + pid_t original_pid; + + int inotify_fd; + unsigned current_invalidate_counter, last_invalidate_counter; + usec_t last_process_usec; + unsigned generation; + + /* Iterating through unique fields and their data values */ + char *unique_field; + JournalFile *unique_file; + uint64_t unique_offset; + + /* Iterating through known fields */ + JournalFile *fields_file; + uint64_t fields_offset; + uint64_t fields_hash_table_index; + char *fields_buffer; + size_t fields_buffer_allocated; + + int flags; + + bool on_network:1; + bool no_new_files:1; + bool no_inotify:1; + bool unique_file_lost:1; /* File we were iterating over got + removed, and there were no more + files, so sd_j_enumerate_unique + will return a value equal to 0. */ + bool fields_file_lost:1; + bool has_runtime_files:1; + bool has_persistent_files:1; + + size_t data_threshold; + + Hashmap *directories_by_path; + Hashmap *directories_by_wd; + + Hashmap *errors; +}; + +char *journal_make_match_string(sd_journal *j); +void journal_print_header(sd_journal *j); + +#define JOURNAL_FOREACH_DATA_RETVAL(j, data, l, retval) \ + for (sd_journal_restart_data(j); ((retval) = sd_journal_enumerate_data((j), &(data), &(l))) > 0; ) + +/* All errors that we might encounter while extracting a field that are not real errors, + * but only mean that the field is too large or we don't support the compression. */ +static inline bool JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(int r) { + return IN_SET(abs(r), + ENOBUFS, /* Field or decompressed field too large */ + E2BIG, /* Field too large for pointer width */ + EPROTONOSUPPORT); /* Unsupported compression */ +} diff --git a/src/journal/journal-send.c b/src/journal/journal-send.c new file mode 100644 index 0000000..fd3fd7e --- /dev/null +++ b/src/journal/journal-send.c @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <printf.h> +#include <stddef.h> +#include <sys/un.h> +#include <unistd.h> + +#define SD_JOURNAL_SUPPRESS_LOCATION + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "fileio.h" +#include "memfd-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tmpfile-util.h" + +#define SNDBUF_SIZE (8*1024*1024) + +#define ALLOCA_CODE_FUNC(f, func) \ + do { \ + size_t _fl; \ + const char *_func = (func); \ + char **_f = &(f); \ + _fl = strlen(_func) + 1; \ + *_f = newa(char, _fl + 10); \ + memcpy(*_f, "CODE_FUNC=", 10); \ + memcpy(*_f + 10, _func, _fl); \ + } while (false) + +/* We open a single fd, and we'll share it with the current process, + * all its threads, and all its subprocesses. This means we need to + * initialize it atomically, and need to operate on it atomically + * never assuming we are the only user */ + +static int journal_fd(void) { + int fd; + static int fd_plus_one = 0; + +retry: + if (fd_plus_one > 0) + return fd_plus_one - 1; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + fd_inc_sndbuf(fd, SNDBUF_SIZE); + + if (!__sync_bool_compare_and_swap(&fd_plus_one, 0, fd+1)) { + safe_close(fd); + goto retry; + } + + return fd; +} + +_public_ int sd_journal_print(int priority, const char *format, ...) { + int r; + va_list ap; + + va_start(ap, format); + r = sd_journal_printv(priority, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_journal_printv(int priority, const char *format, va_list ap) { + char p[STRLEN("PRIORITY=") + DECIMAL_STR_MAX(int) + 1]; + char sbuf[LINE_MAX + 8] = "MESSAGE="; + struct iovec iov[2]; + int len; + va_list aq; + char *buffer = sbuf; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + assert_return(format, -EINVAL); + + xsprintf(p, "PRIORITY=%i", priority & LOG_PRIMASK); + + va_copy(aq, ap); + len = vsnprintf(buffer + 8, LINE_MAX, format, aq); + va_end(aq); + + if (len >= (int)LONG_LINE_MAX - 8) + return -ENOBUFS; + + /* Allocate large buffer to accommodate big message */ + if (len >= LINE_MAX) { + buffer = alloca(len + 9); + memcpy(buffer, "MESSAGE=", 8); + assert_se(vsnprintf(buffer + 8, len + 1, format, ap) == len); + } + + /* Strip trailing whitespace, keep prefix whitespace. */ + (void) strstrip(buffer); + + /* Suppress empty lines */ + if (isempty(buffer + 8)) + return 0; + + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); + + return sd_journal_sendv(iov, 2); +} + +_printf_(1, 0) static int fill_iovec_sprintf(const char *format, va_list ap, int extra, struct iovec **_iov) { + PROTECT_ERRNO; + int r, n = 0, i = 0, j; + struct iovec *iov = NULL; + + assert(_iov); + + if (extra > 0) { + n = MAX(extra * 2, extra + 4); + iov = malloc0(n * sizeof(struct iovec)); + if (!iov) { + r = -ENOMEM; + goto fail; + } + + i = extra; + } + + while (format) { + struct iovec *c; + char *buffer; + va_list aq; + + if (i >= n) { + n = MAX(i*2, 4); + c = reallocarray(iov, n, sizeof(struct iovec)); + if (!c) { + r = -ENOMEM; + goto fail; + } + + iov = c; + } + + va_copy(aq, ap); + if (vasprintf(&buffer, format, aq) < 0) { + va_end(aq); + r = -ENOMEM; + goto fail; + } + va_end(aq); + + VA_FORMAT_ADVANCE(format, ap); + + (void) strstrip(buffer); /* strip trailing whitespace, keep prefixing whitespace */ + + iov[i++] = IOVEC_MAKE_STRING(buffer); + + format = va_arg(ap, char *); + } + + *_iov = iov; + + return i; + +fail: + for (j = 0; j < i; j++) + free(iov[j].iov_base); + + free(iov); + + return r; +} + +_public_ int sd_journal_send(const char *format, ...) { + int r, i, j; + va_list ap; + struct iovec *iov = NULL; + + va_start(ap, format); + i = fill_iovec_sprintf(format, ap, 0, &iov); + va_end(ap); + + if (_unlikely_(i < 0)) { + r = i; + goto finish; + } + + r = sd_journal_sendv(iov, i); + +finish: + for (j = 0; j < i; j++) + free(iov[j].iov_base); + + free(iov); + + return r; +} + +_public_ int sd_journal_sendv(const struct iovec *iov, int n) { + PROTECT_ERRNO; + int fd, r; + _cleanup_close_ int buffer_fd = -1; + struct iovec *w; + uint64_t *l; + int i, j = 0; + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/journal/socket", + }; + struct msghdr mh = { + .msg_name = (struct sockaddr*) &sa.sa, + .msg_namelen = SOCKADDR_UN_LEN(sa.un), + }; + ssize_t k; + bool have_syslog_identifier = false; + bool seal = true; + + assert_return(iov, -EINVAL); + assert_return(n > 0, -EINVAL); + + w = newa(struct iovec, n * 5 + 3); + l = newa(uint64_t, n); + + for (i = 0; i < n; i++) { + char *c, *nl; + + if (_unlikely_(!iov[i].iov_base || iov[i].iov_len <= 1)) + return -EINVAL; + + c = memchr(iov[i].iov_base, '=', iov[i].iov_len); + if (_unlikely_(!c || c == iov[i].iov_base)) + return -EINVAL; + + have_syslog_identifier = have_syslog_identifier || + (c == (char *) iov[i].iov_base + 17 && + startswith(iov[i].iov_base, "SYSLOG_IDENTIFIER")); + + nl = memchr(iov[i].iov_base, '\n', iov[i].iov_len); + if (nl) { + if (_unlikely_(nl < c)) + return -EINVAL; + + /* Already includes a newline? Bummer, then + * let's write the variable name, then a + * newline, then the size (64bit LE), followed + * by the data and a final newline */ + + w[j++] = IOVEC_MAKE(iov[i].iov_base, c - (char*) iov[i].iov_base); + w[j++] = IOVEC_MAKE_STRING("\n"); + + l[i] = htole64(iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); + w[j++] = IOVEC_MAKE(&l[i], sizeof(uint64_t)); + + w[j++] = IOVEC_MAKE(c + 1, iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); + } else + /* Nothing special? Then just add the line and + * append a newline */ + w[j++] = iov[i]; + + w[j++] = IOVEC_MAKE_STRING("\n"); + } + + if (!have_syslog_identifier && + string_is_safe(program_invocation_short_name)) { + + /* Implicitly add program_invocation_short_name, if it + * is not set explicitly. We only do this for + * program_invocation_short_name, and nothing else + * since everything else is much nicer to retrieve + * from the outside. */ + + w[j++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER="); + w[j++] = IOVEC_MAKE_STRING(program_invocation_short_name); + w[j++] = IOVEC_MAKE_STRING("\n"); + } + + fd = journal_fd(); + if (_unlikely_(fd < 0)) + return fd; + + mh.msg_iov = w; + mh.msg_iovlen = j; + + k = sendmsg(fd, &mh, MSG_NOSIGNAL); + if (k >= 0) + return 0; + + /* Fail silently if the journal is not available */ + if (errno == ENOENT) + return 0; + + if (!IN_SET(errno, EMSGSIZE, ENOBUFS)) + return -errno; + + /* Message doesn't fit... Let's dump the data in a memfd or + * temporary file and just pass a file descriptor of it to the + * other side. + * + * For the temporary files we use /dev/shm instead of /tmp + * here, since we want this to be a tmpfs, and one that is + * available from early boot on and where unprivileged users + * can create files. */ + buffer_fd = memfd_new(NULL); + if (buffer_fd < 0) { + if (buffer_fd == -ENOSYS) { + buffer_fd = open_tmpfile_unlinkable("/dev/shm", O_RDWR | O_CLOEXEC); + if (buffer_fd < 0) + return buffer_fd; + + seal = false; + } else + return buffer_fd; + } + + n = writev(buffer_fd, w, j); + if (n < 0) + return -errno; + + if (seal) { + r = memfd_set_sealed(buffer_fd); + if (r < 0) + return r; + } + + r = send_one_fd_sa(fd, buffer_fd, mh.msg_name, mh.msg_namelen, 0); + if (r == -ENOENT) + /* Fail silently if the journal is not available */ + return 0; + return r; +} + +static int fill_iovec_perror_and_send(const char *message, int skip, struct iovec iov[]) { + PROTECT_ERRNO; + size_t n, k; + + k = isempty(message) ? 0 : strlen(message) + 2; + n = 8 + k + 256 + 1; + + for (;;) { + char buffer[n]; + char* j; + + errno = 0; + j = strerror_r(_saved_errno_, buffer + 8 + k, n - 8 - k); + if (errno == 0) { + char error[STRLEN("ERRNO=") + DECIMAL_STR_MAX(int) + 1]; + + if (j != buffer + 8 + k) + memmove(buffer + 8 + k, j, strlen(j)+1); + + memcpy(buffer, "MESSAGE=", 8); + + if (k > 0) { + memcpy(buffer + 8, message, k - 2); + memcpy(buffer + 8 + k - 2, ": ", 2); + } + + xsprintf(error, "ERRNO=%i", _saved_errno_); + + assert_cc(3 == LOG_ERR); + iov[skip+0] = IOVEC_MAKE_STRING("PRIORITY=3"); + iov[skip+1] = IOVEC_MAKE_STRING(buffer); + iov[skip+2] = IOVEC_MAKE_STRING(error); + + return sd_journal_sendv(iov, skip + 3); + } + + if (errno != ERANGE) + return -errno; + + n *= 2; + } +} + +_public_ int sd_journal_perror(const char *message) { + struct iovec iovec[3]; + + return fill_iovec_perror_and_send(message, 0, iovec); +} + +_public_ int sd_journal_stream_fd(const char *identifier, int priority, int level_prefix) { + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/journal/stdout", + }; + _cleanup_close_ int fd = -1; + char *header; + size_t l; + int r; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return -errno; + + if (shutdown(fd, SHUT_RD) < 0) + return -errno; + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + identifier = strempty(identifier); + + l = strlen(identifier); + header = newa(char, l + 1 + 1 + 2 + 2 + 2 + 2 + 2); + + memcpy(header, identifier, l); + header[l++] = '\n'; + header[l++] = '\n'; /* unit id */ + header[l++] = '0' + priority; + header[l++] = '\n'; + header[l++] = '0' + !!level_prefix; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + + r = loop_write(fd, header, l, false); + if (r < 0) + return r; + + return TAKE_FD(fd); +} + +_public_ int sd_journal_print_with_location(int priority, const char *file, const char *line, const char *func, const char *format, ...) { + int r; + va_list ap; + + va_start(ap, format); + r = sd_journal_printv_with_location(priority, file, line, func, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_journal_printv_with_location(int priority, const char *file, const char *line, const char *func, const char *format, va_list ap) { + char p[STRLEN("PRIORITY=") + DECIMAL_STR_MAX(int) + 1]; + char sbuf[LINE_MAX + 8] = "MESSAGE="; + struct iovec iov[5]; + char *f; + int len; + char *buffer = sbuf; + va_list aq; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + assert_return(format, -EINVAL); + + xsprintf(p, "PRIORITY=%i", priority & LOG_PRIMASK); + + va_copy(aq, ap); + len = vsnprintf(buffer + 8, LINE_MAX, format, aq); + va_end(aq); + + if (len >= (int)LONG_LINE_MAX - 8) + return -ENOBUFS; + + /* Allocate large buffer to accommodate big message */ + if (len >= LINE_MAX) { + buffer = alloca(len + 9); + memcpy(buffer, "MESSAGE=", 8); + assert_se(vsnprintf(buffer + 8, len + 1, format, ap) == len); + } + + /* Strip trailing whitespace, keep prefixing whitespace */ + (void) strstrip(buffer); + + /* Suppress empty lines */ + if (isempty(buffer + 8)) + return 0; + + /* func is initialized from __func__ which is not a macro, but + * a static const char[], hence cannot easily be prefixed with + * CODE_FUNC=, hence let's do it manually here. */ + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); + iov[2] = IOVEC_MAKE_STRING(file); + iov[3] = IOVEC_MAKE_STRING(line); + iov[4] = IOVEC_MAKE_STRING(f); + + return sd_journal_sendv(iov, ELEMENTSOF(iov)); +} + +_public_ int sd_journal_send_with_location(const char *file, const char *line, const char *func, const char *format, ...) { + _cleanup_free_ struct iovec *iov = NULL; + int r, i, j; + va_list ap; + char *f; + + va_start(ap, format); + i = fill_iovec_sprintf(format, ap, 3, &iov); + va_end(ap); + + if (_unlikely_(i < 0)) { + r = i; + goto finish; + } + + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); + + r = sd_journal_sendv(iov, i); + +finish: + for (j = 3; j < i; j++) + free(iov[j].iov_base); + + return r; +} + +_public_ int sd_journal_sendv_with_location( + const char *file, const char *line, + const char *func, + const struct iovec *iov, int n) { + + struct iovec *niov; + char *f; + + assert_return(iov, -EINVAL); + assert_return(n > 0, -EINVAL); + + niov = newa(struct iovec, n + 3); + memcpy(niov, iov, sizeof(struct iovec) * n); + + ALLOCA_CODE_FUNC(f, func); + + niov[n++] = IOVEC_MAKE_STRING(file); + niov[n++] = IOVEC_MAKE_STRING(line); + niov[n++] = IOVEC_MAKE_STRING(f); + + return sd_journal_sendv(niov, n); +} + +_public_ int sd_journal_perror_with_location( + const char *file, const char *line, + const char *func, + const char *message) { + + struct iovec iov[6]; + char *f; + + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); + + return fill_iovec_perror_and_send(message, 3, iov); +} diff --git a/src/journal/journal-vacuum.c b/src/journal/journal-vacuum.c new file mode 100644 index 0000000..c173664 --- /dev/null +++ b/src/journal/journal-vacuum.c @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-vacuum.h" +#include "sort-util.h" +#include "string-util.h" +#include "time-util.h" +#include "xattr-util.h" + +struct vacuum_info { + uint64_t usage; + char *filename; + + uint64_t realtime; + + sd_id128_t seqnum_id; + uint64_t seqnum; + bool have_seqnum; +}; + +static int vacuum_compare(const struct vacuum_info *a, const struct vacuum_info *b) { + int r; + + if (a->have_seqnum && b->have_seqnum && + sd_id128_equal(a->seqnum_id, b->seqnum_id)) + return CMP(a->seqnum, b->seqnum); + + r = CMP(a->realtime, b->realtime); + if (r != 0) + return r; + + if (a->have_seqnum && b->have_seqnum) + return memcmp(&a->seqnum_id, &b->seqnum_id, 16); + + return strcmp(a->filename, b->filename); +} + +static void patch_realtime( + int fd, + const char *fn, + const struct stat *st, + unsigned long long *realtime) { + + usec_t x, crtime = 0; + + /* The timestamp was determined by the file name, but let's + * see if the file might actually be older than the file name + * suggested... */ + + assert(fd >= 0); + assert(fn); + assert(st); + assert(realtime); + + x = timespec_load(&st->st_ctim); + if (x > 0 && x != USEC_INFINITY && x < *realtime) + *realtime = x; + + x = timespec_load(&st->st_atim); + if (x > 0 && x != USEC_INFINITY && x < *realtime) + *realtime = x; + + x = timespec_load(&st->st_mtim); + if (x > 0 && x != USEC_INFINITY && x < *realtime) + *realtime = x; + + /* Let's read the original creation time, if possible. Ideally + * we'd just query the creation time the FS might provide, but + * unfortunately there's currently no sane API to query + * it. Hence let's implement this manually... */ + + if (fd_getcrtime_at(fd, fn, &crtime, 0) >= 0) { + if (crtime < *realtime) + *realtime = crtime; + } +} + +static int journal_file_empty(int dir_fd, const char *name) { + _cleanup_close_ int fd; + struct stat st; + le64_t n_entries; + ssize_t n; + + fd = openat(dir_fd, name, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NONBLOCK|O_NOATIME); + if (fd < 0) { + /* Maybe failed due to O_NOATIME and lack of privileges? */ + fd = openat(dir_fd, name, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NONBLOCK); + if (fd < 0) + return -errno; + } + + if (fstat(fd, &st) < 0) + return -errno; + + /* If an offline file doesn't even have a header we consider it empty */ + if (st.st_size < (off_t) sizeof(Header)) + return 1; + + /* If the number of entries is empty, we consider it empty, too */ + n = pread(fd, &n_entries, sizeof(n_entries), offsetof(Header, n_entries)); + if (n < 0) + return -errno; + if (n != sizeof(n_entries)) + return -EIO; + + return le64toh(n_entries) <= 0; +} + +int journal_directory_vacuum( + const char *directory, + uint64_t max_use, + uint64_t n_max_files, + usec_t max_retention_usec, + usec_t *oldest_usec, + bool verbose) { + + uint64_t sum = 0, freed = 0, n_active_files = 0; + size_t n_list = 0, n_allocated = 0, i; + _cleanup_closedir_ DIR *d = NULL; + struct vacuum_info *list = NULL; + usec_t retention_limit = 0; + char sbytes[FORMAT_BYTES_MAX]; + struct dirent *de; + int r; + + assert(directory); + + if (max_use <= 0 && max_retention_usec <= 0 && n_max_files <= 0) + return 0; + + if (max_retention_usec > 0) + retention_limit = usec_sub_unsigned(now(CLOCK_REALTIME), max_retention_usec); + + d = opendir(directory); + if (!d) + return -errno; + + FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) { + + unsigned long long seqnum = 0, realtime; + _cleanup_free_ char *p = NULL; + sd_id128_t seqnum_id; + bool have_seqnum; + uint64_t size; + struct stat st; + size_t q; + + if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) { + log_debug_errno(errno, "Failed to stat file %s while vacuuming, ignoring: %m", de->d_name); + continue; + } + + if (!S_ISREG(st.st_mode)) + continue; + + q = strlen(de->d_name); + + if (endswith(de->d_name, ".journal")) { + + /* Vacuum archived files. Active files are + * left around */ + + if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8) { + n_active_files++; + continue; + } + + if (de->d_name[q-8-16-1] != '-' || + de->d_name[q-8-16-1-16-1] != '-' || + de->d_name[q-8-16-1-16-1-32-1] != '@') { + n_active_files++; + continue; + } + + p = strdup(de->d_name); + if (!p) { + r = -ENOMEM; + goto finish; + } + + de->d_name[q-8-16-1-16-1] = 0; + if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) { + n_active_files++; + continue; + } + + if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) { + n_active_files++; + continue; + } + + have_seqnum = true; + + } else if (endswith(de->d_name, ".journal~")) { + unsigned long long tmp; + + /* Vacuum corrupted files */ + + if (q < 1 + 16 + 1 + 16 + 8 + 1) { + n_active_files++; + continue; + } + + if (de->d_name[q-1-8-16-1] != '-' || + de->d_name[q-1-8-16-1-16-1] != '@') { + n_active_files++; + continue; + } + + p = strdup(de->d_name); + if (!p) { + r = -ENOMEM; + goto finish; + } + + if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) { + n_active_files++; + continue; + } + + have_seqnum = false; + } else { + /* We do not vacuum unknown files! */ + log_debug("Not vacuuming unknown file %s.", de->d_name); + continue; + } + + size = 512UL * (uint64_t) st.st_blocks; + + r = journal_file_empty(dirfd(d), p); + if (r < 0) { + log_debug_errno(r, "Failed check if %s is empty, ignoring: %m", p); + continue; + } + if (r > 0) { + /* Always vacuum empty non-online files. */ + + r = unlinkat_deallocate(dirfd(d), p, 0); + if (r >= 0) { + + log_full(verbose ? LOG_INFO : LOG_DEBUG, + "Deleted empty archived journal %s/%s (%s).", directory, p, format_bytes(sbytes, sizeof(sbytes), size)); + + freed += size; + } else if (r != -ENOENT) + log_warning_errno(r, "Failed to delete empty archived journal %s/%s: %m", directory, p); + + continue; + } + + patch_realtime(dirfd(d), p, &st, &realtime); + + if (!GREEDY_REALLOC(list, n_allocated, n_list + 1)) { + r = -ENOMEM; + goto finish; + } + + list[n_list++] = (struct vacuum_info) { + .filename = TAKE_PTR(p), + .usage = size, + .seqnum = seqnum, + .realtime = realtime, + .seqnum_id = seqnum_id, + .have_seqnum = have_seqnum, + }; + + sum += size; + } + + typesafe_qsort(list, n_list, vacuum_compare); + + for (i = 0; i < n_list; i++) { + uint64_t left; + + left = n_active_files + n_list - i; + + if ((max_retention_usec <= 0 || list[i].realtime >= retention_limit) && + (max_use <= 0 || sum <= max_use) && + (n_max_files <= 0 || left <= n_max_files)) + break; + + r = unlinkat_deallocate(dirfd(d), list[i].filename, 0); + if (r >= 0) { + log_full(verbose ? LOG_INFO : LOG_DEBUG, "Deleted archived journal %s/%s (%s).", directory, list[i].filename, format_bytes(sbytes, sizeof(sbytes), list[i].usage)); + freed += list[i].usage; + + if (list[i].usage < sum) + sum -= list[i].usage; + else + sum = 0; + + } else if (r != -ENOENT) + log_warning_errno(r, "Failed to delete archived journal %s/%s: %m", directory, list[i].filename); + } + + if (oldest_usec && i < n_list && (*oldest_usec == 0 || list[i].realtime < *oldest_usec)) + *oldest_usec = list[i].realtime; + + r = 0; + +finish: + for (i = 0; i < n_list; i++) + free(list[i].filename); + free(list); + + log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals from %s.", format_bytes(sbytes, sizeof(sbytes), freed), directory); + + return r; +} diff --git a/src/journal/journal-vacuum.h b/src/journal/journal-vacuum.h new file mode 100644 index 0000000..d87c847 --- /dev/null +++ b/src/journal/journal-vacuum.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> +#include <stdbool.h> + +#include "time-util.h" + +int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t n_max_files, usec_t max_retention_usec, usec_t *oldest_usec, bool verbose); diff --git a/src/journal/journal-verify.c b/src/journal/journal-verify.c new file mode 100644 index 0000000..6ea2f4c --- /dev/null +++ b/src/journal/journal-verify.c @@ -0,0 +1,1327 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <stddef.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-verify.h" +#include "lookup3.h" +#include "macro.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "util.h" + +static void draw_progress(uint64_t p, usec_t *last_usec) { + unsigned n, i, j, k; + usec_t z, x; + + if (!on_tty()) + return; + + z = now(CLOCK_MONOTONIC); + x = *last_usec; + + if (x != 0 && x + 40 * USEC_PER_MSEC > z) + return; + + *last_usec = z; + + n = (3 * columns()) / 4; + j = (n * (unsigned) p) / 65535ULL; + k = n - j; + + fputs("\r", stdout); + if (colors_enabled()) + fputs("\x1B[?25l" ANSI_HIGHLIGHT_GREEN, stdout); + + for (i = 0; i < j; i++) + fputs("\xe2\x96\x88", stdout); + + fputs(ansi_normal(), stdout); + + for (i = 0; i < k; i++) + fputs("\xe2\x96\x91", stdout); + + printf(" %3"PRIu64"%%", 100U * p / 65535U); + + fputs("\r", stdout); + if (colors_enabled()) + fputs("\x1B[?25h", stdout); + + fflush(stdout); +} + +static uint64_t scale_progress(uint64_t scale, uint64_t p, uint64_t m) { + /* Calculates scale * p / m, but handles m == 0 safely, and saturates. + * Currently all callers use m >= 1, but we keep the check to be defensive. + */ + + if (p >= m || m == 0) // lgtm[cpp/constant-comparison] + return scale; + + return scale * p / m; +} + +static void flush_progress(void) { + unsigned n, i; + + if (!on_tty()) + return; + + n = (3 * columns()) / 4; + + putchar('\r'); + + for (i = 0; i < n + 5; i++) + putchar(' '); + + putchar('\r'); + fflush(stdout); +} + +#define debug(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_debug(OFSfmt": " _fmt, _offset, ##__VA_ARGS__); \ + } while (0) + +#define warning(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_warning(OFSfmt": " _fmt, _offset, ##__VA_ARGS__); \ + } while (0) + +#define error(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_error(OFSfmt": " _fmt, (uint64_t)_offset, ##__VA_ARGS__); \ + } while (0) + +#define error_errno(_offset, error, _fmt, ...) do { \ + flush_progress(); \ + log_error_errno(error, OFSfmt": " _fmt, (uint64_t)_offset, ##__VA_ARGS__); \ + } while (0) + +static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o) { + uint64_t i; + + assert(f); + assert(offset); + assert(o); + + /* This does various superficial tests about the length an + * possible field values. It does not follow any references to + * other objects. */ + + if ((o->object.flags & OBJECT_COMPRESSED_XZ) && + o->object.type != OBJECT_DATA) { + error(offset, "Found compressed object that isn't of type DATA, which is not allowed."); + return -EBADMSG; + } + + switch (o->object.type) { + + case OBJECT_DATA: { + uint64_t h1, h2; + int compression, r; + + if (le64toh(o->data.entry_offset) == 0) + warning(offset, "Unused data (entry_offset==0)"); + + if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) { + error(offset, "Bad n_entries: %"PRIu64, le64toh(o->data.n_entries)); + return -EBADMSG; + } + + if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) { + error(offset, "Bad object size (<= %zu): %"PRIu64, + offsetof(DataObject, payload), + le64toh(o->object.size)); + return -EBADMSG; + } + + h1 = le64toh(o->data.hash); + + compression = o->object.flags & OBJECT_COMPRESSION_MASK; + if (compression) { + _cleanup_free_ void *b = NULL; + size_t alloc = 0, b_size; + + r = decompress_blob(compression, + o->data.payload, + le64toh(o->object.size) - offsetof(Object, data.payload), + &b, &alloc, &b_size, 0); + if (r < 0) { + error_errno(offset, r, "%s decompression failed: %m", + object_compressed_to_string(compression)); + return r; + } + + h2 = journal_file_hash_data(f, b, b_size); + } else + h2 = journal_file_hash_data(f, o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload)); + + if (h1 != h2) { + error(offset, "Invalid hash (%08"PRIx64" vs. %08"PRIx64, h1, h2); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->data.next_hash_offset)) || + !VALID64(le64toh(o->data.next_field_offset)) || + !VALID64(le64toh(o->data.entry_offset)) || + !VALID64(le64toh(o->data.entry_array_offset))) { + error(offset, "Invalid offset (next_hash_offset="OFSfmt", next_field_offset="OFSfmt", entry_offset="OFSfmt", entry_array_offset="OFSfmt, + le64toh(o->data.next_hash_offset), + le64toh(o->data.next_field_offset), + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset)); + return -EBADMSG; + } + + break; + } + + case OBJECT_FIELD: + if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) { + error(offset, + "Bad field size (<= %zu): %"PRIu64, + offsetof(FieldObject, payload), + le64toh(o->object.size)); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->field.next_hash_offset)) || + !VALID64(le64toh(o->field.head_data_offset))) { + error(offset, + "Invalid offset (next_hash_offset="OFSfmt", head_data_offset="OFSfmt, + le64toh(o->field.next_hash_offset), + le64toh(o->field.head_data_offset)); + return -EBADMSG; + } + break; + + case OBJECT_ENTRY: + if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) { + error(offset, + "Bad entry size (<= %zu): %"PRIu64, + offsetof(EntryObject, items), + le64toh(o->object.size)); + return -EBADMSG; + } + + if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) { + error(offset, + "Invalid number items in entry: %"PRIu64, + (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem)); + return -EBADMSG; + } + + if (le64toh(o->entry.seqnum) <= 0) { + error(offset, + "Invalid entry seqnum: %"PRIx64, + le64toh(o->entry.seqnum)); + return -EBADMSG; + } + + if (!VALID_REALTIME(le64toh(o->entry.realtime))) { + error(offset, + "Invalid entry realtime timestamp: %"PRIu64, + le64toh(o->entry.realtime)); + return -EBADMSG; + } + + if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) { + error(offset, + "Invalid entry monotonic timestamp: %"PRIu64, + le64toh(o->entry.monotonic)); + return -EBADMSG; + } + + for (i = 0; i < journal_file_entry_n_items(o); i++) { + if (le64toh(o->entry.items[i].object_offset) == 0 || + !VALID64(le64toh(o->entry.items[i].object_offset))) { + error(offset, + "Invalid entry item (%"PRIu64"/%"PRIu64" offset: "OFSfmt, + i, journal_file_entry_n_items(o), + le64toh(o->entry.items[i].object_offset)); + return -EBADMSG; + } + } + + break; + + case OBJECT_DATA_HASH_TABLE: + case OBJECT_FIELD_HASH_TABLE: + if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 || + (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) { + error(offset, + "Invalid %s hash table size: %"PRIu64, + o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field", + le64toh(o->object.size)); + return -EBADMSG; + } + + for (i = 0; i < journal_file_hash_table_n_items(o); i++) { + if (o->hash_table.items[i].head_hash_offset != 0 && + !VALID64(le64toh(o->hash_table.items[i].head_hash_offset))) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64") head_hash_offset: "OFSfmt, + o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field", + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].head_hash_offset)); + return -EBADMSG; + } + if (o->hash_table.items[i].tail_hash_offset != 0 && + !VALID64(le64toh(o->hash_table.items[i].tail_hash_offset))) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64") tail_hash_offset: "OFSfmt, + o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field", + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].tail_hash_offset)); + return -EBADMSG; + } + + if ((o->hash_table.items[i].head_hash_offset != 0) != + (o->hash_table.items[i].tail_hash_offset != 0)) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64"): head_hash_offset="OFSfmt" tail_hash_offset="OFSfmt, + o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field", + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].head_hash_offset), + le64toh(o->hash_table.items[i].tail_hash_offset)); + return -EBADMSG; + } + } + + break; + + case OBJECT_ENTRY_ARRAY: + if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 || + (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) { + error(offset, + "Invalid object entry array size: %"PRIu64, + le64toh(o->object.size)); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) { + error(offset, + "Invalid object entry array next_entry_array_offset: "OFSfmt, + le64toh(o->entry_array.next_entry_array_offset)); + return -EBADMSG; + } + + for (i = 0; i < journal_file_entry_array_n_items(o); i++) + if (le64toh(o->entry_array.items[i]) != 0 && + !VALID64(le64toh(o->entry_array.items[i]))) { + error(offset, + "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt, + i, journal_file_entry_array_n_items(o), + le64toh(o->entry_array.items[i])); + return -EBADMSG; + } + + break; + + case OBJECT_TAG: + if (le64toh(o->object.size) != sizeof(TagObject)) { + error(offset, + "Invalid object tag size: %"PRIu64, + le64toh(o->object.size)); + return -EBADMSG; + } + + if (!VALID_EPOCH(le64toh(o->tag.epoch))) { + error(offset, + "Invalid object tag epoch: %"PRIu64, + le64toh(o->tag.epoch)); + return -EBADMSG; + } + + break; + } + + return 0; +} + +static int write_uint64(int fd, uint64_t p) { + ssize_t k; + + k = write(fd, &p, sizeof(p)); + if (k < 0) + return -errno; + if (k != sizeof(p)) + return -EIO; + + return 0; +} + +static int contains_uint64(MMapCache *m, MMapFileDescriptor *f, uint64_t n, uint64_t p) { + uint64_t a, b; + int r; + + assert(m); + assert(f); + + /* Bisection ... */ + + a = 0; b = n; + while (a < b) { + uint64_t c, *z; + + c = (a + b) / 2; + + r = mmap_cache_get(m, f, PROT_READ|PROT_WRITE, 0, false, c * sizeof(uint64_t), sizeof(uint64_t), NULL, (void **) &z, NULL); + if (r < 0) + return r; + + if (*z == p) + return 1; + + if (a + 1 >= b) + return 0; + + if (p < *z) + b = c; + else + a = c; + } + + return 0; +} + +static int entry_points_to_data( + JournalFile *f, + MMapFileDescriptor *cache_entry_fd, + uint64_t n_entries, + uint64_t entry_p, + uint64_t data_p) { + + int r; + uint64_t i, n, a; + Object *o; + bool found = false; + + assert(f); + assert(cache_entry_fd); + + if (!contains_uint64(f->mmap, cache_entry_fd, n_entries, entry_p)) { + error(data_p, "Data object references invalid entry at "OFSfmt, entry_p); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY, entry_p, &o); + if (r < 0) + return r; + + n = journal_file_entry_n_items(o); + for (i = 0; i < n; i++) + if (le64toh(o->entry.items[i].object_offset) == data_p) { + found = true; + break; + } + + if (!found) { + error(entry_p, "Data object at "OFSfmt" not referenced by linked entry", data_p); + return -EBADMSG; + } + + /* Check if this entry is also in main entry array. Since the + * main entry array has already been verified we can rely on + * its consistency. */ + + i = 0; + n = le64toh(f->header->n_entries); + a = le64toh(f->header->entry_array_offset); + + while (i < n) { + uint64_t m, u; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + m = journal_file_entry_array_n_items(o); + u = MIN(n - i, m); + + if (entry_p <= le64toh(o->entry_array.items[u-1])) { + uint64_t x, y, z; + + x = 0; + y = u; + + while (x < y) { + z = (x + y) / 2; + + if (le64toh(o->entry_array.items[z]) == entry_p) + return 0; + + if (x + 1 >= y) + break; + + if (entry_p < le64toh(o->entry_array.items[z])) + y = z; + else + x = z; + } + + error(entry_p, "Entry object doesn't exist in main entry array"); + return -EBADMSG; + } + + i += u; + a = le64toh(o->entry_array.next_entry_array_offset); + } + + return 0; +} + +static int verify_data( + JournalFile *f, + Object *o, uint64_t p, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays) { + + uint64_t i, n, a, last, q; + int r; + + assert(f); + assert(o); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + + n = le64toh(o->data.n_entries); + a = le64toh(o->data.entry_array_offset); + + /* Entry array means at least two objects */ + if (a && n < 2) { + error(p, "Entry array present (entry_array_offset="OFSfmt", but n_entries=%"PRIu64")", a, n); + return -EBADMSG; + } + + if (n == 0) + return 0; + + /* We already checked that earlier */ + assert(o->data.entry_offset); + + last = q = le64toh(o->data.entry_offset); + r = entry_points_to_data(f, cache_entry_fd, n_entries, q, p); + if (r < 0) + return r; + + i = 1; + while (i < n) { + uint64_t next, m, j; + + if (a == 0) { + error(p, "Array chain too short"); + return -EBADMSG; + } + + if (!contains_uint64(f->mmap, cache_entry_array_fd, n_entry_arrays, a)) { + error(p, "Invalid array offset "OFSfmt, a); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + next = le64toh(o->entry_array.next_entry_array_offset); + if (next != 0 && next <= a) { + error(p, "Array chain has cycle (jumps back from "OFSfmt" to "OFSfmt")", a, next); + return -EBADMSG; + } + + m = journal_file_entry_array_n_items(o); + for (j = 0; i < n && j < m; i++, j++) { + + q = le64toh(o->entry_array.items[j]); + if (q <= last) { + error(p, "Data object's entry array not sorted"); + return -EBADMSG; + } + last = q; + + r = entry_points_to_data(f, cache_entry_fd, n_entries, q, p); + if (r < 0) + return r; + + /* Pointer might have moved, reposition */ + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + } + + a = next; + } + + return 0; +} + +static int verify_hash_table( + JournalFile *f, + MMapFileDescriptor *cache_data_fd, uint64_t n_data, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays, + usec_t *last_usec, + bool show_progress) { + + uint64_t i, n; + int r; + + assert(f); + assert(cache_data_fd); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + assert(last_usec); + + n = le64toh(f->header->data_hash_table_size) / sizeof(HashItem); + if (n <= 0) + return 0; + + r = journal_file_map_data_hash_table(f); + if (r < 0) + return log_error_errno(r, "Failed to map data hash table: %m"); + + for (i = 0; i < n; i++) { + uint64_t last = 0, p; + + if (show_progress) + draw_progress(0xC000 + scale_progress(0x3FFF, i, n), last_usec); + + p = le64toh(f->data_hash_table[i].head_hash_offset); + while (p != 0) { + Object *o; + uint64_t next; + + if (!contains_uint64(f->mmap, cache_data_fd, n_data, p)) { + error(p, "Invalid data object at hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + next = le64toh(o->data.next_hash_offset); + if (next != 0 && next <= p) { + error(p, "Hash chain has a cycle in hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + if (le64toh(o->data.hash) % n != i) { + error(p, "Hash value mismatch in hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = verify_data(f, o, p, cache_entry_fd, n_entries, cache_entry_array_fd, n_entry_arrays); + if (r < 0) + return r; + + last = p; + p = next; + } + + if (last != le64toh(f->data_hash_table[i].tail_hash_offset)) { + error(p, "Tail hash pointer mismatch in hash table"); + return -EBADMSG; + } + } + + return 0; +} + +static int data_object_in_hash_table(JournalFile *f, uint64_t hash, uint64_t p) { + uint64_t n, h, q; + int r; + assert(f); + + n = le64toh(f->header->data_hash_table_size) / sizeof(HashItem); + if (n <= 0) + return 0; + + r = journal_file_map_data_hash_table(f); + if (r < 0) + return log_error_errno(r, "Failed to map data hash table: %m"); + + h = hash % n; + + q = le64toh(f->data_hash_table[h].head_hash_offset); + while (q != 0) { + Object *o; + + if (p == q) + return 1; + + r = journal_file_move_to_object(f, OBJECT_DATA, q, &o); + if (r < 0) + return r; + + q = le64toh(o->data.next_hash_offset); + } + + return 0; +} + +static int verify_entry( + JournalFile *f, + Object *o, uint64_t p, + MMapFileDescriptor *cache_data_fd, uint64_t n_data) { + + uint64_t i, n; + int r; + + assert(f); + assert(o); + assert(cache_data_fd); + + n = journal_file_entry_n_items(o); + for (i = 0; i < n; i++) { + uint64_t q, h; + Object *u; + + q = le64toh(o->entry.items[i].object_offset); + h = le64toh(o->entry.items[i].hash); + + if (!contains_uint64(f->mmap, cache_data_fd, n_data, q)) { + error(p, "Invalid data object of entry"); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_DATA, q, &u); + if (r < 0) + return r; + + if (le64toh(u->data.hash) != h) { + error(p, "Hash mismatch for data object of entry"); + return -EBADMSG; + } + + r = data_object_in_hash_table(f, h, q); + if (r < 0) + return r; + if (r == 0) { + error(p, "Data object missing from hash table"); + return -EBADMSG; + } + } + + return 0; +} + +static int verify_entry_array( + JournalFile *f, + MMapFileDescriptor *cache_data_fd, uint64_t n_data, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays, + usec_t *last_usec, + bool show_progress) { + + uint64_t i = 0, a, n, last = 0; + int r; + + assert(f); + assert(cache_data_fd); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + assert(last_usec); + + n = le64toh(f->header->n_entries); + a = le64toh(f->header->entry_array_offset); + while (i < n) { + uint64_t next, m, j; + Object *o; + + if (show_progress) + draw_progress(0x8000 + scale_progress(0x3FFF, i, n), last_usec); + + if (a == 0) { + error(a, "Array chain too short at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + if (!contains_uint64(f->mmap, cache_entry_array_fd, n_entry_arrays, a)) { + error(a, "Invalid array %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + next = le64toh(o->entry_array.next_entry_array_offset); + if (next != 0 && next <= a) { + error(a, "Array chain has cycle at %"PRIu64" of %"PRIu64" (jumps back from to "OFSfmt")", i, n, next); + return -EBADMSG; + } + + m = journal_file_entry_array_n_items(o); + for (j = 0; i < n && j < m; i++, j++) { + uint64_t p; + + p = le64toh(o->entry_array.items[j]); + if (p <= last) { + error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + last = p; + + if (!contains_uint64(f->mmap, cache_entry_fd, n_entries, p)) { + error(a, "Invalid array entry at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + r = verify_entry(f, o, p, cache_data_fd, n_data); + if (r < 0) + return r; + + /* Pointer might have moved, reposition */ + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + } + + a = next; + } + + return 0; +} + +int journal_file_verify( + JournalFile *f, + const char *key, + usec_t *first_contained, usec_t *last_validated, usec_t *last_contained, + bool show_progress) { + int r; + Object *o; + uint64_t p = 0, last_epoch = 0, last_tag_realtime = 0, last_sealed_realtime = 0; + + uint64_t entry_seqnum = 0, entry_monotonic = 0, entry_realtime = 0; + sd_id128_t entry_boot_id; + bool entry_seqnum_set = false, entry_monotonic_set = false, entry_realtime_set = false, found_main_entry_array = false; + uint64_t n_weird = 0, n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0, n_entry_arrays = 0, n_tags = 0; + usec_t last_usec = 0; + int data_fd = -1, entry_fd = -1, entry_array_fd = -1; + MMapFileDescriptor *cache_data_fd = NULL, *cache_entry_fd = NULL, *cache_entry_array_fd = NULL; + unsigned i; + bool found_last = false; + const char *tmp_dir = NULL; + +#if HAVE_GCRYPT + uint64_t last_tag = 0; +#endif + assert(f); + + if (key) { +#if HAVE_GCRYPT + r = journal_file_parse_verification_key(f, key); + if (r < 0) { + log_error("Failed to parse seed."); + return r; + } +#else + return -EOPNOTSUPP; +#endif + } else if (f->seal) + return -ENOKEY; + + r = var_tmp_dir(&tmp_dir); + if (r < 0) { + log_error_errno(r, "Failed to determine temporary directory: %m"); + goto fail; + } + + data_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (data_fd < 0) { + r = log_error_errno(data_fd, "Failed to create data file: %m"); + goto fail; + } + + entry_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (entry_fd < 0) { + r = log_error_errno(entry_fd, "Failed to create entry file: %m"); + goto fail; + } + + entry_array_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (entry_array_fd < 0) { + r = log_error_errno(entry_array_fd, + "Failed to create entry array file: %m"); + goto fail; + } + + cache_data_fd = mmap_cache_add_fd(f->mmap, data_fd); + if (!cache_data_fd) { + r = log_oom(); + goto fail; + } + + cache_entry_fd = mmap_cache_add_fd(f->mmap, entry_fd); + if (!cache_entry_fd) { + r = log_oom(); + goto fail; + } + + cache_entry_array_fd = mmap_cache_add_fd(f->mmap, entry_array_fd); + if (!cache_entry_array_fd) { + r = log_oom(); + goto fail; + } + + if (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SUPPORTED) { + log_error("Cannot verify file with unknown extensions."); + r = -EOPNOTSUPP; + goto fail; + } + + for (i = 0; i < sizeof(f->header->reserved); i++) + if (f->header->reserved[i] != 0) { + error(offsetof(Header, reserved[i]), "Reserved field is non-zero"); + r = -EBADMSG; + goto fail; + } + + /* First iteration: we go through all objects, verify the + * superficial structure, headers, hashes. */ + + p = le64toh(f->header->header_size); + for (;;) { + /* Early exit if there are no objects in the file, at all */ + if (le64toh(f->header->tail_object_offset) == 0) + break; + + if (show_progress) + draw_progress(scale_progress(0x7FFF, p, le64toh(f->header->tail_object_offset)), &last_usec); + + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) { + error(p, "Invalid object"); + goto fail; + } + + if (p > le64toh(f->header->tail_object_offset)) { + error(offsetof(Header, tail_object_offset), "Invalid tail object pointer"); + r = -EBADMSG; + goto fail; + } + + n_objects++; + + r = journal_file_object_verify(f, p, o); + if (r < 0) { + error_errno(p, r, "Invalid object contents: %m"); + goto fail; + } + + if (!!(o->object.flags & OBJECT_COMPRESSED_XZ) + + !!(o->object.flags & OBJECT_COMPRESSED_LZ4) + + !!(o->object.flags & OBJECT_COMPRESSED_ZSTD) > 1) { + error(p, "Object has multiple compression flags set"); + r = -EINVAL; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_XZ) && !JOURNAL_HEADER_COMPRESSED_XZ(f->header)) { + error(p, "XZ compressed object in file without XZ compression"); + r = -EBADMSG; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_LZ4) && !JOURNAL_HEADER_COMPRESSED_LZ4(f->header)) { + error(p, "LZ4 compressed object in file without LZ4 compression"); + r = -EBADMSG; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_ZSTD) && !JOURNAL_HEADER_COMPRESSED_ZSTD(f->header)) { + error(p, "ZSTD compressed object in file without ZSTD compression"); + r = -EBADMSG; + goto fail; + } + + switch (o->object.type) { + + case OBJECT_DATA: + r = write_uint64(data_fd, p); + if (r < 0) + goto fail; + + n_data++; + break; + + case OBJECT_FIELD: + n_fields++; + break; + + case OBJECT_ENTRY: + if (JOURNAL_HEADER_SEALED(f->header) && n_tags <= 0) { + error(p, "First entry before first tag"); + r = -EBADMSG; + goto fail; + } + + r = write_uint64(entry_fd, p); + if (r < 0) + goto fail; + + if (le64toh(o->entry.realtime) < last_tag_realtime) { + error(p, "Older entry after newer tag"); + r = -EBADMSG; + goto fail; + } + + if (!entry_seqnum_set && + le64toh(o->entry.seqnum) != le64toh(f->header->head_entry_seqnum)) { + error(p, "Head entry sequence number incorrect"); + r = -EBADMSG; + goto fail; + } + + if (entry_seqnum_set && + entry_seqnum >= le64toh(o->entry.seqnum)) { + error(p, "Entry sequence number out of synchronization"); + r = -EBADMSG; + goto fail; + } + + entry_seqnum = le64toh(o->entry.seqnum); + entry_seqnum_set = true; + + if (entry_monotonic_set && + sd_id128_equal(entry_boot_id, o->entry.boot_id) && + entry_monotonic > le64toh(o->entry.monotonic)) { + error(p, "Entry timestamp out of synchronization"); + r = -EBADMSG; + goto fail; + } + + entry_monotonic = le64toh(o->entry.monotonic); + entry_boot_id = o->entry.boot_id; + entry_monotonic_set = true; + + if (!entry_realtime_set && + le64toh(o->entry.realtime) != le64toh(f->header->head_entry_realtime)) { + error(p, "Head entry realtime timestamp incorrect"); + r = -EBADMSG; + goto fail; + } + + entry_realtime = le64toh(o->entry.realtime); + entry_realtime_set = true; + + n_entries++; + break; + + case OBJECT_DATA_HASH_TABLE: + if (n_data_hash_tables > 1) { + error(p, "More than one data hash table"); + r = -EBADMSG; + goto fail; + } + + if (le64toh(f->header->data_hash_table_offset) != p + offsetof(HashTableObject, items) || + le64toh(f->header->data_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) { + error(p, "header fields for data hash table invalid"); + r = -EBADMSG; + goto fail; + } + + n_data_hash_tables++; + break; + + case OBJECT_FIELD_HASH_TABLE: + if (n_field_hash_tables > 1) { + error(p, "More than one field hash table"); + r = -EBADMSG; + goto fail; + } + + if (le64toh(f->header->field_hash_table_offset) != p + offsetof(HashTableObject, items) || + le64toh(f->header->field_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) { + error(p, "Header fields for field hash table invalid"); + r = -EBADMSG; + goto fail; + } + + n_field_hash_tables++; + break; + + case OBJECT_ENTRY_ARRAY: + r = write_uint64(entry_array_fd, p); + if (r < 0) + goto fail; + + if (p == le64toh(f->header->entry_array_offset)) { + if (found_main_entry_array) { + error(p, "More than one main entry array"); + r = -EBADMSG; + goto fail; + } + + found_main_entry_array = true; + } + + n_entry_arrays++; + break; + + case OBJECT_TAG: + if (!JOURNAL_HEADER_SEALED(f->header)) { + error(p, "Tag object in file without sealing"); + r = -EBADMSG; + goto fail; + } + + if (le64toh(o->tag.seqnum) != n_tags + 1) { + error(p, "Tag sequence number out of synchronization"); + r = -EBADMSG; + goto fail; + } + + if (le64toh(o->tag.epoch) < last_epoch) { + error(p, "Epoch sequence out of synchronization"); + r = -EBADMSG; + goto fail; + } + +#if HAVE_GCRYPT + if (f->seal) { + uint64_t q, rt; + + debug(p, "Checking tag %"PRIu64"...", le64toh(o->tag.seqnum)); + + rt = f->fss_start_usec + le64toh(o->tag.epoch) * f->fss_interval_usec; + if (entry_realtime_set && entry_realtime >= rt + f->fss_interval_usec) { + error(p, "tag/entry realtime timestamp out of synchronization"); + r = -EBADMSG; + goto fail; + } + + /* OK, now we know the epoch. So let's now set + * it, and calculate the HMAC for everything + * since the last tag. */ + r = journal_file_fsprg_seek(f, le64toh(o->tag.epoch)); + if (r < 0) + goto fail; + + r = journal_file_hmac_start(f); + if (r < 0) + goto fail; + + if (last_tag == 0) { + r = journal_file_hmac_put_header(f); + if (r < 0) + goto fail; + + q = le64toh(f->header->header_size); + } else + q = last_tag; + + while (q <= p) { + r = journal_file_move_to_object(f, OBJECT_UNUSED, q, &o); + if (r < 0) + goto fail; + + r = journal_file_hmac_put_object(f, OBJECT_UNUSED, o, q); + if (r < 0) + goto fail; + + q = q + ALIGN64(le64toh(o->object.size)); + } + + /* Position might have changed, let's reposition things */ + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) + goto fail; + + if (memcmp(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH) != 0) { + error(p, "Tag failed verification"); + r = -EBADMSG; + goto fail; + } + + f->hmac_running = false; + last_tag_realtime = rt; + last_sealed_realtime = entry_realtime; + } + + last_tag = p + ALIGN64(le64toh(o->object.size)); +#endif + + last_epoch = le64toh(o->tag.epoch); + + n_tags++; + break; + + default: + n_weird++; + } + + if (p == le64toh(f->header->tail_object_offset)) { + found_last = true; + break; + } + + p = p + ALIGN64(le64toh(o->object.size)); + }; + + if (!found_last && le64toh(f->header->tail_object_offset) != 0) { + error(le64toh(f->header->tail_object_offset), "Tail object pointer dead"); + r = -EBADMSG; + goto fail; + } + + if (n_objects != le64toh(f->header->n_objects)) { + error(offsetof(Header, n_objects), "Object number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (n_entries != le64toh(f->header->n_entries)) { + error(offsetof(Header, n_entries), "Entry number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && + n_data != le64toh(f->header->n_data)) { + error(offsetof(Header, n_data), "Data number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) && + n_fields != le64toh(f->header->n_fields)) { + error(offsetof(Header, n_fields), "Field number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) && + n_tags != le64toh(f->header->n_tags)) { + error(offsetof(Header, n_tags), "Tag number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays) && + n_entry_arrays != le64toh(f->header->n_entry_arrays)) { + error(offsetof(Header, n_entry_arrays), "Entry array number mismatch"); + r = -EBADMSG; + goto fail; + } + + if (!found_main_entry_array && le64toh(f->header->entry_array_offset) != 0) { + error(0, "Missing entry array"); + r = -EBADMSG; + goto fail; + } + + if (entry_seqnum_set && + entry_seqnum != le64toh(f->header->tail_entry_seqnum)) { + error(offsetof(Header, tail_entry_seqnum), "Invalid tail seqnum"); + r = -EBADMSG; + goto fail; + } + + if (entry_monotonic_set && + (sd_id128_equal(entry_boot_id, f->header->boot_id) && + entry_monotonic != le64toh(f->header->tail_entry_monotonic))) { + error(0, "Invalid tail monotonic timestamp"); + r = -EBADMSG; + goto fail; + } + + if (entry_realtime_set && entry_realtime != le64toh(f->header->tail_entry_realtime)) { + error(0, "Invalid tail realtime timestamp"); + r = -EBADMSG; + goto fail; + } + + /* Second iteration: we follow all objects referenced from the + * two entry points: the object hash table and the entry + * array. We also check that everything referenced (directly + * or indirectly) in the data hash table also exists in the + * entry array, and vice versa. Note that we do not care for + * unreferenced objects. We only care that everything that is + * referenced is consistent. */ + + r = verify_entry_array(f, + cache_data_fd, n_data, + cache_entry_fd, n_entries, + cache_entry_array_fd, n_entry_arrays, + &last_usec, + show_progress); + if (r < 0) + goto fail; + + r = verify_hash_table(f, + cache_data_fd, n_data, + cache_entry_fd, n_entries, + cache_entry_array_fd, n_entry_arrays, + &last_usec, + show_progress); + if (r < 0) + goto fail; + + if (show_progress) + flush_progress(); + + mmap_cache_free_fd(f->mmap, cache_data_fd); + mmap_cache_free_fd(f->mmap, cache_entry_fd); + mmap_cache_free_fd(f->mmap, cache_entry_array_fd); + + safe_close(data_fd); + safe_close(entry_fd); + safe_close(entry_array_fd); + + if (first_contained) + *first_contained = le64toh(f->header->head_entry_realtime); + if (last_validated) + *last_validated = last_sealed_realtime; + if (last_contained) + *last_contained = le64toh(f->header->tail_entry_realtime); + + return 0; + +fail: + if (show_progress) + flush_progress(); + + log_error("File corruption detected at %s:"OFSfmt" (of %llu bytes, %"PRIu64"%%).", + f->path, + p, + (unsigned long long) f->last_stat.st_size, + 100 * p / f->last_stat.st_size); + + if (data_fd >= 0) + safe_close(data_fd); + + if (entry_fd >= 0) + safe_close(entry_fd); + + if (entry_array_fd >= 0) + safe_close(entry_array_fd); + + if (cache_data_fd) + mmap_cache_free_fd(f->mmap, cache_data_fd); + + if (cache_entry_fd) + mmap_cache_free_fd(f->mmap, cache_entry_fd); + + if (cache_entry_array_fd) + mmap_cache_free_fd(f->mmap, cache_entry_array_fd); + + return r; +} diff --git a/src/journal/journal-verify.h b/src/journal/journal-verify.h new file mode 100644 index 0000000..5790330 --- /dev/null +++ b/src/journal/journal-verify.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journal-file.h" + +int journal_file_verify(JournalFile *f, const char *key, usec_t *first_contained, usec_t *last_validated, usec_t *last_contained, bool show_progress); diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c new file mode 100644 index 0000000..bcf2e01 --- /dev/null +++ b/src/journal/journalctl.c @@ -0,0 +1,2830 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <fnmatch.h> +#include <getopt.h> +#include <linux/fs.h> +#include <poll.h> +#include <signal.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/inotify.h> +#include <sys/stat.h> +#include <unistd.h> + +#if HAVE_PCRE2 +# define PCRE2_CODE_UNIT_WIDTH 8 +# include <pcre2.h> +#endif + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-journal.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "catalog.h" +#include "chattr-util.h" +#include "def.h" +#include "device-private.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "fsprg.h" +#include "glob-util.h" +#include "hostname-util.h" +#include "id128-print.h" +#include "io-util.h" +#include "journal-def.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "journal-vacuum.h" +#include "journal-verify.h" +#include "locale-util.h" +#include "log.h" +#include "logs-show.h" +#include "memory-util.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "pager.h" +#include "parse-util.h" +#include "path-util.h" +#include "pcre2-dlopen.h" +#include "pretty-print.h" +#include "qrcode-util.h" +#include "random-util.h" +#include "rlimit-util.h" +#include "set.h" +#include "sigbus.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "varlink.h" + +#define DEFAULT_FSS_INTERVAL_USEC (15*USEC_PER_MINUTE) +#define PROCESS_INOTIFY_INTERVAL 1024 /* Every 1,024 messages processed */ + +enum { + /* Special values for arg_lines */ + ARG_LINES_DEFAULT = -2, + ARG_LINES_ALL = -1, +}; + +static OutputMode arg_output = OUTPUT_SHORT; +static bool arg_utc = false; +static bool arg_follow = false; +static bool arg_full = true; +static bool arg_all = false; +static PagerFlags arg_pager_flags = 0; +static int arg_lines = ARG_LINES_DEFAULT; +static bool arg_no_tail = false; +static bool arg_quiet = false; +static bool arg_merge = false; +static bool arg_boot = false; +static sd_id128_t arg_boot_id = {}; +static int arg_boot_offset = 0; +static bool arg_dmesg = false; +static bool arg_no_hostname = false; +static const char *arg_cursor = NULL; +static const char *arg_cursor_file = NULL; +static const char *arg_after_cursor = NULL; +static bool arg_show_cursor = false; +static const char *arg_directory = NULL; +static char **arg_file = NULL; +static bool arg_file_stdin = false; +static int arg_priorities = 0xFF; +static Set *arg_facilities = NULL; +static char *arg_verify_key = NULL; +#if HAVE_GCRYPT +static usec_t arg_interval = DEFAULT_FSS_INTERVAL_USEC; +static bool arg_force = false; +#endif +static usec_t arg_since, arg_until; +static bool arg_since_set = false, arg_until_set = false; +static char **arg_syslog_identifier = NULL; +static char **arg_system_units = NULL; +static char **arg_user_units = NULL; +static const char *arg_field = NULL; +static bool arg_catalog = false; +static bool arg_reverse = false; +static int arg_journal_type = 0; +static int arg_namespace_flags = 0; +static char *arg_root = NULL; +static char *arg_image = NULL; +static const char *arg_machine = NULL; +static const char *arg_namespace = NULL; +static uint64_t arg_vacuum_size = 0; +static uint64_t arg_vacuum_n_files = 0; +static usec_t arg_vacuum_time = 0; +static char **arg_output_fields = NULL; +#if HAVE_PCRE2 +static const char *arg_pattern = NULL; +static pcre2_code *arg_compiled_pattern = NULL; +static int arg_case_sensitive = -1; /* -1 means be smart */ +#endif + +static enum { + ACTION_SHOW, + ACTION_NEW_ID128, + ACTION_PRINT_HEADER, + ACTION_SETUP_KEYS, + ACTION_VERIFY, + ACTION_DISK_USAGE, + ACTION_LIST_CATALOG, + ACTION_DUMP_CATALOG, + ACTION_UPDATE_CATALOG, + ACTION_LIST_BOOTS, + ACTION_FLUSH, + ACTION_RELINQUISH_VAR, + ACTION_SYNC, + ACTION_ROTATE, + ACTION_VACUUM, + ACTION_ROTATE_AND_VACUUM, + ACTION_LIST_FIELDS, + ACTION_LIST_FIELD_NAMES, +} arg_action = ACTION_SHOW; + +typedef struct BootId { + sd_id128_t id; + uint64_t first; + uint64_t last; + LIST_FIELDS(struct BootId, boot_list); +} BootId; + +#if HAVE_PCRE2 +DEFINE_TRIVIAL_CLEANUP_FUNC(pcre2_match_data*, sym_pcre2_match_data_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(pcre2_code*, sym_pcre2_code_free); + +static int pattern_compile(const char *pattern, unsigned flags, pcre2_code **out) { + int errorcode, r; + PCRE2_SIZE erroroffset; + pcre2_code *p; + + p = sym_pcre2_compile((PCRE2_SPTR8) pattern, + PCRE2_ZERO_TERMINATED, flags, &errorcode, &erroroffset, NULL); + if (!p) { + unsigned char buf[LINE_MAX]; + + r = sym_pcre2_get_error_message(errorcode, buf, sizeof buf); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Bad pattern \"%s\": %s", pattern, + r < 0 ? "unknown error" : (char *)buf); + } + + *out = p; + return 0; +} +#endif + +static int add_matches_for_device(sd_journal *j, const char *devpath) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + sd_device *d = NULL; + struct stat st; + int r; + + assert(j); + assert(devpath); + + if (!path_startswith(devpath, "/dev/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Devpath does not start with /dev/"); + + if (stat(devpath, &st) < 0) + return log_error_errno(errno, "Couldn't stat file: %m"); + + r = device_new_from_stat_rdev(&device, &st); + if (r < 0) + return log_error_errno(r, "Failed to get device from devnum %u:%u: %m", major(st.st_rdev), minor(st.st_rdev)); + + for (d = device; d; ) { + _cleanup_free_ char *match = NULL; + const char *subsys, *sysname, *devnode; + sd_device *parent; + + r = sd_device_get_subsystem(d, &subsys); + if (r < 0) + goto get_parent; + + r = sd_device_get_sysname(d, &sysname); + if (r < 0) + goto get_parent; + + match = strjoin("_KERNEL_DEVICE=+", subsys, ":", sysname); + if (!match) + return log_oom(); + + r = sd_journal_add_match(j, match, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + if (sd_device_get_devname(d, &devnode) >= 0) { + _cleanup_free_ char *match1 = NULL; + + r = stat(devnode, &st); + if (r < 0) + return log_error_errno(r, "Failed to stat() device node \"%s\": %m", devnode); + + r = asprintf(&match1, "_KERNEL_DEVICE=%c%u:%u", S_ISBLK(st.st_mode) ? 'b' : 'c', major(st.st_rdev), minor(st.st_rdev)); + if (r < 0) + return log_oom(); + + r = sd_journal_add_match(j, match1, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + +get_parent: + if (sd_device_get_parent(d, &parent) < 0) + break; + + d = parent; + } + + r = add_match_this_boot(j, arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to add match for the current boot: %m"); + + return 0; +} + +static char *format_timestamp_maybe_utc(char *buf, size_t l, usec_t t) { + + if (arg_utc) + return format_timestamp_style(buf, l, t, TIMESTAMP_UTC); + + return format_timestamp(buf, l, t); +} + +static int parse_boot_descriptor(const char *x, sd_id128_t *boot_id, int *offset) { + sd_id128_t id = SD_ID128_NULL; + int off = 0, r; + + if (streq(x, "all")) { + *boot_id = SD_ID128_NULL; + *offset = 0; + return 0; + } else if (strlen(x) >= 32) { + char *t; + + t = strndupa(x, 32); + r = sd_id128_from_string(t, &id); + if (r >= 0) + x += 32; + + if (!IN_SET(*x, 0, '-', '+')) + return -EINVAL; + + if (*x != 0) { + r = safe_atoi(x, &off); + if (r < 0) + return r; + } + } else { + r = safe_atoi(x, &off); + if (r < 0) + return r; + } + + if (boot_id) + *boot_id = id; + + if (offset) + *offset = off; + + return 1; +} + +static int help_facilities(void) { + if (!arg_quiet) + puts("Available facilities:"); + + for (int i = 0; i < LOG_NFACILITIES; i++) { + _cleanup_free_ char *t = NULL; + + if (log_facility_unshifted_to_string_alloc(i, &t)) + return log_oom(); + puts(t); + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + (void) pager_open(arg_pager_flags); + + r = terminal_urlify_man("journalctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] [MATCHES...]\n\n" + "%5$sQuery the journal.%6$s\n\n" + "%3$sOptions:%4$s\n" + " --system Show the system journal\n" + " --user Show the user journal for the current user\n" + " -M --machine=CONTAINER Operate on local container\n" + " -S --since=DATE Show entries not older than the specified date\n" + " -U --until=DATE Show entries not newer than the specified date\n" + " -c --cursor=CURSOR Show entries starting at the specified cursor\n" + " --after-cursor=CURSOR Show entries after the specified cursor\n" + " --show-cursor Print the cursor after all the entries\n" + " --cursor-file=FILE Show entries after cursor in FILE and update FILE\n" + " -b --boot[=ID] Show current boot or the specified boot\n" + " --list-boots Show terse information about recorded boots\n" + " -k --dmesg Show kernel message log from the current boot\n" + " -u --unit=UNIT Show logs from the specified unit\n" + " --user-unit=UNIT Show logs from the specified user unit\n" + " -t --identifier=STRING Show entries with the specified syslog identifier\n" + " -p --priority=RANGE Show entries with the specified priority\n" + " --facility=FACILITY... Show entries with the specified facilities\n" + " -g --grep=PATTERN Show entries with MESSAGE matching PATTERN\n" + " --case-sensitive[=BOOL] Force case sensitive or insensitive matching\n" + " -e --pager-end Immediately jump to the end in the pager\n" + " -f --follow Follow the journal\n" + " -n --lines[=INTEGER] Number of journal entries to show\n" + " --no-tail Show all lines, even in follow mode\n" + " -r --reverse Show the newest entries first\n" + " -o --output=STRING Change journal output mode (short, short-precise,\n" + " short-iso, short-iso-precise, short-full,\n" + " short-monotonic, short-unix, verbose, export,\n" + " json, json-pretty, json-sse, json-seq, cat,\n" + " with-unit)\n" + " --output-fields=LIST Select fields to print in verbose/export/json modes\n" + " --utc Express time in Coordinated Universal Time (UTC)\n" + " -x --catalog Add message explanations where available\n" + " --no-full Ellipsize fields\n" + " -a --all Show all fields, including long and unprintable\n" + " -q --quiet Do not show info messages and privilege warning\n" + " --no-pager Do not pipe output into a pager\n" + " --no-hostname Suppress output of hostname field\n" + " -m --merge Show entries from all available journals\n" + " -D --directory=PATH Show journal files from directory\n" + " --file=PATH Show journal file\n" + " --root=ROOT Operate on files below a root directory\n" + " --image=IMAGE Operate on files in filesystem image\n" + " --namespace=NAMESPACE Show journal data from specified namespace\n" + " --interval=TIME Time interval for changing the FSS sealing key\n" + " --verify-key=KEY Specify FSS verification key\n" + " --force Override of the FSS key pair with --setup-keys\n" + "\n%3$sCommands:%4$s\n" + " -h --help Show this help text\n" + " --version Show package version\n" + " -N --fields List all field names currently used\n" + " -F --field=FIELD List all values that a specified field takes\n" + " --disk-usage Show total disk usage of all journal files\n" + " --vacuum-size=BYTES Reduce disk usage below specified size\n" + " --vacuum-files=INT Leave only the specified number of journal files\n" + " --vacuum-time=TIME Remove journal files older than specified time\n" + " --verify Verify journal file consistency\n" + " --sync Synchronize unwritten journal messages to disk\n" + " --relinquish-var Stop logging to disk, log to temporary file system\n" + " --smart-relinquish-var Similar, but NOP if log directory is on root mount\n" + " --flush Flush all journal data from /run into /var\n" + " --rotate Request immediate rotation of the journal files\n" + " --header Show journal header information\n" + " --list-catalog Show all message IDs in the catalog\n" + " --dump-catalog Show entries in the message catalog\n" + " --update-catalog Update the message catalog database\n" + " --setup-keys Generate a new FSS key pair\n" + "\nSee the %2$s for details.\n" + , program_invocation_short_name + , link + , ansi_underline(), ansi_normal() + , ansi_highlight(), ansi_normal() + ); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_FULL, + ARG_NO_TAIL, + ARG_NEW_ID128, + ARG_THIS_BOOT, + ARG_LIST_BOOTS, + ARG_USER, + ARG_SYSTEM, + ARG_ROOT, + ARG_IMAGE, + ARG_HEADER, + ARG_FACILITY, + ARG_SETUP_KEYS, + ARG_FILE, + ARG_INTERVAL, + ARG_VERIFY, + ARG_VERIFY_KEY, + ARG_DISK_USAGE, + ARG_AFTER_CURSOR, + ARG_CURSOR_FILE, + ARG_SHOW_CURSOR, + ARG_USER_UNIT, + ARG_LIST_CATALOG, + ARG_DUMP_CATALOG, + ARG_UPDATE_CATALOG, + ARG_FORCE, + ARG_CASE_SENSITIVE, + ARG_UTC, + ARG_SYNC, + ARG_FLUSH, + ARG_RELINQUISH_VAR, + ARG_SMART_RELINQUISH_VAR, + ARG_ROTATE, + ARG_VACUUM_SIZE, + ARG_VACUUM_FILES, + ARG_VACUUM_TIME, + ARG_NO_HOSTNAME, + ARG_OUTPUT_FIELDS, + ARG_NAMESPACE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version" , no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "pager-end", no_argument, NULL, 'e' }, + { "follow", no_argument, NULL, 'f' }, + { "force", no_argument, NULL, ARG_FORCE }, + { "output", required_argument, NULL, 'o' }, + { "all", no_argument, NULL, 'a' }, + { "full", no_argument, NULL, 'l' }, + { "no-full", no_argument, NULL, ARG_NO_FULL }, + { "lines", optional_argument, NULL, 'n' }, + { "no-tail", no_argument, NULL, ARG_NO_TAIL }, + { "new-id128", no_argument, NULL, ARG_NEW_ID128 }, /* deprecated */ + { "quiet", no_argument, NULL, 'q' }, + { "merge", no_argument, NULL, 'm' }, + { "this-boot", no_argument, NULL, ARG_THIS_BOOT }, /* deprecated */ + { "boot", optional_argument, NULL, 'b' }, + { "list-boots", no_argument, NULL, ARG_LIST_BOOTS }, + { "dmesg", no_argument, NULL, 'k' }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "directory", required_argument, NULL, 'D' }, + { "file", required_argument, NULL, ARG_FILE }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "header", no_argument, NULL, ARG_HEADER }, + { "identifier", required_argument, NULL, 't' }, + { "priority", required_argument, NULL, 'p' }, + { "facility", required_argument, NULL, ARG_FACILITY }, + { "grep", required_argument, NULL, 'g' }, + { "case-sensitive", optional_argument, NULL, ARG_CASE_SENSITIVE }, + { "setup-keys", no_argument, NULL, ARG_SETUP_KEYS }, + { "interval", required_argument, NULL, ARG_INTERVAL }, + { "verify", no_argument, NULL, ARG_VERIFY }, + { "verify-key", required_argument, NULL, ARG_VERIFY_KEY }, + { "disk-usage", no_argument, NULL, ARG_DISK_USAGE }, + { "cursor", required_argument, NULL, 'c' }, + { "cursor-file", required_argument, NULL, ARG_CURSOR_FILE }, + { "after-cursor", required_argument, NULL, ARG_AFTER_CURSOR }, + { "show-cursor", no_argument, NULL, ARG_SHOW_CURSOR }, + { "since", required_argument, NULL, 'S' }, + { "until", required_argument, NULL, 'U' }, + { "unit", required_argument, NULL, 'u' }, + { "user-unit", required_argument, NULL, ARG_USER_UNIT }, + { "field", required_argument, NULL, 'F' }, + { "fields", no_argument, NULL, 'N' }, + { "catalog", no_argument, NULL, 'x' }, + { "list-catalog", no_argument, NULL, ARG_LIST_CATALOG }, + { "dump-catalog", no_argument, NULL, ARG_DUMP_CATALOG }, + { "update-catalog", no_argument, NULL, ARG_UPDATE_CATALOG }, + { "reverse", no_argument, NULL, 'r' }, + { "machine", required_argument, NULL, 'M' }, + { "utc", no_argument, NULL, ARG_UTC }, + { "flush", no_argument, NULL, ARG_FLUSH }, + { "relinquish-var", no_argument, NULL, ARG_RELINQUISH_VAR }, + { "smart-relinquish-var", no_argument, NULL, ARG_SMART_RELINQUISH_VAR }, + { "sync", no_argument, NULL, ARG_SYNC }, + { "rotate", no_argument, NULL, ARG_ROTATE }, + { "vacuum-size", required_argument, NULL, ARG_VACUUM_SIZE }, + { "vacuum-files", required_argument, NULL, ARG_VACUUM_FILES }, + { "vacuum-time", required_argument, NULL, ARG_VACUUM_TIME }, + { "no-hostname", no_argument, NULL, ARG_NO_HOSTNAME }, + { "output-fields", required_argument, NULL, ARG_OUTPUT_FIELDS }, + { "namespace", required_argument, NULL, ARG_NAMESPACE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hefo:aln::qmb::kD:p:g:c:S:U:t:u:NF:xrM:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case 'e': + arg_pager_flags |= PAGER_JUMP_TO_END; + + if (arg_lines == ARG_LINES_DEFAULT) + arg_lines = 1000; + + break; + + case 'f': + arg_follow = true; + break; + + case 'o': + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(output_mode, OutputMode, _OUTPUT_MODE_MAX); + return 0; + } + + arg_output = output_mode_from_string(optarg); + if (arg_output < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown output format '%s'.", optarg); + + if (IN_SET(arg_output, OUTPUT_EXPORT, OUTPUT_JSON, OUTPUT_JSON_PRETTY, OUTPUT_JSON_SSE, OUTPUT_JSON_SEQ, OUTPUT_CAT)) + arg_quiet = true; + + break; + + case 'l': + arg_full = true; + break; + + case ARG_NO_FULL: + arg_full = false; + break; + + case 'a': + arg_all = true; + break; + + case 'n': + if (optarg) { + if (streq(optarg, "all")) + arg_lines = ARG_LINES_ALL; + else { + r = safe_atoi(optarg, &arg_lines); + if (r < 0 || arg_lines < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse lines '%s'", optarg); + } + } else { + arg_lines = 10; + + /* Hmm, no argument? Maybe the next + * word on the command line is + * supposed to be the argument? Let's + * see if there is one, and is + * parsable. */ + if (optind < argc) { + int n; + if (streq(argv[optind], "all")) { + arg_lines = ARG_LINES_ALL; + optind++; + } else if (safe_atoi(argv[optind], &n) >= 0 && n >= 0) { + arg_lines = n; + optind++; + } + } + } + + break; + + case ARG_NO_TAIL: + arg_no_tail = true; + break; + + case ARG_NEW_ID128: + arg_action = ACTION_NEW_ID128; + break; + + case 'q': + arg_quiet = true; + break; + + case 'm': + arg_merge = true; + break; + + case ARG_THIS_BOOT: + arg_boot = true; + arg_boot_id = SD_ID128_NULL; + arg_boot_offset = 0; + break; + + case 'b': + arg_boot = true; + arg_boot_id = SD_ID128_NULL; + arg_boot_offset = 0; + + if (optarg) { + r = parse_boot_descriptor(optarg, &arg_boot_id, &arg_boot_offset); + if (r < 0) + return log_error_errno(r, "Failed to parse boot descriptor '%s'", optarg); + + arg_boot = r; + + /* Hmm, no argument? Maybe the next + * word on the command line is + * supposed to be the argument? Let's + * see if there is one and is parsable + * as a boot descriptor... */ + } else if (optind < argc) { + r = parse_boot_descriptor(argv[optind], &arg_boot_id, &arg_boot_offset); + if (r >= 0) { + arg_boot = r; + optind++; + } + } + break; + + case ARG_LIST_BOOTS: + arg_action = ACTION_LIST_BOOTS; + break; + + case 'k': + arg_boot = arg_dmesg = true; + break; + + case ARG_SYSTEM: + arg_journal_type |= SD_JOURNAL_SYSTEM; + break; + + case ARG_USER: + arg_journal_type |= SD_JOURNAL_CURRENT_USER; + break; + + case 'M': + arg_machine = optarg; + break; + + case ARG_NAMESPACE: + if (streq(optarg, "*")) { + arg_namespace_flags = SD_JOURNAL_ALL_NAMESPACES; + arg_namespace = NULL; + } else if (startswith(optarg, "+")) { + arg_namespace_flags = SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE; + arg_namespace = optarg + 1; + } else if (isempty(optarg)) { + arg_namespace_flags = 0; + arg_namespace = NULL; + } else { + arg_namespace_flags = 0; + arg_namespace = optarg; + } + + break; + + case 'D': + arg_directory = optarg; + break; + + case ARG_FILE: + if (streq(optarg, "-")) + /* An undocumented feature: we can read journal files from STDIN. We don't document + * this though, since after all we only support this for mmap-able, seekable files, and + * not for example pipes which are probably the primary usecase for reading things from + * STDIN. To avoid confusion we hence don't document this feature. */ + arg_file_stdin = true; + else { + r = glob_extend(&arg_file, optarg, GLOB_NOCHECK); + if (r < 0) + return log_error_errno(r, "Failed to add paths: %m"); + } + break; + + case ARG_ROOT: + r = parse_path_argument_and_warn(optarg, /* suppress_root= */ true, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument_and_warn(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case 'c': + arg_cursor = optarg; + break; + + case ARG_CURSOR_FILE: + arg_cursor_file = optarg; + break; + + case ARG_AFTER_CURSOR: + arg_after_cursor = optarg; + break; + + case ARG_SHOW_CURSOR: + arg_show_cursor = true; + break; + + case ARG_HEADER: + arg_action = ACTION_PRINT_HEADER; + break; + + case ARG_VERIFY: + arg_action = ACTION_VERIFY; + break; + + case ARG_DISK_USAGE: + arg_action = ACTION_DISK_USAGE; + break; + + case ARG_VACUUM_SIZE: + r = parse_size(optarg, 1024, &arg_vacuum_size); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum size: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + + case ARG_VACUUM_FILES: + r = safe_atou64(optarg, &arg_vacuum_n_files); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum files: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + + case ARG_VACUUM_TIME: + r = parse_sec(optarg, &arg_vacuum_time); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum time: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + +#if HAVE_GCRYPT + case ARG_FORCE: + arg_force = true; + break; + + case ARG_SETUP_KEYS: + arg_action = ACTION_SETUP_KEYS; + break; + + case ARG_VERIFY_KEY: + r = free_and_strdup(&arg_verify_key, optarg); + if (r < 0) + return r; + /* Use memset not explicit_bzero() or similar so this doesn't look confusing + * in ps or htop output. */ + memset(optarg, 'x', strlen(optarg)); + + arg_action = ACTION_VERIFY; + arg_merge = false; + break; + + case ARG_INTERVAL: + r = parse_sec(optarg, &arg_interval); + if (r < 0 || arg_interval <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse sealing key change interval: %s", optarg); + break; +#else + case ARG_SETUP_KEYS: + case ARG_VERIFY_KEY: + case ARG_INTERVAL: + case ARG_FORCE: + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Compiled without forward-secure sealing support."); +#endif + + case 'p': { + const char *dots; + + dots = strstr(optarg, ".."); + if (dots) { + _cleanup_free_ char *a = NULL; + int from, to, i; + + /* a range */ + a = strndup(optarg, dots - optarg); + if (!a) + return log_oom(); + + from = log_level_from_string(a); + to = log_level_from_string(dots + 2); + + if (from < 0 || to < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse log level range %s", optarg); + + arg_priorities = 0; + + if (from < to) { + for (i = from; i <= to; i++) + arg_priorities |= 1 << i; + } else { + for (i = to; i <= from; i++) + arg_priorities |= 1 << i; + } + + } else { + int p, i; + + p = log_level_from_string(optarg); + if (p < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown log level %s", optarg); + + arg_priorities = 0; + + for (i = 0; i <= p; i++) + arg_priorities |= 1 << i; + } + + break; + } + + case ARG_FACILITY: { + const char *p; + + for (p = optarg;;) { + _cleanup_free_ char *fac = NULL; + int num; + + r = extract_first_word(&p, &fac, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse facilities: %s", optarg); + if (r == 0) + break; + + if (streq(fac, "help")) { + help_facilities(); + return 0; + } + + num = log_facility_unshifted_from_string(fac); + if (num < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Bad --facility= argument \"%s\".", fac); + + if (set_ensure_put(&arg_facilities, NULL, INT_TO_PTR(num)) < 0) + return log_oom(); + } + + break; + } + +#if HAVE_PCRE2 + case 'g': + arg_pattern = optarg; + break; + + case ARG_CASE_SENSITIVE: + if (optarg) { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Bad --case-sensitive= argument \"%s\": %m", optarg); + arg_case_sensitive = r; + } else + arg_case_sensitive = true; + + break; +#else + case 'g': + case ARG_CASE_SENSITIVE: + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Compiled without pattern matching support"); +#endif + + case 'S': + r = parse_timestamp(optarg, &arg_since); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse timestamp: %s", optarg); + arg_since_set = true; + break; + + case 'U': + r = parse_timestamp(optarg, &arg_until); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse timestamp: %s", optarg); + arg_until_set = true; + break; + + case 't': + r = strv_extend(&arg_syslog_identifier, optarg); + if (r < 0) + return log_oom(); + break; + + case 'u': + r = strv_extend(&arg_system_units, optarg); + if (r < 0) + return log_oom(); + break; + + case ARG_USER_UNIT: + r = strv_extend(&arg_user_units, optarg); + if (r < 0) + return log_oom(); + break; + + case 'F': + arg_action = ACTION_LIST_FIELDS; + arg_field = optarg; + break; + + case 'N': + arg_action = ACTION_LIST_FIELD_NAMES; + break; + + case ARG_NO_HOSTNAME: + arg_no_hostname = true; + break; + + case 'x': + arg_catalog = true; + break; + + case ARG_LIST_CATALOG: + arg_action = ACTION_LIST_CATALOG; + break; + + case ARG_DUMP_CATALOG: + arg_action = ACTION_DUMP_CATALOG; + break; + + case ARG_UPDATE_CATALOG: + arg_action = ACTION_UPDATE_CATALOG; + break; + + case 'r': + arg_reverse = true; + break; + + case ARG_UTC: + arg_utc = true; + break; + + case ARG_FLUSH: + arg_action = ACTION_FLUSH; + break; + + case ARG_SMART_RELINQUISH_VAR: { + int root_mnt_id, log_mnt_id; + + /* Try to be smart about relinquishing access to /var/log/journal/ during shutdown: + * if it's on the same mount as the root file system there's no point in + * relinquishing access and we can leave journald write to it until the very last + * moment. */ + + r = path_get_mnt_id("/", &root_mnt_id); + if (r < 0) + log_debug_errno(r, "Failed to get root mount ID, ignoring: %m"); + else { + r = path_get_mnt_id("/var/log/journal/", &log_mnt_id); + if (r < 0) + log_debug_errno(r, "Failed to get journal directory mount ID, ignoring: %m"); + else if (root_mnt_id == log_mnt_id) { + log_debug("/var/log/journal/ is on root file system, not relinquishing access to /var."); + return 0; + } else + log_debug("/var/log/journal/ is not on the root file system, relinquishing access to it."); + } + + _fallthrough_; + } + + case ARG_RELINQUISH_VAR: + arg_action = ACTION_RELINQUISH_VAR; + break; + + case ARG_ROTATE: + arg_action = arg_action == ACTION_VACUUM ? ACTION_ROTATE_AND_VACUUM : ACTION_ROTATE; + break; + + case ARG_SYNC: + arg_action = ACTION_SYNC; + break; + + case ARG_OUTPUT_FIELDS: { + _cleanup_strv_free_ char **v = NULL; + + v = strv_split(optarg, ","); + if (!v) + return log_oom(); + + if (!arg_output_fields) + arg_output_fields = TAKE_PTR(v); + else { + r = strv_extend_strv(&arg_output_fields, v, true); + if (r < 0) + return log_oom(); + } + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached("Unhandled option"); + } + + if (arg_follow && !arg_no_tail && !arg_since && arg_lines == ARG_LINES_DEFAULT) + arg_lines = 10; + + if (!!arg_directory + !!arg_file + !!arg_machine + !!arg_root + !!arg_image > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify at most one of -D/--directory=, --file=, -M/--machine=, --root=, --image=."); + + if (arg_since_set && arg_until_set && arg_since > arg_until) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--since= must be before --until=."); + + if (!!arg_cursor + !!arg_after_cursor + !!arg_since_set > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify only one of --since=, --cursor=, and --after-cursor."); + + if (arg_follow && arg_reverse) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify either --reverse= or --follow=, not both."); + + if (!IN_SET(arg_action, ACTION_SHOW, ACTION_DUMP_CATALOG, ACTION_LIST_CATALOG) && optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Extraneous arguments starting with '%s'", + argv[optind]); + + if ((arg_boot || arg_action == ACTION_LIST_BOOTS) && arg_merge) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Using --boot or --list-boots with --merge is not supported."); + + if (!strv_isempty(arg_system_units) && arg_journal_type == SD_JOURNAL_CURRENT_USER) { + /* Specifying --user and --unit= at the same time makes no sense (as the former excludes the user + * journal, but the latter excludes the system journal, thus resulting in empty output). Let's be nice + * to users, and automatically turn --unit= into --user-unit= if combined with --user. */ + r = strv_extend_strv(&arg_user_units, arg_system_units, true); + if (r < 0) + return r; + + arg_system_units = strv_free(arg_system_units); + } + +#if HAVE_PCRE2 + if (arg_pattern) { + unsigned flags; + + r = dlopen_pcre2(); + if (r < 0) + return r; + + if (arg_case_sensitive >= 0) + flags = !arg_case_sensitive * PCRE2_CASELESS; + else { + _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL; + bool has_case; + _cleanup_(sym_pcre2_code_freep) pcre2_code *cs = NULL; + + md = sym_pcre2_match_data_create(1, NULL); + if (!md) + return log_oom(); + + r = pattern_compile("[[:upper:]]", 0, &cs); + if (r < 0) + return r; + + r = sym_pcre2_match(cs, (PCRE2_SPTR8) arg_pattern, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL); + has_case = r >= 0; + + flags = !has_case * PCRE2_CASELESS; + } + + log_debug("Doing case %s matching based on %s", + flags & PCRE2_CASELESS ? "insensitive" : "sensitive", + arg_case_sensitive >= 0 ? "request" : "pattern casing"); + + r = pattern_compile(arg_pattern, flags, &arg_compiled_pattern); + if (r < 0) + return r; + } +#endif + + return 1; +} + +static int add_matches(sd_journal *j, char **args) { + char **i; + bool have_term = false; + + assert(j); + + STRV_FOREACH(i, args) { + int r; + + if (streq(*i, "+")) { + if (!have_term) + break; + r = sd_journal_add_disjunction(j); + have_term = false; + + } else if (path_is_absolute(*i)) { + _cleanup_free_ char *p = NULL, *t = NULL, *t2 = NULL, *interpreter = NULL; + struct stat st; + + r = chase_symlinks(*i, NULL, CHASE_TRAIL_SLASH, &p, NULL); + if (r < 0) + return log_error_errno(r, "Couldn't canonicalize path: %m"); + + if (lstat(p, &st) < 0) + return log_error_errno(errno, "Couldn't stat file: %m"); + + if (S_ISREG(st.st_mode) && (0111 & st.st_mode)) { + if (executable_is_script(p, &interpreter) > 0) { + _cleanup_free_ char *comm; + + comm = strndup(basename(p), 15); + if (!comm) + return log_oom(); + + t = strjoin("_COMM=", comm); + if (!t) + return log_oom(); + + /* Append _EXE only if the interpreter is not a link. + Otherwise, it might be outdated often. */ + if (lstat(interpreter, &st) == 0 && !S_ISLNK(st.st_mode)) { + t2 = strjoin("_EXE=", interpreter); + if (!t2) + return log_oom(); + } + } else { + t = strjoin("_EXE=", p); + if (!t) + return log_oom(); + } + + r = sd_journal_add_match(j, t, 0); + + if (r >=0 && t2) + r = sd_journal_add_match(j, t2, 0); + + } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + r = add_matches_for_device(j, p); + if (r < 0) + return r; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "File is neither a device node, nor regular file, nor executable: %s", + *i); + + have_term = true; + } else { + r = sd_journal_add_match(j, *i, 0); + have_term = true; + } + + if (r < 0) + return log_error_errno(r, "Failed to add match '%s': %m", *i); + } + + if (!strv_isempty(args) && !have_term) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "\"+\" can only be used between terms"); + + return 0; +} + +static void boot_id_free_all(BootId *l) { + + while (l) { + BootId *i = l; + LIST_REMOVE(boot_list, l, i); + free(i); + } +} + +static int discover_next_boot(sd_journal *j, + sd_id128_t previous_boot_id, + bool advance_older, + BootId **ret) { + + _cleanup_free_ BootId *next_boot = NULL; + char match[9+32+1] = "_BOOT_ID="; + sd_id128_t boot_id; + int r; + + assert(j); + assert(ret); + + /* We expect the journal to be on the last position of a boot + * (in relation to the direction we are going), so that the next + * invocation of sd_journal_next/previous will be from a different + * boot. We then collect any information we desire and then jump + * to the last location of the new boot by using a _BOOT_ID match + * coming from the other journal direction. */ + + /* Make sure we aren't restricted by any _BOOT_ID matches, so that + * we can actually advance to a *different* boot. */ + sd_journal_flush_matches(j); + + do { + if (advance_older) + r = sd_journal_previous(j); + else + r = sd_journal_next(j); + if (r < 0) + return r; + else if (r == 0) + return 0; /* End of journal, yay. */ + + r = sd_journal_get_monotonic_usec(j, NULL, &boot_id); + if (r < 0) + return r; + + /* We iterate through this in a loop, until the boot ID differs from the previous one. Note that + * normally, this will only require a single iteration, as we seeked to the last entry of the previous + * boot entry already. However, it might happen that the per-journal-field entry arrays are less + * complete than the main entry array, and hence might reference an entry that's not actually the last + * one of the boot ID as last one. Let's hence use the per-field array is initial seek position to + * speed things up, but let's not trust that it is complete, and hence, manually advance as + * necessary. */ + + } while (sd_id128_equal(boot_id, previous_boot_id)); + + next_boot = new0(BootId, 1); + if (!next_boot) + return -ENOMEM; + + next_boot->id = boot_id; + + r = sd_journal_get_realtime_usec(j, &next_boot->first); + if (r < 0) + return r; + + /* Now seek to the last occurrence of this boot ID. */ + sd_id128_to_string(next_boot->id, match + 9); + r = sd_journal_add_match(j, match, sizeof(match) - 1); + if (r < 0) + return r; + + if (advance_older) + r = sd_journal_seek_head(j); + else + r = sd_journal_seek_tail(j); + if (r < 0) + return r; + + if (advance_older) + r = sd_journal_next(j); + else + r = sd_journal_previous(j); + if (r < 0) + return r; + else if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENODATA), + "Whoopsie! We found a boot ID but can't read its last entry."); /* This shouldn't happen. We just came from this very boot ID. */ + + r = sd_journal_get_realtime_usec(j, &next_boot->last); + if (r < 0) + return r; + + *ret = TAKE_PTR(next_boot); + + return 0; +} + +static int get_boots( + sd_journal *j, + BootId **boots, + sd_id128_t *boot_id, + int offset) { + + bool skip_once; + int r, count = 0; + BootId *head = NULL, *tail = NULL, *id; + const bool advance_older = boot_id && offset <= 0; + sd_id128_t previous_boot_id; + + assert(j); + + /* Adjust for the asymmetry that offset 0 is + * the last (and current) boot, while 1 is considered the + * (chronological) first boot in the journal. */ + skip_once = boot_id && sd_id128_is_null(*boot_id) && offset <= 0; + + /* Advance to the earliest/latest occurrence of our reference + * boot ID (taking our lookup direction into account), so that + * discover_next_boot() can do its job. + * If no reference is given, the journal head/tail will do, + * they're "virtual" boots after all. */ + if (boot_id && !sd_id128_is_null(*boot_id)) { + char match[9+32+1] = "_BOOT_ID="; + + sd_journal_flush_matches(j); + + sd_id128_to_string(*boot_id, match + 9); + r = sd_journal_add_match(j, match, sizeof(match) - 1); + if (r < 0) + return r; + + if (advance_older) + r = sd_journal_seek_head(j); /* seek to oldest */ + else + r = sd_journal_seek_tail(j); /* seek to newest */ + if (r < 0) + return r; + + if (advance_older) + r = sd_journal_next(j); /* read the oldest entry */ + else + r = sd_journal_previous(j); /* read the most recently added entry */ + if (r < 0) + return r; + else if (r == 0) + goto finish; + else if (offset == 0) { + count = 1; + goto finish; + } + + /* At this point the read pointer is positioned at the oldest/newest occurrence of the reference boot + * ID. After flushing the matches, one more invocation of _previous()/_next() will hence place us at + * the following entry, which must then have an older/newer boot ID */ + } else { + + if (advance_older) + r = sd_journal_seek_tail(j); /* seek to newest */ + else + r = sd_journal_seek_head(j); /* seek to oldest */ + if (r < 0) + return r; + + /* No sd_journal_next()/_previous() here. + * + * At this point the read pointer is positioned after the newest/before the oldest entry in the whole + * journal. The next invocation of _previous()/_next() will hence position us at the newest/oldest + * entry we have. */ + } + + previous_boot_id = SD_ID128_NULL; + for (;;) { + _cleanup_free_ BootId *current = NULL; + + r = discover_next_boot(j, previous_boot_id, advance_older, ¤t); + if (r < 0) { + boot_id_free_all(head); + return r; + } + + if (!current) + break; + + previous_boot_id = current->id; + + if (boot_id) { + if (!skip_once) + offset += advance_older ? 1 : -1; + skip_once = false; + + if (offset == 0) { + count = 1; + *boot_id = current->id; + break; + } + } else { + LIST_FOREACH(boot_list, id, head) { + if (sd_id128_equal(id->id, current->id)) { + /* boot id already stored, something wrong with the journal files */ + /* exiting as otherwise this problem would cause forever loop */ + goto finish; + } + } + LIST_INSERT_AFTER(boot_list, head, tail, current); + tail = TAKE_PTR(current); + count++; + } + } + +finish: + if (boots) + *boots = head; + + sd_journal_flush_matches(j); + + return count; +} + +static int list_boots(sd_journal *j) { + int w, i, count; + BootId *id, *all_ids; + + assert(j); + + count = get_boots(j, &all_ids, NULL, 0); + if (count < 0) + return log_error_errno(count, "Failed to determine boots: %m"); + if (count == 0) + return count; + + (void) pager_open(arg_pager_flags); + + /* numbers are one less, but we need an extra char for the sign */ + w = DECIMAL_STR_WIDTH(count - 1) + 1; + + i = 0; + LIST_FOREACH(boot_list, id, all_ids) { + char a[FORMAT_TIMESTAMP_MAX], b[FORMAT_TIMESTAMP_MAX]; + + printf("% *i " SD_ID128_FORMAT_STR " %s—%s\n", + w, i - count + 1, + SD_ID128_FORMAT_VAL(id->id), + format_timestamp_maybe_utc(a, sizeof(a), id->first), + format_timestamp_maybe_utc(b, sizeof(b), id->last)); + i++; + } + + boot_id_free_all(all_ids); + + return 0; +} + +static int add_boot(sd_journal *j) { + char match[9+32+1] = "_BOOT_ID="; + sd_id128_t boot_id; + int r; + + assert(j); + + if (!arg_boot) + return 0; + + /* Take a shortcut and use the current boot_id, which we can do very quickly. + * We can do this only when we logs are coming from the current machine, + * so take the slow path if log location is specified. */ + if (arg_boot_offset == 0 && sd_id128_is_null(arg_boot_id) && + !arg_directory && !arg_file && !arg_root) + return add_match_this_boot(j, arg_machine); + + boot_id = arg_boot_id; + r = get_boots(j, NULL, &boot_id, arg_boot_offset); + assert(r <= 1); + if (r <= 0) { + const char *reason = (r == 0) ? "No such boot ID in journal" : strerror_safe(r); + + if (sd_id128_is_null(arg_boot_id)) + log_error("Data from the specified boot (%+i) is not available: %s", + arg_boot_offset, reason); + else + log_error("Data from the specified boot ("SD_ID128_FORMAT_STR") is not available: %s", + SD_ID128_FORMAT_VAL(arg_boot_id), reason); + + return r == 0 ? -ENODATA : r; + } + + sd_id128_to_string(boot_id, match + 9); + + r = sd_journal_add_match(j, match, sizeof(match) - 1); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int add_dmesg(sd_journal *j) { + int r; + assert(j); + + if (!arg_dmesg) + return 0; + + r = sd_journal_add_match(j, "_TRANSPORT=kernel", + STRLEN("_TRANSPORT=kernel")); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int get_possible_units( + sd_journal *j, + const char *fields, + char **patterns, + Set **units) { + + _cleanup_set_free_free_ Set *found; + const char *field; + int r; + + found = set_new(&string_hash_ops); + if (!found) + return -ENOMEM; + + NULSTR_FOREACH(field, fields) { + const void *data; + size_t size; + + r = sd_journal_query_unique(j, field); + if (r < 0) + return r; + + SD_JOURNAL_FOREACH_UNIQUE(j, data, size) { + char **pattern, *eq; + size_t prefix; + _cleanup_free_ char *u = NULL; + + eq = memchr(data, '=', size); + if (eq) + prefix = eq - (char*) data + 1; + else + prefix = 0; + + u = strndup((char*) data + prefix, size - prefix); + if (!u) + return -ENOMEM; + + STRV_FOREACH(pattern, patterns) + if (fnmatch(*pattern, u, FNM_NOESCAPE) == 0) { + log_debug("Matched %s with pattern %s=%s", u, field, *pattern); + + r = set_consume(found, u); + u = NULL; + if (r < 0 && r != -EEXIST) + return r; + + break; + } + } + } + + *units = TAKE_PTR(found); + + return 0; +} + +/* This list is supposed to return the superset of unit names + * possibly matched by rules added with add_matches_for_unit... */ +#define SYSTEM_UNITS \ + "_SYSTEMD_UNIT\0" \ + "COREDUMP_UNIT\0" \ + "UNIT\0" \ + "OBJECT_SYSTEMD_UNIT\0" \ + "_SYSTEMD_SLICE\0" + +/* ... and add_matches_for_user_unit */ +#define USER_UNITS \ + "_SYSTEMD_USER_UNIT\0" \ + "USER_UNIT\0" \ + "COREDUMP_USER_UNIT\0" \ + "OBJECT_SYSTEMD_USER_UNIT\0" \ + "_SYSTEMD_USER_SLICE\0" + +static int add_units(sd_journal *j) { + _cleanup_strv_free_ char **patterns = NULL; + int r, count = 0; + char **i; + + assert(j); + + STRV_FOREACH(i, arg_system_units) { + _cleanup_free_ char *u = NULL; + + r = unit_name_mangle(*i, UNIT_NAME_MANGLE_GLOB | (arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN), &u); + if (r < 0) + return r; + + if (string_is_glob(u)) { + r = strv_push(&patterns, u); + if (r < 0) + return r; + u = NULL; + } else { + r = add_matches_for_unit(j, u); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + if (!strv_isempty(patterns)) { + _cleanup_set_free_free_ Set *units = NULL; + char *u; + + r = get_possible_units(j, SYSTEM_UNITS, patterns, &units); + if (r < 0) + return r; + + SET_FOREACH(u, units) { + r = add_matches_for_unit(j, u); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + patterns = strv_free(patterns); + + STRV_FOREACH(i, arg_user_units) { + _cleanup_free_ char *u = NULL; + + r = unit_name_mangle(*i, UNIT_NAME_MANGLE_GLOB | (arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN), &u); + if (r < 0) + return r; + + if (string_is_glob(u)) { + r = strv_push(&patterns, u); + if (r < 0) + return r; + u = NULL; + } else { + r = add_matches_for_user_unit(j, u, getuid()); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + if (!strv_isempty(patterns)) { + _cleanup_set_free_free_ Set *units = NULL; + char *u; + + r = get_possible_units(j, USER_UNITS, patterns, &units); + if (r < 0) + return r; + + SET_FOREACH(u, units) { + r = add_matches_for_user_unit(j, u, getuid()); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + /* Complain if the user request matches but nothing whatsoever was + * found, since otherwise everything would be matched. */ + if (!(strv_isempty(arg_system_units) && strv_isempty(arg_user_units)) && count == 0) + return -ENODATA; + + r = sd_journal_add_conjunction(j); + if (r < 0) + return r; + + return 0; +} + +static int add_priorities(sd_journal *j) { + char match[] = "PRIORITY=0"; + int i, r; + assert(j); + + if (arg_priorities == 0xFF) + return 0; + + for (i = LOG_EMERG; i <= LOG_DEBUG; i++) + if (arg_priorities & (1 << i)) { + match[sizeof(match)-2] = '0' + i; + + r = sd_journal_add_match(j, match, strlen(match)); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int add_facilities(sd_journal *j) { + void *p; + int r; + + SET_FOREACH(p, arg_facilities) { + char match[STRLEN("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int)]; + + xsprintf(match, "SYSLOG_FACILITY=%d", PTR_TO_INT(p)); + + r = sd_journal_add_match(j, match, strlen(match)); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + + return 0; +} + +static int add_syslog_identifier(sd_journal *j) { + int r; + char **i; + + assert(j); + + STRV_FOREACH(i, arg_syslog_identifier) { + _cleanup_free_ char *u = NULL; + + u = strjoin("SYSLOG_IDENTIFIER=", *i); + if (!u) + return -ENOMEM; + r = sd_journal_add_match(j, u, 0); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + } + + r = sd_journal_add_conjunction(j); + if (r < 0) + return r; + + return 0; +} + +#if HAVE_GCRYPT +static int format_journal_url( + const void *seed, + size_t seed_size, + uint64_t start, + uint64_t interval, + const char *hn, + sd_id128_t machine, + bool full, + char **ret_url) { + _cleanup_free_ char *url = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t url_size = 0; + int r; + + assert(seed); + assert(seed_size > 0); + + f = open_memstream_unlocked(&url, &url_size); + if (!f) + return -ENOMEM; + + if (full) + fputs("fss://", f); + + for (size_t i = 0; i < seed_size; i++) { + if (i > 0 && i % 3 == 0) + fputc('-', f); + fprintf(f, "%02x", ((uint8_t*) seed)[i]); + } + + fprintf(f, "/%"PRIx64"-%"PRIx64, start, interval); + + if (full) { + fprintf(f, "?machine=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(machine)); + if (hn) + fprintf(f, ";hostname=%s", hn); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + f = safe_fclose(f); + *ret_url = TAKE_PTR(url); + return 0; +} +#endif + +static int setup_keys(void) { +#if HAVE_GCRYPT + size_t mpk_size, seed_size, state_size; + _cleanup_(unlink_and_freep) char *k = NULL; + _cleanup_free_ char *p = NULL; + uint8_t *mpk, *seed, *state; + _cleanup_close_ int fd = -1; + sd_id128_t machine, boot; + struct stat st; + uint64_t n; + int r; + + r = stat("/var/log/journal", &st); + if (r < 0 && !IN_SET(errno, ENOENT, ENOTDIR)) + return log_error_errno(errno, "stat(\"%s\") failed: %m", "/var/log/journal"); + + if (r < 0 || !S_ISDIR(st.st_mode)) { + log_error("%s is not a directory, must be using persistent logging for FSS.", + "/var/log/journal"); + return r < 0 ? -errno : -ENOTDIR; + } + + r = sd_id128_get_machine(&machine); + if (r < 0) + return log_error_errno(r, "Failed to get machine ID: %m"); + + r = sd_id128_get_boot(&boot); + if (r < 0) + return log_error_errno(r, "Failed to get boot ID: %m"); + + if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fss", + SD_ID128_FORMAT_VAL(machine)) < 0) + return log_oom(); + + if (arg_force) { + r = unlink(p); + if (r < 0 && errno != ENOENT) + return log_error_errno(errno, "unlink(\"%s\") failed: %m", p); + } else if (access(p, F_OK) >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "Sealing key file %s exists already. Use --force to recreate.", p); + + if (asprintf(&k, "/var/log/journal/" SD_ID128_FORMAT_STR "/fss.tmp.XXXXXX", + SD_ID128_FORMAT_VAL(machine)) < 0) + return log_oom(); + + mpk_size = FSPRG_mskinbytes(FSPRG_RECOMMENDED_SECPAR); + mpk = alloca(mpk_size); + + seed_size = FSPRG_RECOMMENDED_SEEDLEN; + seed = alloca(seed_size); + + state_size = FSPRG_stateinbytes(FSPRG_RECOMMENDED_SECPAR); + state = alloca(state_size); + + log_info("Generating seed..."); + r = genuine_random_bytes(seed, seed_size, RANDOM_BLOCK); + if (r < 0) + return log_error_errno(r, "Failed to acquire random seed: %m"); + + log_info("Generating key pair..."); + FSPRG_GenMK(NULL, mpk, seed, seed_size, FSPRG_RECOMMENDED_SECPAR); + + log_info("Generating sealing key..."); + FSPRG_GenState0(state, mpk, seed, seed_size); + + assert(arg_interval > 0); + + n = now(CLOCK_REALTIME); + n /= arg_interval; + + safe_close(fd); + fd = mkostemp_safe(k); + if (fd < 0) + return log_error_errno(fd, "Failed to open %s: %m", k); + + /* Enable secure remove, exclusion from dump, synchronous writing and in-place updating */ + static const unsigned chattr_flags[] = { + FS_SECRM_FL, + FS_NODUMP_FL, + FS_SYNC_FL, + FS_NOCOW_FL, + }; + for (size_t j = 0; j < ELEMENTSOF(chattr_flags); j++) { + r = chattr_fd(fd, chattr_flags[j], chattr_flags[j], NULL); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set file attribute 0x%x: %m", chattr_flags[j]); + } + + struct FSSHeader h = { + .signature = { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' }, + .machine_id = machine, + .boot_id = boot, + .header_size = htole64(sizeof(h)), + .start_usec = htole64(n * arg_interval), + .interval_usec = htole64(arg_interval), + .fsprg_secpar = htole16(FSPRG_RECOMMENDED_SECPAR), + .fsprg_state_size = htole64(state_size), + }; + + r = loop_write(fd, &h, sizeof(h), false); + if (r < 0) + return log_error_errno(r, "Failed to write header: %m"); + + r = loop_write(fd, state, state_size, false); + if (r < 0) + return log_error_errno(r, "Failed to write state: %m"); + + if (rename(k, p) < 0) + return log_error_errno(errno, "Failed to link file: %m"); + + k = mfree(k); + + _cleanup_free_ char *hn = NULL, *key = NULL; + + r = format_journal_url(seed, seed_size, n, arg_interval, hn, machine, false, &key); + if (r < 0) + return r; + + if (on_tty()) { + hn = gethostname_malloc(); + if (hn) + hostname_cleanup(hn); + + char tsb[FORMAT_TIMESPAN_MAX]; + fprintf(stderr, + "\nNew keys have been generated for host %s%s" SD_ID128_FORMAT_STR ".\n" + "\n" + "The %ssecret sealing key%s has been written to the following local file.\n" + "This key file is automatically updated when the sealing key is advanced.\n" + "It should not be used on multiple hosts.\n" + "\n" + "\t%s\n" + "\n" + "The sealing key is automatically changed every %s.\n" + "\n" + "Please write down the following %ssecret verification key%s. It should be stored\n" + "in a safe location and should not be saved locally on disk.\n" + "\n\t%s", + strempty(hn), hn ? "/" : "", + SD_ID128_FORMAT_VAL(machine), + ansi_highlight(), ansi_normal(), + p, + format_timespan(tsb, sizeof(tsb), arg_interval, 0), + ansi_highlight(), ansi_normal(), + ansi_highlight_red()); + fflush(stderr); + } + + puts(key); + + if (on_tty()) { + fprintf(stderr, "%s", ansi_normal()); +#if HAVE_QRENCODE + _cleanup_free_ char *url = NULL; + r = format_journal_url(seed, seed_size, n, arg_interval, hn, machine, true, &url); + if (r < 0) + return r; + + (void) print_qrcode(stderr, + "To transfer the verification key to your phone scan the QR code below", + url); +#endif + } + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Forward-secure sealing not available."); +#endif +} + +static int verify(sd_journal *j) { + int r = 0; + JournalFile *f; + + assert(j); + + log_show_color(true); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + int k; + usec_t first = 0, validated = 0, last = 0; + +#if HAVE_GCRYPT + if (!arg_verify_key && JOURNAL_HEADER_SEALED(f->header)) + log_notice("Journal file %s has sealing enabled but verification key has not been passed using --verify-key=.", f->path); +#endif + + k = journal_file_verify(f, arg_verify_key, &first, &validated, &last, true); + if (k == -EINVAL) + /* If the key was invalid give up right-away. */ + return k; + else if (k < 0) + r = log_warning_errno(k, "FAIL: %s (%m)", f->path); + else { + char a[FORMAT_TIMESTAMP_MAX], b[FORMAT_TIMESTAMP_MAX], c[FORMAT_TIMESPAN_MAX]; + log_info("PASS: %s", f->path); + + if (arg_verify_key && JOURNAL_HEADER_SEALED(f->header)) { + if (validated > 0) { + log_info("=> Validated from %s to %s, final %s entries not sealed.", + format_timestamp_maybe_utc(a, sizeof(a), first), + format_timestamp_maybe_utc(b, sizeof(b), validated), + format_timespan(c, sizeof(c), last > validated ? last - validated : 0, 0)); + } else if (last > 0) + log_info("=> No sealing yet, %s of entries not sealed.", + format_timespan(c, sizeof(c), last - first, 0)); + else + log_info("=> No sealing yet, no entries in file."); + } + } + } + + return r; +} + +static int simple_varlink_call(const char *option, const char *method) { + _cleanup_(varlink_flush_close_unrefp) Varlink *link = NULL; + const char *error, *fn; + int r; + + if (arg_machine) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "%s is not supported in conjunction with --machine=.", option); + + fn = arg_namespace ? + strjoina("/run/systemd/journal.", arg_namespace, "/io.systemd.journal") : + "/run/systemd/journal/io.systemd.journal"; + + r = varlink_connect_address(&link, fn); + if (r < 0) + return log_error_errno(r, "Failed to connect to %s: %m", fn); + + (void) varlink_set_description(link, "journal"); + (void) varlink_set_relative_timeout(link, USEC_INFINITY); + + r = varlink_call(link, method, NULL, NULL, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to execute varlink call: %m"); + if (error) + return log_error_errno(SYNTHETIC_ERRNO(ENOANO), + "Failed to execute varlink call: %s", error); + + return 0; +} + +static int flush_to_var(void) { + return simple_varlink_call("--flush", "io.systemd.Journal.FlushToVar"); +} + +static int relinquish_var(void) { + return simple_varlink_call("--relinquish-var/--smart-relinquish-var", "io.systemd.Journal.RelinquishVar"); +} + +static int rotate(void) { + return simple_varlink_call("--rotate", "io.systemd.Journal.Rotate"); +} + +static int sync_journal(void) { + return simple_varlink_call("--sync", "io.systemd.Journal.Synchronize"); +} + +static int wait_for_change(sd_journal *j, int poll_fd) { + struct pollfd pollfds[] = { + { .fd = poll_fd, .events = POLLIN }, + { .fd = STDOUT_FILENO }, + }; + + struct timespec ts; + usec_t timeout; + int r; + + assert(j); + assert(poll_fd >= 0); + + /* Much like sd_journal_wait() but also keeps an eye on STDOUT, and exits as soon as we see a POLLHUP on that, + * i.e. when it is closed. */ + + r = sd_journal_get_timeout(j, &timeout); + if (r < 0) + return log_error_errno(r, "Failed to determine journal waiting time: %m"); + + if (ppoll(pollfds, ELEMENTSOF(pollfds), + timeout == USEC_INFINITY ? NULL : timespec_store(&ts, timeout), NULL) < 0) { + if (errno == EINTR) + return 0; + + return log_error_errno(errno, "Couldn't wait for journal event: %m"); + } + + if (pollfds[1].revents & (POLLHUP|POLLERR|POLLNVAL)) /* STDOUT has been closed? */ + return log_debug_errno(SYNTHETIC_ERRNO(ECANCELED), + "Standard output has been closed."); + + if (pollfds[0].revents & POLLNVAL) + return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "Change fd closed?"); + + r = sd_journal_process(j); + if (r < 0) + return log_error_errno(r, "Failed to process journal events: %m"); + + return 0; +} + +int main(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *unlink_dir = NULL; + bool previous_boot_id_valid = false, first_line = true, ellipsized = false, need_seek = false; + bool use_cursor = false, after_cursor = false; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + sd_id128_t previous_boot_id; + int n_shown = 0, r, poll_fd = -1; + + setlocale(LC_ALL, ""); + log_setup_cli(); + + /* Increase max number of open files if we can, we might needs this when browsing journal files, which might be + * split up into many files. */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + r = parse_argv(argc, argv); + if (r <= 0) + goto finish; + + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + DISSECT_IMAGE_REQUIRE_ROOT|DISSECT_IMAGE_VALIDATE_OS|DISSECT_IMAGE_RELAX_VAR_CHECK| + (arg_action == ACTION_UPDATE_CATALOG ? DISSECT_IMAGE_FSCK : DISSECT_IMAGE_READ_ONLY), + &unlink_dir, + &loop_device, + &decrypted_image); + if (r < 0) + return r; + + arg_root = strdup(unlink_dir); + if (!arg_root) + return log_oom(); + } + + signal(SIGWINCH, columns_lines_cache_reset); + sigbus_install(); + + switch (arg_action) { + + case ACTION_NEW_ID128: + r = id128_print_new(ID128_PRINT_PRETTY); + goto finish; + + case ACTION_SETUP_KEYS: + r = setup_keys(); + goto finish; + + case ACTION_LIST_CATALOG: + case ACTION_DUMP_CATALOG: + case ACTION_UPDATE_CATALOG: { + _cleanup_free_ char *database; + + database = path_join(arg_root, CATALOG_DATABASE); + if (!database) { + r = log_oom(); + goto finish; + } + + if (arg_action == ACTION_UPDATE_CATALOG) { + r = catalog_update(database, arg_root, catalog_file_dirs); + if (r < 0) + log_error_errno(r, "Failed to list catalog: %m"); + } else { + bool oneline = arg_action == ACTION_LIST_CATALOG; + + (void) pager_open(arg_pager_flags); + + if (optind < argc) + r = catalog_list_items(stdout, database, oneline, argv + optind); + else + r = catalog_list(stdout, database, oneline); + if (r < 0) + log_error_errno(r, "Failed to list catalog: %m"); + } + + goto finish; + } + + case ACTION_FLUSH: + r = flush_to_var(); + goto finish; + + case ACTION_RELINQUISH_VAR: + r = relinquish_var(); + goto finish; + + case ACTION_SYNC: + r = sync_journal(); + goto finish; + + case ACTION_ROTATE: + r = rotate(); + goto finish; + + case ACTION_SHOW: + case ACTION_PRINT_HEADER: + case ACTION_VERIFY: + case ACTION_DISK_USAGE: + case ACTION_LIST_BOOTS: + case ACTION_VACUUM: + case ACTION_ROTATE_AND_VACUUM: + case ACTION_LIST_FIELDS: + case ACTION_LIST_FIELD_NAMES: + /* These ones require access to the journal files, continue below. */ + break; + + default: + assert_not_reached("Unknown action"); + } + + if (arg_directory) + r = sd_journal_open_directory(&j, arg_directory, arg_journal_type); + else if (arg_root) + r = sd_journal_open_directory(&j, arg_root, arg_journal_type | SD_JOURNAL_OS_ROOT); + else if (arg_file_stdin) + r = sd_journal_open_files_fd(&j, (int[]) { STDIN_FILENO }, 1, 0); + else if (arg_file) + r = sd_journal_open_files(&j, (const char**) arg_file, 0); + else if (arg_machine) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int fd; + + if (geteuid() != 0) { + /* The file descriptor returned by OpenMachineRootDirectory() will be owned by users/groups of + * the container, thus we need root privileges to override them. */ + r = log_error_errno(SYNTHETIC_ERRNO(EPERM), "Using the --machine= switch requires root privileges."); + goto finish; + } + + r = sd_bus_open_system(&bus); + if (r < 0) { + log_error_errno(r, "Failed to open system bus: %m"); + goto finish; + } + + r = sd_bus_call_method( + bus, + "org.freedesktop.machine1", + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + "OpenMachineRootDirectory", + &error, + &reply, + "s", arg_machine); + if (r < 0) { + log_error_errno(r, "Failed to open root directory: %s", bus_error_message(&error, r)); + goto finish; + } + + r = sd_bus_message_read(reply, "h", &fd); + if (r < 0) { + bus_log_parse_error(r); + goto finish; + } + + fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (fd < 0) { + r = log_error_errno(errno, "Failed to duplicate file descriptor: %m"); + goto finish; + } + + r = sd_journal_open_directory_fd(&j, fd, SD_JOURNAL_OS_ROOT); + if (r < 0) + safe_close(fd); + } else + r = sd_journal_open_namespace( + &j, + arg_namespace, + (arg_merge ? 0 : SD_JOURNAL_LOCAL_ONLY) | + arg_namespace_flags | arg_journal_type); + if (r < 0) { + log_error_errno(r, "Failed to open %s: %m", arg_directory ?: arg_file ? "files" : "journal"); + goto finish; + } + + r = journal_access_check_and_warn(j, arg_quiet, + !(arg_journal_type == SD_JOURNAL_CURRENT_USER || arg_user_units)); + if (r < 0) + goto finish; + + switch (arg_action) { + + case ACTION_NEW_ID128: + case ACTION_SETUP_KEYS: + case ACTION_LIST_CATALOG: + case ACTION_DUMP_CATALOG: + case ACTION_UPDATE_CATALOG: + case ACTION_FLUSH: + case ACTION_SYNC: + case ACTION_ROTATE: + assert_not_reached("Unexpected action."); + + case ACTION_PRINT_HEADER: + journal_print_header(j); + r = 0; + goto finish; + + case ACTION_VERIFY: + r = verify(j); + goto finish; + + case ACTION_DISK_USAGE: { + uint64_t bytes = 0; + char sbytes[FORMAT_BYTES_MAX]; + + r = sd_journal_get_usage(j, &bytes); + if (r < 0) + goto finish; + + printf("Archived and active journals take up %s in the file system.\n", + format_bytes(sbytes, sizeof(sbytes), bytes)); + goto finish; + } + + case ACTION_LIST_BOOTS: + r = list_boots(j); + goto finish; + + case ACTION_ROTATE_AND_VACUUM: + + r = rotate(); + if (r < 0) + goto finish; + + _fallthrough_; + + case ACTION_VACUUM: { + Directory *d; + + HASHMAP_FOREACH(d, j->directories_by_path) { + int q; + + q = journal_directory_vacuum(d->path, arg_vacuum_size, arg_vacuum_n_files, arg_vacuum_time, NULL, !arg_quiet); + if (q < 0) + r = log_error_errno(q, "Failed to vacuum %s: %m", d->path); + } + + goto finish; + } + + case ACTION_LIST_FIELD_NAMES: { + const char *field; + + SD_JOURNAL_FOREACH_FIELD(j, field) { + printf("%s\n", field); + n_shown++; + } + + r = 0; + goto finish; + } + + case ACTION_SHOW: + case ACTION_LIST_FIELDS: + break; + + default: + assert_not_reached("Unknown action"); + } + + if (arg_boot_offset != 0 && + sd_journal_has_runtime_files(j) > 0 && + sd_journal_has_persistent_files(j) == 0) { + log_info("Specifying boot ID or boot offset has no effect, no persistent journal was found."); + r = 0; + goto finish; + } + /* add_boot() must be called first! + * It may need to seek the journal to find parent boot IDs. */ + r = add_boot(j); + if (r < 0) + goto finish; + + r = add_dmesg(j); + if (r < 0) + goto finish; + + r = add_units(j); + if (r < 0) { + log_error_errno(r, "Failed to add filter for units: %m"); + goto finish; + } + + r = add_syslog_identifier(j); + if (r < 0) { + log_error_errno(r, "Failed to add filter for syslog identifiers: %m"); + goto finish; + } + + r = add_priorities(j); + if (r < 0) + goto finish; + + r = add_facilities(j); + if (r < 0) + goto finish; + + r = add_matches(j, argv + optind); + if (r < 0) + goto finish; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *filter; + + filter = journal_make_match_string(j); + if (!filter) + return log_oom(); + + log_debug("Journal filter: %s", filter); + } + + if (arg_action == ACTION_LIST_FIELDS) { + const void *data; + size_t size; + + assert(arg_field); + + r = sd_journal_set_data_threshold(j, 0); + if (r < 0) { + log_error_errno(r, "Failed to unset data size threshold: %m"); + goto finish; + } + + r = sd_journal_query_unique(j, arg_field); + if (r < 0) { + log_error_errno(r, "Failed to query unique data objects: %m"); + goto finish; + } + + SD_JOURNAL_FOREACH_UNIQUE(j, data, size) { + const void *eq; + + if (arg_lines >= 0 && n_shown >= arg_lines) + break; + + eq = memchr(data, '=', size); + if (eq) + printf("%.*s\n", (int) (size - ((const uint8_t*) eq - (const uint8_t*) data + 1)), (const char*) eq + 1); + else + printf("%.*s\n", (int) size, (const char*) data); + + n_shown++; + } + + r = 0; + goto finish; + } + + /* Opening the fd now means the first sd_journal_wait() will actually wait */ + if (arg_follow) { + poll_fd = sd_journal_get_fd(j); + if (poll_fd == -EMFILE) { + log_warning_errno(poll_fd, "Insufficient watch descriptors available. Reverting to -n."); + arg_follow = false; + } else if (poll_fd == -EMEDIUMTYPE) { + log_error_errno(poll_fd, "The --follow switch is not supported in conjunction with reading from STDIN."); + goto finish; + } else if (poll_fd < 0) { + log_error_errno(poll_fd, "Failed to get journal fd: %m"); + goto finish; + } + } + + if (arg_cursor || arg_after_cursor || arg_cursor_file) { + _cleanup_free_ char *cursor_from_file = NULL; + const char *cursor = arg_cursor ?: arg_after_cursor; + + if (arg_cursor_file) { + r = read_one_line_file(arg_cursor_file, &cursor_from_file); + if (r < 0 && r != -ENOENT) { + log_error_errno(r, "Failed to read cursor file %s: %m", arg_cursor_file); + goto finish; + } + + if (r > 0) { + cursor = cursor_from_file; + after_cursor = true; + } + } else + after_cursor = arg_after_cursor; + + if (cursor) { + r = sd_journal_seek_cursor(j, cursor); + if (r < 0) { + log_error_errno(r, "Failed to seek to cursor: %m"); + goto finish; + } + use_cursor = true; + } + } + + if (use_cursor) { + if (!arg_reverse) + r = sd_journal_next_skip(j, 1 + after_cursor); + else + r = sd_journal_previous_skip(j, 1 + after_cursor); + + if (after_cursor && r < 2) { + /* We couldn't find the next entry after the cursor. */ + if (arg_follow) + need_seek = true; + else + arg_lines = 0; + } + + } else if (arg_since_set && !arg_reverse) { + r = sd_journal_seek_realtime_usec(j, arg_since); + if (r < 0) { + log_error_errno(r, "Failed to seek to date: %m"); + goto finish; + } + r = sd_journal_next(j); + + } else if (arg_until_set && arg_reverse) { + r = sd_journal_seek_realtime_usec(j, arg_until); + if (r < 0) { + log_error_errno(r, "Failed to seek to date: %m"); + goto finish; + } + r = sd_journal_previous(j); + + } else if (arg_reverse) { + r = sd_journal_seek_tail(j); + if (r < 0) { + log_error_errno(r, "Failed to seek to tail: %m"); + goto finish; + } + + r = sd_journal_previous(j); + + } else if (arg_lines >= 0) { + r = sd_journal_seek_tail(j); + if (r < 0) { + log_error_errno(r, "Failed to seek to tail: %m"); + goto finish; + } + + r = sd_journal_previous_skip(j, arg_lines); + + } else { + r = sd_journal_seek_head(j); + if (r < 0) { + log_error_errno(r, "Failed to seek to head: %m"); + goto finish; + } + + r = sd_journal_next(j); + } + + if (r < 0) { + log_error_errno(r, "Failed to iterate through journal: %m"); + goto finish; + } + if (r == 0) + need_seek = true; + + if (!arg_follow) + (void) pager_open(arg_pager_flags); + + if (!arg_quiet && (arg_lines != 0 || arg_follow)) { + usec_t start, end; + char start_buf[FORMAT_TIMESTAMP_MAX], end_buf[FORMAT_TIMESTAMP_MAX]; + + r = sd_journal_get_cutoff_realtime_usec(j, &start, &end); + if (r < 0) { + log_error_errno(r, "Failed to get cutoff: %m"); + goto finish; + } + + if (r > 0) { + if (arg_follow) + printf("-- Journal begins at %s. --\n", + format_timestamp_maybe_utc(start_buf, sizeof(start_buf), start)); + else + printf("-- Journal begins at %s, ends at %s. --\n", + format_timestamp_maybe_utc(start_buf, sizeof(start_buf), start), + format_timestamp_maybe_utc(end_buf, sizeof(end_buf), end)); + } + } + + for (;;) { + while (arg_lines < 0 || n_shown < arg_lines || (arg_follow && !first_line)) { + int flags; + size_t highlight[2] = {}; + + if (need_seek) { + if (!arg_reverse) + r = sd_journal_next(j); + else + r = sd_journal_previous(j); + if (r < 0) { + log_error_errno(r, "Failed to iterate through journal: %m"); + goto finish; + } + if (r == 0) + break; + } + + if (arg_until_set && !arg_reverse) { + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) { + log_error_errno(r, "Failed to determine timestamp: %m"); + goto finish; + } + if (usec > arg_until) + break; + } + + if (arg_since_set && arg_reverse) { + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) { + log_error_errno(r, "Failed to determine timestamp: %m"); + goto finish; + } + if (usec < arg_since) + break; + } + + if (!arg_merge && !arg_quiet) { + sd_id128_t boot_id; + + r = sd_journal_get_monotonic_usec(j, NULL, &boot_id); + if (r >= 0) { + if (previous_boot_id_valid && + !sd_id128_equal(boot_id, previous_boot_id)) + printf("%s-- Boot "SD_ID128_FORMAT_STR" --%s\n", + ansi_highlight(), SD_ID128_FORMAT_VAL(boot_id), ansi_normal()); + + previous_boot_id = boot_id; + previous_boot_id_valid = true; + } + } + +#if HAVE_PCRE2 + if (arg_compiled_pattern) { + _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL; + const void *message; + size_t len; + PCRE2_SIZE *ovec; + + md = sym_pcre2_match_data_create(1, NULL); + if (!md) + return log_oom(); + + r = sd_journal_get_data(j, "MESSAGE", &message, &len); + if (r < 0) { + if (r == -ENOENT) { + need_seek = true; + continue; + } + + log_error_errno(r, "Failed to get MESSAGE field: %m"); + goto finish; + } + + assert_se(message = startswith(message, "MESSAGE=")); + + r = sym_pcre2_match(arg_compiled_pattern, + message, + len - strlen("MESSAGE="), + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, + NULL); + if (r == PCRE2_ERROR_NOMATCH) { + need_seek = true; + continue; + } + if (r < 0) { + unsigned char buf[LINE_MAX]; + int r2; + + r2 = sym_pcre2_get_error_message(r, buf, sizeof buf); + log_error("Pattern matching failed: %s", + r2 < 0 ? "unknown error" : (char*) buf); + r = -EINVAL; + goto finish; + } + + ovec = sym_pcre2_get_ovector_pointer(md); + highlight[0] = ovec[0]; + highlight[1] = ovec[1]; + } +#endif + + flags = + arg_all * OUTPUT_SHOW_ALL | + arg_full * OUTPUT_FULL_WIDTH | + colors_enabled() * OUTPUT_COLOR | + arg_catalog * OUTPUT_CATALOG | + arg_utc * OUTPUT_UTC | + arg_no_hostname * OUTPUT_NO_HOSTNAME; + + r = show_journal_entry(stdout, j, arg_output, 0, flags, + arg_output_fields, highlight, &ellipsized); + need_seek = true; + if (r == -EADDRNOTAVAIL) + break; + else if (r < 0) + goto finish; + + n_shown++; + + /* If journalctl take a long time to process messages, and during that time journal file + * rotation occurs, a journalctl client will keep those rotated files open until it calls + * sd_journal_process(), which typically happens as a result of calling sd_journal_wait() below + * in the "following" case. By periodically calling sd_journal_process() during the processing + * loop we shrink the window of time a client instance has open file descriptors for rotated + * (deleted) journal files. */ + if ((n_shown % PROCESS_INOTIFY_INTERVAL) == 0) { + r = sd_journal_process(j); + if (r < 0) { + log_error_errno(r, "Failed to process inotify events: %m"); + goto finish; + } + } + } + + if (!arg_follow) { + if (n_shown == 0 && !arg_quiet) + printf("-- No entries --\n"); + break; + } + + fflush(stdout); + + r = wait_for_change(j, poll_fd); + if (r < 0) + goto finish; + + first_line = false; + } + + if (arg_show_cursor || arg_cursor_file) { + _cleanup_free_ char *cursor = NULL; + + r = sd_journal_get_cursor(j, &cursor); + if (r < 0 && r != -EADDRNOTAVAIL) + log_error_errno(r, "Failed to get cursor: %m"); + else if (r >= 0) { + if (arg_show_cursor) + printf("-- cursor: %s\n", cursor); + + if (arg_cursor_file) { + r = write_string_file(arg_cursor_file, cursor, + WRITE_STRING_FILE_CREATE | + WRITE_STRING_FILE_ATOMIC); + if (r < 0) + log_error_errno(r, + "Failed to write new cursor to %s: %m", + arg_cursor_file); + } + } + } + +finish: + pager_close(); + + strv_free(arg_file); + + set_free(arg_facilities); + strv_free(arg_syslog_identifier); + strv_free(arg_system_units); + strv_free(arg_user_units); + strv_free(arg_output_fields); + + free(arg_root); + free(arg_verify_key); + +#if HAVE_PCRE2 + if (arg_compiled_pattern) { + sym_pcre2_code_free(arg_compiled_pattern); + + /* --grep was used, no error was thrown, but the pattern didn't + * match anything. Let's mimic grep's behavior here and return + * a non-zero exit code, so journalctl --grep can be used + * in scripts and such */ + if (r == 0 && n_shown == 0) + r = -ENOENT; + } +#endif + + return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/journal/journald-audit.c b/src/journal/journald-audit.c new file mode 100644 index 0000000..744f750 --- /dev/null +++ b/src/journal/journald-audit.c @@ -0,0 +1,555 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "audit-type.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "journald-audit.h" +#include "missing_audit.h" +#include "string-util.h" + +typedef struct MapField { + const char *audit_field; + const char *journal_field; + int (*map)(const char *field, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov); +} MapField; + +static int map_simple_field(const char *field, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov) { + _cleanup_free_ char *c = NULL; + size_t l = 0, allocated = 0; + const char *e; + + assert(field); + assert(p); + assert(iov); + assert(n_iov); + + l = strlen(field); + allocated = l + 1; + c = malloc(allocated); + if (!c) + return -ENOMEM; + + memcpy(c, field, l); + for (e = *p; !IN_SET(*e, 0, ' '); e++) { + if (!GREEDY_REALLOC(c, allocated, l+2)) + return -ENOMEM; + + c[l++] = *e; + } + + c[l] = 0; + + if (!GREEDY_REALLOC(*iov, *n_iov_allocated, *n_iov + 1)) + return -ENOMEM; + + (*iov)[(*n_iov)++] = IOVEC_MAKE(c, l); + + *p = e; + c = NULL; + + return 1; +} + +static int map_string_field_internal(const char *field, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov, bool filter_printable) { + _cleanup_free_ char *c = NULL; + const char *s, *e; + size_t l; + + assert(field); + assert(p); + assert(iov); + assert(n_iov); + + /* The kernel formats string fields in one of two formats. */ + + if (**p == '"') { + /* Normal quoted syntax */ + s = *p + 1; + e = strchr(s, '"'); + if (!e) + return 0; + + l = strlen(field) + (e - s); + c = malloc(l+1); + if (!c) + return -ENOMEM; + + *((char*) mempcpy(stpcpy(c, field), s, e - s)) = 0; + + e += 1; + + } else if (unhexchar(**p) >= 0) { + /* Hexadecimal escaping */ + size_t allocated = 0; + + l = strlen(field); + allocated = l + 2; + c = malloc(allocated); + if (!c) + return -ENOMEM; + + memcpy(c, field, l); + for (e = *p; !IN_SET(*e, 0, ' '); e += 2) { + int a, b; + uint8_t x; + + a = unhexchar(e[0]); + if (a < 0) + return 0; + + b = unhexchar(e[1]); + if (b < 0) + return 0; + + x = ((uint8_t) a << 4 | (uint8_t) b); + + if (filter_printable && x < (uint8_t) ' ') + x = (uint8_t) ' '; + + if (!GREEDY_REALLOC(c, allocated, l+2)) + return -ENOMEM; + + c[l++] = (char) x; + } + + c[l] = 0; + } else + return 0; + + if (!GREEDY_REALLOC(*iov, *n_iov_allocated, *n_iov + 1)) + return -ENOMEM; + + (*iov)[(*n_iov)++] = IOVEC_MAKE(c, l); + + *p = e; + c = NULL; + + return 1; +} + +static int map_string_field(const char *field, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov) { + return map_string_field_internal(field, p, iov, n_iov_allocated, n_iov, false); +} + +static int map_string_field_printable(const char *field, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov) { + return map_string_field_internal(field, p, iov, n_iov_allocated, n_iov, true); +} + +static int map_generic_field(const char *prefix, const char **p, struct iovec **iov, size_t *n_iov_allocated, size_t *n_iov) { + const char *e, *f; + char *c, *t; + int r; + + /* Implements fallback mappings for all fields we don't know */ + + for (e = *p; e < *p + 16; e++) { + + if (IN_SET(*e, 0, ' ')) + return 0; + + if (*e == '=') + break; + + if (!((*e >= 'a' && *e <= 'z') || + (*e >= 'A' && *e <= 'Z') || + (*e >= '0' && *e <= '9') || + IN_SET(*e, '_', '-'))) + return 0; + } + + if (e <= *p || e >= *p + 16) + return 0; + + c = newa(char, strlen(prefix) + (e - *p) + 2); + + t = stpcpy(c, prefix); + for (f = *p; f < e; f++) { + char x; + + if (*f >= 'a' && *f <= 'z') + x = (*f - 'a') + 'A'; /* uppercase */ + else if (*f == '-') + x = '_'; /* dashes → underscores */ + else + x = *f; + + *(t++) = x; + } + strcpy(t, "="); + + e++; + + r = map_simple_field(c, &e, iov, n_iov_allocated, n_iov); + if (r < 0) + return r; + + *p = e; + return r; +} + +/* Kernel fields are those occurring in the audit string before + * msg='. All of these fields are trusted, hence carry the "_" prefix. + * We try to translate the fields we know into our native names. The + * other's are generically mapped to _AUDIT_FIELD_XYZ= */ +static const MapField map_fields_kernel[] = { + + /* First, we map certain well-known audit fields into native + * well-known fields */ + { "pid=", "_PID=", map_simple_field }, + { "ppid=", "_PPID=", map_simple_field }, + { "uid=", "_UID=", map_simple_field }, + { "euid=", "_EUID=", map_simple_field }, + { "fsuid=", "_FSUID=", map_simple_field }, + { "gid=", "_GID=", map_simple_field }, + { "egid=", "_EGID=", map_simple_field }, + { "fsgid=", "_FSGID=", map_simple_field }, + { "tty=", "_TTY=", map_simple_field }, + { "ses=", "_AUDIT_SESSION=", map_simple_field }, + { "auid=", "_AUDIT_LOGINUID=", map_simple_field }, + { "subj=", "_SELINUX_CONTEXT=", map_simple_field }, + { "comm=", "_COMM=", map_string_field }, + { "exe=", "_EXE=", map_string_field }, + { "proctitle=", "_CMDLINE=", map_string_field_printable }, + + /* Some fields don't map to native well-known fields. However, + * we know that they are string fields, hence let's undo + * string field escaping for them, though we stick to the + * generic field names. */ + { "path=", "_AUDIT_FIELD_PATH=", map_string_field }, + { "dev=", "_AUDIT_FIELD_DEV=", map_string_field }, + { "name=", "_AUDIT_FIELD_NAME=", map_string_field }, + {} +}; + +/* Userspace fields are those occurring in the audit string after + * msg='. All of these fields are untrusted, hence carry no "_" + * prefix. We map the fields we don't know to AUDIT_FIELD_XYZ= */ +static const MapField map_fields_userspace[] = { + { "cwd=", "AUDIT_FIELD_CWD=", map_string_field }, + { "cmd=", "AUDIT_FIELD_CMD=", map_string_field }, + { "acct=", "AUDIT_FIELD_ACCT=", map_string_field }, + { "exe=", "AUDIT_FIELD_EXE=", map_string_field }, + { "comm=", "AUDIT_FIELD_COMM=", map_string_field }, + {} +}; + +static int map_all_fields( + const char *p, + const MapField map_fields[], + const char *prefix, + bool handle_msg, + struct iovec **iov, + size_t *n_iov_allocated, + size_t *n_iov) { + + int r; + + assert(p); + assert(iov); + assert(n_iov_allocated); + assert(n_iov); + + for (;;) { + bool mapped = false; + const MapField *m; + const char *v; + + p += strspn(p, WHITESPACE); + + if (*p == 0) + return 0; + + if (handle_msg) { + v = startswith(p, "msg='"); + if (v) { + _cleanup_free_ char *c = NULL; + const char *e; + + /* Userspace message. It's enclosed in + simple quotation marks, is not + escaped, but the last field in the + line, hence let's remove the + quotation mark, and apply the + userspace mapping instead of the + kernel mapping. */ + + e = endswith(v, "'"); + if (!e) + return 0; /* don't continue splitting up if the final quotation mark is missing */ + + c = strndup(v, e - v); + if (!c) + return -ENOMEM; + + return map_all_fields(c, map_fields_userspace, "AUDIT_FIELD_", false, iov, n_iov_allocated, n_iov); + } + } + + /* Try to map the kernel fields to our own names */ + for (m = map_fields; m->audit_field; m++) { + v = startswith(p, m->audit_field); + if (!v) + continue; + + r = m->map(m->journal_field, &v, iov, n_iov_allocated, n_iov); + if (r < 0) + return log_debug_errno(r, "Failed to parse audit array: %m"); + + if (r > 0) { + mapped = true; + p = v; + break; + } + } + + if (!mapped) { + r = map_generic_field(prefix, &p, iov, n_iov_allocated, n_iov); + if (r < 0) + return log_debug_errno(r, "Failed to parse audit array: %m"); + + if (r == 0) + /* Couldn't process as generic field, let's just skip over it */ + p += strcspn(p, WHITESPACE); + } + } +} + +void process_audit_string(Server *s, int type, const char *data, size_t size) { + size_t n_iov_allocated = 0, n_iov = 0, z; + _cleanup_free_ struct iovec *iov = NULL; + uint64_t seconds, msec, id; + const char *p, *type_name; + char id_field[sizeof("_AUDIT_ID=") + DECIMAL_STR_MAX(uint64_t)], + type_field[sizeof("_AUDIT_TYPE=") + DECIMAL_STR_MAX(int)], + source_time_field[sizeof("_SOURCE_REALTIME_TIMESTAMP=") + DECIMAL_STR_MAX(usec_t)]; + char *m, *type_field_name; + int k; + + assert(s); + + if (size <= 0) + return; + + if (!data) + return; + + /* Note that the input buffer is NUL terminated, but let's + * check whether there is a spurious NUL byte */ + if (memchr(data, 0, size)) + return; + + p = startswith(data, "audit"); + if (!p) + return; + + k = 0; + if (sscanf(p, "(%" PRIu64 ".%" PRIu64 ":%" PRIu64 "):%n", + &seconds, + &msec, + &id, + &k) != 3 || k == 0) + return; + + p += k; + p += strspn(p, WHITESPACE); + + if (isempty(p)) + return; + + n_iov_allocated = N_IOVEC_META_FIELDS + 8; + iov = new(struct iovec, n_iov_allocated); + if (!iov) { + log_oom(); + return; + } + + iov[n_iov++] = IOVEC_MAKE_STRING("_TRANSPORT=audit"); + + sprintf(source_time_field, "_SOURCE_REALTIME_TIMESTAMP=%" PRIu64, + (usec_t) seconds * USEC_PER_SEC + (usec_t) msec * USEC_PER_MSEC); + iov[n_iov++] = IOVEC_MAKE_STRING(source_time_field); + + sprintf(type_field, "_AUDIT_TYPE=%i", type); + iov[n_iov++] = IOVEC_MAKE_STRING(type_field); + + sprintf(id_field, "_AUDIT_ID=%" PRIu64, id); + iov[n_iov++] = IOVEC_MAKE_STRING(id_field); + + assert_cc(4 == LOG_FAC(LOG_AUTH)); + iov[n_iov++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=4"); + iov[n_iov++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=audit"); + + type_name = audit_type_name_alloca(type); + + type_field_name = strjoina("_AUDIT_TYPE_NAME=", type_name); + iov[n_iov++] = IOVEC_MAKE_STRING(type_field_name); + + m = strjoina("MESSAGE=", type_name, " ", p); + iov[n_iov++] = IOVEC_MAKE_STRING(m); + + z = n_iov; + + map_all_fields(p, map_fields_kernel, "_AUDIT_FIELD_", true, &iov, &n_iov_allocated, &n_iov); + + if (!GREEDY_REALLOC(iov, n_iov_allocated, n_iov + N_IOVEC_META_FIELDS)) { + log_oom(); + goto finish; + } + + server_dispatch_message(s, iov, n_iov, n_iov_allocated, NULL, NULL, LOG_NOTICE, 0); + +finish: + /* free() all entries that map_all_fields() added. All others + * are allocated on the stack or are constant. */ + + for (; z < n_iov; z++) + free(iov[z].iov_base); +} + +void server_process_audit_message( + Server *s, + const void *buffer, + size_t buffer_size, + const struct ucred *ucred, + const union sockaddr_union *sa, + socklen_t salen) { + + const struct nlmsghdr *nl = buffer; + + assert(s); + + if (buffer_size < ALIGN(sizeof(struct nlmsghdr))) + return; + + assert(buffer); + + /* Filter out fake data */ + if (!sa || + salen != sizeof(struct sockaddr_nl) || + sa->nl.nl_family != AF_NETLINK || + sa->nl.nl_pid != 0) { + log_debug("Audit netlink message from invalid sender."); + return; + } + + if (!ucred || ucred->pid != 0) { + log_debug("Audit netlink message with invalid credentials."); + return; + } + + if (!NLMSG_OK(nl, buffer_size)) { + log_error("Audit netlink message truncated."); + return; + } + + /* Ignore special Netlink messages */ + if (IN_SET(nl->nlmsg_type, NLMSG_NOOP, NLMSG_ERROR)) + return; + + /* Except AUDIT_USER, all messages below AUDIT_FIRST_USER_MSG are control messages, let's ignore those */ + if (nl->nlmsg_type < AUDIT_FIRST_USER_MSG && nl->nlmsg_type != AUDIT_USER) + return; + + process_audit_string(s, nl->nlmsg_type, NLMSG_DATA(nl), nl->nlmsg_len - ALIGN(sizeof(struct nlmsghdr))); +} + +static int enable_audit(int fd, bool b) { + struct { + union { + struct nlmsghdr header; + uint8_t header_space[NLMSG_HDRLEN]; + }; + struct audit_status body; + } _packed_ request = { + .header.nlmsg_len = NLMSG_LENGTH(sizeof(struct audit_status)), + .header.nlmsg_type = AUDIT_SET, + .header.nlmsg_flags = NLM_F_REQUEST, + .header.nlmsg_seq = 1, + .header.nlmsg_pid = 0, + .body.mask = AUDIT_STATUS_ENABLED, + .body.enabled = b, + }; + union sockaddr_union sa = { + .nl.nl_family = AF_NETLINK, + .nl.nl_pid = 0, + }; + struct iovec iovec = { + .iov_base = &request, + .iov_len = NLMSG_LENGTH(sizeof(struct audit_status)), + }; + struct msghdr mh = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa.nl), + }; + + ssize_t n; + + n = sendmsg(fd, &mh, MSG_NOSIGNAL); + if (n < 0) + return -errno; + if (n != NLMSG_LENGTH(sizeof(struct audit_status))) + return -EIO; + + /* We don't wait for the result here, we can't do anything + * about it anyway */ + + return 0; +} + +int server_open_audit(Server *s) { + int r; + + if (s->audit_fd < 0) { + static const union sockaddr_union sa = { + .nl.nl_family = AF_NETLINK, + .nl.nl_pid = 0, + .nl.nl_groups = AUDIT_NLGRP_READLOG, + }; + + s->audit_fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_AUDIT); + if (s->audit_fd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + log_debug("Audit not supported in the kernel."); + else + log_warning_errno(errno, "Failed to create audit socket, ignoring: %m"); + + return 0; + } + + if (bind(s->audit_fd, &sa.sa, sizeof(sa.nl)) < 0) { + log_warning_errno(errno, + "Failed to join audit multicast group. " + "The kernel is probably too old or multicast reading is not supported. " + "Ignoring: %m"); + s->audit_fd = safe_close(s->audit_fd); + return 0; + } + } else + (void) fd_nonblock(s->audit_fd, true); + + r = setsockopt_int(s->audit_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "Failed to set SO_PASSCRED on audit socket: %m"); + + r = sd_event_add_io(s->event, &s->audit_event_source, s->audit_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add audit fd to event loop: %m"); + + if (s->set_audit >= 0) { + /* We are listening now, try to enable audit if configured so */ + r = enable_audit(s->audit_fd, s->set_audit); + if (r < 0) + log_warning_errno(r, "Failed to issue audit enable call: %m"); + else if (s->set_audit > 0) + log_debug("Auditing in kernel turned on."); + else + log_debug("Auditing in kernel turned off."); + } + + return 0; +} diff --git a/src/journal/journald-audit.h b/src/journal/journald-audit.h new file mode 100644 index 0000000..79f3da9 --- /dev/null +++ b/src/journal/journald-audit.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" +#include "socket-util.h" + +void server_process_audit_message(Server *s, const void *buffer, size_t buffer_size, const struct ucred *ucred, const union sockaddr_union *sa, socklen_t salen); + +void process_audit_string(Server *s, int type, const char *data, size_t size); + +int server_open_audit(Server *s); diff --git a/src/journal/journald-console.c b/src/journal/journald-console.c new file mode 100644 index 0000000..2035e2d --- /dev/null +++ b/src/journal/journald-console.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/socket.h> +#include <time.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "io-util.h" +#include "journald-console.h" +#include "journald-server.h" +#include "parse-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "terminal-util.h" + +static bool prefix_timestamp(void) { + + static int cached_printk_time = -1; + + if (_unlikely_(cached_printk_time < 0)) { + _cleanup_free_ char *p = NULL; + + cached_printk_time = + read_one_line_file("/sys/module/printk/parameters/time", &p) >= 0 + && parse_boolean(p) > 0; + } + + return cached_printk_time; +} + +void server_forward_console( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + struct iovec iovec[5]; + struct timespec ts; + char tbuf[STRLEN("[] ") + DECIMAL_STR_MAX(ts.tv_sec) + DECIMAL_STR_MAX(ts.tv_nsec)-3 + 1]; + char header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t)]; + _cleanup_free_ char *ident_buf = NULL; + _cleanup_close_ int fd = -1; + const char *tty; + int n = 0; + + assert(s); + assert(message); + + if (LOG_PRI(priority) > s->max_level_console) + return; + + /* First: timestamp */ + if (prefix_timestamp()) { + assert_se(clock_gettime(CLOCK_MONOTONIC, &ts) == 0); + xsprintf(tbuf, "[%5"PRI_TIME".%06"PRI_NSEC"] ", + ts.tv_sec, + (nsec_t)ts.tv_nsec / 1000); + + iovec[n++] = IOVEC_MAKE_STRING(tbuf); + } + + /* Second: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) get_process_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + /* Fourth: message */ + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + tty = s->tty_path ?: "/dev/console"; + + /* Before you ask: yes, on purpose we open/close the console for each log line we write individually. This is a + * good strategy to avoid journald getting killed by the kernel's SAK concept (it doesn't fix this entirely, + * but minimizes the time window the kernel might end up killing journald due to SAK). It also makes things + * easier for us so that we don't have to recover from hangups and suchlike triggered on the console. */ + + fd = open_terminal(tty, O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) { + log_debug_errno(fd, "Failed to open %s for logging: %m", tty); + return; + } + + if (writev(fd, iovec, n) < 0) + log_debug_errno(errno, "Failed to write to %s for logging: %m", tty); +} diff --git a/src/journal/journald-console.h b/src/journal/journald-console.h new file mode 100644 index 0000000..0a26f9c --- /dev/null +++ b/src/journal/journald-console.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +void server_forward_console(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); diff --git a/src/journal/journald-context.c b/src/journal/journald-context.c new file mode 100644 index 0000000..8736495 --- /dev/null +++ b/src/journal/journald-context.c @@ -0,0 +1,792 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#include "alloc-util.h" +#include "audit-util.h" +#include "cgroup-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "journal-util.h" +#include "journald-context.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "unaligned.h" +#include "user-util.h" + +/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc + * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we + * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the + * log entry was originally created. We hence just increase the "window of inaccuracy" a bit. + * + * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed + * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s + * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are + * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not + * flushed out). Data newer than 1s is used immediately without refresh. + * + * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry + * as long as their socket is connected. Note that cache entries are shared between different transports. That means a + * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols. + * + * Caching metadata like this has two major benefits: + * + * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood. + * + * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache + * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating + * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a + * stream connection. This should improve cases where a service process logs immediately before exiting and we + * previously had trouble associating the log message with the service. + * + * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of + * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older + * and sometimes slightly newer than what was current at the log event). + */ + +/* We refresh every 1s */ +#define REFRESH_USEC (1*USEC_PER_SEC) + +/* Data older than 5s we flush out */ +#define MAX_USEC (5*USEC_PER_SEC) + +/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in + * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream + * clients itself is limited.) */ +#define CACHE_MAX_FALLBACK 128U +#define CACHE_MAX_MAX (16*1024U) +#define CACHE_MAX_MIN 64U + +static size_t cache_max(void) { + static size_t cached = -1; + + if (cached == (size_t) -1) { + uint64_t mem_total; + int r; + + r = procfs_memory_get(&mem_total, NULL); + if (r < 0) { + log_warning_errno(r, "Cannot query /proc/meminfo for MemTotal: %m"); + cached = CACHE_MAX_FALLBACK; + } else + /* Cache entries are usually a few kB, but the process cmdline is controlled by the + * user and can be up to _SC_ARG_MAX, usually 2MB. Let's say that approximately up to + * 1/8th of memory may be used by the cache. + * + * In the common case, this formula gives 64 cache entries for each GB of RAM. + */ + cached = CLAMP(mem_total / 8 / sc_arg_max(), CACHE_MAX_MIN, CACHE_MAX_MAX); + } + + return cached; +} + +static int client_context_compare(const void *a, const void *b) { + const ClientContext *x = a, *y = b; + int r; + + r = CMP(x->timestamp, y->timestamp); + if (r != 0) + return r; + + return CMP(x->pid, y->pid); +} + +static int client_context_new(Server *s, pid_t pid, ClientContext **ret) { + ClientContext *c; + int r; + + assert(s); + assert(pid_is_valid(pid)); + assert(ret); + + r = hashmap_ensure_allocated(&s->client_contexts, NULL); + if (r < 0) + return r; + + r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare); + if (r < 0) + return r; + + c = new(ClientContext, 1); + if (!c) + return -ENOMEM; + + *c = (ClientContext) { + .pid = pid, + .uid = UID_INVALID, + .gid = GID_INVALID, + .auditid = AUDIT_SESSION_INVALID, + .loginuid = UID_INVALID, + .owner_uid = UID_INVALID, + .lru_index = PRIOQ_IDX_NULL, + .timestamp = USEC_INFINITY, + .extra_fields_mtime = NSEC_INFINITY, + .log_level_max = -1, + .log_ratelimit_interval = s->ratelimit_interval, + .log_ratelimit_burst = s->ratelimit_burst, + }; + + r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c); + if (r < 0) { + free(c); + return r; + } + + *ret = c; + return 0; +} + +static void client_context_reset(Server *s, ClientContext *c) { + assert(s); + assert(c); + + c->timestamp = USEC_INFINITY; + + c->uid = UID_INVALID; + c->gid = GID_INVALID; + + c->comm = mfree(c->comm); + c->exe = mfree(c->exe); + c->cmdline = mfree(c->cmdline); + c->capeff = mfree(c->capeff); + + c->auditid = AUDIT_SESSION_INVALID; + c->loginuid = UID_INVALID; + + c->cgroup = mfree(c->cgroup); + c->session = mfree(c->session); + c->owner_uid = UID_INVALID; + c->unit = mfree(c->unit); + c->user_unit = mfree(c->user_unit); + c->slice = mfree(c->slice); + c->user_slice = mfree(c->user_slice); + + c->invocation_id = SD_ID128_NULL; + + c->label = mfree(c->label); + c->label_size = 0; + + c->extra_fields_iovec = mfree(c->extra_fields_iovec); + c->extra_fields_n_iovec = 0; + c->extra_fields_data = mfree(c->extra_fields_data); + c->extra_fields_mtime = NSEC_INFINITY; + + c->log_level_max = -1; + + c->log_ratelimit_interval = s->ratelimit_interval; + c->log_ratelimit_burst = s->ratelimit_burst; +} + +static ClientContext* client_context_free(Server *s, ClientContext *c) { + assert(s); + + if (!c) + return NULL; + + assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c); + + if (c->in_lru) + assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0); + + client_context_reset(s, c); + + return mfree(c); +} + +static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) { + assert(c); + assert(pid_is_valid(c->pid)); + + /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */ + if (ucred && uid_is_valid(ucred->uid)) + c->uid = ucred->uid; + else + (void) get_process_uid(c->pid, &c->uid); + + if (ucred && gid_is_valid(ucred->gid)) + c->gid = ucred->gid; + else + (void) get_process_gid(c->pid, &c->gid); +} + +static void client_context_read_basic(ClientContext *c) { + char *t; + + assert(c); + assert(pid_is_valid(c->pid)); + + if (get_process_comm(c->pid, &t) >= 0) + free_and_replace(c->comm, t); + + if (get_process_exe(c->pid, &t) >= 0) + free_and_replace(c->exe, t); + + if (get_process_cmdline(c->pid, SIZE_MAX, 0, &t) >= 0) + free_and_replace(c->cmdline, t); + + if (get_process_capeff(c->pid, &t) >= 0) + free_and_replace(c->capeff, t); +} + +static int client_context_read_label( + ClientContext *c, + const char *label, size_t label_size) { + + assert(c); + assert(pid_is_valid(c->pid)); + assert(label_size == 0 || label); + + if (label_size > 0) { + char *l; + + /* If we got an SELinux label passed in it counts. */ + + l = newdup_suffix0(char, label, label_size); + if (!l) + return -ENOMEM; + + free_and_replace(c->label, l); + c->label_size = label_size; + } +#if HAVE_SELINUX + else { + char *con; + + /* If we got no SELinux label passed in, let's try to acquire one */ + + if (getpidcon(c->pid, &con) >= 0) { + free_and_replace(c->label, con); + c->label_size = strlen(c->label); + } + } +#endif + + return 0; +} + +static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) { + _cleanup_free_ char *t = NULL; + int r; + + assert(c); + + /* Try to acquire the current cgroup path */ + r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t); + if (r < 0 || empty_or_root(t)) { + /* We use the unit ID passed in as fallback if we have nothing cached yet and cg_pid_get_path_shifted() + * failed or process is running in a root cgroup. Zombie processes are automatically migrated to root cgroup + * on cgroup v1 and we want to be able to map log messages from them too. */ + if (unit_id && !c->unit) { + c->unit = strdup(unit_id); + if (c->unit) + return 0; + } + + return r; + } + + /* Let's shortcut this if the cgroup path didn't change */ + if (streq_ptr(c->cgroup, t)) + return 0; + + free_and_replace(c->cgroup, t); + + (void) cg_path_get_session(c->cgroup, &t); + free_and_replace(c->session, t); + + if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0) + c->owner_uid = UID_INVALID; + + (void) cg_path_get_unit(c->cgroup, &t); + free_and_replace(c->unit, t); + + (void) cg_path_get_user_unit(c->cgroup, &t); + free_and_replace(c->user_unit, t); + + (void) cg_path_get_slice(c->cgroup, &t); + free_and_replace(c->slice, t); + + (void) cg_path_get_user_slice(c->cgroup, &t); + free_and_replace(c->user_slice, t); + + return 0; +} + +static int client_context_read_invocation_id( + Server *s, + ClientContext *c) { + + _cleanup_free_ char *p = NULL, *value = NULL; + int r; + + assert(s); + assert(c); + + /* Read the invocation ID of a unit off a unit. + * PID 1 stores it in a per-unit symlink in /run/systemd/units/ + * User managers store it in a per-unit symlink under /run/user/<uid>/systemd/units/ */ + + if (!c->unit) + return 0; + + if (c->user_unit) { + r = asprintf(&p, "/run/user/" UID_FMT "/systemd/units/invocation:%s", c->owner_uid, c->user_unit); + if (r < 0) + return r; + } else { + p = strjoin("/run/systemd/units/invocation:", c->unit); + if (!p) + return -ENOMEM; + } + + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return sd_id128_from_string(value, &c->invocation_id); +} + +static int client_context_read_log_level_max( + Server *s, + ClientContext *c) { + + _cleanup_free_ char *value = NULL; + const char *p; + int r, ll; + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-level-max:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + ll = log_level_from_string(value); + if (ll < 0) + return -EINVAL; + + c->log_level_max = ll; + return 0; +} + +static int client_context_read_extra_fields( + Server *s, + ClientContext *c) { + + size_t size = 0, n_iovec = 0, n_allocated = 0, left; + _cleanup_free_ struct iovec *iovec = NULL; + _cleanup_free_ void *data = NULL; + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + const char *p; + uint8_t *q; + int r; + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-extra-fields:", c->unit); + + if (c->extra_fields_mtime != NSEC_INFINITY) { + if (stat(p, &st) < 0) { + if (errno == ENOENT) + return 0; + + return -errno; + } + + if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime) + return 0; + } + + f = fopen(p, "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return -errno; + } + + if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new + * one, that matches the stuff we are reading */ + return -errno; + + r = read_full_stream(f, (char**) &data, &size); + if (r < 0) + return r; + + q = data, left = size; + while (left > 0) { + uint8_t *field, *eq; + uint64_t v, n; + + if (left < sizeof(uint64_t)) + return -EBADMSG; + + v = unaligned_read_le64(q); + if (v < 2) + return -EBADMSG; + + n = sizeof(uint64_t) + v; + if (left < n) + return -EBADMSG; + + field = q + sizeof(uint64_t); + + eq = memchr(field, '=', v); + if (!eq) + return -EBADMSG; + + if (!journal_field_valid((const char *) field, eq - field, false)) + return -EBADMSG; + + if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1)) + return -ENOMEM; + + iovec[n_iovec++] = IOVEC_MAKE(field, v); + + left -= n, q += n; + } + + free(c->extra_fields_iovec); + free(c->extra_fields_data); + + c->extra_fields_iovec = TAKE_PTR(iovec); + c->extra_fields_n_iovec = n_iovec; + c->extra_fields_data = TAKE_PTR(data); + c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim); + + return 0; +} + +static int client_context_read_log_ratelimit_interval(ClientContext *c) { + _cleanup_free_ char *value = NULL; + const char *p; + int r; + + assert(c); + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-interval:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return safe_atou64(value, &c->log_ratelimit_interval); +} + +static int client_context_read_log_ratelimit_burst(ClientContext *c) { + _cleanup_free_ char *value = NULL; + const char *p; + int r; + + assert(c); + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-burst:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return safe_atou(value, &c->log_ratelimit_burst); +} + +static void client_context_really_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t timestamp) { + + assert(s); + assert(c); + assert(pid_is_valid(c->pid)); + + if (timestamp == USEC_INFINITY) + timestamp = now(CLOCK_MONOTONIC); + + client_context_read_uid_gid(c, ucred); + client_context_read_basic(c); + (void) client_context_read_label(c, label, label_size); + + (void) audit_session_from_pid(c->pid, &c->auditid); + (void) audit_loginuid_from_pid(c->pid, &c->loginuid); + + (void) client_context_read_cgroup(s, c, unit_id); + (void) client_context_read_invocation_id(s, c); + (void) client_context_read_log_level_max(s, c); + (void) client_context_read_extra_fields(s, c); + (void) client_context_read_log_ratelimit_interval(c); + (void) client_context_read_log_ratelimit_burst(c); + + c->timestamp = timestamp; + + if (c->in_lru) { + assert(c->n_ref == 0); + assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0); + } +} + +void client_context_maybe_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t timestamp) { + + assert(s); + assert(c); + + if (timestamp == USEC_INFINITY) + timestamp = now(CLOCK_MONOTONIC); + + /* No cached data so far? Let's fill it up */ + if (c->timestamp == USEC_INFINITY) + goto refresh; + + /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out + * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */ + if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) { + client_context_reset(s, c); + goto refresh; + } + + /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */ + if (c->timestamp + REFRESH_USEC < timestamp) + goto refresh; + + /* If the data passed along doesn't match the cached data we also do a refresh */ + if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid) + goto refresh; + + if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid) + goto refresh; + + if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0)) + goto refresh; + + return; + +refresh: + client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp); +} + +static void client_context_try_shrink_to(Server *s, size_t limit) { + ClientContext *c; + usec_t t; + + assert(s); + + /* Flush any cache entries for PIDs that have already moved on. Don't do this + * too often, since it's a slow process. */ + t = now(CLOCK_MONOTONIC); + if (s->last_cache_pid_flush + MAX_USEC < t) { + unsigned n = prioq_size(s->client_contexts_lru), idx = 0; + + /* We do a number of iterations based on the initial size of the prioq. When we remove an + * item, a new item is moved into its places, and items to the right might be reshuffled. + */ + for (unsigned i = 0; i < n; i++) { + c = prioq_peek_by_index(s->client_contexts_lru, idx); + + assert(c->n_ref == 0); + + if (!pid_is_unwaited(c->pid)) + client_context_free(s, c); + else + idx ++; + } + + s->last_cache_pid_flush = t; + } + + /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without + * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of + * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */ + + while (hashmap_size(s->client_contexts) > limit) { + c = prioq_pop(s->client_contexts_lru); + if (!c) + break; /* All remaining entries are pinned, give up */ + + assert(c->in_lru); + assert(c->n_ref == 0); + + c->in_lru = false; + + client_context_free(s, c); + } +} + +void client_context_flush_all(Server *s) { + assert(s); + + /* Flush out all remaining entries. This assumes all references are already dropped. */ + + s->my_context = client_context_release(s, s->my_context); + s->pid1_context = client_context_release(s, s->pid1_context); + + client_context_try_shrink_to(s, 0); + + assert(prioq_size(s->client_contexts_lru) == 0); + assert(hashmap_size(s->client_contexts) == 0); + + s->client_contexts_lru = prioq_free(s->client_contexts_lru); + s->client_contexts = hashmap_free(s->client_contexts); +} + +static int client_context_get_internal( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + bool add_ref, + ClientContext **ret) { + + ClientContext *c; + int r; + + assert(s); + assert(ret); + + if (!pid_is_valid(pid)) + return -EINVAL; + + c = hashmap_get(s->client_contexts, PID_TO_PTR(pid)); + if (c) { + + if (add_ref) { + if (c->in_lru) { + /* The entry wasn't pinned so far, let's remove it from the LRU list then */ + assert(c->n_ref == 0); + assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0); + c->in_lru = false; + } + + c->n_ref++; + } + + client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY); + + *ret = c; + return 0; + } + + client_context_try_shrink_to(s, cache_max()-1); + + r = client_context_new(s, pid, &c); + if (r < 0) + return r; + + if (add_ref) + c->n_ref++; + else { + r = prioq_put(s->client_contexts_lru, c, &c->lru_index); + if (r < 0) { + client_context_free(s, c); + return r; + } + + c->in_lru = true; + } + + client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY); + + *ret = c; + return 0; +} + +int client_context_get( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret) { + + return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret); +} + +int client_context_acquire( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret) { + + return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret); +}; + +ClientContext *client_context_release(Server *s, ClientContext *c) { + assert(s); + + if (!c) + return NULL; + + assert(c->n_ref > 0); + assert(!c->in_lru); + + c->n_ref--; + if (c->n_ref > 0) + return NULL; + + /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it + * right-away */ + + if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0) + client_context_free(s, c); + else + c->in_lru = true; + + return NULL; +} + +void client_context_acquire_default(Server *s) { + int r; + + assert(s); + + /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to + * generate driver messages. */ + + if (!s->my_context) { + struct ucred ucred = { + .pid = getpid_cached(), + .uid = getuid(), + .gid = getgid(), + }; + + r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context); + if (r < 0) + log_warning_errno(r, "Failed to acquire our own context, ignoring: %m"); + } + + if (!s->namespace && !s->pid1_context) { + /* Acquire PID1's context, but only if we are in non-namespaced mode, since PID 1 is only + * going to log to the non-namespaced journal instance. */ + + r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context); + if (r < 0) + log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m"); + + } +} diff --git a/src/journal/journald-context.h b/src/journal/journald-context.h new file mode 100644 index 0000000..9bf74b2 --- /dev/null +++ b/src/journal/journald-context.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "sd-id128.h" + +#include "time-util.h" + +typedef struct ClientContext ClientContext; + +#include "journald-server.h" + +struct ClientContext { + unsigned n_ref; + unsigned lru_index; + usec_t timestamp; + bool in_lru; + + pid_t pid; + uid_t uid; + gid_t gid; + + char *comm; + char *exe; + char *cmdline; + char *capeff; + + uint32_t auditid; + uid_t loginuid; + + char *cgroup; + char *session; + uid_t owner_uid; + + char *unit; + char *user_unit; + + char *slice; + char *user_slice; + + sd_id128_t invocation_id; + + char *label; + size_t label_size; + + int log_level_max; + + struct iovec *extra_fields_iovec; + size_t extra_fields_n_iovec; + void *extra_fields_data; + nsec_t extra_fields_mtime; + + usec_t log_ratelimit_interval; + unsigned log_ratelimit_burst; +}; + +int client_context_get( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret); + +int client_context_acquire( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret); + +ClientContext* client_context_release(Server *s, ClientContext *c); + +void client_context_maybe_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t tstamp); + +void client_context_acquire_default(Server *s); +void client_context_flush_all(Server *s); + +static inline size_t client_context_extra_fields_n_iovec(const ClientContext *c) { + return c ? c->extra_fields_n_iovec : 0; +} + +static inline bool client_context_test_priority(const ClientContext *c, int priority) { + if (!c) + return true; + + if (c->log_level_max < 0) + return true; + + return LOG_PRI(priority) <= c->log_level_max; +} diff --git a/src/journal/journald-gperf.gperf b/src/journal/journald-gperf.gperf new file mode 100644 index 0000000..c70ac9a --- /dev/null +++ b/src/journal/journald-gperf.gperf @@ -0,0 +1,52 @@ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include <stddef.h> +#include <sys/socket.h> +#include "conf-parser.h" +#include "journald-server.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name journald_gperf_hash +%define lookup-function-name journald_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Journal.Storage, config_parse_storage, 0, offsetof(Server, storage) +Journal.Compress, config_parse_compress, 0, offsetof(Server, compress) +Journal.Seal, config_parse_bool, 0, offsetof(Server, seal) +Journal.ReadKMsg, config_parse_bool, 0, offsetof(Server, read_kmsg) +Journal.Audit, config_parse_tristate, 0, offsetof(Server, set_audit) +Journal.SyncIntervalSec, config_parse_sec, 0, offsetof(Server, sync_interval_usec) +# The following is a legacy name for compatibility +Journal.RateLimitInterval, config_parse_sec, 0, offsetof(Server, ratelimit_interval) +Journal.RateLimitIntervalSec,config_parse_sec, 0, offsetof(Server, ratelimit_interval) +Journal.RateLimitBurst, config_parse_unsigned, 0, offsetof(Server, ratelimit_burst) +Journal.SystemMaxUse, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.max_use) +Journal.SystemMaxFileSize, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.max_size) +Journal.SystemKeepFree, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.keep_free) +Journal.SystemMaxFiles, config_parse_uint64, 0, offsetof(Server, system_storage.metrics.n_max_files) +Journal.RuntimeMaxUse, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.max_use) +Journal.RuntimeMaxFileSize, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.max_size) +Journal.RuntimeKeepFree, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.keep_free) +Journal.RuntimeMaxFiles, config_parse_uint64, 0, offsetof(Server, runtime_storage.metrics.n_max_files) +Journal.MaxRetentionSec, config_parse_sec, 0, offsetof(Server, max_retention_usec) +Journal.MaxFileSec, config_parse_sec, 0, offsetof(Server, max_file_usec) +Journal.ForwardToSyslog, config_parse_bool, 0, offsetof(Server, forward_to_syslog) +Journal.ForwardToKMsg, config_parse_bool, 0, offsetof(Server, forward_to_kmsg) +Journal.ForwardToConsole, config_parse_bool, 0, offsetof(Server, forward_to_console) +Journal.ForwardToWall, config_parse_bool, 0, offsetof(Server, forward_to_wall) +Journal.TTYPath, config_parse_path, 0, offsetof(Server, tty_path) +Journal.MaxLevelStore, config_parse_log_level, 0, offsetof(Server, max_level_store) +Journal.MaxLevelSyslog, config_parse_log_level, 0, offsetof(Server, max_level_syslog) +Journal.MaxLevelKMsg, config_parse_log_level, 0, offsetof(Server, max_level_kmsg) +Journal.MaxLevelConsole, config_parse_log_level, 0, offsetof(Server, max_level_console) +Journal.MaxLevelWall, config_parse_log_level, 0, offsetof(Server, max_level_wall) +Journal.SplitMode, config_parse_split_mode, 0, offsetof(Server, split_mode) +Journal.LineMax, config_parse_line_max, 0, offsetof(Server, line_max) diff --git a/src/journal/journald-kmsg.c b/src/journal/journald-kmsg.c new file mode 100644 index 0000000..e7255b0 --- /dev/null +++ b/src/journal/journald-kmsg.c @@ -0,0 +1,454 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/epoll.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <unistd.h> + +#include "sd-device.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "io-util.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "parse-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" + +void server_forward_kmsg( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + _cleanup_free_ char *ident_buf = NULL; + struct iovec iovec[5]; + char header_priority[DECIMAL_STR_MAX(priority) + 3], + header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1]; + int n = 0; + + assert(s); + assert(priority >= 0); + assert(priority <= 999); + assert(message); + + if (_unlikely_(LOG_PRI(priority) > s->max_level_kmsg)) + return; + + if (_unlikely_(s->dev_kmsg_fd < 0)) + return; + + /* Never allow messages with kernel facility to be written to + * kmsg, regardless where the data comes from. */ + priority = syslog_fixup_facility(priority); + + /* First: priority field */ + xsprintf(header_priority, "<%i>", priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); + + /* Second: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) get_process_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + /* Fourth: message */ + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + if (writev(s->dev_kmsg_fd, iovec, n) < 0) + log_debug_errno(errno, "Failed to write to /dev/kmsg for logging: %m"); +} + +static bool is_us(const char *identifier, const char *pid) { + pid_t pid_num; + + if (!identifier || !pid) + return false; + + if (parse_pid(pid, &pid_num) < 0) + return false; + + return pid_num == getpid_cached() && + streq(identifier, program_invocation_short_name); +} + +void dev_kmsg_record(Server *s, char *p, size_t l) { + + _cleanup_free_ char *message = NULL, *syslog_priority = NULL, *syslog_pid = NULL, *syslog_facility = NULL, *syslog_identifier = NULL, *source_time = NULL, *identifier = NULL, *pid = NULL; + struct iovec iovec[N_IOVEC_META_FIELDS + 7 + N_IOVEC_KERNEL_FIELDS + 2 + N_IOVEC_UDEV_FIELDS]; + char *kernel_device = NULL; + unsigned long long usec; + size_t n = 0, z = 0, j; + int priority, r; + char *e, *f, *k; + uint64_t serial; + size_t pl; + + assert(s); + assert(p); + + if (l <= 0) + return; + + e = memchr(p, ',', l); + if (!e) + return; + *e = 0; + + r = safe_atoi(p, &priority); + if (r < 0 || priority < 0 || priority > 999) + return; + + if (s->forward_to_kmsg && LOG_FAC(priority) != LOG_KERN) + return; + + l -= (e - p) + 1; + p = e + 1; + e = memchr(p, ',', l); + if (!e) + return; + *e = 0; + + r = safe_atou64(p, &serial); + if (r < 0) + return; + + if (s->kernel_seqnum) { + /* We already read this one? */ + if (serial < *s->kernel_seqnum) + return; + + /* Did we lose any? */ + if (serial > *s->kernel_seqnum) + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_MISSED_STR, + LOG_MESSAGE("Missed %"PRIu64" kernel messages", + serial - *s->kernel_seqnum), + NULL); + + /* Make sure we never read this one again. Note that + * we always store the next message serial we expect + * here, simply because this makes handling the first + * message with serial 0 easy. */ + *s->kernel_seqnum = serial + 1; + } + + l -= (e - p) + 1; + p = e + 1; + f = memchr(p, ';', l); + if (!f) + return; + /* Kernel 3.6 has the flags field, kernel 3.5 lacks that */ + e = memchr(p, ',', l); + if (!e || f < e) + e = f; + *e = 0; + + r = safe_atollu(p, &usec); + if (r < 0) + return; + + l -= (f - p) + 1; + p = f + 1; + e = memchr(p, '\n', l); + if (!e) + return; + *e = 0; + + pl = e - p; + l -= (e - p) + 1; + k = e + 1; + + for (j = 0; l > 0 && j < N_IOVEC_KERNEL_FIELDS; j++) { + char *m; + /* Metadata fields attached */ + + if (*k != ' ') + break; + + k++, l--; + + e = memchr(k, '\n', l); + if (!e) + goto finish; + + *e = 0; + + if (cunescape_length_with_prefix(k, e - k, "_KERNEL_", UNESCAPE_RELAX, &m) < 0) + break; + + if (startswith(m, "_KERNEL_DEVICE=")) + kernel_device = m + 15; + + iovec[n++] = IOVEC_MAKE_STRING(m); + z++; + + l -= (e - k) + 1; + k = e + 1; + } + + if (kernel_device) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + + if (sd_device_new_from_device_id(&d, kernel_device) >= 0) { + const char *g; + char *b; + + if (sd_device_get_devname(d, &g) >= 0) { + b = strjoin("_UDEV_DEVNODE=", g); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + } + + if (sd_device_get_sysname(d, &g) >= 0) { + b = strjoin("_UDEV_SYSNAME=", g); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + } + + j = 0; + FOREACH_DEVICE_DEVLINK(d, g) { + + if (j >= N_IOVEC_UDEV_FIELDS) + break; + + b = strjoin("_UDEV_DEVLINK=", g); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + + j++; + } + } + } + + if (asprintf(&source_time, "_SOURCE_MONOTONIC_TIMESTAMP=%llu", usec) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(source_time); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=kernel"); + + if (asprintf(&syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (asprintf(&syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + + if (LOG_FAC(priority) == LOG_KERN) + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=kernel"); + else { + pl -= syslog_parse_identifier((const char**) &p, &identifier, &pid); + + /* Avoid any messages we generated ourselves via + * log_info() and friends. */ + if (is_us(identifier, pid)) + goto finish; + + if (identifier) { + syslog_identifier = strjoin("SYSLOG_IDENTIFIER=", identifier); + if (syslog_identifier) + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); + } + + if (pid) { + syslog_pid = strjoin("SYSLOG_PID=", pid); + if (syslog_pid) + iovec[n++] = IOVEC_MAKE_STRING(syslog_pid); + } + } + + if (cunescape_length_with_prefix(p, pl, "MESSAGE=", UNESCAPE_RELAX, &message) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(message); + + server_dispatch_message(s, iovec, n, ELEMENTSOF(iovec), NULL, NULL, priority, 0); + +finish: + for (j = 0; j < z; j++) + free(iovec[j].iov_base); +} + +static int server_read_dev_kmsg(Server *s) { + char buffer[8192+1]; /* the kernel-side limit per record is 8K currently */ + ssize_t l; + + assert(s); + assert(s->dev_kmsg_fd >= 0); + + l = read(s->dev_kmsg_fd, buffer, sizeof(buffer) - 1); + if (l == 0) + return 0; + if (l < 0) { + /* Old kernels who don't allow reading from /dev/kmsg + * return EINVAL when we try. So handle this cleanly, + * but don' try to ever read from it again. */ + if (errno == EINVAL) { + s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source); + return 0; + } + + if (IN_SET(errno, EAGAIN, EINTR, EPIPE)) + return 0; + + return log_error_errno(errno, "Failed to read from /dev/kmsg: %m"); + } + + dev_kmsg_record(s, buffer, l); + return 1; +} + +int server_flush_dev_kmsg(Server *s) { + int r; + + assert(s); + + if (s->dev_kmsg_fd < 0) + return 0; + + if (!s->dev_kmsg_readable) + return 0; + + log_debug("Flushing /dev/kmsg..."); + + for (;;) { + r = server_read_dev_kmsg(s); + if (r < 0) + return r; + + if (r == 0) + break; + } + + return 0; +} + +static int dispatch_dev_kmsg(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = userdata; + + assert(es); + assert(fd == s->dev_kmsg_fd); + assert(s); + + if (revents & EPOLLERR) + log_warning("/dev/kmsg buffer overrun, some messages lost."); + + if (!(revents & EPOLLIN)) + log_error("Got invalid event from epoll for /dev/kmsg: %"PRIx32, revents); + + return server_read_dev_kmsg(s); +} + +int server_open_dev_kmsg(Server *s) { + mode_t mode; + int r; + + assert(s); + + if (s->read_kmsg) + mode = O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + else + mode = O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + + s->dev_kmsg_fd = open("/dev/kmsg", mode); + if (s->dev_kmsg_fd < 0) { + log_full(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, + "Failed to open /dev/kmsg, ignoring: %m"); + return 0; + } + + if (!s->read_kmsg) + return 0; + + r = sd_event_add_io(s->event, &s->dev_kmsg_event_source, s->dev_kmsg_fd, EPOLLIN, dispatch_dev_kmsg, s); + if (r < 0) { + + /* This will fail with EPERM on older kernels where + * /dev/kmsg is not readable. */ + if (r == -EPERM) { + r = 0; + goto fail; + } + + log_error_errno(r, "Failed to add /dev/kmsg fd to event loop: %m"); + goto fail; + } + + r = sd_event_source_set_priority(s->dev_kmsg_event_source, SD_EVENT_PRIORITY_IMPORTANT+10); + if (r < 0) { + log_error_errno(r, "Failed to adjust priority of kmsg event source: %m"); + goto fail; + } + + s->dev_kmsg_readable = true; + + return 0; + +fail: + s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source); + s->dev_kmsg_fd = safe_close(s->dev_kmsg_fd); + + return r; +} + +int server_open_kernel_seqnum(Server *s) { + _cleanup_close_ int fd = -1; + const char *fn; + uint64_t *p; + int r; + + assert(s); + + /* We store the seqnum we last read in an mmapped file. That way we can just use it like a variable, + * but it is persistent and automatically flushed at reboot. */ + + if (!s->read_kmsg) + return 0; + + fn = strjoina(s->runtime_directory, "/kernel-seqnum"); + fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644); + if (fd < 0) { + log_error_errno(errno, "Failed to open %s, ignoring: %m", fn); + return 0; + } + + r = posix_fallocate(fd, 0, sizeof(uint64_t)); + if (r != 0) { + log_error_errno(r, "Failed to allocate sequential number file, ignoring: %m"); + return 0; + } + + p = mmap(NULL, sizeof(uint64_t), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + log_error_errno(errno, "Failed to map sequential number file, ignoring: %m"); + return 0; + } + + s->kernel_seqnum = p; + + return 0; +} diff --git a/src/journal/journald-kmsg.h b/src/journal/journald-kmsg.h new file mode 100644 index 0000000..bd288c5 --- /dev/null +++ b/src/journal/journald-kmsg.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +int server_open_dev_kmsg(Server *s); +int server_flush_dev_kmsg(Server *s); + +void server_forward_kmsg(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); + +int server_open_kernel_seqnum(Server *s); + +void dev_kmsg_record(Server *s, char *p, size_t l); diff --git a/src/journal/journald-native.c b/src/journal/journald-native.c new file mode 100644 index 0000000..1c5849e --- /dev/null +++ b/src/journal/journald-native.c @@ -0,0 +1,505 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stddef.h> +#include <sys/epoll.h> +#include <sys/mman.h> +#include <sys/statvfs.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "journal-importer.h" +#include "journal-util.h" +#include "journald-console.h" +#include "journald-kmsg.h" +#include "journald-native.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "memfd-util.h" +#include "memory-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "unaligned.h" + +static bool allow_object_pid(const struct ucred *ucred) { + return ucred && ucred->uid == 0; +} + +static void server_process_entry_meta( + const char *p, size_t l, + const struct ucred *ucred, + int *priority, + char **identifier, + char **message, + pid_t *object_pid) { + + /* We need to determine the priority of this entry for the rate limiting logic */ + + if (l == 10 && + startswith(p, "PRIORITY=") && + p[9] >= '0' && p[9] <= '9') + *priority = (*priority & LOG_FACMASK) | (p[9] - '0'); + + else if (l == 17 && + startswith(p, "SYSLOG_FACILITY=") && + p[16] >= '0' && p[16] <= '9') + *priority = (*priority & LOG_PRIMASK) | ((p[16] - '0') << 3); + + else if (l == 18 && + startswith(p, "SYSLOG_FACILITY=") && + p[16] >= '0' && p[16] <= '9' && + p[17] >= '0' && p[17] <= '9') + *priority = (*priority & LOG_PRIMASK) | (((p[16] - '0')*10 + (p[17] - '0')) << 3); + + else if (l >= 19 && + startswith(p, "SYSLOG_IDENTIFIER=")) { + char *t; + + t = memdup_suffix0(p + 18, l - 18); + if (t) { + free(*identifier); + *identifier = t; + } + + } else if (l >= 8 && + startswith(p, "MESSAGE=")) { + char *t; + + t = memdup_suffix0(p + 8, l - 8); + if (t) { + free(*message); + *message = t; + } + + } else if (l > STRLEN("OBJECT_PID=") && + l < STRLEN("OBJECT_PID=") + DECIMAL_STR_MAX(pid_t) && + startswith(p, "OBJECT_PID=") && + allow_object_pid(ucred)) { + char buf[DECIMAL_STR_MAX(pid_t)]; + memcpy(buf, p + STRLEN("OBJECT_PID="), + l - STRLEN("OBJECT_PID=")); + buf[l-STRLEN("OBJECT_PID=")] = '\0'; + + (void) parse_pid(buf, object_pid); + } +} + +static int server_process_entry( + Server *s, + const void *buffer, size_t *remaining, + ClientContext *context, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + /* Process a single entry from a native message. Returns 0 if nothing special happened and the message + * processing should continue, and a negative or positive value otherwise. + * + * Note that *remaining is altered on both success and failure. */ + + size_t n = 0, j, tn = (size_t) -1, m = 0, entry_size = 0; + char *identifier = NULL, *message = NULL; + struct iovec *iovec = NULL; + int priority = LOG_INFO; + pid_t object_pid = 0; + const char *p; + int r = 1; + + p = buffer; + + while (*remaining > 0) { + const char *e, *q; + + e = memchr(p, '\n', *remaining); + + if (!e) { + /* Trailing noise, let's ignore it, and flush what we collected */ + log_debug("Received message with trailing noise, ignoring."); + break; /* finish processing of the message */ + } + + if (e == p) { + /* Entry separator */ + *remaining -= 1; + break; + } + + if (IN_SET(*p, '.', '#')) { + /* Ignore control commands for now, and comments too. */ + *remaining -= (e - p) + 1; + p = e + 1; + continue; + } + + /* A property follows */ + if (n > ENTRY_FIELD_COUNT_MAX) { + log_debug("Received an entry that has more than " STRINGIFY(ENTRY_FIELD_COUNT_MAX) " fields, ignoring entry."); + goto finish; + } + + /* n existing properties, 1 new, +1 for _TRANSPORT */ + if (!GREEDY_REALLOC(iovec, m, + n + 2 + + N_IOVEC_META_FIELDS + N_IOVEC_OBJECT_FIELDS + + client_context_extra_fields_n_iovec(context))) { + r = log_oom(); + goto finish; + } + + q = memchr(p, '=', e - p); + if (q) { + if (journal_field_valid(p, q - p, false)) { + size_t l; + + l = e - p; + if (l > DATA_SIZE_MAX) { + log_debug("Received text block of %zu bytes is too large, ignoring entry.", l); + goto finish; + } + + if (entry_size + l + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */ + log_debug("Entry is too big (%zu bytes after processing %zu entries), ignoring entry.", + entry_size + l, n + 1); + goto finish; + } + + /* If the field name starts with an underscore, skip the variable, since that indicates + * a trusted field */ + iovec[n++] = IOVEC_MAKE((char*) p, l); + entry_size += l; + + server_process_entry_meta(p, l, ucred, + &priority, + &identifier, + &message, + &object_pid); + } + + *remaining -= (e - p) + 1; + p = e + 1; + continue; + } else { + uint64_t l, total; + char *k; + + if (*remaining < e - p + 1 + sizeof(uint64_t) + 1) { + log_debug("Failed to parse message, ignoring."); + break; + } + + l = unaligned_read_le64(e + 1); + if (l > DATA_SIZE_MAX) { + log_debug("Received binary data block of %"PRIu64" bytes is too large, ignoring entry.", l); + goto finish; + } + + total = (e - p) + 1 + l; + if (entry_size + total + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */ + log_debug("Entry is too big (%"PRIu64"bytes after processing %zu fields), ignoring.", + entry_size + total, n + 1); + goto finish; + } + + if ((uint64_t) *remaining < e - p + 1 + sizeof(uint64_t) + l + 1 || + e[1+sizeof(uint64_t)+l] != '\n') { + log_debug("Failed to parse message, ignoring."); + break; + } + + k = malloc(total); + if (!k) { + log_oom(); + break; + } + + memcpy(k, p, e - p); + k[e - p] = '='; + memcpy(k + (e - p) + 1, e + 1 + sizeof(uint64_t), l); + + if (journal_field_valid(p, e - p, false)) { + iovec[n] = IOVEC_MAKE(k, (e - p) + 1 + l); + entry_size += iovec[n].iov_len; + n++; + + server_process_entry_meta(k, (e - p) + 1 + l, ucred, + &priority, + &identifier, + &message, + &object_pid); + } else + free(k); + + *remaining -= (e - p) + 1 + sizeof(uint64_t) + l + 1; + p = e + 1 + sizeof(uint64_t) + l + 1; + } + } + + if (n <= 0) + goto finish; + + tn = n++; + iovec[tn] = IOVEC_MAKE_STRING("_TRANSPORT=journal"); + entry_size += STRLEN("_TRANSPORT=journal"); + + if (entry_size + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */ + log_debug("Entry is too big with %zu properties and %zu bytes, ignoring.", n, entry_size); + goto finish; + } + + r = 0; /* Success, we read the message. */ + + if (!client_context_test_priority(context, priority)) + goto finish; + + if (message) { + if (s->forward_to_syslog) + server_forward_syslog(s, syslog_fixup_facility(priority), identifier, message, ucred, tv); + + if (s->forward_to_kmsg) + server_forward_kmsg(s, priority, identifier, message, ucred); + + if (s->forward_to_console) + server_forward_console(s, priority, identifier, message, ucred); + + if (s->forward_to_wall) + server_forward_wall(s, priority, identifier, message, ucred); + } + + server_dispatch_message(s, iovec, n, m, context, tv, priority, object_pid); + +finish: + for (j = 0; j < n; j++) { + if (j == tn) + continue; + + if (iovec[j].iov_base < buffer || + (const char*) iovec[j].iov_base >= p + *remaining) + free(iovec[j].iov_base); + } + + free(iovec); + free(identifier); + free(message); + + return r; +} + +void server_process_native_message( + Server *s, + const char *buffer, size_t buffer_size, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + size_t remaining = buffer_size; + ClientContext *context = NULL; + int r; + + assert(s); + assert(buffer || buffer_size == 0); + + if (ucred && pid_is_valid(ucred->pid)) { + r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context); + if (r < 0) + log_warning_errno(r, "Failed to retrieve credentials for PID " PID_FMT ", ignoring: %m", ucred->pid); + } + + do { + r = server_process_entry(s, + (const uint8_t*) buffer + (buffer_size - remaining), &remaining, + context, ucred, tv, label, label_len); + } while (r == 0); +} + +void server_process_native_file( + Server *s, + int fd, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + struct stat st; + bool sealed; + int r; + + /* Data is in the passed fd, probably it didn't fit in a datagram. */ + + assert(s); + assert(fd >= 0); + + /* If it's a memfd, check if it is sealed. If so, we can just + * mmap it and use it, and do not need to copy the data out. */ + sealed = memfd_get_sealed(fd) > 0; + + if (!sealed && (!ucred || ucred->uid != 0)) { + _cleanup_free_ char *k = NULL; + const char *e; + + /* If this is not a sealed memfd, and the peer is unknown or + * unprivileged, then verify the path. */ + + r = fd_get_path(fd, &k); + if (r < 0) { + log_error_errno(r, "readlink(/proc/self/fd/%i) failed: %m", fd); + return; + } + + e = PATH_STARTSWITH_SET(k, "/dev/shm/", "/tmp/", "/var/tmp/"); + if (!e) { + log_error("Received file outside of allowed directories. Refusing."); + return; + } + + if (!filename_is_valid(e)) { + log_error("Received file in subdirectory of allowed directories. Refusing."); + return; + } + } + + if (fstat(fd, &st) < 0) { + log_error_errno(errno, "Failed to stat passed file, ignoring: %m"); + return; + } + + if (!S_ISREG(st.st_mode)) { + log_error("File passed is not regular. Ignoring."); + return; + } + + if (st.st_size <= 0) + return; + + /* When !sealed, set a lower memory limit. We have to read the file, + * effectively doubling memory use. */ + if (st.st_size > ENTRY_SIZE_MAX / (sealed ? 1 : 2)) { + log_error("File passed too large (%"PRIu64" bytes). Ignoring.", (uint64_t) st.st_size); + return; + } + + if (sealed) { + void *p; + size_t ps; + + /* The file is sealed, we can just map it and use it. */ + + ps = PAGE_ALIGN(st.st_size); + p = mmap(NULL, ps, PROT_READ, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) { + log_error_errno(errno, "Failed to map memfd, ignoring: %m"); + return; + } + + server_process_native_message(s, p, st.st_size, ucred, tv, label, label_len); + assert_se(munmap(p, ps) >= 0); + } else { + _cleanup_free_ void *p = NULL; + struct statvfs vfs; + ssize_t n; + + if (fstatvfs(fd, &vfs) < 0) { + log_error_errno(errno, "Failed to stat file system of passed file, not processing it: %m"); + return; + } + + /* Refuse operating on file systems that have + * mandatory locking enabled, see: + * + * https://github.com/systemd/systemd/issues/1822 + */ + if (vfs.f_flag & ST_MANDLOCK) { + log_error("Received file descriptor from file system with mandatory locking enabled, not processing it."); + return; + } + + /* Make the fd non-blocking. On regular files this has + * the effect of bypassing mandatory locking. Of + * course, this should normally not be necessary given + * the check above, but let's better be safe than + * sorry, after all NFS is pretty confusing regarding + * file system flags, and we better don't trust it, + * and so is SMB. */ + r = fd_nonblock(fd, true); + if (r < 0) { + log_error_errno(r, "Failed to make fd non-blocking, not processing it: %m"); + return; + } + + /* The file is not sealed, we can't map the file here, since + * clients might then truncate it and trigger a SIGBUS for + * us. So let's stupidly read it. */ + + p = malloc(st.st_size); + if (!p) { + log_oom(); + return; + } + + n = pread(fd, p, st.st_size, 0); + if (n < 0) + log_error_errno(errno, "Failed to read file, ignoring: %m"); + else if (n > 0) + server_process_native_message(s, p, n, ucred, tv, label, label_len); + } +} + +int server_open_native_socket(Server *s, const char *native_socket) { + int r; + + assert(s); + assert(native_socket); + + if (s->native_fd < 0) { + union sockaddr_union sa; + size_t sa_len; + + r = sockaddr_un_set_path(&sa.un, native_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", native_socket); + sa_len = r; + + s->native_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->native_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->native_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + } else + (void) fd_nonblock(s->native_fd, true); + + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_warning_errno(r, "SO_PASSSEC failed: %m"); + } + + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return log_error_errno(r, "SO_TIMESTAMP failed: %m"); + + r = sd_event_add_io(s->event, &s->native_event_source, s->native_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add native server fd to event loop: %m"); + + r = sd_event_source_set_priority(s->native_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust native event source priority: %m"); + + return 0; +} diff --git a/src/journal/journald-native.h b/src/journal/journald-native.h new file mode 100644 index 0000000..7bbaaed --- /dev/null +++ b/src/journal/journald-native.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +void server_process_native_message( + Server *s, + const char *buffer, + size_t buffer_size, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, + size_t label_len); + +void server_process_native_file( + Server *s, + int fd, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, + size_t label_len); + +int server_open_native_socket(Server *s, const char *native_socket); diff --git a/src/journal/journald-rate-limit.c b/src/journal/journald-rate-limit.c new file mode 100644 index 0000000..f464b6e --- /dev/null +++ b/src/journal/journald-rate-limit.c @@ -0,0 +1,254 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "alloc-util.h" +#include "hashmap.h" +#include "journald-rate-limit.h" +#include "list.h" +#include "random-util.h" +#include "string-util.h" +#include "time-util.h" + +#define POOLS_MAX 5 +#define BUCKETS_MAX 127 +#define GROUPS_MAX 2047 + +static const int priority_map[] = { + [LOG_EMERG] = 0, + [LOG_ALERT] = 0, + [LOG_CRIT] = 0, + [LOG_ERR] = 1, + [LOG_WARNING] = 2, + [LOG_NOTICE] = 3, + [LOG_INFO] = 3, + [LOG_DEBUG] = 4 +}; + +typedef struct JournalRateLimitPool JournalRateLimitPool; +typedef struct JournalRateLimitGroup JournalRateLimitGroup; + +struct JournalRateLimitPool { + usec_t begin; + unsigned num; + unsigned suppressed; +}; + +struct JournalRateLimitGroup { + JournalRateLimit *parent; + + char *id; + + /* Interval is stored to keep track of when the group expires */ + usec_t interval; + + JournalRateLimitPool pools[POOLS_MAX]; + uint64_t hash; + + LIST_FIELDS(JournalRateLimitGroup, bucket); + LIST_FIELDS(JournalRateLimitGroup, lru); +}; + +struct JournalRateLimit { + + JournalRateLimitGroup* buckets[BUCKETS_MAX]; + JournalRateLimitGroup *lru, *lru_tail; + + unsigned n_groups; + + uint8_t hash_key[16]; +}; + +JournalRateLimit *journal_ratelimit_new(void) { + JournalRateLimit *r; + + r = new0(JournalRateLimit, 1); + if (!r) + return NULL; + + random_bytes(r->hash_key, sizeof(r->hash_key)); + + return r; +} + +static void journal_ratelimit_group_free(JournalRateLimitGroup *g) { + assert(g); + + if (g->parent) { + assert(g->parent->n_groups > 0); + + if (g->parent->lru_tail == g) + g->parent->lru_tail = g->lru_prev; + + LIST_REMOVE(lru, g->parent->lru, g); + LIST_REMOVE(bucket, g->parent->buckets[g->hash % BUCKETS_MAX], g); + + g->parent->n_groups--; + } + + free(g->id); + free(g); +} + +void journal_ratelimit_free(JournalRateLimit *r) { + assert(r); + + while (r->lru) + journal_ratelimit_group_free(r->lru); + + free(r); +} + +static bool journal_ratelimit_group_expired(JournalRateLimitGroup *g, usec_t ts) { + unsigned i; + + assert(g); + + for (i = 0; i < POOLS_MAX; i++) + if (g->pools[i].begin + g->interval >= ts) + return false; + + return true; +} + +static void journal_ratelimit_vacuum(JournalRateLimit *r, usec_t ts) { + assert(r); + + /* Makes room for at least one new item, but drop all + * expored items too. */ + + while (r->n_groups >= GROUPS_MAX || + (r->lru_tail && journal_ratelimit_group_expired(r->lru_tail, ts))) + journal_ratelimit_group_free(r->lru_tail); +} + +static JournalRateLimitGroup* journal_ratelimit_group_new(JournalRateLimit *r, const char *id, usec_t interval, usec_t ts) { + JournalRateLimitGroup *g; + + assert(r); + assert(id); + + g = new0(JournalRateLimitGroup, 1); + if (!g) + return NULL; + + g->id = strdup(id); + if (!g->id) + goto fail; + + g->hash = siphash24_string(g->id, r->hash_key); + + g->interval = interval; + + journal_ratelimit_vacuum(r, ts); + + LIST_PREPEND(bucket, r->buckets[g->hash % BUCKETS_MAX], g); + LIST_PREPEND(lru, r->lru, g); + if (!g->lru_next) + r->lru_tail = g; + r->n_groups++; + + g->parent = r; + return g; + +fail: + journal_ratelimit_group_free(g); + return NULL; +} + +static unsigned burst_modulate(unsigned burst, uint64_t available) { + unsigned k; + + /* Modulates the burst rate a bit with the amount of available + * disk space */ + + k = u64log2(available); + + /* 1MB */ + if (k <= 20) + return burst; + + burst = (burst * (k-16)) / 4; + + /* + * Example: + * + * <= 1MB = rate * 1 + * 16MB = rate * 2 + * 256MB = rate * 3 + * 4GB = rate * 4 + * 64GB = rate * 5 + * 1TB = rate * 6 + */ + + return burst; +} + +int journal_ratelimit_test(JournalRateLimit *r, const char *id, usec_t rl_interval, unsigned rl_burst, int priority, uint64_t available) { + uint64_t h; + JournalRateLimitGroup *g; + JournalRateLimitPool *p; + unsigned burst; + usec_t ts; + + assert(id); + + /* Returns: + * + * 0 → the log message shall be suppressed, + * 1 + n → the log message shall be permitted, and n messages were dropped from the peer before + * < 0 → error + */ + + if (!r) + return 1; + + ts = now(CLOCK_MONOTONIC); + + h = siphash24_string(id, r->hash_key); + g = r->buckets[h % BUCKETS_MAX]; + + LIST_FOREACH(bucket, g, g) + if (streq(g->id, id)) + break; + + if (!g) { + g = journal_ratelimit_group_new(r, id, rl_interval, ts); + if (!g) + return -ENOMEM; + } else + g->interval = rl_interval; + + if (rl_interval == 0 || rl_burst == 0) + return 1; + + burst = burst_modulate(rl_burst, available); + + p = &g->pools[priority_map[priority]]; + + if (p->begin <= 0) { + p->suppressed = 0; + p->num = 1; + p->begin = ts; + return 1; + } + + if (p->begin + rl_interval < ts) { + unsigned s; + + s = p->suppressed; + p->suppressed = 0; + p->num = 1; + p->begin = ts; + + return 1 + s; + } + + if (p->num < burst) { + p->num++; + return 1; + } + + p->suppressed++; + return 0; +} diff --git a/src/journal/journald-rate-limit.h b/src/journal/journald-rate-limit.h new file mode 100644 index 0000000..8def60f --- /dev/null +++ b/src/journal/journald-rate-limit.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +typedef struct JournalRateLimit JournalRateLimit; + +JournalRateLimit *journal_ratelimit_new(void); +void journal_ratelimit_free(JournalRateLimit *r); +int journal_ratelimit_test(JournalRateLimit *r, const char *id, usec_t rl_interval, unsigned rl_burst, int priority, uint64_t available); diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c new file mode 100644 index 0000000..10ebc3e --- /dev/null +++ b/src/journal/journald-server.c @@ -0,0 +1,2619 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/signalfd.h> +#include <sys/statvfs.h> +#include <linux/sockios.h> + +#include "sd-daemon.h" +#include "sd-journal.h" +#include "sd-messages.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "audit-util.h" +#include "cgroup-util.h" +#include "conf-parser.h" +#include "dirent-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "journal-authenticate.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "journal-vacuum.h" +#include "journald-audit.h" +#include "journald-context.h" +#include "journald-kmsg.h" +#include "journald-native.h" +#include "journald-rate-limit.h" +#include "journald-server.h" +#include "journald-stream.h" +#include "journald-syslog.h" +#include "log.h" +#include "missing_audit.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "syslog-util.h" +#include "user-record.h" +#include "user-util.h" + +#define USER_JOURNALS_MAX 1024 + +#define DEFAULT_SYNC_INTERVAL_USEC (5*USEC_PER_MINUTE) +#define DEFAULT_RATE_LIMIT_INTERVAL (30*USEC_PER_SEC) +#define DEFAULT_RATE_LIMIT_BURST 10000 +#define DEFAULT_MAX_FILE_USEC USEC_PER_MONTH + +#define RECHECK_SPACE_USEC (30*USEC_PER_SEC) + +#define NOTIFY_SNDBUF_SIZE (8*1024*1024) + +/* The period to insert between posting changes for coalescing */ +#define POST_CHANGE_TIMER_INTERVAL_USEC (250*USEC_PER_MSEC) + +/* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room + * for a bit of additional metadata. */ +#define DEFAULT_LINE_MAX (48*1024) + +#define DEFERRED_CLOSES_MAX (4096) + +#define IDLE_TIMEOUT_USEC (30*USEC_PER_SEC) + +static int determine_path_usage( + Server *s, + const char *path, + uint64_t *ret_used, + uint64_t *ret_free) { + + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + struct statvfs ss; + + assert(s); + assert(path); + assert(ret_used); + assert(ret_free); + + d = opendir(path); + if (!d) + return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, + errno, "Failed to open %s: %m", path); + + if (fstatvfs(dirfd(d), &ss) < 0) + return log_error_errno(errno, "Failed to fstatvfs(%s): %m", path); + + *ret_free = ss.f_bsize * ss.f_bavail; + *ret_used = 0; + FOREACH_DIRENT_ALL(de, d, break) { + struct stat st; + + if (!endswith(de->d_name, ".journal") && + !endswith(de->d_name, ".journal~")) + continue; + + if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) { + log_debug_errno(errno, "Failed to stat %s/%s, ignoring: %m", path, de->d_name); + continue; + } + + if (!S_ISREG(st.st_mode)) + continue; + + *ret_used += (uint64_t) st.st_blocks * 512UL; + } + + return 0; +} + +static void cache_space_invalidate(JournalStorageSpace *space) { + zero(*space); +} + +static int cache_space_refresh(Server *s, JournalStorage *storage) { + JournalStorageSpace *space; + JournalMetrics *metrics; + uint64_t vfs_used, vfs_avail, avail; + usec_t ts; + int r; + + assert(s); + + metrics = &storage->metrics; + space = &storage->space; + + ts = now(CLOCK_MONOTONIC); + + if (space->timestamp != 0 && space->timestamp + RECHECK_SPACE_USEC > ts) + return 0; + + r = determine_path_usage(s, storage->path, &vfs_used, &vfs_avail); + if (r < 0) + return r; + + space->vfs_used = vfs_used; + space->vfs_available = vfs_avail; + + avail = LESS_BY(vfs_avail, metrics->keep_free); + + space->limit = MIN(MAX(vfs_used + avail, metrics->min_use), metrics->max_use); + space->available = LESS_BY(space->limit, vfs_used); + space->timestamp = ts; + return 1; +} + +static void patch_min_use(JournalStorage *storage) { + assert(storage); + + /* Let's bump the min_use limit to the current usage on disk. We do + * this when starting up and first opening the journal files. This way + * sudden spikes in disk usage will not cause journald to vacuum files + * without bounds. Note that this means that only a restart of journald + * will make it reset this value. */ + + storage->metrics.min_use = MAX(storage->metrics.min_use, storage->space.vfs_used); +} + +static JournalStorage* server_current_storage(Server *s) { + assert(s); + + return s->system_journal ? &s->system_storage : &s->runtime_storage; +} + +static int determine_space(Server *s, uint64_t *available, uint64_t *limit) { + JournalStorage *js; + int r; + + assert(s); + + js = server_current_storage(s); + + r = cache_space_refresh(s, js); + if (r >= 0) { + if (available) + *available = js->space.available; + if (limit) + *limit = js->space.limit; + } + return r; +} + +void server_space_usage_message(Server *s, JournalStorage *storage) { + char fb1[FORMAT_BYTES_MAX], fb2[FORMAT_BYTES_MAX], fb3[FORMAT_BYTES_MAX], + fb4[FORMAT_BYTES_MAX], fb5[FORMAT_BYTES_MAX], fb6[FORMAT_BYTES_MAX]; + JournalMetrics *metrics; + + assert(s); + + if (!storage) + storage = server_current_storage(s); + + if (cache_space_refresh(s, storage) < 0) + return; + + metrics = &storage->metrics; + format_bytes(fb1, sizeof(fb1), storage->space.vfs_used); + format_bytes(fb2, sizeof(fb2), metrics->max_use); + format_bytes(fb3, sizeof(fb3), metrics->keep_free); + format_bytes(fb4, sizeof(fb4), storage->space.vfs_available); + format_bytes(fb5, sizeof(fb5), storage->space.limit); + format_bytes(fb6, sizeof(fb6), storage->space.available); + + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_USAGE_STR, + LOG_MESSAGE("%s (%s) is %s, max %s, %s free.", + storage->name, storage->path, fb1, fb5, fb6), + "JOURNAL_NAME=%s", storage->name, + "JOURNAL_PATH=%s", storage->path, + "CURRENT_USE=%"PRIu64, storage->space.vfs_used, + "CURRENT_USE_PRETTY=%s", fb1, + "MAX_USE=%"PRIu64, metrics->max_use, + "MAX_USE_PRETTY=%s", fb2, + "DISK_KEEP_FREE=%"PRIu64, metrics->keep_free, + "DISK_KEEP_FREE_PRETTY=%s", fb3, + "DISK_AVAILABLE=%"PRIu64, storage->space.vfs_available, + "DISK_AVAILABLE_PRETTY=%s", fb4, + "LIMIT=%"PRIu64, storage->space.limit, + "LIMIT_PRETTY=%s", fb5, + "AVAILABLE=%"PRIu64, storage->space.available, + "AVAILABLE_PRETTY=%s", fb6, + NULL); +} + +static bool uid_for_system_journal(uid_t uid) { + + /* Returns true if the specified UID shall get its data stored in the system journal*/ + + return uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY; +} + +static void server_add_acls(JournalFile *f, uid_t uid) { + assert(f); + +#if HAVE_ACL + int r; + + if (uid_for_system_journal(uid)) + return; + + r = fd_add_uid_acl_permission(f->fd, uid, ACL_READ); + if (r < 0) + log_warning_errno(r, "Failed to set ACL on %s, ignoring: %m", f->path); +#endif +} + +static int open_journal( + Server *s, + bool reliably, + const char *fname, + int flags, + bool seal, + JournalMetrics *metrics, + JournalFile **ret) { + + _cleanup_(journal_file_closep) JournalFile *f = NULL; + int r; + + assert(s); + assert(fname); + assert(ret); + + if (reliably) + r = journal_file_open_reliably(fname, flags, 0640, s->compress.enabled, s->compress.threshold_bytes, + seal, metrics, s->mmap, s->deferred_closes, NULL, &f); + else + r = journal_file_open(-1, fname, flags, 0640, s->compress.enabled, s->compress.threshold_bytes, seal, + metrics, s->mmap, s->deferred_closes, NULL, &f); + + if (r < 0) + return r; + + r = journal_file_enable_post_change_timer(f, s->event, POST_CHANGE_TIMER_INTERVAL_USEC); + if (r < 0) + return r; + + *ret = TAKE_PTR(f); + return r; +} + +static bool flushed_flag_is_set(Server *s) { + const char *fn; + + assert(s); + + /* We don't support the "flushing" concept for namespace instances, we assume them to always have + * access to /var */ + if (s->namespace) + return true; + + fn = strjoina(s->runtime_directory, "/flushed"); + return access(fn, F_OK) >= 0; +} + +static int system_journal_open(Server *s, bool flush_requested, bool relinquish_requested) { + const char *fn; + int r = 0; + + if (!s->system_journal && + IN_SET(s->storage, STORAGE_PERSISTENT, STORAGE_AUTO) && + (flush_requested || flushed_flag_is_set(s)) && + !relinquish_requested) { + + /* If in auto mode: first try to create the machine path, but not the prefix. + * + * If in persistent mode: create /var/log/journal and the machine path */ + + if (s->storage == STORAGE_PERSISTENT) + (void) mkdir_parents(s->system_storage.path, 0755); + + (void) mkdir(s->system_storage.path, 0755); + + fn = strjoina(s->system_storage.path, "/system.journal"); + r = open_journal(s, true, fn, O_RDWR|O_CREAT, s->seal, &s->system_storage.metrics, &s->system_journal); + if (r >= 0) { + server_add_acls(s->system_journal, 0); + (void) cache_space_refresh(s, &s->system_storage); + patch_min_use(&s->system_storage); + } else { + if (!IN_SET(r, -ENOENT, -EROFS)) + log_warning_errno(r, "Failed to open system journal: %m"); + + r = 0; + } + + /* If the runtime journal is open, and we're post-flush, we're recovering from a failed + * system journal rotate (ENOSPC) for which the runtime journal was reopened. + * + * Perform an implicit flush to var, leaving the runtime journal closed, now that the system + * journal is back. + */ + if (!flush_requested) + (void) server_flush_to_var(s, true); + } + + if (!s->runtime_journal && + (s->storage != STORAGE_NONE)) { + + fn = strjoina(s->runtime_storage.path, "/system.journal"); + + if (s->system_journal && !relinquish_requested) { + + /* Try to open the runtime journal, but only + * if it already exists, so that we can flush + * it into the system journal */ + + r = open_journal(s, false, fn, O_RDWR, false, &s->runtime_storage.metrics, &s->runtime_journal); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to open runtime journal: %m"); + + r = 0; + } + + } else { + + /* OK, we really need the runtime journal, so create it if necessary. */ + + (void) mkdir_parents(s->runtime_storage.path, 0755); + (void) mkdir(s->runtime_storage.path, 0750); + + r = open_journal(s, true, fn, O_RDWR|O_CREAT, false, &s->runtime_storage.metrics, &s->runtime_journal); + if (r < 0) + return log_error_errno(r, "Failed to open runtime journal: %m"); + } + + if (s->runtime_journal) { + server_add_acls(s->runtime_journal, 0); + (void) cache_space_refresh(s, &s->runtime_storage); + patch_min_use(&s->runtime_storage); + } + } + + return r; +} + +static JournalFile* find_journal(Server *s, uid_t uid) { + _cleanup_free_ char *p = NULL; + JournalFile *f; + int r; + + assert(s); + + /* A rotate that fails to create the new journal (ENOSPC) leaves the rotated journal as NULL. Unless + * we revisit opening, even after space is made available we'll continue to return NULL indefinitely. + * + * system_journal_open() is a noop if the journals are already open, so we can just call it here to + * recover from failed rotates (or anything else that's left the journals as NULL). + * + * Fixes https://github.com/systemd/systemd/issues/3968 */ + (void) system_journal_open(s, false, false); + + /* We split up user logs only on /var, not on /run. If the runtime file is open, we write to it + * exclusively, in order to guarantee proper order as soon as we flush /run to /var and close the + * runtime file. */ + + if (s->runtime_journal) + return s->runtime_journal; + + if (uid_for_system_journal(uid)) + return s->system_journal; + + f = ordered_hashmap_get(s->user_journals, UID_TO_PTR(uid)); + if (f) + return f; + + if (asprintf(&p, "%s/user-" UID_FMT ".journal", s->system_storage.path, uid) < 0) { + log_oom(); + return s->system_journal; + } + + /* Too many open? Then let's close one (or more) */ + while (ordered_hashmap_size(s->user_journals) >= USER_JOURNALS_MAX) { + assert_se(f = ordered_hashmap_steal_first(s->user_journals)); + (void) journal_file_close(f); + } + + r = open_journal(s, true, p, O_RDWR|O_CREAT, s->seal, &s->system_storage.metrics, &f); + if (r < 0) + return s->system_journal; + + r = ordered_hashmap_put(s->user_journals, UID_TO_PTR(uid), f); + if (r < 0) { + (void) journal_file_close(f); + return s->system_journal; + } + + server_add_acls(f, uid); + return f; +} + +static int do_rotate( + Server *s, + JournalFile **f, + const char* name, + bool seal, + uint32_t uid) { + + int r; + assert(s); + + if (!*f) + return -EINVAL; + + r = journal_file_rotate(f, s->compress.enabled, s->compress.threshold_bytes, seal, s->deferred_closes); + if (r < 0) { + if (*f) + return log_error_errno(r, "Failed to rotate %s: %m", (*f)->path); + else + return log_error_errno(r, "Failed to create new %s journal: %m", name); + } + + server_add_acls(*f, uid); + return r; +} + +static void server_process_deferred_closes(Server *s) { + JournalFile *f; + + /* Perform any deferred closes which aren't still offlining. */ + SET_FOREACH(f, s->deferred_closes) { + if (journal_file_is_offlining(f)) + continue; + + (void) set_remove(s->deferred_closes, f); + (void) journal_file_close(f); + } +} + +static void server_vacuum_deferred_closes(Server *s) { + assert(s); + + /* Make some room in the deferred closes list, so that it doesn't grow without bounds */ + if (set_size(s->deferred_closes) < DEFERRED_CLOSES_MAX) + return; + + /* Let's first remove all journal files that might already have completed closing */ + server_process_deferred_closes(s); + + /* And now, let's close some more until we reach the limit again. */ + while (set_size(s->deferred_closes) >= DEFERRED_CLOSES_MAX) { + JournalFile *f; + + assert_se(f = set_steal_first(s->deferred_closes)); + journal_file_close(f); + } +} + +static int vacuum_offline_user_journals(Server *s) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(s); + + d = opendir(s->system_storage.path); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open %s: %m", s->system_storage.path); + } + + for (;;) { + _cleanup_free_ char *u = NULL, *full = NULL; + _cleanup_close_ int fd = -1; + const char *a, *b; + struct dirent *de; + JournalFile *f; + uid_t uid; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + log_warning_errno(errno, "Failed to enumerate %s, ignoring: %m", s->system_storage.path); + + break; + } + + a = startswith(de->d_name, "user-"); + if (!a) + continue; + b = endswith(de->d_name, ".journal"); + if (!b) + continue; + + u = strndup(a, b-a); + if (!u) + return log_oom(); + + r = parse_uid(u, &uid); + if (r < 0) { + log_debug_errno(r, "Failed to parse UID from file name '%s', ignoring: %m", de->d_name); + continue; + } + + /* Already rotated in the above loop? i.e. is it an open user journal? */ + if (ordered_hashmap_contains(s->user_journals, UID_TO_PTR(uid))) + continue; + + full = path_join(s->system_storage.path, de->d_name); + if (!full) + return log_oom(); + + fd = openat(dirfd(d), de->d_name, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|O_NONBLOCK); + if (fd < 0) { + log_full_errno(IN_SET(errno, ELOOP, ENOENT) ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to open journal file '%s' for rotation: %m", full); + continue; + } + + /* Make some room in the set of deferred close()s */ + server_vacuum_deferred_closes(s); + + /* Open the file briefly, so that we can archive it */ + r = journal_file_open(fd, + full, + O_RDWR, + 0640, + s->compress.enabled, + s->compress.threshold_bytes, + s->seal, + &s->system_storage.metrics, + s->mmap, + s->deferred_closes, + NULL, + &f); + if (r < 0) { + log_warning_errno(r, "Failed to read journal file %s for rotation, trying to move it out of the way: %m", full); + + r = journal_file_dispose(dirfd(d), de->d_name); + if (r < 0) + log_warning_errno(r, "Failed to move %s out of the way, ignoring: %m", full); + else + log_debug("Successfully moved %s out of the way.", full); + + continue; + } + + TAKE_FD(fd); /* Donated to journal_file_open() */ + + r = journal_file_archive(f); + if (r < 0) + log_debug_errno(r, "Failed to archive journal file '%s', ignoring: %m", full); + + f = journal_initiate_close(f, s->deferred_closes); + } + + return 0; +} + +void server_rotate(Server *s) { + JournalFile *f; + void *k; + int r; + + log_debug("Rotating..."); + + /* First, rotate the system journal (either in its runtime flavour or in its runtime flavour) */ + (void) do_rotate(s, &s->runtime_journal, "runtime", false, 0); + (void) do_rotate(s, &s->system_journal, "system", s->seal, 0); + + /* Then, rotate all user journals we have open (keeping them open) */ + ORDERED_HASHMAP_FOREACH_KEY(f, k, s->user_journals) { + r = do_rotate(s, &f, "user", s->seal, PTR_TO_UID(k)); + if (r >= 0) + ordered_hashmap_replace(s->user_journals, k, f); + else if (!f) + /* Old file has been closed and deallocated */ + ordered_hashmap_remove(s->user_journals, k); + } + + /* Finally, also rotate all user journals we currently do not have open. (But do so only if we + * actually have access to /var, i.e. are not in the log-to-runtime-journal mode). */ + if (!s->runtime_journal) + (void) vacuum_offline_user_journals(s); + + server_process_deferred_closes(s); +} + +void server_sync(Server *s) { + JournalFile *f; + int r; + + if (s->system_journal) { + r = journal_file_set_offline(s->system_journal, false); + if (r < 0) + log_warning_errno(r, "Failed to sync system journal, ignoring: %m"); + } + + ORDERED_HASHMAP_FOREACH(f, s->user_journals) { + r = journal_file_set_offline(f, false); + if (r < 0) + log_warning_errno(r, "Failed to sync user journal, ignoring: %m"); + } + + if (s->sync_event_source) { + r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_OFF); + if (r < 0) + log_error_errno(r, "Failed to disable sync timer source: %m"); + } + + s->sync_scheduled = false; +} + +static void do_vacuum(Server *s, JournalStorage *storage, bool verbose) { + + int r; + + assert(s); + assert(storage); + + (void) cache_space_refresh(s, storage); + + if (verbose) + server_space_usage_message(s, storage); + + r = journal_directory_vacuum(storage->path, storage->space.limit, + storage->metrics.n_max_files, s->max_retention_usec, + &s->oldest_file_usec, verbose); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to vacuum %s, ignoring: %m", storage->path); + + cache_space_invalidate(&storage->space); +} + +int server_vacuum(Server *s, bool verbose) { + assert(s); + + log_debug("Vacuuming..."); + + s->oldest_file_usec = 0; + + if (s->system_journal) + do_vacuum(s, &s->system_storage, verbose); + if (s->runtime_journal) + do_vacuum(s, &s->runtime_storage, verbose); + + return 0; +} + +static void server_cache_machine_id(Server *s) { + sd_id128_t id; + int r; + + assert(s); + + r = sd_id128_get_machine(&id); + if (r < 0) + return; + + sd_id128_to_string(id, stpcpy(s->machine_id_field, "_MACHINE_ID=")); +} + +static void server_cache_boot_id(Server *s) { + sd_id128_t id; + int r; + + assert(s); + + r = sd_id128_get_boot(&id); + if (r < 0) + return; + + sd_id128_to_string(id, stpcpy(s->boot_id_field, "_BOOT_ID=")); +} + +static void server_cache_hostname(Server *s) { + _cleanup_free_ char *t = NULL; + char *x; + + assert(s); + + t = gethostname_malloc(); + if (!t) + return; + + x = strjoin("_HOSTNAME=", t); + if (!x) + return; + + free_and_replace(s->hostname_field, x); +} + +static bool shall_try_append_again(JournalFile *f, int r) { + switch(r) { + + case -E2BIG: /* Hit configured limit */ + case -EFBIG: /* Hit fs limit */ + case -EDQUOT: /* Quota limit hit */ + case -ENOSPC: /* Disk full */ + log_debug("%s: Allocation limit reached, rotating.", f->path); + return true; + + case -EIO: /* I/O error of some kind (mmap) */ + log_warning("%s: IO error, rotating.", f->path); + return true; + + case -EHOSTDOWN: /* Other machine */ + log_info("%s: Journal file from other machine, rotating.", f->path); + return true; + + case -EBUSY: /* Unclean shutdown */ + log_info("%s: Unclean shutdown, rotating.", f->path); + return true; + + case -EPROTONOSUPPORT: /* Unsupported feature */ + log_info("%s: Unsupported feature, rotating.", f->path); + return true; + + case -EBADMSG: /* Corrupted */ + case -ENODATA: /* Truncated */ + case -ESHUTDOWN: /* Already archived */ + log_warning("%s: Journal file corrupted, rotating.", f->path); + return true; + + case -EIDRM: /* Journal file has been deleted */ + log_warning("%s: Journal file has been deleted, rotating.", f->path); + return true; + + case -ETXTBSY: /* Journal file is from the future */ + log_warning("%s: Journal file is from the future, rotating.", f->path); + return true; + + case -EAFNOSUPPORT: + log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path); + return false; + + default: + return false; + } +} + +static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) { + bool vacuumed = false, rotate = false; + struct dual_timestamp ts; + JournalFile *f; + int r; + + assert(s); + assert(iovec); + assert(n > 0); + + /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use + * the source time, and not even the time the event was originally seen, but instead simply the time we started + * processing it, as we want strictly linear ordering in what we write out.) */ + assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0); + assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0); + + if (ts.realtime < s->last_realtime_clock) { + /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during + * regular operation. However, when it does happen, then we should make sure that we start fresh files + * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure + * bisection works correctly. */ + + log_debug("Time jumped backwards, rotating."); + rotate = true; + } else { + + f = find_journal(s, uid); + if (!f) + return; + + if (journal_file_rotate_suggested(f, s->max_file_usec)) { + log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path); + rotate = true; + } + } + + if (rotate) { + server_rotate(s); + server_vacuum(s, false); + vacuumed = true; + + f = find_journal(s, uid); + if (!f) + return; + } + + s->last_realtime_clock = ts.realtime; + + r = journal_file_append_entry(f, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL); + if (r >= 0) { + server_schedule_sync(s, priority); + return; + } + + if (vacuumed || !shall_try_append_again(f, r)) { + log_error_errno(r, "Failed to write entry (%zu items, %zu bytes), ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n)); + return; + } + + server_rotate(s); + server_vacuum(s, false); + + f = find_journal(s, uid); + if (!f) + return; + + log_debug("Retrying write."); + r = journal_file_append_entry(f, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL); + if (r < 0) + log_error_errno(r, "Failed to write entry (%zu items, %zu bytes) despite vacuuming, ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n)); + else + server_schedule_sync(s, priority); +} + +#define IOVEC_ADD_NUMERIC_FIELD(iovec, n, value, type, isset, format, field) \ + if (isset(value)) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + DECIMAL_STR_MAX(type) + 1); \ + sprintf(k, field "=" format, value); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_STRING_FIELD(iovec, n, value, field) \ + if (!isempty(value)) { \ + char *k; \ + k = strjoina(field "=", value); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_ID128_FIELD(iovec, n, value, field) \ + if (!sd_id128_is_null(value)) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + SD_ID128_STRING_MAX); \ + sd_id128_to_string(value, stpcpy(k, field "=")); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_SIZED_FIELD(iovec, n, value, value_size, field) \ + if (value_size > 0) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + value_size + 1); \ + *((char*) mempcpy(stpcpy(k, field "="), value, value_size)) = 0; \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } \ + +static void dispatch_message_real( + Server *s, + struct iovec *iovec, size_t n, size_t m, + const ClientContext *c, + const struct timeval *tv, + int priority, + pid_t object_pid) { + + char source_time[sizeof("_SOURCE_REALTIME_TIMESTAMP=") + DECIMAL_STR_MAX(usec_t)]; + _cleanup_free_ char *cmdline1 = NULL, *cmdline2 = NULL; + uid_t journal_uid; + ClientContext *o; + + assert(s); + assert(iovec); + assert(n > 0); + assert(n + + N_IOVEC_META_FIELDS + + (pid_is_valid(object_pid) ? N_IOVEC_OBJECT_FIELDS : 0) + + client_context_extra_fields_n_iovec(c) <= m); + + if (c) { + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->pid, pid_t, pid_is_valid, PID_FMT, "_PID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->uid, uid_t, uid_is_valid, UID_FMT, "_UID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->gid, gid_t, gid_is_valid, GID_FMT, "_GID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->comm, "_COMM"); /* At most TASK_COMM_LENGTH (16 bytes) */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->exe, "_EXE"); /* A path, so at most PATH_MAX (4096 bytes) */ + + if (c->cmdline) + /* At most _SC_ARG_MAX (2MB usually), which is too much to put on stack. + * Let's use a heap allocation for this one. */ + cmdline1 = set_iovec_string_field(iovec, &n, "_CMDLINE=", c->cmdline); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->capeff, "_CAP_EFFECTIVE"); /* Read from /proc/.../status */ + IOVEC_ADD_SIZED_FIELD(iovec, n, c->label, c->label_size, "_SELINUX_CONTEXT"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "_AUDIT_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->loginuid, uid_t, uid_is_valid, UID_FMT, "_AUDIT_LOGINUID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->cgroup, "_SYSTEMD_CGROUP"); /* A path */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->session, "_SYSTEMD_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->owner_uid, uid_t, uid_is_valid, UID_FMT, "_SYSTEMD_OWNER_UID"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->unit, "_SYSTEMD_UNIT"); /* Unit names are bounded by UNIT_NAME_MAX */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->user_unit, "_SYSTEMD_USER_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->slice, "_SYSTEMD_SLICE"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->user_slice, "_SYSTEMD_USER_SLICE"); + + IOVEC_ADD_ID128_FIELD(iovec, n, c->invocation_id, "_SYSTEMD_INVOCATION_ID"); + + if (c->extra_fields_n_iovec > 0) { + memcpy(iovec + n, c->extra_fields_iovec, c->extra_fields_n_iovec * sizeof(struct iovec)); + n += c->extra_fields_n_iovec; + } + } + + assert(n <= m); + + if (pid_is_valid(object_pid) && client_context_get(s, object_pid, NULL, NULL, 0, NULL, &o) >= 0) { + + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->pid, pid_t, pid_is_valid, PID_FMT, "OBJECT_PID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_UID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->gid, gid_t, gid_is_valid, GID_FMT, "OBJECT_GID"); + + /* See above for size limits, only ->cmdline may be large, so use a heap allocation for it. */ + IOVEC_ADD_STRING_FIELD(iovec, n, o->comm, "OBJECT_COMM"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->exe, "OBJECT_EXE"); + if (o->cmdline) + cmdline2 = set_iovec_string_field(iovec, &n, "OBJECT_CMDLINE=", o->cmdline); + + IOVEC_ADD_STRING_FIELD(iovec, n, o->capeff, "OBJECT_CAP_EFFECTIVE"); + IOVEC_ADD_SIZED_FIELD(iovec, n, o->label, o->label_size, "OBJECT_SELINUX_CONTEXT"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "OBJECT_AUDIT_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->loginuid, uid_t, uid_is_valid, UID_FMT, "OBJECT_AUDIT_LOGINUID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, o->cgroup, "OBJECT_SYSTEMD_CGROUP"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->session, "OBJECT_SYSTEMD_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->owner_uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_SYSTEMD_OWNER_UID"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->unit, "OBJECT_SYSTEMD_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->user_unit, "OBJECT_SYSTEMD_USER_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->slice, "OBJECT_SYSTEMD_SLICE"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->user_slice, "OBJECT_SYSTEMD_USER_SLICE"); + + IOVEC_ADD_ID128_FIELD(iovec, n, o->invocation_id, "OBJECT_SYSTEMD_INVOCATION_ID="); + } + + assert(n <= m); + + if (tv) { + sprintf(source_time, "_SOURCE_REALTIME_TIMESTAMP=" USEC_FMT, timeval_load(tv)); + iovec[n++] = IOVEC_MAKE_STRING(source_time); + } + + /* Note that strictly speaking storing the boot id here is + * redundant since the entry includes this in-line + * anyway. However, we need this indexed, too. */ + if (!isempty(s->boot_id_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->boot_id_field); + + if (!isempty(s->machine_id_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->machine_id_field); + + if (!isempty(s->hostname_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->hostname_field); + + if (!isempty(s->namespace_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->namespace_field); + + assert(n <= m); + + if (s->split_mode == SPLIT_UID && c && uid_is_valid(c->uid)) + /* Split up strictly by (non-root) UID */ + journal_uid = c->uid; + else if (s->split_mode == SPLIT_LOGIN && c && c->uid > 0 && uid_is_valid(c->owner_uid)) + /* Split up by login UIDs. We do this only if the + * realuid is not root, in order not to accidentally + * leak privileged information to the user that is + * logged by a privileged process that is part of an + * unprivileged session. */ + journal_uid = c->owner_uid; + else + journal_uid = 0; + + write_to_journal(s, journal_uid, iovec, n, priority); +} + +void server_driver_message(Server *s, pid_t object_pid, const char *message_id, const char *format, ...) { + + struct iovec *iovec; + size_t n = 0, k, m; + va_list ap; + int r; + + assert(s); + assert(format); + + m = N_IOVEC_META_FIELDS + 5 + N_IOVEC_PAYLOAD_FIELDS + client_context_extra_fields_n_iovec(s->my_context) + N_IOVEC_OBJECT_FIELDS; + iovec = newa(struct iovec, m); + + assert_cc(3 == LOG_FAC(LOG_DAEMON)); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=3"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=systemd-journald"); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=driver"); + assert_cc(6 == LOG_INFO); + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=6"); + + if (message_id) + iovec[n++] = IOVEC_MAKE_STRING(message_id); + k = n; + + va_start(ap, format); + r = log_format_iovec(iovec, m, &n, false, 0, format, ap); + /* Error handling below */ + va_end(ap); + + if (r >= 0) + dispatch_message_real(s, iovec, n, m, s->my_context, NULL, LOG_INFO, object_pid); + + while (k < n) + free(iovec[k++].iov_base); + + if (r < 0) { + /* We failed to format the message. Emit a warning instead. */ + char buf[LINE_MAX]; + + xsprintf(buf, "MESSAGE=Entry printing failed: %s", strerror_safe(r)); + + n = 3; + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=4"); + iovec[n++] = IOVEC_MAKE_STRING(buf); + dispatch_message_real(s, iovec, n, m, s->my_context, NULL, LOG_INFO, object_pid); + } +} + +void server_dispatch_message( + Server *s, + struct iovec *iovec, size_t n, size_t m, + ClientContext *c, + const struct timeval *tv, + int priority, + pid_t object_pid) { + + uint64_t available = 0; + int rl; + + assert(s); + assert(iovec || n == 0); + + if (n == 0) + return; + + if (LOG_PRI(priority) > s->max_level_store) + return; + + /* Stop early in case the information will not be stored + * in a journal. */ + if (s->storage == STORAGE_NONE) + return; + + if (c && c->unit) { + (void) determine_space(s, &available, NULL); + + rl = journal_ratelimit_test(s->ratelimit, c->unit, c->log_ratelimit_interval, c->log_ratelimit_burst, priority & LOG_PRIMASK, available); + if (rl == 0) + return; + + /* Write a suppression message if we suppressed something */ + if (rl > 1) + server_driver_message(s, c->pid, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_DROPPED_STR, + LOG_MESSAGE("Suppressed %i messages from %s", rl - 1, c->unit), + "N_DROPPED=%i", rl - 1, + NULL); + } + + dispatch_message_real(s, iovec, n, m, c, tv, priority, object_pid); +} + +int server_flush_to_var(Server *s, bool require_flag_file) { + char ts[FORMAT_TIMESPAN_MAX]; + sd_journal *j = NULL; + const char *fn; + unsigned n = 0; + usec_t start; + int r, k; + + assert(s); + + if (!IN_SET(s->storage, STORAGE_AUTO, STORAGE_PERSISTENT)) + return 0; + + if (s->namespace) /* Flushing concept does not exist for namespace instances */ + return 0; + + if (!s->runtime_journal) /* Nothing to flush? */ + return 0; + + if (require_flag_file && !flushed_flag_is_set(s)) + return 0; + + (void) system_journal_open(s, true, false); + + if (!s->system_journal) + return 0; + + log_debug("Flushing to %s...", s->system_storage.path); + + start = now(CLOCK_MONOTONIC); + + r = sd_journal_open(&j, SD_JOURNAL_RUNTIME_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to read runtime journal: %m"); + + sd_journal_set_data_threshold(j, 0); + + SD_JOURNAL_FOREACH(j) { + Object *o = NULL; + JournalFile *f; + + f = j->current_file; + assert(f && f->current_offset > 0); + + n++; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) { + log_error_errno(r, "Can't read entry: %m"); + goto finish; + } + + r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset); + if (r >= 0) + continue; + + if (!shall_try_append_again(s->system_journal, r)) { + log_error_errno(r, "Can't write entry: %m"); + goto finish; + } + + server_rotate(s); + server_vacuum(s, false); + + if (!s->system_journal) { + log_notice("Didn't flush runtime journal since rotation of system journal wasn't successful."); + r = -EIO; + goto finish; + } + + log_debug("Retrying write."); + r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset); + if (r < 0) { + log_error_errno(r, "Can't write entry: %m"); + goto finish; + } + } + + r = 0; + +finish: + if (s->system_journal) + journal_file_post_change(s->system_journal); + + s->runtime_journal = journal_file_close(s->runtime_journal); + + if (r >= 0) + (void) rm_rf(s->runtime_storage.path, REMOVE_ROOT); + + sd_journal_close(j); + + server_driver_message(s, 0, NULL, + LOG_MESSAGE("Time spent on flushing to %s is %s for %u entries.", + s->system_storage.path, + format_timespan(ts, sizeof(ts), now(CLOCK_MONOTONIC) - start, 0), + n), + NULL); + + fn = strjoina(s->runtime_directory, "/flushed"); + k = touch(fn); + if (k < 0) + log_warning_errno(k, "Failed to touch %s, ignoring: %m", fn); + + server_refresh_idle_timer(s); + return r; +} + +static int server_relinquish_var(Server *s) { + const char *fn; + assert(s); + + if (s->storage == STORAGE_NONE) + return 0; + + if (s->namespace) /* Concept does not exist for namespaced instances */ + return -EOPNOTSUPP; + + if (s->runtime_journal && !s->system_journal) + return 0; + + log_debug("Relinquishing %s...", s->system_storage.path); + + (void) system_journal_open(s, false, true); + + s->system_journal = journal_file_close(s->system_journal); + ordered_hashmap_clear_with_destructor(s->user_journals, journal_file_close); + set_clear_with_destructor(s->deferred_closes, journal_file_close); + + fn = strjoina(s->runtime_directory, "/flushed"); + if (unlink(fn) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to unlink %s, ignoring: %m", fn); + + server_refresh_idle_timer(s); + return 0; +} + +int server_process_datagram( + sd_event_source *es, + int fd, + uint32_t revents, + void *userdata) { + + Server *s = userdata; + struct ucred *ucred = NULL; + struct timeval *tv = NULL; + struct cmsghdr *cmsg; + char *label = NULL; + size_t label_len = 0, m; + struct iovec iovec; + ssize_t n; + int *fds = NULL, v = 0; + size_t n_fds = 0; + + /* We use NAME_MAX space for the SELinux label here. The kernel currently enforces no limit, but + * according to suggestions from the SELinux people this will change and it will probably be + * identical to NAME_MAX. For now we use that, but this should be updated one day when the final + * limit is known. */ + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(struct timeval)) + + CMSG_SPACE(sizeof(int)) + /* fd */ + CMSG_SPACE(NAME_MAX) /* selinux label */) control; + + union sockaddr_union sa = {}; + + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_name = &sa, + .msg_namelen = sizeof(sa), + }; + + assert(s); + assert(fd == s->native_fd || fd == s->syslog_fd || fd == s->audit_fd); + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Got invalid event from epoll for datagram fd: %" PRIx32, + revents); + + /* Try to get the right size, if we can. (Not all sockets support SIOCINQ, hence we just try, but don't rely on + * it.) */ + (void) ioctl(fd, SIOCINQ, &v); + + /* Fix it up, if it is too small. We use the same fixed value as auditd here. Awful! */ + m = PAGE_ALIGN(MAX3((size_t) v + 1, + (size_t) LINE_MAX, + ALIGN(sizeof(struct nlmsghdr)) + ALIGN((size_t) MAX_AUDIT_MESSAGE_LENGTH)) + 1); + + if (!GREEDY_REALLOC(s->buffer, s->buffer_size, m)) + return log_oom(); + + iovec = IOVEC_MAKE(s->buffer, s->buffer_size - 1); /* Leave room for trailing NUL we add later */ + + n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (IN_SET(n, -EINTR, -EAGAIN)) + return 0; + if (n == -EXFULL) { + log_warning("Got message with truncated control data (too many fds sent?), ignoring."); + return 0; + } + if (n < 0) + return log_error_errno(n, "recvmsg() failed: %m"); + + CMSG_FOREACH(cmsg, &msghdr) + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + assert(!ucred); + ucred = (struct ucred*) CMSG_DATA(cmsg); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_SECURITY) { + assert(!label); + label = (char*) CMSG_DATA(cmsg); + label_len = cmsg->cmsg_len - CMSG_LEN(0); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SO_TIMESTAMP && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct timeval))) { + assert(!tv); + tv = (struct timeval*) CMSG_DATA(cmsg); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + assert(!fds); + fds = (int*) CMSG_DATA(cmsg); + n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + } + + /* And a trailing NUL, just in case */ + s->buffer[n] = 0; + + if (fd == s->syslog_fd) { + if (n > 0 && n_fds == 0) + server_process_syslog_message(s, s->buffer, n, ucred, tv, label, label_len); + else if (n_fds > 0) + log_warning("Got file descriptors via syslog socket. Ignoring."); + + } else if (fd == s->native_fd) { + if (n > 0 && n_fds == 0) + server_process_native_message(s, s->buffer, n, ucred, tv, label, label_len); + else if (n == 0 && n_fds == 1) + server_process_native_file(s, fds[0], ucred, tv, label, label_len); + else if (n_fds > 0) + log_warning("Got too many file descriptors via native socket. Ignoring."); + + } else { + assert(fd == s->audit_fd); + + if (n > 0 && n_fds == 0) + server_process_audit_message(s, s->buffer, n, ucred, &sa, msghdr.msg_namelen); + else if (n_fds > 0) + log_warning("Got file descriptors via audit socket. Ignoring."); + } + + close_many(fds, n_fds); + + server_refresh_idle_timer(s); + return 0; +} + +static void server_full_flush(Server *s) { + assert(s); + + (void) server_flush_to_var(s, false); + server_sync(s); + server_vacuum(s, false); + + server_space_usage_message(s, NULL); + + server_refresh_idle_timer(s); +} + +static int dispatch_sigusr1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = userdata; + + assert(s); + + if (s->namespace) { + log_error("Received SIGUSR1 signal from PID " PID_FMT ", but flushing runtime journals not supported for namespaced instances.", si->ssi_pid); + return 0; + } + + log_info("Received SIGUSR1 signal from PID " PID_FMT ", as request to flush runtime journal.", si->ssi_pid); + server_full_flush(s); + + return 0; +} + +static void server_full_rotate(Server *s) { + const char *fn; + int r; + + assert(s); + + server_rotate(s); + server_vacuum(s, true); + + if (s->system_journal) + patch_min_use(&s->system_storage); + if (s->runtime_journal) + patch_min_use(&s->runtime_storage); + + /* Let clients know when the most recent rotation happened. */ + fn = strjoina(s->runtime_directory, "/rotated"); + r = write_timestamp_file_atomic(fn, now(CLOCK_MONOTONIC)); + if (r < 0) + log_warning_errno(r, "Failed to write %s, ignoring: %m", fn); +} + +static int dispatch_sigusr2(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = userdata; + + assert(s); + + log_info("Received SIGUSR2 signal from PID " PID_FMT ", as request to rotate journal.", si->ssi_pid); + server_full_rotate(s); + + return 0; +} + +static int dispatch_sigterm(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = userdata; + + assert(s); + + log_received_signal(LOG_INFO, si); + + sd_event_exit(s->event, 0); + return 0; +} + +static void server_full_sync(Server *s) { + const char *fn; + int r; + + assert(s); + + server_sync(s); + + /* Let clients know when the most recent sync happened. */ + fn = strjoina(s->runtime_directory, "/synced"); + r = write_timestamp_file_atomic(fn, now(CLOCK_MONOTONIC)); + if (r < 0) + log_warning_errno(r, "Failed to write %s, ignoring: %m", fn); + + return; +} + +static int dispatch_sigrtmin1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = userdata; + + assert(s); + + log_debug("Received SIGRTMIN1 signal from PID " PID_FMT ", as request to sync.", si->ssi_pid ); + server_full_sync(s); + + return 0; +} + +static int setup_signals(Server *s) { + int r; + + assert(s); + + assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, SIGUSR1, SIGUSR2, SIGRTMIN+1, -1) >= 0); + + r = sd_event_add_signal(s->event, &s->sigusr1_event_source, SIGUSR1, dispatch_sigusr1, s); + if (r < 0) + return r; + + r = sd_event_add_signal(s->event, &s->sigusr2_event_source, SIGUSR2, dispatch_sigusr2, s); + if (r < 0) + return r; + + r = sd_event_add_signal(s->event, &s->sigterm_event_source, SIGTERM, dispatch_sigterm, s); + if (r < 0) + return r; + + /* Let's process SIGTERM late, so that we flush all queued messages to disk before we exit */ + r = sd_event_source_set_priority(s->sigterm_event_source, SD_EVENT_PRIORITY_NORMAL+20); + if (r < 0) + return r; + + /* When journald is invoked on the terminal (when debugging), it's useful if C-c is handled + * equivalent to SIGTERM. */ + r = sd_event_add_signal(s->event, &s->sigint_event_source, SIGINT, dispatch_sigterm, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sigint_event_source, SD_EVENT_PRIORITY_NORMAL+20); + if (r < 0) + return r; + + /* SIGRTMIN+1 causes an immediate sync. We process this very late, so that everything else queued at + * this point is really written to disk. Clients can watch /run/systemd/journal/synced with inotify + * until its mtime changes to see when a sync happened. */ + r = sd_event_add_signal(s->event, &s->sigrtmin1_event_source, SIGRTMIN+1, dispatch_sigrtmin1, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sigrtmin1_event_source, SD_EVENT_PRIORITY_NORMAL+15); + if (r < 0) + return r; + + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + Server *s = data; + int r; + + assert(s); + + if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_syslog")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to syslog switch \"%s\". Ignoring.", value); + else + s->forward_to_syslog = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_kmsg")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to kmsg switch \"%s\". Ignoring.", value); + else + s->forward_to_kmsg = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_console")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to console switch \"%s\". Ignoring.", value); + else + s->forward_to_console = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_wall")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to wall switch \"%s\". Ignoring.", value); + else + s->forward_to_wall = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_console")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level console value \"%s\". Ignoring.", value); + else + s->max_level_console = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_store")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level store value \"%s\". Ignoring.", value); + else + s->max_level_store = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_syslog")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level syslog value \"%s\". Ignoring.", value); + else + s->max_level_syslog = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_kmsg")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level kmsg value \"%s\". Ignoring.", value); + else + s->max_level_kmsg = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_wall")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level wall value \"%s\". Ignoring.", value); + else + s->max_level_wall = r; + + } else if (startswith(key, "systemd.journald")) + log_warning("Unknown journald kernel command line option \"%s\". Ignoring.", key); + + /* do not warn about state here, since probably systemd already did */ + return 0; +} + +static int server_parse_config_file(Server *s) { + int r; + + assert(s); + + if (s->namespace) { + const char *namespaced; + + /* If we are running in namespace mode, load the namespace specific configuration file, and nothing else */ + namespaced = strjoina(PKGSYSCONFDIR "/journald@", s->namespace, ".conf"); + + r = config_parse(NULL, + namespaced, NULL, + "Journal\0", + config_item_perf_lookup, journald_gperf_lookup, + CONFIG_PARSE_WARN, s, + NULL); + if (r < 0) + return r; + + return 0; + } + + return config_parse_many_nulstr( + PKGSYSCONFDIR "/journald.conf", + CONF_PATHS_NULSTR("systemd/journald.conf.d"), + "Journal\0", + config_item_perf_lookup, journald_gperf_lookup, + CONFIG_PARSE_WARN, s, NULL); +} + +static int server_dispatch_sync(sd_event_source *es, usec_t t, void *userdata) { + Server *s = userdata; + + assert(s); + + server_sync(s); + return 0; +} + +int server_schedule_sync(Server *s, int priority) { + int r; + + assert(s); + + if (priority <= LOG_CRIT) { + /* Immediately sync to disk when this is of priority CRIT, ALERT, EMERG */ + server_sync(s); + return 0; + } + + if (s->sync_scheduled) + return 0; + + if (s->sync_interval_usec > 0) { + + if (!s->sync_event_source) { + r = sd_event_add_time_relative( + s->event, + &s->sync_event_source, + CLOCK_MONOTONIC, + s->sync_interval_usec, 0, + server_dispatch_sync, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sync_event_source, SD_EVENT_PRIORITY_IMPORTANT); + } else { + r = sd_event_source_set_time_relative(s->sync_event_source, s->sync_interval_usec); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_ONESHOT); + } + if (r < 0) + return r; + + s->sync_scheduled = true; + } + + return 0; +} + +static int dispatch_hostname_change(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = userdata; + + assert(s); + + server_cache_hostname(s); + return 0; +} + +static int server_open_hostname(Server *s) { + int r; + + assert(s); + + s->hostname_fd = open("/proc/sys/kernel/hostname", + O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (s->hostname_fd < 0) + return log_error_errno(errno, "Failed to open /proc/sys/kernel/hostname: %m"); + + r = sd_event_add_io(s->event, &s->hostname_event_source, s->hostname_fd, 0, dispatch_hostname_change, s); + if (r < 0) { + /* kernels prior to 3.2 don't support polling this file. Ignore + * the failure. */ + if (r == -EPERM) { + log_warning_errno(r, "Failed to register hostname fd in event loop, ignoring: %m"); + s->hostname_fd = safe_close(s->hostname_fd); + return 0; + } + + return log_error_errno(r, "Failed to register hostname fd in event loop: %m"); + } + + r = sd_event_source_set_priority(s->hostname_event_source, SD_EVENT_PRIORITY_IMPORTANT-10); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of hostname event source: %m"); + + return 0; +} + +static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = userdata; + int r; + + assert(s); + assert(s->notify_event_source == es); + assert(s->notify_fd == fd); + + /* The $NOTIFY_SOCKET is writable again, now send exactly one + * message on it. Either it's the watchdog event, the initial + * READY=1 event or an stdout stream event. If there's nothing + * to write anymore, turn our event source off. The next time + * there's something to send it will be turned on again. */ + + if (!s->sent_notify_ready) { + static const char p[] = + "READY=1\n" + "STATUS=Processing requests..."; + ssize_t l; + + l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT); + if (l < 0) { + if (errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to send READY=1 notification message: %m"); + } + + s->sent_notify_ready = true; + log_debug("Sent READY=1 notification."); + + } else if (s->send_watchdog) { + + static const char p[] = + "WATCHDOG=1"; + + ssize_t l; + + l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT); + if (l < 0) { + if (errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to send WATCHDOG=1 notification message: %m"); + } + + s->send_watchdog = false; + log_debug("Sent WATCHDOG=1 notification."); + + } else if (s->stdout_streams_notify_queue) + /* Dispatch one stream notification event */ + stdout_stream_send_notify(s->stdout_streams_notify_queue); + + /* Leave us enabled if there's still more to do. */ + if (s->send_watchdog || s->stdout_streams_notify_queue) + return 0; + + /* There was nothing to do anymore, let's turn ourselves off. */ + r = sd_event_source_set_enabled(es, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to turn off notify event source: %m"); + + return 0; +} + +static int dispatch_watchdog(sd_event_source *es, uint64_t usec, void *userdata) { + Server *s = userdata; + int r; + + assert(s); + + s->send_watchdog = true; + + r = sd_event_source_set_enabled(s->notify_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to turn on notify event source: %m"); + + r = sd_event_source_set_time(s->watchdog_event_source, usec + s->watchdog_usec / 2); + if (r < 0) + return log_error_errno(r, "Failed to restart watchdog event source: %m"); + + r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to enable watchdog event source: %m"); + + return 0; +} + +static int server_connect_notify(Server *s) { + union sockaddr_union sa; + socklen_t sa_len; + const char *e; + int r; + + assert(s); + assert(s->notify_fd < 0); + assert(!s->notify_event_source); + + /* + * So here's the problem: we'd like to send notification messages to PID 1, but we cannot do that via + * sd_notify(), since that's synchronous, and we might end up blocking on it. Specifically: given + * that PID 1 might block on dbus-daemon during IPC, and dbus-daemon is logging to us, and might + * hence block on us, we might end up in a deadlock if we block on sending PID 1 notification + * messages — by generating a full blocking circle. To avoid this, let's create a non-blocking + * socket, and connect it to the notification socket, and then wait for POLLOUT before we send + * anything. This should efficiently avoid any deadlocks, as we'll never block on PID 1, hence PID 1 + * can safely block on dbus-daemon which can safely block on us again. + * + * Don't think that this issue is real? It is, see: https://github.com/systemd/systemd/issues/1505 + */ + + e = getenv("NOTIFY_SOCKET"); + if (!e) + return 0; + + r = sockaddr_un_set_path(&sa.un, e); + if (r < 0) + return log_error_errno(r, "NOTIFY_SOCKET set to invalid value '%s': %m", e); + sa_len = r; + + s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->notify_fd < 0) + return log_error_errno(errno, "Failed to create notify socket: %m"); + + (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE); + + r = connect(s->notify_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "Failed to connect to notify socket: %m"); + + r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s); + if (r < 0) + return log_error_errno(r, "Failed to watch notification socket: %m"); + + if (sd_watchdog_enabled(false, &s->watchdog_usec) > 0) { + s->send_watchdog = true; + + r = sd_event_add_time_relative(s->event, &s->watchdog_event_source, CLOCK_MONOTONIC, s->watchdog_usec/2, s->watchdog_usec/4, dispatch_watchdog, s); + if (r < 0) + return log_error_errno(r, "Failed to add watchdog time event: %m"); + } + + /* This should fire pretty soon, which we'll use to send the READY=1 event. */ + + return 0; +} + +static int synchronize_second_half(sd_event_source *event_source, void *userdata) { + Varlink *link = userdata; + Server *s; + int r; + + assert(link); + assert_se(s = varlink_get_userdata(link)); + + /* This is the "second half" of the Synchronize() varlink method. This function is called as deferred + * event source at a low priority to ensure the synchronization completes after all queued log + * messages are processed. */ + server_full_sync(s); + + /* Let's get rid of the event source now, by marking it as non-floating again. It then has no ref + * anymore and is immediately destroyed after we return from this function, i.e. from this event + * source handler at the end. */ + r = sd_event_source_set_floating(event_source, false); + if (r < 0) + return log_error_errno(r, "Failed to mark event source as non-floating: %m"); + + return varlink_reply(link, NULL); +} + +static void synchronize_destroy(void *userdata) { + varlink_unref(userdata); +} + +static int vl_method_synchronize(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(sd_event_source_unrefp) sd_event_source *event_source = NULL; + Server *s = userdata; + int r; + + assert(link); + assert(s); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + log_info("Received client request to rotate journal."); + + /* We don't do the main work now, but instead enqueue a deferred event loop job which will do + * it. That job is scheduled at low priority, so that we return from this method call only after all + * queued but not processed log messages are written to disk, so that this method call returning can + * be used as nice synchronization point. */ + r = sd_event_add_defer(s->event, &event_source, synchronize_second_half, link); + if (r < 0) + return log_error_errno(r, "Failed to allocate defer event source: %m"); + + r = sd_event_source_set_destroy_callback(event_source, synchronize_destroy); + if (r < 0) + return log_error_errno(r, "Failed to set event source destroy callback: %m"); + + varlink_ref(link); /* The varlink object is now left to the destroy callback to unref */ + + r = sd_event_source_set_priority(event_source, SD_EVENT_PRIORITY_NORMAL+15); + if (r < 0) + return log_error_errno(r, "Failed to set defer event source priority: %m"); + + /* Give up ownership of this event source. It will now be destroyed along with event loop itself, + * unless it destroys itself earlier. */ + r = sd_event_source_set_floating(event_source, true); + if (r < 0) + return log_error_errno(r, "Failed to mark event source as floating: %m"); + + (void) sd_event_source_set_description(event_source, "deferred-sync"); + + return 0; +} + +static int vl_method_rotate(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = userdata; + + assert(link); + assert(s); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + log_info("Received client request to rotate journal."); + server_full_rotate(s); + + return varlink_reply(link, NULL); +} + +static int vl_method_flush_to_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = userdata; + + assert(link); + assert(s); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + if (s->namespace) + return varlink_error(link, "io.systemd.Journal.NotSupportedByNamespaces", NULL); + + log_info("Received client request to flush runtime journal."); + server_full_flush(s); + + return varlink_reply(link, NULL); +} + +static int vl_method_relinquish_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = userdata; + + assert(link); + assert(s); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + if (s->namespace) + return varlink_error(link, "io.systemd.Journal.NotSupportedByNamespaces", NULL); + + log_info("Received client request to relinquish %s access.", s->system_storage.path); + server_relinquish_var(s); + + return varlink_reply(link, NULL); +} + +static int vl_connect(VarlinkServer *server, Varlink *link, void *userdata) { + Server *s = userdata; + + assert(server); + assert(link); + assert(s); + + (void) server_start_or_stop_idle_timer(s); /* maybe we are no longer idle */ + + return 0; +} + +static void vl_disconnect(VarlinkServer *server, Varlink *link, void *userdata) { + Server *s = userdata; + + assert(server); + assert(link); + assert(s); + + (void) server_start_or_stop_idle_timer(s); /* maybe we are idle now */ +} + +static int server_open_varlink(Server *s, const char *socket, int fd) { + int r; + + assert(s); + + r = varlink_server_new(&s->varlink_server, VARLINK_SERVER_ROOT_ONLY); + if (r < 0) + return r; + + varlink_server_set_userdata(s->varlink_server, s); + + r = varlink_server_bind_method_many( + s->varlink_server, + "io.systemd.Journal.Synchronize", vl_method_synchronize, + "io.systemd.Journal.Rotate", vl_method_rotate, + "io.systemd.Journal.FlushToVar", vl_method_flush_to_var, + "io.systemd.Journal.RelinquishVar", vl_method_relinquish_var); + if (r < 0) + return r; + + r = varlink_server_bind_connect(s->varlink_server, vl_connect); + if (r < 0) + return r; + + r = varlink_server_bind_disconnect(s->varlink_server, vl_disconnect); + if (r < 0) + return r; + + if (fd < 0) + r = varlink_server_listen_address(s->varlink_server, socket, 0600); + else + r = varlink_server_listen_fd(s->varlink_server, fd); + if (r < 0) + return r; + + r = varlink_server_attach_event(s->varlink_server, s->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return r; + + return 0; +} + +static bool server_is_idle(Server *s) { + assert(s); + + /* The server for the main namespace is never idle */ + if (!s->namespace) + return false; + + /* If a retention maximum is set larger than the idle time we need to be running to enforce it, hence + * turn off the idle logic. */ + if (s->max_retention_usec > IDLE_TIMEOUT_USEC) + return false; + + /* We aren't idle if we have a varlink client */ + if (varlink_server_current_connections(s->varlink_server) > 0) + return false; + + /* If we have stdout streams we aren't idle */ + if (s->n_stdout_streams > 0) + return false; + + return true; +} + +static int server_idle_handler(sd_event_source *source, uint64_t usec, void *userdata) { + Server *s = userdata; + + assert(source); + assert(s); + + log_debug("Server is idle, exiting."); + sd_event_exit(s->event, 0); + return 0; +} + +int server_start_or_stop_idle_timer(Server *s) { + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + + if (!server_is_idle(s)) { + s->idle_event_source = sd_event_source_disable_unref(s->idle_event_source); + return 0; + } + + if (s->idle_event_source) + return 1; + + r = sd_event_add_time_relative(s->event, &source, CLOCK_MONOTONIC, IDLE_TIMEOUT_USEC, 0, server_idle_handler, s); + if (r < 0) + return log_error_errno(r, "Failed to allocate idle timer: %m"); + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to set idle timer priority: %m"); + + (void) sd_event_source_set_description(source, "idle-timer"); + + s->idle_event_source = TAKE_PTR(source); + return 1; +} + +int server_refresh_idle_timer(Server *s) { + int r; + + assert(s); + + if (!s->idle_event_source) + return 0; + + r = sd_event_source_set_time_relative(s->idle_event_source, IDLE_TIMEOUT_USEC); + if (r < 0) + return log_error_errno(r, "Failed to refresh idle timer: %m"); + + return 1; +} + +static int set_namespace(Server *s, const char *namespace) { + assert(s); + + if (!namespace) + return 0; + + if (!log_namespace_name_valid(namespace)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified namespace name not valid, refusing: %s", namespace); + + s->namespace = strdup(namespace); + if (!s->namespace) + return log_oom(); + + s->namespace_field = strjoin("_NAMESPACE=", namespace); + if (!s->namespace_field) + return log_oom(); + + return 1; +} + +int server_init(Server *s, const char *namespace) { + const char *native_socket, *syslog_socket, *stdout_socket, *varlink_socket, *e; + _cleanup_fdset_free_ FDSet *fds = NULL; + int n, r, fd, varlink_fd = -1; + bool no_sockets; + + assert(s); + + *s = (Server) { + .syslog_fd = -1, + .native_fd = -1, + .stdout_fd = -1, + .dev_kmsg_fd = -1, + .audit_fd = -1, + .hostname_fd = -1, + .notify_fd = -1, + + .compress.enabled = true, + .compress.threshold_bytes = (uint64_t) -1, + .seal = true, + + .set_audit = true, + + .watchdog_usec = USEC_INFINITY, + + .sync_interval_usec = DEFAULT_SYNC_INTERVAL_USEC, + .sync_scheduled = false, + + .ratelimit_interval = DEFAULT_RATE_LIMIT_INTERVAL, + .ratelimit_burst = DEFAULT_RATE_LIMIT_BURST, + + .forward_to_wall = true, + + .max_file_usec = DEFAULT_MAX_FILE_USEC, + + .max_level_store = LOG_DEBUG, + .max_level_syslog = LOG_DEBUG, + .max_level_kmsg = LOG_NOTICE, + .max_level_console = LOG_INFO, + .max_level_wall = LOG_EMERG, + + .line_max = DEFAULT_LINE_MAX, + + .runtime_storage.name = "Runtime Journal", + .system_storage.name = "System Journal", + }; + + r = set_namespace(s, namespace); + if (r < 0) + return r; + + /* By default, only read from /dev/kmsg if are the main namespace */ + s->read_kmsg = !s->namespace; + s->storage = s->namespace ? STORAGE_PERSISTENT : STORAGE_AUTO; + + journal_reset_metrics(&s->system_storage.metrics); + journal_reset_metrics(&s->runtime_storage.metrics); + + server_parse_config_file(s); + + if (!s->namespace) { + /* Parse kernel command line, but only if we are not a namespace instance */ + r = proc_cmdline_parse(parse_proc_cmdline_item, s, PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + } + + if (!!s->ratelimit_interval != !!s->ratelimit_burst) { /* One set to 0 and the other not? */ + log_debug("Setting both rate limit interval and burst from "USEC_FMT",%u to 0,0", + s->ratelimit_interval, s->ratelimit_burst); + s->ratelimit_interval = s->ratelimit_burst = 0; + } + + e = getenv("RUNTIME_DIRECTORY"); + if (e) + s->runtime_directory = strdup(e); + else if (s->namespace) + s->runtime_directory = strjoin("/run/systemd/journal.", s->namespace); + else + s->runtime_directory = strdup("/run/systemd/journal"); + if (!s->runtime_directory) + return log_oom(); + + (void) mkdir_p(s->runtime_directory, 0755); + + s->user_journals = ordered_hashmap_new(NULL); + if (!s->user_journals) + return log_oom(); + + s->mmap = mmap_cache_new(); + if (!s->mmap) + return log_oom(); + + s->deferred_closes = set_new(NULL); + if (!s->deferred_closes) + return log_oom(); + + r = sd_event_default(&s->event); + if (r < 0) + return log_error_errno(r, "Failed to create event loop: %m"); + + n = sd_listen_fds(true); + if (n < 0) + return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); + + native_socket = strjoina(s->runtime_directory, "/socket"); + stdout_socket = strjoina(s->runtime_directory, "/stdout"); + syslog_socket = strjoina(s->runtime_directory, "/dev-log"); + varlink_socket = strjoina(s->runtime_directory, "/io.systemd.journal"); + + for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) { + + if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, native_socket, 0) > 0) { + + if (s->native_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many native sockets passed."); + + s->native_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, stdout_socket, 0) > 0) { + + if (s->stdout_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many stdout sockets passed."); + + s->stdout_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, syslog_socket, 0) > 0) { + + if (s->syslog_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many /dev/log sockets passed."); + + s->syslog_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, varlink_socket, 0) > 0) { + + if (varlink_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many varlink sockets passed."); + + varlink_fd = fd; + } else if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, -1) > 0) { + + if (s->audit_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many audit sockets passed."); + + s->audit_fd = fd; + + } else { + + if (!fds) { + fds = fdset_new(); + if (!fds) + return log_oom(); + } + + r = fdset_put(fds, fd); + if (r < 0) + return log_oom(); + } + } + + /* Try to restore streams, but don't bother if this fails */ + (void) server_restore_streams(s, fds); + + if (fdset_size(fds) > 0) { + log_warning("%u unknown file descriptors passed, closing.", fdset_size(fds)); + fds = fdset_free(fds); + } + + no_sockets = s->native_fd < 0 && s->stdout_fd < 0 && s->syslog_fd < 0 && s->audit_fd < 0 && varlink_fd < 0; + + /* always open stdout, syslog, native, and kmsg sockets */ + + /* systemd-journald.socket: /run/systemd/journal/stdout */ + r = server_open_stdout_socket(s, stdout_socket); + if (r < 0) + return r; + + /* systemd-journald-dev-log.socket: /run/systemd/journal/dev-log */ + r = server_open_syslog_socket(s, syslog_socket); + if (r < 0) + return r; + + /* systemd-journald.socket: /run/systemd/journal/socket */ + r = server_open_native_socket(s, native_socket); + if (r < 0) + return r; + + /* /dev/kmsg */ + r = server_open_dev_kmsg(s); + if (r < 0) + return r; + + /* Unless we got *some* sockets and not audit, open audit socket */ + if (s->audit_fd >= 0 || no_sockets) { + r = server_open_audit(s); + if (r < 0) + return r; + } + + r = server_open_varlink(s, varlink_socket, varlink_fd); + if (r < 0) + return r; + + r = server_open_kernel_seqnum(s); + if (r < 0) + return r; + + r = server_open_hostname(s); + if (r < 0) + return r; + + r = setup_signals(s); + if (r < 0) + return r; + + s->ratelimit = journal_ratelimit_new(); + if (!s->ratelimit) + return log_oom(); + + r = cg_get_root_path(&s->cgroup_root); + if (r < 0) + return log_error_errno(r, "Failed to acquire cgroup root path: %m"); + + server_cache_hostname(s); + server_cache_boot_id(s); + server_cache_machine_id(s); + + if (s->namespace) + s->runtime_storage.path = strjoin("/run/log/journal/", SERVER_MACHINE_ID(s), ".", s->namespace); + else + s->runtime_storage.path = strjoin("/run/log/journal/", SERVER_MACHINE_ID(s)); + if (!s->runtime_storage.path) + return log_oom(); + + e = getenv("LOGS_DIRECTORY"); + if (e) + s->system_storage.path = strdup(e); + else if (s->namespace) + s->system_storage.path = strjoin("/var/log/journal/", SERVER_MACHINE_ID(s), ".", s->namespace); + else + s->system_storage.path = strjoin("/var/log/journal/", SERVER_MACHINE_ID(s)); + if (!s->system_storage.path) + return log_oom(); + + (void) server_connect_notify(s); + + (void) client_context_acquire_default(s); + + r = system_journal_open(s, false, false); + if (r < 0) + return r; + + server_start_or_stop_idle_timer(s); + return 0; +} + +void server_maybe_append_tags(Server *s) { +#if HAVE_GCRYPT + JournalFile *f; + usec_t n; + + n = now(CLOCK_REALTIME); + + if (s->system_journal) + journal_file_maybe_append_tag(s->system_journal, n); + + ORDERED_HASHMAP_FOREACH(f, s->user_journals) + journal_file_maybe_append_tag(f, n); +#endif +} + +void server_done(Server *s) { + assert(s); + + free(s->namespace); + free(s->namespace_field); + + set_free_with_destructor(s->deferred_closes, journal_file_close); + + while (s->stdout_streams) + stdout_stream_free(s->stdout_streams); + + client_context_flush_all(s); + + (void) journal_file_close(s->system_journal); + (void) journal_file_close(s->runtime_journal); + + ordered_hashmap_free_with_destructor(s->user_journals, journal_file_close); + + varlink_server_unref(s->varlink_server); + + sd_event_source_unref(s->syslog_event_source); + sd_event_source_unref(s->native_event_source); + sd_event_source_unref(s->stdout_event_source); + sd_event_source_unref(s->dev_kmsg_event_source); + sd_event_source_unref(s->audit_event_source); + sd_event_source_unref(s->sync_event_source); + sd_event_source_unref(s->sigusr1_event_source); + sd_event_source_unref(s->sigusr2_event_source); + sd_event_source_unref(s->sigterm_event_source); + sd_event_source_unref(s->sigint_event_source); + sd_event_source_unref(s->sigrtmin1_event_source); + sd_event_source_unref(s->hostname_event_source); + sd_event_source_unref(s->notify_event_source); + sd_event_source_unref(s->watchdog_event_source); + sd_event_source_unref(s->idle_event_source); + sd_event_unref(s->event); + + safe_close(s->syslog_fd); + safe_close(s->native_fd); + safe_close(s->stdout_fd); + safe_close(s->dev_kmsg_fd); + safe_close(s->audit_fd); + safe_close(s->hostname_fd); + safe_close(s->notify_fd); + + if (s->ratelimit) + journal_ratelimit_free(s->ratelimit); + + if (s->kernel_seqnum) + munmap(s->kernel_seqnum, sizeof(uint64_t)); + + free(s->buffer); + free(s->tty_path); + free(s->cgroup_root); + free(s->hostname_field); + free(s->runtime_storage.path); + free(s->system_storage.path); + free(s->runtime_directory); + + mmap_cache_unref(s->mmap); +} + +static const char* const storage_table[_STORAGE_MAX] = { + [STORAGE_AUTO] = "auto", + [STORAGE_VOLATILE] = "volatile", + [STORAGE_PERSISTENT] = "persistent", + [STORAGE_NONE] = "none" +}; + +DEFINE_STRING_TABLE_LOOKUP(storage, Storage); +DEFINE_CONFIG_PARSE_ENUM(config_parse_storage, storage, Storage, "Failed to parse storage setting"); + +static const char* const split_mode_table[_SPLIT_MAX] = { + [SPLIT_LOGIN] = "login", + [SPLIT_UID] = "uid", + [SPLIT_NONE] = "none", +}; + +DEFINE_STRING_TABLE_LOOKUP(split_mode, SplitMode); +DEFINE_CONFIG_PARSE_ENUM(config_parse_split_mode, split_mode, SplitMode, "Failed to parse split mode setting"); + +int config_parse_line_max( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + size_t *sz = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) + /* Empty assignment means default */ + *sz = DEFAULT_LINE_MAX; + else { + uint64_t v; + + r = parse_size(rvalue, 1024, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse LineMax= value, ignoring: %s", rvalue); + return 0; + } + + if (v < 79) { + /* Why specify 79 here as minimum line length? Simply, because the most common traditional + * terminal size is 80ch, and it might make sense to break one character before the natural + * line break would occur on that. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too small, clamping to 79: %s", rvalue); + *sz = 79; + } else if (v > (uint64_t) (SSIZE_MAX-1)) { + /* So, why specify SSIZE_MAX-1 here? Because that's one below the largest size value read() + * can return, and we need one extra byte for the trailing NUL byte. Of course IRL such large + * memory allocations will fail anyway, hence this limit is mostly theoretical anyway, as we'll + * fail much earlier anyway. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too large, clamping to %" PRIu64 ": %s", (uint64_t) (SSIZE_MAX-1), rvalue); + *sz = SSIZE_MAX-1; + } else + *sz = (size_t) v; + } + + return 0; +} + +int config_parse_compress( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + JournalCompressOptions* compress = data; + int r; + + if (isempty(rvalue)) { + compress->enabled = true; + compress->threshold_bytes = (uint64_t) -1; + } else if (streq(rvalue, "1")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Compress= ambiguously specified as 1, enabling compression with default threshold"); + compress->enabled = true; + } else if (streq(rvalue, "0")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Compress= ambiguously specified as 0, disabling compression"); + compress->enabled = false; + } else { + r = parse_boolean(rvalue); + if (r < 0) { + r = parse_size(rvalue, 1024, &compress->threshold_bytes); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse Compress= value, ignoring: %s", rvalue); + else + compress->enabled = true; + } else + compress->enabled = r; + } + + return 0; +} diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h new file mode 100644 index 0000000..5fb145e --- /dev/null +++ b/src/journal/journald-server.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <sys/types.h> + +#include "sd-event.h" + +typedef struct Server Server; + +#include "conf-parser.h" +#include "hashmap.h" +#include "journal-file.h" +#include "journald-context.h" +#include "journald-rate-limit.h" +#include "journald-stream.h" +#include "list.h" +#include "prioq.h" +#include "time-util.h" +#include "varlink.h" + +typedef enum Storage { + STORAGE_AUTO, + STORAGE_VOLATILE, + STORAGE_PERSISTENT, + STORAGE_NONE, + _STORAGE_MAX, + _STORAGE_INVALID = -1 +} Storage; + +typedef enum SplitMode { + SPLIT_UID, + SPLIT_LOGIN, /* deprecated */ + SPLIT_NONE, + _SPLIT_MAX, + _SPLIT_INVALID = -1 +} SplitMode; + +typedef struct JournalCompressOptions { + bool enabled; + uint64_t threshold_bytes; +} JournalCompressOptions; + +typedef struct JournalStorageSpace { + usec_t timestamp; + + uint64_t available; + uint64_t limit; + + uint64_t vfs_used; /* space used by journal files */ + uint64_t vfs_available; +} JournalStorageSpace; + +typedef struct JournalStorage { + const char *name; + char *path; + + JournalMetrics metrics; + JournalStorageSpace space; +} JournalStorage; + +struct Server { + char *namespace; + + int syslog_fd; + int native_fd; + int stdout_fd; + int dev_kmsg_fd; + int audit_fd; + int hostname_fd; + int notify_fd; + + sd_event *event; + + sd_event_source *syslog_event_source; + sd_event_source *native_event_source; + sd_event_source *stdout_event_source; + sd_event_source *dev_kmsg_event_source; + sd_event_source *audit_event_source; + sd_event_source *sync_event_source; + sd_event_source *sigusr1_event_source; + sd_event_source *sigusr2_event_source; + sd_event_source *sigterm_event_source; + sd_event_source *sigint_event_source; + sd_event_source *sigrtmin1_event_source; + sd_event_source *hostname_event_source; + sd_event_source *notify_event_source; + sd_event_source *watchdog_event_source; + sd_event_source *idle_event_source; + + JournalFile *runtime_journal; + JournalFile *system_journal; + OrderedHashmap *user_journals; + + uint64_t seqnum; + + char *buffer; + size_t buffer_size; + + JournalRateLimit *ratelimit; + usec_t sync_interval_usec; + usec_t ratelimit_interval; + unsigned ratelimit_burst; + + JournalStorage runtime_storage; + JournalStorage system_storage; + + JournalCompressOptions compress; + bool seal; + bool read_kmsg; + int set_audit; + + bool forward_to_kmsg; + bool forward_to_syslog; + bool forward_to_console; + bool forward_to_wall; + + unsigned n_forward_syslog_missed; + usec_t last_warn_forward_syslog_missed; + + usec_t max_retention_usec; + usec_t max_file_usec; + usec_t oldest_file_usec; + + LIST_HEAD(StdoutStream, stdout_streams); + LIST_HEAD(StdoutStream, stdout_streams_notify_queue); + unsigned n_stdout_streams; + + char *tty_path; + + int max_level_store; + int max_level_syslog; + int max_level_kmsg; + int max_level_console; + int max_level_wall; + + Storage storage; + SplitMode split_mode; + + MMapCache *mmap; + + Set *deferred_closes; + + uint64_t *kernel_seqnum; + bool dev_kmsg_readable:1; + + bool send_watchdog:1; + bool sent_notify_ready:1; + bool sync_scheduled:1; + + char machine_id_field[sizeof("_MACHINE_ID=") + 32]; + char boot_id_field[sizeof("_BOOT_ID=") + 32]; + char *hostname_field; + char *namespace_field; + char *runtime_directory; + + /* Cached cgroup root, so that we don't have to query that all the time */ + char *cgroup_root; + + usec_t watchdog_usec; + + usec_t last_realtime_clock; + + size_t line_max; + + /* Caching of client metadata */ + Hashmap *client_contexts; + Prioq *client_contexts_lru; + + usec_t last_cache_pid_flush; + + ClientContext *my_context; /* the context of journald itself */ + ClientContext *pid1_context; /* the context of PID 1 */ + + VarlinkServer *varlink_server; +}; + +#define SERVER_MACHINE_ID(s) ((s)->machine_id_field + STRLEN("_MACHINE_ID=")) + +/* Extra fields for any log messages */ +#define N_IOVEC_META_FIELDS 23 + +/* Extra fields for log messages that contain OBJECT_PID= (i.e. log about another process) */ +#define N_IOVEC_OBJECT_FIELDS 18 + +/* Maximum number of fields we'll add in for driver (i.e. internal) messages */ +#define N_IOVEC_PAYLOAD_FIELDS 16 + +/* kmsg: Maximum number of extra fields we'll import from the kernel's /dev/kmsg */ +#define N_IOVEC_KERNEL_FIELDS 64 + +/* kmsg: Maximum number of extra fields we'll import from udev's devices */ +#define N_IOVEC_UDEV_FIELDS 32 + +void server_dispatch_message(Server *s, struct iovec *iovec, size_t n, size_t m, ClientContext *c, const struct timeval *tv, int priority, pid_t object_pid); +void server_driver_message(Server *s, pid_t object_pid, const char *message_id, const char *format, ...) _sentinel_ _printf_(4,0); + +/* gperf lookup function */ +const struct ConfigPerfItem* journald_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +CONFIG_PARSER_PROTOTYPE(config_parse_storage); +CONFIG_PARSER_PROTOTYPE(config_parse_line_max); +CONFIG_PARSER_PROTOTYPE(config_parse_compress); + +const char *storage_to_string(Storage s) _const_; +Storage storage_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_split_mode); + +const char *split_mode_to_string(SplitMode s) _const_; +SplitMode split_mode_from_string(const char *s) _pure_; + +int server_init(Server *s, const char *namespace); +void server_done(Server *s); +void server_sync(Server *s); +int server_vacuum(Server *s, bool verbose); +void server_rotate(Server *s); +int server_schedule_sync(Server *s, int priority); +int server_flush_to_var(Server *s, bool require_flag_file); +void server_maybe_append_tags(Server *s); +int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata); +void server_space_usage_message(Server *s, JournalStorage *storage); + +int server_start_or_stop_idle_timer(Server *s); +int server_refresh_idle_timer(Server *s); diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c new file mode 100644 index 0000000..3241ef2 --- /dev/null +++ b/src/journal/journald-stream.c @@ -0,0 +1,963 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stddef.h> +#include <unistd.h> + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#include "sd-daemon.h" +#include "sd-event.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "env-file.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "journald-console.h" +#include "journald-context.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-stream.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "mkdir.h" +#include "parse-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" + +#define STDOUT_STREAMS_MAX 4096 + +typedef enum StdoutStreamState { + STDOUT_STREAM_IDENTIFIER, + STDOUT_STREAM_UNIT_ID, + STDOUT_STREAM_PRIORITY, + STDOUT_STREAM_LEVEL_PREFIX, + STDOUT_STREAM_FORWARD_TO_SYSLOG, + STDOUT_STREAM_FORWARD_TO_KMSG, + STDOUT_STREAM_FORWARD_TO_CONSOLE, + STDOUT_STREAM_RUNNING +} StdoutStreamState; + +/* The different types of log record terminators: a real \n was read, a NUL character was read, the maximum line length + * was reached, or the end of the stream was reached */ + +typedef enum LineBreak { + LINE_BREAK_NEWLINE, + LINE_BREAK_NUL, + LINE_BREAK_LINE_MAX, + LINE_BREAK_EOF, + LINE_BREAK_PID_CHANGE, + _LINE_BREAK_MAX, + _LINE_BREAK_INVALID = -1, +} LineBreak; + +struct StdoutStream { + Server *server; + StdoutStreamState state; + + int fd; + + struct ucred ucred; + char *label; + char *identifier; + char *unit_id; + int priority; + bool level_prefix:1; + bool forward_to_syslog:1; + bool forward_to_kmsg:1; + bool forward_to_console:1; + + bool fdstore:1; + bool in_notify_queue:1; + + char *buffer; + size_t length; + size_t allocated; + + sd_event_source *event_source; + + char *state_file; + + ClientContext *context; + + LIST_FIELDS(StdoutStream, stdout_stream); + LIST_FIELDS(StdoutStream, stdout_stream_notify_queue); + + char id_field[STRLEN("_STREAM_ID=") + SD_ID128_STRING_MAX]; +}; + +void stdout_stream_free(StdoutStream *s) { + if (!s) + return; + + if (s->server) { + + if (s->context) + client_context_release(s->server, s->context); + + assert(s->server->n_stdout_streams > 0); + s->server->n_stdout_streams--; + LIST_REMOVE(stdout_stream, s->server->stdout_streams, s); + + if (s->in_notify_queue) + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + + (void) server_start_or_stop_idle_timer(s->server); /* Maybe we are idle now? */ + } + + if (s->event_source) { + sd_event_source_set_enabled(s->event_source, SD_EVENT_OFF); + s->event_source = sd_event_source_unref(s->event_source); + } + + safe_close(s->fd); + free(s->label); + free(s->identifier); + free(s->unit_id); + free(s->state_file); + free(s->buffer); + + free(s); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(StdoutStream*, stdout_stream_free); + +void stdout_stream_destroy(StdoutStream *s) { + if (!s) + return; + + if (s->state_file) + (void) unlink(s->state_file); + + stdout_stream_free(s); +} + +static int stdout_stream_save(StdoutStream *s) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(s); + + if (s->state != STDOUT_STREAM_RUNNING) + return 0; + + if (!s->state_file) { + struct stat st; + + r = fstat(s->fd, &st); + if (r < 0) + return log_warning_errno(errno, "Failed to stat connected stream: %m"); + + /* We use device and inode numbers as identifier for the stream */ + r = asprintf(&s->state_file, "%s/streams/%lu:%lu", s->server->runtime_directory, (unsigned long) st.st_dev, (unsigned long) st.st_ino); + if (r < 0) + return log_oom(); + } + + (void) mkdir_parents(s->state_file, 0755); + + r = fopen_temporary(s->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + fprintf(f, + "# This is private data. Do not parse\n" + "PRIORITY=%i\n" + "LEVEL_PREFIX=%i\n" + "FORWARD_TO_SYSLOG=%i\n" + "FORWARD_TO_KMSG=%i\n" + "FORWARD_TO_CONSOLE=%i\n" + "STREAM_ID=%s\n", + s->priority, + s->level_prefix, + s->forward_to_syslog, + s->forward_to_kmsg, + s->forward_to_console, + s->id_field + STRLEN("_STREAM_ID=")); + + if (!isempty(s->identifier)) { + _cleanup_free_ char *escaped; + + escaped = cescape(s->identifier); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "IDENTIFIER=%s\n", escaped); + } + + if (!isempty(s->unit_id)) { + _cleanup_free_ char *escaped; + + escaped = cescape(s->unit_id); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "UNIT=%s\n", escaped); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, s->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + + if (!s->fdstore && !s->in_notify_queue) { + LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = true; + + if (s->server->notify_event_source) { + r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to enable notify event source: %m"); + } + } + + return 0; + +fail: + (void) unlink(s->state_file); + return log_error_errno(r, "Failed to save stream data %s: %m", s->state_file); +} + +static int stdout_stream_log( + StdoutStream *s, + const char *p, + LineBreak line_break) { + + struct iovec *iovec; + int priority; + char syslog_priority[] = "PRIORITY=\0"; + char syslog_facility[STRLEN("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int) + 1]; + _cleanup_free_ char *message = NULL, *syslog_identifier = NULL; + size_t n = 0, m; + int r; + + assert(s); + assert(p); + + assert(line_break >= 0); + assert(line_break < _LINE_BREAK_MAX); + + if (s->context) + (void) client_context_maybe_refresh(s->server, s->context, NULL, NULL, 0, NULL, USEC_INFINITY); + else if (pid_is_valid(s->ucred.pid)) { + r = client_context_acquire(s->server, s->ucred.pid, &s->ucred, s->label, strlen_ptr(s->label), s->unit_id, &s->context); + if (r < 0) + log_warning_errno(r, "Failed to acquire client context, ignoring: %m"); + } + + priority = s->priority; + + if (s->level_prefix) + syslog_parse_priority(&p, &priority, false); + + if (!client_context_test_priority(s->context, priority)) + return 0; + + if (isempty(p)) + return 0; + + if (s->forward_to_syslog || s->server->forward_to_syslog) + server_forward_syslog(s->server, syslog_fixup_facility(priority), s->identifier, p, &s->ucred, NULL); + + if (s->forward_to_kmsg || s->server->forward_to_kmsg) + server_forward_kmsg(s->server, priority, s->identifier, p, &s->ucred); + + if (s->forward_to_console || s->server->forward_to_console) + server_forward_console(s->server, priority, s->identifier, p, &s->ucred); + + if (s->server->forward_to_wall) + server_forward_wall(s->server, priority, s->identifier, p, &s->ucred); + + m = N_IOVEC_META_FIELDS + 7 + client_context_extra_fields_n_iovec(s->context); + iovec = newa(struct iovec, m); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=stdout"); + iovec[n++] = IOVEC_MAKE_STRING(s->id_field); + + syslog_priority[STRLEN("PRIORITY=")] = '0' + LOG_PRI(priority); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (priority & LOG_FACMASK) { + xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + } + + if (s->identifier) { + syslog_identifier = strjoin("SYSLOG_IDENTIFIER=", s->identifier); + if (syslog_identifier) + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); + } + + static const char * const line_break_field_table[_LINE_BREAK_MAX] = { + [LINE_BREAK_NEWLINE] = NULL, /* Do not add field if traditional newline */ + [LINE_BREAK_NUL] = "_LINE_BREAK=nul", + [LINE_BREAK_LINE_MAX] = "_LINE_BREAK=line-max", + [LINE_BREAK_EOF] = "_LINE_BREAK=eof", + [LINE_BREAK_PID_CHANGE] = "_LINE_BREAK=pid-change", + }; + + const char *c = line_break_field_table[line_break]; + + /* If this log message was generated due to an uncommon line break then mention this in the log + * entry */ + if (c) + iovec[n++] = IOVEC_MAKE_STRING(c); + + message = strjoin("MESSAGE=", p); + if (message) + iovec[n++] = IOVEC_MAKE_STRING(message); + + server_dispatch_message(s->server, iovec, n, m, s->context, NULL, priority, 0); + return 0; +} + +static int stdout_stream_line(StdoutStream *s, char *p, LineBreak line_break) { + char *orig; + int r; + + assert(s); + assert(p); + + orig = p; + p = strstrip(p); + + /* line breaks by NUL, line max length or EOF are not permissible during the negotiation part of the protocol */ + if (line_break != LINE_BREAK_NEWLINE && s->state != STDOUT_STREAM_RUNNING) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Control protocol line not properly terminated."); + + switch (s->state) { + + case STDOUT_STREAM_IDENTIFIER: + if (!isempty(p)) { + s->identifier = strdup(p); + if (!s->identifier) + return log_oom(); + } + + s->state = STDOUT_STREAM_UNIT_ID; + return 0; + + case STDOUT_STREAM_UNIT_ID: + if (s->ucred.uid == 0 && + unit_name_is_valid(p, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + + s->unit_id = strdup(p); + if (!s->unit_id) + return log_oom(); + } + + s->state = STDOUT_STREAM_PRIORITY; + return 0; + + case STDOUT_STREAM_PRIORITY: + r = safe_atoi(p, &s->priority); + if (r < 0 || s->priority < 0 || s->priority > 999) { + log_warning("Failed to parse log priority line."); + return -EINVAL; + } + + s->state = STDOUT_STREAM_LEVEL_PREFIX; + return 0; + + case STDOUT_STREAM_LEVEL_PREFIX: + r = parse_boolean(p); + if (r < 0) { + log_warning("Failed to parse level prefix line."); + return -EINVAL; + } + + s->level_prefix = r; + s->state = STDOUT_STREAM_FORWARD_TO_SYSLOG; + return 0; + + case STDOUT_STREAM_FORWARD_TO_SYSLOG: + r = parse_boolean(p); + if (r < 0) { + log_warning("Failed to parse forward to syslog line."); + return -EINVAL; + } + + s->forward_to_syslog = r; + s->state = STDOUT_STREAM_FORWARD_TO_KMSG; + return 0; + + case STDOUT_STREAM_FORWARD_TO_KMSG: + r = parse_boolean(p); + if (r < 0) { + log_warning("Failed to parse copy to kmsg line."); + return -EINVAL; + } + + s->forward_to_kmsg = r; + s->state = STDOUT_STREAM_FORWARD_TO_CONSOLE; + return 0; + + case STDOUT_STREAM_FORWARD_TO_CONSOLE: + r = parse_boolean(p); + if (r < 0) { + log_warning("Failed to parse copy to console line."); + return -EINVAL; + } + + s->forward_to_console = r; + s->state = STDOUT_STREAM_RUNNING; + + /* Try to save the stream, so that journald can be restarted and we can recover */ + (void) stdout_stream_save(s); + return 0; + + case STDOUT_STREAM_RUNNING: + return stdout_stream_log(s, orig, line_break); + } + + assert_not_reached("Unknown stream state"); +} + +static int stdout_stream_found( + StdoutStream *s, + char *p, + size_t l, + LineBreak line_break) { + + char saved; + int r; + + assert(s); + assert(p); + + /* Let's NUL terminate the specified buffer for this call, and revert back afterwards */ + saved = p[l]; + p[l] = 0; + r = stdout_stream_line(s, p, line_break); + p[l] = saved; + + return r; +} + +static int stdout_stream_scan( + StdoutStream *s, + char *p, + size_t remaining, + LineBreak force_flush, + size_t *ret_consumed) { + + size_t consumed = 0; + int r; + + assert(s); + assert(p); + + for (;;) { + LineBreak line_break; + size_t skip, found; + char *end1, *end2; + + end1 = memchr(p, '\n', remaining); + end2 = memchr(p, 0, end1 ? (size_t) (end1 - p) : remaining); + + if (end2) { + /* We found a NUL terminator */ + found = end2 - p; + skip = found + 1; + line_break = LINE_BREAK_NUL; + } else if (end1) { + /* We found a \n terminator */ + found = end1 - p; + skip = found + 1; + line_break = LINE_BREAK_NEWLINE; + } else if (remaining >= s->server->line_max) { + /* Force a line break after the maximum line length */ + found = skip = s->server->line_max; + line_break = LINE_BREAK_LINE_MAX; + } else + break; + + r = stdout_stream_found(s, p, found, line_break); + if (r < 0) + return r; + + p += skip; + consumed += skip; + remaining -= skip; + } + + if (force_flush >= 0 && remaining > 0) { + r = stdout_stream_found(s, p, remaining, force_flush); + if (r < 0) + return r; + + consumed += remaining; + } + + if (ret_consumed) + *ret_consumed = consumed; + + return 0; +} + +static int stdout_stream_process(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + StdoutStream *s = userdata; + size_t limit, consumed; + struct ucred *ucred; + struct iovec iovec; + ssize_t l; + char *p; + int r; + + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + assert(s); + + if ((revents|EPOLLIN|EPOLLHUP) != (EPOLLIN|EPOLLHUP)) { + log_error("Got invalid event from epoll for stdout stream: %"PRIx32, revents); + goto terminate; + } + + /* If the buffer is almost full, add room for another 1K */ + if (s->length + 512 >= s->allocated) { + if (!GREEDY_REALLOC(s->buffer, s->allocated, s->length + 1 + 1024)) { + log_oom(); + goto terminate; + } + } + + /* Try to make use of the allocated buffer in full, but never read more than the configured line size. Also, + * always leave room for a terminating NUL we might need to add. */ + limit = MIN(s->allocated - 1, s->server->line_max); + assert(s->length <= limit); + iovec = IOVEC_MAKE(s->buffer + s->length, limit - s->length); + + l = recvmsg(s->fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (l < 0) { + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + log_warning_errno(errno, "Failed to read from stream: %m"); + goto terminate; + } + cmsg_close_all(&msghdr); + + if (l == 0) { + (void) stdout_stream_scan(s, s->buffer, s->length, /* force_flush = */ LINE_BREAK_EOF, NULL); + goto terminate; + } + + /* Invalidate the context if the PID of the sender changed. This happens when a forked process + * inherits stdout/stderr from a parent. In this case getpeercred() returns the ucred of the parent, + * which can be invalid if the parent has exited in the meantime. */ + ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (ucred && ucred->pid != s->ucred.pid) { + /* Force out any previously half-written lines from a different process, before we switch to + * the new ucred structure for everything we just added */ + r = stdout_stream_scan(s, s->buffer, s->length, /* force_flush = */ LINE_BREAK_PID_CHANGE, NULL); + if (r < 0) + goto terminate; + + s->context = client_context_release(s->server, s->context); + + p = s->buffer + s->length; + } else { + p = s->buffer; + l += s->length; + } + + /* Always copy in the new credentials */ + if (ucred) + s->ucred = *ucred; + + r = stdout_stream_scan(s, p, l, _LINE_BREAK_INVALID, &consumed); + if (r < 0) + goto terminate; + + /* Move what wasn't consumed to the front of the buffer */ + assert(consumed <= (size_t) l); + s->length = l - consumed; + memmove(s->buffer, p + consumed, s->length); + + return 1; + +terminate: + stdout_stream_destroy(s); + return 0; +} + +int stdout_stream_install(Server *s, int fd, StdoutStream **ret) { + _cleanup_(stdout_stream_freep) StdoutStream *stream = NULL; + sd_id128_t id; + int r; + + assert(s); + assert(fd >= 0); + + r = sd_id128_randomize(&id); + if (r < 0) + return log_error_errno(r, "Failed to generate stream ID: %m"); + + stream = new(StdoutStream, 1); + if (!stream) + return log_oom(); + + *stream = (StdoutStream) { + .fd = -1, + .priority = LOG_INFO, + }; + + xsprintf(stream->id_field, "_STREAM_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)); + + r = getpeercred(fd, &stream->ucred); + if (r < 0) + return log_error_errno(r, "Failed to determine peer credentials: %m"); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = getpeersec(fd, &stream->label); + if (r < 0 && r != -EOPNOTSUPP) + (void) log_warning_errno(r, "Failed to determine peer security context: %m"); + } + + (void) shutdown(fd, SHUT_WR); + + r = sd_event_add_io(s->event, &stream->event_source, fd, EPOLLIN, stdout_stream_process, stream); + if (r < 0) + return log_error_errno(r, "Failed to add stream to event loop: %m"); + + r = sd_event_source_set_priority(stream->event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust stdout event source priority: %m"); + + stream->fd = fd; + + stream->server = s; + LIST_PREPEND(stdout_stream, s->stdout_streams, stream); + s->n_stdout_streams++; + + (void) server_start_or_stop_idle_timer(s); /* Maybe no longer idle? */ + + if (ret) + *ret = stream; + + TAKE_PTR(stream); + return 0; +} + +static int stdout_stream_new(sd_event_source *es, int listen_fd, uint32_t revents, void *userdata) { + _cleanup_close_ int fd = -1; + Server *s = userdata; + int r; + + assert(s); + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Got invalid event from epoll for stdout server fd: %" PRIx32, + revents); + + fd = accept4(s->stdout_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (fd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return log_error_errno(errno, "Failed to accept stdout connection: %m"); + } + + if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) { + struct ucred u; + + r = getpeercred(fd, &u); + + /* By closing fd here we make sure that the client won't wait too long for journald to + * gather all the data it adds to the error message to find out that the connection has + * just been refused. + */ + fd = safe_close(fd); + + server_driver_message(s, r < 0 ? 0 : u.pid, NULL, LOG_MESSAGE("Too many stdout streams, refusing connection."), NULL); + return 0; + } + + r = stdout_stream_install(s, fd, NULL); + if (r < 0) + return r; + + TAKE_FD(fd); + return 0; +} + +static int stdout_stream_load(StdoutStream *stream, const char *fname) { + _cleanup_free_ char + *priority = NULL, + *level_prefix = NULL, + *forward_to_syslog = NULL, + *forward_to_kmsg = NULL, + *forward_to_console = NULL, + *stream_id = NULL; + int r; + + assert(stream); + assert(fname); + + if (!stream->state_file) { + stream->state_file = path_join(stream->server->runtime_directory, "streams", fname); + if (!stream->state_file) + return log_oom(); + } + + r = parse_env_file(NULL, stream->state_file, + "PRIORITY", &priority, + "LEVEL_PREFIX", &level_prefix, + "FORWARD_TO_SYSLOG", &forward_to_syslog, + "FORWARD_TO_KMSG", &forward_to_kmsg, + "FORWARD_TO_CONSOLE", &forward_to_console, + "IDENTIFIER", &stream->identifier, + "UNIT", &stream->unit_id, + "STREAM_ID", &stream_id); + if (r < 0) + return log_error_errno(r, "Failed to read: %s", stream->state_file); + + if (priority) { + int p; + + p = log_level_from_string(priority); + if (p >= 0) + stream->priority = p; + } + + if (level_prefix) { + r = parse_boolean(level_prefix); + if (r >= 0) + stream->level_prefix = r; + } + + if (forward_to_syslog) { + r = parse_boolean(forward_to_syslog); + if (r >= 0) + stream->forward_to_syslog = r; + } + + if (forward_to_kmsg) { + r = parse_boolean(forward_to_kmsg); + if (r >= 0) + stream->forward_to_kmsg = r; + } + + if (forward_to_console) { + r = parse_boolean(forward_to_console); + if (r >= 0) + stream->forward_to_console = r; + } + + if (stream_id) { + sd_id128_t id; + + r = sd_id128_from_string(stream_id, &id); + if (r >= 0) + xsprintf(stream->id_field, "_STREAM_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)); + } + + return 0; +} + +static int stdout_stream_restore(Server *s, const char *fname, int fd) { + StdoutStream *stream; + int r; + + assert(s); + assert(fname); + assert(fd >= 0); + + if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) { + log_warning("Too many stdout streams, refusing restoring of stream."); + return -ENOBUFS; + } + + r = stdout_stream_install(s, fd, &stream); + if (r < 0) + return r; + + stream->state = STDOUT_STREAM_RUNNING; + stream->fdstore = true; + + /* Ignore all parsing errors */ + (void) stdout_stream_load(stream, fname); + + return 0; +} + +int server_restore_streams(Server *s, FDSet *fds) { + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + const char *path; + int r; + + path = strjoina(s->runtime_directory, "/streams"); + d = opendir(path); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to enumerate %s: %m", path); + } + + FOREACH_DIRENT(de, d, goto fail) { + unsigned long st_dev, st_ino; + bool found = false; + int fd; + + if (sscanf(de->d_name, "%lu:%lu", &st_dev, &st_ino) != 2) + continue; + + FDSET_FOREACH(fd, fds) { + struct stat st; + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", de->d_name); + + if (S_ISSOCK(st.st_mode) && st.st_dev == st_dev && st.st_ino == st_ino) { + found = true; + break; + } + } + + if (!found) { + /* No file descriptor? Then let's delete the state file */ + log_debug("Cannot restore stream file %s", de->d_name); + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + log_warning_errno(errno, "Failed to remove %s/%s: %m", path, de->d_name); + continue; + } + + fdset_remove(fds, fd); + + r = stdout_stream_restore(s, de->d_name, fd); + if (r < 0) + safe_close(fd); + } + + return 0; + +fail: + return log_error_errno(errno, "Failed to read streams directory: %m"); +} + +int server_open_stdout_socket(Server *s, const char *stdout_socket) { + int r; + + assert(s); + assert(stdout_socket); + + if (s->stdout_fd < 0) { + union sockaddr_union sa; + socklen_t sa_len; + + r = sockaddr_un_set_path(&sa.un, stdout_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", stdout_socket); + sa_len = r; + + s->stdout_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->stdout_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->stdout_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + + if (listen(s->stdout_fd, SOMAXCONN) < 0) + return log_error_errno(errno, "listen(%s) failed: %m", sa.un.sun_path); + } else + (void) fd_nonblock(s->stdout_fd, true); + + r = sd_event_add_io(s->event, &s->stdout_event_source, s->stdout_fd, EPOLLIN, stdout_stream_new, s); + if (r < 0) + return log_error_errno(r, "Failed to add stdout server fd to event source: %m"); + + r = sd_event_source_set_priority(s->stdout_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of stdout server event source: %m"); + + return 0; +} + +void stdout_stream_send_notify(StdoutStream *s) { + struct iovec iovec = { + .iov_base = (char*) "FDSTORE=1", + .iov_len = STRLEN("FDSTORE=1"), + }; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + }; + struct cmsghdr *cmsg; + ssize_t l; + + assert(s); + assert(!s->fdstore); + assert(s->in_notify_queue); + assert(s->server); + assert(s->server->notify_fd >= 0); + + /* Store the connection fd in PID 1, so that we get it passed + * in again on next start */ + + msghdr.msg_controllen = CMSG_SPACE(sizeof(int)); + msghdr.msg_control = alloca0(msghdr.msg_controllen); + + cmsg = CMSG_FIRSTHDR(&msghdr); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int)); + + l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL); + if (l < 0) { + if (errno == EAGAIN) + return; + + log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m"); + } else { + log_debug("Successfully sent stream file descriptor to service manager."); + s->fdstore = 1; + } + + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = false; + +} diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h new file mode 100644 index 0000000..0a033b4 --- /dev/null +++ b/src/journal/journald-stream.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct StdoutStream StdoutStream; + +#include "fdset.h" +#include "journald-server.h" + +int server_open_stdout_socket(Server *s, const char *stdout_socket); +int server_restore_streams(Server *s, FDSet *fds); + +void stdout_stream_free(StdoutStream *s); +int stdout_stream_install(Server *s, int fd, StdoutStream **ret); +void stdout_stream_destroy(StdoutStream *s); +void stdout_stream_send_notify(StdoutStream *s); diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c new file mode 100644 index 0000000..925bd50 --- /dev/null +++ b/src/journal/journald-syslog.c @@ -0,0 +1,527 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stddef.h> +#include <sys/epoll.h> +#include <unistd.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "io-util.h" +#include "journald-console.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "syslog-util.h" + +/* Warn once every 30s if we missed syslog message */ +#define WARN_FORWARD_SYSLOG_MISSED_USEC (30 * USEC_PER_SEC) + +static void forward_syslog_iovec( + Server *s, + const struct iovec *iovec, + unsigned n_iovec, + const struct ucred *ucred, + const struct timeval *tv) { + + union sockaddr_union sa; + + struct msghdr msghdr = { + .msg_iov = (struct iovec *) iovec, + .msg_iovlen = n_iovec, + }; + struct cmsghdr *cmsg; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + const char *j; + int r; + + assert(s); + assert(iovec); + assert(n_iovec > 0); + + j = strjoina(s->runtime_directory, "/syslog"); + r = sockaddr_un_set_path(&sa.un, j); + if (r < 0) { + log_debug_errno(r, "Forwarding socket path %s too long for AF_UNIX, not forwarding: %m", j); + return; + } + + msghdr.msg_name = &sa.sa; + msghdr.msg_namelen = r; + + if (ucred) { + zero(control); + msghdr.msg_control = &control; + msghdr.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msghdr); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred)); + msghdr.msg_controllen = cmsg->cmsg_len; + } + + /* Forward the syslog message we received via /dev/log to /run/systemd/syslog. Unfortunately we + * currently can't set the SO_TIMESTAMP auxiliary data, and hence we don't. */ + + if (sendmsg(s->syslog_fd, &msghdr, MSG_NOSIGNAL) >= 0) + return; + + /* The socket is full? I guess the syslog implementation is + * too slow, and we shouldn't wait for that... */ + if (errno == EAGAIN) { + s->n_forward_syslog_missed++; + return; + } + + if (ucred && IN_SET(errno, ESRCH, EPERM)) { + struct ucred u; + + /* Hmm, presumably the sender process vanished + * by now, or we don't have CAP_SYS_AMDIN, so + * let's fix it as good as we can, and retry */ + + u = *ucred; + u.pid = getpid_cached(); + memcpy(CMSG_DATA(cmsg), &u, sizeof(struct ucred)); + + if (sendmsg(s->syslog_fd, &msghdr, MSG_NOSIGNAL) >= 0) + return; + + if (errno == EAGAIN) { + s->n_forward_syslog_missed++; + return; + } + } + + if (errno != ENOENT) + log_debug_errno(errno, "Failed to forward syslog message: %m"); +} + +static void forward_syslog_raw(Server *s, int priority, const char *buffer, size_t buffer_len, const struct ucred *ucred, const struct timeval *tv) { + struct iovec iovec; + + assert(s); + assert(buffer); + + if (LOG_PRI(priority) > s->max_level_syslog) + return; + + iovec = IOVEC_MAKE((char *) buffer, buffer_len); + forward_syslog_iovec(s, &iovec, 1, ucred, tv); +} + +void server_forward_syslog(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred, const struct timeval *tv) { + struct iovec iovec[5]; + char header_priority[DECIMAL_STR_MAX(priority) + 3], header_time[64], + header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1]; + int n = 0; + time_t t; + struct tm tm; + _cleanup_free_ char *ident_buf = NULL; + + assert(s); + assert(priority >= 0); + assert(priority <= 999); + assert(message); + + if (LOG_PRI(priority) > s->max_level_syslog) + return; + + /* First: priority field */ + xsprintf(header_priority, "<%i>", priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); + + /* Second: timestamp */ + t = tv ? tv->tv_sec : ((time_t) (now(CLOCK_REALTIME) / USEC_PER_SEC)); + if (!localtime_r(&t, &tm)) + return; + if (strftime(header_time, sizeof(header_time), "%h %e %T ", &tm) <= 0) + return; + iovec[n++] = IOVEC_MAKE_STRING(header_time); + + /* Third: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) get_process_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + /* Fourth: message */ + iovec[n++] = IOVEC_MAKE_STRING(message); + + forward_syslog_iovec(s, iovec, n, ucred, tv); +} + +int syslog_fixup_facility(int priority) { + + if ((priority & LOG_FACMASK) == 0) + return (priority & LOG_PRIMASK) | LOG_USER; + + return priority; +} + +size_t syslog_parse_identifier(const char **buf, char **identifier, char **pid) { + const char *p; + char *t; + size_t l, e; + + assert(buf); + assert(identifier); + assert(pid); + + p = *buf; + + p += strspn(p, WHITESPACE); + l = strcspn(p, WHITESPACE); + + if (l <= 0 || + p[l-1] != ':') + return 0; + + e = l; + l--; + + if (l > 0 && p[l-1] == ']') { + size_t k = l-1; + + for (;;) { + + if (p[k] == '[') { + t = strndup(p+k+1, l-k-2); + if (t) + *pid = t; + + l = k; + break; + } + + if (k == 0) + break; + + k--; + } + } + + t = strndup(p, l); + if (t) + *identifier = t; + + /* Single space is used as separator */ + if (p[e] != '\0' && strchr(WHITESPACE, p[e])) + e++; + + l = (p - *buf) + e; + *buf = p + e; + return l; +} + +static int syslog_skip_timestamp(const char **buf) { + enum { + LETTER, + SPACE, + NUMBER, + SPACE_OR_NUMBER, + COLON + } sequence[] = { + LETTER, LETTER, LETTER, + SPACE, + SPACE_OR_NUMBER, NUMBER, + SPACE, + SPACE_OR_NUMBER, NUMBER, + COLON, + SPACE_OR_NUMBER, NUMBER, + COLON, + SPACE_OR_NUMBER, NUMBER, + SPACE + }; + + const char *p, *t; + unsigned i; + + assert(buf); + assert(*buf); + + for (i = 0, p = *buf; i < ELEMENTSOF(sequence); i++, p++) { + if (!*p) + return 0; + + switch (sequence[i]) { + + case SPACE: + if (*p != ' ') + return 0; + break; + + case SPACE_OR_NUMBER: + if (*p == ' ') + break; + + _fallthrough_; + case NUMBER: + if (*p < '0' || *p > '9') + return 0; + + break; + + case LETTER: + if (!(*p >= 'A' && *p <= 'Z') && + !(*p >= 'a' && *p <= 'z')) + return 0; + + break; + + case COLON: + if (*p != ':') + return 0; + break; + + } + } + + t = *buf; + *buf = p; + return p - t; +} + +void server_process_syslog_message( + Server *s, + const char *buf, + size_t raw_len, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, + size_t label_len) { + + char *t, syslog_priority[sizeof("PRIORITY=") + DECIMAL_STR_MAX(int)], + syslog_facility[sizeof("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int)]; + const char *msg, *syslog_ts, *a; + _cleanup_free_ char *identifier = NULL, *pid = NULL, + *dummy = NULL, *msg_msg = NULL, *msg_raw = NULL; + int priority = LOG_USER | LOG_INFO, r; + ClientContext *context = NULL; + struct iovec *iovec; + size_t n = 0, m, i, leading_ws, syslog_ts_len; + bool store_raw; + + assert(s); + assert(buf); + /* The message cannot be empty. */ + assert(raw_len > 0); + /* The buffer NUL-terminated and can be used a string. raw_len is the length + * without the terminating NUL byte, the buffer is actually one bigger. */ + assert(buf[raw_len] == '\0'); + + if (ucred && pid_is_valid(ucred->pid)) { + r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context); + if (r < 0) + log_warning_errno(r, "Failed to retrieve credentials for PID " PID_FMT ", ignoring: %m", ucred->pid); + } + + /* We are creating a copy of the message because we want to forward the original message + verbatim to the legacy syslog implementation */ + for (i = raw_len; i > 0; i--) + if (!strchr(WHITESPACE, buf[i-1])) + break; + + leading_ws = strspn(buf, WHITESPACE); + + if (i == 0) + /* The message contains only whitespaces */ + msg = buf + raw_len; + else if (i == raw_len) + /* Nice! No need to strip anything on the end, let's optimize this a bit */ + msg = buf + leading_ws; + else { + msg = dummy = new(char, i - leading_ws + 1); + if (!dummy) { + log_oom(); + return; + } + + memcpy(dummy, buf + leading_ws, i - leading_ws); + dummy[i - leading_ws] = 0; + } + + /* We will add the SYSLOG_RAW= field when we stripped anything + * _or_ if the input message contained NUL bytes. */ + store_raw = msg != buf || strlen(msg) != raw_len; + + syslog_parse_priority(&msg, &priority, true); + + if (!client_context_test_priority(context, priority)) + return; + + syslog_ts = msg; + syslog_ts_len = syslog_skip_timestamp(&msg); + if (syslog_ts_len == 0) + /* We failed to parse the full timestamp, store the raw message too */ + store_raw = true; + + syslog_parse_identifier(&msg, &identifier, &pid); + + if (s->forward_to_syslog) + forward_syslog_raw(s, priority, buf, raw_len, ucred, tv); + + if (s->forward_to_kmsg) + server_forward_kmsg(s, priority, identifier, msg, ucred); + + if (s->forward_to_console) + server_forward_console(s, priority, identifier, msg, ucred); + + if (s->forward_to_wall) + server_forward_wall(s, priority, identifier, msg, ucred); + + m = N_IOVEC_META_FIELDS + 8 + client_context_extra_fields_n_iovec(context); + iovec = newa(struct iovec, m); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=syslog"); + + xsprintf(syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (priority & LOG_FACMASK) { + xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + } + + if (identifier) { + a = strjoina("SYSLOG_IDENTIFIER=", identifier); + iovec[n++] = IOVEC_MAKE_STRING(a); + } + + if (pid) { + a = strjoina("SYSLOG_PID=", pid); + iovec[n++] = IOVEC_MAKE_STRING(a); + } + + if (syslog_ts_len > 0) { + const size_t hlen = STRLEN("SYSLOG_TIMESTAMP="); + + t = newa(char, hlen + syslog_ts_len); + memcpy(t, "SYSLOG_TIMESTAMP=", hlen); + memcpy(t + hlen, syslog_ts, syslog_ts_len); + + iovec[n++] = IOVEC_MAKE(t, hlen + syslog_ts_len); + } + + msg_msg = strjoin("MESSAGE=", msg); + if (!msg_msg) { + log_oom(); + return; + } + iovec[n++] = IOVEC_MAKE_STRING(msg_msg); + + if (store_raw) { + const size_t hlen = STRLEN("SYSLOG_RAW="); + + msg_raw = new(char, hlen + raw_len); + if (!msg_raw) { + log_oom(); + return; + } + + memcpy(msg_raw, "SYSLOG_RAW=", hlen); + memcpy(msg_raw + hlen, buf, raw_len); + + iovec[n++] = IOVEC_MAKE(msg_raw, hlen + raw_len); + } + + server_dispatch_message(s, iovec, n, m, context, tv, priority, 0); +} + +int server_open_syslog_socket(Server *s, const char *syslog_socket) { + int r; + + assert(s); + assert(syslog_socket); + + if (s->syslog_fd < 0) { + union sockaddr_union sa; + socklen_t sa_len; + + r = sockaddr_un_set_path(&sa.un, syslog_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", syslog_socket); + sa_len = r; + + s->syslog_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->syslog_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->syslog_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + } else + (void) fd_nonblock(s->syslog_fd, true); + + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_warning_errno(r, "SO_PASSSEC failed: %m"); + } + + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return log_error_errno(r, "SO_TIMESTAMP failed: %m"); + + r = sd_event_add_io(s->event, &s->syslog_event_source, s->syslog_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add syslog server fd to event loop: %m"); + + r = sd_event_source_set_priority(s->syslog_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust syslog event source priority: %m"); + + return 0; +} + +void server_maybe_warn_forward_syslog_missed(Server *s) { + usec_t n; + + assert(s); + + if (s->n_forward_syslog_missed <= 0) + return; + + n = now(CLOCK_MONOTONIC); + if (s->last_warn_forward_syslog_missed + WARN_FORWARD_SYSLOG_MISSED_USEC > n) + return; + + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_FORWARD_SYSLOG_MISSED_STR, + LOG_MESSAGE("Forwarding to syslog missed %u messages.", + s->n_forward_syslog_missed), + NULL); + + s->n_forward_syslog_missed = 0; + s->last_warn_forward_syslog_missed = n; +} diff --git a/src/journal/journald-syslog.h b/src/journal/journald-syslog.h new file mode 100644 index 0000000..3bc3ffd --- /dev/null +++ b/src/journal/journald-syslog.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +int syslog_fixup_facility(int priority) _const_; + +size_t syslog_parse_identifier(const char **buf, char **identifier, char **pid); + +void server_forward_syslog(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred, const struct timeval *tv); + +void server_process_syslog_message(Server *s, const char *buf, size_t buf_len, const struct ucred *ucred, const struct timeval *tv, const char *label, size_t label_len); +int server_open_syslog_socket(Server *s, const char *syslog_socket); + +void server_maybe_warn_forward_syslog_missed(Server *s); diff --git a/src/journal/journald-wall.c b/src/journal/journald-wall.c new file mode 100644 index 0000000..21ec5a7 --- /dev/null +++ b/src/journal/journald-wall.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "format-util.h" +#include "journald-server.h" +#include "journald-wall.h" +#include "process-util.h" +#include "string-util.h" +#include "utmp-wtmp.h" + +void server_forward_wall( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + _cleanup_free_ char *ident_buf = NULL, *l_buf = NULL; + const char *l; + int r; + + assert(s); + assert(message); + + if (LOG_PRI(priority) > s->max_level_wall) + return; + + if (ucred) { + if (!identifier) { + (void) get_process_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + if (asprintf(&l_buf, "%s["PID_FMT"]: %s", strempty(identifier), ucred->pid, message) < 0) { + log_oom(); + return; + } + + l = l_buf; + + } else if (identifier) { + + l = l_buf = strjoin(identifier, ": ", message); + if (!l_buf) { + log_oom(); + return; + } + } else + l = message; + + r = utmp_wall(l, "systemd-journald", NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send wall message: %m"); +} diff --git a/src/journal/journald-wall.h b/src/journal/journald-wall.h new file mode 100644 index 0000000..3f98c35 --- /dev/null +++ b/src/journal/journald-wall.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <sys/socket.h> + +#include "journald-server.h" + +void server_forward_wall(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); diff --git a/src/journal/journald.c b/src/journal/journald.c new file mode 100644 index 0000000..cfbaf36 --- /dev/null +++ b/src/journal/journald.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "format-util.h" +#include "journal-authenticate.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "process-util.h" +#include "sigbus.h" + +int main(int argc, char *argv[]) { + const char *namespace; + LogTarget log_target; + Server server; + int r; + + if (argc > 2) { + log_error("This program takes one or no arguments."); + return EXIT_FAILURE; + } + + namespace = argc > 1 ? empty_to_null(argv[1]) : NULL; + + /* So here's the deal: journald can't be considered as regular daemon when it comes to + * logging hence LOG_TARGET_AUTO won't do the right thing for it. Hence explicitly log to + * the console if we're started from a console or to kmsg otherwise. */ + log_target = isatty(STDERR_FILENO) > 0 ? LOG_TARGET_CONSOLE : LOG_TARGET_KMSG; + + log_set_prohibit_ipc(true); /* better safe than sorry */ + log_set_target(log_target); + log_set_facility(LOG_SYSLOG); + log_parse_environment(); + log_open(); + + umask(0022); + + sigbus_install(); + + r = server_init(&server, namespace); + if (r < 0) + goto finish; + + server_vacuum(&server, false); + server_flush_to_var(&server, true); + server_flush_dev_kmsg(&server); + + if (server.namespace) + log_debug("systemd-journald running as PID "PID_FMT" for namespace '%s'.", getpid_cached(), server.namespace); + else + log_debug("systemd-journald running as PID "PID_FMT" for the system.", getpid_cached()); + + server_driver_message(&server, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_START_STR, + LOG_MESSAGE("Journal started"), + NULL); + + /* Make sure to send the usage message *after* flushing the + * journal so entries from the runtime journals are ordered + * before this message. See #4190 for some details. */ + server_space_usage_message(&server, NULL); + + for (;;) { + usec_t t = USEC_INFINITY, n; + + r = sd_event_get_state(server.event); + if (r < 0) { + log_error_errno(r, "Failed to get event loop state: %m"); + goto finish; + } + if (r == SD_EVENT_FINISHED) + break; + + n = now(CLOCK_REALTIME); + + if (server.max_retention_usec > 0 && server.oldest_file_usec > 0) { + + /* The retention time is reached, so let's vacuum! */ + if (server.oldest_file_usec + server.max_retention_usec < n) { + log_info("Retention time reached."); + server_rotate(&server); + server_vacuum(&server, false); + continue; + } + + /* Calculate when to rotate the next time */ + t = server.oldest_file_usec + server.max_retention_usec - n; + } + +#if HAVE_GCRYPT + if (server.system_journal) { + usec_t u; + + if (journal_file_next_evolve_usec(server.system_journal, &u)) { + if (n >= u) + t = 0; + else + t = MIN(t, u - n); + } + } +#endif + + r = sd_event_run(server.event, t); + if (r < 0) { + log_error_errno(r, "Failed to run event loop: %m"); + goto finish; + } + + server_maybe_append_tags(&server); + server_maybe_warn_forward_syslog_missed(&server); + } + + if (server.namespace) + log_debug("systemd-journald stopped as PID "PID_FMT" for namespace '%s'.", getpid_cached(), server.namespace); + else + log_debug("systemd-journald stopped as PID "PID_FMT" for the system.", getpid_cached()); + + server_driver_message(&server, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_STOP_STR, + LOG_MESSAGE("Journal stopped"), + NULL); + +finish: + server_done(&server); + + return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/journal/journald.conf b/src/journal/journald.conf new file mode 100644 index 0000000..2e1aacd --- /dev/null +++ b/src/journal/journald.conf @@ -0,0 +1,44 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# Entries in this file show the compile time defaults. +# You can change settings by editing this file. +# Defaults can be restored by simply deleting this file. +# +# See journald.conf(5) for details. + +[Journal] +#Storage=auto +#Compress=yes +#Seal=yes +#SplitMode=uid +#SyncIntervalSec=5m +#RateLimitIntervalSec=30s +#RateLimitBurst=10000 +#SystemMaxUse= +#SystemKeepFree= +#SystemMaxFileSize= +#SystemMaxFiles=100 +#RuntimeMaxUse= +#RuntimeKeepFree= +#RuntimeMaxFileSize= +#RuntimeMaxFiles=100 +#MaxRetentionSec= +#MaxFileSec=1month +#ForwardToSyslog=no +#ForwardToKMsg=no +#ForwardToConsole=no +#ForwardToWall=yes +#TTYPath=/dev/console +#MaxLevelStore=debug +#MaxLevelSyslog=debug +#MaxLevelKMsg=notice +#MaxLevelConsole=info +#MaxLevelWall=emerg +#LineMax=48K +#ReadKMsg=yes +#Audit=yes diff --git a/src/journal/lookup3.c b/src/journal/lookup3.c new file mode 100644 index 0000000..39967f2 --- /dev/null +++ b/src/journal/lookup3.c @@ -0,0 +1,1006 @@ +/* SPDX-License-Identifier: LicenseRef-lookup3-public-domain */ +/* Slightly modified by Lennart Poettering, to avoid name clashes, and + * unexport a few functions. */ + +#include "lookup3.h" + +/* +------------------------------------------------------------------------------- +lookup3.c, by Bob Jenkins, May 2006, Public Domain. + +These are functions for producing 32-bit hashes for hash table lookup. +hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() +are externally useful functions. Routines to test the hash are included +if SELF_TEST is defined. You can use this free for any purpose. It's in +the public domain. It has no warranty. + +You probably want to use hashlittle(). hashlittle() and hashbig() +hash byte arrays. hashlittle() is faster than hashbig() on +little-endian machines. Intel and AMD are little-endian machines. +On second thought, you probably want hashlittle2(), which is identical to +hashlittle() except it returns two 32-bit hashes for the price of one. +You could implement hashbig2() if you wanted but I haven't bothered here. + +If you want to find a hash of, say, exactly 7 integers, do + a = i1; b = i2; c = i3; + mix(a,b,c); + a += i4; b += i5; c += i6; + mix(a,b,c); + a += i7; + final(a,b,c); +then use c as the hash value. If you have a variable length array of +4-byte integers to hash, use hashword(). If you have a byte array (like +a character string), use hashlittle(). If you have several byte arrays, or +a mix of things, see the comments above hashlittle(). + +Why is this so big? I read 12 bytes at a time into 3 4-byte integers, +then mix those integers. This is fast (you can do a lot more thorough +mixing with 12*3 instructions on 3 integers than you can with 3 instructions +on 1 byte), but shoehorning those bytes into integers efficiently is messy. +------------------------------------------------------------------------------- +*/ +/* #define SELF_TEST 1 */ + +#include <stdint.h> /* defines uint32_t etc */ +#include <stdio.h> /* defines printf for tests */ +#include <sys/param.h> /* attempt to define endianness */ +#include <time.h> /* defines time_t for timings in the test */ +#ifdef linux +# include <endian.h> /* attempt to define endianness */ +#endif + +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif + +/* + * My best guess at if you are big-endian or little-endian. This may + * need adjustment. + */ +#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \ + __BYTE_ORDER == __LITTLE_ENDIAN) || \ + (defined(i386) || defined(__i386__) || defined(__i486__) || \ + defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL)) +# define HASH_LITTLE_ENDIAN 1 +# define HASH_BIG_ENDIAN 0 +#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \ + __BYTE_ORDER == __BIG_ENDIAN) || \ + (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel)) +# define HASH_LITTLE_ENDIAN 0 +# define HASH_BIG_ENDIAN 1 +#else +# define HASH_LITTLE_ENDIAN 0 +# define HASH_BIG_ENDIAN 0 +#endif + +#define hashsize(n) ((uint32_t)1<<(n)) +#define hashmask(n) (hashsize(n)-1) +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + +/* +-------------------------------------------------------------------- + This works on all machines. To be useful, it requires + -- that the key be an array of uint32_t's, and + -- that the length be the number of uint32_t's in the key + + The function hashword() is identical to hashlittle() on little-endian + machines, and identical to hashbig() on big-endian machines, + except that the length has to be measured in uint32_ts rather than in + bytes. hashlittle() is more complicated than hashword() only because + hashlittle() has to dance around fitting the key bytes into registers. +-------------------------------------------------------------------- +*/ +uint32_t jenkins_hashword( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + return c; +} + +/* +-------------------------------------------------------------------- +hashword2() -- same as hashword(), but take two seeds and return two +32-bit values. pc and pb must both be nonnull, and *pc and *pb must +both be initialized with seeds. If you pass in (*pb)==0, the output +(*pc) will be the same as the return value from hashword(). +-------------------------------------------------------------------- +*/ +void jenkins_hashword2 ( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t *pc, /* IN: seed OUT: primary hash value */ +uint32_t *pb) /* IN: more seed OUT: secondary hash value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc; + c += *pb; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + *pc=c; *pb=b; +} + +/* +------------------------------------------------------------------------------- +hashlittle() -- hash a variable-length key into a 32-bit value + k : the key (the unaligned variable-length array of bytes) + length : the length of the key, counting by bytes + initval : can be any 4-byte value +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Two keys differing by one or two bits will have +totally different hash values. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do + h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. + +If you are hashing n strings (uint8_t **)k, do it like this: + for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h); + +By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this +code any way you wish, private, educational, or commercial. It's free. + +Use for hash table lookup, or anything where one collision in 2^^32 is +acceptable. Do NOT use for cryptographic purposes. +------------------------------------------------------------------------------- +*/ + +uint32_t jenkins_hashlittle( const void *key, size_t length, uint32_t initval) +{ + uint32_t a,b,c; /* internal state */ + union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + + u.ptr = key; + if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]&0xffffff" actually reads beyond the end of the string, but + * then masks off the part it's not allowed to read. Because the + * string is aligned, the masked-off tail is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But valgrind will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). + */ +#if !VALGRIND && !HAS_FEATURE_ADDRESS_SANITIZER && !HAS_FEATURE_MEMORY_SANITIZER + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : return c; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + { + const uint8_t *k8 = (const uint8_t *) k; + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : return c; + } + } + +#endif /* !valgrind */ + + } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t)k[1])<<16); + b += k[2] + (((uint32_t)k[3])<<16); + c += k[4] + (((uint32_t)k[5])<<16); + mix(a,b,c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t)k[5])<<16); + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : return c; /* zero length requires no mixing */ + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + a += ((uint32_t)k[1])<<8; + a += ((uint32_t)k[2])<<16; + a += ((uint32_t)k[3])<<24; + b += k[4]; + b += ((uint32_t)k[5])<<8; + b += ((uint32_t)k[6])<<16; + b += ((uint32_t)k[7])<<24; + c += k[8]; + c += ((uint32_t)k[9])<<8; + c += ((uint32_t)k[10])<<16; + c += ((uint32_t)k[11])<<24; + mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=((uint32_t)k[11])<<24; + case 11: c+=((uint32_t)k[10])<<16; + case 10: c+=((uint32_t)k[9])<<8; + case 9 : c+=k[8]; + case 8 : b+=((uint32_t)k[7])<<24; + case 7 : b+=((uint32_t)k[6])<<16; + case 6 : b+=((uint32_t)k[5])<<8; + case 5 : b+=k[4]; + case 4 : a+=((uint32_t)k[3])<<24; + case 3 : a+=((uint32_t)k[2])<<16; + case 2 : a+=((uint32_t)k[1])<<8; + case 1 : a+=k[0]; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} + +/* + * hashlittle2: return 2 32-bit hash values + * + * This is identical to hashlittle(), except it returns two 32-bit hash + * values instead of just one. This is good enough for hash table + * lookup with 2^^64 buckets, or if you want a second hash if you're not + * happy with the first, or if you want a probably-unique 64-bit ID for + * the key. *pc is better mixed than *pb, so use *pc first. If you want + * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)". + */ +void jenkins_hashlittle2( + const void *key, /* the key to hash */ + size_t length, /* length of the key */ + uint32_t *pc, /* IN: primary initval, OUT: primary hash */ + uint32_t *pb) /* IN: secondary initval, OUT: secondary hash */ +{ + uint32_t a,b,c; /* internal state */ + union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc; + c += *pb; + + u.ptr = key; + if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]&0xffffff" actually reads beyond the end of the string, but + * then masks off the part it's not allowed to read. Because the + * string is aligned, the masked-off tail is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But valgrind will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). + */ +#if !VALGRIND && !HAS_FEATURE_ADDRESS_SANITIZER && !HAS_FEATURE_MEMORY_SANITIZER + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + + { + const uint8_t *k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ + } + } + +#endif /* !valgrind */ + + } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t)k[1])<<16); + b += k[2] + (((uint32_t)k[3])<<16); + c += k[4] + (((uint32_t)k[5])<<16); + mix(a,b,c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t)k[5])<<16); + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + a += ((uint32_t)k[1])<<8; + a += ((uint32_t)k[2])<<16; + a += ((uint32_t)k[3])<<24; + b += k[4]; + b += ((uint32_t)k[5])<<8; + b += ((uint32_t)k[6])<<16; + b += ((uint32_t)k[7])<<24; + c += k[8]; + c += ((uint32_t)k[9])<<8; + c += ((uint32_t)k[10])<<16; + c += ((uint32_t)k[11])<<24; + mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=((uint32_t)k[11])<<24; + case 11: c+=((uint32_t)k[10])<<16; + case 10: c+=((uint32_t)k[9])<<8; + case 9 : c+=k[8]; + case 8 : b+=((uint32_t)k[7])<<24; + case 7 : b+=((uint32_t)k[6])<<16; + case 6 : b+=((uint32_t)k[5])<<8; + case 5 : b+=k[4]; + case 4 : a+=((uint32_t)k[3])<<24; + case 3 : a+=((uint32_t)k[2])<<16; + case 2 : a+=((uint32_t)k[1])<<8; + case 1 : a+=k[0]; + break; + case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ + } + } + + final(a,b,c); + *pc=c; *pb=b; +} + +/* + * hashbig(): + * This is the same as hashword() on big-endian machines. It is different + * from hashlittle() on all machines. hashbig() takes advantage of + * big-endian byte ordering. + */ +uint32_t jenkins_hashbig( const void *key, size_t length, uint32_t initval) +{ + uint32_t a,b,c; + union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */ + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + + u.ptr = key; + if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]<<8" actually reads beyond the end of the string, but + * then shifts out the part it's not allowed to read. Because the + * string is aligned, the illegal read is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But valgrind will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). + */ +#if !VALGRIND && !HAS_FEATURE_ADDRESS_SANITIZER && !HAS_FEATURE_MEMORY_SANITIZER + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff00; a+=k[0]; break; + case 6 : b+=k[1]&0xffff0000; a+=k[0]; break; + case 5 : b+=k[1]&0xff000000; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff00; break; + case 2 : a+=k[0]&0xffff0000; break; + case 1 : a+=k[0]&0xff000000; break; + case 0 : return c; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + + { + const uint8_t *k8 = (const uint8_t *)k; + switch(length) /* all the case statements fall through */ + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<8; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<16; /* fall through */ + case 9 : c+=((uint32_t)k8[8])<<24; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<8; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<16; /* fall through */ + case 5 : b+=((uint32_t)k8[4])<<24; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<8; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<16; /* fall through */ + case 1 : a+=((uint32_t)k8[0])<<24; break; + case 0 : return c; + } + } + +#endif /* !VALGRIND */ + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += ((uint32_t)k[0])<<24; + a += ((uint32_t)k[1])<<16; + a += ((uint32_t)k[2])<<8; + a += ((uint32_t)k[3]); + b += ((uint32_t)k[4])<<24; + b += ((uint32_t)k[5])<<16; + b += ((uint32_t)k[6])<<8; + b += ((uint32_t)k[7]); + c += ((uint32_t)k[8])<<24; + c += ((uint32_t)k[9])<<16; + c += ((uint32_t)k[10])<<8; + c += ((uint32_t)k[11]); + mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=k[11]; + case 11: c+=((uint32_t)k[10])<<8; + case 10: c+=((uint32_t)k[9])<<16; + case 9 : c+=((uint32_t)k[8])<<24; + case 8 : b+=k[7]; + case 7 : b+=((uint32_t)k[6])<<8; + case 6 : b+=((uint32_t)k[5])<<16; + case 5 : b+=((uint32_t)k[4])<<24; + case 4 : a+=k[3]; + case 3 : a+=((uint32_t)k[2])<<8; + case 2 : a+=((uint32_t)k[1])<<16; + case 1 : a+=((uint32_t)k[0])<<24; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} + +#ifdef SELF_TEST + +/* used for timings */ +void driver1() +{ + uint8_t buf[256]; + uint32_t i; + uint32_t h=0; + time_t a,z; + + time(&a); + for (i=0; i<256; ++i) buf[i] = 'x'; + for (i=0; i<1; ++i) + { + h = hashlittle(&buf[0],1,h); + } + time(&z); + if (z-a > 0) printf("time %d %.8x\n", z-a, h); +} + +/* check that every input bit changes every output bit half the time */ +#define HASHSTATE 1 +#define HASHLEN 1 +#define MAXPAIR 60 +#define MAXLEN 70 +void driver2() +{ + uint8_t qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1]; + uint32_t c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, m=0, z; + uint32_t e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE]; + uint32_t x[HASHSTATE],y[HASHSTATE]; + uint32_t hlen; + + printf("No more than %d trials should ever be needed \n",MAXPAIR/2); + for (hlen=0; hlen < MAXLEN; ++hlen) + { + z=0; + for (i=0; i<hlen; ++i) /*----------------------- for each input byte, */ + { + for (j=0; j<8; ++j) /*------------------------ for each input bit, */ + { + for (m=1; m<8; ++m) /*------------- for several possible initvals, */ + { + for (l=0; l<HASHSTATE; ++l) + e[l]=f[l]=g[l]=h[l]=x[l]=y[l]=~((uint32_t)0); + + /*---- check that every output bit is affected by that input bit */ + for (k=0; k<MAXPAIR; k+=2) + { + uint32_t finished=1; + /* keys have one bit different */ + for (l=0; l<hlen+1; ++l) {a[l] = b[l] = (uint8_t)0;} + /* have a and b be two keys differing in only one bit */ + a[i] ^= (k<<j); + a[i] ^= (k>>(8-j)); + c[0] = hashlittle(a, hlen, m); + b[i] ^= ((k+1)<<j); + b[i] ^= ((k+1)>>(8-j)); + d[0] = hashlittle(b, hlen, m); + /* check every bit is 1, 0, set, and not set at least once */ + for (l=0; l<HASHSTATE; ++l) + { + e[l] &= (c[l]^d[l]); + f[l] &= ~(c[l]^d[l]); + g[l] &= c[l]; + h[l] &= ~c[l]; + x[l] &= d[l]; + y[l] &= ~d[l]; + if (e[l]|f[l]|g[l]|h[l]|x[l]|y[l]) finished=0; + } + if (finished) break; + } + if (k>z) z=k; + if (k==MAXPAIR) + { + printf("Some bit didn't change: "); + printf("%.8x %.8x %.8x %.8x %.8x %.8x ", + e[0],f[0],g[0],h[0],x[0],y[0]); + printf("i %d j %d m %d len %d\n", i, j, m, hlen); + } + if (z==MAXPAIR) goto done; + } + } + } + done: + if (z < MAXPAIR) + { + printf("Mix success %2d bytes %2d initvals ",i,m); + printf("required %d trials\n", z/2); + } + } + printf("\n"); +} + +/* Check for reading beyond the end of the buffer and alignment problems */ +void driver3() +{ + uint8_t buf[MAXLEN+20], *b; + uint32_t len; + uint8_t q[] = "This is the time for all good men to come to the aid of their country..."; + uint32_t h; + uint8_t qq[] = "xThis is the time for all good men to come to the aid of their country..."; + uint32_t i; + uint8_t qqq[] = "xxThis is the time for all good men to come to the aid of their country..."; + uint32_t j; + uint8_t qqqq[] = "xxxThis is the time for all good men to come to the aid of their country..."; + uint32_t ref,x,y; + uint8_t *p; + + printf("Endianness. These lines should all be the same (for values filled in):\n"); + printf("%.8x %.8x %.8x\n", + hashword((const uint32_t *)q, (sizeof(q)-1)/4, 13), + hashword((const uint32_t *)q, (sizeof(q)-5)/4, 13), + hashword((const uint32_t *)q, (sizeof(q)-9)/4, 13)); + p = q; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qq[1]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qqq[2]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qqqq[3]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + printf("\n"); + + /* check that hashlittle2 and hashlittle produce the same results */ + i=47; j=0; + hashlittle2(q, sizeof(q), &i, &j); + if (hashlittle(q, sizeof(q), 47) != i) + printf("hashlittle2 and hashlittle mismatch\n"); + + /* check that hashword2 and hashword produce the same results */ + len = 0xdeadbeef; + i=47, j=0; + hashword2(&len, 1, &i, &j); + if (hashword(&len, 1, 47) != i) + printf("hashword2 and hashword mismatch %x %x\n", + i, hashword(&len, 1, 47)); + + /* check hashlittle doesn't read before or after the ends of the string */ + for (h=0, b=buf+1; h<8; ++h, ++b) + { + for (i=0; i<MAXLEN; ++i) + { + len = i; + for (j=0; j<i; ++j) *(b+j)=0; + + /* these should all be equal */ + ref = hashlittle(b, len, (uint32_t)1); + *(b+i)=(uint8_t)~0; + *(b-1)=(uint8_t)~0; + x = hashlittle(b, len, (uint32_t)1); + y = hashlittle(b, len, (uint32_t)1); + if ((ref != x) || (ref != y)) + { + printf("alignment error: %.8x %.8x %.8x %d %d\n",ref,x,y, + h, i); + } + } + } +} + +/* check for problems with nulls */ + void driver4() +{ + uint8_t buf[1]; + uint32_t h,i,state[HASHSTATE]; + + buf[0] = ~0; + for (i=0; i<HASHSTATE; ++i) state[i] = 1; + printf("These should all be different\n"); + for (i=0, h=0; i<8; ++i) + { + h = hashlittle(buf, 0, h); + printf("%2ld 0-byte strings, hash is %.8x\n", i, h); + } +} + +void driver5() +{ + uint32_t b,c; + b=0, c=0, hashlittle2("", 0, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* deadbeef deadbeef */ + b=0xdeadbeef, c=0, hashlittle2("", 0, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* bd5b7dde deadbeef */ + b=0xdeadbeef, c=0xdeadbeef, hashlittle2("", 0, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* 9c093ccd bd5b7dde */ + b=0, c=0, hashlittle2("Four score and seven years ago", 30, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* 17770551 ce7226e6 */ + b=1, c=0, hashlittle2("Four score and seven years ago", 30, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* e3607cae bd371de4 */ + b=0, c=1, hashlittle2("Four score and seven years ago", 30, &c, &b); + printf("hash is %.8lx %.8lx\n", c, b); /* cd628161 6cbea4b3 */ + c = hashlittle("Four score and seven years ago", 30, 0); + printf("hash is %.8lx\n", c); /* 17770551 */ + c = hashlittle("Four score and seven years ago", 30, 1); + printf("hash is %.8lx\n", c); /* cd628161 */ +} + +int main() +{ + driver1(); /* test that the key is hashed: used for timings */ + driver2(); /* test that whole key is hashed thoroughly */ + driver3(); /* test that nothing but the key is hashed */ + driver4(); /* test hashing multiple buffers (all buffers are null) */ + driver5(); /* test the hash against known vectors */ + return 1; +} + +#endif /* SELF_TEST */ diff --git a/src/journal/lookup3.h b/src/journal/lookup3.h new file mode 100644 index 0000000..04e493e --- /dev/null +++ b/src/journal/lookup3.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LicenseRef-lookup3-public-domain */ +#pragma once + +#include <inttypes.h> +#include <sys/types.h> + +#include "macro.h" + +uint32_t jenkins_hashword(const uint32_t *k, size_t length, uint32_t initval) _pure_; +void jenkins_hashword2(const uint32_t *k, size_t length, uint32_t *pc, uint32_t *pb); + +uint32_t jenkins_hashlittle(const void *key, size_t length, uint32_t initval) _pure_; +void jenkins_hashlittle2(const void *key, size_t length, uint32_t *pc, uint32_t *pb); + +uint32_t jenkins_hashbig(const void *key, size_t length, uint32_t initval) _pure_; + +static inline uint64_t jenkins_hash64(const void *data, size_t length) { + uint32_t a = 0, b = 0; + + jenkins_hashlittle2(data, length, &a, &b); + + return ((uint64_t) a << 32ULL) | (uint64_t) b; +} diff --git a/src/journal/meson.build b/src/journal/meson.build new file mode 100644 index 0000000..7aea28d --- /dev/null +++ b/src/journal/meson.build @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +journal_client_sources = files(''' + audit-type.c + audit-type.h + catalog.c + catalog.h + compress.c + compress.h + journal-def.h + journal-file.c + journal-file.h + journal-send.c + journal-vacuum.c + journal-vacuum.h + journal-verify.c + journal-verify.h + lookup3.c + lookup3.h + mmap-cache.c + mmap-cache.h + sd-journal.c +'''.split()) + +if conf.get('HAVE_GCRYPT') == 1 + journal_client_sources += files(''' + journal-authenticate.c + journal-authenticate.h + fsprg.c + fsprg.h + '''.split()) +endif + +############################################################ + +audit_type_includes = [config_h, + missing_audit_h, + 'linux/audit.h'] +if conf.get('HAVE_AUDIT') == 1 + audit_type_includes += 'libaudit.h' +endif + +generate_audit_type_list = find_program('generate-audit_type-list.sh') +audit_type_list_txt = custom_target( + 'audit_type-list.txt', + output : 'audit_type-list.txt', + command : [generate_audit_type_list, cpp] + audit_type_includes, + capture : true) + +audit_type_to_name = custom_target( + 'audit_type-to-name.h', + input : ['audit_type-to-name.awk', audit_type_list_txt], + output : 'audit_type-to-name.h', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +journal_client_sources += [audit_type_to_name] + +libjournal_client = static_library( + 'journal-client', + journal_client_sources, + include_directories : includes, + c_args : ['-fvisibility=default']) + +############################################################ + +libjournal_core_sources = files(''' + journald-audit.c + journald-audit.h + journald-console.c + journald-console.h + journald-context.c + journald-context.h + journald-kmsg.c + journald-kmsg.h + journald-native.c + journald-native.h + journald-rate-limit.c + journald-rate-limit.h + journald-server.c + journald-server.h + journald-stream.c + journald-stream.h + journald-syslog.c + journald-syslog.h + journald-wall.c + journald-wall.h + journal-internal.h +'''.split()) + +systemd_journald_sources = files(''' + journald.c + journald-server.h +'''.split()) + +journald_gperf_c = custom_target( + 'journald-gperf.c', + input : 'journald-gperf.gperf', + output : 'journald-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +systemd_cat_sources = files('cat.c') + +journalctl_sources = files(''' + journalctl.c + pcre2-dlopen.c + pcre2-dlopen.h +'''.split()) + +if install_sysconfdir + install_data('journald.conf', + install_dir : pkgsysconfdir) +endif + +if get_option('create-log-dirs') + meson.add_install_script( + 'sh', '-c', + mkdir_p.format('/var/log/journal')) + meson.add_install_script( + 'sh', '-c', + '''chown 0:0 $DESTDIR/var/log/journal && + chmod 755 $DESTDIR/var/log/journal || :''') + if get_option('adm-group') + meson.add_install_script( + 'sh', '-c', + 'setfacl -nm g:adm:rx,d:g:adm:rx $DESTDIR/var/log/journal || :') + endif + if get_option('wheel-group') + meson.add_install_script( + 'sh', '-c', + 'setfacl -nm g:wheel:rx,d:g:wheel:rx $DESTDIR/var/log/journal || :') + endif +endif diff --git a/src/journal/mmap-cache.c b/src/journal/mmap-cache.c new file mode 100644 index 0000000..9882016 --- /dev/null +++ b/src/journal/mmap-cache.c @@ -0,0 +1,669 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdlib.h> +#include <sys/mman.h> + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "list.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "mmap-cache.h" +#include "sigbus.h" + +typedef struct Window Window; +typedef struct Context Context; + +struct Window { + MMapCache *cache; + + bool invalidated:1; + bool keep_always:1; + bool in_unused:1; + + int prot; + void *ptr; + uint64_t offset; + size_t size; + + MMapFileDescriptor *fd; + + LIST_FIELDS(Window, by_fd); + LIST_FIELDS(Window, unused); + + LIST_HEAD(Context, contexts); +}; + +struct Context { + MMapCache *cache; + unsigned id; + Window *window; + + LIST_FIELDS(Context, by_window); +}; + +struct MMapFileDescriptor { + MMapCache *cache; + int fd; + bool sigbus; + LIST_HEAD(Window, windows); +}; + +struct MMapCache { + unsigned n_ref; + unsigned n_windows; + + unsigned n_hit, n_missed; + + Hashmap *fds; + Context *contexts[MMAP_CACHE_MAX_CONTEXTS]; + + LIST_HEAD(Window, unused); + Window *last_unused; +}; + +#define WINDOWS_MIN 64 + +#if ENABLE_DEBUG_MMAP_CACHE +/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */ +# define WINDOW_SIZE (page_size()) +#else +# define WINDOW_SIZE (8ULL*1024ULL*1024ULL) +#endif + +MMapCache* mmap_cache_new(void) { + MMapCache *m; + + m = new0(MMapCache, 1); + if (!m) + return NULL; + + m->n_ref = 1; + return m; +} + +static void window_unlink(Window *w) { + Context *c; + + assert(w); + + if (w->ptr) + munmap(w->ptr, w->size); + + if (w->fd) + LIST_REMOVE(by_fd, w->fd->windows, w); + + if (w->in_unused) { + if (w->cache->last_unused == w) + w->cache->last_unused = w->unused_prev; + + LIST_REMOVE(unused, w->cache->unused, w); + } + + LIST_FOREACH(by_window, c, w->contexts) { + assert(c->window == w); + c->window = NULL; + } +} + +static void window_invalidate(Window *w) { + assert(w); + + if (w->invalidated) + return; + + /* Replace the window with anonymous pages. This is useful + * when we hit a SIGBUS and want to make sure the file cannot + * trigger any further SIGBUS, possibly overrunning the sigbus + * queue. */ + + assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); + w->invalidated = true; +} + +static void window_free(Window *w) { + assert(w); + + window_unlink(w); + w->cache->n_windows--; + free(w); +} + +_pure_ static bool window_matches(Window *w, int prot, uint64_t offset, size_t size) { + assert(w); + assert(size > 0); + + return + prot == w->prot && + offset >= w->offset && + offset + size <= w->offset + w->size; +} + +_pure_ static bool window_matches_fd(Window *w, MMapFileDescriptor *f, int prot, uint64_t offset, size_t size) { + assert(w); + assert(f); + + return + w->fd && + f->fd == w->fd->fd && + window_matches(w, prot, offset, size); +} + +static Window *window_add(MMapCache *m, MMapFileDescriptor *f, int prot, bool keep_always, uint64_t offset, size_t size, void *ptr) { + Window *w; + + assert(m); + assert(f); + + if (!m->last_unused || m->n_windows <= WINDOWS_MIN) { + + /* Allocate a new window */ + w = new(Window, 1); + if (!w) + return NULL; + m->n_windows++; + } else { + + /* Reuse an existing one */ + w = m->last_unused; + window_unlink(w); + } + + *w = (Window) { + .cache = m, + .fd = f, + .prot = prot, + .keep_always = keep_always, + .offset = offset, + .size = size, + .ptr = ptr, + }; + + LIST_PREPEND(by_fd, f->windows, w); + + return w; +} + +static void context_detach_window(Context *c) { + Window *w; + + assert(c); + + if (!c->window) + return; + + w = TAKE_PTR(c->window); + LIST_REMOVE(by_window, w->contexts, c); + + if (!w->contexts && !w->keep_always) { + /* Not used anymore? */ +#if ENABLE_DEBUG_MMAP_CACHE + /* Unmap unused windows immediately to expose use-after-unmap + * by SIGSEGV. */ + window_free(w); +#else + LIST_PREPEND(unused, c->cache->unused, w); + if (!c->cache->last_unused) + c->cache->last_unused = w; + + w->in_unused = true; +#endif + } +} + +static void context_attach_window(Context *c, Window *w) { + assert(c); + assert(w); + + if (c->window == w) + return; + + context_detach_window(c); + + if (w->in_unused) { + /* Used again? */ + LIST_REMOVE(unused, c->cache->unused, w); + if (c->cache->last_unused == w) + c->cache->last_unused = w->unused_prev; + + w->in_unused = false; + } + + c->window = w; + LIST_PREPEND(by_window, w->contexts, c); +} + +static Context *context_add(MMapCache *m, unsigned id) { + Context *c; + + assert(m); + + c = m->contexts[id]; + if (c) + return c; + + c = new0(Context, 1); + if (!c) + return NULL; + + c->cache = m; + c->id = id; + + assert(!m->contexts[id]); + m->contexts[id] = c; + + return c; +} + +static void context_free(Context *c) { + assert(c); + + context_detach_window(c); + + if (c->cache) { + assert(c->cache->contexts[c->id] == c); + c->cache->contexts[c->id] = NULL; + } + + free(c); +} + +static MMapCache *mmap_cache_free(MMapCache *m) { + int i; + + assert(m); + + for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++) + if (m->contexts[i]) + context_free(m->contexts[i]); + + hashmap_free(m->fds); + + while (m->unused) + window_free(m->unused); + + return mfree(m); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free); + +static int make_room(MMapCache *m) { + assert(m); + + if (!m->last_unused) + return 0; + + window_free(m->last_unused); + return 1; +} + +static int try_context( + MMapCache *m, + MMapFileDescriptor *f, + int prot, + unsigned context, + bool keep_always, + uint64_t offset, + size_t size, + void **ret, + size_t *ret_size) { + + Context *c; + + assert(m); + assert(m->n_ref > 0); + assert(f); + assert(size > 0); + assert(ret); + + c = m->contexts[context]; + if (!c) + return 0; + + assert(c->id == context); + + if (!c->window) + return 0; + + if (!window_matches_fd(c->window, f, prot, offset, size)) { + + /* Drop the reference to the window, since it's unnecessary now */ + context_detach_window(c); + return 0; + } + + if (c->window->fd->sigbus) + return -EIO; + + c->window->keep_always = c->window->keep_always || keep_always; + + *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset); + if (ret_size) + *ret_size = c->window->size - (offset - c->window->offset); + + return 1; +} + +static int find_mmap( + MMapCache *m, + MMapFileDescriptor *f, + int prot, + unsigned context, + bool keep_always, + uint64_t offset, + size_t size, + void **ret, + size_t *ret_size) { + + Window *w; + Context *c; + + assert(m); + assert(m->n_ref > 0); + assert(f); + assert(size > 0); + + if (f->sigbus) + return -EIO; + + LIST_FOREACH(by_fd, w, f->windows) + if (window_matches(w, prot, offset, size)) + break; + + if (!w) + return 0; + + c = context_add(m, context); + if (!c) + return -ENOMEM; + + context_attach_window(c, w); + w->keep_always = w->keep_always || keep_always; + + *ret = (uint8_t*) w->ptr + (offset - w->offset); + if (ret_size) + *ret_size = w->size - (offset - w->offset); + + return 1; +} + +static int mmap_try_harder(MMapCache *m, void *addr, MMapFileDescriptor *f, int prot, int flags, uint64_t offset, size_t size, void **res) { + void *ptr; + + assert(m); + assert(f); + assert(res); + + for (;;) { + int r; + + ptr = mmap(addr, size, prot, flags, f->fd, offset); + if (ptr != MAP_FAILED) + break; + if (errno != ENOMEM) + return negative_errno(); + + r = make_room(m); + if (r < 0) + return r; + if (r == 0) + return -ENOMEM; + } + + *res = ptr; + return 0; +} + +static int add_mmap( + MMapCache *m, + MMapFileDescriptor *f, + int prot, + unsigned context, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret, + size_t *ret_size) { + + uint64_t woffset, wsize; + Context *c; + Window *w; + void *d; + int r; + + assert(m); + assert(m->n_ref > 0); + assert(f); + assert(size > 0); + assert(ret); + + woffset = offset & ~((uint64_t) page_size() - 1ULL); + wsize = size + (offset - woffset); + wsize = PAGE_ALIGN(wsize); + + if (wsize < WINDOW_SIZE) { + uint64_t delta; + + delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2); + + if (delta > offset) + woffset = 0; + else + woffset -= delta; + + wsize = WINDOW_SIZE; + } + + if (st) { + /* Memory maps that are larger then the files + underneath have undefined behavior. Hence, clamp + things to the file size if we know it */ + + if (woffset >= (uint64_t) st->st_size) + return -EADDRNOTAVAIL; + + if (woffset + wsize > (uint64_t) st->st_size) + wsize = PAGE_ALIGN(st->st_size - woffset); + } + + r = mmap_try_harder(m, NULL, f, prot, MAP_SHARED, woffset, wsize, &d); + if (r < 0) + return r; + + c = context_add(m, context); + if (!c) + goto outofmem; + + w = window_add(m, f, prot, keep_always, woffset, wsize, d); + if (!w) + goto outofmem; + + context_attach_window(c, w); + + *ret = (uint8_t*) w->ptr + (offset - w->offset); + if (ret_size) + *ret_size = w->size - (offset - w->offset); + + return 1; + +outofmem: + (void) munmap(d, wsize); + return -ENOMEM; +} + +int mmap_cache_get( + MMapCache *m, + MMapFileDescriptor *f, + int prot, + unsigned context, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret, + size_t *ret_size) { + + int r; + + assert(m); + assert(m->n_ref > 0); + assert(f); + assert(size > 0); + assert(ret); + assert(context < MMAP_CACHE_MAX_CONTEXTS); + + /* Check whether the current context is the right one already */ + r = try_context(m, f, prot, context, keep_always, offset, size, ret, ret_size); + if (r != 0) { + m->n_hit++; + return r; + } + + /* Search for a matching mmap */ + r = find_mmap(m, f, prot, context, keep_always, offset, size, ret, ret_size); + if (r != 0) { + m->n_hit++; + return r; + } + + m->n_missed++; + + /* Create a new mmap */ + return add_mmap(m, f, prot, context, keep_always, offset, size, st, ret, ret_size); +} + +unsigned mmap_cache_get_hit(MMapCache *m) { + assert(m); + + return m->n_hit; +} + +unsigned mmap_cache_get_missed(MMapCache *m) { + assert(m); + + return m->n_missed; +} + +static void mmap_cache_process_sigbus(MMapCache *m) { + bool found = false; + MMapFileDescriptor *f; + int r; + + assert(m); + + /* Iterate through all triggered pages and mark their files as + * invalidated */ + for (;;) { + bool ours; + void *addr; + + r = sigbus_pop(&addr); + if (_likely_(r == 0)) + break; + if (r < 0) { + log_error_errno(r, "SIGBUS handling failed: %m"); + abort(); + } + + ours = false; + HASHMAP_FOREACH(f, m->fds) { + Window *w; + + LIST_FOREACH(by_fd, w, f->windows) { + if ((uint8_t*) addr >= (uint8_t*) w->ptr && + (uint8_t*) addr < (uint8_t*) w->ptr + w->size) { + found = ours = f->sigbus = true; + break; + } + } + + if (ours) + break; + } + + /* Didn't find a matching window, give up */ + if (!ours) { + log_error("Unknown SIGBUS page, aborting."); + abort(); + } + } + + /* The list of triggered pages is now empty. Now, let's remap + * all windows of the triggered file to anonymous maps, so + * that no page of the file in question is triggered again, so + * that we can be sure not to hit the queue size limit. */ + if (_likely_(!found)) + return; + + HASHMAP_FOREACH(f, m->fds) { + Window *w; + + if (!f->sigbus) + continue; + + LIST_FOREACH(by_fd, w, f->windows) + window_invalidate(w); + } +} + +bool mmap_cache_got_sigbus(MMapCache *m, MMapFileDescriptor *f) { + assert(m); + assert(f); + + mmap_cache_process_sigbus(m); + + return f->sigbus; +} + +MMapFileDescriptor* mmap_cache_add_fd(MMapCache *m, int fd) { + MMapFileDescriptor *f; + int r; + + assert(m); + assert(fd >= 0); + + f = hashmap_get(m->fds, FD_TO_PTR(fd)); + if (f) + return f; + + r = hashmap_ensure_allocated(&m->fds, NULL); + if (r < 0) + return NULL; + + f = new0(MMapFileDescriptor, 1); + if (!f) + return NULL; + + f->cache = m; + f->fd = fd; + + r = hashmap_put(m->fds, FD_TO_PTR(fd), f); + if (r < 0) + return mfree(f); + + return f; +} + +void mmap_cache_free_fd(MMapCache *m, MMapFileDescriptor *f) { + assert(m); + assert(f); + + /* Make sure that any queued SIGBUS are first dispatched, so + * that we don't end up with a SIGBUS entry we cannot relate + * to any existing memory map */ + + mmap_cache_process_sigbus(m); + + while (f->windows) + window_free(f->windows); + + if (f->cache) + assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd))); + + free(f); +} diff --git a/src/journal/mmap-cache.h b/src/journal/mmap-cache.h new file mode 100644 index 0000000..28d5ab1 --- /dev/null +++ b/src/journal/mmap-cache.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <sys/stat.h> + +/* One context per object type, plus one of the header, plus one "additional" one */ +#define MMAP_CACHE_MAX_CONTEXTS 9 + +typedef struct MMapCache MMapCache; +typedef struct MMapFileDescriptor MMapFileDescriptor; + +MMapCache* mmap_cache_new(void); +MMapCache* mmap_cache_ref(MMapCache *m); +MMapCache* mmap_cache_unref(MMapCache *m); + +int mmap_cache_get( + MMapCache *m, + MMapFileDescriptor *f, + int prot, + unsigned context, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret, + size_t *ret_size); +MMapFileDescriptor * mmap_cache_add_fd(MMapCache *m, int fd); +void mmap_cache_free_fd(MMapCache *m, MMapFileDescriptor *f); + +unsigned mmap_cache_get_hit(MMapCache *m); +unsigned mmap_cache_get_missed(MMapCache *m); + +bool mmap_cache_got_sigbus(MMapCache *m, MMapFileDescriptor *f); diff --git a/src/journal/pcre2-dlopen.c b/src/journal/pcre2-dlopen.c new file mode 100644 index 0000000..fbe81f9 --- /dev/null +++ b/src/journal/pcre2-dlopen.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "pcre2-dlopen.h" + +#if HAVE_PCRE2 +static void *pcre2_dl = NULL; + +pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *); +void (*sym_pcre2_match_data_free)(pcre2_match_data *); +void (*sym_pcre2_code_free)(pcre2_code *); +pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, pcre2_compile_context *); +int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE); +int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *); +PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *); + +int dlopen_pcre2(void) { + _cleanup_(dlclosep) void *dl = NULL; + int r; + + if (pcre2_dl) + return 0; /* Already loaded */ + + dl = dlopen("libpcre2-8.so.0", RTLD_LAZY); + if (!dl) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "PCRE2 support is not installed: %s", dlerror()); + + r = dlsym_many_and_warn( + dl, + LOG_ERR, + &sym_pcre2_match_data_create, "pcre2_match_data_create_8", + &sym_pcre2_match_data_free, "pcre2_match_data_free_8", + &sym_pcre2_code_free, "pcre2_code_free_8", + &sym_pcre2_compile, "pcre2_compile_8", + &sym_pcre2_get_error_message, "pcre2_get_error_message_8", + &sym_pcre2_match, "pcre2_match_8", + &sym_pcre2_get_ovector_pointer, "pcre2_get_ovector_pointer_8", + NULL); + if (r < 0) + return r; + + /* Note that we never release the reference here, because there's no real reason to, after all this + * was traditionally a regular shared library dependency which lives forever too. */ + pcre2_dl = TAKE_PTR(dl); + + return 1; +} + +#else + +int dlopen_pcre2(void) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "PCRE2 support is not compiled in."); +} +#endif diff --git a/src/journal/pcre2-dlopen.h b/src/journal/pcre2-dlopen.h new file mode 100644 index 0000000..1306334 --- /dev/null +++ b/src/journal/pcre2-dlopen.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_PCRE2 + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> + +extern pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *); +extern void (*sym_pcre2_match_data_free)(pcre2_match_data *); +extern void (*sym_pcre2_code_free)(pcre2_code *); +extern pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, pcre2_compile_context *); +extern int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE); +extern int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *); +extern PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *); +#endif + +int dlopen_pcre2(void); diff --git a/src/journal/sd-journal.c b/src/journal/sd-journal.c new file mode 100644 index 0000000..346970d --- /dev/null +++ b/src/journal/sd-journal.c @@ -0,0 +1,3274 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <linux/magic.h> +#include <poll.h> +#include <stddef.h> +#include <sys/inotify.h> +#include <sys/vfs.h> +#include <unistd.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "compress.h" +#include "dirent-util.h" +#include "env-file.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "list.h" +#include "lookup3.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "process-util.h" +#include "replace-var.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" + +#define JOURNAL_FILES_MAX 7168 + +#define JOURNAL_FILES_RECHECK_USEC (2 * USEC_PER_SEC) + +/* The maximum size of variable values we'll expand in catalog entries. We bind this to PATH_MAX for now, as + * we want to be able to show all officially valid paths at least */ +#define REPLACE_VAR_MAX PATH_MAX + +#define DEFAULT_DATA_THRESHOLD (64*1024) + +static void remove_file_real(sd_journal *j, JournalFile *f); + +static bool journal_pid_changed(sd_journal *j) { + assert(j); + + /* We don't support people creating a journal object and + * keeping it around over a fork(). Let's complain. */ + + return j->original_pid != getpid_cached(); +} + +static int journal_put_error(sd_journal *j, int r, const char *path) { + char *copy; + int k; + + /* Memorize an error we encountered, and store which + * file/directory it was generated from. Note that we store + * only *one* path per error code, as the error code is the + * key into the hashmap, and the path is the value. This means + * we keep track only of all error kinds, but not of all error + * locations. This has the benefit that the hashmap cannot + * grow beyond bounds. + * + * We return an error here only if we didn't manage to + * memorize the real error. */ + + if (r >= 0) + return r; + + k = hashmap_ensure_allocated(&j->errors, NULL); + if (k < 0) + return k; + + if (path) { + copy = strdup(path); + if (!copy) + return -ENOMEM; + } else + copy = NULL; + + k = hashmap_put(j->errors, INT_TO_PTR(r), copy); + if (k < 0) { + free(copy); + + if (k == -EEXIST) + return 0; + + return k; + } + + return 0; +} + +static void detach_location(sd_journal *j) { + JournalFile *f; + + assert(j); + + j->current_file = NULL; + j->current_field = 0; + + ORDERED_HASHMAP_FOREACH(f, j->files) + journal_file_reset_location(f); +} + +static void init_location(Location *l, LocationType type, JournalFile *f, Object *o) { + assert(l); + assert(IN_SET(type, LOCATION_DISCRETE, LOCATION_SEEK)); + assert(f); + + *l = (Location) { + .type = type, + .seqnum = le64toh(o->entry.seqnum), + .seqnum_id = f->header->seqnum_id, + .realtime = le64toh(o->entry.realtime), + .monotonic = le64toh(o->entry.monotonic), + .boot_id = o->entry.boot_id, + .xor_hash = le64toh(o->entry.xor_hash), + .seqnum_set = true, + .realtime_set = true, + .monotonic_set = true, + .xor_hash_set = true, + }; +} + +static void set_location(sd_journal *j, JournalFile *f, Object *o) { + assert(j); + assert(f); + assert(o); + + init_location(&j->current_location, LOCATION_DISCRETE, f, o); + + j->current_file = f; + j->current_field = 0; + + /* Let f know its candidate entry was picked. */ + assert(f->location_type == LOCATION_SEEK); + f->location_type = LOCATION_DISCRETE; +} + +static int match_is_valid(const void *data, size_t size) { + const char *b, *p; + + assert(data); + + if (size < 2) + return false; + + if (((char*) data)[0] == '_' && ((char*) data)[1] == '_') + return false; + + b = data; + for (p = b; p < b + size; p++) { + + if (*p == '=') + return p > b; + + if (*p == '_') + continue; + + if (*p >= 'A' && *p <= 'Z') + continue; + + if (*p >= '0' && *p <= '9') + continue; + + return false; + } + + return false; +} + +static bool same_field(const void *_a, size_t s, const void *_b, size_t t) { + const uint8_t *a = _a, *b = _b; + size_t j; + + for (j = 0; j < s && j < t; j++) { + + if (a[j] != b[j]) + return false; + + if (a[j] == '=') + return true; + } + + assert_not_reached("\"=\" not found"); +} + +static Match *match_new(Match *p, MatchType t) { + Match *m; + + m = new(Match, 1); + if (!m) + return NULL; + + *m = (Match) { + .type = t, + .parent = p, + }; + + if (p) + LIST_PREPEND(matches, p->matches, m); + + return m; +} + +static void match_free(Match *m) { + assert(m); + + while (m->matches) + match_free(m->matches); + + if (m->parent) + LIST_REMOVE(matches, m->parent->matches, m); + + free(m->data); + free(m); +} + +static void match_free_if_empty(Match *m) { + if (!m || m->matches) + return; + + match_free(m); +} + +_public_ int sd_journal_add_match(sd_journal *j, const void *data, size_t size) { + Match *l3, *l4, *add_here = NULL, *m; + uint64_t hash; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(data, -EINVAL); + + if (size == 0) + size = strlen(data); + + assert_return(match_is_valid(data, size), -EINVAL); + + /* level 0: AND term + * level 1: OR terms + * level 2: AND terms + * level 3: OR terms + * level 4: concrete matches */ + + if (!j->level0) { + j->level0 = match_new(NULL, MATCH_AND_TERM); + if (!j->level0) + return -ENOMEM; + } + + if (!j->level1) { + j->level1 = match_new(j->level0, MATCH_OR_TERM); + if (!j->level1) + return -ENOMEM; + } + + if (!j->level2) { + j->level2 = match_new(j->level1, MATCH_AND_TERM); + if (!j->level2) + return -ENOMEM; + } + + assert(j->level0->type == MATCH_AND_TERM); + assert(j->level1->type == MATCH_OR_TERM); + assert(j->level2->type == MATCH_AND_TERM); + + /* Old-style Jenkins (unkeyed) hashing only here. We do not cover new-style siphash (keyed) hashing + * here, since it's different for each file, and thus can't be pre-calculated in the Match object. */ + hash = jenkins_hash64(data, size); + + LIST_FOREACH(matches, l3, j->level2->matches) { + assert(l3->type == MATCH_OR_TERM); + + LIST_FOREACH(matches, l4, l3->matches) { + assert(l4->type == MATCH_DISCRETE); + + /* Exactly the same match already? Then ignore + * this addition */ + if (l4->hash == hash && + l4->size == size && + memcmp(l4->data, data, size) == 0) + return 0; + + /* Same field? Then let's add this to this OR term */ + if (same_field(data, size, l4->data, l4->size)) { + add_here = l3; + break; + } + } + + if (add_here) + break; + } + + if (!add_here) { + add_here = match_new(j->level2, MATCH_OR_TERM); + if (!add_here) + goto fail; + } + + m = match_new(add_here, MATCH_DISCRETE); + if (!m) + goto fail; + + m->hash = hash; + m->size = size; + m->data = memdup(data, size); + if (!m->data) + goto fail; + + detach_location(j); + + return 0; + +fail: + match_free_if_empty(add_here); + match_free_if_empty(j->level2); + match_free_if_empty(j->level1); + match_free_if_empty(j->level0); + + return -ENOMEM; +} + +_public_ int sd_journal_add_conjunction(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + if (!j->level0) + return 0; + + if (!j->level1) + return 0; + + if (!j->level1->matches) + return 0; + + j->level1 = NULL; + j->level2 = NULL; + + return 0; +} + +_public_ int sd_journal_add_disjunction(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + if (!j->level0) + return 0; + + if (!j->level1) + return 0; + + if (!j->level2) + return 0; + + if (!j->level2->matches) + return 0; + + j->level2 = NULL; + return 0; +} + +static char *match_make_string(Match *m) { + char *p = NULL, *r; + Match *i; + bool enclose = false; + + if (!m) + return strdup("none"); + + if (m->type == MATCH_DISCRETE) + return cescape_length(m->data, m->size); + + LIST_FOREACH(matches, i, m->matches) { + char *t, *k; + + t = match_make_string(i); + if (!t) + return mfree(p); + + if (p) { + k = strjoin(p, m->type == MATCH_OR_TERM ? " OR " : " AND ", t); + free(p); + free(t); + + if (!k) + return NULL; + + p = k; + + enclose = true; + } else + p = t; + } + + if (enclose) { + r = strjoin("(", p, ")"); + free(p); + return r; + } + + return p; +} + +char *journal_make_match_string(sd_journal *j) { + assert(j); + + return match_make_string(j->level0); +} + +_public_ void sd_journal_flush_matches(sd_journal *j) { + if (!j) + return; + + if (j->level0) + match_free(j->level0); + + j->level0 = j->level1 = j->level2 = NULL; + + detach_location(j); +} + +_pure_ static int compare_with_location(const JournalFile *f, const Location *l, const JournalFile *current_file) { + int r; + + assert(f); + assert(l); + assert(f->location_type == LOCATION_SEEK); + assert(IN_SET(l->type, LOCATION_DISCRETE, LOCATION_SEEK)); + + if (l->monotonic_set && + sd_id128_equal(f->current_boot_id, l->boot_id) && + l->realtime_set && + f->current_realtime == l->realtime && + l->xor_hash_set && + f->current_xor_hash == l->xor_hash && + l->seqnum_set && + sd_id128_equal(f->header->seqnum_id, l->seqnum_id) && + f->current_seqnum == l->seqnum && + f != current_file) + return 0; + + if (l->seqnum_set && + sd_id128_equal(f->header->seqnum_id, l->seqnum_id)) { + + r = CMP(f->current_seqnum, l->seqnum); + if (r != 0) + return r; + } + + if (l->monotonic_set && + sd_id128_equal(f->current_boot_id, l->boot_id)) { + + r = CMP(f->current_monotonic, l->monotonic); + if (r != 0) + return r; + } + + if (l->realtime_set) { + + r = CMP(f->current_realtime, l->realtime); + if (r != 0) + return r; + } + + if (l->xor_hash_set) { + + r = CMP(f->current_xor_hash, l->xor_hash); + if (r != 0) + return r; + } + + return 0; +} + +static int next_for_match( + sd_journal *j, + Match *m, + JournalFile *f, + uint64_t after_offset, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + uint64_t np = 0; + Object *n; + + assert(j); + assert(m); + assert(f); + + if (m->type == MATCH_DISCRETE) { + uint64_t dp, hash; + + /* If the keyed hash logic is used, we need to calculate the hash fresh per file. Otherwise + * we can use what we pre-calculated. */ + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + hash = journal_file_hash_data(f, m->data, m->size); + else + hash = m->hash; + + r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, NULL, &dp); + if (r <= 0) + return r; + + return journal_file_move_to_entry_by_offset_for_data(f, dp, after_offset, direction, ret, offset); + + } else if (m->type == MATCH_OR_TERM) { + Match *i; + + /* Find the earliest match beyond after_offset */ + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = next_for_match(j, i, f, after_offset, direction, NULL, &cp); + if (r < 0) + return r; + else if (r > 0) { + if (np == 0 || (direction == DIRECTION_DOWN ? cp < np : cp > np)) + np = cp; + } + } + + if (np == 0) + return 0; + + } else if (m->type == MATCH_AND_TERM) { + Match *i, *last_moved; + + /* Always jump to the next matching entry and repeat + * this until we find an offset that matches for all + * matches. */ + + if (!m->matches) + return 0; + + r = next_for_match(j, m->matches, f, after_offset, direction, NULL, &np); + if (r <= 0) + return r; + + assert(direction == DIRECTION_DOWN ? np >= after_offset : np <= after_offset); + last_moved = m->matches; + + LIST_LOOP_BUT_ONE(matches, i, m->matches, last_moved) { + uint64_t cp; + + r = next_for_match(j, i, f, np, direction, NULL, &cp); + if (r <= 0) + return r; + + assert(direction == DIRECTION_DOWN ? cp >= np : cp <= np); + if (direction == DIRECTION_DOWN ? cp > np : cp < np) { + np = cp; + last_moved = i; + } + } + } + + assert(np > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, np, &n); + if (r < 0) + return r; + + if (ret) + *ret = n; + if (offset) + *offset = np; + + return 1; +} + +static int find_location_for_match( + sd_journal *j, + Match *m, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + + assert(j); + assert(m); + assert(f); + + if (m->type == MATCH_DISCRETE) { + uint64_t dp, hash; + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + hash = journal_file_hash_data(f, m->data, m->size); + else + hash = m->hash; + + r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, NULL, &dp); + if (r <= 0) + return r; + + /* FIXME: missing: find by monotonic */ + + if (j->current_location.type == LOCATION_HEAD) + return journal_file_next_entry_for_data(f, NULL, 0, dp, DIRECTION_DOWN, ret, offset); + if (j->current_location.type == LOCATION_TAIL) + return journal_file_next_entry_for_data(f, NULL, 0, dp, DIRECTION_UP, ret, offset); + if (j->current_location.seqnum_set && sd_id128_equal(j->current_location.seqnum_id, f->header->seqnum_id)) + return journal_file_move_to_entry_by_seqnum_for_data(f, dp, j->current_location.seqnum, direction, ret, offset); + if (j->current_location.monotonic_set) { + r = journal_file_move_to_entry_by_monotonic_for_data(f, dp, j->current_location.boot_id, j->current_location.monotonic, direction, ret, offset); + if (r != -ENOENT) + return r; + } + if (j->current_location.realtime_set) + return journal_file_move_to_entry_by_realtime_for_data(f, dp, j->current_location.realtime, direction, ret, offset); + + return journal_file_next_entry_for_data(f, NULL, 0, dp, direction, ret, offset); + + } else if (m->type == MATCH_OR_TERM) { + uint64_t np = 0; + Object *n; + Match *i; + + /* Find the earliest match */ + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = find_location_for_match(j, i, f, direction, NULL, &cp); + if (r < 0) + return r; + else if (r > 0) { + if (np == 0 || (direction == DIRECTION_DOWN ? np > cp : np < cp)) + np = cp; + } + } + + if (np == 0) + return 0; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, np, &n); + if (r < 0) + return r; + + if (ret) + *ret = n; + if (offset) + *offset = np; + + return 1; + + } else { + Match *i; + uint64_t np = 0; + + assert(m->type == MATCH_AND_TERM); + + /* First jump to the last match, and then find the + * next one where all matches match */ + + if (!m->matches) + return 0; + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = find_location_for_match(j, i, f, direction, NULL, &cp); + if (r <= 0) + return r; + + if (np == 0 || (direction == DIRECTION_DOWN ? cp > np : cp < np)) + np = cp; + } + + return next_for_match(j, m, f, np, direction, ret, offset); + } +} + +static int find_location_with_matches( + sd_journal *j, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + + assert(j); + assert(f); + assert(ret); + assert(offset); + + if (!j->level0) { + /* No matches is simple */ + + if (j->current_location.type == LOCATION_HEAD) + return journal_file_next_entry(f, 0, DIRECTION_DOWN, ret, offset); + if (j->current_location.type == LOCATION_TAIL) + return journal_file_next_entry(f, 0, DIRECTION_UP, ret, offset); + if (j->current_location.seqnum_set && sd_id128_equal(j->current_location.seqnum_id, f->header->seqnum_id)) + return journal_file_move_to_entry_by_seqnum(f, j->current_location.seqnum, direction, ret, offset); + if (j->current_location.monotonic_set) { + r = journal_file_move_to_entry_by_monotonic(f, j->current_location.boot_id, j->current_location.monotonic, direction, ret, offset); + if (r != -ENOENT) + return r; + } + if (j->current_location.realtime_set) + return journal_file_move_to_entry_by_realtime(f, j->current_location.realtime, direction, ret, offset); + + return journal_file_next_entry(f, 0, direction, ret, offset); + } else + return find_location_for_match(j, j->level0, f, direction, ret, offset); +} + +static int next_with_matches( + sd_journal *j, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + assert(j); + assert(f); + assert(ret); + assert(offset); + + /* No matches is easy. We simple advance the file + * pointer by one. */ + if (!j->level0) + return journal_file_next_entry(f, f->current_offset, direction, ret, offset); + + /* If we have a match then we look for the next matching entry + * with an offset at least one step larger */ + return next_for_match(j, j->level0, f, + direction == DIRECTION_DOWN ? f->current_offset + 1 + : f->current_offset - 1, + direction, ret, offset); +} + +static int next_beyond_location(sd_journal *j, JournalFile *f, direction_t direction) { + Object *c; + uint64_t cp, n_entries; + int r; + + assert(j); + assert(f); + + n_entries = le64toh(f->header->n_entries); + + /* If we hit EOF before, we don't need to look into this file again + * unless direction changed or new entries appeared. */ + if (f->last_direction == direction && f->location_type == LOCATION_TAIL && + n_entries == f->last_n_entries) + return 0; + + f->last_n_entries = n_entries; + + if (f->last_direction == direction && f->current_offset > 0) { + /* LOCATION_SEEK here means we did the work in a previous + * iteration and the current location already points to a + * candidate entry. */ + if (f->location_type != LOCATION_SEEK) { + r = next_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } + } else { + f->last_direction = direction; + + r = find_location_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } + + /* OK, we found the spot, now let's advance until an entry + * that is actually different from what we were previously + * looking at. This is necessary to handle entries which exist + * in two (or more) journal files, and which shall all be + * suppressed but one. */ + + for (;;) { + bool found; + + if (j->current_location.type == LOCATION_DISCRETE) { + int k; + + k = compare_with_location(f, &j->current_location, j->current_file); + + found = direction == DIRECTION_DOWN ? k > 0 : k < 0; + } else + found = true; + + if (found) + return 1; + + r = next_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } +} + +static int real_journal_next(sd_journal *j, direction_t direction) { + JournalFile *new_file = NULL; + unsigned i, n_files; + const void **files; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + r = iterated_cache_get(j->files_cache, NULL, &files, &n_files); + if (r < 0) + return r; + + for (i = 0; i < n_files; i++) { + JournalFile *f = (JournalFile *)files[i]; + bool found; + + r = next_beyond_location(j, f, direction); + if (r < 0) { + log_debug_errno(r, "Can't iterate through %s, ignoring: %m", f->path); + remove_file_real(j, f); + continue; + } else if (r == 0) { + f->location_type = LOCATION_TAIL; + continue; + } + + if (!new_file) + found = true; + else { + int k; + + k = journal_file_compare_locations(f, new_file); + + found = direction == DIRECTION_DOWN ? k < 0 : k > 0; + } + + if (found) + new_file = f; + } + + if (!new_file) + return 0; + + r = journal_file_move_to_object(new_file, OBJECT_ENTRY, new_file->current_offset, &o); + if (r < 0) + return r; + + set_location(j, new_file, o); + + return 1; +} + +_public_ int sd_journal_next(sd_journal *j) { + return real_journal_next(j, DIRECTION_DOWN); +} + +_public_ int sd_journal_previous(sd_journal *j) { + return real_journal_next(j, DIRECTION_UP); +} + +static int real_journal_next_skip(sd_journal *j, direction_t direction, uint64_t skip) { + int c = 0, r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(skip <= INT_MAX, -ERANGE); + + if (skip == 0) { + /* If this is not a discrete skip, then at least + * resolve the current location */ + if (j->current_location.type != LOCATION_DISCRETE) { + r = real_journal_next(j, direction); + if (r < 0) + return r; + } + + return 0; + } + + do { + r = real_journal_next(j, direction); + if (r < 0) + return r; + + if (r == 0) + return c; + + skip--; + c++; + } while (skip > 0); + + return c; +} + +_public_ int sd_journal_next_skip(sd_journal *j, uint64_t skip) { + return real_journal_next_skip(j, DIRECTION_DOWN, skip); +} + +_public_ int sd_journal_previous_skip(sd_journal *j, uint64_t skip) { + return real_journal_next_skip(j, DIRECTION_UP, skip); +} + +_public_ int sd_journal_get_cursor(sd_journal *j, char **cursor) { + Object *o; + int r; + char bid[SD_ID128_STRING_MAX], sid[SD_ID128_STRING_MAX]; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(cursor, -EINVAL); + + if (!j->current_file || j->current_file->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(j->current_file, OBJECT_ENTRY, j->current_file->current_offset, &o); + if (r < 0) + return r; + + sd_id128_to_string(j->current_file->header->seqnum_id, sid); + sd_id128_to_string(o->entry.boot_id, bid); + + if (asprintf(cursor, + "s=%s;i=%"PRIx64";b=%s;m=%"PRIx64";t=%"PRIx64";x=%"PRIx64, + sid, le64toh(o->entry.seqnum), + bid, le64toh(o->entry.monotonic), + le64toh(o->entry.realtime), + le64toh(o->entry.xor_hash)) < 0) + return -ENOMEM; + + return 0; +} + +_public_ int sd_journal_seek_cursor(sd_journal *j, const char *cursor) { + unsigned long long seqnum, monotonic, realtime, xor_hash; + bool seqnum_id_set = false, + seqnum_set = false, + boot_id_set = false, + monotonic_set = false, + realtime_set = false, + xor_hash_set = false; + sd_id128_t seqnum_id, boot_id; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(!isempty(cursor), -EINVAL); + + for (const char *p = cursor;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ";", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + if (word[0] == '\0' || word[1] != '=') + return -EINVAL; + + switch (word[0]) { + case 's': + seqnum_id_set = true; + r = sd_id128_from_string(word + 2, &seqnum_id); + if (r < 0) + return r; + break; + + case 'i': + seqnum_set = true; + if (sscanf(word + 2, "%llx", &seqnum) != 1) + return -EINVAL; + break; + + case 'b': + boot_id_set = true; + r = sd_id128_from_string(word + 2, &boot_id); + break; + + case 'm': + monotonic_set = true; + if (sscanf(word + 2, "%llx", &monotonic) != 1) + return -EINVAL; + break; + + case 't': + realtime_set = true; + if (sscanf(word + 2, "%llx", &realtime) != 1) + return -EINVAL; + break; + + case 'x': + xor_hash_set = true; + if (sscanf(word + 2, "%llx", &xor_hash) != 1) + return -EINVAL; + break; + } + } + + if ((!seqnum_set || !seqnum_id_set) && + (!monotonic_set || !boot_id_set) && + !realtime_set) + return -EINVAL; + + detach_location(j); + j->current_location = (Location) { + .type = LOCATION_SEEK, + }; + + if (realtime_set) { + j->current_location.realtime = (uint64_t) realtime; + j->current_location.realtime_set = true; + } + + if (seqnum_set && seqnum_id_set) { + j->current_location.seqnum = (uint64_t) seqnum; + j->current_location.seqnum_id = seqnum_id; + j->current_location.seqnum_set = true; + } + + if (monotonic_set && boot_id_set) { + j->current_location.monotonic = (uint64_t) monotonic; + j->current_location.boot_id = boot_id; + j->current_location.monotonic_set = true; + } + + if (xor_hash_set) { + j->current_location.xor_hash = (uint64_t) xor_hash; + j->current_location.xor_hash_set = true; + } + + return 0; +} + +_public_ int sd_journal_test_cursor(sd_journal *j, const char *cursor) { + int r; + Object *o; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(!isempty(cursor), -EINVAL); + + if (!j->current_file || j->current_file->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(j->current_file, OBJECT_ENTRY, j->current_file->current_offset, &o); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *item = NULL; + unsigned long long ll; + sd_id128_t id; + int k = 0; + + r = extract_first_word(&cursor, &item, ";", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + + if (r == 0) + break; + + if (strlen(item) < 2 || item[1] != '=') + return -EINVAL; + + switch (item[0]) { + + case 's': + k = sd_id128_from_string(item+2, &id); + if (k < 0) + return k; + if (!sd_id128_equal(id, j->current_file->header->seqnum_id)) + return 0; + break; + + case 'i': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.seqnum)) + return 0; + break; + + case 'b': + k = sd_id128_from_string(item+2, &id); + if (k < 0) + return k; + if (!sd_id128_equal(id, o->entry.boot_id)) + return 0; + break; + + case 'm': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.monotonic)) + return 0; + break; + + case 't': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.realtime)) + return 0; + break; + + case 'x': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.xor_hash)) + return 0; + break; + } + } + + return 1; +} + +_public_ int sd_journal_seek_monotonic_usec(sd_journal *j, sd_id128_t boot_id, uint64_t usec) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_SEEK, + .boot_id = boot_id, + .monotonic = usec, + .monotonic_set = true, + }; + + return 0; +} + +_public_ int sd_journal_seek_realtime_usec(sd_journal *j, uint64_t usec) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_SEEK, + .realtime = usec, + .realtime_set = true, + }; + + return 0; +} + +_public_ int sd_journal_seek_head(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_HEAD, + }; + + return 0; +} + +_public_ int sd_journal_seek_tail(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_TAIL, + }; + + return 0; +} + +static void check_network(sd_journal *j, int fd) { + assert(j); + + if (j->on_network) + return; + + j->on_network = fd_is_network_fs(fd); +} + +static bool file_has_type_prefix(const char *prefix, const char *filename) { + const char *full, *tilded, *atted; + + full = strjoina(prefix, ".journal"); + tilded = strjoina(full, "~"); + atted = strjoina(prefix, "@"); + + return STR_IN_SET(filename, full, tilded) || + startswith(filename, atted); +} + +static bool file_type_wanted(int flags, const char *filename) { + assert(filename); + + if (!endswith(filename, ".journal") && !endswith(filename, ".journal~")) + return false; + + /* no flags set → every type is OK */ + if (!(flags & (SD_JOURNAL_SYSTEM | SD_JOURNAL_CURRENT_USER))) + return true; + + if (flags & SD_JOURNAL_SYSTEM && file_has_type_prefix("system", filename)) + return true; + + if (flags & SD_JOURNAL_CURRENT_USER) { + char prefix[5 + DECIMAL_STR_MAX(uid_t) + 1]; + + xsprintf(prefix, "user-"UID_FMT, getuid()); + + if (file_has_type_prefix(prefix, filename)) + return true; + } + + return false; +} + +static bool path_has_prefix(sd_journal *j, const char *path, const char *prefix) { + assert(j); + assert(path); + assert(prefix); + + if (j->toplevel_fd >= 0) + return false; + + return path_startswith(path, prefix); +} + +static void track_file_disposition(sd_journal *j, JournalFile *f) { + assert(j); + assert(f); + + if (!j->has_runtime_files && path_has_prefix(j, f->path, "/run")) + j->has_runtime_files = true; + else if (!j->has_persistent_files && path_has_prefix(j, f->path, "/var")) + j->has_persistent_files = true; +} + +static const char *skip_slash(const char *p) { + + if (!p) + return NULL; + + while (*p == '/') + p++; + + return p; +} + +static int add_any_file( + sd_journal *j, + int fd, + const char *path) { + + bool close_fd = false; + JournalFile *f; + struct stat st; + int r, k; + + assert(j); + assert(fd >= 0 || path); + + if (fd < 0) { + if (j->toplevel_fd >= 0) + /* If there's a top-level fd defined make the path relative, explicitly, since otherwise + * openat() ignores the first argument. */ + + fd = openat(j->toplevel_fd, skip_slash(path), O_RDONLY|O_CLOEXEC|O_NONBLOCK); + else + fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + r = log_debug_errno(errno, "Failed to open journal file %s: %m", path); + goto finish; + } + + close_fd = true; + + r = fd_nonblock(fd, false); + if (r < 0) { + r = log_debug_errno(errno, "Failed to turn off O_NONBLOCK for %s: %m", path); + goto finish; + } + } + + if (fstat(fd, &st) < 0) { + r = log_debug_errno(errno, "Failed to fstat file '%s': %m", path); + goto finish; + } + + r = stat_verify_regular(&st); + if (r < 0) { + log_debug_errno(r, "Refusing to open '%s', as it is not a regular file.", path); + goto finish; + } + + f = ordered_hashmap_get(j->files, path); + if (f) { + if (f->last_stat.st_dev == st.st_dev && + f->last_stat.st_ino == st.st_ino) { + + /* We already track this file, under the same path and with the same device/inode numbers, it's + * hence really the same. Mark this file as seen in this generation. This is used to GC old + * files in process_q_overflow() to detect journal files that are still there and discern them + * from those which are gone. */ + + f->last_seen_generation = j->generation; + r = 0; + goto finish; + } + + /* So we tracked a file under this name, but it has a different inode/device. In that case, it got + * replaced (probably due to rotation?), let's drop it hence from our list. */ + remove_file_real(j, f); + f = NULL; + } + + if (ordered_hashmap_size(j->files) >= JOURNAL_FILES_MAX) { + log_debug("Too many open journal files, not adding %s.", path); + r = -ETOOMANYREFS; + goto finish; + } + + r = journal_file_open(fd, path, O_RDONLY, 0, false, 0, false, NULL, j->mmap, NULL, NULL, &f); + if (r < 0) { + log_debug_errno(r, "Failed to open journal file %s: %m", path); + goto finish; + } + + /* journal_file_dump(f); */ + + r = ordered_hashmap_put(j->files, f->path, f); + if (r < 0) { + f->close_fd = false; /* make sure journal_file_close() doesn't close the caller's fd (or our own). We'll let the caller do that, or ourselves */ + (void) journal_file_close(f); + goto finish; + } + + close_fd = false; /* the fd is now owned by the JournalFile object */ + + f->last_seen_generation = j->generation; + + track_file_disposition(j, f); + check_network(j, f->fd); + + j->current_invalidate_counter++; + + log_debug("File %s added.", f->path); + + r = 0; + +finish: + if (close_fd) + safe_close(fd); + + if (r < 0) { + k = journal_put_error(j, r, path); + if (k < 0) + return k; + } + + return r; +} + +static int add_file_by_name( + sd_journal *j, + const char *prefix, + const char *filename) { + + const char *path; + + assert(j); + assert(prefix); + assert(filename); + + if (j->no_new_files) + return 0; + + if (!file_type_wanted(j->flags, filename)) + return 0; + + path = prefix_roota(prefix, filename); + return add_any_file(j, -1, path); +} + +static void remove_file_by_name( + sd_journal *j, + const char *prefix, + const char *filename) { + + const char *path; + JournalFile *f; + + assert(j); + assert(prefix); + assert(filename); + + path = prefix_roota(prefix, filename); + f = ordered_hashmap_get(j->files, path); + if (!f) + return; + + remove_file_real(j, f); +} + +static void remove_file_real(sd_journal *j, JournalFile *f) { + assert(j); + assert(f); + + (void) ordered_hashmap_remove(j->files, f->path); + + log_debug("File %s removed.", f->path); + + if (j->current_file == f) { + j->current_file = NULL; + j->current_field = 0; + } + + if (j->unique_file == f) { + /* Jump to the next unique_file or NULL if that one was last */ + j->unique_file = ordered_hashmap_next(j->files, j->unique_file->path); + j->unique_offset = 0; + if (!j->unique_file) + j->unique_file_lost = true; + } + + if (j->fields_file == f) { + j->fields_file = ordered_hashmap_next(j->files, j->fields_file->path); + j->fields_offset = 0; + if (!j->fields_file) + j->fields_file_lost = true; + } + + (void) journal_file_close(f); + + j->current_invalidate_counter++; +} + +static int dirname_is_machine_id(const char *fn) { + sd_id128_t id, machine; + const char *e; + int r; + + /* Returns true if the specified directory name matches the local machine ID */ + + r = sd_id128_get_machine(&machine); + if (r < 0) + return r; + + e = strchr(fn, '.'); + if (e) { + const char *k; + + /* Looks like it has a namespace suffix. Verify that. */ + if (!log_namespace_name_valid(e + 1)) + return false; + + k = strndupa(fn, e - fn); + r = sd_id128_from_string(k, &id); + } else + r = sd_id128_from_string(fn, &id); + if (r < 0) + return r; + + return sd_id128_equal(id, machine); +} + +static int dirname_has_namespace(const char *fn, const char *namespace) { + const char *e; + + /* Returns true if the specified directory name matches the specified namespace */ + + e = strchr(fn, '.'); + if (e) { + const char *k; + + if (!namespace) + return false; + + if (!streq(e + 1, namespace)) + return false; + + k = strndupa(fn, e - fn); + return id128_is_valid(k); + } + + if (namespace) + return false; + + return id128_is_valid(fn); +} + +static bool dirent_is_journal_file(const struct dirent *de) { + assert(de); + + /* Returns true if the specified directory entry looks like a journal file we might be interested in */ + + if (!IN_SET(de->d_type, DT_REG, DT_LNK, DT_UNKNOWN)) + return false; + + return endswith(de->d_name, ".journal") || + endswith(de->d_name, ".journal~"); +} + +static bool dirent_is_journal_subdir(const struct dirent *de) { + const char *e, *n; + assert(de); + + /* returns true if the specified directory entry looks like a directory that might contain journal + * files we might be interested in, i.e. is either a 128bit ID or a 128bit ID suffixed by a + * namespace. */ + + if (!IN_SET(de->d_type, DT_DIR, DT_LNK, DT_UNKNOWN)) + return false; + + e = strchr(de->d_name, '.'); + if (!e) + return id128_is_valid(de->d_name); /* No namespace */ + + n = strndupa(de->d_name, e - de->d_name); + if (!id128_is_valid(n)) + return false; + + return log_namespace_name_valid(e + 1); +} + +static int directory_open(sd_journal *j, const char *path, DIR **ret) { + DIR *d; + + assert(j); + assert(path); + assert(ret); + + if (j->toplevel_fd < 0) + d = opendir(path); + else + /* Open the specified directory relative to the toplevel fd. Enforce that the path specified is + * relative, by dropping the initial slash */ + d = xopendirat(j->toplevel_fd, skip_slash(path), 0); + if (!d) + return -errno; + + *ret = d; + return 0; +} + +static int add_directory(sd_journal *j, const char *prefix, const char *dirname); + +static void directory_enumerate(sd_journal *j, Directory *m, DIR *d) { + struct dirent *de; + + assert(j); + assert(m); + assert(d); + + FOREACH_DIRENT_ALL(de, d, goto fail) { + + if (dirent_is_journal_file(de)) + (void) add_file_by_name(j, m->path, de->d_name); + + if (m->is_root && dirent_is_journal_subdir(de)) + (void) add_directory(j, m->path, de->d_name); + } + + return; + +fail: + log_debug_errno(errno, "Failed to enumerate directory %s, ignoring: %m", m->path); +} + +static void directory_watch(sd_journal *j, Directory *m, int fd, uint32_t mask) { + int r; + + assert(j); + assert(m); + assert(fd >= 0); + + /* Watch this directory if that's enabled and if it not being watched yet. */ + + if (m->wd > 0) /* Already have a watch? */ + return; + if (j->inotify_fd < 0) /* Not watching at all? */ + return; + + m->wd = inotify_add_watch_fd(j->inotify_fd, fd, mask); + if (m->wd < 0) { + log_debug_errno(errno, "Failed to watch journal directory '%s', ignoring: %m", m->path); + return; + } + + r = hashmap_put(j->directories_by_wd, INT_TO_PTR(m->wd), m); + if (r == -EEXIST) + log_debug_errno(r, "Directory '%s' already being watched under a different path, ignoring: %m", m->path); + if (r < 0) { + log_debug_errno(r, "Failed to add watch for journal directory '%s' to hashmap, ignoring: %m", m->path); + (void) inotify_rm_watch(j->inotify_fd, m->wd); + m->wd = -1; + } +} + +static int add_directory( + sd_journal *j, + const char *prefix, + const char *dirname) { + + _cleanup_free_ char *path = NULL; + _cleanup_closedir_ DIR *d = NULL; + Directory *m; + int r, k; + + assert(j); + assert(prefix); + + /* Adds a journal file directory to watch. If the directory is already tracked this updates the inotify watch + * and reenumerates directory contents */ + + path = path_join(prefix, dirname); + if (!path) { + r = -ENOMEM; + goto fail; + } + + log_debug("Considering directory '%s'.", path); + + /* We consider everything local that is in a directory for the local machine ID, or that is stored in /run */ + if ((j->flags & SD_JOURNAL_LOCAL_ONLY) && + !((dirname && dirname_is_machine_id(dirname) > 0) || path_has_prefix(j, path, "/run"))) + return 0; + + if (dirname && + (!(FLAGS_SET(j->flags, SD_JOURNAL_ALL_NAMESPACES) || + dirname_has_namespace(dirname, j->namespace) > 0 || + (FLAGS_SET(j->flags, SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE) && dirname_has_namespace(dirname, NULL) > 0)))) + return 0; + + r = directory_open(j, path, &d); + if (r < 0) { + log_debug_errno(r, "Failed to open directory '%s': %m", path); + goto fail; + } + + m = hashmap_get(j->directories_by_path, path); + if (!m) { + m = new(Directory, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + *m = (Directory) { + .is_root = false, + .path = path, + }; + + if (hashmap_put(j->directories_by_path, m->path, m) < 0) { + free(m); + r = -ENOMEM; + goto fail; + } + + path = NULL; /* avoid freeing in cleanup */ + j->current_invalidate_counter++; + + log_debug("Directory %s added.", m->path); + + } else if (m->is_root) + return 0; /* Don't 'downgrade' from root directory */ + + m->last_seen_generation = j->generation; + + directory_watch(j, m, dirfd(d), + IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB|IN_DELETE| + IN_DELETE_SELF|IN_MOVE_SELF|IN_UNMOUNT|IN_MOVED_FROM| + IN_ONLYDIR); + + if (!j->no_new_files) + directory_enumerate(j, m, d); + + check_network(j, dirfd(d)); + + return 0; + +fail: + k = journal_put_error(j, r, path ?: prefix); + if (k < 0) + return k; + + return r; +} + +static int add_root_directory(sd_journal *j, const char *p, bool missing_ok) { + + _cleanup_closedir_ DIR *d = NULL; + Directory *m; + int r, k; + + assert(j); + + /* Adds a root directory to our set of directories to use. If the root directory is already in the set, we + * update the inotify logic, and renumerate the directory entries. This call may hence be called to initially + * populate the set, as well as to update it later. */ + + if (p) { + /* If there's a path specified, use it. */ + + log_debug("Considering root directory '%s'.", p); + + if ((j->flags & SD_JOURNAL_RUNTIME_ONLY) && + !path_has_prefix(j, p, "/run")) + return -EINVAL; + + if (j->prefix) + p = strjoina(j->prefix, p); + + r = directory_open(j, p, &d); + if (r == -ENOENT && missing_ok) + return 0; + if (r < 0) { + log_debug_errno(r, "Failed to open root directory %s: %m", p); + goto fail; + } + } else { + _cleanup_close_ int dfd = -1; + + /* If there's no path specified, then we use the top-level fd itself. We duplicate the fd here, since + * opendir() will take possession of the fd, and close it, which we don't want. */ + + p = "."; /* store this as "." in the directories hashmap */ + + dfd = fcntl(j->toplevel_fd, F_DUPFD_CLOEXEC, 3); + if (dfd < 0) { + r = -errno; + goto fail; + } + + d = take_fdopendir(&dfd); + if (!d) { + r = -errno; + goto fail; + } + + rewinddir(d); + } + + m = hashmap_get(j->directories_by_path, p); + if (!m) { + m = new0(Directory, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + m->is_root = true; + + m->path = strdup(p); + if (!m->path) { + free(m); + r = -ENOMEM; + goto fail; + } + + if (hashmap_put(j->directories_by_path, m->path, m) < 0) { + free(m->path); + free(m); + r = -ENOMEM; + goto fail; + } + + j->current_invalidate_counter++; + + log_debug("Root directory %s added.", m->path); + + } else if (!m->is_root) + return 0; + + directory_watch(j, m, dirfd(d), + IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB|IN_DELETE| + IN_ONLYDIR); + + if (!j->no_new_files) + directory_enumerate(j, m, d); + + check_network(j, dirfd(d)); + + return 0; + +fail: + k = journal_put_error(j, r, p); + if (k < 0) + return k; + + return r; +} + +static void remove_directory(sd_journal *j, Directory *d) { + assert(j); + + if (d->wd > 0) { + hashmap_remove(j->directories_by_wd, INT_TO_PTR(d->wd)); + + if (j->inotify_fd >= 0) + (void) inotify_rm_watch(j->inotify_fd, d->wd); + } + + hashmap_remove(j->directories_by_path, d->path); + + if (d->is_root) + log_debug("Root directory %s removed.", d->path); + else + log_debug("Directory %s removed.", d->path); + + free(d->path); + free(d); +} + +static int add_search_paths(sd_journal *j) { + + static const char search_paths[] = + "/run/log/journal\0" + "/var/log/journal\0"; + const char *p; + + assert(j); + + /* We ignore most errors here, since the idea is to only open + * what's actually accessible, and ignore the rest. */ + + NULSTR_FOREACH(p, search_paths) + (void) add_root_directory(j, p, true); + + if (!(j->flags & SD_JOURNAL_LOCAL_ONLY)) + (void) add_root_directory(j, "/var/log/journal/remote", true); + + return 0; +} + +static int add_current_paths(sd_journal *j) { + JournalFile *f; + + assert(j); + assert(j->no_new_files); + + /* Simply adds all directories for files we have open as directories. We don't expect errors here, so we + * treat them as fatal. */ + + ORDERED_HASHMAP_FOREACH(f, j->files) { + _cleanup_free_ char *dir; + int r; + + dir = dirname_malloc(f->path); + if (!dir) + return -ENOMEM; + + r = add_directory(j, dir, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int allocate_inotify(sd_journal *j) { + assert(j); + + if (j->inotify_fd < 0) { + j->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (j->inotify_fd < 0) + return -errno; + } + + return hashmap_ensure_allocated(&j->directories_by_wd, NULL); +} + +static sd_journal *journal_new(int flags, const char *path, const char *namespace) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + + j = new0(sd_journal, 1); + if (!j) + return NULL; + + j->original_pid = getpid_cached(); + j->toplevel_fd = -1; + j->inotify_fd = -1; + j->flags = flags; + j->data_threshold = DEFAULT_DATA_THRESHOLD; + + if (path) { + char *t; + + t = strdup(path); + if (!t) + return NULL; + + if (flags & SD_JOURNAL_OS_ROOT) + j->prefix = t; + else + j->path = t; + } + + if (namespace) { + j->namespace = strdup(namespace); + if (!j->namespace) + return NULL; + } + + j->files = ordered_hashmap_new(&path_hash_ops); + if (!j->files) + return NULL; + + j->files_cache = ordered_hashmap_iterated_cache_new(j->files); + j->directories_by_path = hashmap_new(&path_hash_ops); + j->mmap = mmap_cache_new(); + if (!j->files_cache || !j->directories_by_path || !j->mmap) + return NULL; + + return TAKE_PTR(j); +} + +#define OPEN_ALLOWED_FLAGS \ + (SD_JOURNAL_LOCAL_ONLY | \ + SD_JOURNAL_RUNTIME_ONLY | \ + SD_JOURNAL_SYSTEM | \ + SD_JOURNAL_CURRENT_USER | \ + SD_JOURNAL_ALL_NAMESPACES | \ + SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE) + +_public_ int sd_journal_open_namespace(sd_journal **ret, const char *namespace, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return((flags & ~OPEN_ALLOWED_FLAGS) == 0, -EINVAL); + + j = journal_new(flags, NULL, namespace); + if (!j) + return -ENOMEM; + + r = add_search_paths(j); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int sd_journal_open(sd_journal **ret, int flags) { + return sd_journal_open_namespace(ret, NULL, flags); +} + +#define OPEN_CONTAINER_ALLOWED_FLAGS \ + (SD_JOURNAL_LOCAL_ONLY | SD_JOURNAL_SYSTEM) + +_public_ int sd_journal_open_container(sd_journal **ret, const char *machine, int flags) { + _cleanup_free_ char *root = NULL, *class = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + char *p; + int r; + + /* This is deprecated, people should use machined's OpenMachineRootDirectory() call instead in + * combination with sd_journal_open_directory_fd(). */ + + assert_return(machine, -EINVAL); + assert_return(ret, -EINVAL); + assert_return((flags & ~OPEN_CONTAINER_ALLOWED_FLAGS) == 0, -EINVAL); + assert_return(machine_name_is_valid(machine), -EINVAL); + + p = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, p, + "ROOT", &root, + "CLASS", &class); + if (r == -ENOENT) + return -EHOSTDOWN; + if (r < 0) + return r; + if (!root) + return -ENODATA; + + if (!streq_ptr(class, "container")) + return -EIO; + + j = journal_new(flags, root, NULL); + if (!j) + return -ENOMEM; + + r = add_search_paths(j); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +#define OPEN_DIRECTORY_ALLOWED_FLAGS \ + (SD_JOURNAL_OS_ROOT | \ + SD_JOURNAL_SYSTEM | SD_JOURNAL_CURRENT_USER ) + +_public_ int sd_journal_open_directory(sd_journal **ret, const char *path, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(path, -EINVAL); + assert_return((flags & ~OPEN_DIRECTORY_ALLOWED_FLAGS) == 0, -EINVAL); + + j = journal_new(flags, path, NULL); + if (!j) + return -ENOMEM; + + if (flags & SD_JOURNAL_OS_ROOT) + r = add_search_paths(j); + else + r = add_root_directory(j, path, false); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int sd_journal_open_files(sd_journal **ret, const char **paths, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + const char **path; + int r; + + assert_return(ret, -EINVAL); + assert_return(flags == 0, -EINVAL); + + j = journal_new(flags, NULL, NULL); + if (!j) + return -ENOMEM; + + STRV_FOREACH(path, paths) { + r = add_any_file(j, -1, *path); + if (r < 0) + return r; + } + + j->no_new_files = true; + + *ret = TAKE_PTR(j); + return 0; +} + +#define OPEN_DIRECTORY_FD_ALLOWED_FLAGS \ + (SD_JOURNAL_OS_ROOT | \ + SD_JOURNAL_SYSTEM | SD_JOURNAL_CURRENT_USER ) + +_public_ int sd_journal_open_directory_fd(sd_journal **ret, int fd, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + struct stat st; + int r; + + assert_return(ret, -EINVAL); + assert_return(fd >= 0, -EBADF); + assert_return((flags & ~OPEN_DIRECTORY_FD_ALLOWED_FLAGS) == 0, -EINVAL); + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISDIR(st.st_mode)) + return -EBADFD; + + j = journal_new(flags, NULL, NULL); + if (!j) + return -ENOMEM; + + j->toplevel_fd = fd; + + if (flags & SD_JOURNAL_OS_ROOT) + r = add_search_paths(j); + else + r = add_root_directory(j, NULL, false); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int sd_journal_open_files_fd(sd_journal **ret, int fds[], unsigned n_fds, int flags) { + JournalFile *f; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + unsigned i; + int r; + + assert_return(ret, -EINVAL); + assert_return(n_fds > 0, -EBADF); + assert_return(flags == 0, -EINVAL); + + j = journal_new(flags, NULL, NULL); + if (!j) + return -ENOMEM; + + for (i = 0; i < n_fds; i++) { + struct stat st; + + if (fds[i] < 0) { + r = -EBADF; + goto fail; + } + + if (fstat(fds[i], &st) < 0) { + r = -errno; + goto fail; + } + + r = stat_verify_regular(&st); + if (r < 0) + goto fail; + + r = add_any_file(j, fds[i], NULL); + if (r < 0) + goto fail; + } + + j->no_new_files = true; + j->no_inotify = true; + + *ret = TAKE_PTR(j); + return 0; + +fail: + /* If we fail, make sure we don't take possession of the files we managed to make use of successfully, and they + * remain open */ + ORDERED_HASHMAP_FOREACH(f, j->files) + f->close_fd = false; + + return r; +} + +_public_ void sd_journal_close(sd_journal *j) { + Directory *d; + + if (!j) + return; + + sd_journal_flush_matches(j); + + ordered_hashmap_free_with_destructor(j->files, journal_file_close); + iterated_cache_free(j->files_cache); + + while ((d = hashmap_first(j->directories_by_path))) + remove_directory(j, d); + + while ((d = hashmap_first(j->directories_by_wd))) + remove_directory(j, d); + + hashmap_free(j->directories_by_path); + hashmap_free(j->directories_by_wd); + + safe_close(j->inotify_fd); + + if (j->mmap) { + log_debug("mmap cache statistics: %u hit, %u miss", mmap_cache_get_hit(j->mmap), mmap_cache_get_missed(j->mmap)); + mmap_cache_unref(j->mmap); + } + + hashmap_free_free(j->errors); + + free(j->path); + free(j->prefix); + free(j->namespace); + free(j->unique_field); + free(j->fields_buffer); + free(j); +} + +_public_ int sd_journal_get_realtime_usec(sd_journal *j, uint64_t *ret) { + Object *o; + JournalFile *f; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(ret, -EINVAL); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + *ret = le64toh(o->entry.realtime); + return 0; +} + +_public_ int sd_journal_get_monotonic_usec(sd_journal *j, uint64_t *ret, sd_id128_t *ret_boot_id) { + Object *o; + JournalFile *f; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + if (ret_boot_id) + *ret_boot_id = o->entry.boot_id; + else { + sd_id128_t id; + + r = sd_id128_get_boot(&id); + if (r < 0) + return r; + + if (!sd_id128_equal(id, o->entry.boot_id)) + return -ESTALE; + } + + if (ret) + *ret = le64toh(o->entry.monotonic); + + return 0; +} + +static bool field_is_valid(const char *field) { + const char *p; + + assert(field); + + if (isempty(field)) + return false; + + if (startswith(field, "__")) + return false; + + for (p = field; *p; p++) { + + if (*p == '_') + continue; + + if (*p >= 'A' && *p <= 'Z') + continue; + + if (*p >= '0' && *p <= '9') + continue; + + return false; + } + + return true; +} + +_public_ int sd_journal_get_data(sd_journal *j, const char *field, const void **data, size_t *size) { + JournalFile *f; + uint64_t i, n; + size_t field_length; + int r; + Object *o; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(field, -EINVAL); + assert_return(data, -EINVAL); + assert_return(size, -EINVAL); + assert_return(field_is_valid(field), -EINVAL); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + field_length = strlen(field); + + n = journal_file_entry_n_items(o); + for (i = 0; i < n; i++) { + uint64_t p, l; + le64_t le_hash; + size_t t; + int compression; + + p = le64toh(o->entry.items[i].object_offset); + le_hash = o->entry.items[i].hash; + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + if (le_hash != o->data.hash) + return -EBADMSG; + + l = le64toh(o->object.size) - offsetof(Object, data.payload); + + compression = o->object.flags & OBJECT_COMPRESSION_MASK; + if (compression) { +#if HAVE_COMPRESSION + r = decompress_startswith(compression, + o->data.payload, l, + &f->compress_buffer, &f->compress_buffer_size, + field, field_length, '='); + if (r < 0) + log_debug_errno(r, "Cannot decompress %s object of length %"PRIu64" at offset "OFSfmt": %m", + object_compressed_to_string(compression), l, p); + else if (r > 0) { + + size_t rsize; + + r = decompress_blob(compression, + o->data.payload, l, + &f->compress_buffer, &f->compress_buffer_size, &rsize, + j->data_threshold); + if (r < 0) + return r; + + *data = f->compress_buffer; + *size = (size_t) rsize; + + return 0; + } +#else + return -EPROTONOSUPPORT; +#endif + } else if (l >= field_length+1 && + memcmp(o->data.payload, field, field_length) == 0 && + o->data.payload[field_length] == '=') { + + t = (size_t) l; + + if ((uint64_t) t != l) + return -E2BIG; + + *data = o->data.payload; + *size = t; + + return 0; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + } + + return -ENOENT; +} + +static int return_data(sd_journal *j, JournalFile *f, Object *o, const void **data, size_t *size) { + size_t t; + uint64_t l; + int compression; + + l = le64toh(READ_NOW(o->object.size)); + if (l < offsetof(Object, data.payload)) + return -EBADMSG; + l -= offsetof(Object, data.payload); + t = (size_t) l; + + /* We can't read objects larger than 4G on a 32bit machine */ + if ((uint64_t) t != l) + return -E2BIG; + + compression = o->object.flags & OBJECT_COMPRESSION_MASK; + if (compression) { +#if HAVE_COMPRESSION + size_t rsize; + int r; + + r = decompress_blob(compression, + o->data.payload, l, &f->compress_buffer, + &f->compress_buffer_size, &rsize, j->data_threshold); + if (r < 0) + return r; + + *data = f->compress_buffer; + *size = (size_t) rsize; +#else + return -EPROTONOSUPPORT; +#endif + } else { + *data = o->data.payload; + *size = t; + } + + return 0; +} + +_public_ int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t *size) { + JournalFile *f; + uint64_t p, n; + le64_t le_hash; + int r; + Object *o; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(data, -EINVAL); + assert_return(size, -EINVAL); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + n = journal_file_entry_n_items(o); + if (j->current_field >= n) + return 0; + + p = le64toh(o->entry.items[j->current_field].object_offset); + le_hash = o->entry.items[j->current_field].hash; + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + if (le_hash != o->data.hash) + return -EBADMSG; + + r = return_data(j, f, o, data, size); + if (r < 0) + return r; + + j->current_field++; + + return 1; +} + +_public_ int sd_journal_enumerate_available_data(sd_journal *j, const void **data, size_t *size) { + for (;;) { + int r; + + r = sd_journal_enumerate_data(j, data, size); + if (r >= 0) + return r; + if (!JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(r)) + return r; + j->current_field++; /* Try with the next field */ + } +} + +_public_ void sd_journal_restart_data(sd_journal *j) { + if (!j) + return; + + j->current_field = 0; +} + +static int reiterate_all_paths(sd_journal *j) { + assert(j); + + if (j->no_new_files) + return add_current_paths(j); + + if (j->flags & SD_JOURNAL_OS_ROOT) + return add_search_paths(j); + + if (j->toplevel_fd >= 0) + return add_root_directory(j, NULL, false); + + if (j->path) + return add_root_directory(j, j->path, true); + + return add_search_paths(j); +} + +_public_ int sd_journal_get_fd(sd_journal *j) { + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + if (j->no_inotify) + return -EMEDIUMTYPE; + + if (j->inotify_fd >= 0) + return j->inotify_fd; + + r = allocate_inotify(j); + if (r < 0) + return r; + + log_debug("Reiterating files to get inotify watches established."); + + /* Iterate through all dirs again, to add them to the inotify */ + r = reiterate_all_paths(j); + if (r < 0) + return r; + + return j->inotify_fd; +} + +_public_ int sd_journal_get_events(sd_journal *j) { + int fd; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + fd = sd_journal_get_fd(j); + if (fd < 0) + return fd; + + return POLLIN; +} + +_public_ int sd_journal_get_timeout(sd_journal *j, uint64_t *timeout_usec) { + int fd; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(timeout_usec, -EINVAL); + + fd = sd_journal_get_fd(j); + if (fd < 0) + return fd; + + if (!j->on_network) { + *timeout_usec = (uint64_t) -1; + return 0; + } + + /* If we are on the network we need to regularly check for + * changes manually */ + + *timeout_usec = j->last_process_usec + JOURNAL_FILES_RECHECK_USEC; + return 1; +} + +static void process_q_overflow(sd_journal *j) { + JournalFile *f; + Directory *m; + + assert(j); + + /* When the inotify queue overruns we need to enumerate and re-validate all journal files to bring our list + * back in sync with what's on disk. For this we pick a new generation counter value. It'll be assigned to all + * journal files we encounter. All journal files and all directories that don't carry it after reenumeration + * are subject for unloading. */ + + log_debug("Inotify queue overrun, reiterating everything."); + + j->generation++; + (void) reiterate_all_paths(j); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + + if (f->last_seen_generation == j->generation) + continue; + + log_debug("File '%s' hasn't been seen in this enumeration, removing.", f->path); + remove_file_real(j, f); + } + + HASHMAP_FOREACH(m, j->directories_by_path) { + + if (m->last_seen_generation == j->generation) + continue; + + if (m->is_root) /* Never GC root directories */ + continue; + + log_debug("Directory '%s' hasn't been seen in this enumeration, removing.", f->path); + remove_directory(j, m); + } + + log_debug("Reiteration complete."); +} + +static void process_inotify_event(sd_journal *j, const struct inotify_event *e) { + Directory *d; + + assert(j); + assert(e); + + if (e->mask & IN_Q_OVERFLOW) { + process_q_overflow(j); + return; + } + + /* Is this a subdirectory we watch? */ + d = hashmap_get(j->directories_by_wd, INT_TO_PTR(e->wd)); + if (d) { + if (!(e->mask & IN_ISDIR) && e->len > 0 && + (endswith(e->name, ".journal") || + endswith(e->name, ".journal~"))) { + + /* Event for a journal file */ + + if (e->mask & (IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB)) + (void) add_file_by_name(j, d->path, e->name); + else if (e->mask & (IN_DELETE|IN_MOVED_FROM|IN_UNMOUNT)) + remove_file_by_name(j, d->path, e->name); + + } else if (!d->is_root && e->len == 0) { + + /* Event for a subdirectory */ + + if (e->mask & (IN_DELETE_SELF|IN_MOVE_SELF|IN_UNMOUNT)) + remove_directory(j, d); + + } else if (d->is_root && (e->mask & IN_ISDIR) && e->len > 0 && id128_is_valid(e->name)) { + + /* Event for root directory */ + + if (e->mask & (IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB)) + (void) add_directory(j, d->path, e->name); + } + + return; + } + + if (e->mask & IN_IGNORED) + return; + + log_debug("Unexpected inotify event."); +} + +static int determine_change(sd_journal *j) { + bool b; + + assert(j); + + b = j->current_invalidate_counter != j->last_invalidate_counter; + j->last_invalidate_counter = j->current_invalidate_counter; + + return b ? SD_JOURNAL_INVALIDATE : SD_JOURNAL_APPEND; +} + +_public_ int sd_journal_process(sd_journal *j) { + bool got_something = false; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + if (j->inotify_fd < 0) /* We have no inotify fd yet? Then there's noting to process. */ + return 0; + + j->last_process_usec = now(CLOCK_MONOTONIC); + j->last_invalidate_counter = j->current_invalidate_counter; + + for (;;) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + + l = read(j->inotify_fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (IN_SET(errno, EAGAIN, EINTR)) + return got_something ? determine_change(j) : SD_JOURNAL_NOP; + + return -errno; + } + + got_something = true; + + FOREACH_INOTIFY_EVENT(e, buffer, l) + process_inotify_event(j, e); + } +} + +_public_ int sd_journal_wait(sd_journal *j, uint64_t timeout_usec) { + int r; + uint64_t t; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + if (j->inotify_fd < 0) { + JournalFile *f; + + /* This is the first invocation, hence create the + * inotify watch */ + r = sd_journal_get_fd(j); + if (r < 0) + return r; + + /* Server might have done some vacuuming while we weren't watching. + Get rid of the deleted files now so they don't stay around indefinitely. */ + ORDERED_HASHMAP_FOREACH(f, j->files) { + r = journal_file_fstat(f); + if (r == -EIDRM) + remove_file_real(j, f); + else if (r < 0) { + log_debug_errno(r,"Failed to fstat() journal file '%s' : %m", f->path); + continue; + } + } + + /* The journal might have changed since the context + * object was created and we weren't watching before, + * hence don't wait for anything, and return + * immediately. */ + return determine_change(j); + } + + r = sd_journal_get_timeout(j, &t); + if (r < 0) + return r; + + if (t != (uint64_t) -1) { + usec_t n; + + n = now(CLOCK_MONOTONIC); + t = t > n ? t - n : 0; + + if (timeout_usec == (uint64_t) -1 || timeout_usec > t) + timeout_usec = t; + } + + do { + r = fd_wait_for_event(j->inotify_fd, POLLIN, timeout_usec); + } while (r == -EINTR); + + if (r < 0) + return r; + + return sd_journal_process(j); +} + +_public_ int sd_journal_get_cutoff_realtime_usec(sd_journal *j, uint64_t *from, uint64_t *to) { + JournalFile *f; + bool first = true; + uint64_t fmin = 0, tmax = 0; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(from || to, -EINVAL); + assert_return(from != to, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + usec_t fr, t; + + r = journal_file_get_cutoff_realtime_usec(f, &fr, &t); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + if (r == 0) + continue; + + if (first) { + fmin = fr; + tmax = t; + first = false; + } else { + fmin = MIN(fr, fmin); + tmax = MAX(t, tmax); + } + } + + if (from) + *from = fmin; + if (to) + *to = tmax; + + return first ? 0 : 1; +} + +_public_ int sd_journal_get_cutoff_monotonic_usec(sd_journal *j, sd_id128_t boot_id, uint64_t *from, uint64_t *to) { + JournalFile *f; + bool found = false; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(from || to, -EINVAL); + assert_return(from != to, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + usec_t fr, t; + + r = journal_file_get_cutoff_monotonic_usec(f, boot_id, &fr, &t); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + if (r == 0) + continue; + + if (found) { + if (from) + *from = MIN(fr, *from); + if (to) + *to = MAX(t, *to); + } else { + if (from) + *from = fr; + if (to) + *to = t; + found = true; + } + } + + return found; +} + +void journal_print_header(sd_journal *j) { + JournalFile *f; + bool newline = false; + + assert(j); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + if (newline) + putchar('\n'); + else + newline = true; + + journal_file_print_header(f); + } +} + +_public_ int sd_journal_get_usage(sd_journal *j, uint64_t *bytes) { + JournalFile *f; + uint64_t sum = 0; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(bytes, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + struct stat st; + + if (fstat(f->fd, &st) < 0) + return -errno; + + sum += (uint64_t) st.st_blocks * 512ULL; + } + + *bytes = sum; + return 0; +} + +_public_ int sd_journal_query_unique(sd_journal *j, const char *field) { + char *f; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(!isempty(field), -EINVAL); + assert_return(field_is_valid(field), -EINVAL); + + f = strdup(field); + if (!f) + return -ENOMEM; + + free(j->unique_field); + j->unique_field = f; + j->unique_file = NULL; + j->unique_offset = 0; + j->unique_file_lost = false; + + return 0; +} + +_public_ int sd_journal_enumerate_unique(sd_journal *j, const void **data, size_t *l) { + size_t k; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(data, -EINVAL); + assert_return(l, -EINVAL); + assert_return(j->unique_field, -EINVAL); + + k = strlen(j->unique_field); + + if (!j->unique_file) { + if (j->unique_file_lost) + return 0; + + j->unique_file = ordered_hashmap_first(j->files); + if (!j->unique_file) + return 0; + + j->unique_offset = 0; + } + + for (;;) { + JournalFile *of; + Object *o; + const void *odata; + size_t ol; + bool found; + int r; + + /* Proceed to next data object in the field's linked list */ + if (j->unique_offset == 0) { + r = journal_file_find_field_object(j->unique_file, j->unique_field, k, &o, NULL); + if (r < 0) + return r; + + j->unique_offset = r > 0 ? le64toh(o->field.head_data_offset) : 0; + } else { + r = journal_file_move_to_object(j->unique_file, OBJECT_DATA, j->unique_offset, &o); + if (r < 0) + return r; + + j->unique_offset = le64toh(o->data.next_field_offset); + } + + /* We reached the end of the list? Then start again, with the next file */ + if (j->unique_offset == 0) { + j->unique_file = ordered_hashmap_next(j->files, j->unique_file->path); + if (!j->unique_file) + return 0; + + continue; + } + + /* We do not use OBJECT_DATA context here, but OBJECT_UNUSED + * instead, so that we can look at this data object at the same + * time as one on another file */ + r = journal_file_move_to_object(j->unique_file, OBJECT_UNUSED, j->unique_offset, &o); + if (r < 0) + return r; + + /* Let's do the type check by hand, since we used 0 context above. */ + if (o->object.type != OBJECT_DATA) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object has type %d, expected %d", + j->unique_file->path, + j->unique_offset, + o->object.type, OBJECT_DATA); + + r = return_data(j, j->unique_file, o, &odata, &ol); + if (r < 0) + return r; + + /* Check if we have at least the field name and "=". */ + if (ol <= k) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object has size %zu, expected at least %zu", + j->unique_file->path, + j->unique_offset, ol, k + 1); + + if (memcmp(odata, j->unique_field, k) || ((const char*) odata)[k] != '=') + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object does not start with \"%s=\"", + j->unique_file->path, + j->unique_offset, + j->unique_field); + + /* OK, now let's see if we already returned this data + * object by checking if it exists in the earlier + * traversed files. */ + found = false; + ORDERED_HASHMAP_FOREACH(of, j->files) { + if (of == j->unique_file) + break; + + /* Skip this file it didn't have any fields indexed */ + if (JOURNAL_HEADER_CONTAINS(of->header, n_fields) && le64toh(of->header->n_fields) <= 0) + continue; + + r = journal_file_find_data_object_with_hash(of, odata, ol, le64toh(o->data.hash), NULL, NULL); + if (r < 0) + return r; + if (r > 0) { + found = true; + break; + } + } + + if (found) + continue; + + r = return_data(j, j->unique_file, o, data, l); + if (r < 0) + return r; + + return 1; + } +} + +_public_ int sd_journal_enumerate_available_unique(sd_journal *j, const void **data, size_t *size) { + for (;;) { + int r; + + r = sd_journal_enumerate_unique(j, data, size); + if (r >= 0) + return r; + if (!JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(r)) + return r; + /* Try with the next field. sd_journal_enumerate_unique() modifies state, so on the next try + * we will access the next field. */ + } +} + +_public_ void sd_journal_restart_unique(sd_journal *j) { + if (!j) + return; + + j->unique_file = NULL; + j->unique_offset = 0; + j->unique_file_lost = false; +} + +_public_ int sd_journal_enumerate_fields(sd_journal *j, const char **field) { + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(field, -EINVAL); + + if (!j->fields_file) { + if (j->fields_file_lost) + return 0; + + j->fields_file = ordered_hashmap_first(j->files); + if (!j->fields_file) + return 0; + + j->fields_hash_table_index = 0; + j->fields_offset = 0; + } + + for (;;) { + JournalFile *f, *of; + uint64_t m; + Object *o; + size_t sz; + bool found; + + f = j->fields_file; + + if (j->fields_offset == 0) { + bool eof = false; + + /* We are not yet positioned at any field. Let's pick the first one */ + r = journal_file_map_field_hash_table(f); + if (r < 0) + return r; + + m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem); + for (;;) { + if (j->fields_hash_table_index >= m) { + /* Reached the end of the hash table, go to the next file. */ + eof = true; + break; + } + + j->fields_offset = le64toh(f->field_hash_table[j->fields_hash_table_index].head_hash_offset); + + if (j->fields_offset != 0) + break; + + /* Empty hash table bucket, go to next one */ + j->fields_hash_table_index++; + } + + if (eof) { + /* Proceed with next file */ + j->fields_file = ordered_hashmap_next(j->files, f->path); + if (!j->fields_file) { + *field = NULL; + return 0; + } + + j->fields_offset = 0; + j->fields_hash_table_index = 0; + continue; + } + + } else { + /* We are already positioned at a field. If so, let's figure out the next field from it */ + + r = journal_file_move_to_object(f, OBJECT_FIELD, j->fields_offset, &o); + if (r < 0) + return r; + + j->fields_offset = le64toh(o->field.next_hash_offset); + if (j->fields_offset == 0) { + /* Reached the end of the hash table chain */ + j->fields_hash_table_index++; + continue; + } + } + + /* We use OBJECT_UNUSED here, so that the iterator below doesn't remove our mmap window */ + r = journal_file_move_to_object(f, OBJECT_UNUSED, j->fields_offset, &o); + if (r < 0) + return r; + + /* Because we used OBJECT_UNUSED above, we need to do our type check manually */ + if (o->object.type != OBJECT_FIELD) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object has type %i, expected %i", + f->path, j->fields_offset, + o->object.type, OBJECT_FIELD); + + sz = le64toh(o->object.size) - offsetof(Object, field.payload); + + /* Let's see if we already returned this field name before. */ + found = false; + ORDERED_HASHMAP_FOREACH(of, j->files) { + if (of == f) + break; + + /* Skip this file it didn't have any fields indexed */ + if (JOURNAL_HEADER_CONTAINS(of->header, n_fields) && le64toh(of->header->n_fields) <= 0) + continue; + + r = journal_file_find_field_object_with_hash(of, o->field.payload, sz, le64toh(o->field.hash), NULL, NULL); + if (r < 0) + return r; + if (r > 0) { + found = true; + break; + } + } + + if (found) + continue; + + /* Check if this is really a valid string containing no NUL byte */ + if (memchr(o->field.payload, 0, sz)) + return -EBADMSG; + + if (sz > j->data_threshold) + sz = j->data_threshold; + + if (!GREEDY_REALLOC(j->fields_buffer, j->fields_buffer_allocated, sz + 1)) + return -ENOMEM; + + memcpy(j->fields_buffer, o->field.payload, sz); + j->fields_buffer[sz] = 0; + + if (!field_is_valid(j->fields_buffer)) + return -EBADMSG; + + *field = j->fields_buffer; + return 1; + } +} + +_public_ void sd_journal_restart_fields(sd_journal *j) { + if (!j) + return; + + j->fields_file = NULL; + j->fields_hash_table_index = 0; + j->fields_offset = 0; + j->fields_file_lost = false; +} + +_public_ int sd_journal_reliable_fd(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + return !j->on_network; +} + +static char *lookup_field(const char *field, void *userdata) { + sd_journal *j = userdata; + const void *data; + size_t size, d; + int r; + + assert(field); + assert(j); + + r = sd_journal_get_data(j, field, &data, &size); + if (r < 0 || + size > REPLACE_VAR_MAX) + return strdup(field); + + d = strlen(field) + 1; + + return strndup((const char*) data + d, size - d); +} + +_public_ int sd_journal_get_catalog(sd_journal *j, char **ret) { + const void *data; + size_t size; + sd_id128_t id; + _cleanup_free_ char *text = NULL, *cid = NULL; + char *t; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(ret, -EINVAL); + + r = sd_journal_get_data(j, "MESSAGE_ID", &data, &size); + if (r < 0) + return r; + + cid = strndup((const char*) data + 11, size - 11); + if (!cid) + return -ENOMEM; + + r = sd_id128_from_string(cid, &id); + if (r < 0) + return r; + + r = catalog_get(CATALOG_DATABASE, id, &text); + if (r < 0) + return r; + + t = replace_var(text, lookup_field, j); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +_public_ int sd_journal_get_catalog_for_message_id(sd_id128_t id, char **ret) { + assert_return(ret, -EINVAL); + + return catalog_get(CATALOG_DATABASE, id, ret); +} + +_public_ int sd_journal_set_data_threshold(sd_journal *j, size_t sz) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + j->data_threshold = sz; + return 0; +} + +_public_ int sd_journal_get_data_threshold(sd_journal *j, size_t *sz) { + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + assert_return(sz, -EINVAL); + + *sz = j->data_threshold; + return 0; +} + +_public_ int sd_journal_has_runtime_files(sd_journal *j) { + assert_return(j, -EINVAL); + + return j->has_runtime_files; +} + +_public_ int sd_journal_has_persistent_files(sd_journal *j) { + assert_return(j, -EINVAL); + + return j->has_persistent_files; +} diff --git a/src/journal/test-audit-type.c b/src/journal/test-audit-type.c new file mode 100644 index 0000000..5adbf0d --- /dev/null +++ b/src/journal/test-audit-type.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> +#include <linux/audit.h> + +#include "audit-type.h" + +static void print_audit_label(int i) { + const char *name; + + name = audit_type_name_alloca(i); + /* This is a separate function only because of alloca */ + printf("%i → %s → %s\n", i, audit_type_to_string(i), name); +} + +static void test_audit_type(void) { + int i; + + for (i = 0; i <= AUDIT_KERNEL; i++) + print_audit_label(i); +} + +int main(int argc, char **argv) { + test_audit_type(); + return 0; +} diff --git a/src/journal/test-catalog.c b/src/journal/test-catalog.c new file mode 100644 index 0000000..982fec0 --- /dev/null +++ b/src/journal/test-catalog.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <locale.h> +#include <unistd.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "util.h" + +static char** catalog_dirs = NULL; +static const char *no_catalog_dirs[] = { + "/bin/hopefully/with/no/catalog", + NULL +}; + +static OrderedHashmap* test_import(const char* contents, ssize_t size, int code) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-catalog.XXXXXX"; + _cleanup_close_ int fd; + OrderedHashmap *h; + + if (size < 0) + size = strlen(contents); + + assert_se(h = ordered_hashmap_new(&catalog_hash_ops)); + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(write(fd, contents, size) == size); + + assert_se(catalog_import_file(h, name) == code); + + return h; +} + +static void test_catalog_import_invalid(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + + h = test_import("xxx", -1, -EINVAL); + assert_se(ordered_hashmap_isempty(h)); +} + +static void test_catalog_import_badid(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededede\n" \ +"Subject: message\n" \ +"\n" \ +"payload\n"; + h = test_import(input, -1, -EINVAL); +} + +static void test_catalog_import_one(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"\n" \ +"payload\n"; + const char *expect = +"Subject: message\n" \ +"\n" \ +"payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) { + printf("expect: %s\n", expect); + printf("actual: %s\n", payload); + assert_se(streq(expect, payload)); + } +} + +static void test_catalog_import_merge(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n" \ +"\n" \ +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"\n" \ +"override payload\n"; + + const char *combined = +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"override payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) + assert_se(streq(combined, payload)); +} + +static void test_catalog_import_merge_no_body(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n" \ +"\n" \ +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"\n"; + + const char *combined = +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) + assert_se(streq(combined, payload)); +} + +static void test_catalog_update(const char *database) { + int r; + + /* Test what happens if there are no files. */ + r = catalog_update(database, NULL, NULL); + assert_se(r == 0); + + /* Test what happens if there are no files in the directory. */ + r = catalog_update(database, NULL, no_catalog_dirs); + assert_se(r == 0); + + /* Make sure that we at least have some files loaded or the + * catalog_list below will fail. */ + r = catalog_update(database, NULL, (const char * const *) catalog_dirs); + assert_se(r == 0); +} + +static void test_catalog_file_lang(void) { + _cleanup_free_ char *lang = NULL, *lang2 = NULL, *lang3 = NULL, *lang4 = NULL; + + assert_se(catalog_file_lang("systemd.de_DE.catalog", &lang) == 1); + assert_se(streq(lang, "de_DE")); + + assert_se(catalog_file_lang("systemd..catalog", &lang2) == 0); + assert_se(lang2 == NULL); + + assert_se(catalog_file_lang("systemd.fr.catalog", &lang2) == 1); + assert_se(streq(lang2, "fr")); + + assert_se(catalog_file_lang("systemd.fr.catalog.gz", &lang3) == 0); + assert_se(lang3 == NULL); + + assert_se(catalog_file_lang("systemd.01234567890123456789012345678901.catalog", &lang3) == 0); + assert_se(lang3 == NULL); + + assert_se(catalog_file_lang("systemd.0123456789012345678901234567890.catalog", &lang3) == 1); + assert_se(streq(lang3, "0123456789012345678901234567890")); + + assert_se(catalog_file_lang("/x/y/systemd.catalog", &lang4) == 0); + assert_se(lang4 == NULL); + + assert_se(catalog_file_lang("/x/y/systemd.ru_RU.catalog", &lang4) == 1); + assert_se(streq(lang4, "ru_RU")); +} + +int main(int argc, char *argv[]) { + _cleanup_(unlink_tempfilep) char database[] = "/tmp/test-catalog.XXXXXX"; + _cleanup_free_ char *text = NULL; + int r; + + setlocale(LC_ALL, "de_DE.UTF-8"); + + test_setup_logging(LOG_DEBUG); + + /* If test-catalog is located at the build directory, then use catalogs in that. + * If it is not, e.g. installed by systemd-tests package, then use installed catalogs. */ + catalog_dirs = STRV_MAKE(get_catalog_dir()); + + assert_se(access(catalog_dirs[0], F_OK) >= 0); + log_notice("Using catalog directory '%s'", catalog_dirs[0]); + + test_catalog_file_lang(); + + test_catalog_import_invalid(); + test_catalog_import_badid(); + test_catalog_import_one(); + test_catalog_import_merge(); + test_catalog_import_merge_no_body(); + + assert_se(mkostemp_safe(database) >= 0); + + test_catalog_update(database); + + r = catalog_list(stdout, database, true); + assert_se(r >= 0); + + r = catalog_list(stdout, database, false); + assert_se(r >= 0); + + assert_se(catalog_get(database, SD_MESSAGE_COREDUMP, &text) >= 0); + printf(">>>%s<<<\n", text); + + return 0; +} diff --git a/src/journal/test-compress-benchmark.c b/src/journal/test-compress-benchmark.c new file mode 100644 index 0000000..0019760 --- /dev/null +++ b/src/journal/test-compress-benchmark.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "compress.h" +#include "env-util.h" +#include "macro.h" +#include "memory-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "random-util.h" +#include "string-util.h" +#include "tests.h" + +typedef int (compress_t)(const void *src, uint64_t src_size, void *dst, + size_t dst_alloc_size, size_t *dst_size); +typedef int (decompress_t)(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, size_t* dst_size, size_t dst_max); + +#if HAVE_COMPRESSION + +static usec_t arg_duration; +static size_t arg_start; + +#define MAX_SIZE (1024*1024LU) +#define PRIME 1048571 /* A prime close enough to one megabyte that mod 4 == 3 */ + +static size_t _permute(size_t x) { + size_t residue; + + if (x >= PRIME) + return x; + + residue = x*x % PRIME; + if (x <= PRIME / 2) + return residue; + else + return PRIME - residue; +} + +static size_t permute(size_t x) { + return _permute((_permute(x) + arg_start) % MAX_SIZE ^ 0xFF345); +} + +static char* make_buf(size_t count, const char *type) { + char *buf; + size_t i; + + buf = malloc(count); + assert_se(buf); + + if (streq(type, "zeros")) + memzero(buf, count); + else if (streq(type, "simple")) + for (i = 0; i < count; i++) + buf[i] = 'a' + i % ('z' - 'a' + 1); + else if (streq(type, "random")) { + size_t step = count / 10; + + random_bytes(buf, step); + memzero(buf + 1*step, step); + random_bytes(buf + 2*step, step); + memzero(buf + 3*step, step); + random_bytes(buf + 4*step, step); + memzero(buf + 5*step, step); + random_bytes(buf + 6*step, step); + memzero(buf + 7*step, step); + random_bytes(buf + 8*step, step); + memzero(buf + 9*step, step); + } else + assert_not_reached("here"); + + return buf; +} + +static void test_compress_decompress(const char* label, const char* type, + compress_t compress, decompress_t decompress) { + usec_t n, n2 = 0; + float dt; + + _cleanup_free_ char *text, *buf; + _cleanup_free_ void *buf2 = NULL; + size_t buf2_allocated = 0; + size_t skipped = 0, compressed = 0, total = 0; + + text = make_buf(MAX_SIZE, type); + buf = calloc(MAX_SIZE + 1, 1); + assert_se(text && buf); + + n = now(CLOCK_MONOTONIC); + + for (size_t i = 0; i <= MAX_SIZE; i++) { + size_t j = 0, k = 0, size; + int r; + + size = permute(i); + if (size == 0) + continue; + + log_debug("%s %zu %zu", type, i, size); + + memzero(buf, MIN(size + 1000, MAX_SIZE)); + + r = compress(text, size, buf, size, &j); + /* assume compression must be successful except for small or random inputs */ + assert_se(r == 0 || (size < 2048 && r == -ENOBUFS) || streq(type, "random")); + + /* check for overwrites */ + assert_se(buf[size] == 0); + if (r != 0) { + skipped += size; + continue; + } + + assert_se(j > 0); + if (j >= size) + log_error("%s \"compressed\" %zu -> %zu", label, size, j); + + r = decompress(buf, j, &buf2, &buf2_allocated, &k, 0); + assert_se(r == 0); + assert_se(buf2_allocated >= k); + assert_se(k == size); + + assert_se(memcmp(text, buf2, size) == 0); + + total += size; + compressed += j; + + n2 = now(CLOCK_MONOTONIC); + if (n2 - n > arg_duration) + break; + } + + dt = (n2-n) / 1e6; + + log_info("%s/%s: compressed & decompressed %zu bytes in %.2fs (%.2fMiB/s), " + "mean compression %.2f%%, skipped %zu bytes", + label, type, total, dt, + total / 1024. / 1024 / dt, + 100 - compressed * 100. / total, + skipped); +} +#endif + +int main(int argc, char *argv[]) { +#if HAVE_COMPRESSION + test_setup_logging(LOG_INFO); + + if (argc >= 2) { + unsigned x; + + assert_se(safe_atou(argv[1], &x) >= 0); + arg_duration = x * USEC_PER_SEC; + } else + arg_duration = slow_tests_enabled() ? + 2 * USEC_PER_SEC : USEC_PER_SEC / 50; + + if (argc == 3) + (void) safe_atozu(argv[2], &arg_start); + else + arg_start = getpid_cached(); + + const char *i; + NULSTR_FOREACH(i, "zeros\0simple\0random\0") { +#if HAVE_XZ + test_compress_decompress("XZ", i, compress_blob_xz, decompress_blob_xz); +#endif +#if HAVE_LZ4 + test_compress_decompress("LZ4", i, compress_blob_lz4, decompress_blob_lz4); +#endif +#if HAVE_ZSTD + test_compress_decompress("ZSTD", i, compress_blob_zstd, decompress_blob_zstd); +#endif + } + return 0; +#else + return log_tests_skipped("No compression feature is enabled"); +#endif +} diff --git a/src/journal/test-compress.c b/src/journal/test-compress.c new file mode 100644 index 0000000..ccd4605 --- /dev/null +++ b/src/journal/test-compress.c @@ -0,0 +1,372 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/stat.h> + +#if HAVE_LZ4 +#include <lz4.h> +#endif + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "memory-util.h" +#include "path-util.h" +#include "random-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +#if HAVE_XZ +# define XZ_OK 0 +#else +# define XZ_OK -EPROTONOSUPPORT +#endif + +#if HAVE_LZ4 +# define LZ4_OK 0 +#else +# define LZ4_OK -EPROTONOSUPPORT +#endif + +#define HUGE_SIZE (4096*1024) + +typedef int (compress_blob_t)(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +typedef int (decompress_blob_t)(const void *src, uint64_t src_size, + void **dst, size_t *dst_alloc_size, + size_t* dst_size, size_t dst_max); +typedef int (decompress_sw_t)(const void *src, uint64_t src_size, + void **buffer, size_t *buffer_size, + const void *prefix, size_t prefix_len, + uint8_t extra); + +typedef int (compress_stream_t)(int fdf, int fdt, uint64_t max_bytes); +typedef int (decompress_stream_t)(int fdf, int fdt, uint64_t max_size); + +#if HAVE_COMPRESSION +_unused_ static void test_compress_decompress(const char *compression, + compress_blob_t compress, + decompress_blob_t decompress, + const char *data, + size_t data_len, + bool may_fail) { + char compressed[512]; + size_t csize, usize = 0; + _cleanup_free_ char *decompressed = NULL; + int r; + + log_info("/* testing %s %s blob compression/decompression */", + compression, data); + + r = compress(data, data_len, compressed, sizeof(compressed), &csize); + if (r == -ENOBUFS) { + log_info_errno(r, "compression failed: %m"); + assert_se(may_fail); + } else { + assert_se(r == 0); + r = decompress(compressed, csize, + (void **) &decompressed, &usize, &csize, 0); + assert_se(r == 0); + assert_se(decompressed); + assert_se(memcmp(decompressed, data, data_len) == 0); + } + + r = decompress("garbage", 7, + (void **) &decompressed, &usize, &csize, 0); + assert_se(r < 0); + + /* make sure to have the minimal lz4 compressed size */ + r = decompress("00000000\1g", 9, + (void **) &decompressed, &usize, &csize, 0); + assert_se(r < 0); + + r = decompress("\100000000g", 9, + (void **) &decompressed, &usize, &csize, 0); + assert_se(r < 0); + + memzero(decompressed, usize); +} + +_unused_ static void test_decompress_startswith(const char *compression, + compress_blob_t compress, + decompress_sw_t decompress_sw, + const char *data, + size_t data_len, + bool may_fail) { + + char *compressed; + _cleanup_free_ char *compressed1 = NULL, *compressed2 = NULL, *decompressed = NULL; + size_t csize, usize = 0, len; + int r; + + log_info("/* testing decompress_startswith with %s on %.20s text */", + compression, data); + +#define BUFSIZE_1 512 +#define BUFSIZE_2 20000 + + compressed = compressed1 = malloc(BUFSIZE_1); + assert_se(compressed1); + r = compress(data, data_len, compressed, BUFSIZE_1, &csize); + if (r == -ENOBUFS) { + log_info_errno(r, "compression failed: %m"); + assert_se(may_fail); + + compressed = compressed2 = malloc(BUFSIZE_2); + assert_se(compressed2); + r = compress(data, data_len, compressed, BUFSIZE_2, &csize); + assert(r == 0); + } + assert_se(r == 0); + + len = strlen(data); + + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, data, len, '\0'); + assert_se(r > 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, data, len, 'w'); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, "barbarbar", 9, ' '); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, data, len - 1, data[len-1]); + assert_se(r > 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, data, len - 1, 'w'); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, &usize, data, len, '\0'); + assert_se(r > 0); +} + +_unused_ static void test_decompress_startswith_short(const char *compression, + compress_blob_t compress, + decompress_sw_t decompress_sw) { + +#define TEXT "HUGE=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + + char buf[1024]; + size_t i, csize; + int r; + + log_info("/* %s with %s */", __func__, compression); + + r = compress(TEXT, sizeof TEXT, buf, sizeof buf, &csize); + assert_se(r == 0); + + for (i = 1; i < strlen(TEXT); i++) { + size_t alloc_size = i; + _cleanup_free_ void *buf2 = NULL; + + assert_se(buf2 = malloc(i)); + + assert_se(decompress_sw(buf, csize, &buf2, &alloc_size, TEXT, i, TEXT[i]) == 1); + assert_se(decompress_sw(buf, csize, &buf2, &alloc_size, TEXT, i, 'y') == 0); + } +} + +_unused_ static void test_compress_stream(const char *compression, + const char *cat, + compress_stream_t compress, + decompress_stream_t decompress, + const char *srcfile) { + + _cleanup_close_ int src = -1, dst = -1, dst2 = -1; + _cleanup_(unlink_tempfilep) char + pattern[] = "/tmp/systemd-test.compressed.XXXXXX", + pattern2[] = "/tmp/systemd-test.compressed.XXXXXX"; + int r; + _cleanup_free_ char *cmd = NULL, *cmd2 = NULL; + struct stat st = {}; + + r = find_executable(cat, NULL); + if (r < 0) { + log_error_errno(r, "Skipping %s, could not find %s binary: %m", __func__, cat); + return; + } + + log_debug("/* testing %s compression */", compression); + + log_debug("/* create source from %s */", srcfile); + + assert_se((src = open(srcfile, O_RDONLY|O_CLOEXEC)) >= 0); + + log_debug("/* test compression */"); + + assert_se((dst = mkostemp_safe(pattern)) >= 0); + + assert_se(compress(src, dst, -1) == 0); + + if (cat) { + assert_se(asprintf(&cmd, "%s %s | diff %s -", cat, pattern, srcfile) > 0); + assert_se(system(cmd) == 0); + } + + log_debug("/* test decompression */"); + + assert_se((dst2 = mkostemp_safe(pattern2)) >= 0); + + assert_se(stat(srcfile, &st) == 0); + + assert_se(lseek(dst, 0, SEEK_SET) == 0); + r = decompress(dst, dst2, st.st_size); + assert_se(r == 0); + + assert_se(asprintf(&cmd2, "diff %s %s", srcfile, pattern2) > 0); + assert_se(system(cmd2) == 0); + + log_debug("/* test faulty decompression */"); + + assert_se(lseek(dst, 1, SEEK_SET) == 1); + r = decompress(dst, dst2, st.st_size); + assert_se(IN_SET(r, 0, -EBADMSG)); + + assert_se(lseek(dst, 0, SEEK_SET) == 0); + assert_se(lseek(dst2, 0, SEEK_SET) == 0); + r = decompress(dst, dst2, st.st_size - 1); + assert_se(r == -EFBIG); +} +#endif + +#if HAVE_LZ4 +static void test_lz4_decompress_partial(void) { + char buf[20000], buf2[100]; + size_t buf_size = sizeof(buf), compressed; + int r; + _cleanup_free_ char *huge = NULL; + + log_debug("/* %s */", __func__); + + assert_se(huge = malloc(HUGE_SIZE)); + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + r = LZ4_compress_default(huge, buf, HUGE_SIZE, buf_size); + assert_se(r >= 0); + compressed = r; + log_info("Compressed %i → %zu", HUGE_SIZE, compressed); + + r = LZ4_decompress_safe(buf, huge, r, HUGE_SIZE); + assert_se(r >= 0); + log_info("Decompressed → %i", r); + + r = LZ4_decompress_safe_partial(buf, huge, + compressed, + 12, HUGE_SIZE); + assert_se(r >= 0); + log_info("Decompressed partial %i/%i → %i", 12, HUGE_SIZE, r); + + for (size_t size = 1; size < sizeof(buf2); size++) { + /* This failed in older lz4s but works in newer ones. */ + r = LZ4_decompress_safe_partial(buf, buf2, compressed, size, size); + log_info("Decompressed partial %zu/%zu → %i (%s)", size, size, r, + r < 0 ? "bad" : "good"); + if (r >= 0 && LZ4_versionNumber() >= 10803) + /* lz4 <= 1.8.2 should fail that test, let's only check for newer ones */ + assert_se(memcmp(buf2, huge, r) == 0); + } +} +#endif + +int main(int argc, char *argv[]) { +#if HAVE_COMPRESSION + _unused_ const char text[] = + "text\0foofoofoofoo AAAA aaaaaaaaa ghost busters barbarbar FFF" + "foofoofoofoo AAAA aaaaaaaaa ghost busters barbarbar FFF"; + + /* The file to test compression on can be specified as the first argument */ + const char *srcfile = argc > 1 ? argv[1] : argv[0]; + + char data[512] = "random\0"; + + _cleanup_free_ char *huge = NULL; + + assert_se(huge = malloc(HUGE_SIZE)); + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + test_setup_logging(LOG_DEBUG); + + random_bytes(data + 7, sizeof(data) - 7); + +#if HAVE_XZ + test_compress_decompress("XZ", compress_blob_xz, decompress_blob_xz, + text, sizeof(text), false); + test_compress_decompress("XZ", compress_blob_xz, decompress_blob_xz, + data, sizeof(data), true); + + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + text, sizeof(text), false); + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + data, sizeof(data), true); + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + huge, HUGE_SIZE, true); + + test_compress_stream("XZ", "xzcat", + compress_stream_xz, decompress_stream_xz, srcfile); + + test_decompress_startswith_short("XZ", compress_blob_xz, decompress_startswith_xz); + +#else + log_info("/* XZ test skipped */"); +#endif + +#if HAVE_LZ4 + test_compress_decompress("LZ4", compress_blob_lz4, decompress_blob_lz4, + text, sizeof(text), false); + test_compress_decompress("LZ4", compress_blob_lz4, decompress_blob_lz4, + data, sizeof(data), true); + + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + text, sizeof(text), false); + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + data, sizeof(data), true); + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + huge, HUGE_SIZE, true); + + test_compress_stream("LZ4", "lz4cat", + compress_stream_lz4, decompress_stream_lz4, srcfile); + + test_lz4_decompress_partial(); + + test_decompress_startswith_short("LZ4", compress_blob_lz4, decompress_startswith_lz4); + +#else + log_info("/* LZ4 test skipped */"); +#endif + +#if HAVE_ZSTD + test_compress_decompress("ZSTD", compress_blob_zstd, decompress_blob_zstd, + text, sizeof(text), false); + test_compress_decompress("ZSTD", compress_blob_zstd, decompress_blob_zstd, + data, sizeof(data), true); + + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + text, sizeof(text), false); + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + data, sizeof(data), true); + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + huge, HUGE_SIZE, true); + + test_compress_stream("ZSTD", "zstdcat", + compress_stream_zstd, decompress_stream_zstd, srcfile); + + test_decompress_startswith_short("ZSTD", compress_blob_zstd, decompress_startswith_zstd); +#else + log_info("/* ZSTD test skipped */"); +#endif + + return 0; +#else + log_info("/* XZ, LZ4 and ZSTD tests skipped */"); + return EXIT_TEST_SKIP; +#endif +} diff --git a/src/journal/test-journal-config.c b/src/journal/test-journal-config.c new file mode 100644 index 0000000..4f29e1b --- /dev/null +++ b/src/journal/test-journal-config.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdbool.h> + +#include "journald-server.h" + +#define _COMPRESS_PARSE_CHECK(str, enab, thresh, varname) \ + do { \ + JournalCompressOptions varname = {true, 111}; \ + config_parse_compress("", "", 0, "", 0, "", 0, str, \ + &varname, NULL); \ + assert_se((enab) == varname.enabled); \ + if (varname.enabled) \ + assert_se((thresh) == varname.threshold_bytes); \ + } while (0) + +#define COMPRESS_PARSE_CHECK(str, enabled, threshold) \ + _COMPRESS_PARSE_CHECK(str, enabled, threshold, conf##__COUNTER__) + +static void test_config_compress(void) { + COMPRESS_PARSE_CHECK("yes", true, 111); + COMPRESS_PARSE_CHECK("no", false, 111); + COMPRESS_PARSE_CHECK("y", true, 111); + COMPRESS_PARSE_CHECK("n", false, 111); + COMPRESS_PARSE_CHECK("true", true, 111); + COMPRESS_PARSE_CHECK("false", false, 111); + COMPRESS_PARSE_CHECK("t", true, 111); + COMPRESS_PARSE_CHECK("f", false, 111); + COMPRESS_PARSE_CHECK("on", true, 111); + COMPRESS_PARSE_CHECK("off", false, 111); + + /* Weird size/bool overlapping case. We preserve backward compatibility instead of assuming these are byte + * counts. */ + COMPRESS_PARSE_CHECK("1", true, 111); + COMPRESS_PARSE_CHECK("0", false, 111); + + /* IEC sizing */ + COMPRESS_PARSE_CHECK("1B", true, 1); + COMPRESS_PARSE_CHECK("1K", true, 1024); + COMPRESS_PARSE_CHECK("1M", true, 1024 * 1024); + COMPRESS_PARSE_CHECK("1G", true, 1024 * 1024 * 1024); + + /* Invalid Case */ + COMPRESS_PARSE_CHECK("-1", true, 111); + COMPRESS_PARSE_CHECK("blah blah", true, 111); + COMPRESS_PARSE_CHECK("", true, (uint64_t)-1); +} + +int main(int argc, char *argv[]) { + test_config_compress(); + + return 0; +} diff --git a/src/journal/test-journal-enum.c b/src/journal/test-journal-enum.c new file mode 100644 index 0000000..03fe8e2 --- /dev/null +++ b/src/journal/test-journal-enum.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "sd-journal.h" + +#include "journal-internal.h" +#include "log.h" +#include "macro.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + unsigned n = 0; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY) >= 0); + + assert_se(sd_journal_add_match(j, "_TRANSPORT=syslog", 0) >= 0); + assert_se(sd_journal_add_match(j, "_UID=0", 0) >= 0); + + SD_JOURNAL_FOREACH_BACKWARDS(j) { + const void *d; + size_t l; + + assert_se(sd_journal_get_data(j, "MESSAGE", &d, &l) >= 0); + + printf("%.*s\n", (int) l, (char*) d); + + n++; + if (n >= 10) + break; + } + + return 0; +} diff --git a/src/journal/test-journal-flush.c b/src/journal/test-journal-flush.c new file mode 100644 index 0000000..dad277d --- /dev/null +++ b/src/journal/test-journal-flush.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "macro.h" +#include "path-util.h" +#include "string-util.h" + +int main(int argc, char *argv[]) { + _cleanup_free_ char *fn = NULL; + char dn[] = "/var/tmp/test-journal-flush.XXXXXX"; + JournalFile *new_journal = NULL; + sd_journal *j = NULL; + unsigned n = 0; + int r; + + assert_se(mkdtemp(dn)); + (void) chattr_path(dn, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + fn = path_join(dn, "test.journal"); + + r = journal_file_open(-1, fn, O_CREAT|O_RDWR, 0644, false, 0, false, NULL, NULL, NULL, NULL, &new_journal); + assert_se(r >= 0); + + r = sd_journal_open(&j, 0); + assert_se(r >= 0); + + sd_journal_set_data_threshold(j, 0); + + SD_JOURNAL_FOREACH(j) { + Object *o; + JournalFile *f; + + f = j->current_file; + assert_se(f && f->current_offset > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + log_error_errno(r, "journal_file_move_to_object failed: %m"); + assert_se(r >= 0); + + r = journal_file_copy_entry(f, new_journal, o, f->current_offset); + if (r < 0) + log_error_errno(r, "journal_file_copy_entry failed: %m"); + assert_se(r >= 0); + + if (++n >= 10000) + break; + } + + sd_journal_close(j); + + (void) journal_file_close(new_journal); + + unlink(fn); + assert_se(rmdir(dn) == 0); + + return 0; +} diff --git a/src/journal/test-journal-init.c b/src/journal/test-journal-init.c new file mode 100644 index 0000000..80aff75 --- /dev/null +++ b/src/journal/test-journal-init.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-journal.h" + +#include "chattr-util.h" +#include "log.h" +#include "parse-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "util.h" + +int main(int argc, char *argv[]) { + sd_journal *j; + int r, i, I = 100; + char t[] = "/var/tmp/journal-stream-XXXXXX"; + + test_setup_logging(LOG_DEBUG); + + if (argc >= 2) { + r = safe_atoi(argv[1], &I); + if (r < 0) + log_info("Could not parse loop count argument. Using default."); + } + + log_info("Running %d loops", I); + + assert_se(mkdtemp(t)); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + for (i = 0; i < I; i++) { + r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); + assert_se(r == 0); + + sd_journal_close(j); + + r = sd_journal_open_directory(&j, t, 0); + assert_se(r == 0); + + sd_journal_close(j); + + j = NULL; + r = sd_journal_open_directory(&j, t, SD_JOURNAL_LOCAL_ONLY); + assert_se(r == -EINVAL); + assert_se(j == NULL); + } + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + return 0; +} diff --git a/src/journal/test-journal-interleaving.c b/src/journal/test-journal-interleaving.c new file mode 100644 index 0000000..8c78c3b --- /dev/null +++ b/src/journal/test-journal-interleaving.c @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "io-util.h" +#include "journal-file.h" +#include "journal-vacuum.h" +#include "log.h" +#include "parse-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "util.h" + +/* This program tests skipping around in a multi-file journal. */ + +static bool arg_keep = false; + +_noreturn_ static void log_assert_errno(const char *text, int error, const char *file, int line, const char *func) { + log_internal(LOG_CRIT, error, file, line, func, + "'%s' failed at %s:%u (%s): %m", text, file, line, func); + abort(); +} + +#define assert_ret(expr) \ + do { \ + int _r_ = (expr); \ + if (_unlikely_(_r_ < 0)) \ + log_assert_errno(#expr, -_r_, PROJECT_FILE, __LINE__, __PRETTY_FUNCTION__); \ + } while (false) + +static JournalFile *test_open(const char *name) { + JournalFile *f; + assert_ret(journal_file_open(-1, name, O_RDWR|O_CREAT, 0644, true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &f)); + return f; +} + +static void test_close(JournalFile *f) { + (void) journal_file_close (f); +} + +static void append_number(JournalFile *f, int n, uint64_t *seqnum) { + char *p; + dual_timestamp ts; + static dual_timestamp previous_ts = {}; + struct iovec iovec[1]; + + dual_timestamp_get(&ts); + + if (ts.monotonic <= previous_ts.monotonic) + ts.monotonic = previous_ts.monotonic + 1; + + if (ts.realtime <= previous_ts.realtime) + ts.realtime = previous_ts.realtime + 1; + + previous_ts = ts; + + assert_se(asprintf(&p, "NUMBER=%d", n) >= 0); + iovec[0] = IOVEC_MAKE_STRING(p); + assert_ret(journal_file_append_entry(f, &ts, NULL, iovec, 1, seqnum, NULL, NULL)); + free(p); +} + +static void test_check_number (sd_journal *j, int n) { + const void *d; + _cleanup_free_ char *k; + size_t l; + int x; + + assert_ret(sd_journal_get_data(j, "NUMBER", &d, &l)); + assert_se(k = strndup(d, l)); + printf("%s\n", k); + + assert_se(safe_atoi(k + 7, &x) >= 0); + assert_se(n == x); +} + +static void test_check_numbers_down (sd_journal *j, int count) { + int i; + + for (i = 1; i <= count; i++) { + int r; + test_check_number(j, i); + assert_ret(r = sd_journal_next(j)); + if (i == count) + assert_se(r == 0); + else + assert_se(r == 1); + } + +} + +static void test_check_numbers_up (sd_journal *j, int count) { + for (int i = count; i >= 1; i--) { + int r; + test_check_number(j, i); + assert_ret(r = sd_journal_previous(j)); + if (i == 1) + assert_se(r == 0); + else + assert_se(r == 1); + } + +} + +static void setup_sequential(void) { + JournalFile *one, *two; + one = test_open("one.journal"); + two = test_open("two.journal"); + append_number(one, 1, NULL); + append_number(one, 2, NULL); + append_number(two, 3, NULL); + append_number(two, 4, NULL); + test_close(one); + test_close(two); +} + +static void setup_interleaved(void) { + JournalFile *one, *two; + one = test_open("one.journal"); + two = test_open("two.journal"); + append_number(one, 1, NULL); + append_number(two, 2, NULL); + append_number(one, 3, NULL); + append_number(two, 4, NULL); + test_close(one); + test_close(two); +} + +static void mkdtemp_chdir_chattr(char *path) { + assert_se(mkdtemp(path)); + assert_se(chdir(path) >= 0); + + /* Speed up things a bit on btrfs, ensuring that CoW is turned off for all files created in our + * directory during the test run */ + (void) chattr_path(path, FS_NOCOW_FL, FS_NOCOW_FL, NULL); +} + +static void test_skip(void (*setup)(void)) { + char t[] = "/var/tmp/journal-skip-XXXXXX"; + sd_journal *j; + int r; + + mkdtemp_chdir_chattr(t); + + setup(); + + /* Seek to head, iterate down. + */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_ret(sd_journal_next(j)); + test_check_numbers_down(j, 4); + sd_journal_close(j); + + /* Seek to tail, iterate up. + */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_ret(sd_journal_previous(j)); + test_check_numbers_up(j, 4); + sd_journal_close(j); + + /* Seek to tail, skip to head, iterate down. + */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_ret(r = sd_journal_previous_skip(j, 4)); + assert_se(r == 4); + test_check_numbers_down(j, 4); + sd_journal_close(j); + + /* Seek to head, skip to tail, iterate up. + */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_ret(r = sd_journal_next_skip(j, 4)); + assert_se(r == 4); + test_check_numbers_up(j, 4); + sd_journal_close(j); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + puts("------------------------------------------------------------"); +} + +static void test_sequence_numbers(void) { + + char t[] = "/var/tmp/journal-seq-XXXXXX"; + JournalFile *one, *two; + uint64_t seqnum = 0; + sd_id128_t seqnum_id; + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "one.journal", O_RDWR|O_CREAT, 0644, + true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &one) == 0); + + append_number(one, 1, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 1); + append_number(one, 2, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 2); + + assert_se(one->header->state == STATE_ONLINE); + assert_se(!sd_id128_equal(one->header->file_id, one->header->machine_id)); + assert_se(!sd_id128_equal(one->header->file_id, one->header->boot_id)); + assert_se(sd_id128_equal(one->header->file_id, one->header->seqnum_id)); + + memcpy(&seqnum_id, &one->header->seqnum_id, sizeof(sd_id128_t)); + + assert_se(journal_file_open(-1, "two.journal", O_RDWR|O_CREAT, 0644, + true, (uint64_t) -1, false, NULL, NULL, NULL, one, &two) == 0); + + assert_se(two->header->state == STATE_ONLINE); + assert_se(!sd_id128_equal(two->header->file_id, one->header->file_id)); + assert_se(sd_id128_equal(one->header->machine_id, one->header->machine_id)); + assert_se(sd_id128_equal(one->header->boot_id, one->header->boot_id)); + assert_se(sd_id128_equal(one->header->seqnum_id, one->header->seqnum_id)); + + append_number(two, 3, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 3); + append_number(two, 4, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 4); + + test_close(two); + + append_number(one, 5, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 5); + + append_number(one, 6, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 6); + + test_close(one); + + /* restart server */ + seqnum = 0; + + assert_se(journal_file_open(-1, "two.journal", O_RDWR, 0, + true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &two) == 0); + + assert_se(sd_id128_equal(two->header->seqnum_id, seqnum_id)); + + append_number(two, 7, &seqnum); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 5); + + /* So..., here we have the same seqnum in two files with the + * same seqnum_id. */ + + test_close(two); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } +} + +int main(int argc, char *argv[]) { + test_setup_logging(LOG_DEBUG); + + /* journal_file_open requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + arg_keep = argc > 1; + + test_skip(setup_sequential); + test_skip(setup_interleaved); + + test_sequence_numbers(); + + return 0; +} diff --git a/src/journal/test-journal-match.c b/src/journal/test-journal-match.c new file mode 100644 index 0000000..ded6756 --- /dev/null +++ b/src/journal/test-journal-match.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "journal-internal.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" +#include "util.h" + +int main(int argc, char *argv[]) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_free_ char *t; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_journal_open(&j, 0) >= 0); + + assert_se(sd_journal_add_match(j, "foobar", 0) < 0); + assert_se(sd_journal_add_match(j, "foobar=waldo", 0) < 0); + assert_se(sd_journal_add_match(j, "", 0) < 0); + assert_se(sd_journal_add_match(j, "=", 0) < 0); + assert_se(sd_journal_add_match(j, "=xxxxx", 0) < 0); + assert_se(sd_journal_add_match(j, (uint8_t[4]){'A', '=', '\1', '\2'}, 4) >= 0); + assert_se(sd_journal_add_match(j, (uint8_t[5]){'B', '=', 'C', '\0', 'D'}, 5) >= 0); + assert_se(sd_journal_add_match(j, "HALLO=WALDO", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=mmmm", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=xxxxx", 0) >= 0); + assert_se(sd_journal_add_match(j, "HALLO=", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=xxxxx", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=yyyyy", 0) >= 0); + assert_se(sd_journal_add_match(j, "PIFF=paff", 0) >= 0); + + assert_se(sd_journal_add_disjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "ONE=one", 0) >= 0); + assert_se(sd_journal_add_match(j, "ONE=two", 0) >= 0); + assert_se(sd_journal_add_match(j, "TWO=two", 0) >= 0); + + assert_se(sd_journal_add_conjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "L4_1=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_1=ok", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_2=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_2=ok", 0) >= 0); + + assert_se(sd_journal_add_disjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "L3=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L3=ok", 0) >= 0); + + assert_se(t = journal_make_match_string(j)); + + printf("resulting match expression is: %s\n", t); + + assert_se(streq(t, "(((L3=ok OR L3=yes) OR ((L4_2=ok OR L4_2=yes) AND (L4_1=ok OR L4_1=yes))) AND ((TWO=two AND (ONE=two OR ONE=one)) OR (PIFF=paff AND (QUUX=yyyyy OR QUUX=xxxxx OR QUUX=mmmm) AND (HALLO= OR HALLO=WALDO) AND B=C\\000D AND A=\\001\\002)))")); + + return 0; +} diff --git a/src/journal/test-journal-send.c b/src/journal/test-journal-send.c new file mode 100644 index 0000000..75bd8e7 --- /dev/null +++ b/src/journal/test-journal-send.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> + +#include "sd-journal.h" +#include "fileio.h" +#include "macro.h" +#include "memory-util.h" + +static void test_journal_print(void) { + assert_se(sd_journal_print(LOG_INFO, "XXX") == 0); + assert_se(sd_journal_print(LOG_INFO, "%s", "YYY") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%4094sY", "ZZZ") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%*sY", LONG_LINE_MAX - 8 - 3, "ZZZ") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%*sY", LONG_LINE_MAX - 8 - 2, "ZZZ") == -ENOBUFS); +} + +static void test_journal_send(void) { + _cleanup_free_ char *huge = NULL; + +#define HUGE_SIZE (4096*1024) + assert_se(huge = malloc(HUGE_SIZE)); + + /* utf-8 and non-utf-8, message-less and message-ful iovecs */ + struct iovec graph1[] = { + {(char*) "GRAPH=graph", STRLEN("GRAPH=graph")} + }; + struct iovec graph2[] = { + {(char*) "GRAPH=graph\n", STRLEN("GRAPH=graph\n")} + }; + struct iovec message1[] = { + {(char*) "MESSAGE=graph", STRLEN("MESSAGE=graph")} + }; + struct iovec message2[] = { + {(char*) "MESSAGE=graph\n", STRLEN("MESSAGE=graph\n")} + }; + + assert_se(sd_journal_print(LOG_INFO, "piepapo") == 0); + + assert_se(sd_journal_send("MESSAGE=foobar", + "VALUE=%i", 7, + NULL) == 0); + + errno = ENOENT; + assert_se(sd_journal_perror("Foobar") == 0); + + assert_se(sd_journal_perror("") == 0); + + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + assert_se(sd_journal_send("MESSAGE=Huge field attached", + huge, + NULL) == 0); + + assert_se(sd_journal_send("MESSAGE=uiui", + "VALUE=A", + "VALUE=B", + "VALUE=C", + "SINGLETON=1", + "OTHERVALUE=X", + "OTHERVALUE=Y", + "WITH_BINARY=this is a binary value \a", + NULL) == 0); + + syslog(LOG_NOTICE, "Hello World!"); + + assert_se(sd_journal_print(LOG_NOTICE, "Hello World") == 0); + + assert_se(sd_journal_send("MESSAGE=Hello World!", + "MESSAGE_ID=52fb62f99e2c49d89cfbf9d6de5e3555", + "PRIORITY=5", + "HOME=%s", getenv("HOME"), + "TERM=%s", getenv("TERM"), + "PAGE_SIZE=%li", sysconf(_SC_PAGESIZE), + "N_CPUS=%li", sysconf(_SC_NPROCESSORS_ONLN), + NULL) == 0); + + assert_se(sd_journal_sendv(graph1, 1) == 0); + assert_se(sd_journal_sendv(graph2, 1) == 0); + assert_se(sd_journal_sendv(message1, 1) == 0); + assert_se(sd_journal_sendv(message2, 1) == 0); + + /* test without location fields */ +#undef sd_journal_sendv + assert_se(sd_journal_sendv(graph1, 1) == 0); + assert_se(sd_journal_sendv(graph2, 1) == 0); + assert_se(sd_journal_sendv(message1, 1) == 0); + assert_se(sd_journal_sendv(message2, 1) == 0); +} + +int main(int argc, char *argv[]) { + test_journal_print(); + test_journal_send(); + + /* Sleep a bit to make it easy for journald to collect metadata. */ + sleep(1); + + return 0; +} diff --git a/src/journal/test-journal-stream.c b/src/journal/test-journal-stream.c new file mode 100644 index 0000000..a121859 --- /dev/null +++ b/src/journal/test-journal-stream.c @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "io-util.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "util.h" + +#define N_ENTRIES 200 + +static void verify_contents(sd_journal *j, unsigned skip) { + unsigned i; + + assert_se(j); + + i = 0; + SD_JOURNAL_FOREACH(j) { + const void *d; + char *k, *c; + size_t l; + unsigned u = 0; + + assert_se(sd_journal_get_cursor(j, &k) >= 0); + printf("cursor: %s\n", k); + free(k); + + assert_se(sd_journal_get_data(j, "MAGIC", &d, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) d); + + assert_se(sd_journal_get_data(j, "NUMBER", &d, &l) >= 0); + assert_se(k = strndup(d, l)); + printf("\t%s\n", k); + + if (skip > 0) { + assert_se(safe_atou(k + 7, &u) >= 0); + assert_se(i == u); + i += skip; + } + + free(k); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + free(c); + } + + if (skip > 0) + assert_se(i == N_ENTRIES); +} + +static void run_test(void) { + JournalFile *one, *two, *three; + char t[] = "/var/tmp/journal-stream-XXXXXX"; + unsigned i; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + char *z; + const void *data; + size_t l; + dual_timestamp previous_ts = DUAL_TIMESTAMP_NULL; + + assert_se(mkdtemp(t)); + assert_se(chdir(t) >= 0); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + assert_se(journal_file_open(-1, "one.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &one) == 0); + assert_se(journal_file_open(-1, "two.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &two) == 0); + assert_se(journal_file_open(-1, "three.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &three) == 0); + + for (i = 0; i < N_ENTRIES; i++) { + char *p, *q; + dual_timestamp ts; + struct iovec iovec[2]; + + dual_timestamp_get(&ts); + + if (ts.monotonic <= previous_ts.monotonic) + ts.monotonic = previous_ts.monotonic + 1; + + if (ts.realtime <= previous_ts.realtime) + ts.realtime = previous_ts.realtime + 1; + + previous_ts = ts; + + assert_se(asprintf(&p, "NUMBER=%u", i) >= 0); + iovec[0] = IOVEC_MAKE(p, strlen(p)); + + assert_se(asprintf(&q, "MAGIC=%s", i % 5 == 0 ? "quux" : "waldo") >= 0); + + iovec[1] = IOVEC_MAKE(q, strlen(q)); + + if (i % 10 == 0) + assert_se(journal_file_append_entry(three, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + else { + if (i % 3 == 0) + assert_se(journal_file_append_entry(two, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + + assert_se(journal_file_append_entry(one, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + } + + free(p); + free(q); + } + + (void) journal_file_close(one); + (void) journal_file_close(two); + (void) journal_file_close(three); + + assert_se(sd_journal_open_directory(&j, t, 0) >= 0); + + assert_se(sd_journal_add_match(j, "MAGIC=quux", 0) >= 0); + SD_JOURNAL_FOREACH_BACKWARDS(j) { + _cleanup_free_ char *c; + + assert_se(sd_journal_get_data(j, "NUMBER", &data, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) data); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + } + + SD_JOURNAL_FOREACH(j) { + _cleanup_free_ char *c; + + assert_se(sd_journal_get_data(j, "NUMBER", &data, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) data); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + } + + sd_journal_flush_matches(j); + + verify_contents(j, 1); + + printf("NEXT TEST\n"); + assert_se(sd_journal_add_match(j, "MAGIC=quux", 0) >= 0); + + assert_se(z = journal_make_match_string(j)); + printf("resulting match expression is: %s\n", z); + free(z); + + verify_contents(j, 5); + + printf("NEXT TEST\n"); + sd_journal_flush_matches(j); + assert_se(sd_journal_add_match(j, "MAGIC=waldo", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=10", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=11", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=12", 0) >= 0); + + assert_se(z = journal_make_match_string(j)); + printf("resulting match expression is: %s\n", z); + free(z); + + verify_contents(j, 0); + + assert_se(sd_journal_query_unique(j, "NUMBER") >= 0); + SD_JOURNAL_FOREACH_UNIQUE(j, data, l) + printf("%.*s\n", (int) l, (const char*) data); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + +int main(int argc, char *argv[]) { + + /* journal_file_open requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + test_setup_logging(LOG_DEBUG); + + /* Run this test twice. Once with old hashing and once with new hashing */ + assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0); + run_test(); + + assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0); + run_test(); + + return 0; +} diff --git a/src/journal/test-journal-syslog.c b/src/journal/test-journal-syslog.c new file mode 100644 index 0000000..33f4129 --- /dev/null +++ b/src/journal/test-journal-syslog.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "journald-syslog.h" +#include "macro.h" +#include "string-util.h" +#include "syslog-util.h" + +static void test_syslog_parse_identifier(const char *str, + const char *ident, const char *pid, const char *rest, int ret) { + const char *buf = str; + _cleanup_free_ char *ident2 = NULL, *pid2 = NULL; + int ret2; + + ret2 = syslog_parse_identifier(&buf, &ident2, &pid2); + + assert_se(ret == ret2); + assert_se(ident == ident2 || streq_ptr(ident, ident2)); + assert_se(pid == pid2 || streq_ptr(pid, pid2)); + assert_se(streq(buf, rest)); +} + +static void test_syslog_parse_priority(const char *str, int priority, int ret) { + const char *buf = str; + int priority2 = 0, ret2; + + ret2 = syslog_parse_priority(&buf, &priority2, false); + + assert_se(ret == ret2); + if (ret2 == 1) + assert_se(priority == priority2); +} + +int main(void) { + test_syslog_parse_identifier("pidu[111]: xxx", "pidu", "111", "xxx", 11); + test_syslog_parse_identifier("pidu: xxx", "pidu", NULL, "xxx", 6); + test_syslog_parse_identifier("pidu: xxx", "pidu", NULL, " xxx", 6); + test_syslog_parse_identifier("pidu xxx", NULL, NULL, "pidu xxx", 0); + test_syslog_parse_identifier(" pidu xxx", NULL, NULL, " pidu xxx", 0); + test_syslog_parse_identifier("", NULL, NULL, "", 0); + test_syslog_parse_identifier(" ", NULL, NULL, " ", 0); + test_syslog_parse_identifier(":", "", NULL, "", 1); + test_syslog_parse_identifier(": ", "", NULL, " ", 2); + test_syslog_parse_identifier(" :", "", NULL, "", 2); + test_syslog_parse_identifier(" pidu:", "pidu", NULL, "", 8); + test_syslog_parse_identifier("pidu:", "pidu", NULL, "", 5); + test_syslog_parse_identifier("pidu: ", "pidu", NULL, "", 6); + test_syslog_parse_identifier("pidu : ", NULL, NULL, "pidu : ", 0); + + test_syslog_parse_priority("<>", 0, 0); + test_syslog_parse_priority("<>aaa", 0, 0); + test_syslog_parse_priority("<aaaa>", 0, 0); + test_syslog_parse_priority("<aaaa>aaa", 0, 0); + test_syslog_parse_priority(" <aaaa>", 0, 0); + test_syslog_parse_priority(" <aaaa>aaa", 0, 0); + /* TODO: add test cases of valid priorities */ + + return 0; +} diff --git a/src/journal/test-journal-verify.c b/src/journal/test-journal-verify.c new file mode 100644 index 0000000..d208e46 --- /dev/null +++ b/src/journal/test-journal-verify.c @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> + +#include "chattr-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "journal-file.h" +#include "journal-verify.h" +#include "log.h" +#include "rm-rf.h" +#include "terminal-util.h" +#include "tests.h" +#include "util.h" + +#define N_ENTRIES 6000 +#define RANDOM_RANGE 77 + +static void bit_toggle(const char *fn, uint64_t p) { + uint8_t b; + ssize_t r; + int fd; + + fd = open(fn, O_RDWR|O_CLOEXEC); + assert_se(fd >= 0); + + r = pread(fd, &b, 1, p/8); + assert_se(r == 1); + + b ^= 1 << (p % 8); + + r = pwrite(fd, &b, 1, p/8); + assert_se(r == 1); + + safe_close(fd); +} + +static int raw_verify(const char *fn, const char *verification_key) { + JournalFile *f; + int r; + + r = journal_file_open(-1, fn, O_RDONLY, 0666, true, (uint64_t) -1, !!verification_key, NULL, NULL, NULL, NULL, &f); + if (r < 0) + return r; + + r = journal_file_verify(f, verification_key, NULL, NULL, NULL, false); + (void) journal_file_close(f); + + return r; +} + +int main(int argc, char *argv[]) { + char t[] = "/var/tmp/journal-XXXXXX"; + unsigned n; + JournalFile *f; + const char *verification_key = argv[1]; + usec_t from = 0, to = 0, total = 0; + char a[FORMAT_TIMESTAMP_MAX]; + char b[FORMAT_TIMESTAMP_MAX]; + char c[FORMAT_TIMESPAN_MAX]; + struct stat st; + uint64_t p; + + /* journal_file_open requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + test_setup_logging(LOG_DEBUG); + + assert_se(mkdtemp(t)); + assert_se(chdir(t) >= 0); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + log_info("Generating..."); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, !!verification_key, NULL, NULL, NULL, NULL, &f) == 0); + + for (n = 0; n < N_ENTRIES; n++) { + struct iovec iovec; + struct dual_timestamp ts; + char *test; + + dual_timestamp_get(&ts); + + assert_se(asprintf(&test, "RANDOM=%lu", random() % RANDOM_RANGE)); + + iovec = IOVEC_MAKE_STRING(test); + + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + + free(test); + } + + (void) journal_file_close(f); + + log_info("Verifying..."); + + assert_se(journal_file_open(-1, "test.journal", O_RDONLY, 0666, true, (uint64_t) -1, !!verification_key, NULL, NULL, NULL, NULL, &f) == 0); + /* journal_file_print_header(f); */ + journal_file_dump(f); + + assert_se(journal_file_verify(f, verification_key, &from, &to, &total, true) >= 0); + + if (verification_key && JOURNAL_HEADER_SEALED(f->header)) + log_info("=> Validated from %s to %s, %s missing", + format_timestamp(a, sizeof(a), from), + format_timestamp(b, sizeof(b), to), + format_timespan(c, sizeof(c), total > to ? total - to : 0, 0)); + + (void) journal_file_close(f); + + if (verification_key) { + log_info("Toggling bits..."); + + assert_se(stat("test.journal", &st) >= 0); + + for (p = 38448*8+0; p < ((uint64_t) st.st_size * 8); p ++) { + bit_toggle("test.journal", p); + + log_info("[ %"PRIu64"+%"PRIu64"]", p / 8, p % 8); + + if (raw_verify("test.journal", verification_key) >= 0) + log_notice(ANSI_HIGHLIGHT_RED ">>>> %"PRIu64" (bit %"PRIu64") can be toggled without detection." ANSI_NORMAL, p / 8, p % 8); + + bit_toggle("test.journal", p); + } + } + + log_info("Exiting..."); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + return 0; +} diff --git a/src/journal/test-journal.c b/src/journal/test-journal.c new file mode 100644 index 0000000..f8f08b5 --- /dev/null +++ b/src/journal/test-journal.c @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "chattr-util.h" +#include "io-util.h" +#include "journal-authenticate.h" +#include "journal-file.h" +#include "journal-vacuum.h" +#include "log.h" +#include "rm-rf.h" +#include "tests.h" + +static bool arg_keep = false; + +static void mkdtemp_chdir_chattr(char *path) { + assert_se(mkdtemp(path)); + assert_se(chdir(path) >= 0); + + /* Speed up things a bit on btrfs, ensuring that CoW is turned off for all files created in our + * directory during the test run */ + (void) chattr_path(path, FS_NOCOW_FL, FS_NOCOW_FL, NULL); +} + +static void test_non_empty(void) { + dual_timestamp ts; + JournalFile *f; + struct iovec iovec; + static const char test[] = "TEST1=1", test2[] = "TEST2=2"; + Object *o; + uint64_t p; + sd_id128_t fake_boot_id; + char t[] = "/var/tmp/journal-XXXXXX"; + + test_setup_logging(LOG_DEBUG); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, true, NULL, NULL, NULL, NULL, &f) == 0); + + assert_se(dual_timestamp_get(&ts)); + assert_se(sd_id128_randomize(&fake_boot_id) == 0); + + iovec = IOVEC_MAKE_STRING(test); + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + + iovec = IOVEC_MAKE_STRING(test2); + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + + iovec = IOVEC_MAKE_STRING(test); + assert_se(journal_file_append_entry(f, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL) == 0); + +#if HAVE_GCRYPT + journal_file_append_tag(f); +#endif + journal_file_dump(f); + + assert_se(journal_file_next_entry(f, 0, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + assert_se(sd_id128_equal(o->entry.boot_id, fake_boot_id)); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 0); + + assert_se(journal_file_next_entry(f, 0, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_find_data_object(f, test, strlen(test), NULL, &p) == 1); + assert_se(journal_file_next_entry_for_data(f, NULL, 0, p, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_next_entry_for_data(f, NULL, 0, p, DIRECTION_UP, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + + assert_se(journal_file_find_data_object(f, test2, strlen(test2), NULL, &p) == 1); + assert_se(journal_file_next_entry_for_data(f, NULL, 0, p, DIRECTION_UP, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_next_entry_for_data(f, NULL, 0, p, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_find_data_object(f, "quux", 4, NULL, &p) == 0); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 1, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 3, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 2, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 10, DIRECTION_DOWN, &o, NULL) == 0); + + journal_file_rotate(&f, true, (uint64_t) -1, true, NULL); + journal_file_rotate(&f, true, (uint64_t) -1, true, NULL); + + (void) journal_file_close(f); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + puts("------------------------------------------------------------"); +} + +static void test_empty(void) { + JournalFile *f1, *f2, *f3, *f4; + char t[] = "/var/tmp/journal-XXXXXX"; + + test_setup_logging(LOG_DEBUG); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, 0666, false, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &f1) == 0); + + assert_se(journal_file_open(-1, "test-compress.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, false, NULL, NULL, NULL, NULL, &f2) == 0); + + assert_se(journal_file_open(-1, "test-seal.journal", O_RDWR|O_CREAT, 0666, false, (uint64_t) -1, true, NULL, NULL, NULL, NULL, &f3) == 0); + + assert_se(journal_file_open(-1, "test-seal-compress.journal", O_RDWR|O_CREAT, 0666, true, (uint64_t) -1, true, NULL, NULL, NULL, NULL, &f4) == 0); + + journal_file_print_header(f1); + puts(""); + journal_file_print_header(f2); + puts(""); + journal_file_print_header(f3); + puts(""); + journal_file_print_header(f4); + puts(""); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + (void) journal_file_close(f1); + (void) journal_file_close(f2); + (void) journal_file_close(f3); + (void) journal_file_close(f4); +} + +#if HAVE_COMPRESSION +static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) { + dual_timestamp ts; + JournalFile *f; + struct iovec iovec; + Object *o; + uint64_t p; + char t[] = "/var/tmp/journal-XXXXXX"; + char data[2048] = {0}; + bool is_compressed; + int r; + + assert_se(data_size <= sizeof(data)); + + test_setup_logging(LOG_DEBUG); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, 0666, true, compress_threshold, true, NULL, NULL, NULL, NULL, &f) == 0); + + dual_timestamp_get(&ts); + + iovec = IOVEC_MAKE(data, data_size); + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + +#if HAVE_GCRYPT + journal_file_append_tag(f); +#endif + journal_file_dump(f); + + /* We have to partially reimplement some of the dump logic, because the normal next_entry does the + * decompression for us. */ + p = le64toh(f->header->header_size); + for (;;) { + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + assert_se(r == 0); + if (o->object.type == OBJECT_DATA) + break; + + assert_se(p < le64toh(f->header->tail_object_offset)); + p = p + ALIGN64(le64toh(o->object.size)); + } + + is_compressed = (o->object.flags & OBJECT_COMPRESSION_MASK) != 0; + + (void) journal_file_close(f); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + puts("------------------------------------------------------------"); + + return is_compressed; +} + +static void test_min_compress_size(void) { + /* Note that XZ will actually fail to compress anything under 80 bytes, so you have to choose the limits + * carefully */ + + /* DEFAULT_MIN_COMPRESS_SIZE is 512 */ + assert_se(!check_compressed((uint64_t) -1, 255)); + assert_se(check_compressed((uint64_t) -1, 513)); + + /* compress everything */ + assert_se(check_compressed(0, 96)); + assert_se(check_compressed(8, 96)); + + /* Ensure we don't try to compress less than 8 bytes */ + assert_se(!check_compressed(0, 7)); + + /* check boundary conditions */ + assert_se(check_compressed(256, 256)); + assert_se(!check_compressed(256, 255)); +} +#endif + +int main(int argc, char *argv[]) { + arg_keep = argc > 1; + + test_setup_logging(LOG_INFO); + + /* journal_file_open requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + test_non_empty(); + test_empty(); +#if HAVE_COMPRESSION + test_min_compress_size(); +#endif + + return 0; +} diff --git a/src/journal/test-mmap-cache.c b/src/journal/test-mmap-cache.c new file mode 100644 index 0000000..d1d2876 --- /dev/null +++ b/src/journal/test-mmap-cache.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "fd-util.h" +#include "macro.h" +#include "mmap-cache.h" +#include "tmpfile-util.h" +#include "util.h" + +int main(int argc, char *argv[]) { + MMapFileDescriptor *fx; + int x, y, z, r; + char px[] = "/tmp/testmmapXXXXXXX", py[] = "/tmp/testmmapYXXXXXX", pz[] = "/tmp/testmmapZXXXXXX"; + MMapCache *m; + void *p, *q; + + assert_se(m = mmap_cache_new()); + + x = mkostemp_safe(px); + assert_se(x >= 0); + unlink(px); + + assert_se(fx = mmap_cache_add_fd(m, x)); + + y = mkostemp_safe(py); + assert_se(y >= 0); + unlink(py); + + z = mkostemp_safe(pz); + assert_se(z >= 0); + unlink(pz); + + r = mmap_cache_get(m, fx, PROT_READ, 0, false, 1, 2, NULL, &p, NULL); + assert_se(r >= 0); + + r = mmap_cache_get(m, fx, PROT_READ, 0, false, 2, 2, NULL, &q, NULL); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 1 == (uint8_t*) q); + + r = mmap_cache_get(m, fx, PROT_READ, 1, false, 3, 2, NULL, &q, NULL); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 2 == (uint8_t*) q); + + r = mmap_cache_get(m, fx, PROT_READ, 0, false, 16ULL*1024ULL*1024ULL, 2, NULL, &p, NULL); + assert_se(r >= 0); + + r = mmap_cache_get(m, fx, PROT_READ, 1, false, 16ULL*1024ULL*1024ULL+1, 2, NULL, &q, NULL); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 1 == (uint8_t*) q); + + mmap_cache_free_fd(m, fx); + mmap_cache_unref(m); + + safe_close(x); + safe_close(y); + safe_close(z); + + return 0; +} |