diff options
Diffstat (limited to 'src/basic/extract-word.c')
-rw-r--r-- | src/basic/extract-word.c | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/src/basic/extract-word.c b/src/basic/extract-word.c new file mode 100644 index 0000000..160f771 --- /dev/null +++ b/src/basic/extract-word.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <syslog.h> + +#include "alloc-util.h" +#include "escape.h" +#include "extract-word.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +int extract_first_word(const char **p, char **ret, const char *separators, ExtractFlags flags) { + _cleanup_free_ char *s = NULL; + size_t sz = 0; + char quote = 0; /* 0 or ' or " */ + bool backslash = false; /* whether we've just seen a backslash */ + char c; + int r; + + assert(p); + assert(ret); + assert(!FLAGS_SET(flags, EXTRACT_KEEP_QUOTE | EXTRACT_UNQUOTE)); + + /* Bail early if called after last value or with no input */ + if (!*p) + goto finish; + c = **p; + + if (!separators) + separators = WHITESPACE; + + /* Parses the first word of a string, and returns it in + * *ret. Removes all quotes in the process. When parsing fails + * (because of an uneven number of quotes or similar), leaves + * the pointer *p at the first invalid character. */ + + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) + if (!GREEDY_REALLOC(s, sz+1)) + return -ENOMEM; + + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + else if (strchr(separators, c)) { + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) { + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + (*p)++; + goto finish_force_next; + } + } else { + /* We found a non-blank character, so we will always + * want to return a string (even if it is empty), + * allocate it here. */ + if (!GREEDY_REALLOC(s, sz+1)) + return -ENOMEM; + break; + } + } + + for (;; (*p)++, c = **p) { + if (backslash) { + if (!GREEDY_REALLOC(s, sz+7)) + return -ENOMEM; + + if (c == 0) { + if ((flags & EXTRACT_UNESCAPE_RELAX) && + (quote == 0 || flags & EXTRACT_RELAX)) { + /* If we find an unquoted trailing backslash and we're in + * EXTRACT_UNESCAPE_RELAX mode, keep it verbatim in the + * output. + * + * Unbalanced quotes will only be allowed in EXTRACT_RELAX + * mode, EXTRACT_UNESCAPE_RELAX mode does not allow them. + */ + s[sz++] = '\\'; + goto finish_force_terminate; + } + if (flags & EXTRACT_RELAX) + goto finish_force_terminate; + return -EINVAL; + } + + if (flags & (EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS)) { + bool eight_bit = false; + char32_t u; + + if ((flags & EXTRACT_CUNESCAPE) && + (r = cunescape_one(*p, SIZE_MAX, &u, &eight_bit, false)) >= 0) { + /* A valid escaped sequence */ + assert(r >= 1); + + (*p) += r - 1; + + if (eight_bit) + s[sz++] = u; + else + sz += utf8_encode_unichar(s + sz, u); + } else if ((flags & EXTRACT_UNESCAPE_SEPARATORS) && + (strchr(separators, **p) || **p == '\\')) + /* An escaped separator char or the escape char itself */ + s[sz++] = c; + else if (flags & EXTRACT_UNESCAPE_RELAX) { + s[sz++] = '\\'; + s[sz++] = c; + } else + return -EINVAL; + } else + s[sz++] = c; + + backslash = false; + + } else if (quote != 0) { /* inside either single or double quotes */ + for (;; (*p)++, c = **p) { + if (c == 0) { + if (flags & EXTRACT_RELAX) + goto finish_force_terminate; + return -EINVAL; + } else if (c == quote) { /* found the end quote */ + quote = 0; + if (flags & EXTRACT_UNQUOTE) + break; + } else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) { + backslash = true; + break; + } + + if (!GREEDY_REALLOC(s, sz+2)) + return -ENOMEM; + + s[sz++] = c; + + if (quote == 0) + break; + } + + } else { + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + else if (IN_SET(c, '\'', '"') && (flags & (EXTRACT_KEEP_QUOTE | EXTRACT_UNQUOTE))) { + quote = c; + if (flags & EXTRACT_UNQUOTE) + break; + } else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) { + backslash = true; + break; + } else if (strchr(separators, c)) { + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) { + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + (*p)++; + goto finish_force_next; + } + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + /* Skip additional coalesced separators. */ + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + if (!strchr(separators, c)) + break; + } + goto finish; + + } + + if (!GREEDY_REALLOC(s, sz+2)) + return -ENOMEM; + + s[sz++] = c; + + if (quote != 0) + break; + } + } + } + +finish_force_terminate: + *p = NULL; +finish: + if (!s) { + *p = NULL; + *ret = NULL; + return 0; + } + +finish_force_next: + s[sz] = 0; + *ret = TAKE_PTR(s); + + return 1; +} + +int extract_first_word_and_warn( + const char **p, + char **ret, + const char *separators, + ExtractFlags flags, + const char *unit, + const char *filename, + unsigned line, + const char *rvalue) { + + /* Try to unquote it, if it fails, warn about it and try again + * but this time using EXTRACT_UNESCAPE_RELAX to keep the + * backslashes verbatim in invalid escape sequences. */ + + const char *save; + int r; + + save = *p; + r = extract_first_word(p, ret, separators, flags); + if (r >= 0) + return r; + + if (r == -EINVAL && !(flags & EXTRACT_UNESCAPE_RELAX)) { + + /* Retry it with EXTRACT_UNESCAPE_RELAX. */ + *p = save; + r = extract_first_word(p, ret, separators, flags|EXTRACT_UNESCAPE_RELAX); + if (r >= 0) { + /* It worked this time, hence it must have been an invalid escape sequence. */ + log_syntax(unit, LOG_WARNING, filename, line, EINVAL, "Ignoring unknown escape sequences: \"%s\"", *ret); + return r; + } + + /* If it's still EINVAL; then it must be unbalanced quoting, report this. */ + if (r == -EINVAL) + return log_syntax(unit, LOG_ERR, filename, line, r, "Unbalanced quoting, ignoring: \"%s\"", rvalue); + } + + /* Can be any error, report it */ + return log_syntax(unit, LOG_ERR, filename, line, r, "Unable to decode word \"%s\", ignoring: %m", rvalue); +} + +/* We pass ExtractFlags as unsigned int (to avoid undefined behaviour when passing + * an object that undergoes default argument promotion as an argument to va_start). + * Let's make sure that ExtractFlags fits into an unsigned int. */ +assert_cc(sizeof(enum ExtractFlags) <= sizeof(unsigned)); + +int extract_many_words(const char **p, const char *separators, unsigned flags, ...) { + va_list ap; + char **l; + int n = 0, i, c, r; + + /* Parses a number of words from a string, stripping any + * quotes if necessary. */ + + assert(p); + + /* Count how many words are expected */ + va_start(ap, flags); + for (;;) { + if (!va_arg(ap, char **)) + break; + n++; + } + va_end(ap); + + if (n <= 0) + return 0; + + /* Read all words into a temporary array */ + l = newa0(char*, n); + for (c = 0; c < n; c++) { + + r = extract_first_word(p, &l[c], separators, flags); + if (r < 0) { + free_many_charp(l, c); + return r; + } + + if (r == 0) + break; + } + + /* If we managed to parse all words, return them in the passed + * in parameters */ + va_start(ap, flags); + for (i = 0; i < n; i++) { + char **v; + + v = va_arg(ap, char **); + assert(v); + + *v = l[i]; + } + va_end(ap); + + return c; +} |