diff options
Diffstat (limited to 'src/libutil/str_util.h')
-rw-r--r-- | src/libutil/str_util.h | 565 |
1 files changed, 565 insertions, 0 deletions
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h new file mode 100644 index 0000000..07560cc --- /dev/null +++ b/src/libutil/str_util.h @@ -0,0 +1,565 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBUTIL_STR_UTIL_H_ +#define SRC_LIBUTIL_STR_UTIL_H_ + +#include "config.h" +#include "ucl.h" +#include "fstring.h" + +#include <stdalign.h> + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_newlines_type { + RSPAMD_TASK_NEWLINES_CR = 0, + RSPAMD_TASK_NEWLINES_LF, + RSPAMD_TASK_NEWLINES_CRLF, + RSPAMD_TASK_NEWLINES_MAX +}; + +/** + * Compare two memory regions of size `l` using case insensitive matching + */ +gint rspamd_lc_cmp(const gchar *s, const gchar *d, gsize l); + +/** + * Convert string to lowercase in-place using ASCII conversion + */ +guint rspamd_str_lc(gchar *str, guint size); + +/** + * Performs ascii copy & lowercase + * @param src + * @param size + * @return + */ +gsize rspamd_str_copy_lc(const gchar *src, gchar *dst, gsize size); + +/** + * Convert string to lowercase in-place using utf (limited) conversion + */ +guint rspamd_str_lc_utf8(gchar *str, guint size); + +/* + * Hash table utility functions for case insensitive hashing + */ +guint64 rspamd_icase_hash(const gchar *in, gsize len, guint64 seed); + +guint rspamd_strcase_hash(gconstpointer key); + +gboolean rspamd_strcase_equal(gconstpointer v, gconstpointer v2); + +/* + * Hash table utility functions for case sensitive hashing + */ +guint rspamd_str_hash(gconstpointer key); + +gboolean rspamd_str_equal(gconstpointer v, gconstpointer v2); + + +/* + * Hash table utility functions for hashing fixed strings + */ +guint rspamd_ftok_icase_hash(gconstpointer key); + +gboolean rspamd_ftok_icase_equal(gconstpointer v, gconstpointer v2); + +/* Use in khash for speed */ +#define rspamd_ftok_hash(key) _wyhash32((key)->begin, (key)->len, 0) +#define rspamd_ftok_equal(v1, v2) ((v1)->len == (v2)->len && memcmp((v1)->begin, (v2)->begin, (v1)->len) == 0) + +guint rspamd_gstring_icase_hash(gconstpointer key); + +gboolean rspamd_gstring_icase_equal(gconstpointer v, gconstpointer v2); + +/** + * Copy src to dest limited to len, in compare with standard strlcpy(3) rspamd strlcpy does not + * traverse the whole string and it is possible to use it for non NULL terminated strings. This is + * more like memccpy(dst, src, size, '\0') + * + * @param dst destination string + * @param src source string + * @param siz length of destination buffer + * @return bytes copied + */ +gsize rspamd_strlcpy_fast(gchar *dst, const gchar *src, gsize siz); + +gsize rspamd_strlcpy_safe(gchar *dst, const gchar *src, gsize siz); + +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define rspamd_strlcpy rspamd_strlcpy_safe +#else +#ifdef __SANITIZE_ADDRESS__ +#define rspamd_strlcpy rspamd_strlcpy_safe +#else +#define rspamd_strlcpy rspamd_strlcpy_fast +#endif +#endif +#else +#ifdef __SANITIZE_ADDRESS__ +#define rspamd_strlcpy rspamd_strlcpy_safe +#else +#define rspamd_strlcpy rspamd_strlcpy_fast +#endif +#endif + +/** + * Copies `srclen` characters from `src` to `dst` ignoring \0 + * @param src + * @param srclen + * @param dest + * @param destlen + * @return number of bytes copied + */ +gsize rspamd_null_safe_copy(const gchar *src, gsize srclen, + gchar *dest, gsize destlen); + +/* + * Try to convert string of length to long + */ +gboolean rspamd_strtol(const gchar *s, gsize len, glong *value); + +/* + * Try to convert a string of length to unsigned long + */ +gboolean rspamd_strtoul(const gchar *s, gsize len, gulong *value); +gboolean rspamd_strtou64(const gchar *s, gsize len, guint64 *value); + +/* + * Try to convert a hex string of length to unsigned long + */ +gboolean rspamd_xstrtoul(const gchar *s, gsize len, gulong *value); + +/** + * Utility function to provide mem_pool copy for rspamd_hash_table_copy function + * @param data string to copy + * @param ud memory pool to use + * @return + */ +gpointer rspamd_str_pool_copy(gconstpointer data, gpointer ud); + +/** + * Encode string using hex encoding + * @param in input + * @param inlen input length + * @return freshly allocated base32 encoding of a specified string + */ +gchar *rspamd_encode_hex(const guchar *in, gsize inlen); + +/** + * Decode string using hex encoding + * @param in input + * @param inlen input length + * @return freshly allocated base32 decoded value or NULL if input is invalid + */ +guchar *rspamd_decode_hex(const gchar *in, gsize inlen); + +enum rspamd_base32_type { + RSPAMD_BASE32_DEFAULT = 0, + RSPAMD_BASE32_ZBASE = 0, + RSPAMD_BASE32_BLEACH, + RSPAMD_BASE32_RFC, + RSPAMD_BASE32_INVALID = -1, +}; + +/** + * Returns base32 type from a string or RSPAMD_BASE32_INVALID + * @param str + * @return + */ +enum rspamd_base32_type rspamd_base32_decode_type_from_str(const gchar *str); + +/** + * Encode string using base32 encoding + * @param in input + * @param inlen input length + * @return freshly allocated base32 encoding of a specified string + */ +gchar *rspamd_encode_base32(const guchar *in, gsize inlen, + enum rspamd_base32_type type); + +/** + * Decode string using base32 encoding + * @param in input + * @param inlen input length + * @return freshly allocated base32 decoded value or NULL if input is invalid + */ +guchar *rspamd_decode_base32(const gchar *in, gsize inlen, gsize *outlen, enum rspamd_base32_type type); + +/** + * Encode string using base32 encoding + * @param in input + * @param inlen input length + * @param out output buf + * @param outlen output buf len + * @return encoded len if `outlen` is enough to encode `inlen` + */ +gint rspamd_encode_base32_buf(const guchar *in, gsize inlen, gchar *out, + gsize outlen, enum rspamd_base32_type type); + +/** + * Decode string using base32 encoding + * @param in input + * @param inlen input length + * @param out output buf (may overlap with `in`) + * @param outlen output buf len + * @return decoded len if in is valid base32 and `outlen` is enough to encode `inlen` + */ +gint rspamd_decode_base32_buf(const gchar *in, gsize inlen, guchar *out, + gsize outlen, enum rspamd_base32_type type); + +/** + * Encode string using hex encoding + * @param in input + * @param inlen input length + * @param out output buf + * @param outlen output buf len + * @return encoded len if `outlen` is enough to encode `inlen` + */ +gint rspamd_encode_hex_buf(const guchar *in, gsize inlen, gchar *out, + gsize outlen); + + +/** + * Decode string using hex encoding + * @param in input + * @param inlen input length + * @param out output buf (may overlap with `in`) + * @param outlen output buf len + * @return decoded len if in is valid hex and `outlen` is enough to encode `inlen` + */ +gssize rspamd_decode_hex_buf(const gchar *in, gsize inlen, + guchar *out, gsize outlen); + +/** + * Common version of base64 encoder + * @param in + * @param inlen + * @param str_len + * @param outlen + * @param fold + * @param how + * @return + */ +gchar * +rspamd_encode_base64_common(const guchar *in, + gsize inlen, + gint str_len, + gsize *outlen, + gboolean fold, + enum rspamd_newlines_type how); + +/** + * Encode string using base64 encoding + * @param in input + * @param inlen input length + * @param str_len maximum string length (if <= 0 then no lines are split) + * @return freshly allocated base64 encoded value or NULL if input is invalid + */ +gchar *rspamd_encode_base64(const guchar *in, gsize inlen, gint str_len, + gsize *outlen); + +/** + * Encode and fold string using base64 encoding + * @param in input + * @param inlen input length + * @param str_len maximum string length (if <= 0 then no lines are split) + * @return freshly allocated base64 encoded value or NULL if input is invalid + */ +gchar *rspamd_encode_base64_fold(const guchar *in, gsize inlen, gint str_len, + gsize *outlen, enum rspamd_newlines_type how); + +/** + * Encode and fold string using quoted printable encoding + * @param in input + * @param inlen input length + * @param str_len maximum string length (if <= 0 then no lines are split) + * @return freshly allocated base64 encoded value or NULL if input is invalid + */ +gchar *rspamd_encode_qp_fold(const guchar *in, gsize inlen, gint str_len, + gsize *outlen, enum rspamd_newlines_type how); + +/** + * Decode quoted-printable encoded buffer, input and output must not overlap + * @param in input + * @param inlen length of input + * @param out output + * @param outlen length of output + * @return real size of decoded output or (-1) if outlen is not enough + */ +gssize rspamd_decode_qp_buf(const gchar *in, gsize inlen, + gchar *out, gsize outlen); + +/** + * Decode uuencode encoded buffer, input and output must not overlap + * @param in input + * @param inlen length of input + * @param out output + * @param outlen length of output + * @return real size of decoded output or (-1) if outlen is not enough + */ +gssize rspamd_decode_uue_buf(const gchar *in, gsize inlen, + gchar *out, gsize outlen); + +/** + * Decode quoted-printable encoded buffer using rfc2047 format, input and output must not overlap + * @param in input + * @param inlen length of input + * @param out output + * @param outlen length of output + * @return real size of decoded output or (-1) if outlen is not enough + */ +gssize rspamd_decode_qp2047_buf(const gchar *in, gsize inlen, + gchar *out, gsize outlen); + +/** + * Encode quoted-printable buffer using rfc2047 format, input and output must not overlap + * @param in + * @param inlen + * @param out + * @param outlen + * @return + */ +gssize rspamd_encode_qp2047_buf(const gchar *in, gsize inlen, + gchar *out, gsize outlen); + +#ifndef g_tolower +#define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) +#endif + +/** + * Return levenstein distance between two strings + * @param s1 + * @param s1len + * @param s2 + * @param s2len + * @return + */ +gint rspamd_strings_levenshtein_distance(const gchar *s1, gsize s1len, + const gchar *s2, gsize s2len, guint replace_cost); + +/** + * Fold header using rfc822 rules, return new GString from the previous one + * @param name name of header (used just for folding) + * @param value value of header + * @param fold_max + * @param how + * @param fold_on_chars + * @return new GString with the folded value + */ +GString *rspamd_header_value_fold(const gchar *name, + gsize name_len, + const gchar *value, + gsize value_len, + guint fold_max, + enum rspamd_newlines_type how, + const gchar *fold_on_chars); + +/** + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 + * @param in input + * @param inlen input len + * @param srch search string + * @param srchlen length of the search string + * @return position of the first substring match or (-1) if not found + */ +goffset rspamd_substring_search(const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen); + +/** + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in caseless matter (ASCII only) + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 + * @param in input + * @param inlen input len + * @param srch search string + * @param srchlen length of the search string + * @return position of the first substring match or (-1) if not found + */ +goffset rspamd_substring_search_caseless(const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen); + +/** + * Search for end-of-headers mark in the input string. Returns position just after + * the last header in message (but before the last newline character). + * Hence, to obtain the real EOH position, it is also required to skip + * space characters + */ +goffset rspamd_string_find_eoh(GString *input, goffset *body_start); + + +#define rspamd_ucl_emit_gstring(o, t, target) \ + rspamd_ucl_emit_gstring_comments((o), (t), (target), NULL) + +/** + * Emit UCL object to gstring + * @param obj object to emit + * @param emit_type emitter type + * @param comments optional comments object + * @param target target string + */ +void rspamd_ucl_emit_gstring_comments(const ucl_object_t *obj, + enum ucl_emitter emit_type, + GString *target, + const ucl_object_t *comments); + +#define rspamd_ucl_emit_fstring(o, t, target) \ + rspamd_ucl_emit_fstring_comments((o), (t), (target), NULL) + +/** + * Emit UCL object to fstring + * @param obj object to emit + * @param emit_type emitter type + * * @param comments optional comments object + * @param target target string + */ +void rspamd_ucl_emit_fstring_comments(const ucl_object_t *obj, + enum ucl_emitter emit_type, + rspamd_fstring_t **target, + const ucl_object_t *comments); + +extern const guchar lc_map[256]; + +/** + * Search for the last occurrence of character `c` in memory block of size `len` + * @param m + * @param c + * @param len + * @return pointer to the last occurrence or NULL + */ +#ifdef HAVE_MEMRCHR +#define rspamd_memrchr memrchr +#else +void *rspamd_memrchr(const void *m, gint c, gsize len); +#endif + +/** + * Return length of memory segment starting in `s` that contains no chars from `e` + * @param s any input + * @param e zero terminated string of exceptions + * @param len length of `s` + * @return segment size + */ +gsize rspamd_memcspn(const gchar *s, const gchar *e, gsize len); + +/** + * Return length of memory segment starting in `s` that contains only chars from `e` + * @param s any input + * @param e zero terminated string of inclusions + * @param len length of `s` + * @return segment size + */ +gsize rspamd_memspn(const gchar *s, const gchar *e, gsize len); + +/* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */ +#define rspamd_str_hasmore(x, n) ((((x) + ~0UL / 255 * (127 - (n))) | (x)) & ~0UL / 255 * 128) +/* + * Check if a pointer is aligned; n must be power of two + */ +#define rspamd_is_aligned(p, n) (((uintptr_t) (p) & ((uintptr_t) (n) -1)) == 0) +#define rspamd_is_aligned_as(p, v) rspamd_is_aligned(p, RSPAMD_ALIGNOF(__typeof((v)))) +gboolean rspamd_str_has_8bit(const guchar *beg, gsize len); + +struct UConverter; + +struct UConverter *rspamd_get_utf8_converter(void); + +struct UNormalizer2; + +const struct UNormalizer2 *rspamd_get_unicode_normalizer(void); + + +enum rspamd_regexp_escape_flags { + RSPAMD_REGEXP_ESCAPE_ASCII = 0, + RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, + RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, + RSPAMD_REGEXP_ESCAPE_RE = 1u << 2, +}; + +/** + * Escapes special characters when reading plain data to be processed in pcre + * @param pattern pattern to process + * @param slen source length + * @param dst_len destination length pointer (can be NULL) + * @param allow_glob allow glob expressions to be translated into pcre + * @return newly allocated zero terminated escaped pattern + */ +gchar * +rspamd_str_regexp_escape(const gchar *pattern, gsize slen, + gsize *dst_len, enum rspamd_regexp_escape_flags flags) G_GNUC_WARN_UNUSED_RESULT; + +/** + * Returns copy of src (zero terminated) where all unicode is made valid or replaced + * to FFFD characters. Caller must free string after usage + * @param src + * @param slen + * @param dstelen + * @return + */ +gchar *rspamd_str_make_utf_valid(const guchar *src, gsize slen, gsize *dstlen, + rspamd_mempool_t *pool) G_GNUC_WARN_UNUSED_RESULT; + +/** + * Strips characters in `strip_chars` from start and end of the GString + * @param s + * @param strip_chars + */ +gsize rspamd_gstring_strip(GString *s, const gchar *strip_chars); + +/** + * Strips characters in `strip_chars` from start and end of the sized string + * @param s + * @param strip_chars + */ +const gchar *rspamd_string_len_strip(const gchar *in, + gsize *len, const gchar *strip_chars) G_GNUC_WARN_UNUSED_RESULT; + +/** + * Returns a NULL terminated list of zero terminated strings based on splitting of + * the base string into parts. If pool is not NULL then memory is allocated from + * the pool. Otherwise, it is allocated from the heap using `g_malloc` (so + * g_strfreev could be used to free stuff) + * @param in + * @param len + * @param spill + * @param max_elts + * @return + */ +gchar **rspamd_string_len_split(const gchar *in, gsize len, + const gchar *spill, gint max_elts, rspamd_mempool_t *pool); + +#define IS_ZERO_WIDTH_SPACE(uc) ((uc) == 0x200B || \ + (uc) == 0x200C || \ + (uc) == 0x200D || \ + (uc) == 0xFEFF || \ + (uc) == 0x00AD) +#define IS_OBSCURED_CHAR(uc) (((uc) >= 0x200B && (uc) <= 0x200F) || \ + ((uc) >= 0x2028 && (uc) <= 0x202F) || \ + ((uc) >= 0x205F && (uc) <= 0x206F) || \ + (uc) == 0xFEFF) + +#define RSPAMD_LEN_CHECK_STARTS_WITH(s, len, lit) \ + ((len) >= sizeof(lit) - 1 && g_ascii_strncasecmp((s), (lit), sizeof(lit) - 1) == 0) + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBUTIL_STR_UTIL_H_ */ |