summaryrefslogtreecommitdiffstats
path: root/lib/util/charset/util_str.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/util/charset/util_str.c607
1 files changed, 607 insertions, 0 deletions
diff --git a/lib/util/charset/util_str.c b/lib/util/charset/util_str.c
new file mode 100644
index 0000000..2a4ccd9
--- /dev/null
+++ b/lib/util/charset/util_str.c
@@ -0,0 +1,607 @@
+/*
+ Unix SMB/CIFS implementation.
+ Samba utility functions
+ Copyright (C) Andrew Tridgell 1992-2001
+ Copyright (C) Simo Sorce 2001
+ Copyright (C) Andrew Bartlett 2011
+ Copyright (C) Jeremy Allison 1992-2007
+ Copyright (C) Martin Pool 2003
+ Copyright (C) James Peach 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/locale.h"
+#include "charset.h"
+#include "lib/util/fault.h"
+
+#ifdef strcasecmp
+#undef strcasecmp
+#endif
+#ifdef strncasecmp
+#undef strncasecmp
+#endif
+
+
+/**
+ * Case insensitive comparison of two multibyte strings, with the iconv
+ * handle specified explicitly (for testing).
+ *
+ * Both strings are decoded one codepoint at a time with
+ * next_codepoint_handle(). Codepoints that differ are compared again
+ * after toupper_m() and then after tolower_m(); only if they still
+ * differ is a non-zero result returned. As soon as either string yields
+ * INVALID_CODEPOINT the remainder is handed to the byte-wise
+ * strcasecmp().
+ *
+ * NULL pointers are accepted to simplify use in qsort(): identical
+ * pointers (including two NULLs) compare equal, and a lone NULL sorts
+ * before any string.
+ *
+ * @param iconv_handle handle passed through to next_codepoint_handle()
+ * @param s1 first string, may be NULL
+ * @param s2 second string, may be NULL
+ * @return a negative value, zero, or a positive value if s1 sorts
+ *         before, equal to, or after s2. Only the sign is meaningful.
+ **/
+_PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
+				 const char *s1, const char *s2)
+{
+	codepoint_t c1=0, c2=0;
+	codepoint_t u1=0, u2=0;
+	codepoint_t l1=0, l2=0;
+	size_t size1, size2;
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2) {
+		c1 = next_codepoint_handle(iconv_handle, s1, &size1);
+		c2 = next_codepoint_handle(iconv_handle, s2, &size2);
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/* not decodable: fall back to a byte-wise compare */
+			return strcasecmp(s1, s2);
+		}
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		u1 = toupper_m(c1);
+		u2 = toupper_m(c2);
+		if (u1 == u2) {
+			continue;
+		}
+
+		l1 = tolower_m(c1);
+		l2 = tolower_m(c2);
+		if (l1 == l2) {
+			continue;
+		}
+
+		/*
+		 * Compare rather than subtract: codepoint_t is declared
+		 * outside this file, and a subtraction converted to int
+		 * only has the right sign for some widths/signedness.
+		 */
+		return (l1 < l2) ? -1 : 1;
+	}
+
+	/*
+	 * At least one string is exhausted here. Compare the current
+	 * bytes as unsigned char, so that a string which has ended always
+	 * sorts before one that continues, even when the next byte has
+	 * its high bit set and plain char is signed.
+	 */
+	return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
+}
+
+/**
+ * Case insensitive string comparison.
+ *
+ * Convenience wrapper: calls strcasecmp_m_handle() with the handle
+ * returned by get_iconv_handle().
+ *
+ * @return whatever strcasecmp_m_handle() returns for s1 and s2
+ **/
+_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
+{
+	return strcasecmp_m_handle(get_iconv_handle(), s1, s2);
+}
+
+/**
+ * Case insensitive, length limited comparison of two multibyte strings,
+ * with the iconv handle specified explicitly (for testing).
+ *
+ * At most n characters (codepoints, not bytes) of each string are
+ * examined. Codepoints that differ are compared again after
+ * toupper_m() and then after tolower_m(). If either string yields
+ * INVALID_CODEPOINT the remainder is handed to the byte-wise
+ * strncasecmp().
+ *
+ * NULL pointers are accepted to simplify use in qsort(): identical
+ * pointers (including two NULLs) compare equal, and a lone NULL sorts
+ * before any string.
+ *
+ * @param iconv_handle handle passed through to next_codepoint_handle()
+ * @param s1 first string, may be NULL
+ * @param s2 second string, may be NULL
+ * @param n maximum number of characters to compare
+ * @return a negative value, zero, or a positive value if the compared
+ *         part of s1 sorts before, equal to, or after that of s2.
+ *         Only the sign is meaningful.
+ **/
+_PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
+				  const char *s1, const char *s2, size_t n)
+{
+	codepoint_t c1=0, c2=0;
+	codepoint_t u1=0, u2=0;
+	codepoint_t l1=0, l2=0;
+	size_t size1, size2;
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2 && n) {
+		n--;
+
+		c1 = next_codepoint_handle(iconv_handle, s1, &size1);
+		c2 = next_codepoint_handle(iconv_handle, s2, &size2);
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/*
+			 * n was specified in characters,
+			 * now we must convert it to bytes.
+			 * As bytes are the smallest
+			 * character unit, the following
+			 * increment and strncasecmp is always
+			 * safe.
+			 *
+			 * The source string was already known
+			 * to be n characters long, so we are
+			 * guaranteed to be able to look at the
+			 * (n remaining + size1) bytes from the
+			 * s1 position).
+			 */
+			n += size1;
+			return strncasecmp(s1, s2, n);
+		}
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		u1 = toupper_m(c1);
+		u2 = toupper_m(c2);
+		if (u1 == u2) {
+			continue;
+		}
+
+		l1 = tolower_m(c1);
+		l2 = tolower_m(c2);
+		if (l1 == l2) {
+			continue;
+		}
+
+		/*
+		 * Compare rather than subtract: codepoint_t is declared
+		 * outside this file, and a subtraction converted to int
+		 * only has the right sign for some widths/signedness.
+		 */
+		return (l1 < l2) ? -1 : 1;
+	}
+
+	/* the character budget ran out without finding a difference */
+	if (n == 0) {
+		return 0;
+	}
+
+	/*
+	 * At least one string is exhausted here. Compare the current
+	 * bytes as unsigned char, so that a string which has ended always
+	 * sorts before one that continues, even when the next byte has
+	 * its high bit set and plain char is signed.
+	 */
+	return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
+}
+
+/**
+ * Case insensitive string comparison, length limited.
+ *
+ * Convenience wrapper: calls strncasecmp_m_handle() with the handle
+ * returned by get_iconv_handle().
+ *
+ * @return whatever strncasecmp_m_handle() returns for s1, s2 and n
+ **/
+_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
+{
+	return strncasecmp_m_handle(get_iconv_handle(), s1, s2, n);
+}
+
+/**
+ * Test two strings for equality.
+ *
+ * @note The comparison is case-insensitive; it is delegated to
+ *       strcasecmp_m().
+ *
+ * @return true if strcasecmp_m() reports the strings as equal
+ **/
+_PUBLIC_ bool strequal_m(const char *s1, const char *s2)
+{
+	int cmp = strcasecmp_m(s1, s2);
+
+	return cmp == 0;
+}
+
+/**
+ * Test two strings for equality (case sensitive).
+ *
+ * Two NULL pointers are considered equal; a NULL pointer never equals
+ * a non-NULL string.
+ *
+ * @return true if both pointers are NULL or strcmp() finds the
+ *         strings identical
+ **/
+_PUBLIC_ bool strcsequal(const char *s1,const char *s2)
+{
+	if (s1 != NULL && s2 != NULL) {
+		/* also covers s1 == s2 for non-NULL pointers */
+		return strcmp(s1, s2) == 0;
+	}
+
+	/* at least one side is NULL: equal only if both are */
+	return s1 == s2;
+}
+
+/**
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
+ *
+ * A leading run of bytes without the high bit set is counted one unit
+ * per byte without decoding. From the first high-bit byte onwards each
+ * codepoint is decoded with next_codepoint_handle_ext() and sized for
+ * the destination charset.
+ *
+ * @param ic iconv handle passed to next_codepoint_handle_ext()
+ * @param s input string; NULL yields 0
+ * @param src_charset encoding of s
+ * @param dst_charset encoding the length is calculated for
+ * @return number of destination units, not counting any terminator
+ */
+_PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
+				    const char *s, charset_t src_charset, charset_t dst_charset)
+{
+	size_t units = 0;
+
+#ifdef DEVELOPER
+	switch (dst_charset) {
+	case CH_DOS:
+	case CH_UNIX:
+		smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
+	default:
+		break;
+	}
+
+	switch (src_charset) {
+	case CH_UTF16LE:
+	case CH_UTF16BE:
+		smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
+	default:
+		break;
+	}
+#endif
+	if (s == NULL) {
+		return 0;
+	}
+
+	/* fast path: one unit for every byte without the high bit */
+	for (; *s != '\0' && (((uint8_t)*s) & 0x80) == 0; s++) {
+		units++;
+	}
+
+	/* slow path: runs only if a high-bit byte stopped the scan above */
+	while (*s != '\0') {
+		size_t c_size;
+		size_t width;
+		codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
+							  src_charset, &c_size);
+		s += c_size;
+
+		switch (dst_charset) {
+		case CH_UTF16LE:
+		case CH_UTF16BE:
+		case CH_UTF16MUNGED:
+			/* one 16-bit unit, or two above 0xFFFF */
+			width = (c < 0x10000) ? 1 : 2;
+			break;
+		case CH_UTF8:
+			/*
+			 * this only checks ranges, and does not
+			 * check for invalid codepoints
+			 */
+			if (c < 0x80) {
+				width = 1;
+			} else if (c < 0x800) {
+				width = 2;
+			} else if (c < 0x10000) {
+				width = 3;
+			} else {
+				width = 4;
+			}
+			break;
+		default:
+			/*
+			 * non-unicode encoding:
+			 * assume that each codepoint fits into
+			 * one unit in the destination encoding.
+			 */
+			width = 1;
+			break;
+		}
+
+		units += width;
+	}
+
+	return units;
+}
+
+/**
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
+ *
+ * Convenience wrapper: calls strlen_m_ext_handle() with the handle
+ * returned by get_iconv_handle().
+ */
+_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
+{
+	return strlen_m_ext_handle(get_iconv_handle(), s,
+				   src_charset, dst_charset);
+}
+
+/**
+ * Like strlen_m_ext(), but adds one unit for the terminator.
+ *
+ * @return 0 if s is NULL, otherwise strlen_m_ext() + 1
+ */
+_PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
+				  const charset_t dst_charset)
+{
+	return (s == NULL) ? 0 : strlen_m_ext(s, src_charset, dst_charset) + 1;
+}
+
+/**
+ * Like strlen_m_ext_term(), except that a string for which
+ * strlen_m_ext() returns 0 is treated like NULL.
+ *
+ * @return 0 if s is NULL or strlen_m_ext() returns 0 for it,
+ *         otherwise strlen_m_ext() + 1
+ */
+_PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
+				       const charset_t src_charset,
+				       const charset_t dst_charset)
+{
+	size_t len = 0;
+
+	if (s != NULL) {
+		len = strlen_m_ext(s, src_charset, dst_charset);
+	}
+
+	/* nothing there: report 0 rather than a lone terminator */
+	return (len == 0) ? 0 : len + 1;
+}
+
+/**
+ * Calculate the number of 16-bit units that would be needed to convert
+ * the input string, which is expected to be in CH_UNIX encoding, to
+ * UTF16.
+ *
+ * This will be the same as the number of bytes in a string for single
+ * byte strings, but will be different for multibyte.
+ *
+ * @return the result of strlen_m_ext(s, CH_UNIX, CH_UTF16LE)
+ */
+_PUBLIC_ size_t strlen_m(const char *s)
+{
+	size_t units = strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
+
+	return units;
+}
+
+/**
+ * Length of a CH_UNIX string in UTF16 units, including the
+ * terminator.
+ *
+ * @return the result of strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE)
+ **/
+_PUBLIC_ size_t strlen_m_term(const char *s)
+{
+	size_t units = strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
+
+	return units;
+}
+
+/*
+ * Weird helper routine for the winreg pipe: If nothing is around, return 0,
+ * if a string is there, include the terminator.
+ *
+ * Returns the result of strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE).
+ */
+
+_PUBLIC_ size_t strlen_m_term_null(const char *s)
+{
+	size_t units = strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
+
+	return units;
+}
+
+/**
+ * Multibyte-character version of strchr.
+ *
+ * Strchr and strrchr_m are a bit complex on general multi-byte strings.
+ *
+ * @param src string to search; NULL yields NULL
+ * @param c character to look for
+ * @return pointer to the first match inside src, or NULL if none
+ **/
+_PUBLIC_ char *strchr_m(const char *src, char c)
+{
+	struct smb_iconv_handle *ic = get_iconv_handle();
+	const char *p;
+	size_t size;
+
+	if (src == NULL) {
+		return NULL;
+	}
+
+	/* characters up to 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets, so a plain
+	   byte search is enough for them */
+	if ((c & 0xC0) == 0) {
+		return strchr(src, c);
+	}
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+	p = src;
+	while (*p != '\0' && (((unsigned char)*p) & 0x80) == 0) {
+		if (*p == c) {
+			return discard_const_p(char, p);
+		}
+		p++;
+	}
+
+	/* reached the end without meeting a high-bit byte */
+	if (*p == '\0') {
+		return NULL;
+	}
+
+#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
+	/* With compose characters we must restart from the beginning. JRA. */
+	p = src;
+#endif
+
+	/* slow path: decode codepoint by codepoint */
+	for (; *p != '\0'; p += size) {
+		codepoint_t c2 = next_codepoint_handle(ic, p, &size);
+		if (c2 == c) {
+			return discard_const_p(char, p);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Multibyte-character version of strrchr
+ *
+ * @param s string to search; NULL yields NULL
+ * @param c character to look for
+ * @return pointer to the last match inside s, or NULL if none
+ */
+_PUBLIC_ char *strrchr_m(const char *s, char c)
+{
+	struct smb_iconv_handle *ic;
+	char *ret = NULL;
+	bool got_mb = false;
+	size_t i;
+
+	if (s == NULL) {
+		return NULL;
+	}
+
+	/* characters up to 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets */
+	if ((c & 0xC0) == 0) {
+		return strrchr(s, c);
+	}
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars). Also, in Samba
+	   we only search for ascii characters in 'c' and that
+	   in all mb character sets with a compound character
+	   containing c, if 'c' is not a match at position
+	   p, then p[-1] > 0x7f. JRA. */
+
+	/*
+	 * Scan backwards by index rather than by pointer, so that no
+	 * pointer to before the start of the string is ever formed.
+	 * An empty string skips the loop and returns NULL below.
+	 */
+	i = strlen(s);
+	while (i > 0) {
+		i--;
+		if (c != s[i]) {
+			continue;
+		}
+		/* Could be a match. Part of a multibyte ? */
+		if ((i > 0) && (((unsigned char)s[i - 1]) & 0x80)) {
+			/* Yep - go slow :-( */
+			got_mb = true;
+			break;
+		}
+		/* No - we have a match ! */
+		return discard_const_p(char, &s[i]);
+	}
+
+	if (!got_mb) {
+		return NULL;
+	}
+
+	ic = get_iconv_handle();
+
+	/* slow path: decode from the start and remember the last match */
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint_handle(ic, s, &size);
+		if (c2 == c) {
+			ret = discard_const_p(char, s);
+		}
+		s += size;
+	}
+
+	return ret;
+}
+
+/**
+ * Return true if any (multi-byte) character is lower case.
+ *
+ * A character counts as lower case when toupper_m() maps it to a
+ * different codepoint.
+ *
+ * @param ic iconv handle passed to next_codepoint_handle()
+ * @param string string to inspect
+ */
+_PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
+				 const char *string)
+{
+	size_t c_size;
+
+	for (; *string != '\0'; string += c_size) {
+		codepoint_t cp = next_codepoint_handle(ic, string, &c_size);
+
+		if (toupper_m(cp) != cp) {
+			return true; /* that means it has lower case chars */
+		}
+	}
+
+	return false;
+}
+
+/**
+ * Return true if any (multi-byte) character is lower case; calls
+ * strhaslower_handle() with the handle from get_iconv_handle().
+ */
+_PUBLIC_ bool strhaslower(const char *string)
+{
+	return strhaslower_handle(get_iconv_handle(), string);
+}
+
+/**
+ * Return true if any (multi-byte) character is upper case.
+ *
+ * A character counts as upper case when tolower_m() maps it to a
+ * different codepoint.
+ *
+ * @param ic iconv handle passed to next_codepoint_handle()
+ * @param string string to inspect
+ */
+_PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
+				 const char *string)
+{
+	size_t c_size;
+
+	for (; *string != '\0'; string += c_size) {
+		codepoint_t cp = next_codepoint_handle(ic, string, &c_size);
+
+		if (tolower_m(cp) != cp) {
+			return true; /* that means it has upper case chars */
+		}
+	}
+
+	return false;
+}
+
+/**
+ * Return true if any (multi-byte) character is upper case; calls
+ * strhasupper_handle() with the handle from get_iconv_handle().
+ */
+_PUBLIC_ bool strhasupper(const char *string)
+{
+	return strhasupper_handle(get_iconv_handle(), string);
+}
+
+/***********************************************************************
+ strstr_m - We convert via ucs2 for now.
+
+ Find the first occurrence of findstr in src.
+
+ src may be NULL, in which case NULL is returned. findstr must not be
+ NULL. An empty findstr matches at the start of src, and a one
+ character findstr is handed to strchr_m().
+
+ Returns a pointer into src at the start of the match, or NULL if
+ there is no match or a conversion/allocation fails.
+***********************************************************************/
+
+char *strstr_m(const char *src, const char *findstr)
+{
+	TALLOC_CTX *mem_ctx = NULL;
+	smb_ucs2_t *p;
+	smb_ucs2_t *src_w, *find_w;
+	const char *s;
+	char *s2;
+	char *retp = NULL;
+	size_t converted_size, findstr_len = 0;
+
+	/*
+	 * The short-needle paths below already give NULL for a NULL
+	 * src; make the long-needle path agree instead of crashing.
+	 */
+	if (src == NULL) {
+		return NULL;
+	}
+
+	/* for correctness */
+	if (!findstr[0]) {
+		return discard_const_p(char, src);
+	}
+
+	/* Samba does single character findstr calls a *lot*. */
+	if (findstr[1] == '\0') {
+		return strchr_m(src, *findstr);
+	}
+
+	/* We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+
+	for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
+		if (*s == *findstr) {
+			/* measure the needle only once, and only if needed */
+			if (!findstr_len) {
+				findstr_len = strlen(findstr);
+			}
+
+			if (strncmp(s, findstr, findstr_len) == 0) {
+				return discard_const_p(char, s);
+			}
+		}
+	}
+
+	/* pure ascii prefix ran to the end of src without a match */
+	if (!*s) {
+		return NULL;
+	}
+
+	/*
+	 * With compose characters we must restart from the beginning
+	 * ('make check' fails otherwise), so everything below works on
+	 * src itself and not on the scan position s.
+	 */
+
+	/*
+	 * Use get_iconv_handle() just as a non-NULL talloc ctx. In
+	 * case we leak memory, this should then be more obvious in
+	 * the talloc report.
+	 */
+	mem_ctx = talloc_new(get_iconv_handle());
+	if (mem_ctx == NULL) {
+		return NULL;
+	}
+
+	if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
+		goto done;
+	}
+
+	if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
+		goto done;
+	}
+
+	p = strstr_w(src_w, find_w);
+
+	if (!p) {
+		goto done;
+	}
+
+	/*
+	 * Cut the ucs2 copy at the match and convert that prefix back:
+	 * its byte length is the offset of the match within src.
+	 */
+	*p = 0;
+	if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
+		goto done;
+	}
+	retp = discard_const_p(char, (src+strlen(s2)));
+done:
+	TALLOC_FREE(mem_ctx);
+	return retp;
+}