diff options
Diffstat (limited to '')
-rw-r--r-- | lib/util/charset/util_str.c | 607 |
1 files changed, 607 insertions, 0 deletions
diff --git a/lib/util/charset/util_str.c b/lib/util/charset/util_str.c new file mode 100644 index 0000000..2a4ccd9 --- /dev/null +++ b/lib/util/charset/util_str.c @@ -0,0 +1,607 @@ +/* + Unix SMB/CIFS implementation. + Samba utility functions + Copyright (C) Andrew Tridgell 1992-2001 + Copyright (C) Simo Sorce 2001 + Copyright (C) Andrew Bartlett 2011 + Copyright (C) Jeremy Allison 1992-2007 + Copyright (C) Martin Pool 2003 + Copyright (C) James Peach 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/locale.h" +#include "charset.h" +#include "lib/util/fault.h" + +#ifdef strcasecmp +#undef strcasecmp +#endif +#ifdef strncasecmp +#undef strncasecmp +#endif + + +/** + Case insensitive string compararison, handle specified for testing +**/ +_PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle, + const char *s1, const char *s2) +{ + codepoint_t c1=0, c2=0; + codepoint_t u1=0, u2=0; + codepoint_t l1=0, l2=0; + size_t size1, size2; + + /* handle null ptr comparisons to simplify the use in qsort */ + if (s1 == s2) return 0; + if (s1 == NULL) return -1; + if (s2 == NULL) return 1; + + while (*s1 && *s2) { + c1 = next_codepoint_handle(iconv_handle, s1, &size1); + c2 = next_codepoint_handle(iconv_handle, s2, &size2); + + if (c1 == INVALID_CODEPOINT || + c2 == INVALID_CODEPOINT) { + return strcasecmp(s1, s2); + } + + s1 += size1; + s2 += size2; + + if (c1 == c2) { + continue; + } + + u1 = toupper_m(c1); + u2 = toupper_m(c2); + if (u1 == u2) { + continue; + } + + l1 = tolower_m(c1); + l2 = tolower_m(c2); + if (l1 == l2) { + continue; + } + + return l1 - l2; + } + + return *s1 - *s2; +} + +/** + Case insensitive string compararison +**/ +_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2) +{ + struct smb_iconv_handle *iconv_handle = get_iconv_handle(); + return strcasecmp_m_handle(iconv_handle, s1, s2); +} + +/** + Case insensitive string compararison, length limited, handle specified for testing +**/ +_PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle, + const char *s1, const char *s2, size_t n) +{ + codepoint_t c1=0, c2=0; + codepoint_t u1=0, u2=0; + codepoint_t l1=0, l2=0; + size_t size1, size2; + + /* handle null ptr comparisons to simplify the use in qsort */ + if (s1 == s2) return 0; + if (s1 == NULL) return -1; + if (s2 == NULL) return 1; + + while (*s1 && *s2 && n) { + n--; + + c1 = next_codepoint_handle(iconv_handle, s1, &size1); + c2 = next_codepoint_handle(iconv_handle, s2, &size2); + + if (c1 == INVALID_CODEPOINT || + c2 == INVALID_CODEPOINT) { + /* + * n was specified in characters, + * now we must convert it to bytes. + * As bytes are the smallest + * character unit, the following + * increment and strncasecmp is always + * safe. + * + * The source string was already known + * to be n characters long, so we are + * guaranteed to be able to look at the + * (n remaining + size1) bytes from the + * s1 position). + */ + n += size1; + return strncasecmp(s1, s2, n); + } + + s1 += size1; + s2 += size2; + + if (c1 == c2) { + continue; + } + + u1 = toupper_m(c1); + u2 = toupper_m(c2); + if (u1 == u2) { + continue; + } + + l1 = tolower_m(c1); + l2 = tolower_m(c2); + if (l1 == l2) { + continue; + } + + return l1 - l2; + } + + if (n == 0) { + return 0; + } + + return *s1 - *s2; +} + +/** + Case insensitive string compararison, length limited +**/ +_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n) +{ + struct smb_iconv_handle *iconv_handle = get_iconv_handle(); + return strncasecmp_m_handle(iconv_handle, s1, s2, n); +} + +/** + * Compare 2 strings. + * + * @note The comparison is case-insensitive. + **/ +_PUBLIC_ bool strequal_m(const char *s1, const char *s2) +{ + return strcasecmp_m(s1,s2) == 0; +} + +/** + Compare 2 strings (case sensitive). +**/ +_PUBLIC_ bool strcsequal(const char *s1,const char *s2) +{ + if (s1 == s2) + return true; + if (!s1 || !s2) + return false; + + return strcmp(s1,s2) == 0; +} + +/** + * Calculate the number of units (8 or 16-bit, depending on the + * destination charset), that would be needed to convert the input + * string which is expected to be in in src_charset encoding to the + * destination charset (which should be a unicode charset). + */ +_PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic, + const char *s, charset_t src_charset, charset_t dst_charset) +{ + size_t count = 0; + +#ifdef DEVELOPER + switch (dst_charset) { + case CH_DOS: + case CH_UNIX: + smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)"); + default: + break; + } + + switch (src_charset) { + case CH_UTF16LE: + case CH_UTF16BE: + smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)"); + default: + break; + } +#endif + if (!s) { + return 0; + } + + while (*s && !(((uint8_t)*s) & 0x80)) { + s++; + count++; + } + + if (!*s) { + return count; + } + + while (*s) { + size_t c_size; + codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5), + src_charset, &c_size); + s += c_size; + + switch (dst_charset) { + case CH_UTF16LE: + case CH_UTF16BE: + case CH_UTF16MUNGED: + if (c < 0x10000) { + /* Unicode char fits into 16 bits. */ + count += 1; + } else { + /* Double-width unicode char - 32 bits. */ + count += 2; + } + break; + case CH_UTF8: + /* + * this only checks ranges, and does not + * check for invalid codepoints + */ + if (c < 0x80) { + count += 1; + } else if (c < 0x800) { + count += 2; + } else if (c < 0x10000) { + count += 3; + } else { + count += 4; + } + break; + default: + /* + * non-unicode encoding: + * assume that each codepoint fits into + * one unit in the destination encoding. + */ + count += 1; + } + } + + return count; +} + +/** + * Calculate the number of units (8 or 16-bit, depending on the + * destination charset), that would be needed to convert the input + * string which is expected to be in in src_charset encoding to the + * destination charset (which should be a unicode charset). + */ +_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset) +{ + struct smb_iconv_handle *ic = get_iconv_handle(); + return strlen_m_ext_handle(ic, s, src_charset, dst_charset); +} + +_PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset, + const charset_t dst_charset) +{ + if (!s) { + return 0; + } + return strlen_m_ext(s, src_charset, dst_charset) + 1; +} + +_PUBLIC_ size_t strlen_m_ext_term_null(const char *s, + const charset_t src_charset, + const charset_t dst_charset) +{ + size_t len; + if (!s) { + return 0; + } + len = strlen_m_ext(s, src_charset, dst_charset); + if (len == 0) { + return 0; + } + + return len+1; +} + +/** + * Calculate the number of 16-bit units that would be needed to convert + * the input string which is expected to be in CH_UNIX encoding to UTF16. + * + * This will be the same as the number of bytes in a string for single + * byte strings, but will be different for multibyte. + */ +_PUBLIC_ size_t strlen_m(const char *s) +{ + return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); +} + +/** + Work out the number of multibyte chars in a string, including the NULL + terminator. +**/ +_PUBLIC_ size_t strlen_m_term(const char *s) +{ + return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE); +} + +/* + * Weird helper routine for the winreg pipe: If nothing is around, return 0, + * if a string is there, include the terminator. + */ + +_PUBLIC_ size_t strlen_m_term_null(const char *s) +{ + return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE); +} + +/** + Strchr and strrchr_m are a bit complex on general multi-byte strings. +**/ +_PUBLIC_ char *strchr_m(const char *src, char c) +{ + const char *s; + struct smb_iconv_handle *ic = get_iconv_handle(); + if (src == NULL) { + return NULL; + } + /* characters below 0x3F are guaranteed to not appear in + non-initial position in multi-byte charsets */ + if ((c & 0xC0) == 0) { + return strchr(src, c); + } + + /* this is quite a common operation, so we want it to be + fast. We optimise for the ascii case, knowing that all our + supported multi-byte character sets are ascii-compatible + (ie. they match for the first 128 chars) */ + + for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) { + if (*s == c) + return discard_const_p(char, s); + } + + if (!*s) + return NULL; + +#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS + /* With compose characters we must restart from the beginning. JRA. */ + s = src; +#endif + + while (*s) { + size_t size; + codepoint_t c2 = next_codepoint_handle(ic, s, &size); + if (c2 == c) { + return discard_const_p(char, s); + } + s += size; + } + + return NULL; +} + +/** + * Multibyte-character version of strrchr + */ +_PUBLIC_ char *strrchr_m(const char *s, char c) +{ + struct smb_iconv_handle *ic; + char *ret = NULL; + + if (s == NULL) { + return NULL; + } + + /* characters below 0x3F are guaranteed to not appear in + non-initial position in multi-byte charsets */ + if ((c & 0xC0) == 0) { + return strrchr(s, c); + } + + /* this is quite a common operation, so we want it to be + fast. We optimise for the ascii case, knowing that all our + supported multi-byte character sets are ascii-compatible + (ie. they match for the first 128 chars). Also, in Samba + we only search for ascii characters in 'c' and that + in all mb character sets with a compound character + containing c, if 'c' is not a match at position + p, then p[-1] > 0x7f. JRA. */ + + { + size_t len = strlen(s); + const char *cp = s; + bool got_mb = false; + + if (len == 0) + return NULL; + cp += (len - 1); + do { + if (c == *cp) { + /* Could be a match. Part of a multibyte ? */ + if ((cp > s) && + (((unsigned char)cp[-1]) & 0x80)) { + /* Yep - go slow :-( */ + got_mb = true; + break; + } + /* No - we have a match ! */ + return discard_const_p(char , cp); + } + } while (cp-- != s); + if (!got_mb) + return NULL; + } + + ic = get_iconv_handle(); + + while (*s) { + size_t size; + codepoint_t c2 = next_codepoint_handle(ic, s, &size); + if (c2 == c) { + ret = discard_const_p(char, s); + } + s += size; + } + + return ret; +} + +/** + return True if any (multi-byte) character is lower case +*/ +_PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic, + const char *string) +{ + while (*string) { + size_t c_size; + codepoint_t s; + codepoint_t t; + + s = next_codepoint_handle(ic, string, &c_size); + string += c_size; + + t = toupper_m(s); + + if (s != t) { + return true; /* that means it has lower case chars */ + } + } + + return false; +} + +_PUBLIC_ bool strhaslower(const char *string) +{ + struct smb_iconv_handle *ic = get_iconv_handle(); + return strhaslower_handle(ic, string); +} + +/** + return True if any (multi-byte) character is upper case +*/ +_PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic, + const char *string) +{ + while (*string) { + size_t c_size; + codepoint_t s; + codepoint_t t; + + s = next_codepoint_handle(ic, string, &c_size); + string += c_size; + + t = tolower_m(s); + + if (s != t) { + return true; /* that means it has upper case chars */ + } + } + + return false; +} + +_PUBLIC_ bool strhasupper(const char *string) +{ + struct smb_iconv_handle *ic = get_iconv_handle(); + return strhasupper_handle(ic, string); +} + +/*********************************************************************** + strstr_m - We convert via ucs2 for now. +***********************************************************************/ + +char *strstr_m(const char *src, const char *findstr) +{ + TALLOC_CTX *mem_ctx = NULL; + smb_ucs2_t *p; + smb_ucs2_t *src_w, *find_w; + const char *s; + char *s2; + char *retp = NULL; + size_t converted_size, findstr_len = 0; + + /* for correctness */ + if (!findstr[0]) { + return discard_const_p(char, src); + } + + /* Samba does single character findstr calls a *lot*. */ + if (findstr[1] == '\0') + return strchr_m(src, *findstr); + + /* We optimise for the ascii case, knowing that all our + supported multi-byte character sets are ascii-compatible + (ie. they match for the first 128 chars) */ + + for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) { + if (*s == *findstr) { + if (!findstr_len) + findstr_len = strlen(findstr); + + if (strncmp(s, findstr, findstr_len) == 0) { + return discard_const_p(char, s); + } + } + } + + if (!*s) + return NULL; + +#if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */ + /* 'make check' fails unless we do this */ + + /* With compose characters we must restart from the beginning. JRA. */ + s = src; +#endif + + /* + * Use get_iconv_handle() just as a non-NULL talloc ctx. In + * case we leak memory, this should then be more obvious in + * the talloc report. + */ + mem_ctx = talloc_new(get_iconv_handle()); + if (mem_ctx == NULL) { + return NULL; + } + + if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) { + goto done; + } + + if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) { + goto done; + } + + p = strstr_w(src_w, find_w); + + if (!p) { + goto done; + } + + *p = 0; + if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) { + goto done; + } + retp = discard_const_p(char, (s+strlen(s2))); +done: + TALLOC_FREE(mem_ctx); + return retp; +} |