diff options
Diffstat (limited to 'lib/strutil/strutilutf8.c')
-rw-r--r-- | lib/strutil/strutilutf8.c | 1519 |
1 files changed, 1519 insertions, 0 deletions
diff --git a/lib/strutil/strutilutf8.c b/lib/strutil/strutilutf8.c new file mode 100644 index 0000000..16725cb --- /dev/null +++ b/lib/strutil/strutilutf8.c @@ -0,0 +1,1519 @@ +/* + UTF-8 strings utilities + + Copyright (C) 2007-2022 + Free Software Foundation, Inc. + + Written by: + Rostislav Benes, 2007 + + This file is part of the Midnight Commander. + + The Midnight Commander is free software: you can redistribute it + and/or modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + The Midnight Commander is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> + +#include <stdlib.h> +#include <langinfo.h> +#include <limits.h> /* MB_LEN_MAX */ +#include <string.h> + +#include "lib/global.h" +#include "lib/strutil.h" + +/* using function for utf-8 from glib */ + +/*** global variables ****************************************************************************/ + +/*** file scope macro definitions ****************************************************************/ + +/*** file scope type declarations ****************************************************************/ + +struct utf8_tool +{ + char *actual; + size_t remain; + const char *checked; + int ident; + gboolean compose; +}; + +struct term_form +{ + char text[BUF_MEDIUM * MB_LEN_MAX]; + size_t width; + gboolean compose; +}; + +/*** file scope variables ************************************************************************/ + +static const char replch[] = "\xEF\xBF\xBD"; + +/* --------------------------------------------------------------------------------------------- */ +/*** file scope functions ************************************************************************/ +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_unichar_iscombiningmark (gunichar uni) +{ + GUnicodeType type; + + type = g_unichar_type (uni); + return (type == G_UNICODE_SPACING_MARK) + || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK); +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_insert_replace_char (GString * buffer) +{ + g_string_append (buffer, replch); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_is_valid_string (const char *text) +{ + return g_utf8_validate (text, -1, NULL); +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_is_valid_char (const char *ch, size_t size) +{ + switch (g_utf8_get_char_validated (ch, size)) + { + case (gunichar) (-2): + return (-2); + case (gunichar) (-1): + return (-1); + default: + return 1; + } +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_cnext_char (const char **text) +{ + (*text) = g_utf8_next_char (*text); +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_cprev_char (const char **text) +{ + (*text) = g_utf8_prev_char (*text); +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_cnext_char_safe (const char **text) +{ + if (str_utf8_is_valid_char (*text, -1) == 1) + (*text) = g_utf8_next_char (*text); + else + (*text)++; +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_cprev_char_safe (const char **text) +{ + const char *result, *t; + + result = g_utf8_prev_char (*text); + t = result; + str_utf8_cnext_char_safe (&t); + if (t == *text) + (*text) = result; + else + (*text)--; +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_fix_string (char *text) +{ + while (text[0] != '\0') + { + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))) + text = g_utf8_next_char (text); + else + { + text[0] = '?'; + text++; + } + } +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_isspace (const char *text) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + return g_unichar_isspace (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_ispunct (const char *text) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + return g_unichar_ispunct (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_isalnum (const char *text) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + return g_unichar_isalnum (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_isdigit (const char *text) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + return g_unichar_isdigit (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_isprint (const char *ch) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (ch, -1); + return g_unichar_isprint (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_iscombiningmark (const char *ch) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (ch, -1); + return str_unichar_iscombiningmark (uni); +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_cnext_noncomb_char (const char **text) +{ + int count = 0; + + while ((*text)[0] != '\0') + { + str_utf8_cnext_char_safe (text); + count++; + if (!str_utf8_iscombiningmark (*text)) + break; + } + + return count; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_cprev_noncomb_char (const char **text, const char *begin) +{ + int count = 0; + + while ((*text) != begin) + { + str_utf8_cprev_char_safe (text); + count++; + if (!str_utf8_iscombiningmark (*text)) + break; + } + + return count; +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_toupper (const char *text, char **out, size_t * remain) +{ + gunichar uni; + size_t left; + + uni = g_utf8_get_char_validated (text, -1); + if (uni == (gunichar) (-1) || uni == (gunichar) (-2)) + return FALSE; + + uni = g_unichar_toupper (uni); + left = g_unichar_to_utf8 (uni, NULL); + if (left >= *remain) + return FALSE; + + left = g_unichar_to_utf8 (uni, *out); + (*out) += left; + (*remain) -= left; + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + +static gboolean +str_utf8_tolower (const char *text, char **out, size_t * remain) +{ + gunichar uni; + size_t left; + + uni = g_utf8_get_char_validated (text, -1); + if (uni == (gunichar) (-1) || uni == (gunichar) (-2)) + return FALSE; + + uni = g_unichar_tolower (uni); + left = g_unichar_to_utf8 (uni, NULL); + if (left >= *remain) + return FALSE; + + left = g_unichar_to_utf8 (uni, *out); + (*out) += left; + (*remain) -= left; + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_length (const char *text) +{ + int result = 0; + const char *start; + const char *end; + + start = text; + while (!g_utf8_validate (start, -1, &end) && start[0] != '\0') + { + if (start != end) + result += g_utf8_strlen (start, end - start); + + result++; + start = end + 1; + } + + if (start == text) + result = g_utf8_strlen (text, -1); + else if (start[0] != '\0' && start != end) + result += g_utf8_strlen (start, end - start); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_length2 (const char *text, int size) +{ + int result = 0; + const char *start; + const char *end; + + start = text; + while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0) + { + if (start != end) + { + result += g_utf8_strlen (start, MIN (end - start, size)); + size -= end - start; + } + result += (size > 0); + size--; + start = end + 1; + } + + if (start == text) + result = g_utf8_strlen (text, size); + else if (start[0] != '\0' && start != end && size > 0) + result += g_utf8_strlen (start, MIN (end - start, size)); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_length_noncomb (const char *text) +{ + int result = 0; + const char *t = text; + + while (t[0] != '\0') + { + str_utf8_cnext_noncomb_char (&t); + result++; + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +#if 0 +static void +str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer) +{ + char *next; + + next = g_utf8_next_char (*string); + (*left) -= next - (*string); + (*string) = next; + g_string_append_c (buffer, '?'); +} +#endif + +/* --------------------------------------------------------------------------------------------- */ + +static gchar * +str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg) +{ + if (mcerror != NULL) + return g_strdup (mcerror->message); + + return g_strdup (def_msg != NULL ? def_msg : ""); +} + +/* --------------------------------------------------------------------------------------------- */ + +static estr_t +str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer) +{ + estr_t result = ESTR_SUCCESS; + + if (coder == str_cnv_not_convert) + g_string_append_len (buffer, string, size); + else + result = str_nconvert (coder, string, size, buffer); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, that makes string valid in utf8 and all characters printable + * return width of string too */ + +static const struct term_form * +str_utf8_make_make_term_form (const char *text, size_t length) +{ + static struct term_form result; + gunichar uni; + size_t left; + char *actual; + + result.text[0] = '\0'; + result.width = 0; + result.compose = FALSE; + actual = result.text; + + /* check if text start with combining character, + * add space at begin in this case */ + if (length != 0 && text[0] != '\0') + { + uni = g_utf8_get_char_validated (text, -1); + if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)) + && str_unichar_iscombiningmark (uni)) + { + actual[0] = ' '; + actual++; + result.width++; + result.compose = TRUE; + } + } + + while (length != 0 && text[0] != '\0') + { + uni = g_utf8_get_char_validated (text, -1); + if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))) + { + if (g_unichar_isprint (uni)) + { + left = g_unichar_to_utf8 (uni, actual); + actual += left; + if (str_unichar_iscombiningmark (uni)) + result.compose = TRUE; + else + { + result.width++; + if (g_unichar_iswide (uni)) + result.width++; + } + } + else + { + actual[0] = '.'; + actual++; + result.width++; + } + text = g_utf8_next_char (text); + } + else + { + text++; + /*actual[0] = '?'; */ + memcpy (actual, replch, strlen (replch)); + actual += strlen (replch); + result.width++; + } + + if (length != (size_t) (-1)) + length--; + } + actual[0] = '\0'; + + return &result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_term_form (const char *text) +{ + static char result[BUF_MEDIUM * MB_LEN_MAX]; + const struct term_form *pre_form; + + pre_form = str_utf8_make_make_term_form (text, (size_t) (-1)); + if (pre_form->compose) + { + char *composed; + + composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE); + g_strlcpy (result, composed, sizeof (result)); + g_free (composed); + } + else + g_strlcpy (result, pre_form->text, sizeof (result)); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, that copies all characters from checked to actual */ + +static gboolean +utf8_tool_copy_chars_to_end (struct utf8_tool *tool) +{ + tool->compose = FALSE; + + while (tool->checked[0] != '\0') + { + gunichar uni; + size_t left; + + uni = g_utf8_get_char (tool->checked); + tool->compose = tool->compose || str_unichar_iscombiningmark (uni); + left = g_unichar_to_utf8 (uni, NULL); + if (tool->remain <= left) + return FALSE; + left = g_unichar_to_utf8 (uni, tool->actual); + tool->actual += left; + tool->remain -= left; + tool->checked = g_utf8_next_char (tool->checked); + } + + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, that copies characters from checked to actual until ident is + * smaller than to_ident */ + +static gboolean +utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident) +{ + tool->compose = FALSE; + + while (tool->checked[0] != '\0') + { + gunichar uni; + size_t left; + int w = 0; + + uni = g_utf8_get_char (tool->checked); + if (str_unichar_iscombiningmark (uni)) + tool->compose = TRUE; + else + { + w = 1; + if (g_unichar_iswide (uni)) + w++; + if (tool->ident + w > to_ident) + return TRUE; + } + + left = g_unichar_to_utf8 (uni, NULL); + if (tool->remain <= left) + return FALSE; + left = g_unichar_to_utf8 (uni, tool->actual); + tool->actual += left; + tool->remain -= left; + tool->checked = g_utf8_next_char (tool->checked); + tool->ident += w; + } + + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, adds count spaces to actual */ + +static int +utf8_tool_insert_space (struct utf8_tool *tool, int count) +{ + if (count <= 0) + return 1; + if (tool->remain <= (gsize) count) + return 0; + + memset (tool->actual, ' ', count); + tool->actual += count; + tool->remain -= count; + return 1; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, adds one characters to actual */ + +static int +utf8_tool_insert_char (struct utf8_tool *tool, char ch) +{ + if (tool->remain <= 1) + return 0; + + tool->actual[0] = ch; + tool->actual++; + tool->remain--; + return 1; +} + +/* --------------------------------------------------------------------------------------------- */ +/* utility function, thah skips characters from checked until ident is greater or + * equal to to_ident */ + +static gboolean +utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident) +{ + gunichar uni; + + while (to_ident > tool->ident && tool->checked[0] != '\0') + { + uni = g_utf8_get_char (tool->checked); + if (!str_unichar_iscombiningmark (uni)) + { + tool->ident++; + if (g_unichar_iswide (uni)) + tool->ident++; + } + tool->checked = g_utf8_next_char (tool->checked); + } + + uni = g_utf8_get_char (tool->checked); + while (str_unichar_iscombiningmark (uni)) + { + tool->checked = g_utf8_next_char (tool->checked); + uni = g_utf8_get_char (tool->checked); + } + + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +utf8_tool_compose (char *buffer, size_t size) +{ + char *composed; + + composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE); + g_strlcpy (buffer, composed, size); + g_free (composed); +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode) +{ + static char result[BUF_MEDIUM * MB_LEN_MAX]; + const struct term_form *pre_form; + struct utf8_tool tool; + + pre_form = str_utf8_make_make_term_form (text, (size_t) (-1)); + tool.checked = pre_form->text; + tool.actual = result; + tool.remain = sizeof (result); + tool.compose = FALSE; + + if (pre_form->width <= (gsize) width) + { + switch (HIDE_FIT (just_mode)) + { + case J_CENTER_LEFT: + case J_CENTER: + tool.ident = (width - pre_form->width) / 2; + break; + case J_RIGHT: + tool.ident = width - pre_form->width; + break; + default: + tool.ident = 0; + break; + } + + utf8_tool_insert_space (&tool, tool.ident); + utf8_tool_copy_chars_to_end (&tool); + utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident); + } + else if (IS_FIT (just_mode)) + { + tool.ident = 0; + utf8_tool_copy_chars_to (&tool, width / 2); + utf8_tool_insert_char (&tool, '~'); + + tool.ident = 0; + utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1); + utf8_tool_copy_chars_to_end (&tool); + utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1)); + } + else + { + switch (HIDE_FIT (just_mode)) + { + case J_CENTER: + tool.ident = (width - pre_form->width) / 2; + break; + case J_RIGHT: + tool.ident = width - pre_form->width; + break; + default: + tool.ident = 0; + break; + } + + utf8_tool_skip_chars_to (&tool, 0); + utf8_tool_insert_space (&tool, tool.ident); + utf8_tool_copy_chars_to (&tool, width); + utf8_tool_insert_space (&tool, width - tool.ident); + } + + tool.actual[0] = '\0'; + if (tool.compose) + utf8_tool_compose (result, sizeof (result)); + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_term_trim (const char *text, int width) +{ + static char result[BUF_MEDIUM * MB_LEN_MAX]; + const struct term_form *pre_form; + struct utf8_tool tool; + + if (width < 1) + { + result[0] = '\0'; + return result; + } + + pre_form = str_utf8_make_make_term_form (text, (size_t) (-1)); + + tool.checked = pre_form->text; + tool.actual = result; + tool.remain = sizeof (result); + tool.compose = FALSE; + + if ((gsize) width >= pre_form->width) + utf8_tool_copy_chars_to_end (&tool); + else if (width <= 3) + { + memset (tool.actual, '.', width); + tool.actual += width; + tool.remain -= width; + } + else + { + memset (tool.actual, '.', 3); + tool.actual += 3; + tool.remain -= 3; + + tool.ident = 0; + utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3); + utf8_tool_copy_chars_to_end (&tool); + } + + tool.actual[0] = '\0'; + if (tool.compose) + utf8_tool_compose (result, sizeof (result)); + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_term_width2 (const char *text, size_t length) +{ + const struct term_form *result; + + result = str_utf8_make_make_term_form (text, length); + return result->width; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_term_width1 (const char *text) +{ + return str_utf8_term_width2 (text, (size_t) (-1)); +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_term_char_width (const char *text) +{ + gunichar uni; + + uni = g_utf8_get_char_validated (text, -1); + return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1); +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_term_substring (const char *text, int start, int width) +{ + static char result[BUF_MEDIUM * MB_LEN_MAX]; + const struct term_form *pre_form; + struct utf8_tool tool; + + pre_form = str_utf8_make_make_term_form (text, (size_t) (-1)); + + tool.checked = pre_form->text; + tool.actual = result; + tool.remain = sizeof (result); + tool.compose = FALSE; + + tool.ident = -start; + utf8_tool_skip_chars_to (&tool, 0); + if (tool.ident < 0) + tool.ident = 0; + utf8_tool_insert_space (&tool, tool.ident); + + utf8_tool_copy_chars_to (&tool, width); + utf8_tool_insert_space (&tool, width - tool.ident); + + tool.actual[0] = '\0'; + if (tool.compose) + utf8_tool_compose (result, sizeof (result)); + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_trunc (const char *text, int width) +{ + static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2]; + const struct term_form *pre_form; + struct utf8_tool tool; + + pre_form = str_utf8_make_make_term_form (text, (size_t) (-1)); + + tool.checked = pre_form->text; + tool.actual = result; + tool.remain = sizeof (result); + tool.compose = FALSE; + + if (pre_form->width <= (gsize) width) + utf8_tool_copy_chars_to_end (&tool); + else + { + tool.ident = 0; + utf8_tool_copy_chars_to (&tool, width / 2); + utf8_tool_insert_char (&tool, '~'); + + tool.ident = 0; + utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1); + utf8_tool_copy_chars_to_end (&tool); + } + + tool.actual[0] = '\0'; + if (tool.compose) + utf8_tool_compose (result, sizeof (result)); + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_offset_to_pos (const char *text, size_t length) +{ + if (str_utf8_is_valid_string (text)) + return g_utf8_offset_to_pointer (text, length) - text; + else + { + int result; + GString *buffer; + + buffer = g_string_new (text); + str_utf8_fix_string (buffer->str); + result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str; + g_string_free (buffer, TRUE); + return result; + } +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_column_to_pos (const char *text, size_t pos) +{ + int result = 0; + int width = 0; + + while (text[0] != '\0') + { + gunichar uni; + + uni = g_utf8_get_char_validated (text, MB_LEN_MAX); + if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))) + { + if (g_unichar_isprint (uni)) + { + if (!str_unichar_iscombiningmark (uni)) + { + width++; + if (g_unichar_iswide (uni)) + width++; + } + } + else + { + width++; + } + text = g_utf8_next_char (text); + } + else + { + text++; + width++; + } + + if ((gsize) width > pos) + return result; + + result++; + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static char * +str_utf8_create_search_needle (const char *needle, gboolean case_sen) +{ + char *fold, *result; + + if (needle == NULL) + return NULL; + + if (case_sen) + return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL); + + fold = g_utf8_casefold (needle, -1); + result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL); + g_free (fold); + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_release_search_needle (char *needle, gboolean case_sen) +{ + (void) case_sen; + g_free (needle); +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_search_first (const char *text, const char *search, gboolean case_sen) +{ + char *fold_text; + char *deco_text; + const char *match; + const char *result = NULL; + const char *m; + + fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1); + deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL); + + match = deco_text; + do + { + match = g_strstr_len (match, -1, search); + if (match != NULL) + { + if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) && + !str_utf8_iscombiningmark (match + strlen (search))) + { + result = text; + m = deco_text; + while (m < match) + { + str_utf8_cnext_noncomb_char (&m); + str_utf8_cnext_noncomb_char (&result); + } + } + else + str_utf8_cnext_char (&match); + } + } + while (match != NULL && result == NULL); + + g_free (deco_text); + if (!case_sen) + g_free (fold_text); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static const char * +str_utf8_search_last (const char *text, const char *search, gboolean case_sen) +{ + char *fold_text; + char *deco_text; + char *match; + const char *result = NULL; + const char *m; + + fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1); + deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL); + + do + { + match = g_strrstr_len (deco_text, -1, search); + if (match != NULL) + { + if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) && + !str_utf8_iscombiningmark (match + strlen (search))) + { + result = text; + m = deco_text; + while (m < match) + { + str_utf8_cnext_noncomb_char (&m); + str_utf8_cnext_noncomb_char (&result); + } + } + else + match[0] = '\0'; + } + } + while (match != NULL && result == NULL); + + g_free (deco_text); + if (!case_sen) + g_free (fold_text); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static char * +str_utf8_normalize (const char *text) +{ + GString *fixed; + char *tmp; + char *result; + const char *start; + const char *end; + + /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4, + * does the normalization and then converts UCS-4 back into UTF-8. + * Since file names are composed of ASCII characters in most cases, we can speed up + * utf8 normalization by checking if the heavyweight Unicode normalization is actually + * needed. Normalization of ASCII string is no-op. + */ + + /* find out whether text is ASCII only */ + for (end = text; *end != '\0'; end++) + if ((*end & 0x80) != 0) + { + /* found 2nd byte of utf8-encoded symbol */ + break; + } + + /* if text is ASCII-only, return copy, normalize otherwise */ + if (*end == '\0') + return g_strndup (text, end - text); + + fixed = g_string_sized_new (4); + + start = text; + while (!g_utf8_validate (start, -1, &end) && start[0] != '\0') + { + if (start != end) + { + tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL); + g_string_append (fixed, tmp); + g_free (tmp); + } + g_string_append_c (fixed, end[0]); + start = end + 1; + } + + if (start == text) + { + result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL); + g_string_free (fixed, TRUE); + } + else + { + if (start[0] != '\0' && start != end) + { + tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL); + g_string_append (fixed, tmp); + g_free (tmp); + } + result = g_string_free (fixed, FALSE); + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static char * +str_utf8_casefold_normalize (const char *text) +{ + GString *fixed; + char *tmp, *fold; + char *result; + const char *start; + const char *end; + + fixed = g_string_sized_new (4); + + start = text; + while (!g_utf8_validate (start, -1, &end) && start[0] != '\0') + { + if (start != end) + { + fold = g_utf8_casefold (start, end - start); + tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL); + g_string_append (fixed, tmp); + g_free (tmp); + g_free (fold); + } + g_string_append_c (fixed, end[0]); + start = end + 1; + } + + if (start == text) + { + fold = g_utf8_casefold (text, -1); + result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL); + g_free (fold); + g_string_free (fixed, TRUE); + } + else + { + if (start[0] != '\0' && start != end) + { + fold = g_utf8_casefold (start, end - start); + tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL); + g_string_append (fixed, tmp); + g_free (tmp); + g_free (fold); + } + result = g_string_free (fixed, FALSE); + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_compare (const char *t1, const char *t2) +{ + char *n1, *n2; + int result; + + n1 = str_utf8_normalize (t1); + n2 = str_utf8_normalize (t2); + + result = strcmp (n1, n2); + + g_free (n1); + g_free (n2); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_ncompare (const char *t1, const char *t2) +{ + char *n1, *n2; + size_t l1, l2; + int result; + + n1 = str_utf8_normalize (t1); + n2 = str_utf8_normalize (t2); + + l1 = strlen (n1); + l2 = strlen (n2); + result = strncmp (n1, n2, MIN (l1, l2)); + + g_free (n1); + g_free (n2); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_casecmp (const char *t1, const char *t2) +{ + char *n1, *n2; + int result; + + n1 = str_utf8_casefold_normalize (t1); + n2 = str_utf8_casefold_normalize (t2); + + result = strcmp (n1, n2); + + g_free (n1); + g_free (n2); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_ncasecmp (const char *t1, const char *t2) +{ + char *n1, *n2; + size_t l1, l2; + int result; + + n1 = str_utf8_casefold_normalize (t1); + n2 = str_utf8_casefold_normalize (t2); + + l1 = strlen (n1); + l2 = strlen (n2); + result = strncmp (n1, n2, MIN (l1, l2)); + + g_free (n1); + g_free (n2); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_prefix (const char *text, const char *prefix) +{ + char *t, *p; + const char *nt, *np; + const char *nnt, *nnp; + int result; + + t = str_utf8_normalize (text); + p = str_utf8_normalize (prefix); + nt = t; + np = p; + nnt = t; + nnp = p; + + while (nt[0] != '\0' && np[0] != '\0') + { + str_utf8_cnext_char_safe (&nnt); + str_utf8_cnext_char_safe (&nnp); + if (nnt - nt != nnp - np) + break; + if (strncmp (nt, np, nnt - nt) != 0) + break; + nt = nnt; + np = nnp; + } + + result = np - p; + + g_free (t); + g_free (p); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_caseprefix (const char *text, const char *prefix) +{ + char *t, *p; + const char *nt, *np; + const char *nnt, *nnp; + int result; + + t = str_utf8_casefold_normalize (text); + p = str_utf8_casefold_normalize (prefix); + nt = t; + np = p; + nnt = t; + nnp = p; + + while (nt[0] != '\0' && np[0] != '\0') + { + str_utf8_cnext_char_safe (&nnt); + str_utf8_cnext_char_safe (&nnp); + if (nnt - nt != nnp - np) + break; + if (strncmp (nt, np, nnt - nt) != 0) + break; + nt = nnt; + np = nnp; + } + + result = np - p; + + g_free (t); + g_free (p); + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static char * +str_utf8_create_key_gen (const char *text, gboolean case_sen, + gchar * (*keygen) (const gchar * text, gssize size)) +{ + char *result; + + if (case_sen) + result = str_utf8_normalize (text); + else + { + gboolean dot; + GString *fixed; + const char *start, *end; + char *fold, *key; + + dot = text[0] == '.'; + fixed = g_string_sized_new (16); + + if (!dot) + start = text; + else + { + start = text + 1; + g_string_append_c (fixed, '.'); + } + + while (!g_utf8_validate (start, -1, &end) && start[0] != '\0') + { + if (start != end) + { + fold = g_utf8_casefold (start, end - start); + key = keygen (fold, -1); + g_string_append (fixed, key); + g_free (key); + g_free (fold); + } + g_string_append_c (fixed, end[0]); + start = end + 1; + } + + if (start == text) + { + fold = g_utf8_casefold (start, -1); + result = keygen (fold, -1); + g_free (fold); + g_string_free (fixed, TRUE); + } + else if (dot && (start == text + 1)) + { + fold = g_utf8_casefold (start, -1); + key = keygen (fold, -1); + g_string_append (fixed, key); + g_free (key); + g_free (fold); + result = g_string_free (fixed, FALSE); + } + else + { + if (start[0] != '\0' && start != end) + { + fold = g_utf8_casefold (start, end - start); + key = keygen (fold, -1); + g_string_append (fixed, key); + g_free (key); + g_free (fold); + } + result = g_string_free (fixed, FALSE); + } + } + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +static char * +str_utf8_create_key (const char *text, gboolean case_sen) +{ + return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key); +} + +/* --------------------------------------------------------------------------------------------- */ + +#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME +static char * +str_utf8_create_key_for_filename (const char *text, gboolean case_sen) +{ + return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename); +} +#endif + +/* --------------------------------------------------------------------------------------------- */ + +static int +str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen) +{ + (void) case_sen; + return strcmp (t1, t2); +} + +/* --------------------------------------------------------------------------------------------- */ + +static void +str_utf8_release_key (char *key, gboolean case_sen) +{ + (void) case_sen; + g_free (key); +} + +/* --------------------------------------------------------------------------------------------- */ +/*** public functions ****************************************************************************/ +/* --------------------------------------------------------------------------------------------- */ + +struct str_class +str_utf8_init (void) +{ + struct str_class result; + + result.conv_gerror_message = str_utf8_conv_gerror_message; + result.vfs_convert_to = str_utf8_vfs_convert_to; + result.insert_replace_char = str_utf8_insert_replace_char; + result.is_valid_string = str_utf8_is_valid_string; + result.is_valid_char = str_utf8_is_valid_char; + result.cnext_char = str_utf8_cnext_char; + result.cprev_char = str_utf8_cprev_char; + result.cnext_char_safe = str_utf8_cnext_char_safe; + result.cprev_char_safe = str_utf8_cprev_char_safe; + result.cnext_noncomb_char = str_utf8_cnext_noncomb_char; + result.cprev_noncomb_char = str_utf8_cprev_noncomb_char; + result.char_isspace = str_utf8_isspace; + result.char_ispunct = str_utf8_ispunct; + result.char_isalnum = str_utf8_isalnum; + result.char_isdigit = str_utf8_isdigit; + result.char_isprint = str_utf8_isprint; + result.char_iscombiningmark = str_utf8_iscombiningmark; + result.char_toupper = str_utf8_toupper; + result.char_tolower = str_utf8_tolower; + result.length = str_utf8_length; + result.length2 = str_utf8_length2; + result.length_noncomb = str_utf8_length_noncomb; + result.fix_string = str_utf8_fix_string; + result.term_form = str_utf8_term_form; + result.fit_to_term = str_utf8_fit_to_term; + result.term_trim = str_utf8_term_trim; + result.term_width2 = str_utf8_term_width2; + result.term_width1 = str_utf8_term_width1; + result.term_char_width = str_utf8_term_char_width; + result.term_substring = str_utf8_term_substring; + result.trunc = str_utf8_trunc; + result.offset_to_pos = str_utf8_offset_to_pos; + result.column_to_pos = str_utf8_column_to_pos; + result.create_search_needle = str_utf8_create_search_needle; + result.release_search_needle = str_utf8_release_search_needle; + result.search_first = str_utf8_search_first; + result.search_last = str_utf8_search_last; + result.compare = str_utf8_compare; + result.ncompare = str_utf8_ncompare; + result.casecmp = str_utf8_casecmp; + result.ncasecmp = str_utf8_ncasecmp; + result.prefix = str_utf8_prefix; + result.caseprefix = str_utf8_caseprefix; + result.create_key = str_utf8_create_key; +#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME + /* case insensitive sort files in "a1 a2 a10" order */ + result.create_key_for_filename = str_utf8_create_key_for_filename; +#else + /* case insensitive sort files in "a1 a10 a2" order */ + result.create_key_for_filename = str_utf8_create_key; +#endif + result.key_collate = str_utf8_key_collate; + result.release_key = str_utf8_release_key; + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ |