diff options
Diffstat (limited to 'src/grep/lib/propername.c')
-rw-r--r-- | src/grep/lib/propername.c | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/src/grep/lib/propername.c b/src/grep/lib/propername.c new file mode 100644 index 0000000..561566b --- /dev/null +++ b/src/grep/lib/propername.c @@ -0,0 +1,318 @@ +/* Localization of proper names. + Copyright (C) 2006-2021 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2006. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that + the proper_name function might be candidate for attribute 'const' */ +#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__ +# pragma GCC diagnostic ignored "-Wsuggest-attribute=const" +#endif + +#include <config.h> + +/* Specification. */ +#include "propername.h" + +#include <ctype.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#if HAVE_ICONV +# include <iconv.h> +#endif + +#include "trim.h" +#include "mbchar.h" +#include "mbuiter.h" +#include "localcharset.h" +#include "c-strcase.h" +#include "xstriconv.h" +#include "xalloc.h" +#include "gettext.h" + + +/* Tests whether STRING contains trim (SUB), starting and ending at word + boundaries. + Here, instead of implementing Unicode Standard Annex #29 for determining + word boundaries, we assume that trim (SUB) starts and ends with words and + only test whether the part before it ends with a non-word and the part + after it starts with a non-word. */ +static bool +mbsstr_trimmed_wordbounded (const char *string, const char *sub) +{ + char *tsub = trim (sub); + bool found = false; + + for (; *string != '\0';) + { + const char *tsub_in_string = mbsstr (string, tsub); + if (tsub_in_string == NULL) + break; + else + { + if (MB_CUR_MAX > 1) + { + mbui_iterator_t string_iter; + bool word_boundary_before; + bool word_boundary_after; + + mbui_init (string_iter, string); + word_boundary_before = true; + if (mbui_cur_ptr (string_iter) < tsub_in_string) + { + mbchar_t last_char_before_tsub; + do + { + if (!mbui_avail (string_iter)) + abort (); + last_char_before_tsub = mbui_cur (string_iter); + mbui_advance (string_iter); + } + while (mbui_cur_ptr (string_iter) < tsub_in_string); + if (mb_isalnum (last_char_before_tsub)) + word_boundary_before = false; + } + + mbui_init (string_iter, tsub_in_string); + { + mbui_iterator_t tsub_iter; + + for (mbui_init (tsub_iter, tsub); + mbui_avail (tsub_iter); + mbui_advance (tsub_iter)) + { + if (!mbui_avail (string_iter)) + abort (); + mbui_advance (string_iter); + } + } + word_boundary_after = true; + if (mbui_avail (string_iter)) + { + mbchar_t first_char_after_tsub = mbui_cur (string_iter); + if (mb_isalnum (first_char_after_tsub)) + word_boundary_after = false; + } + + if (word_boundary_before && word_boundary_after) + { + found = true; + break; + } + + mbui_init (string_iter, tsub_in_string); + if (!mbui_avail (string_iter)) + break; + string = tsub_in_string + mb_len (mbui_cur (string_iter)); + } + else + { + bool word_boundary_before; + const char *p; + bool word_boundary_after; + + word_boundary_before = true; + if (string < tsub_in_string) + if (isalnum ((unsigned char) tsub_in_string[-1])) + word_boundary_before = false; + + p = tsub_in_string + strlen (tsub); + word_boundary_after = true; + if (*p != '\0') + if (isalnum ((unsigned char) *p)) + word_boundary_after = false; + + if (word_boundary_before && word_boundary_after) + { + found = true; + break; + } + + if (*tsub_in_string == '\0') + break; + string = tsub_in_string + 1; + } + } + } + free (tsub); + return found; +} + +/* Return the localization of NAME. NAME is written in ASCII. */ + +const char * +proper_name (const char *name) +{ + /* See whether there is a translation. */ + const char *translation = gettext (name); + + if (translation != name) + { + /* See whether the translation contains the original name. */ + if (mbsstr_trimmed_wordbounded (translation, name)) + return translation; + else + { + /* Return "TRANSLATION (NAME)". */ + char *result = + XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); + + sprintf (result, "%s (%s)", translation, name); + return result; + } + } + else + return name; +} + +/* Return the localization of a name whose original writing is not ASCII. + NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal + escape sequences. NAME_ASCII is a fallback written only with ASCII + characters. */ + +const char * +proper_name_utf8 (const char *name_ascii, const char *name_utf8) +{ + /* See whether there is a translation. */ + const char *translation = gettext (name_ascii); + + /* Try to convert NAME_UTF8 to the locale encoding. */ + const char *locale_code = locale_charset (); + char *alloc_name_converted = NULL; + char *alloc_name_converted_translit = NULL; + const char *name_converted = NULL; + const char *name_converted_translit = NULL; + const char *name; + + if (c_strcasecmp (locale_code, "UTF-8") != 0) + { +#if HAVE_ICONV + name_converted = alloc_name_converted = + xstr_iconv (name_utf8, "UTF-8", locale_code); + +# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ + && !defined __UCLIBC__) \ + || _LIBICONV_VERSION >= 0x0105 + { + char *converted_translit; + + size_t len = strlen (locale_code); + char *locale_code_translit = XNMALLOC (len + 10 + 1, char); + memcpy (locale_code_translit, locale_code, len); + memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1); + + converted_translit = + xstr_iconv (name_utf8, "UTF-8", locale_code_translit); + + free (locale_code_translit); + + if (converted_translit != NULL) + { +# if !_LIBICONV_VERSION + /* Don't use the transliteration if it added question marks. + glibc's transliteration falls back to question marks; libiconv's + transliteration does not. + mbschr is equivalent to strchr in this case. */ + if (strchr (converted_translit, '?') != NULL) + free (converted_translit); + else +# endif + name_converted_translit = alloc_name_converted_translit = + converted_translit; + } + } +# endif +#endif + } + else + { + name_converted = name_utf8; + name_converted_translit = name_utf8; + } + + /* The name in locale encoding. */ + name = (name_converted != NULL ? name_converted : + name_converted_translit != NULL ? name_converted_translit : + name_ascii); + + /* See whether we have a translation. Some translators have not understood + that they should use the UTF-8 form of the name, if possible. So if the + translator provided a no-op translation, we ignore it. */ + if (strcmp (translation, name_ascii) != 0) + { + /* See whether the translation contains the original name. */ + if (mbsstr_trimmed_wordbounded (translation, name_ascii) + || (name_converted != NULL + && mbsstr_trimmed_wordbounded (translation, name_converted)) + || (name_converted_translit != NULL + && mbsstr_trimmed_wordbounded (translation, name_converted_translit))) + { + if (alloc_name_converted != NULL) + free (alloc_name_converted); + if (alloc_name_converted_translit != NULL) + free (alloc_name_converted_translit); + return translation; + } + else + { + /* Return "TRANSLATION (NAME)". */ + char *result = + XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); + + sprintf (result, "%s (%s)", translation, name); + + if (alloc_name_converted != NULL) + free (alloc_name_converted); + if (alloc_name_converted_translit != NULL) + free (alloc_name_converted_translit); + return result; + } + } + else + { + if (alloc_name_converted != NULL && alloc_name_converted != name) + free (alloc_name_converted); + if (alloc_name_converted_translit != NULL + && alloc_name_converted_translit != name) + free (alloc_name_converted_translit); + return name; + } +} + +#ifdef TEST1 +# include <locale.h> +int +main (int argc, char *argv[]) +{ + setlocale (LC_ALL, ""); + if (mbsstr_trimmed_wordbounded (argv[1], argv[2])) + printf("found\n"); + return 0; +} +#endif + +#ifdef TEST2 +# include <locale.h> +# include <stdio.h> +int +main (int argc, char *argv[]) +{ + setlocale (LC_ALL, ""); + printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard")); + return 0; +} +#endif |