diff options
Diffstat (limited to 'src/util/utf8.c')
-rw-r--r-- | src/util/utf8.c | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/src/util/utf8.c b/src/util/utf8.c new file mode 100644 index 0000000..c566fdf --- /dev/null +++ b/src/util/utf8.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#include "utf8.h" + +#include "git2_util.h" + +/* + * git_utf8_iterate is taken from the utf8proc project, + * http://www.public-software-group.org/utf8proc + * + * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the ""Software""), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static const uint8_t utf8proc_utf8class[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static int utf8_charlen(const uint8_t *str, size_t str_len) +{ + uint8_t length; + size_t i; + + length = utf8proc_utf8class[str[0]]; + if (!length) + return -1; + + if (str_len > 0 && length > str_len) + return -1; + + for (i = 1; i < length; i++) { + if ((str[i] & 0xC0) != 0x80) + return -1; + } + + return (int)length; +} + +int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len) +{ + const uint8_t *str = (const uint8_t *)_str; + uint32_t uc = 0; + int length; + + *out = 0; + + if ((length = utf8_charlen(str, str_len)) < 0) + return -1; + + switch (length) { + case 1: + uc = str[0]; + break; + case 2: + uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); + if (uc < 0x80) uc = -1; + break; + case 3: + uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + + (str[2] & 0x3F); + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || + (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + break; + case 4: + uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) + + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); + if (uc < 0x10000 || uc >= 0x110000) uc = -1; + break; + default: + return -1; + } + + if ((uc & 0xFFFF) >= 0xFFFE) + return -1; + + *out = uc; + return length; +} + +size_t git_utf8_char_length(const char *_str, size_t str_len) +{ + const uint8_t *str = (const uint8_t *)_str; + size_t offset = 0, count = 0; + + while (offset < str_len) { + int length = utf8_charlen(str + offset, str_len - offset); + + if (length < 0) + length = 1; + + offset += length; + count++; + } + + return count; +} + +size_t git_utf8_valid_buf_length(const char *_str, size_t str_len) +{ + const uint8_t *str = (const uint8_t *)_str; + size_t offset = 0; + + while (offset < str_len) { + int length = utf8_charlen(str + offset, str_len - offset); + + if (length < 0) + break; + + offset += length; + } + + return offset; +} |