summary refs log tree commit diff stats
path: root/src/common/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/common/utf8.c 183
1 files changed, 183 insertions, 0 deletions
diff --git a/src/common/utf8.c b/src/common/utf8.c
new file mode 100644
index 00000000..9b7aaf5f
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <string.h>
+
+/*
+ * Count the consecutive 1 bits of 'c', starting at bit 7 (0x80) and moving
+ * towards bit 0.  decode_utf8() uses this on the first byte of a sequence
+ * to learn how many bytes the sequence claims to contain.
+ * Intended for byte values (0..255); 0xff yields 8.
+ */
+static int high_bits_set(int c)
+{
+  int count;
+
+  // Each pass moves the next lower bit into the 0x80 position.
+  for (count = 0; (c & 0x80) == 0x80; ++count)
+    c <<= 1;
+  return count;
+}
+
+/*
+ * Encode the code point 'u' (at most 31 bits) as UTF-8 into 'buf'.
+ *
+ * 'buf' must have room for MAX_UTF8_SZ bytes.
+ * Returns the number of bytes written, or -1 if 'u' is too large to be
+ * encoded.  No terminating NUL is written.
+ */
+int encode_utf8(unsigned long u, unsigned char *buf)
+{
+  // limit[k] is the largest code point that fits into k + 1 bytes.
+  static const unsigned long limit[MAX_UTF8_SZ] = {
+    0x0000007ful, 0x000007fful, 0x0000fffful,
+    0x001ffffful, 0x03fffffful, 0x7ffffffful
+  };
+  static const int NUM_LIMITS = sizeof(limit)/sizeof(limit[0]);
+
+  int extra = 0;	// number of continuation bytes needed
+  int pos;
+
+  while (extra < NUM_LIMITS && u > limit[extra])
+    ++extra;
+  if (extra == NUM_LIMITS) {
+    // This code point is too big to encode.
+    return -1;
+  }
+
+  if (extra == 0) {
+    // Plain 7-bit value: stored as-is.
+    buf[0] = u;
+    return 1;
+  }
+
+  // Fill the continuation bytes from the end, six payload bits at a time.
+  for (pos = extra; pos > 0; --pos) {
+    buf[pos] = 0x80 | (u & 0x3f);
+    u >>= 6;
+  }
+
+  // The lead byte starts with (extra + 1) one-bits; the rest of it holds
+  // whatever is left of the code point.
+  unsigned char lead_marker = ~(0xFF >> (extra + 1));
+  buf[0] = lead_marker | u;
+
+  return extra + 1;
+}
+
+/*
+ * Decode a single UTF-8 encoded character from the 'nbytes' bytes at 'buf'.
+ *
+ * 'nbytes' must be exactly the length of the sequence: a one-byte sequence
+ * must be below 0x80, and for longer sequences the number of leading one-bits
+ * in buf[0] must equal 'nbytes' and every following byte must have the form
+ * 10xxxxxx.
+ *
+ * Returns the character code, or INVALID_UTF8_CHAR if the sequence is
+ * malformed, longer than MAX_UTF8_SZ bytes, or decodes to 0xFFFE, 0xFFFF or
+ * a value in the surrogate range 0xD800..0xDFFF.
+ *
+ * Note that overlong encodings are not detected here; check_utf8() catches
+ * them by re-encoding the result.
+ */
+unsigned long decode_utf8(unsigned char *buf, int nbytes)
+{
+  unsigned long code;
+  int i, j;
+
+  // encode_utf8() never produces more than MAX_UTF8_SZ bytes, so anything
+  // longer cannot be a valid encoding.  Without this check a lead byte with
+  // more high bits set than that would be accepted whenever 'nbytes' matched.
+  if (nbytes <= 0 || nbytes > MAX_UTF8_SZ)
+    return INVALID_UTF8_CHAR;
+
+  if (nbytes == 1) {
+    if (buf[0] >= 0x80)
+      return INVALID_UTF8_CHAR;
+    return buf[0];
+  }
+
+  // The lead byte announces the sequence length in its high bits.
+  i = high_bits_set(buf[0]);
+  if (i != nbytes)
+    return INVALID_UTF8_CHAR;
+
+  // Payload bits of the lead byte, then six more from each continuation byte.
+  code = buf[0] & (0xff >> i);
+  for (j = 1; j < nbytes; ++j) {
+    if ((buf[j] & 0xc0) != 0x80)
+      return INVALID_UTF8_CHAR;
+    code = (code << 6) | (buf[j] & 0x3f);
+  }
+
+  // Check for invalid code points
+  if (code == 0xFFFE)
+    return INVALID_UTF8_CHAR;
+  if (code == 0xFFFF)
+    return INVALID_UTF8_CHAR;
+  if (code >= 0xD800 && code <= 0xDFFF)
+    return INVALID_UTF8_CHAR;
+
+  return code;
+}
+
+/*
+ * Check that the 'len' bytes at 'buf' form valid UTF-8.
+ *
+ * Bytes of 0x80 and above are collected into one multi-byte sequence at a
+ * time.  When the next character starts (or the input ends) the collected
+ * sequence is decoded and re-encoded; it is accepted only if the re-encoded
+ * bytes are identical to the input, so malformed and overlong sequences are
+ * both rejected.
+ *
+ * Returns 0 if the buffer is valid.  Otherwise returns i + 1, where i is the
+ * index at which the problem was detected (the byte following the offending
+ * sequence, or 'len' if the input ended there).
+ */
+int check_utf8(const char *buf, int len)
+{
+  unsigned char u[MAX_UTF8_SZ];
+  int enc_len = 0;
+  int i = 0;
+  while (1) {
+    // Only read buf[i] while i is inside the buffer, and read it as an
+    // unsigned byte so the comparisons below do not depend on whether
+    // plain char is signed.
+    const int at_end = (i >= len);
+    const unsigned char c = at_end ? 0 : (unsigned char)buf[i];
+
+    if (at_end || c < 0x80 || (c & 0xC0) != 0x80) {
+      // End of input or the start of a new character. Process what we
+      // have in the buffer.
+      if (enc_len > 0) {
+        unsigned char re_encoded[MAX_UTF8_SZ];
+        unsigned long code = decode_utf8(u, enc_len);
+        if (code == INVALID_UTF8_CHAR)
+          return i + 1;
+        // The canonical encoding must have the same length ...
+        if (encode_utf8(code, re_encoded) != enc_len)
+          return i + 1;
+        // ... and the same bytes as what we were given.
+        if (memcmp(u, re_encoded, enc_len) != 0)
+          return i + 1;
+      }
+      enc_len = 0;
+      if (at_end)
+        break;
+      // A non-ASCII lead byte opens the next sequence.
+      if (c >= 0x80)
+        u[enc_len++] = c;
+    } else {
+      // Continuation byte: append it unless the sequence is already at
+      // the maximum length.
+      if (enc_len == MAX_UTF8_SZ)
+        return i + 1;
+      u[enc_len++] = c;
+    }
+    ++i;
+  }
+  return 0;
+}
+
+/*
+ * Run check_utf8() over a NUL-terminated string (the terminator itself is
+ * not part of the checked range).  Returns whatever check_utf8() returns.
+ */
+int check_utf8_cstr(const char *buf)
+{
+  const int len = strlen(buf);
+
+  return check_utf8(buf, len);
+}
+
+/*
+ * Return 1 if 'c' counts as a control character, 0 otherwise.
+ * That is: 0x7f, or any value below 0x20 other than 0.
+ */
+int is_control_character(int c)
+{
+  if (c == 0x7f)
+    return 1;
+  if (c == 0)
+    return 0;
+  return c < 0x20;
+}
+
+/*
+ * Scan the 'len' bytes at 'buf' for control characters, as defined by
+ * is_control_character().
+ *
+ * Returns 0 if none is found, otherwise the 1-based position of the first
+ * control character.
+ */
+int check_for_control_characters(const char *buf, int len)
+{
+  int pos = 0;
+
+  while (pos < len) {
+    // Look at the byte as unsigned so bytes >= 0x80 never appear negative.
+    const int byte = (int)(unsigned char)buf[pos];
+
+    ++pos;
+    if (is_control_character(byte))
+      return pos;
+  }
+  return 0;
+}
+
+/*
+ * Run check_for_control_characters() over a NUL-terminated string (the
+ * terminator itself is not scanned).  Returns whatever
+ * check_for_control_characters() returns.
+ */
+int check_for_control_characters_cstr(const char *buf)
+{
+  const int len = strlen(buf);
+
+  return check_for_control_characters(buf, len);
+}