/*
* utf8.c: UTF-8 validation
*
* Based on glib's gutf8.c, which is:
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see
* <https://www.gnu.org/licenses/>.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include
#include
#include
#include "attribute.h"
#include "manconfig.h"
#include "utf8.h"
/* Test the byte under the cursor against a bit pattern.
 *
 * This macro expands in place and relies on two names supplied by the
 * enclosing function: a byte pointer called p, which is read but not
 * advanced, and a label called error, which is jumped to whenever
 * (*p & mask) differs from expect.
 */
#define VALIDATE_BYTE(mask, expect) \
	do { \
		const unsigned char vb_byte_ = *(const unsigned char *) p; \
		if (UNLIKELY ((vb_byte_ & (mask)) != (expect))) \
			goto error; \
	} while (0)
/* Scan at most max_len bytes of str as UTF-8; see IETF RFC 3629 Section 4.
 *
 * Returns the position at which scanning stopped:
 *   - str + max_len if every byte was consumed as part of a well-formed
 *     sequence;
 *   - the address of a NUL byte found where a new character would start;
 *   - otherwise the first byte of the first sequence that is ill-formed or
 *     does not fit in the remaining bytes.
 */
static const char * ATTRIBUTE_PURE fast_validate_len (const char *str,
						      size_t max_len)
{
	const char *const end = str + max_len;
	const char *seq_start = str;
	const char *p;

	for (p = str; p < end && *p; p++) {
		const unsigned char lead = *(const unsigned char *) p;

		if (lead < 0x80)
			continue;	/* single-byte character */

		/* Remember where this sequence began: that is what gets
		 * reported if any of its bytes turns out to be bad.
		 */
		seq_start = p;

		if (lead < 0xe0) {
			/* 110xxxxx; lead bytes below 0xc2 are rejected */
			if (UNLIKELY (end - p < 2))
				goto error;
			if (UNLIKELY (lead < 0xc2))
				goto error;
		} else if (lead < 0xf0) {
			/* 1110xxxx */
			if (UNLIKELY (end - p < 3))
				goto error;
			p++;
			switch (lead & 0x0f) {
			case 0x00:
				/* after 0xe0: 0xa0 ... 0xbf */
				VALIDATE_BYTE (0xe0, 0xa0);
				break;
			case 0x0d:
				/* after 0xed: 0x80 ... 0x9f */
				VALIDATE_BYTE (0xe0, 0x80);
				break;
			default:
				/* 10xxxxxx */
				VALIDATE_BYTE (0xc0, 0x80);
				break;
			}
		} else if (lead < 0xf5) {
			/* 11110xxx excluding out-of-range */
			if (UNLIKELY (end - p < 4))
				goto error;
			p++;
			switch (lead & 0x07) {
			case 0:
				/* after 0xf0: 0x90 ... 0xbf */
				VALIDATE_BYTE (0xc0, 0x80);
				if (UNLIKELY ((*(const unsigned char *) p
					       & 0x30) == 0))
					goto error;
				break;
			case 4:
				/* after 0xf4: 0x80 ... 0x8f */
				VALIDATE_BYTE (0xf0, 0x80);
				break;
			default:
				/* 10xxxxxx */
				VALIDATE_BYTE (0xc0, 0x80);
				break;
			}
			/* third byte of four: 10xxxxxx */
			p++;
			VALIDATE_BYTE (0xc0, 0x80);
		} else {
			goto error;
		}

		/* Every multi-byte form ends with one more 10xxxxxx byte;
		 * the loop increment then steps past it.
		 */
		p++;
		VALIDATE_BYTE (0xc0, 0x80);
	}

	return p;

error:
	return seq_start;
}
/* Check whether a run of bytes is well-formed UTF-8.
 *
 * str points at the text and max_len is the number of bytes to examine.
 *
 * Note that utf8_validate_len() returns false if any of the max_len bytes
 * is NUL, because scanning then stops before reaching str + max_len.
 *
 * Returns true only if scanning consumed exactly max_len bytes.
 */
bool ATTRIBUTE_PURE utf8_validate_len (const char *str, size_t max_len)
{
	const char *const stop = fast_validate_len (str, max_len);

	return stop == str + max_len;
}