summaryrefslogtreecommitdiffstats
path: root/src/raptor_nfc_test.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/raptor_nfc_test.c')
-rw-r--r--src/raptor_nfc_test.c292
1 files changed, 292 insertions, 0 deletions
diff --git a/src/raptor_nfc_test.c b/src/raptor_nfc_test.c
new file mode 100644
index 0000000..625cf93
--- /dev/null
+++ b/src/raptor_nfc_test.c
@@ -0,0 +1,292 @@
+/* -*- Mode: c; c-basic-offset: 2 -*-
+ *
+ * raptor_nfc_test.c - Raptor Unicode NFC validation check
+ *
+ * Copyright (C) 2004-2008, David Beckett http://www.dajobe.org/
+ * Copyright (C) 2004-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ *
+ * See LICENSE.html or LICENSE.txt at the top of this package for the
+ * complete terms and further detail along with the license texts for
+ * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
+ *
+ * It operates over the Unicode NormalizationTest.txt
+ * which tests normalization the process, NOT normalization checking.
+ * It says:
+ * " CONFORMANCE:
+ * 1. The following invariants must be true for all conformant implementations
+ * NFC
+ * c2 == NFC(c1) == NFC(c2) == NFC(c3)
+ * c4 == NFC(c4) == NFC(c5)
+ * "
+ *
+ * It does NOT require that c1, c3 and c5 are NFC.
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <raptor_config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h> /* for isprint() */
+#include <stdarg.h>
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+/* Raptor includes */
+#include "raptor2.h"
+#include "raptor_internal.h"
+
+
+#undef RAPTOR_NFC_DECODE_DEBUG
+
+
+/*
+ * decode_to_utf8:
+ * @utf8_string: destination utf8 buffer (FIXME big enough!)
+ * @unicode_string: first char of string
+ * @end: last char of unicode_string
+ */
+static size_t
+decode_to_utf8(unsigned char *utf8_string, size_t utf8_string_length,
+ const char *unicode_string, const char *end)
+{
+ unsigned char *u = utf8_string;
+ const char *p = unicode_string;
+
+#ifdef RAPTOR_NFC_DECODE_DEBUG
+ fputs("decode_to_utf8: string '", stderr);
+ (void)fwrite(unicode_string, sizeof(char), (end-unicode_string) + 1, stderr);
+ fputs("' converts to:\n ", stderr);
+#endif
+
+ while(p < end) {
+ unsigned long c = 0;
+ char *endptr;
+ int unicode_width;
+
+ if(*p == ' ') {
+ p++;
+ continue;
+ }
+
+ c = (unsigned long)strtol(p, &endptr, 16);
+
+#ifdef RAPTOR_NFC_DECODE_DEBUG
+ fprintf(stderr, "U+%04lX ", c);
+#endif
+
+ p = (const char*)endptr;
+
+ unichar_width = raptor_unicode_utf8_string_put_char(c, u, (end-p));
+ if(unichar_width < 0) {
+ fprintf(stderr,
+ "decode_to_utf8 Illegal Unicode character with code point #x%lX.",
+ unichar);
+ break;
+ }
+
+ u += (size_t)unichar_width;
+
+ if((u-utf8_string) > RAPTOR_GOOD_CAST(int, utf8_string_length)) {
+ fprintf(stderr,
+ "decode_to_utf8 overwrote utf8_string buffer at byte %ld\n",
+ (u-utf8_string));
+ abort();
+ }
+ }
+
+#ifdef RAPTOR_NFC_DECODE_DEBUG
+ fputs("\n", stderr);
+#endif
+
+ return u-utf8_string;
+}
+
+
+
+static void
+utf8_print(const unsigned char *input, size_t length, FILE *stream)
+{
+ size_t i = 0;
+
+ while(i < length && *input) {
+ unsigned long c;
+ int size = raptor_unicode_utf8_string_get_char(input, length - i, &c);
+ if(size <= 0)
+ return;
+
+ if(i)
+ fputc(' ', stream);
+ fprintf(stream, "U+%04X", RAPTOR_GOOD_CAST(int, c));
+ input += size;
+ i += size;
+ }
+}
+
+
+int
+main (int argc, char *argv[])
+{
+ const char *program = raptor_basename(argv[0]);
+ const char *filename;
+ FILE *fh;
+ int rc = 0;
+ unsigned int line = 1;
+ size_t max_c2_len = 0;
+ size_t max_c4_len = 0;
+ int passes = 0;
+ int fails = 0;
+
+ if(argc != 2) {
+ fprintf(stderr,
+ "USAGE %s [path to NormalizationTest.txt]\n"
+ "Get it at http://unicode.org/Public/UNIDATA/NormalizationTest.txt\n",
+ program);
+ return 1;
+ }
+
+ filename = argv[1];
+ fh = fopen(filename, "r");
+ if(!fh) {
+ fprintf(stderr, "%s: file '%s' open failed - %s\n",
+ program, filename, strerror(errno));
+ return 1;
+ }
+
+#define LINE_BUFFER_SIZE 1024
+
+/* FIXME big enough for Unicode 4 (c2 max 16; c4 max 33) */
+#define UNISTR_SIZE 40
+
+ for(;!feof(fh); line++) {
+ char buffer[LINE_BUFFER_SIZE];
+ char *p, *start;
+ unsigned char column2[UNISTR_SIZE];
+ unsigned char column4[UNISTR_SIZE];
+ size_t column2_len, column4_len;
+ int nfc_rc;
+ int error;
+
+ p = fgets(buffer, LINE_BUFFER_SIZE, fh);
+ if(!p) {
+ if(ferror(fh)) {
+ fprintf(stderr, "%s: file '%s' read failed - %s\n",
+ program, filename, strerror(errno));
+ rc = 1;
+ break;
+ }
+ /* assume feof */
+ break;
+ };
+
+#if 0
+ fprintf(stderr, "%s:%d: line '%s'\n", program, line, buffer);
+#endif
+
+ /* skip lines */
+ if(*p == '@' || *p == '#')
+ continue;
+
+
+ /* skip column 1 */
+ while(*p++ != ';')
+ ;
+
+ /* read column 2 into column2, column2_len */
+ start = p;
+ /* find end column 2 */
+ while(*p++ != ';')
+ ;
+
+ column2_len = decode_to_utf8(column2, UNISTR_SIZE, start, p-2);
+ if(column2_len > max_c2_len)
+ max_c2_len = column2_len;
+
+#if 0
+ fprintf(stderr, "UTF8 column 2 (%ld bytes) is: '", column2_len);
+ utf8_print(column2, column2_len, stderr);
+ fputs("'\n", stderr);
+#endif
+
+ /* skip column 3 */
+ while(*p++ != ';')
+ ;
+
+ /* read column 4 into column4, column4_len */
+ start = p;
+ /* find end column 4 */
+ while(*p++ != ';')
+ ;
+
+ column4_len = decode_to_utf8(column4, UNISTR_SIZE, start, p-2);
+ if(column4_len > max_c4_len)
+ max_c4_len = column4_len;
+
+#if 0
+ fprintf(stderr, "UTF8 column 4 (%ld bytes) is: '", column4_len);
+ utf8_print(column4, column4_len, stderr);
+ fputs("'\n", stderr);
+#endif
+
+ if(!raptor_unicode_check_utf8_string(column2, column2_len)) {
+ fprintf(stderr, "%s:%d: UTF8 column 2 failed on: '", filename, line);
+ utf8_print(column2, column2_len, stderr);
+ fputs("'\n", stderr);
+ fails++;
+ } else
+ passes++;
+
+ /* Column 2 must be NFC */
+ nfc_rc = raptor_nfc_check(column2, column2_len, &error);
+ if(!nfc_rc) {
+ fprintf(stderr, "%s:%d: NFC column 2 failed on: '", filename, line);
+ utf8_print(column2, column2_len, stderr);
+ fprintf(stderr, "' at byte %d of %d\n", error, (int)column2_len);
+ fails++;
+ } else
+ passes++;
+
+ if(column2_len == column4_len && !memcmp(column2, column4, column2_len))
+ continue;
+
+ if(!raptor_unicode_check_utf8_string(column4, column4_len)) {
+ fprintf(stderr, "%s:%d: UTF8 column 4 failed on: '", filename, line);
+ utf8_print(column4, column4_len, stderr);
+ fputs("'\n", stderr);
+ fails++;
+ } else
+ passes++;
+
+ /* Column 4 must be in NFC */
+ nfc_rc = raptor_nfc_check(column4, column4_len, &error);
+ if(!nfc_rc) {
+ fprintf(stderr, "%s:%d: NFC column 4 failed on: '", filename, line);
+ utf8_print(column4, column4_len, stderr);
+ fprintf(stderr, "' at byte %d of %d\n", error, (int)column4_len);
+ fails++;
+ } else
+ passes++;
+ }
+
+ fclose(fh);
+
+ fprintf(stderr, "%s: max column 2 len: %d, max column 4 len: %d\n", program,
+ (int)max_c2_len, (int)max_c4_len);
+ fprintf(stderr, "%s: passes: %d fails: %d\n", program,
+ passes, fails);
+
+ return rc;
+}