summaryrefslogtreecommitdiffstats
path: root/util
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 15:49:25 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 15:49:25 +0000
commit464df1d5e5ab1322e2dd0a7796939fff1aeefa9a (patch)
tree6a403684e0978f0287d7f0ec0e5aab1fd31a59e1 /util
parentInitial commit. (diff)
downloade2fsprogs-464df1d5e5ab1322e2dd0a7796939fff1aeefa9a.tar.xz
e2fsprogs-464df1d5e5ab1322e2dd0a7796939fff1aeefa9a.zip
Adding upstream version 1.47.0.upstream/1.47.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--util/Makefile.in74
-rw-r--r--util/all.exclude15
-rw-r--r--util/android-README.version.in3
-rw-r--r--util/android_config.h69
-rw-r--r--util/android_types.h45
-rw-r--r--util/copy_sparse.c228
-rwxr-xr-xutil/gen-android-files118
-rwxr-xr-xutil/gen-git-tarball17
-rwxr-xr-xutil/gen-sample-fs40
-rw-r--r--util/gen-tarball.in50
-rwxr-xr-xutil/get-ver4
-rw-r--r--util/install-symlink.in89
-rw-r--r--util/libecho.c78
-rw-r--r--util/mkutf8data.c3392
-rw-r--r--util/subst.c468
-rw-r--r--util/subst.conf.in26
-rw-r--r--util/symlinks.c391
-rw-r--r--util/ucd/README37
18 files changed, 5144 insertions, 0 deletions
diff --git a/util/Makefile.in b/util/Makefile.in
new file mode 100644
index 0000000..7ad18c0
--- /dev/null
+++ b/util/Makefile.in
@@ -0,0 +1,74 @@
+#
+# Standard e2fsprogs prologue....
+#
+# This makefile builds the helper programs that live in util/ (using
+# $(BUILD_CC)) and the gen-tarball script.
+#
+
+# The @...@ tokens are placeholders; presumably they are filled in by
+# configure/config.status (the gen-tarball rule below runs config.status).
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+top_builddir = ..
+my_dir = util
+INSTALL = @INSTALL@
+MKDIR_P = @MKDIR_P@
+
+SRCS = $(srcdir)/subst.c $(srcdir)/mkutf8data.c
+
+# NOTE(review): $(E), $(Q), $(CHECK_CMD), $(CPPCHECK_CMD) and the BUILD_*
+# variables are not defined in this file; presumably they come from the
+# fragment substituted for @MCONFIG@ -- confirm.
+@MCONFIG@
+
+# Compile one object with $(BUILD_CC), then hand the same source file to
+# $(CHECK_CMD) and $(CPPCHECK_CMD).
+.c.o:
+ $(E) " CC $<"
+ $(Q) $(BUILD_CC) -c $(BUILD_CFLAGS) $< -o $@
+ $(Q) $(CHECK_CMD) $(ALL_CFLAGS) $<
+ $(Q) $(CPPCHECK_CMD) $(CPPFLAGS) $<
+
+# Programs built by default; copy_sparse has a rule below but is not
+# part of this list.
+PROGS= subst symlinks mkutf8data
+
+all:: $(PROGS) gen-tarball
+
+# Placeholder header: it only has to exist so that subst.o can be built
+# (see the text written into it).
+dirpaths.h:
+	$(E) " CREATE $@"
+	$(Q) echo "/* fake dirpaths.h for config.h */" > $@
+
+subst.o: dirpaths.h
+
+# Each helper is linked from its single object with the build compiler.
+subst: subst.o
+	$(E) " LD $@"
+	$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o $@ subst.o
+
+mkutf8data: mkutf8data.o
+	$(E) " LD $@"
+	$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o $@ mkutf8data.o
+
+copy_sparse: copy_sparse.o
+	$(E) " LD $@"
+	$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o $@ copy_sparse.o
+
+symlinks: symlinks.o
+	$(E) " LD $@"
+	$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o $@ symlinks.o
+
+# Regenerate the gen-tarball script from its template via config.status.
+# The cd and the config.status call are joined with && so that a failed
+# cd cannot run ./config.status from the wrong directory.
+gen-tarball: $(srcdir)/gen-tarball.in $(top_builddir)/config.status
+	$(E) " CONFIG.STATUS $@"
+	$(Q) cd $(top_builddir) && CONFIG_FILES=util/gen-tarball ./config.status
+	$(Q) chmod +x gen-tarball
+
+# Produce the distribution tarballs; each line passes one flavour name
+# to the generated script.
+tarballs: gen-tarball
+	sh gen-tarball debian
+	sh gen-tarball all
+	sh gen-tarball subset
+
+# Remove everything this makefile can build.  The sparse-copy helper is
+# built as "copy_sparse" (underscore), so that is the name removed here.
+clean::
+	$(RM) -f $(PROGS) \#* *.s *.o *.a *~ core *.tar.gz gen-tarball \
+		copy_sparse dirpaths.h install-symlink mkutf8data
+
+mostlyclean: clean
+
+# Additionally remove the generated Makefile and dependency/tag files.
+distclean: clean
+	$(RM) -f .depend Makefile $(srcdir)/TAGS $(srcdir)/Makefile.in.old
+
+# +++ Dependency line eater +++
+#
+# Makefile dependencies follow. This must be the last section in
+# the Makefile.in file
+#
+subst.o: $(srcdir)/subst.c $(top_builddir)/lib/config.h dirpaths.h
+mkutf8data.o: $(srcdir)/mkutf8data.c
diff --git a/util/all.exclude b/util/all.exclude
new file mode 100644
index 0000000..d7d03b2
--- /dev/null
+++ b/util/all.exclude
@@ -0,0 +1,15 @@
+.git
+.hg
+.hgignore
+.pc
+patches
+README.subset
+build
+build[^/]*
+rpm.log
+TODO
+powerquest
+.exclude-file
+po/stamp-cat-id
+po/cat-id-tbl.c
+Meta
diff --git a/util/android-README.version.in b/util/android-README.version.in
new file mode 100644
index 0000000..c8a3217
--- /dev/null
+++ b/util/android-README.version.in
@@ -0,0 +1,3 @@
+URL: https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/v@VER@/e2fsprogs-@FN@.tar.gz
+Version: @FN@
+BugComponent: 95221
diff --git a/util/android_config.h b/util/android_config.h
new file mode 100644
index 0000000..90b8f8a
--- /dev/null
+++ b/util/android_config.h
@@ -0,0 +1,69 @@
+/*
+ * Hand-maintained feature-test macros used in place of a generated
+ * config.h; util/gen-android-files copies this file to lib/config.h.
+ * Macros are grouped by the host platforms on which they are defined.
+ */
+#ifndef __APPLE__
+#define HAVE_MALLOC_H 1
+#endif
+
+#define ROOT_SYSCONFDIR "/etc"
+
+#define ENABLE_LIBSPARSE 1
+
+/* Defined unconditionally, whatever the host platform. */
+#define DISABLE_BACKTRACE 1
+#define HAVE_DIRENT_H 1
+#define HAVE_ERRNO_H 1
+#define HAVE_GETOPT_H 1
+#define HAVE_GETPWUID_R 1
+#define HAVE_INTPTR_T 1
+#define HAVE_INTTYPES_H 1
+#define HAVE_MMAP 1
+#define HAVE_SETJMP_H 1
+#define HAVE_SNPRINTF 1
+#define HAVE_STDLIB_H 1
+#define HAVE_STRCASECMP 1
+#define HAVE_STRDUP 1
+#define HAVE_STRINGS_H 1
+#define HAVE_STRNLEN 1
+#define HAVE_STRPTIME 1
+#define HAVE_SYSCONF 1
+#define HAVE_TYPE_SSIZE_T 1
+#define HAVE_UNISTD_H 1
+#define HAVE_UTIME_H 1
+
+#define HAVE_SYS_STAT_H 1
+#if !defined(__APPLE__)
+# define HAVE_SYS_SYSMACROS_H 1
+#endif
+#define HAVE_SYS_TIME_H 1
+#define HAVE_SYS_TYPES_H 1
+
+/*
+ * NOTE(review): HAVE_LINUX_TYPES_H is also set for _WIN32; presumably
+ * the Windows build supplies its own linux/types.h -- confirm.
+ */
+#if defined(_WIN32)
+# define HAVE_LINUX_TYPES_H 1
+#endif
+/* Defined for both macOS and Linux hosts. */
+#if defined(__APPLE__) || defined(__linux__)
+# define HAVE_FCNTL 1
+# define HAVE_FSYNC 1
+# define HAVE_GETPAGESIZE 1
+# define HAVE_NET_IF_H 1
+# define HAVE_NETINET_IN_H 1
+# define HAVE_PREAD 1
+# define HAVE_PWRITE 1
+# define HAVE_POSIX_MEMALIGN 1
+# define HAVE_SYS_IOCTL_H 1
+# define HAVE_SYS_MMAN_H 1
+# define HAVE_SYS_MOUNT_H 1
+# define HAVE_SYS_PARAM_H 1
+# define HAVE_SYS_RESOURCE_H 1
+# define HAVE_SYS_SELECT_H 1
+# define HAVE_SYS_WAIT_H 1
+#endif
+/* Defined for Linux hosts only. */
+#if defined(__linux__)
+# define HAVE_EXT2_IOCTLS 1
+# define HAVE_FALLOCATE 1
+# define HAVE_LINUX_FD_H 1
+# define HAVE_LINUX_TYPES_H 1
+# define HAVE_LSEEK64 1
+# define HAVE_LSEEK64_PROTOTYPE 1
+# define HAVE_MNTENT_H 1
+# define HAVE_PREAD64 1
+# define HAVE_PWRITE64 1
+# define HAVE_SETMNTENT 1
+# define HAVE_SYS_PRCTL_H 1
+#endif
diff --git a/util/android_types.h b/util/android_types.h
new file mode 100644
index 0000000..5f05903
--- /dev/null
+++ b/util/android_types.h
@@ -0,0 +1,45 @@
+/*
+ * If linux/types.h has already been included, assume it has defined
+ * everything we need. (cross fingers) Other header files may have
+ * also defined the types that we need.
+ *
+ * util/gen-android-files copies this header to ext2_types.h,
+ * blkid_types.h and uuid_types.h and renames the guard macro defined
+ * just below in each copy, so keep that line's spelling stable.
+ */
+#if (!defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \
+ !defined(_EXT2_TYPES_H) && !defined(_UUID_TYPES_H))
+#define _LINUX_TYPES_H
+
+/* Integer typedefs named after their sign and intended bit width. */
+typedef unsigned char __u8;
+typedef __signed__ char __s8;
+typedef unsigned short __u16;
+typedef __signed__ short __s16;
+typedef unsigned int __u32;
+typedef __signed__ int __s32;
+typedef unsigned long long __u64;
+typedef __signed__ long long __s64;
+#endif
+
+#include <stdint.h> //uintptr_t
+
+/* endian checking stuff */
+#ifndef EXT2_ENDIAN_H_
+#define EXT2_ENDIAN_H_
+
+/*
+ * With __CHECKER__ defined, __bitwise and __force expand to attributes;
+ * otherwise both expand to nothing and the types below are plain aliases.
+ */
+#ifdef __CHECKER__
+#ifndef __bitwise
+#define __bitwise __attribute__((bitwise))
+#endif
+#define __force __attribute__((force))
+#else
+#ifndef __bitwise
+#define __bitwise
+#endif
+#define __force
+#endif
+
+/* Endian-annotated aliases of the fixed-width unsigned types. */
+typedef __u16 __bitwise __le16;
+typedef __u32 __bitwise __le32;
+typedef __u64 __bitwise __le64;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __be64;
+
+#endif /* EXT2_ENDIAN_H_ */
diff --git a/util/copy_sparse.c b/util/copy_sparse.c
new file mode 100644
index 0000000..cbab273
--- /dev/null
+++ b/util/copy_sparse.c
@@ -0,0 +1,228 @@
+/*
+ * copy_sparse.c -- copy very large sparse files efficiently
+ * (requires root privileges)
+ *
+ * Copyright 2003, 2004 by Theodore Ts'o.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+
+#ifndef __linux__
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Entry point for non-Linux builds: say so on stderr and fail. */
+int main(void)
+{
+	fputs("This program is only supported on Linux!\n", stderr);
+	return EXIT_FAILURE;
+}
+#else
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#else
+extern char *optarg;
+extern int optind;
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/ioctl.h>
+#include <linux/fd.h>
+
+int verbose = 0;
+
+#define FIBMAP _IO(0x00,1) /* bmap access */
+#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
+
+/*
+ * Ask the filesystem, via the FIBMAP ioctl, which physical block backs
+ * logical block 'block' of the file open on 'fd'.
+ *
+ * Returns the value the ioctl stored.  copy_sparse_file() treats a
+ * return of 0 as "no block mapped here".  If the ioctl fails with EPERM
+ * the program exits with status 1; any other failure is reported with
+ * perror() and the logical block number is returned unchanged.
+ */
+static unsigned long get_bmap(int fd, unsigned long block)
+{
+	/*
+	 * FIBMAP reads and writes an int through its argument.  Passing
+	 * the address of an unsigned long would leave half of the value
+	 * untouched on 64-bit hosts, so use an int-sized temporary.
+	 */
+	unsigned int b;
+
+	b = block;
+	if (ioctl(fd, FIBMAP, &b) < 0) {
+		if (errno == EPERM) {
+			fprintf(stderr, "No permission to use FIBMAP ioctl; must have root privileges\n");
+			exit(1);
+		}
+		perror("FIBMAP");
+	}
+	return b;
+}
+
+/*
+ * Read 'count' bytes from 'fd' into 'buf', looping over short reads.
+ *
+ * EINTR and EAGAIN are retried.  Any other read error ends the loop:
+ * the number of bytes already read is returned, or -1 if there were
+ * none.  A zero-length read is retried; the fourth one in a row ends
+ * the loop and returns the number of bytes read so far.
+ */
+static int full_read(int fd, char *buf, size_t count)
+{
+	int nread;
+	int done = 0;
+	int zero_reads = 0;
+
+	while (count > 0) {
+		nread = read(fd, buf, count);
+		if (nread == -1) {
+			if (errno != EINTR && errno != EAGAIN)
+				return done ? done : -1;
+			continue;
+		}
+		if (nread == 0) {
+			if (zero_reads >= 3)
+				return done;
+			zero_reads++;
+			continue;
+		}
+		/* Progress was made: reset the end-of-file retry counter. */
+		zero_reads = 0;
+		buf += nread;
+		done += nread;
+		count -= nread;
+	}
+	return done;
+}
+
+/*
+ * Copy the regular file 'src' (or standard input when 'src' is "-") to
+ * 'dest', leaving a hole in 'dest' wherever the source has an unmapped
+ * block or a full block containing only zero bytes.
+ *
+ * For a regular file the block size comes from the FIGETBSZ ioctl and
+ * every logical block is probed with get_bmap(); unmapped blocks are
+ * neither read nor written.  Standard input is read in 1024-byte
+ * chunks until end of file.  If the data written stops short of the
+ * source length (trailing hole), a single zero byte is written at the
+ * last offset so that 'dest' ends up with the same size.
+ *
+ * Every failure is reported and ends the program with exit status 1.
+ */
+static void copy_sparse_file(const char *src, const char *dest)
+{
+	struct stat64 fileinfo;
+	long lb, i, bs, block, numblocks = 0;
+	int fd, ofd;
+	int blksz = 0;		/* FIGETBSZ stores an int, not a long */
+	int from_stdin = !strcmp(src, "-");
+	ssize_t got, got2;
+	off64_t offset = 0, should_be, src_size = 0, final_size;
+	char *buf;
+
+	if (verbose)
+		printf("Copying sparse file from %s to %s\n", src, dest);
+
+	if (!from_stdin) {
+		if (stat64(src, &fileinfo) < 0) {
+			perror("stat");
+			exit(1);
+		}
+		if (!S_ISREG(fileinfo.st_mode)) {
+			printf("%s: Not a regular file\n", src);
+			exit(1);
+		}
+		src_size = fileinfo.st_size;
+		fd = open(src, O_RDONLY | O_LARGEFILE);
+		if (fd < 0) {
+			perror("open");
+			exit(1);
+		}
+		if (ioctl(fd, FIGETBSZ, &blksz) < 0) {
+			perror("FIGETBSZ");
+			close(fd);
+			exit(1);
+		}
+		bs = blksz;
+		/* Zero must be rejected too: bs is used as a divisor below. */
+		if (bs <= 0) {
+			printf("%s: Invalid block size: %ld\n", src, bs);
+			exit(1);
+		}
+		if (verbose)
+			printf("Blocksize of file %s is %ld\n", src, bs);
+		numblocks = (src_size + (bs-1)) / bs;
+		if (verbose)
+			printf("File size of %s is %lld (%ld blocks)\n", src,
+			       (long long) src_size, numblocks);
+	} else {
+		fd = 0;
+		bs = 1024;
+	}
+
+	ofd = open(dest, O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0777);
+	if (ofd < 0) {
+		perror(dest);
+		exit(1);
+	}
+
+	buf = malloc(bs);
+	if (!buf) {
+		fprintf(stderr, "Couldn't allocate buffer");
+		exit(1);
+	}
+
+	for (lb = 0; from_stdin || lb < numblocks; lb++) {
+		if (!from_stdin) {
+			block = get_bmap(fd, lb);
+			if (!block)
+				continue;	/* hole in the source */
+			should_be = ((off64_t) lb) * bs;
+			if (offset != should_be) {
+				if (verbose)
+					printf("Seeking to %lld\n",
+					       (long long) should_be);
+				if (lseek64(fd, should_be, SEEK_SET) == (off64_t) -1) {
+					perror("lseek src");
+					exit(1);
+				}
+				if (lseek64(ofd, should_be, SEEK_SET) == (off64_t) -1) {
+					perror("lseek dest");
+					exit(1);
+				}
+				offset = should_be;
+			}
+		}
+		got = full_read(fd, buf, bs);
+		if (got < 0) {
+			/* Never hand a negative length to write(). */
+			perror("read");
+			exit(1);
+		}
+		if (got == 0) {
+			if (from_stdin)
+				break;		/* end of input */
+			continue;
+		}
+
+		if (got == bs) {
+			for (i=0; i < bs; i++)
+				if (buf[i])
+					break;
+			if (i == bs) {
+				/* All-zero block: skip it to leave a hole. */
+				if (lseek64(ofd, bs, SEEK_CUR) == (off64_t) -1) {
+					perror("lseek dest");
+					exit(1);
+				}
+				offset += bs;
+				continue;
+			}
+		}
+		got2 = write(ofd, buf, got);
+		if (got != got2) {
+			printf("short write\n");
+			exit(1);
+		}
+		offset += got;
+	}
+
+	/*
+	 * For stdin the source length is the number of bytes consumed;
+	 * for a regular file it is the size recorded before copying.
+	 */
+	final_size = from_stdin ? offset : src_size;
+	if (fstat64(ofd, &fileinfo) < 0) {
+		perror("fstat");
+		exit(1);
+	}
+	if (fileinfo.st_size < final_size) {
+		/* Absolute seek: the target is an offset from file start. */
+		if (lseek64(ofd, final_size - 1, SEEK_SET) == (off64_t) -1) {
+			perror("lseek dest");
+			exit(1);
+		}
+		buf[0] = 0;
+		if (write(ofd, buf, 1) != 1) {
+			printf("short write\n");
+			exit(1);
+		}
+	}
+	free(buf);
+	close(fd);
+	close(ofd);
+}
+
+/* Write the command synopsis to stderr and exit with status 1. */
+static void usage(const char *progname)
+{
+	fprintf(stderr,
+		"Usage: %s [-v] " "source_file destination_file\n",
+		progname);
+	exit(1);
+}
+
+/*
+ * Parse the command line (-v may be repeated to raise verbosity),
+ * require exactly two operands, and copy the first to the second.
+ */
+int main(int argc, char **argv)
+{
+	int opt;
+
+	for (;;) {
+		opt = getopt(argc, argv, "v");
+		if (opt == EOF)
+			break;
+		if (opt == 'v')
+			verbose++;
+		else
+			usage(argv[0]);
+	}
+	if (argc - optind != 2)
+		usage(argv[0]);
+	copy_sparse_file(argv[optind], argv[optind + 1]);
+
+	return 0;
+}
+#endif
diff --git a/util/gen-android-files b/util/gen-android-files
new file mode 100755
index 0000000..cab4e8d
--- /dev/null
+++ b/util/gen-android-files
@@ -0,0 +1,118 @@
+#!/bin/sh
+
+ANDROID_GENERATED_FILES="lib/ext2fs/ext2_err.c lib/ext2fs/ext2_err.h \
+ lib/ss/ss_err.c lib/ss/ss_err.h lib/support/prof_err.c \
+ lib/support/prof_err.h \
+ lib/blkid/blkid_types.h lib/uuid/uuid_types.h \
+ lib/ext2fs/ext2_types.h lib/config.h lib/blkid/blkid.h \
+ lib/uuid/uuid.h lib/ext2fs/crc32c_table.h misc/default_profile.c \
+ lib/ss/std_rqs.c debugfs/debug_cmds.c debugfs/ro_debug_cmds.c \
+ debugfs/extent_cmds.c debugfs/e2freefrag.c \
+ debugfs/recovery.c debugfs/revoke.c \
+ MODULE_LICENSE_GPL README.version"
+
+SS_DIR=$(pwd)/lib/ss
+MK_CMDS=/tmp/mk_cmds$$.sh
+
+sed -e "s/@AWK@/awk/" < $SS_DIR/mk_cmds.sh.in \
+ | sed -e "s/@SED@/sed/" > $MK_CMDS
+
+sed -e "s/@E2FSPROGS_VERSION@/$(git describe)/" < lib/ext2fs/ext2_err.et.in > lib/ext2fs/ext2_err.et
+
+for i in lib/ss/ss_err lib/support/prof_err lib/ext2fs/ext2_err
+do
+ rm -f $i.c $i.h
+ awk -f lib/et/et_c.awk outfile=$i.c outfn=$(basename $i.c) $i.et
+ awk -f lib/et/et_h.awk outfile=$i.h outfn=$(basename $i.h) $i.et
+done
+
+for i in lib/ss/std_rqs debugfs/debug_cmds debugfs/ro_debug_cmds \
+ debugfs/extent_cmds
+do
+ _SS_DIR_OVERRIDE=lib/ss /bin/sh $MK_CMDS $i.ct
+ mv -f $(basename $i).c $i.c
+done
+
+rm -f $MK_CMDS
+
+cp lib/blkid/blkid.h.in lib/blkid/blkid.h
+cp lib/uuid/uuid.h.in lib/uuid/uuid.h
+
+cp util/android_types.h lib/ext2fs/ext2_types.h
+cp util/android_types.h lib/blkid/blkid_types.h
+cp util/android_types.h lib/uuid/uuid_types.h
+# Copied header files having exactly same content results in debug output
+# differences on RBE. Hence modify the #define's appropriately.
+sed -i 's/#define _LINUX_TYPES_H/#define _BLKID_TYPES_H/g' lib/blkid/blkid_types.h
+sed -i 's/#define _LINUX_TYPES_H/#define _EXT2_TYPES_H/g' lib/ext2fs/ext2_types.h
+sed -i 's/#define _LINUX_TYPES_H/#define _UUID_TYPES_H/g' lib/uuid/uuid_types.h
+
+cp util/android_config.h lib/config.h
+cp misc/e2freefrag.c debugfs/
+cp e2fsck/recovery.c e2fsck/revoke.c debugfs/
+
+gcc -o gen_crc32ctable lib/ext2fs/gen_crc32ctable.c
+./gen_crc32ctable > lib/ext2fs/crc32c_table.h
+
+awk -f misc/profile-to-c.awk < misc/mke2fs.conf.in > misc/default_profile.c
+
+rm -f ./gen_crc32table ./gen_crc32ctable lib/ext2fs/ext2_err.et
+
+touch MODULE_LICENSE_GPL
+
+# Take the third field of the E2FSPROGS_VERSION / E2FSPROGS_DATE lines in
+# version.h, with the double quotes turned into spaces.
+E2FSPROGS_VERSION=`grep E2FSPROGS_VERSION version.h \
+ | awk '{print $3}' | tr \" " " | awk '{print $1}'`
+DATE=`grep E2FSPROGS_DATE version.h | awk '{print $3}' \
+ | tr \" " "`
+# DATE is split on '-' into day, month name and year; a single-digit day
+# in 1-9 gets a leading zero.
+E2FSPROGS_DAY=$(echo $DATE | awk -F- '{print $1}' | sed -e '/^[1-9]$/s/^/0/')
+MONTH=`echo $DATE | awk -F- '{print $2}'`
+YEAR=`echo $DATE | awk -F- '{print $3}'`
+
+# Years above 1900 are used as-is; otherwise values above 90 get a "19"
+# prefix and everything else a "20" prefix.
+if expr $YEAR ">" 1900 > /dev/null ; then
+ E2FSPROGS_YEAR=$YEAR
+elif expr $YEAR ">" 90 >/dev/null ; then
+ E2FSPROGS_YEAR=19$YEAR
+else
+ E2FSPROGS_YEAR=20$YEAR
+fi
+
+# Map the three-letter month to its number and full name; anything else
+# becomes month 13 / "UNKNOWN".
+case $MONTH in
+Jan) MONTH_NUM=01; E2FSPROGS_MONTH="January" ;;
+Feb) MONTH_NUM=02; E2FSPROGS_MONTH="February" ;;
+Mar) MONTH_NUM=03; E2FSPROGS_MONTH="March" ;;
+Apr) MONTH_NUM=04; E2FSPROGS_MONTH="April" ;;
+May) MONTH_NUM=05; E2FSPROGS_MONTH="May" ;;
+Jun) MONTH_NUM=06; E2FSPROGS_MONTH="June" ;;
+Jul) MONTH_NUM=07; E2FSPROGS_MONTH="July" ;;
+Aug) MONTH_NUM=08; E2FSPROGS_MONTH="August" ;;
+Sep) MONTH_NUM=09; E2FSPROGS_MONTH="September" ;;
+Oct) MONTH_NUM=10; E2FSPROGS_MONTH="October" ;;
+Nov) MONTH_NUM=11; E2FSPROGS_MONTH="November" ;;
+Dec) MONTH_NUM=12; E2FSPROGS_MONTH="December" ;;
+*) MONTH_NUM=13; E2FSPROGS_MONTH="UNKNOWN" ;;
+esac
+
+# Version with the -WIP, pre- and -PLUS decorations removed.
+base_ver=`echo $E2FSPROGS_VERSION | \
+ sed -e 's/-WIP//' -e 's/pre-//' -e 's/-PLUS//'`
+
+date_spec=${E2FSPROGS_YEAR}${MONTH_NUM}${E2FSPROGS_DAY}
+
+# Work-in-progress versions get the date appended; releases use the bare
+# version.  VER and FN fill the @VER@ and @FN@ fields of README.version.
+case $E2FSPROGS_VERSION in
+*-WIP|pre-*)
+ VER="$base_ver-WIP-$date_spec"
+ FN="$base_ver~WIP.$E2FSPROGS_YEAR.$MONTH_NUM.$E2FSPROGS_DAY"
+ ;;
+*)
+ VER="$base_ver"
+ FN="$base_ver"
+ ;;
+esac
+
+sed -e "s/@VER@/$VER/g" -e "s/@FN@/$FN/" < util/android-README.version.in > README.version
+
+# Stage the generated files (even if ignored), rename COPYING to NOTICE
+# when it exists, and commit the result.
+git add -f $ANDROID_GENERATED_FILES
+if test -f COPYING
+then
+ git mv COPYING NOTICE
+fi
+git commit -m "Update generated files for Android"
diff --git a/util/gen-git-tarball b/util/gen-git-tarball
new file mode 100755
index 0000000..a959c4a
--- /dev/null
+++ b/util/gen-git-tarball
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Generate the e2fsprogs release tar ball
+#
+
+commit=HEAD
+
+if test -n "$1" ; then
+ commit="$1"
+fi
+
+ver=`git show ${commit}:version.h | grep E2FSPROGS_VERSION \
+ | awk '{print $3}' | tr \" " " | awk '{print $1}'`
+fn=e2fsprogs-${ver}.tar.gz
+
+git archive --prefix=e2fsprogs-${ver}/ ${commit} | gzip -9n > $fn
+echo "Generated $fn"
diff --git a/util/gen-sample-fs b/util/gen-sample-fs
new file mode 100755
index 0000000..8e13916
--- /dev/null
+++ b/util/gen-sample-fs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+MNT=/mnt
+FS=/tmp/foo.img
+
+cp /dev/null $FS
+mke2fs -q -t ext4 -O inline_data,^has_journal -I 256 -b 4096 -N 64 $FS 256
+mount -t ext4 $FS $MNT
+ln -s symlink_data $MNT/symlink
+for i in 30 70 500 1023 1024; do
+ ln -s /$(perl -e "print 'x' x $i;") $MNT/l_$i
+done
+touch $MNT/acl
+setfacl -m u:daemon:r $MNT/acl
+setfacl -m u:bin:rx $MNT/acl
+setfacl -m g:mail:rw $MNT/acl
+setfacl -m g:daemon:r $MNT/acl
+touch $MNT/simple_acl
+setfacl -m u:daemon:r $MNT/simple_acl
+touch $MNT/xattr
+attr -q -s foo -V bar $MNT/xattr
+echo -e "one\n\ttwo" | attr -q -s quux $MNT/xattr
+echo -e "abc\001\002\003" | attr -q -s def $MNT/xattr
+echo file_data > $MNT/small_inline
+a="I am a very model of a modern major general;"
+a="$a I've information vegetable, animal and mineral"
+echo $a > $MNT/big_inline
+mkdir $MNT/sdir
+touch $MNT/sdir/1
+touch $MNT/sdir/2
+touch $MNT/sdir/3
+touch $MNT/sdir/4
+mkdir $MNT/mdir
+touch $MNT/mdir/1
+touch $MNT/mdir/2
+touch $MNT/mdir/3
+touch $MNT/mdir/4
+touch $MNT/mdir/5
+umount $MNT
+e2fsck -fp $FS
diff --git a/util/gen-tarball.in b/util/gen-tarball.in
new file mode 100644
index 0000000..997bd93
--- /dev/null
+++ b/util/gen-tarball.in
@@ -0,0 +1,50 @@
+#!/bin/sh
+#
+# This script is used to generate the distribution tarball
+#
+# Usage: gen-tarball <flavour>
+#   debian|ubuntu  - tarball named e2fsprogs_<pkgver>.orig.tar.gz
+#   anything else  - tarball named e2fsprogs-<base version>.tar.gz
+#
+srcdir=@srcdir@
+top_srcdir=@top_srcdir@
+top_dir=`cd $top_srcdir; pwd`
+# Version with the -WIP, pre- and -PLUS decorations removed.
+base_ver=`echo @E2FSPROGS_VERSION@ | sed -e 's/-WIP//' -e 's/pre-//' -e 's/-PLUS//'`
+base_e2fsprogs=`basename $top_dir`
+# Temporary list of paths to leave out; removed on the last line.
+exclude=/tmp/exclude$$
+GZIP=gzip
+
+#
+# This hack is needed because texi2dvi blows up horribly if there are
+# any '~' characters in the directory pathname. So we kludge around it by
+# using a non-standard directory name for WIP releases. dpkg-source
+# complains, but life goes on.
+#
+deb_pkgver=`echo @E2FSPROGS_PKGVER@ | sed -e 's/~/-/g'`
+
+# SRCROOT is the top-level directory name inside the tarball; tarout is
+# the name of the tarball itself.
+case $1 in
+ debian|ubuntu)
+ SRCROOT="e2fsprogs-$deb_pkgver"
+ tarout="e2fsprogs_@E2FSPROGS_PKGVER@.orig.tar.gz"
+ ;;
+ all|*)
+ SRCROOT="e2fsprogs-$base_ver"
+ tarout="$SRCROOT.tar.gz"
+ ;;
+esac
+
+# Unless the caller supplied one, use the timestamp of the newest commit
+# for the file times recorded in the archive.
+if test -z "$SOURCE_DATE_EPOCH" ; then
+ export SOURCE_DATE_EPOCH=$(cd $top_srcdir; git log -1 --pretty=%ct)
+fi
+
+# Build the exclude list: first the backup/VCS/build leftovers found in
+# the tree, then the fixed entries from all.exclude, each prefixed with
+# the source directory's name.
+(cd $top_srcdir/.. ; find $base_e2fsprogs \( -name \*~ -o -name \*.orig \
+ -o -name CVS -o -name \*.rej -o -name Makefile.pq \
+ -o -name TAGS -o -name \*.old -o -name SCCS \
+ -o -name changed-files -o -name .#\* -o -name \*.tar.gz \
+ -o -name autom4te.cache \) \
+ -print) > $exclude
+sed -e "s;^;$base_e2fsprogs/;" < $srcdir/all.exclude >> $exclude
+
+# Archive from the parent directory with sorted names, owner/group 0 and
+# a fixed mtime; --transform renames the leading directory to $SRCROOT.
+(cd $top_srcdir/.. ; \
+ tar -c -f - -X $exclude --sort=name --owner=0 --group=0 \
+ --transform "flags=r;s|^$base_e2fsprogs|$SRCROOT|" \
+ --numeric-owner --mtime="@${SOURCE_DATE_EPOCH}" $base_e2fsprogs) \
+ | $GZIP -9n -c > $tarout
+$GZIP -ln $tarout
+rm -f "$exclude"
diff --git a/util/get-ver b/util/get-ver
new file mode 100755
index 0000000..ade7d22
--- /dev/null
+++ b/util/get-ver
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# Print "e2fsprogs <git describe output> (<date of the newest commit>)".
+ver=$(git describe --always --dirty)
+when=$(git log -1 --pretty=%cD)
+echo "e2fsprogs $ver ($when)"
+
diff --git a/util/install-symlink.in b/util/install-symlink.in
new file mode 100644
index 0000000..24341b8
--- /dev/null
+++ b/util/install-symlink.in
@@ -0,0 +1,89 @@
+#!/bin/sh
+#
+# install-symlink source destination destdir
+#
+# Options, accepted before the operands:
+#   --relative        set FORCE_RELATIVE
+#   --absolute        set FORCE_ABSOLUTE
+#   --debian          clear both of the above
+#   --symlinks=PROG   program to run as "PROG -c" on the new link
+#
+
+SYMLINKS=symlinks
+LN_S="@LN_S@"
+RM="@RM@"
+FORCE_RELATIVE=NO
+FORCE_ABSOLUTE=NO
+
+# Consume leading arguments for as long as they start with '-'.
+while echo $1 | grep -q -- ^- ;
+do
+ case $1 in
+ --relative)
+ FORCE_RELATIVE=YES
+ ;;
+ --absolute)
+ FORCE_ABSOLUTE=YES
+ ;;
+ --debian)
+ FORCE_ABSOLUTE=NO
+ FORCE_RELATIVE=NO
+ ;;
+ --symlinks=*)
+ SYMLINKS=$(echo $1 | sed -e 's/--symlinks=//')
+ ;;
+ *)
+ echo "Unknown option $1"
+ exit 1
+ ;;
+ esac
+ shift;
+done
+
+
+FIX_SYMLINK="$SYMLINKS -c"
+
+SRC="$1"
+DEST="$2"
+DESTDIR="$3"
+
+# Both pathnames must start with '/'; DESTDIR is prepended to them
+# whenever the filesystem is touched.
+if ! echo $SRC | grep -q ^/ ; then
+ echo $SRC: Source pathname must be absolute
+ exit 1
+fi
+
+if ! echo $DEST | grep -q ^/ ; then
+ echo $DEST: Destination pathname must be absolute
+ exit 1
+fi
+
+# The link target has to exist already underneath DESTDIR.
+if ! test -e "$DESTDIR$SRC" ; then
+ echo $DESTDIR$SRC: file or directory does not exist
+ exit 1
+fi
+
+# Replace any existing destination.
+$RM -f "$DESTDIR$DEST"
+
+# If LN_S is anything other than "ln -s", just apply it to the full
+# paths; none of the link-shape logic below is used.
+if test "$LN_S" != "ln -s" ; then
+	$LN_S "$DESTDIR$SRC" "$DESTDIR$DEST"
+	exit 0
+fi
+
+# Same directory: link to the bare file name.  The operands are quoted
+# so that directories containing white space compare correctly.
+if test "$(dirname "$SRC")" = "$(dirname "$DEST")" ; then
+	$LN_S "$(basename "$SRC")" "$DESTDIR$DEST"
+	exit 0
+fi
+
+# First path component of each pathname (may be empty, e.g. "//x").
+TOP_SRC=$(echo "$SRC" | awk -F/ '{print $2}')
+TOP_DEST=$(echo "$DEST" | awk -F/ '{print $2}')
+
+# The FORCE_* settings override the comparison below by making the two
+# values equal (relative) or different (absolute).
+if test "$FORCE_RELATIVE" = YES ; then
+	TOP_SRC=FORCE
+	TOP_DEST=FORCE
+fi
+
+if test "$FORCE_ABSOLUTE" = YES ; then
+	TOP_SRC=FORCE
+	TOP_DEST=FORCE_ABSOLUTE
+fi
+
+# Different top-level directories: link to the pathname as given.
+# Otherwise link through DESTDIR and let "$SYMLINKS -c" rewrite the link.
+if test "$TOP_SRC" != "$TOP_DEST" ; then
+	$LN_S "$SRC" "$DESTDIR$DEST"
+else
+	$LN_S "$DESTDIR$SRC" "$DESTDIR$DEST"
+	$FIX_SYMLINK "$DESTDIR$DEST"
+fi
+
diff --git a/util/libecho.c b/util/libecho.c
new file mode 100644
index 0000000..352ce1e
--- /dev/null
+++ b/util/libecho.c
@@ -0,0 +1,78 @@
+/*
+ * libecho.c
+ *
+ * For each argument on the command line, echo it. Should expand
+ * DOS wildcards correctly.
+ *
+ * Syntax: libecho [-p prefix] list...
+ */
+#include <stdio.h>
+#include <io.h>
+#include <string.h>
+
+void echo_files(char *, char *);
+
+/*
+ * Walk the argument list: "-p" (any letter case) makes the following
+ * argument the prefix for later output; every other argument is passed
+ * to echo_files() with the current prefix.  Returns 1 with a usage
+ * message when no arguments are given, otherwise 0.
+ */
+int
+main(int argc, char *argv[])
+{
+	char *prefix = "";
+	int i = 1;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: libecho [-p prefix] list...\n");
+		return 1;
+	}
+
+	while (i < argc) {
+		if (stricmp(argv[i], "-p") == 0) {
+			/* the prefix is the argument after "-p" */
+			i++;
+			prefix = argv[i];
+		} else {
+			echo_files(prefix, argv[i]);
+		}
+		i++;
+	}
+
+	return 0;
+}
+
+/*
+ * Print every file matching the (possibly wildcarded) pattern 'f', one
+ * per line, each preceded by 'prefix' and by the directory part of 'f'.
+ * Forward slashes in 'f' are converted to backslashes in place.  When
+ * nothing matches, 'f' itself is printed after the prefix.  A pattern
+ * too long for the internal path buffer is reported on stderr and
+ * skipped.
+ */
+void
+echo_files(char *prefix, char *f)
+{
+	/*
+	 * _findfirst() returns an intptr_t search handle; storing it in
+	 * a long would truncate it where long is narrower than a pointer.
+	 */
+	intptr_t ff;
+	struct _finddata_t fdt;
+	char *slash;
+	char filepath[256];
+
+	/*
+	 * We're unix based quite a bit here. Look for normal slashes and
+	 * make them reverse slashes.
+	 */
+	for (slash = f; *slash; slash++)
+		if (*slash == '/')
+			*slash = '\\';
+
+	/* Refuse input that would overflow filepath[]. */
+	if (strlen(f) >= sizeof(filepath)) {
+		fprintf(stderr, "libecho: path too long: %s\n", f);
+		return;
+	}
+	strcpy(filepath, f);
+
+	/* Keep only the directory part, including its trailing backslash. */
+	slash = strrchr(filepath, '\\');
+
+	if (slash) {
+		slash++;
+		*slash = 0;
+	} else {
+		filepath[0] = '\0';
+	}
+
+	ff = _findfirst(f, &fdt);
+
+	if (ff == -1) {
+		printf("%s%s\n", prefix, f);
+		return;
+	}
+
+	printf("%s%s%s\n", prefix, filepath, fdt.name);
+
+	while (_findnext(ff, &fdt) == 0)
+		printf("%s%s%s\n", prefix, filepath, fdt.name);
+
+	_findclose(ff);
+}
diff --git a/util/mkutf8data.c b/util/mkutf8data.c
new file mode 100644
index 0000000..2af25ac
--- /dev/null
+++ b/util/mkutf8data.c
@@ -0,0 +1,3392 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME "DerivedAge.txt"
+#define CCC_NAME "DerivedCombiningClass.txt"
+#define PROP_NAME "DerivedCoreProperties.txt"
+#define DATA_NAME "UnicodeData.txt"
+#define FOLD_NAME "CaseFolding.txt"
+#define NORM_NAME "NormalizationCorrections.txt"
+#define TEST_NAME "NormalizationTest.txt"
+#define UTF8_NAME "utf8data.h"
+
+const char *age_name = AGE_NAME;
+const char *ccc_name = CCC_NAME;
+const char *prop_name = PROP_NAME;
+const char *data_name = DATA_NAME;
+const char *fold_name = FOLD_NAME;
+const char *norm_name = NORM_NAME;
+const char *test_name = TEST_NAME;
+const char *utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE 1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision. These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly, instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT (16)
+#define UNICODE_MIN_SHIFT (8)
+
+#define UNICODE_MAJ_MAX ((unsigned short)-1)
+#define UNICODE_MIN_MAX ((unsigned char)-1)
+#define UNICODE_REV_MAX ((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV) \
+	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
+	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
+	 ((unsigned int)(REV)))
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+/*
+ * Return 1 when all three version components are within the limits
+ * given by the UNICODE_*_MAX macros (and so can be packed by
+ * UNICODE_AGE), 0 otherwise.
+ */
+static int age_valid(unsigned int major, unsigned int minor,
+		     unsigned int revision)
+{
+	return major <= UNICODE_MAJ_MAX &&
+	       minor <= UNICODE_MIN_MAX &&
+	       revision <= UNICODE_REV_MAX;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree. The first byte contains the
+ * following information:
+ * NEXTBYTE - flag - advance to next byte if set
+ * BITNUM - 3 bit field - the bit number to be tested
+ * OFFLEN - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ * RIGHTPATH - 1 bit field - set if the following node is for the
+ * right-hand path (tested bit is set)
+ * TRIENODE - 1 bit field - set if the following node is an internal
+ * node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ * LEFTNODE - 1 bit field - set if the left-hand node is internal
+ * RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM 0x07
+#define NEXTBYTE 0x08
+#define OFFLEN 0x30
+#define OFFLEN_SHIFT 4
+#define RIGHTPATH 0x40
+#define TRIENODE 0x80
+#define RIGHTNODE 0x40
+#define LEFTNODE 0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ * an index into utf8agetab[]. With this we can filter code
+ * points based on the unicode version in which they were
+ * defined. The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ * to do a stable sort into ascending order of all characters
+ * with a non-zero CCC that occur between two characters with
+ * a CCC of 0, or at the begin or end of a string.
+ * The unicode standard guarantees that all CCC values are
+ * between 0 and 254 inclusive, which leaves 255 available as
+ * a special value.
+ * Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ * start of a NUL-terminated string that is the decomposition
+ * of the character.
+ * The CCC of a decomposable character is the same as the CCC
+ * of the first character of its decomposition.
+ * Some characters decompose as the empty string: these are
+ * characters with the Default_Ignorable_Code_Point property.
+ * These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF) ((LEAF)[0])
+#define LEAF_CCC(LEAF) ((LEAF)[1])
+#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
+
+#define MAXGEN (255)
+
+#define MINCCC (0)
+#define MAXCCC (254)
+#define STOPPER (0)
+#define DECOMPOSE (255)
+#define HANGUL ((char)(255))
+
+#define UTF8HANGULLEAF (12)
+
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
+ const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfkdi;
+utf8trie_t *nfkdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used. A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values. This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ * 0 - 0x7f: 0 0x7f
+ * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
+ * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
+ * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same as a single UTF-32 character. This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ * Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ * http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS 0xC0
+#define UTF8_3_BITS 0xE0
+#define UTF8_4_BITS 0xF0
+#define UTF8_N_BITS 0x80
+#define UTF8_2_MASK 0xE0
+#define UTF8_3_MASK 0xF0
+#define UTF8_4_MASK 0xF8
+#define UTF8_N_MASK 0xC0
+#define UTF8_V_MASK 0x3F
+#define UTF8_V_SHIFT 6
+
+/*
+ * Store the UTF-8 encoding of 'val' in 'str' (1 to 4 bytes, with no
+ * terminating NUL) and return the number of bytes written.  Values of
+ * 0x110000 and above are reported on stdout; nothing is written for
+ * them and 0 is returned.
+ */
+static int utf8encode(char *str, unsigned int val)
+{
+	unsigned int lead;
+	int len;
+	int pos;
+
+	if (val < 0x80) {
+		len = 1;
+		lead = 0;
+	} else if (val < 0x800) {
+		len = 2;
+		lead = UTF8_2_BITS;
+	} else if (val < 0x10000) {
+		len = 3;
+		lead = UTF8_3_BITS;
+	} else if (val < 0x110000) {
+		len = 4;
+		lead = UTF8_4_BITS;
+	} else {
+		printf("%#x: illegal val\n", val);
+		return 0;
+	}
+
+	/* Continuation bytes carry six payload bits each, last one first. */
+	for (pos = len - 1; pos > 0; pos--) {
+		str[pos] = (val & UTF8_V_MASK) | UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+	}
+	/* The remaining high bits go into the lead byte. */
+	str[0] = val | lead;
+
+	return len;
+}
+
+/*
+ * Decode the UTF-8 sequence starting at str and return its code
+ * point. The sequence length is derived from the first byte only;
+ * continuation bytes are not validated.
+ */
+static unsigned int utf8decode(const char *str)
+{
+	const unsigned char *p = (const unsigned char*)str;
+	unsigned int result;
+	int trailing;
+
+	/* Single byte: the byte is the code point. */
+	if (p[0] < 0x80)
+		return p[0];
+
+	if (p[0] < UTF8_3_BITS) {
+		result = p[0] & 0x1F;
+		trailing = 1;
+	} else if (p[0] < UTF8_4_BITS) {
+		result = p[0] & 0x0F;
+		trailing = 2;
+	} else {
+		result = p[0] & 0x0F;
+		trailing = 3;
+	}
+
+	/* Append six payload bits per continuation byte. */
+	while (trailing--) {
+		p++;
+		result = (result << UTF8_V_SHIFT) | (*p & 0x3F);
+	}
+	return result;
+}
+
+/*
+ * Return 1 when unichar lies in the code point range 0x0 - 0x10FFFF,
+ * 0 otherwise. Surrogates are not excluded by this check.
+ */
+static int utf32valid(unsigned int unichar)
+{
+	if (unichar >= 0x110000)
+		return 0;
+	return 1;
+}
+
+#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) /* true when U lies in 0xAC00..0xD7A3 inclusive */
+
+#define NODE 1 /* child tag: the pointer refers to a struct node */
+#define LEAF 0 /* child tag: the pointer refers to a leaf */
+
+struct tree { /* one trie plus the callbacks that know about its leaf type */
+ void *root; /* root node, or a leaf when childnode == LEAF */
+ int childnode; /* NODE or LEAF: what root points to */
+ const char *type; /* name used in messages; trees_init() sets "nfkdi" or "nfkdicf" */
+ unsigned int maxage; /* age printed with the tree name and compared against corrections */
+ struct tree *next; /* tree consulted by size_nodes() when a right child is unmarked */
+ int (*leaf_equal)(void *, void *); /* nonzero when two leaves compare equal */
+ void (*leaf_print)(void *, int); /* print a leaf at the given indent */
+ int (*leaf_mark)(void *); /* nonzero when the leaf must be emitted */
+ int (*leaf_size)(void *); /* number of bytes the emitted leaf occupies */
+ int *(*leaf_index)(struct tree *, void *); /* address of the slot holding the leaf's index */
+ unsigned char *(*leaf_emit)(void *, unsigned char *); /* write the leaf, return the position after it */
+ int leafindex[0x110000]; /* leaf indexes, addressed by leaf code in the *_index callbacks */
+ int index; /* offset of this tree in the emitted data, set by index_nodes() */
+};
+
+struct node { /* internal trie node testing a single key bit */
+ int index; /* offset of this node in the emitted trie, set by index_nodes() */
+ int offset; /* distance to the right child's index; -1 until size_nodes() sets it */
+ int mark; /* nonzero when the node must be emitted */
+ int size; /* emitted size in bytes; starts at 4, size_nodes() may reduce it */
+ struct node *parent; /* NULL for the root node */
+ void *left; /* child taken when the tested bit is clear; see leftnode */
+ void *right; /* child taken when the tested bit is set; see rightnode */
+ unsigned char bitnum; /* bit position; the low three bits select the bit within the key byte */
+ unsigned char nextbyte; /* nonzero: advance to the next key byte before testing */
+ unsigned char leftnode; /* NODE or LEAF: kind of object left points to */
+ unsigned char rightnode; /* NODE or LEAF: kind of object right points to */
+ unsigned int keybits; /* choices made in the singleton chain below (set bit = right branch) */
+ unsigned int keymask; /* bit positions covered by the singleton chain below; 0 = no chain */
+};
+
+/*
+ * Example lookup function for a tree.
+ *
+ * Starting at the root, test one key bit per node and follow the
+ * right child when the bit is set, the left child otherwise, until a
+ * leaf is reached. Returns the leaf, or NULL when the walk runs into
+ * a missing node or a child whose tag is neither NODE nor LEAF.
+ */
+static void *lookup(struct tree *tree, const char *key)
+{
+	struct node *cur;
+	void *found = NULL;
+	void *child;
+	unsigned char kind;
+
+	for (cur = tree->root; cur && !found; ) {
+		/* A nextbyte node tests the following key byte. */
+		if (cur->nextbyte)
+			key++;
+		if (*key & (1 << (cur->bitnum & 7))) {
+			/* Right leg */
+			kind = cur->rightnode;
+			child = cur->right;
+		} else {
+			/* Left leg */
+			kind = cur->leftnode;
+			child = cur->left;
+		}
+		if (kind == NODE)
+			cur = child;
+		else if (kind == LEAF)
+			found = child;
+		else
+			cur = NULL;
+	}
+
+	return found;
+}
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ *
+ * Every internal node is printed with printf() and every leaf through
+ * tree->leaf_print(), followed by a one-line summary of the node,
+ * leaf and singleton counts. Each mask holds one bit per bitnum: a
+ * set bit means that branch of the node with that bitnum has already
+ * been taken. Both bits are cleared again when the walk climbs back
+ * to the parent.
+ */
+static void tree_walk(struct tree *tree)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int indent = 1;
+ int nodes, singletons, leaves;
+
+ nodes = singletons = leaves = 0;
+
+ printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+ if (tree->childnode == LEAF) { /* the whole tree is a single leaf */
+ assert(tree->root);
+ tree->leaf_print(tree->root, indent);
+ leaves = 1;
+ } else {
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) { /* one iteration per node reached for the first time */
+ printf("%*snode @ %p bitnum %d nextbyte %d"
+ " left %p right %p mask %x bits %x\n",
+ indent, "", node,
+ node->bitnum, node->nextbyte,
+ node->left, node->right,
+ node->keymask, node->keybits);
+ nodes += 1;
+ if (!(node->left && node->right)) /* a node with only one child */
+ singletons += 1;
+
+ while (node) { /* descend into an unvisited child, else climb */
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ tree->leaf_print(node->left,
+ indent+1);
+ leaves += 1;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ tree->leaf_print(node->right,
+ indent+1);
+ leaves += 1;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask; /* both branches done: forget this level */
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+ }
+ printf("nodes %d leaves %d singletons %d\n",
+ nodes, leaves, singletons);
+}
+
+/*
+ * Allocate and initialize a new internal node.
+ *
+ * parent is the node the new one will hang below, or NULL for the
+ * root. The new node has no children, both child tags set to NODE, an
+ * offset of -1 and the maximum size of 4. Its bitnum follows from the
+ * parent: the root tests bit 7, each further node tests the next
+ * lower bit, and once bit 0 of a byte has been used the node is
+ * tagged nextbyte and starts at bit 7 of the following byte.
+ *
+ * Returns the new node. Running out of memory is fatal: the error is
+ * reported and the program exits, so the result is never NULL.
+ */
+static struct node *alloc_node(struct node *parent)
+{
+	struct node *node;
+	int bitnum;
+
+	node = malloc(sizeof(*node));
+	if (!node) {
+		/* Nothing sensible can be built without the node. */
+		perror("malloc");
+		exit(1);
+	}
+	node->left = node->right = NULL;
+	node->parent = parent;
+	node->leftnode = NODE;
+	node->rightnode = NODE;
+	node->keybits = 0;
+	node->keymask = 0;
+	node->mark = 0;
+	node->index = 0;
+	node->offset = -1;
+	node->size = 4;
+
+	if (node->parent) {
+		bitnum = parent->bitnum;
+		if ((bitnum & 7) == 0) {
+			/* Parent used the last bit of its byte: move on. */
+			node->bitnum = bitnum + 7 + 8;
+			node->nextbyte = 1;
+		} else {
+			node->bitnum = bitnum - 1;
+			node->nextbyte = 0;
+		}
+	} else {
+		node->bitnum = 7;
+		node->nextbyte = 0;
+	}
+
+	return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ *
+ * tree is the tree to insert into, key points to keylen key bytes
+ * (keylen must be 1 to 4), and leaf is the pointer stored at the end
+ * of the path. One node is visited, and created when missing, for
+ * each of the 8 * keylen key bits. Always returns 0.
+ */
+static int insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+ struct node *node;
+ struct node *parent;
+ void **cursor; /* address of the child pointer to follow or fill in next */
+ int keybits; /* number of key bits still to be consumed */
+
+ assert(keylen >= 1 && keylen <= 4);
+
+ node = NULL;
+ cursor = &tree->root;
+ keybits = 8 * keylen;
+
+ /* Insert, creating path along the way. */
+ while (keybits) {
+ if (!*cursor)
+ *cursor = alloc_node(node);
+ node = *cursor;
+ if (node->nextbyte)
+ key++;
+ if (*key & (1 << (node->bitnum & 7)))
+ cursor = &node->right;
+ else
+ cursor = &node->left;
+ keybits--;
+ }
+ *cursor = leaf;
+
+ /* Merge subtrees if possible. */
+ while (node) {
+ if (*key & (1 << (node->bitnum & 7))) /* tag the branch just filled as a leaf */
+ node->rightnode = LEAF;
+ else
+ node->leftnode = LEAF;
+ if (node->nextbyte)
+ break;
+ if (node->leftnode == NODE || node->rightnode == NODE)
+ break;
+ assert(node->left);
+ assert(node->right);
+ /* Compare */
+ if (! tree->leaf_equal(node->left, node->right))
+ break;
+ /* Keep left, drop right leaf. */
+ leaf = node->left;
+ /* Check in parent */
+ parent = node->parent;
+ if (!parent) {
+ /* root of tree! */
+ tree->root = leaf;
+ tree->childnode = LEAF;
+ } else if (parent->left == node) {
+ parent->left = leaf;
+ parent->leftnode = LEAF;
+ if (parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ }
+ } else if (parent->right == node) {
+ parent->right = leaf;
+ parent->rightnode = LEAF;
+ if (parent->left) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ parent->keybits |= (1 << node->bitnum);
+ }
+ } else {
+ /* internal tree error */
+ assert(0);
+ }
+ free(node); /* the merged node is replaced by its left leaf */
+ node = parent;
+ }
+
+ /* Propagate keymasks up along singleton chains. */
+ while (node) {
+ parent = node->parent;
+ if (!parent)
+ break;
+ /* Nix the mask for parents with two children. */
+ if (node->keymask == 0) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else if (parent->left && parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ assert((parent->keymask & node->keymask) == 0);
+ parent->keymask |= node->keymask;
+ parent->keymask |= (1 << parent->bitnum);
+ parent->keybits |= node->keybits;
+ if (parent->right) /* the only child hangs on the right: record a set bit */
+ parent->keybits |= (1 << parent->bitnum);
+ }
+ node = parent;
+ }
+
+ return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed. There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves. The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains. When they are identical for the left and right
+ * branch of a node, and the two leaves compare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity. Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ *
+ * The tree is walked without recursion, using leftmask and rightmask
+ * to remember which branches have been taken at each bitnum. When a
+ * node is removed, its left subtree takes its place in the parent and
+ * the node together with its right chain is freed. The number of
+ * freed nodes is reported when verbose is set.
+ */
+static void prune(struct tree *tree)
+{
+ struct node *node;
+ struct node *left;
+ struct node *right;
+ struct node *parent;
+ void *leftleaf;
+ void *rightleaf;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count; /* number of nodes freed */
+
+ if (verbose > 0)
+ printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+ count = 0;
+ if (tree->childnode == LEAF)
+ return;
+ if (!tree->root)
+ return;
+
+ leftmask = rightmask = 0;
+ node = tree->root;
+ while (node) {
+ if (node->nextbyte)
+ goto advance;
+ if (node->leftnode == LEAF)
+ goto advance;
+ if (node->rightnode == LEAF)
+ goto advance;
+ if (!node->left)
+ goto advance;
+ if (!node->right)
+ goto advance;
+ left = node->left;
+ right = node->right;
+ if (left->keymask == 0) /* no singleton chain below the left child */
+ goto advance;
+ if (right->keymask == 0) /* no singleton chain below the right child */
+ goto advance;
+ if (left->keymask != right->keymask)
+ goto advance;
+ if (left->keybits != right->keybits)
+ goto advance;
+ leftleaf = NULL;
+ while (!leftleaf) { /* follow the left chain down to its leaf */
+ assert(left->left || left->right);
+ if (left->leftnode == LEAF)
+ leftleaf = left->left;
+ else if (left->rightnode == LEAF)
+ leftleaf = left->right;
+ else if (left->left)
+ left = left->left;
+ else if (left->right)
+ left = left->right;
+ else
+ assert(0);
+ }
+ rightleaf = NULL;
+ while (!rightleaf) { /* follow the right chain down to its leaf */
+ assert(right->left || right->right);
+ if (right->leftnode == LEAF)
+ rightleaf = right->left;
+ else if (right->rightnode == LEAF)
+ rightleaf = right->right;
+ else if (right->left)
+ right = right->left;
+ else if (right->right)
+ right = right->right;
+ else
+ assert(0);
+ }
+ if (! tree->leaf_equal(leftleaf, rightleaf))
+ goto advance;
+ /*
+ * This node has identical singleton-only subtrees.
+ * Remove it.
+ */
+ parent = node->parent; /* NOTE(review): dereferenced below without a NULL check; assumes node is never the root here - confirm */
+ left = node->left;
+ right = node->right;
+ if (parent->left == node)
+ parent->left = left;
+ else if (parent->right == node)
+ parent->right = left;
+ else
+ assert(0);
+ left->parent = parent;
+ left->keymask |= (1 << node->bitnum);
+ node->left = NULL; /* detach the kept subtree so the loop below frees only the right chain */
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ if (node->leftnode == NODE && node->left) {
+ left = node->left;
+ free(node);
+ count++;
+ node = left;
+ } else if (node->rightnode == NODE && node->right) {
+ right = node->right;
+ free(node);
+ count++;
+ node = right;
+ } else {
+ node = NULL; /* NOTE(review): the last node of the chain is not freed on this path - confirm whether that is intended */
+ }
+ }
+ /* Propagate keymasks up along singleton chains. */
+ node = parent;
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ for (;;) {
+ if (node->left && node->right)
+ break;
+ if (node->left) {
+ left = node->left;
+ node->keymask |= left->keymask;
+ node->keybits |= left->keybits;
+ }
+ if (node->right) {
+ right = node->right;
+ node->keymask |= right->keymask;
+ node->keybits |= right->keybits;
+ }
+ node->keymask |= (1 << node->bitnum);
+ node = node->parent; /* NOTE(review): assumes a two-child ancestor exists before the root is passed - confirm */
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ }
+ advance:
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0 &&
+ node->leftnode == NODE &&
+ node->left) {
+ leftmask |= bitmask;
+ node = node->left;
+ } else if ((rightmask & bitmask) == 0 &&
+ node->rightnode == NODE &&
+ node->right) {
+ rightmask |= bitmask;
+ node = node->right;
+ } else {
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+ }
+ if (verbose > 0)
+ printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ *
+ * The first pass asks tree->leaf_mark() about every leaf and, for
+ * each leaf that must be emitted, marks the node holding it and all
+ * of its not yet marked ancestors. The second pass repeats the walk
+ * and additionally marks, below a marked parent, the left child node,
+ * or the right child node when the parent has no left child. The
+ * number of nodes marked is reported when verbose is set.
+ */
+static void mark_nodes(struct tree *tree)
+{
+ struct node *node;
+ struct node *n;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int marked; /* number of nodes newly marked */
+
+ marked = 0;
+ if (verbose > 0)
+ printf("Marking %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF) /* a single-leaf tree has no nodes to mark */
+ goto done;
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) { /* mark upwards until an already marked ancestor */
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) { /* mark upwards until an already marked ancestor */
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+
+ /* second pass: left siblings and singletons */
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ if (!node->mark && node->parent->mark) { /* left child of a marked parent */
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ if (!node->mark && node->parent->mark &&
+ !node->parent->left) { /* only child of a marked parent */
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+done:
+ if (verbose > 0)
+ printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie. These values must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ *
+ * index is the first free offset. It is rounded up to a multiple of
+ * 64 and stored as the tree's own index. Only marked nodes, and the
+ * leaves hanging directly below them, receive an index. Returns the
+ * first free offset after this tree, rounded up to a multiple of 16.
+ * The count and indent variables are maintained but not otherwise
+ * used in this function.
+ */
+static int index_nodes(struct tree *tree, int index)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count;
+ int indent;
+
+ /* Align to a cache line (or half a cache line?). */
+ while (index % 64)
+ index++;
+ tree->index = index;
+ indent = 1;
+ count = 0;
+
+ if (verbose > 0)
+ printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
+ if (tree->childnode == LEAF) { /* the tree consists of one leaf only */
+ index += tree->leaf_size(tree->root);
+ goto done;
+ }
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark) /* unmarked nodes take no space */
+ goto skip;
+ count++;
+ if (node->index != index)
+ node->index = index;
+ index += node->size;
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ *tree->leaf_index(tree, node->left) =
+ index;
+ index += tree->leaf_size(node->left);
+ count++;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ *tree->leaf_index(tree, node->right) = index;
+ index += tree->leaf_size(node->right);
+ count++;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ /* Round up to a multiple of 16 */
+ while (index % 16)
+ index++;
+ if (verbose > 0)
+ printf("Final index %d\n", index);
+ return index;
+}
+
+/*
+ * Helper for size_nodes(): mark node and every node below it that is
+ * not marked yet. A newly marked node starts out with the index of
+ * its parent. Descent stops at NULL and at already marked nodes.
+ * Returns the number of nodes that were newly marked.
+ */
+static int mark_subtree(struct node *node)
+{
+	int newly_marked = 0;
+
+	if (node && !node->mark) {
+		node->mark = 1;
+		node->index = node->parent->index;
+		newly_marked = 1;
+		/* Only children tagged NODE are followed. */
+		if (node->leftnode == NODE)
+			newly_marked += mark_subtree(node->left);
+		if (node->rightnode == NODE)
+			newly_marked += mark_subtree(node->right);
+	}
+	return newly_marked;
+}
+
+/*
+ * Compute the size of nodes and leaves. We start by assuming that
+ * each node needs to store a three-byte offset. The indexes of the
+ * nodes are calculated based on that, and then this function is
+ * called to see if the sizes of some nodes can be reduced. This is
+ * repeated until no more changes are seen.
+ *
+ * A marked node with a single child gets size 1. A marked node with
+ * two children gets size 2, 3 or 4 depending on how many bytes the
+ * offset to its right child needs. Returns the number of changes:
+ * nodes whose size or offset changed plus nodes newly marked through
+ * mark_subtree().
+ */
+static int size_nodes(struct tree *tree)
+{
+ struct tree *next;
+ struct node *node;
+ struct node *right;
+ struct node *n;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ unsigned int pathbits; /* for each bitnum on the current path: set when the right branch was taken */
+ unsigned int pathmask; /* bitnums of the nodes on the current path */
+ unsigned int nbit;
+ int changed;
+ int offset;
+ int size;
+ int indent;
+
+ indent = 1;
+ changed = 0;
+ size = 0;
+
+ if (verbose > 0)
+ printf("Sizing %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF)
+ goto done;
+
+ assert(tree->childnode == NODE);
+ pathbits = 0;
+ pathmask = 0;
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark)
+ goto skip;
+ offset = 0;
+ if (!node->left || !node->right) { /* singleton: no offset is stored */
+ size = 1;
+ } else {
+ if (node->rightnode == NODE) {
+ /*
+ * If the right node is not marked,
+ * look for a corresponding node in
+ * the next tree. Such a node need
+ * not exist.
+ */
+ right = node->right;
+ next = tree->next;
+ while (!right->mark) {
+ assert(next);
+ n = next->root;
+ while (n->bitnum != node->bitnum) { /* replay the current path in the next tree */
+ nbit = 1 << n->bitnum;
+ if (!(pathmask & nbit))
+ break;
+ if (pathbits & nbit) {
+ if (n->rightnode == LEAF)
+ break;
+ n = n->right;
+ } else {
+ if (n->leftnode == LEAF)
+ break;
+ n = n->left;
+ }
+ }
+ if (n->bitnum != node->bitnum)
+ break;
+ n = n->right; /* NOTE(review): n is used below without a NULL or LEAF check - confirm the right child is always a node here */
+ right = n;
+ next = next->next;
+ }
+ /* Make sure the right node is marked. */
+ if (!right->mark)
+ changed += mark_subtree(right);
+ offset = right->index - node->index;
+ } else {
+ offset = *tree->leaf_index(tree, node->right);
+ offset -= node->index;
+ }
+ assert(offset >= 0);
+ assert(offset <= 0xffffff);
+ if (offset <= 0xff) {
+ size = 2;
+ } else if (offset <= 0xffff) {
+ size = 3;
+ } else { /* offset <= 0xffffff */
+ size = 4;
+ }
+ }
+ if (node->size != size || node->offset != offset) {
+ node->size = size;
+ node->offset = offset;
+ changed++;
+ }
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ pathmask |= bitmask;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ pathbits |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ pathmask &= ~bitmask;
+ pathbits &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ if (verbose > 0)
+ printf("Found %d changes\n", changed);
+ return changed;
+}
+
+/*
+ * Emit a trie for the given tree into the data array.
+ *
+ * Writing starts at data + tree->index. Each marked node is written
+ * as one header byte (bitnum, nextbyte flag and child flags),
+ * followed for two-child nodes by the offset to the right child in
+ * one to three bytes, least significant byte first. Leaves are
+ * written by tree->leaf_emit(). The asserts check that the indexes
+ * and offsets computed earlier match the positions actually written.
+ * Statistics are printed when verbose is set.
+ */
+static void emit(struct tree *tree, unsigned char *data)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int offlen; /* number of offset bytes written for the current node */
+ int offset;
+ int index; /* offset in the emitted data of the next byte to write */
+ int indent;
+ int size;
+ int bytes; /* total size of the leaves emitted below nodes */
+ int leaves;
+ int nodes[4]; /* node counts, indexed by number of offset bytes (0 = singleton) */
+ unsigned char byte;
+
+ nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
+ leaves = 0;
+ bytes = 0;
+ index = tree->index;
+ data += index;
+ indent = 1;
+ if (verbose > 0)
+ printf("Emitting %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF) { /* the tree consists of one leaf only */
+ assert(tree->root);
+ tree->leaf_emit(tree->root, data);
+ size = tree->leaf_size(tree->root);
+ index += size;
+ leaves++;
+ goto done;
+ }
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark)
+ goto skip;
+ assert(node->offset != -1);
+ assert(node->index == index);
+
+ byte = 0;
+ if (node->nextbyte)
+ byte |= NEXTBYTE;
+ byte |= (node->bitnum & BITNUM);
+ if (node->left && node->right) {
+ if (node->leftnode == NODE)
+ byte |= LEFTNODE;
+ if (node->rightnode == NODE)
+ byte |= RIGHTNODE;
+ if (node->offset <= 0xff)
+ offlen = 1;
+ else if (node->offset <= 0xffff)
+ offlen = 2;
+ else
+ offlen = 3;
+ nodes[offlen]++;
+ offset = node->offset;
+ byte |= offlen << OFFLEN_SHIFT;
+ *data++ = byte;
+ index++;
+ while (offlen--) { /* offset bytes, least significant first */
+ *data++ = offset & 0xff;
+ index++;
+ offset >>= 8;
+ }
+ } else if (node->left) { /* singleton with a left child */
+ if (node->leftnode == NODE)
+ byte |= TRIENODE;
+ nodes[0]++;
+ *data++ = byte;
+ index++;
+ } else if (node->right) { /* singleton with a right child */
+ byte |= RIGHTNODE;
+ if (node->rightnode == NODE)
+ byte |= TRIENODE;
+ nodes[0]++;
+ *data++ = byte;
+ index++;
+ } else {
+ assert(0);
+ }
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ data = tree->leaf_emit(node->left,
+ data);
+ size = tree->leaf_size(node->left);
+ index += size;
+ bytes += size;
+ leaves++;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ data = tree->leaf_emit(node->right,
+ data);
+ size = tree->leaf_size(node->right);
+ index += size;
+ bytes += size;
+ leaves++;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ if (verbose > 0) {
+ printf("Emitted %d (%d) leaves",
+ leaves, bytes);
+ printf(" %d (%d+%d+%d+%d) nodes",
+ nodes[0] + nodes[1] + nodes[2] + nodes[3],
+ nodes[0], nodes[1], nodes[2], nodes[3]);
+ printf(" %d total\n", index - tree->index);
+ }
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode data.
+ *
+ * We need to keep track of the Canonical Combining Class, the Age,
+ * and decompositions for a code point.
+ *
+ * For the Age, we store the index into the ages table. Effectively
+ * this is a generation number that the table maps to a unicode
+ * version.
+ *
+ * The correction field is used to indicate that this entry is in the
+ * corrections array, which contains decompositions that were
+ * corrected in later revisions. The value of the correction field is
+ * the Unicode version in which the mapping was corrected.
+ */
+struct unicode_data { /* per code point data; also the leaf type of the trees */
+ unsigned int code; /* the code point itself */
+ int ccc; /* Canonical Combining Class */
+ int gen; /* index into the ages table; negative entries are skipped by trees_populate() */
+ int correction; /* Unicode version in which the mapping was corrected */
+ unsigned int *utf32nfkdi; /* decomposition as code points, terminated by 0, or NULL */
+ unsigned int *utf32nfkdicf; /* second decomposition as code points (the "nfkdicf" variant), terminated by 0, or NULL */
+ char *utf8nfkdi; /* utf32nfkdi encoded as a UTF-8 string by utf8_create(), or NULL */
+ char *utf8nfkdicf; /* utf32nfkdicf as UTF-8; left NULL by utf8_create() when equal to utf8nfkdi */
+};
+
+struct unicode_data unicode_data[0x110000]; /* one entry per code point, indexed by code point */
+struct unicode_data *corrections; /* array of corrections_count corrected entries */
+int corrections_count; /* number of entries in corrections */
+
+struct tree *nfkdi_tree; /* set by trees_reduce() to the last tree in trees */
+struct tree *nfkdicf_tree; /* set by trees_reduce() to the second to last tree in trees */
+
+struct tree *trees; /* array of trees_count trees, allocated by trees_init() */
+int trees_count; /* two trees per age: nfkdicf at even, nfkdi at odd positions */
+
+/*
+ * Look for an entry in the corrections array that has the same code
+ * as u. Returns the first such entry, or u itself when the code
+ * point was never corrected.
+ */
+static struct unicode_data *corrections_lookup(struct unicode_data *u)
+{
+	int idx = 0;
+
+	while (idx != corrections_count) {
+		if (corrections[idx].code == u->code)
+			return &corrections[idx];
+		idx++;
+	}
+	return u;
+}
+
+/*
+ * leaf_equal callback for nfkdi trees: two leaves are equal when
+ * their gen and ccc match and their utf8nfkdi strings are either
+ * both absent or identical. Returns 1 when equal, 0 otherwise.
+ */
+static int nfkdi_equal(void *l, void *r)
+{
+	struct unicode_data *a = l;
+	struct unicode_data *b = r;
+
+	if (a->gen != b->gen || a->ccc != b->ccc)
+		return 0;
+	/* With a string missing, both must be missing. */
+	if (!a->utf8nfkdi || !b->utf8nfkdi)
+		return !a->utf8nfkdi && !b->utf8nfkdi;
+	return strcmp(a->utf8nfkdi, b->utf8nfkdi) == 0;
+}
+
+/*
+ * leaf_equal callback for nfkdicf trees: two leaves are equal when
+ * their gen and ccc match and their decompositions agree. When
+ * either leaf has a utf8nfkdicf string, only those strings decide;
+ * otherwise the utf8nfkdi strings must be both absent or identical.
+ * Returns 1 when equal, 0 otherwise.
+ */
+static int nfkdicf_equal(void *l, void *r)
+{
+	struct unicode_data *a = l;
+	struct unicode_data *b = r;
+
+	if (a->gen != b->gen || a->ccc != b->ccc)
+		return 0;
+
+	if (a->utf8nfkdicf || b->utf8nfkdicf) {
+		/* One side lacking the string can never match. */
+		if (!a->utf8nfkdicf || !b->utf8nfkdicf)
+			return 0;
+		return strcmp(a->utf8nfkdicf, b->utf8nfkdicf) == 0;
+	}
+
+	if (!a->utf8nfkdi || !b->utf8nfkdi)
+		return !a->utf8nfkdi && !b->utf8nfkdi;
+	return strcmp(a->utf8nfkdi, b->utf8nfkdi) == 0;
+}
+
+/*
+ * leaf_print callback for nfkdi trees: print one line describing the
+ * leaf, indented by indent columns. A decomposition whose first byte
+ * is HANGUL is shown as the text "HANGUL SYLLABLE".
+ */
+static void nfkdi_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+	const char *text = NULL;
+
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+	if (leaf->utf8nfkdi) {
+		if (leaf->utf8nfkdi[0] == HANGUL)
+			text = "HANGUL SYLLABLE";
+		else
+			text = (const char*)leaf->utf8nfkdi;
+	}
+	if (text)
+		printf(" nfkdi \"%s\"", text);
+	printf("\n");
+}
+
+/*
+ * leaf_print callback for nfkdicf trees: print one line describing
+ * the leaf, indented by indent columns. The utf8nfkdicf string is
+ * preferred; without it the utf8nfkdi string is shown, with a
+ * decomposition starting with HANGUL printed as "HANGUL SYLLABLE".
+ */
+static void nfkdicf_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+	const char *text;
+
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+	if (leaf->utf8nfkdicf) {
+		printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
+	} else if (leaf->utf8nfkdi) {
+		if (leaf->utf8nfkdi[0] == HANGUL)
+			text = "HANGUL SYLLABLE";
+		else
+			text = (const char*)leaf->utf8nfkdi;
+		printf(" nfkdi \"%s\"", text);
+	}
+	printf("\n");
+}
+
+/*
+ * leaf_mark callback that selects every leaf: always returns 1.
+ */
+static int nfkdi_mark(void *l)
+{
+	(void)l;	/* the leaf itself is not inspected */
+	return 1;
+}
+
+/*
+ * leaf_mark callback that selects the leaves having a utf8nfkdicf
+ * string: returns 1 for those, 0 for all others.
+ */
+static int nfkdicf_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return leaf->utf8nfkdicf ? 1 : 0;
+}
+
+/*
+ * leaf_mark callback that selects corrected entries: returns the
+ * leaf's correction field, which is nonzero for entries to emit.
+ */
+static int correction_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+	int version = leaf->correction;
+
+	return version;
+}
+
+/*
+ * leaf_size callback for nfkdi trees: number of bytes nfkdi_emit()
+ * writes for this leaf. Every leaf takes two bytes; a Hangul
+ * syllable takes one more, and any other decomposition adds its
+ * string including the terminating NUL.
+ */
+static int nfkdi_size(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	if (HANGUL_SYLLABLE(leaf->code))
+		return 2 + 1;
+	if (leaf->utf8nfkdi)
+		return (int)(2 + strlen(leaf->utf8nfkdi) + 1);
+	return 2;
+}
+
+/*
+ * leaf_size callback for nfkdicf trees: number of bytes
+ * nfkdicf_emit() writes for this leaf. Every leaf takes two bytes; a
+ * Hangul syllable takes one more, and any other decomposition adds
+ * its string (utf8nfkdicf if present, else utf8nfkdi) including the
+ * terminating NUL.
+ */
+static int nfkdicf_size(void *l)
+{
+	struct unicode_data *leaf = l;
+	const char *text;
+
+	if (HANGUL_SYLLABLE(leaf->code))
+		return 2 + 1;
+
+	text = leaf->utf8nfkdicf ? leaf->utf8nfkdicf : leaf->utf8nfkdi;
+	if (!text)
+		return 2;
+	return (int)(2 + strlen(text) + 1);
+}
+
+/*
+ * leaf_index callback for nfkdi trees: return the address of the
+ * tree's leafindex slot that belongs to the leaf's code point.
+ */
+static int *nfkdi_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+	int *slot = tree->leafindex + leaf->code;
+
+	return slot;
+}
+
+/*
+ * leaf_index callback for nfkdicf trees: return the address of the
+ * tree's leafindex slot that belongs to the leaf's code point.
+ */
+static int *nfkdicf_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+	int *slot = tree->leafindex + leaf->code;
+
+	return slot;
+}
+
+/*
+ * leaf_emit callback for nfkdi trees. Writes the leaf at data and
+ * returns the position just after it. The first byte is gen. It is
+ * followed by DECOMPOSE and HANGUL for a Hangul syllable, by
+ * DECOMPOSE and the NUL-terminated utf8nfkdi string when there is a
+ * decomposition, and by ccc otherwise.
+ */
+static unsigned char *nfkdi_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *src;
+	unsigned char c;
+
+	*data++ = leaf->gen;
+
+	if (HANGUL_SYLLABLE(leaf->code)) {
+		*data++ = DECOMPOSE;
+		*data++ = HANGUL;
+		return data;
+	}
+	if (!leaf->utf8nfkdi) {
+		*data++ = leaf->ccc;
+		return data;
+	}
+
+	*data++ = DECOMPOSE;
+	src = (unsigned char*)leaf->utf8nfkdi;
+	/* Copy the string including its terminating NUL. */
+	do {
+		c = *src++;
+		*data++ = c;
+	} while (c != 0);
+	return data;
+}
+
+/*
+ * leaf_emit callback for nfkdicf trees. Writes the leaf at data and
+ * returns the position just after it. The first byte is gen. It is
+ * followed by DECOMPOSE and HANGUL for a Hangul syllable, by
+ * DECOMPOSE and a NUL-terminated string (utf8nfkdicf if present,
+ * else utf8nfkdi) when there is a decomposition, and by ccc
+ * otherwise.
+ */
+static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *src;
+	unsigned char c;
+
+	*data++ = leaf->gen;
+
+	if (HANGUL_SYLLABLE(leaf->code)) {
+		*data++ = DECOMPOSE;
+		*data++ = HANGUL;
+		return data;
+	}
+
+	if (leaf->utf8nfkdicf)
+		src = (unsigned char*)leaf->utf8nfkdicf;
+	else if (leaf->utf8nfkdi)
+		src = (unsigned char*)leaf->utf8nfkdi;
+	else
+		src = NULL;
+
+	if (!src) {
+		*data++ = leaf->ccc;
+		return data;
+	}
+
+	*data++ = DECOMPOSE;
+	/* Copy the string including its terminating NUL. */
+	do {
+		c = *src++;
+		*data++ = c;
+	} while (c != 0);
+	return data;
+}
+
+/*
+ * Create the UTF-8 decomposition strings of an entry from its UTF-32
+ * decompositions.
+ *
+ * An entry that already has a utf8nfkdi string must be a Hangul
+ * entry (first byte HANGUL) and is left alone. Otherwise utf8nfkdi
+ * is set from utf32nfkdi when that exists, and utf8nfkdicf is set
+ * from utf32nfkdicf when that exists and differs from utf8nfkdi.
+ *
+ * The strings are built in a fixed-size scratch buffer. A
+ * decomposition that does not fit trips an assertion instead of
+ * overrunning the buffer, and running out of memory while
+ * duplicating a string is fatal.
+ */
+static void utf8_create(struct unicode_data *data)
+{
+	char utf[18*4+1];
+	char *u;
+	unsigned int *um;
+	int i;
+
+	if (data->utf8nfkdi) {
+		assert(data->utf8nfkdi[0] == HANGUL);
+		return;
+	}
+
+	u = utf;
+	um = data->utf32nfkdi;
+	if (um) {
+		for (i = 0; um[i]; i++) {
+			/* Up to 4 bytes are written; keep room for the NUL. */
+			assert((size_t)(u - utf) + 4 < sizeof(utf));
+			u += utf8encode(u, um[i]);
+		}
+		*u = '\0';
+		data->utf8nfkdi = strdup(utf);
+		if (!data->utf8nfkdi) {
+			perror("strdup");
+			exit(1);
+		}
+	}
+	u = utf;
+	um = data->utf32nfkdicf;
+	if (um) {
+		for (i = 0; um[i]; i++) {
+			/* Up to 4 bytes are written; keep room for the NUL. */
+			assert((size_t)(u - utf) + 4 < sizeof(utf));
+			u += utf8encode(u, um[i]);
+		}
+		*u = '\0';
+		/* Only keep the second string when it adds information. */
+		if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf)) {
+			data->utf8nfkdicf = strdup(utf);
+			if (!data->utf8nfkdicf) {
+				perror("strdup");
+				exit(1);
+			}
+		}
+	}
+}
+
+/*
+ * Create the UTF-8 strings for every entry of the unicode_data table
+ * and then for every entry of the corrections array.
+ */
+static void utf8_init(void)
+{
+	unsigned int cp = 0;
+	int n = 0;
+
+	while (cp != 0x110000) {
+		utf8_create(&unicode_data[cp]);
+		cp++;
+	}
+
+	while (n != corrections_count) {
+		utf8_create(&corrections[n]);
+		n++;
+	}
+}
+
+/*
+ * Return the largest correction version found in the corrections
+ * array that is strictly below maxage, or 0 when there is none.
+ */
+static unsigned int trees_next_age(unsigned int maxage)
+{
+	unsigned int nextage = 0;
+	int i;
+
+	/* The array holds exactly corrections_count entries. */
+	for (i = 0; i != corrections_count; i++) {
+		if (nextage < corrections[i].correction &&
+		    corrections[i].correction < maxage)
+			nextage = corrections[i].correction;
+	}
+	return nextage;
+}
+
+/*
+ * Allocate the trees array and set up every tree in it.
+ *
+ * There is one pair of trees (nfkdicf at the even, nfkdi at the odd
+ * position) for each distinct correction version plus one final
+ * pair. Each tree gets its maxage, its forwarding pointer to one of
+ * the trees of the final pair, its leaf_mark callback and the
+ * callbacks matching its type. Running out of memory is fatal.
+ */
+static void trees_init(void)
+{
+	unsigned int maxage;
+	unsigned int nextage;
+	int count;
+	int i;
+	int j;
+
+	/* Count the number of different ages. */
+	count = 0;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		nextage = trees_next_age(maxage);
+		count++;
+	} while (nextage);
+
+	/* Two trees per age: nfkdi and nfkdicf */
+	trees_count = count * 2;
+	trees = calloc(trees_count, sizeof(struct tree));
+	if (!trees) {
+		perror("calloc");
+		exit(1);
+	}
+
+	/* Assign ages to the trees. */
+	count = trees_count;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		trees[--count].maxage = maxage;
+		trees[--count].maxage = maxage;
+		nextage = trees_next_age(maxage);
+	} while (nextage);
+
+	/* The ages assigned above are off by one. */
+	for (i = 0; i != trees_count; i++) {
+		j = 0;
+		while (ages[j] < trees[i].maxage)
+			j++;
+		trees[i].maxage = ages[j-1];
+	}
+
+	/* Set up the forwarding between trees. */
+	trees[trees_count-2].next = &trees[trees_count-1];
+	trees[trees_count-1].leaf_mark = nfkdi_mark;
+	trees[trees_count-2].leaf_mark = nfkdicf_mark;
+	for (i = 0; i != trees_count-2; i += 2) {
+		trees[i].next = &trees[trees_count-2];
+		trees[i].leaf_mark = correction_mark;
+		trees[i+1].next = &trees[trees_count-1];
+		trees[i+1].leaf_mark = correction_mark;
+	}
+
+	/* Assign the callouts. */
+	for (i = 0; i != trees_count; i += 2) {
+		trees[i].type = "nfkdicf";
+		trees[i].leaf_equal = nfkdicf_equal;
+		trees[i].leaf_print = nfkdicf_print;
+		trees[i].leaf_size = nfkdicf_size;
+		trees[i].leaf_index = nfkdicf_index;
+		trees[i].leaf_emit = nfkdicf_emit;
+
+		trees[i+1].type = "nfkdi";
+		trees[i+1].leaf_equal = nfkdi_equal;
+		trees[i+1].leaf_print = nfkdi_print;
+		trees[i+1].leaf_size = nfkdi_size;
+		trees[i+1].leaf_index = nfkdi_index;
+		trees[i+1].leaf_emit = nfkdi_emit;
+	}
+
+	/* Finish init. */
+	for (i = 0; i != trees_count; i++)
+		trees[i].childnode = NODE;
+}
+
+/*
+ * Fill every tree with one leaf per code point whose gen is not
+ * negative. The key is the UTF-8 encoding of the code point. The
+ * leaf is the entry from the corrections array when there is one
+ * whose correction version is above the tree's maxage, and the
+ * regular unicode_data entry otherwise.
+ */
+static void trees_populate(void)
+{
+	struct tree *tree;
+	struct unicode_data *plain;
+	struct unicode_data *data;
+	unsigned int unichar;
+	char keyval[4];
+	int keylen;
+	int i;
+
+	for (i = 0; i != trees_count; i++) {
+		tree = &trees[i];
+		if (verbose > 0) {
+			printf("Populating %s_%x\n",
+				tree->type, tree->maxage);
+		}
+		for (unichar = 0; unichar != 0x110000; unichar++) {
+			plain = &unicode_data[unichar];
+			if (plain->gen < 0)
+				continue;
+			keylen = utf8encode(keyval, unichar);
+			data = corrections_lookup(plain);
+			/* Corrections at or below maxage do not apply. */
+			if (data->correction <= tree->maxage)
+				data = plain;
+			insert(tree, keyval, keylen, data);
+		}
+	}
+}
+
+/*
+ * Reduce the populated trees and emit them into utf8data.
+ *
+ * All trees are pruned and marked. Indexes and node sizes are then
+ * recomputed in turn until size_nodes() reports no more changes, at
+ * which point size is the total number of bytes needed. The output
+ * buffer is allocated and every tree is emitted into it. Finally
+ * the nfkdi/nfkdicf pointers and tree pointers are set from the last
+ * two trees. Running out of memory is fatal.
+ */
+static void trees_reduce(void)
+{
+	int i;
+	int size;
+	int changed;
+
+	for (i = 0; i != trees_count; i++)
+		prune(&trees[i]);
+	for (i = 0; i != trees_count; i++)
+		mark_nodes(&trees[i]);
+	do {
+		size = 0;
+		for (i = 0; i != trees_count; i++)
+			size = index_nodes(&trees[i], size);
+		changed = 0;
+		for (i = 0; i != trees_count; i++)
+			changed += size_nodes(&trees[i]);
+	} while (changed);
+
+	utf8data = calloc(size, 1);
+	if (!utf8data) {
+		/* emit() would write through a NULL pointer. */
+		perror("calloc");
+		exit(1);
+	}
+	utf8data_size = size;
+	for (i = 0; i != trees_count; i++)
+		emit(&trees[i], utf8data);
+
+	if (verbose > 0) {
+		for (i = 0; i != trees_count; i++) {
+			printf("%s_%x idx %d\n",
+				trees[i].type, trees[i].maxage, trees[i].index);
+		}
+	}
+
+	nfkdi = utf8data + trees[trees_count-1].index;
+	nfkdicf = utf8data + trees[trees_count-2].index;
+
+	nfkdi_tree = &trees[trees_count-1];
+	nfkdicf_tree = &trees[trees_count-2];
+}
+
+/*
+ * Check the emitted trie of a tree against the source data.
+ *
+ * Every code point is encoded, looked up with utf8lookup() and the
+ * resulting leaf compared with the entry the tree was populated
+ * from. Each mismatch in presence, gen, ccc or decomposition string
+ * is counted, and one line is printed for every code point with at
+ * least one mismatch. A missing utf8nfkdi string is printed as
+ * "(null)" and counts as a mismatch for a Hangul syllable instead of
+ * being dereferenced.
+ */
+static void verify(struct tree *tree)
+{
+	struct unicode_data *data;
+	utf8leaf_t *leaf;
+	unsigned int unichar;
+	char key[4];
+	unsigned char hangul[UTF8HANGULLEAF];
+	int report;
+	int nocf;
+
+	if (verbose > 0)
+		printf("Verifying %s_%x\n", tree->type, tree->maxage);
+	/* Nonzero for every tree type other than "nfkdicf". */
+	nocf = strcmp(tree->type, "nfkdicf");
+
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		report = 0;
+		data = corrections_lookup(&unicode_data[unichar]);
+		if (data->correction <= tree->maxage)
+			data = &unicode_data[unichar];
+		utf8encode(key,unichar);
+		leaf = utf8lookup(tree, hangul, key);
+
+		if (!leaf) {
+			if (data->gen != -1)
+				report++;
+			if (unichar < 0xd800 || unichar > 0xdfff)
+				report++;
+		} else {
+			if (unichar >= 0xd800 && unichar <= 0xdfff)
+				report++;
+			if (data->gen == -1)
+				report++;
+			if (data->gen != LEAF_GEN(leaf))
+				report++;
+			if (LEAF_CCC(leaf) == DECOMPOSE) {
+				if (HANGUL_SYLLABLE(data->code)) {
+					/* A missing string is a mismatch too. */
+					if (!data->utf8nfkdi ||
+					    data->utf8nfkdi[0] != HANGUL)
+						report++;
+				} else if (nocf) {
+					if (!data->utf8nfkdi) {
+						report++;
+					} else if (strcmp(data->utf8nfkdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				} else {
+					if (!data->utf8nfkdicf &&
+					    !data->utf8nfkdi) {
+						report++;
+					} else if (data->utf8nfkdicf) {
+						if (strcmp(data->utf8nfkdicf,
+							   LEAF_STR(leaf)))
+							report++;
+					} else if (strcmp(data->utf8nfkdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				}
+			} else if (data->ccc != LEAF_CCC(leaf)) {
+				report++;
+			}
+		}
+		if (report) {
+			/* Never hand a NULL pointer to %s. */
+			printf("%X code %X gen %d ccc %d"
+				" nfkdi -> \"%s\"",
+				unichar, data->code, data->gen,
+				data->ccc,
+				data->utf8nfkdi ? data->utf8nfkdi : "(null)");
+			if (leaf) {
+				printf(" gen %d ccc %d"
+					" nfkdi -> \"%s\"",
+					LEAF_GEN(leaf),
+					LEAF_CCC(leaf),
+					LEAF_CCC(leaf) == DECOMPOSE ?
+						LEAF_STR(leaf) : "");
+			}
+			printf("\n");
+		}
+	}
+}
+
+/* Run verify() on every tree, in index order. */
+static void trees_verify(void)
+{
+	int t = 0;
+
+	while (t != trees_count) {
+		verify(&trees[t]);
+		t++;
+	}
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Print the usage text to stdout: a description of the generated trie,
+ * the two normalization forms, and the input/test/output file options
+ * together with their default names.
+ */
+static void help(void)
+{
+	printf("Usage: %s [options]\n", argv0);
+	printf("\n");
+	printf("This program creates a data trie used for parsing and\n");
+	printf("normalization of UTF-8 strings. The trie is derived from\n");
+	printf("a set of input files from the Unicode character database\n");
+	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
+	printf("\n");
+	printf("The generated tree supports two normalization forms:\n");
+	printf("\n");
+	printf("\tnfkdi:\n");
+	printf("\t- Apply unicode normalization form NFKD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\n");
+	printf("\tnfkdicf:\n");
+	printf("\t- Apply unicode normalization form NFKD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\t- Apply a full casefold (C + F).\n");
+	printf("\n");
+	printf("These forms were chosen as being most useful when dealing\n");
+	printf("with file names: NFKD catches most cases where characters\n");
+	printf("should be considered equivalent. The ignorables are mostly\n");
+	printf("invisible, making names hard to type.\n");
+	printf("\n");
+	printf("The options to specify the files to be used are listed\n");
+	printf("below with their default values, which are the names used\n");
+	printf("by version 11.0.0 of the Unicode Character Database.\n");
+	printf("\n");
+	printf("The input files:\n");
+	printf("\t-a %s\n", AGE_NAME);
+	printf("\t-c %s\n", CCC_NAME);
+	printf("\t-p %s\n", PROP_NAME);
+	printf("\t-d %s\n", DATA_NAME);
+	printf("\t-f %s\n", FOLD_NAME);
+	printf("\t-n %s\n", NORM_NAME);
+	printf("\n");
+	printf("Additionally, the generated tables are tested using:\n");
+	printf("\t-t %s\n", TEST_NAME);
+	printf("\n");
+	printf("Finally, the output file:\n");
+	printf("\t-o %s\n", UTF8_NAME);
+	printf("\n");
+}
+
+/* Print the help text and terminate the program with exit status 1. */
+static void usage(void)
+{
+ help();
+ exit(1);
+}
+
+/*
+ * Report a failure to open a file and exit with status 1.
+ *
+ * name  : name of the file that could not be opened.
+ * error : errno-style error code; printed both numerically and via
+ *         strerror().
+ *
+ * The message is written to stdout.  Does not return.
+ */
+static void open_fail(const char *name, int error)
+{
+ printf("Error %d opening %s: %s\n", error, name, strerror(error));
+ exit(1);
+}
+
+/*
+ * Report that filename could not be parsed as a whole and exit with
+ * status 1.  The message is written to stdout.  Does not return.
+ */
+static void file_fail(const char *filename)
+{
+ printf("Error parsing %s\n", filename);
+ exit(1);
+}
+
+/*
+ * Report that a specific line of filename could not be parsed and exit
+ * with status 1.  The offending line text is printed after the file
+ * name.  The message is written to stdout.  Does not return.
+ */
+static void line_fail(const char *filename, const char *line)
+{
+ printf("Error parsing %s:%s\n", filename, line);
+ exit(1);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Print each element of a 0-terminated UTF-32 string as " %X".
+ * No newline is emitted.
+ */
+static void print_utf32(unsigned int *utf32str)
+{
+	unsigned int *cp = utf32str;
+
+	while (*cp) {
+		printf(" %X", *cp);
+		cp++;
+	}
+}
+
+/* Print "unichar -> <utf32nfkdi decomposition>" on one line. */
+static void print_utf32nfkdi(unsigned int unichar)
+{
+	unsigned int *decomposition = unicode_data[unichar].utf32nfkdi;
+
+	printf(" %X ->", unichar);
+	print_utf32(decomposition);
+	printf("\n");
+}
+
+/* Print "unichar -> <utf32nfkdicf decomposition>" on one line. */
+static void print_utf32nfkdicf(unsigned int unichar)
+{
+	unsigned int *decomposition = unicode_data[unichar].utf32nfkdicf;
+
+	printf(" %X ->", unichar);
+	print_utf32(decomposition);
+	printf("\n");
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Parse the age file (age_name).
+ *
+ * The file is read twice.  The first pass counts the "# Age=V..."
+ * headers so that the ages[] table can be sized.  The second pass fills
+ * ages[] (one generation per header, in file order) and stores the
+ * current generation in unicode_data[].gen for every code point or
+ * code point range listed under that header.
+ *
+ * Afterwards the surrogate block D800..DFFF is given gen -1 and
+ * unicode_maxage is set to the age of the last generation seen.
+ *
+ * Any malformed line, an empty file, or too many ages terminates the
+ * program through line_fail()/file_fail().
+ */
+static void age_init(void)
+{
+	FILE *file;
+	unsigned int first;
+	unsigned int last;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int revision;
+	int gen;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", age_name);
+
+	file = fopen(age_name, "r");
+	if (!file)
+		open_fail(age_name, errno);
+
+	/* First pass: count the age headers. */
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "# Age=V%d_%d_%d",
+				&major, &minor, &revision);
+		if (ret == 3) {
+			ages_count++;
+			if (verbose > 1)
+				printf(" Age V%d_%d_%d\n",
+					major, minor, revision);
+			if (!age_valid(major, minor, revision))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+		if (ret == 2) {
+			ages_count++;
+			if (verbose > 1)
+				printf(" Age V%d_%d\n", major, minor);
+			if (!age_valid(major, minor, 0))
+				line_fail(age_name, line);
+			continue;
+		}
+	}
+
+	/* We must have found something above. */
+	if (verbose > 1)
+		printf("%d age entries\n", ages_count);
+	if (ages_count == 0 || ages_count > MAXGEN)
+		file_fail(age_name);
+
+	/* There is a 0 entry. */
+	ages_count++;
+	ages = calloc(ages_count + 1, sizeof(*ages));
+	/* And a guard entry. */
+	ages[ages_count] = (unsigned int)-1;
+
+	/* Second pass: record ages and per-code-point generations. */
+	rewind(file);
+	count = 0;
+	gen = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "# Age=V%d_%d_%d",
+				&major, &minor, &revision);
+		if (ret == 3) {
+			ages[++gen] =
+				UNICODE_AGE(major, minor, revision);
+			if (verbose > 1)
+				printf(" Age V%d_%d_%d = gen %d\n",
+					major, minor, revision, gen);
+			if (!age_valid(major, minor, revision))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+		if (ret == 2) {
+			ages[++gen] = UNICODE_AGE(major, minor, 0);
+			if (verbose > 1)
+				printf(" Age V%d_%d = %d\n",
+					major, minor, gen);
+			if (!age_valid(major, minor, 0))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "%X..%X ; %d.%d #",
+			     &first, &last, &major, &minor);
+		if (ret == 4) {
+			/*
+			 * Validate before indexing unicode_data[], so a
+			 * bad range cannot write outside the table.
+			 */
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(age_name, line);
+			for (unichar = first; unichar <= last; unichar++)
+				unicode_data[unichar].gen = gen;
+			count += 1 + last - first;
+			if (verbose > 1)
+				printf(" %X..%X gen %d\n", first, last, gen);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
+		if (ret == 3) {
+			/* Validate before indexing unicode_data[]. */
+			if (!utf32valid(unichar))
+				line_fail(age_name, line);
+			unicode_data[unichar].gen = gen;
+			count++;
+			if (verbose > 1)
+				printf(" %X gen %d\n", unichar, gen);
+			continue;
+		}
+	}
+	unicode_maxage = ages[gen];
+	fclose(file);
+
+	/* Nix surrogate block */
+	if (verbose > 1)
+		printf(" Removing surrogate block D800..DFFF\n");
+	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
+		unicode_data[unichar].gen = -1;
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(age_name);
+}
+
+/*
+ * Parse the combining class file (ccc_name) and store the canonical
+ * combining class of every listed code point or code point range in
+ * unicode_data[].ccc.
+ *
+ * A malformed code point or a file without any entry terminates the
+ * program through line_fail()/file_fail().
+ */
+static void ccc_init(void)
+{
+	FILE *file;
+	unsigned int first;
+	unsigned int last;
+	unsigned int unichar;
+	unsigned int value;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", ccc_name);
+
+	file = fopen(ccc_name, "r");
+	if (!file)
+		open_fail(ccc_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
+		if (ret == 3) {
+			/*
+			 * Validate before indexing unicode_data[], so a
+			 * bad range cannot write outside the table.
+			 */
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(ccc_name, line);
+			for (unichar = first; unichar <= last; unichar++) {
+				unicode_data[unichar].ccc = value;
+				count++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X ccc %d\n", first, last, value);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %d #", &unichar, &value);
+		if (ret == 2) {
+			/* Validate before indexing unicode_data[]. */
+			if (!utf32valid(unichar))
+				line_fail(ccc_name, line);
+			unicode_data[unichar].ccc = value;
+			count++;
+			if (verbose > 1)
+				printf(" %X ccc %d\n", unichar, value);
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(ccc_name);
+}
+
+/*
+ * Parse the character data file (data_name) and record the raw
+ * decomposition mapping of each code point that has one.
+ *
+ * For every line whose sixth ';'-separated field is non-empty, an
+ * optional leading "<tag> " is skipped and the remaining hexadecimal
+ * code points are stored as a 0-terminated UTF-32 array in
+ * unicode_data[].utf32nfkdi.
+ *
+ * Malformed lines terminate the program through line_fail(); a file
+ * without any usable entry through file_fail().
+ */
+static void nfkdi_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	/* Highest index usable for a code point; one slot is the 0. */
+	const int maplast = (int)(sizeof(mapping) / sizeof(mapping[0])) - 1;
+	char *s;
+	char *end;
+	unsigned int *um;
+	int count;
+	int i;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", data_name);
+	file = fopen(data_name, "r");
+	if (!file)
+		open_fail(data_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+			     &unichar, buf0);
+		if (ret != 2)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(data_name, line);
+
+		s = buf0;
+		/* skip over <tag>, but never past the end of the field */
+		if (*s == '<')
+			while (*s && *s++ != ' ')
+				;
+		/* decode the decomposition into UTF-32 */
+		i = 0;
+		while (*s) {
+			/* Refuse input that would overflow mapping[]. */
+			if (i >= maplast)
+				line_fail(data_name, line);
+			mapping[i] = strtoul(s, &end, 16);
+			/* end == s: nothing parsed, we would loop forever. */
+			if (end == s || !utf32valid(mapping[i]))
+				line_fail(data_name, line);
+			s = end;
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdi = um;
+
+		if (verbose > 1)
+			print_utf32nfkdi(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(data_name);
+}
+
+/*
+ * Parse the case folding file (fold_name) and record the folding of
+ * each code point whose status is 'C' or 'F'.
+ *
+ * The mapping field is decoded from hexadecimal code points into a
+ * 0-terminated UTF-32 array stored in unicode_data[].utf32nfkdicf.
+ *
+ * Malformed lines terminate the program through line_fail(); a file
+ * without any usable entry through file_fail().
+ */
+static void nfkdicf_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	/* Highest index usable for a code point; one slot is the 0. */
+	const int maplast = (int)(sizeof(mapping) / sizeof(mapping[0])) - 1;
+	char status;
+	char *s;
+	char *end;
+	unsigned int *um;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", fold_name);
+	file = fopen(fold_name, "r");
+	if (!file)
+		open_fail(fold_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
+		if (ret != 3)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(fold_name, line);
+		/* Use the C+F casefold. */
+		if (status != 'C' && status != 'F')
+			continue;
+		s = buf0;
+		/* skip over <tag>, but never past the end of the field */
+		if (*s == '<')
+			while (*s && *s++ != ' ')
+				;
+		i = 0;
+		while (*s) {
+			/* Refuse input that would overflow mapping[]. */
+			if (i >= maplast)
+				line_fail(fold_name, line);
+			mapping[i] = strtoul(s, &end, 16);
+			/* end == s: nothing parsed, we would loop forever. */
+			if (end == s || !utf32valid(mapping[i]))
+				line_fail(fold_name, line);
+			s = end;
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdicf = um;
+
+		if (verbose > 1)
+			print_utf32nfkdicf(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(fold_name);
+}
+
+/*
+ * Replace both decompositions of unichar with freshly allocated empty
+ * (0-only) UTF-32 strings, freeing whatever was stored before.
+ */
+static void ignore_set_empty(unsigned int unichar)
+{
+	unsigned int *empty;
+
+	free(unicode_data[unichar].utf32nfkdi);
+	empty = malloc(sizeof(unsigned int));
+	*empty = 0;
+	unicode_data[unichar].utf32nfkdi = empty;
+
+	free(unicode_data[unichar].utf32nfkdicf);
+	empty = malloc(sizeof(unsigned int));
+	*empty = 0;
+	unicode_data[unichar].utf32nfkdicf = empty;
+}
+
+/*
+ * Parse the property file (prop_name) and give every code point that
+ * carries the Default_Ignorable_Code_Point property an empty nfkdi and
+ * nfkdicf decomposition.
+ *
+ * Invalid code points terminate the program through line_fail(); a
+ * file without any matching entry through file_fail().
+ */
+static void ignore_init(void)
+{
+	static const char wanted[] = "Default_Ignorable_Code_Point";
+	FILE *file;
+	unsigned int cp;
+	unsigned int lo;
+	unsigned int hi;
+	int found = 0;
+	int fields;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", prop_name);
+	file = fopen(prop_name, "r");
+	if (!file)
+		open_fail(prop_name, errno);
+	assert(file);
+
+	while (fgets(line, LINESIZE, file)) {
+		/* Range form: "first..last ; property". */
+		fields = sscanf(line, "%X..%X ; %s # ", &lo, &hi, buf0);
+		if (fields == 3) {
+			if (strcmp(buf0, wanted) != 0)
+				continue;
+			if (!utf32valid(lo) || !utf32valid(hi))
+				line_fail(prop_name, line);
+			for (cp = lo; cp <= hi; cp++) {
+				ignore_set_empty(cp);
+				found++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X Default_Ignorable_Code_Point\n",
+					lo, hi);
+			continue;
+		}
+
+		/* Single code point form: "unichar ; property". */
+		fields = sscanf(line, "%X ; %s # ", &cp, buf0);
+		if (fields == 2) {
+			if (strcmp(buf0, wanted) != 0)
+				continue;
+			if (!utf32valid(cp))
+				line_fail(prop_name, line);
+			ignore_set_empty(cp);
+			if (verbose > 1)
+				printf(" %X Default_Ignorable_Code_Point\n",
+					cp);
+			found++;
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", found);
+	if (found == 0)
+		file_fail(prop_name);
+}
+
+/*
+ * Parse the normalization corrections file (norm_name).
+ *
+ * The file is read twice: first to count the entries so that the
+ * corrections[] array can be allocated, then to fill it.  Each entry
+ * starts as a copy of the matching unicode_data[] element, gets its
+ * correction field set to the age given on the line, and gets its
+ * utf32nfkdi replaced by the decomposition parsed from the second
+ * field of the line.
+ *
+ * Malformed lines terminate the program through line_fail(); a file
+ * without any entry through file_fail().
+ */
+static void corrections_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int revision;
+	unsigned int age;
+	unsigned int *um;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	/* Highest index usable for a code point; one slot is the 0. */
+	const int maplast = (int)(sizeof(mapping) / sizeof(mapping[0])) - 1;
+	char *s;
+	char *end;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", norm_name);
+	file = fopen(norm_name, "r");
+	if (!file)
+		open_fail(norm_name, errno);
+
+	/* First pass: count the entries. */
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		count++;
+	}
+	corrections = calloc(count, sizeof(struct unicode_data));
+	corrections_count = count;
+	rewind(file);
+
+	/* Second pass: fill in the entries. */
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		corrections[count] = unicode_data[unichar];
+		assert(corrections[count].code == unichar);
+		age = UNICODE_AGE(major, minor, revision);
+		corrections[count].correction = age;
+
+		i = 0;
+		s = buf0;
+		while (*s) {
+			/* Refuse input that would overflow mapping[]. */
+			if (i >= maplast)
+				line_fail(norm_name, line);
+			mapping[i] = strtoul(s, &end, 16);
+			/* end == s: nothing parsed, we would loop forever. */
+			if (end == s || !utf32valid(mapping[i]))
+				line_fail(norm_name, line);
+			s = end;
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		corrections[count].utf32nfkdi = um;
+
+		if (verbose > 1)
+			printf(" %X -> %s -> %s V%d_%d_%d\n",
+				unichar, buf0, buf1, major, minor, revision);
+		count++;
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(norm_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ * SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ * LVIndex = (SIndex / TCount) * TCount
+ * TIndex = (Sindex % TCount)
+ * LVPart = SBase + LVIndex
+ * TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * TIndex = (Sindex % TCount)
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ * if (TIndex == 0) {
+ * d = <LPart, VPart>
+ * } else {
+ * TPart = TBase + TIndex
+ * d = <LPart, VPart, TPart>
+ * }
+ *
+ */
+
+/*
+ * Compute the full decomposition of every hangul syllable AC00..D7A3
+ * and store a private copy of it in both utf32nfkdi and utf32nfkdicf.
+ * The utf8nfkdi field of each syllable is set to the HANGUL cookie.
+ */
+static void hangul_decompose(void)
+{
+	const unsigned int sbase = 0xAC00;
+	const unsigned int lbase = 0x1100;
+	const unsigned int vbase = 0x1161;
+	const unsigned int tbase = 0x11a7;
+	const unsigned int vcount = 21;
+	const unsigned int tcount = 28;
+	const unsigned int ncount = vcount * tcount;
+	unsigned int unichar;
+	int count = 0;
+
+	if (verbose > 0)
+		printf("Decomposing hangul\n");
+
+	/* Hangul */
+	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
+		unsigned int sindex = unichar - sbase;
+		unsigned int tindex = sindex % tcount;
+		unsigned int mapping[4];
+		unsigned int *copy;
+		size_t bytes;
+		int n = 0;
+
+		/* <LPart, VPart> or <LPart, VPart, TPart>, 0-terminated. */
+		mapping[n++] = lbase + sindex / ncount;
+		mapping[n++] = vbase + (sindex % ncount) / tcount;
+		if (tindex)
+			mapping[n++] = tbase + tindex;
+		mapping[n++] = 0;
+		bytes = n * sizeof(unsigned int);
+
+		assert(!unicode_data[unichar].utf32nfkdi);
+		copy = malloc(bytes);
+		memcpy(copy, mapping, bytes);
+		unicode_data[unichar].utf32nfkdi = copy;
+
+		assert(!unicode_data[unichar].utf32nfkdicf);
+		copy = malloc(bytes);
+		memcpy(copy, mapping, bytes);
+		unicode_data[unichar].utf32nfkdicf = copy;
+
+		/*
+		 * Add a cookie as a reminder that the hangul syllable
+		 * decompositions must not be stored in the generated
+		 * trie.
+		 */
+		unicode_data[unichar].utf8nfkdi = malloc(2);
+		unicode_data[unichar].utf8nfkdi[0] = HANGUL;
+		unicode_data[unichar].utf8nfkdi[1] = '\0';
+
+		if (verbose > 1)
+			print_utf32nfkdi(unichar);
+
+		count++;
+	}
+	if (verbose > 0)
+		printf("Created %d entries\n", count);
+}
+
+/*
+ * Fully expand the utf32nfkdi decomposition of every code point.
+ *
+ * For each code point that has a decomposition, every element that
+ * itself has a decomposition is replaced by that decomposition, and
+ * this is repeated until a pass makes no replacement.  If the code
+ * point has no utf32nfkdicf entry yet, the final expansion is copied
+ * into it as well.
+ */
+static void nfkdi_decompose(void)
+{
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ unsigned int *um;
+ unsigned int *dc;
+ int count;
+ int i;
+ int j;
+ int ret;
+
+ if (verbose > 0)
+ printf("Decomposing nfkdi\n");
+
+ count = 0;
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ if (!unicode_data[unichar].utf32nfkdi)
+ continue;
+ for (;;) {
+ /* ret stays 1 when this pass expanded nothing. */
+ ret = 1;
+ i = 0;
+ um = unicode_data[unichar].utf32nfkdi;
+ while (*um) {
+ dc = unicode_data[*um].utf32nfkdi;
+ if (dc) {
+ /* Splice in the element's own decomposition. */
+ for (j = 0; dc[j]; j++)
+ mapping[i++] = dc[j];
+ ret = 0;
+ } else {
+ mapping[i++] = *um;
+ }
+ um++;
+ }
+ mapping[i++] = 0;
+ if (ret)
+ break;
+ free(unicode_data[unichar].utf32nfkdi);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdi = um;
+ }
+ /*
+ * mapping and i still describe the final expansion here; the
+ * copy below relies on that.
+ */
+ /* Add this decomposition to nfkdicf if there is no entry. */
+ if (!unicode_data[unichar].utf32nfkdicf) {
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+ }
+ if (verbose > 1)
+ print_utf32nfkdi(unichar);
+ count++;
+ }
+ if (verbose > 0)
+ printf("Processed %d entries\n", count);
+}
+
+/*
+ * Fully expand the utf32nfkdicf decomposition of every code point.
+ *
+ * For each code point that has an nfkdicf entry, every element that
+ * itself has an nfkdicf entry is replaced by that entry, and this is
+ * repeated until a pass makes no replacement.
+ */
+static void nfkdicf_decompose(void)
+{
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ unsigned int *um;
+ unsigned int *dc;
+ int count;
+ int i;
+ int j;
+ int ret;
+
+ if (verbose > 0)
+ printf("Decomposing nfkdicf\n");
+ count = 0;
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ if (!unicode_data[unichar].utf32nfkdicf)
+ continue;
+ for (;;) {
+ /* ret stays 1 when this pass expanded nothing. */
+ ret = 1;
+ i = 0;
+ um = unicode_data[unichar].utf32nfkdicf;
+ while (*um) {
+ dc = unicode_data[*um].utf32nfkdicf;
+ if (dc) {
+ /* Splice in the element's own nfkdicf entry. */
+ for (j = 0; dc[j]; j++)
+ mapping[i++] = dc[j];
+ ret = 0;
+ } else {
+ mapping[i++] = *um;
+ }
+ um++;
+ }
+ mapping[i++] = 0;
+ if (ret)
+ break;
+ free(unicode_data[unichar].utf32nfkdicf);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+ }
+ if (verbose > 1)
+ print_utf32nfkdicf(unichar);
+ count++;
+ }
+ if (verbose > 0)
+ printf("Processed %d entries\n", count);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Prototypes for the non-static normalization functions defined
+ * further down in this file.
+ */
+int utf8agemax(struct tree *, const char *);
+int utf8nagemax(struct tree *, const char *, size_t);
+int utf8agemin(struct tree *, const char *);
+int utf8nagemin(struct tree *, const char *, size_t);
+ssize_t utf8len(struct tree *, const char *);
+ssize_t utf8nlen(struct tree *, const char *, size_t);
+struct utf8cursor;
+int utf8cursor(struct utf8cursor *, struct tree *, const char *);
+int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
+int utf8byte(struct utf8cursor *);
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ * SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ * LVIndex = (SIndex / TCount) * TCount
+ * TIndex = (Sindex % TCount)
+ * LVPart = SBase + LVIndex
+ * TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * TIndex = (Sindex % TCount)
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ * if (TIndex == 0) {
+ * d = <LPart, VPart>
+ * } else {
+ * TPart = TBase + TIndex
+ * d = <LPart, VPart, TPart>
+ * }
+ */
+
+/* Constants of the hangul decomposition algorithm described above. */
+#define SB (0xAC00) /* SBase */
+#define LB (0x1100) /* LBase */
+#define VB (0x1161) /* VBase */
+#define TB (0x11A7) /* TBase */
+#define LC (19) /* LCount */
+#define VC (21) /* VCount */
+#define TC (28) /* TCount */
+#define NC (VC * TC) /* NCount = 588 */
+#define SC (LC * NC) /* SCount = 11172 */
+
+/* Algorithmic decomposition of hangul syllable. */
+/*
+ * Build a leaf for a hangul syllable in the caller-supplied buffer.
+ *
+ * str    : points at the UTF-8 encoding of the syllable.
+ * hangul : output buffer; receives a two-byte leaf header (generation
+ *          2, CCC DECOMPOSE) followed by the NUL-terminated UTF-8
+ *          decomposition.
+ *
+ * Returns hangul.
+ */
+static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
+{
+	/* Index of the syllable and the derived L, V and T parts. */
+	unsigned int sindex = utf8decode(str) - SB;
+	unsigned int lindex = sindex / NC;
+	unsigned int vindex = (sindex % NC) / TC;
+	unsigned int tindex = sindex % TC;
+	unsigned char *out = hangul;
+
+	/* Leaf header. */
+	LEAF_GEN(out) = 2;
+	LEAF_CCC(out) = DECOMPOSE;
+	out += 2;
+
+	/* LPart and VPart are always present. */
+	out += utf8encode((char *)out, lindex + LB);
+	out += utf8encode((char *)out, vindex + VB);
+
+	/* TPart only when the syllable has a trailing consonant. */
+	if (tindex)
+		out += utf8encode((char *)out, tindex + TB);
+
+	out[0] = '\0';
+
+	return hangul;
+}
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point. The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+/*
+ * Walk the trie of tree along the bits of s, touching at most len
+ * bytes.
+ *
+ * tree   : tree whose trie (at utf8data + tree->index) is searched.
+ * hangul : scratch buffer used when the leaf found is the hangul
+ *          cookie; the decomposition is then generated into it.
+ * s      : start of the UTF-8 sequence to look up.
+ * len    : maximum number of bytes of s that may be read.
+ *
+ * Returns the leaf, or NULL when tree is NULL, len is 0, the sequence
+ * would extend past len bytes, or the trie has no matching path.
+ */
+static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
+			       const char *s, size_t len)
+{
+	utf8trie_t *trie;
+	int offlen;
+	int offset;
+	int mask;
+	int node;
+
+	if (!tree)
+		return NULL;
+	if (len == 0)
+		return NULL;
+	/* Only dereference tree after the NULL check above. */
+	trie = utf8data + tree->index;
+	node = 1;
+	while (node) {
+		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+		if (*trie & NEXTBYTE) {
+			if (--len == 0)
+				return NULL;
+			s++;
+		}
+		mask = 1 << (*trie & BITNUM);
+		if (*s & mask) {
+			/* Right leg */
+			if (offlen) {
+				/* Right node at offset of trie */
+				node = (*trie & RIGHTNODE);
+				offset = trie[offlen];
+				while (--offlen) {
+					offset <<= 8;
+					offset |= trie[offlen];
+				}
+				trie += offset;
+			} else if (*trie & RIGHTPATH) {
+				/* Right node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			} else {
+				/* No right node. */
+				return NULL;
+			}
+		} else {
+			/* Left leg */
+			if (offlen) {
+				/* Left node after this node. */
+				node = (*trie & LEFTNODE);
+				trie += offlen + 1;
+			} else if (*trie & RIGHTPATH) {
+				/* No left node. */
+				return NULL;
+			} else {
+				/* Left node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			}
+		}
+	}
+	/*
+	 * Hangul decomposition is done algorithmically. These are the
+	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
+	 * always 3 bytes long, so s has been advanced twice, and the
+	 * start of the sequence is at s-2.
+	 */
+	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
+		trie = utf8hangul(s - 2, hangul);
+	return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to utf8nlookup().
+ */
+static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
+			      const char *s)
+{
+	/* Largest size_t value: effectively no length limit. */
+	const size_t unlimited = (size_t)-1;
+
+	return utf8nlookup(tree, hangul, s, unlimited);
+}
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
+ */
+static inline int utf8clen(const char *s)
+{
+	const unsigned char lead = (unsigned char)*s;
+	int nbytes = 1;
+
+	/* Each threshold passed by the lead byte adds one more byte. */
+	if (lead >= 0xC0)
+		nbytes++;
+	if (lead >= 0xE0)
+		nbytes++;
+	if (lead >= 0xF0)
+		nbytes++;
+	return nbytes;
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int utf8agemax(struct tree *tree, const char *s)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	int leaf_age;
+	int newest = 0;
+
+	if (!tree)
+		return -1;
+
+	for (; *s; s += utf8clen(s)) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		/* Ages beyond the tree's maxage do not count. */
+		if (leaf_age > tree->maxage)
+			continue;
+		if (leaf_age > newest)
+			newest = leaf_age;
+	}
+	return newest;
+}
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int utf8agemin(struct tree *tree, const char *s)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	int leaf_age;
+	int oldest;
+
+	if (!tree)
+		return -1;
+
+	/* Start from the tree's maxage and only ever go down. */
+	oldest = tree->maxage;
+	for (; *s; s += utf8clen(s)) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		/* Ages beyond the tree's maxage do not count. */
+		if (leaf_age > tree->maxage)
+			continue;
+		if (leaf_age < oldest)
+			oldest = leaf_age;
+	}
+	return oldest;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemax(struct tree *tree, const char *s, size_t len)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	int leaf_age;
+	int newest = 0;
+	int step;
+
+	if (!tree)
+		return -1;
+
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age > newest)
+			newest = leaf_age;
+		/* Consume the whole sequence from both s and len. */
+		step = utf8clen(s);
+		len -= step;
+		s += step;
+	}
+	return newest;
+}
+
+/*
+ * Minimum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemin(struct tree *tree, const char *s, size_t len)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	int leaf_age;
+	int oldest;
+	int step;
+
+	if (!tree)
+		return -1;
+
+	/* Start from the tree's maxage and only ever go down. */
+	oldest = tree->maxage;
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age < oldest)
+			oldest = leaf_age;
+		/* Consume the whole sequence from both s and len. */
+		step = utf8clen(s);
+		len -= step;
+		s += step;
+	}
+	return oldest;
+}
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
+ */
+ssize_t utf8len(struct tree *tree, const char *s)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	size_t total = 0;
+
+	if (!tree)
+		return -1;
+
+	for (; *s; s += utf8clen(s)) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		/*
+		 * Only characters that are not too new for this tree and
+		 * that decompose contribute their decomposition length;
+		 * everything else contributes its own encoded length.
+		 */
+		if (ages[LEAF_GEN(leaf)] <= tree->maxage &&
+		    LEAF_CCC(leaf) == DECOMPOSE)
+			total += strlen(LEAF_STR(leaf));
+		else
+			total += utf8clen(s);
+	}
+	return total;
+}
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
+{
+	unsigned char hangul[UTF8HANGULLEAF];
+	utf8leaf_t *leaf;
+	size_t total = 0;
+	int step;
+
+	if (!tree)
+		return -1;
+
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		step = utf8clen(s);
+		/*
+		 * Only characters that are not too new for this tree and
+		 * that decompose contribute their decomposition length;
+		 * everything else contributes its own encoded length.
+		 */
+		if (ages[LEAF_GEN(leaf)] <= tree->maxage &&
+		    LEAF_CCC(leaf) == DECOMPOSE)
+			total += strlen(LEAF_STR(leaf));
+		else
+			total += step;
+		len -= step;
+		s += step;
+	}
+	return total;
+}
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+ /* Tree used for all lookups made through this cursor. */
+ struct tree *tree;
+ /* Current read position (input string or a decomposition). */
+ const char *s;
+ /* Input position to resume at after a decomposition; else NULL. */
+ const char *p;
+ /* Saved s from the start of a repeating scan; else NULL. */
+ const char *ss;
+ /* Saved p from the start of a repeating scan. */
+ const char *sp;
+ /* Input bytes left; decomposition bytes are not counted. */
+ unsigned int len;
+ /* Saved len from the start of a repeating scan. */
+ unsigned int slen;
+ /* Combining class currently being emitted. */
+ short int ccc;
+ /* Next combining class to emit, found during the current scan. */
+ short int nccc;
+ /* Code point decoded at s by the most recent utf8byte() lookup. */
+ unsigned int unichar;
+ /* Scratch buffer handed to the lookup functions for hangul leaves. */
+ unsigned char hangul[UTF8HANGULLEAF];
+};
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ * s : string.
+ * len : length of s.
+ * u8c : pointer to cursor.
+ * tree : struct tree to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
+		size_t len)
+{
+	if (!tree || !s)
+		return -1;
+
+	/* Position and limits. */
+	u8c->tree = tree;
+	u8c->s = s;
+	u8c->len = len;
+
+	/* No decomposition and no repeating scan in progress. */
+	u8c->p = NULL;
+	u8c->ss = NULL;
+	u8c->sp = NULL;
+	u8c->slen = 0;
+	u8c->ccc = STOPPER;
+	u8c->nccc = STOPPER;
+	u8c->unichar = 0;
+
+	/*
+	 * Fail if len did not fit into u8c->len, or if the string starts
+	 * with an utf8 continuation byte.
+	 */
+	if (u8c->len != len || (len > 0 && (*s & 0xC0) == 0x80))
+		return -1;
+	return 0;
+}
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ * s : NUL-terminated string.
+ * u8c : pointer to cursor.
+ * tree : struct tree to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
+{
+	/* Largest value that still fits the cursor's unsigned int len. */
+	const size_t unlimited = (unsigned int)-1;
+
+	return utf8ncursor(u8c, tree, s, unlimited);
+}
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on success, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string. The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan. The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ * u8c->p != NULL -> a decomposition is being scanned.
+ * u8c->ss != NULL -> this is a repeating scan.
+ * u8c->ccc == -1 -> this is the first scan of a repeating scan.
+ */
+int utf8byte(struct utf8cursor *u8c)
+{
+	utf8leaf_t *leaf;
+	int ccc;
+
+	for (;;) {
+		/* Check for the end of a decomposed character. */
+		if (u8c->p && *u8c->s == '\0') {
+			u8c->s = u8c->p;
+			u8c->p = NULL;
+		}
+
+		/* Check for end-of-string. */
+		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+			/* There is no next byte. */
+			if (u8c->ccc == STOPPER)
+				return 0;
+			/* End-of-string during a scan counts as a stopper. */
+			ccc = STOPPER;
+			goto ccc_mismatch;
+		} else if ((*u8c->s & 0xC0) == 0x80) {
+			/* This is a continuation of the current character. */
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Look up the data for the current character. */
+		if (u8c->p) {
+			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+		} else {
+			leaf = utf8nlookup(u8c->tree, u8c->hangul,
+					   u8c->s, u8c->len);
+		}
+
+		/* No leaf found implies that the input is a binary blob. */
+		if (!leaf)
+			return -1;
+
+		/* Characters that are too new have CCC 0. */
+		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
+			ccc = STOPPER;
+		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
+			/* Switch to scanning the decomposition string. */
+			u8c->len -= utf8clen(u8c->s);
+			u8c->p = u8c->s + utf8clen(u8c->s);
+			u8c->s = LEAF_STR(leaf);
+			/* Empty decomposition implies CCC 0. */
+			if (*u8c->s == '\0') {
+				if (u8c->ccc == STOPPER)
+					continue;
+				ccc = STOPPER;
+				goto ccc_mismatch;
+			}
+			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+			/*
+			 * Treat a failed lookup like the one above
+			 * instead of dereferencing a NULL leaf.
+			 */
+			if (!leaf)
+				return -1;
+			ccc = LEAF_CCC(leaf);
+		}
+		u8c->unichar = utf8decode(u8c->s);
+
+		/*
+		 * If this is not a stopper, then see if it updates
+		 * the next canonical class to be emitted.
+		 */
+		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+			u8c->nccc = ccc;
+
+		/*
+		 * Return the current byte if this is the current
+		 * combining class.
+		 */
+		if (ccc == u8c->ccc) {
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Current combining class mismatch. */
+ccc_mismatch:
+		if (u8c->nccc == STOPPER) {
+			/*
+			 * Scan forward for the first canonical class
+			 * to be emitted. Save the position from
+			 * which to restart.
+			 */
+			assert(u8c->ccc == STOPPER);
+			u8c->ccc = MINCCC - 1;
+			u8c->nccc = ccc;
+			u8c->sp = u8c->p;
+			u8c->ss = u8c->s;
+			u8c->slen = u8c->len;
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (ccc != STOPPER) {
+			/* Not a stopper, and not the ccc we're emitting. */
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (u8c->nccc != MAXCCC + 1) {
+			/* At a stopper, restart for next ccc. */
+			u8c->ccc = u8c->nccc;
+			u8c->nccc = MAXCCC + 1;
+			u8c->s = u8c->ss;
+			u8c->p = u8c->sp;
+			u8c->len = u8c->slen;
+		} else {
+			/* All done, proceed from here. */
+			u8c->ccc = STOPPER;
+			u8c->nccc = STOPPER;
+			u8c->sp = NULL;
+			u8c->ss = NULL;
+			u8c->slen = 0;
+		}
+	}
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Normalize the UTF-8 string in buf2 with the given tree and compare the
+ * produced bytes against the expected string in buf3.  The comparison is
+ * run twice; before the second run the byte following buf2's terminating
+ * NUL is overwritten with -1.
+ *
+ * NOTE(review): the second run is meant to cover length-limited strings,
+ * but like the first it sets the cursor up through utf8cursor(); the
+ * original behaviour is kept unchanged here.
+ *
+ * Returns 0 when both runs reproduce buf3 exactly, -1 otherwise.
+ */
+static int normalize_line(struct tree *tree)
+{
+	struct utf8cursor u8c;
+	char *input;
+	char *expected;
+	int pass;
+	int byte;
+
+	for (pass = 0; pass < 2; pass++) {
+		input = buf2;
+		if (pass == 1) {
+			/* Plant a value that will cause an error if seen. */
+			input[strlen(input) + 1] = -1;
+		}
+		expected = buf3;
+
+		if (utf8cursor(&u8c, tree, input))
+			return -1;
+
+		for (;;) {
+			byte = utf8byte(&u8c);
+			if (byte <= 0)
+				break;
+			if (byte != (unsigned char)*expected++)
+				return -1;
+		}
+
+		/* A negative byte is a lookup failure; leftovers are a mismatch. */
+		if (byte < 0 || *expected != 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run every usable line of the normalization test file (test_name)
+ * through normalize_line() using the nfkdi tree.
+ *
+ * For each line the first field (source code points) is encoded as UTF-8
+ * into buf2 and the fifth field (expected result) into buf3, leaving out
+ * code points whose utf8nfkdi string is empty.  Failing lines are printed;
+ * file_fail() is called at the end when at least one line failed.
+ */
+static void normalization_test(void)
+{
+	FILE *fp;
+	struct unicode_data *ud;
+	unsigned int cp;
+	char *in;
+	char *out;
+	int fields;
+	int dropped;
+	int ran = 0;
+	int bad = 0;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", test_name);
+
+	/* Step one, read data from file. */
+	fp = fopen(test_name, "r");
+	if (!fp)
+		open_fail(test_name, errno);
+
+	while (fgets(line, LINESIZE, fp)) {
+		fields = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+				buf0, buf1);
+		if (*line == '#' || fields != 2)
+			continue;
+
+		/* Source sequence -> buf2. */
+		out = buf2;
+		for (in = buf0; *in; ) {
+			cp = strtoul(in, &in, 16);
+			out += utf8encode(out, cp);
+		}
+		*out = '\0';
+
+		/* Expected sequence -> buf3, minus ignorable code points. */
+		dropped = 0;
+		out = buf3;
+		for (in = buf1; *in; ) {
+			cp = strtoul(in, &in, 16);
+			ud = &unicode_data[cp];
+			if (ud->utf8nfkdi && !*ud->utf8nfkdi) {
+				dropped = 1;
+				continue;
+			}
+			out += utf8encode(out, cp);
+		}
+		*out = '\0';
+
+		ran++;
+		if (normalize_line(nfkdi_tree) >= 0)
+			continue;
+
+		printf("Line %s -> %s%s failure\n", buf0, buf1,
+		       dropped ? " (ignorables removed)" : "");
+		bad++;
+	}
+	fclose(fp);
+
+	if (verbose > 0)
+		printf("Ran %d tests with %d failures\n", ran, bad);
+	if (bad)
+		file_fail(test_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Write one "struct utf8data" table: the opening line, one
+ * "{ age, tree index }" row per entry of ages[], and the closing brace
+ * followed by a blank line.  `t` is the index in trees[] to start from;
+ * it advances by two each time the current tree's maxage equals the age
+ * just written.
+ */
+static void write_age_table(FILE *file, const char *opening, int t)
+{
+	int gen;
+
+	fputs(opening, file);
+	for (gen = 0; gen < ages_count; gen++) {
+		fprintf(file, "\t{ %#x, %d }%s\n",
+			ages[gen], trees[t].index,
+			ages[gen] == unicode_maxage ? "" : ",");
+		if (trees[t].maxage == ages[gen])
+			t += 2;
+	}
+	fputs("};\n\n", file);
+}
+
+/*
+ * Generate the C header named by utf8_name: the version constant, the
+ * age table, the two per-age tree index tables, and the utf8data[] byte
+ * array dumped sixteen bytes per line with a comment wherever a tree
+ * starts.
+ */
+static void write_file(void)
+{
+	FILE *out;
+	int row;
+	int col;
+	int tree_no;
+	int gen;
+
+	if (verbose > 0)
+		printf("Writing %s\n", utf8_name);
+	out = fopen(utf8_name, "w");
+	if (!out)
+		open_fail(utf8_name, errno);
+
+	fputs("/* This file is generated code, do not edit. */\n"
+	      "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"
+	      "#error Only nls_utf8-norm.c should include this file.\n"
+	      "#endif\n"
+	      "\n", out);
+	fprintf(out, "static const unsigned int utf8vers = %#x;\n",
+		unicode_maxage);
+	fputs("\n"
+	      "static const unsigned int utf8agetab[] = {\n", out);
+	for (gen = 0; gen != ages_count; gen++)
+		fprintf(out, "\t%#x%s\n", ages[gen],
+			ages[gen] == unicode_maxage ? "" : ",");
+	fputs("};\n\n", out);
+
+	/* Even entries of trees[] feed the first table, odd ones the second. */
+	write_age_table(out,
+		"static const struct utf8data utf8nfkdicfdata[] = {\n", 0);
+	write_age_table(out,
+		"static const struct utf8data utf8nfkdidata[] = {\n", 1);
+
+	fprintf(out, "static const unsigned char utf8data[%zd] = {\n",
+		utf8data_size);
+	tree_no = 0;
+	for (row = 0; row != utf8data_size; row += 16) {
+		if (row == trees[tree_no].index) {
+			fprintf(out, "\t/* %s_%x */\n",
+				trees[tree_no].type, trees[tree_no].maxage);
+			if (tree_no < trees_count-1)
+				tree_no++;
+		}
+		fputs("\t", out);
+		for (col = row; col != row + 16; col++)
+			fprintf(out, "0x%.2x%s", utf8data[col],
+				(col < utf8data_size -1 ? "," : ""));
+		fputs("\n", out);
+	}
+	fputs("};\n", out);
+	fclose(out);
+}
+
+/* ------------------------------------------------------------------ */
+
+int main(int argc, char *argv[])
+{ /*
+   * Parse the command line, build the lookup trees, run the
+   * normalization self-test and write the generated header.
+   * Always returns 0; the helpers called here are expected to
+   * report fatal errors themselves.
+   */
+ unsigned int unichar;
+ int opt;
+
+ argv0 = argv[0];
+
+ while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
+ switch (opt) { /* each lettered option with an argument just records optarg */
+ case 'a':
+ age_name = optarg;
+ break;
+ case 'c':
+ ccc_name = optarg;
+ break;
+ case 'd':
+ data_name = optarg;
+ break;
+ case 'f':
+ fold_name = optarg;
+ break;
+ case 'n':
+ norm_name = optarg;
+ break;
+ case 'o':
+ utf8_name = optarg; /* output file, opened for writing by write_file() */
+ break;
+ case 'p':
+ prop_name = optarg;
+ break;
+ case 't':
+ test_name = optarg; /* test data, read by normalization_test() */
+ break;
+ case 'v':
+ verbose++; /* may be repeated; higher levels enable more output below */
+ break;
+ case 'h':
+ help();
+ exit(0);
+ default:
+ usage();
+ }
+ }
+
+ if (verbose > 1)
+ help();
+ for (unichar = 0; unichar != 0x110000; unichar++) /* one entry per code point, 0 .. 0x10FFFF */
+ unicode_data[unichar].code = unichar;
+ age_init(); /* the calls below run in this fixed order */
+ ccc_init();
+ nfkdi_init();
+ nfkdicf_init();
+ ignore_init();
+ corrections_init();
+ hangul_decompose();
+ nfkdi_decompose();
+ nfkdicf_decompose();
+ utf8_init();
+ trees_init();
+ trees_populate();
+ trees_reduce();
+ trees_verify();
+ /* Prevent "unused function" warning. */
+ (void)lookup(nfkdi_tree, " ");
+ if (verbose > 2)
+ tree_walk(nfkdi_tree);
+ if (verbose > 2)
+ tree_walk(nfkdicf_tree);
+ normalization_test();
+ write_file();
+
+ return 0;
+}
diff --git a/util/subst.c b/util/subst.c
new file mode 100644
index 0000000..be2a0dd
--- /dev/null
+++ b/util/subst.c
@@ -0,0 +1,468 @@
+/*
+ * subst.c --- substitution program
+ *
+ * Subst is used as a quickie program to do @ substitutions
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#else
+#define HAVE_SYS_STAT_H
+#define HAVE_SYS_TIME_H
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#include <fcntl.h>
+#include <time.h>
+#include <utime.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#else
+extern char *optarg;
+extern int optind;
+#endif
+
+
+struct subst_entry { /* one NAME -> VALUE substitution, kept on a singly linked list */
+ char *name; /* heap copy of the symbol name, made by add_subst() */
+ char *value; /* heap copy of the replacement text, made by add_subst() */
+ struct subst_entry *next; /* entry that was the list head when this one was added */
+};
+
+static struct subst_entry *subst_table = 0; /* list head; add_subst() pushes here, fetch_subst_entry() searches from here */
+
+/*
+ * Record a substitution: private copies of `name` and `value` are made
+ * and the new entry is pushed onto the front of subst_table, so a later
+ * definition of the same name is found first.
+ *
+ * Returns 0 on success, ENOMEM if any allocation fails (nothing is added
+ * and nothing is leaked in that case).
+ */
+static int add_subst(char *name, char *value)
+{
+	struct subst_entry *node;
+
+	node = (struct subst_entry *) malloc(sizeof(*node));
+	if (node == NULL)
+		return ENOMEM;
+
+	node->name = (char *) malloc(strlen(name)+1);
+	/* Only try for the value once the name buffer is in hand. */
+	node->value = node->name ? (char *) malloc(strlen(value)+1) : NULL;
+	if (node->name == NULL || node->value == NULL) {
+		free(node->name);
+		free(node);
+		return ENOMEM;
+	}
+
+	strcpy(node->name, name);
+	strcpy(node->value, value);
+
+	node->next = subst_table;
+	subst_table = node;
+	return 0;
+}
+
+/*
+ * Look `name` up in subst_table.  Returns the first entry whose name
+ * compares equal, or NULL when there is none.
+ */
+static struct subst_entry *fetch_subst_entry(char *name)
+{
+	struct subst_entry *cur = subst_table;
+
+	while (cur != NULL) {
+		if (!strcmp(name, cur->name))
+			return cur;
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+/*
+ * Given the start and length of a candidate replacement name, check
+ * that it is a valid symbol and copy it out if it is.
+ *
+ * The name is copied into a static buffer, preceded by `prefix` when
+ * that is non-zero.  NULL is returned when the name is too long for
+ * the buffer or contains a character outside [0-9A-Za-z_].  The
+ * returned pointer is only valid until the next call.
+ */
+static char *get_subst_symbol(const char *begin, size_t len, char prefix)
+{
+	static char replace_name[128];
+	static const char symbol_chars[] =
+		"abcdefghijklmnopqrstuvwxyz"
+		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		"0123456789_";
+	char *start = replace_name;
+
+	if (prefix)
+		*start++ = prefix;
+
+	/* Leave room for the optional prefix and the terminating NUL. */
+	if (len > sizeof(replace_name)-2)
+		return NULL;
+	memcpy(start, begin, len);
+	start[len] = 0;
+
+	/*
+	 * The substitution variable must consist entirely of characters
+	 * in the set [0-9A-Za-z_].  If the first character outside that
+	 * set is not the terminator, this is an invalid symbol name.
+	 */
+	if (start[strspn(start, symbol_chars)] != '\0')
+		return NULL;
+
+	return (replace_name);
+}
+
+/*
+ * Replace the token that starts at `begin` and ends (inclusive) at `end`
+ * with `newstr`, in place.  `begin` points at the opening delimiter and
+ * `end` at the closing one; both lie inside the same NUL-terminated
+ * buffer.  The text after the token is shifted left or right as needed.
+ *
+ * The caller must make sure the buffer has room for the result; this
+ * function cannot check, since it does not know the buffer size.
+ *
+ * The tail is measured from `end + 1`, so exactly the remaining text and
+ * its terminator are moved.  (The previous empty-replacement path copied
+ * strlen(end) + 1 bytes from end + 1, one byte beyond the terminator.)
+ */
+static void replace_string(char *begin, char *end, char *newstr)
+{
+	size_t replace_len = strlen(newstr);
+	char *tail = end + 1;
+
+	/* memmove copes with the overlap whichever way the tail moves. */
+	memmove(begin + replace_len, tail, strlen(tail) + 1);
+	memcpy(begin, newstr, replace_len);
+}
+
+/*
+ * Expand substitutions in the NUL-terminated buffer `line`, in place.
+ *
+ * Pass one handles @NAME@ (and turns "@@" into a literal '@'); pass two
+ * handles ${NAME}, which is looked up as "$NAME".  Names are resolved
+ * with fetch_subst_entry().  In pass one an unknown name is reported on
+ * stderr and left alone; in pass two it is silently left alone.
+ *
+ * Replacement text is rescanned, so values may themselves contain
+ * substitutions.  The buffer must be large enough for the expanded
+ * text: nothing here knows or checks its size.
+ */
+static void substitute_line(char *line)
+{
+	char	*ptr, *name_ptr, *end_ptr;
+	struct subst_entry *ent;
+	char	*replace_name;
+	size_t	len;
+
+	/*
+	 * Expand all @FOO@ substitutions
+	 */
+	ptr = line;
+	while (ptr) {
+		name_ptr = strchr(ptr, '@');
+		if (!name_ptr)
+			break;	/* No more */
+		if (*(++name_ptr) == '@') {
+			/*
+			 * Handle tytso@@mit.edu --> tytso@mit.edu
+			 */
+			memmove(name_ptr-1, name_ptr, strlen(name_ptr)+1);
+			/*
+			 * After the shift name_ptr already addresses the
+			 * character that followed "@@".  Resume there:
+			 * stepping one further would skip a character
+			 * (possibly the '@' opening the next token) and,
+			 * when "@@" ended the string, would walk past the
+			 * terminating NUL.
+			 */
+			ptr = name_ptr;
+			continue;
+		}
+		end_ptr = strchr(name_ptr, '@');
+		if (!end_ptr)
+			break;
+		len = end_ptr - name_ptr;
+		replace_name = get_subst_symbol(name_ptr, len, 0);
+		if (!replace_name) {
+			/* Not a symbol; the closing '@' may open the next one. */
+			ptr = name_ptr;
+			continue;
+		}
+		ent = fetch_subst_entry(replace_name);
+		if (!ent) {
+			fprintf(stderr, "Unfound expansion: '%s'\n",
+				replace_name);
+			ptr = end_ptr + 1;
+			continue;
+		}
+#if 0
+		fprintf(stderr, "Replace name = '%s' with '%s'\n",
+		       replace_name, ent->value);
+#endif
+		/* Rescan from the start of the replacement text. */
+		ptr = name_ptr-1;
+		replace_string(ptr, end_ptr, ent->value);
+		if ((ent->value[0] == '@') &&
+		    (strlen(replace_name) == strlen(ent->value)-2) &&
+		    !strncmp(replace_name, ent->value+1,
+			     strlen(ent->value)-2))
+			/* avoid an infinite loop */
+			ptr += strlen(ent->value);
+	}
+	/*
+	 * Now do a second pass to expand ${FOO}
+	 */
+	ptr = line;
+	while (ptr) {
+		name_ptr = strchr(ptr, '$');
+		if (!name_ptr)
+			break;	/* No more */
+		if (*(++name_ptr) != '{') {
+			ptr = name_ptr;
+			continue;
+		}
+		name_ptr++;
+		end_ptr = strchr(name_ptr, '}');
+		if (!end_ptr)
+			break;
+		len = end_ptr - name_ptr;
+		/* ${FOO} entries are stored under the name "$FOO". */
+		replace_name = get_subst_symbol(name_ptr, len, '$');
+		if (!replace_name) {
+			ptr = name_ptr;
+			continue;
+		}
+		ent = fetch_subst_entry(replace_name);
+		if (!ent) {
+			ptr = end_ptr + 1;
+			continue;
+		}
+#if 0
+		fprintf(stderr, "Replace name = '%s' with '%s'\n",
+		       replace_name, ent->value);
+#endif
+		/* Back up over "${" so the whole token is replaced. */
+		ptr = name_ptr-2;
+		replace_string(ptr, end_ptr, ent->value);
+	}
+}
+
+/*
+ * Read substitution definitions from `f` and register each one with
+ * add_subst().
+ *
+ * Each line has the form "NAME [VALUE]".  Everything from the first '#'
+ * on is a comment, surrounding blanks are dropped, empty lines are
+ * skipped, and lines starting with '@' are ignored (reserved for future
+ * extensions).  A NAME with nothing after it gets an empty value.
+ *
+ * Running out of memory while recording an entry is fatal.
+ */
+static void parse_config_file(FILE *f)
+{
+	char line[2048];
+	char *cp, *ptr;
+	size_t len;
+
+	while (fgets(line, sizeof(line), f) != NULL) {
+		/*
+		 * Strip newlines and comments.
+		 */
+		cp = strchr(line, '\n');
+		if (cp)
+			*cp = 0;
+		cp = strchr(line, '#');
+		if (cp)
+			*cp = 0;
+		/*
+		 * Skip trailing and leading whitespace
+		 */
+		len = strlen(line);
+		while (len > 0 &&
+		       (line[len - 1] == ' ' || line[len - 1] == '\t'))
+			line[--len] = 0;
+		ptr = line;
+		while (*ptr && isspace((unsigned char) *ptr))
+			ptr++;
+		/*
+		 * Skip empty lines
+		 */
+		if (*ptr == 0)
+			continue;
+		/*
+		 * Ignore future extensions
+		 */
+		if (*ptr == '@')
+			continue;
+		/*
+		 * Parse substitutions: the name runs up to the first
+		 * whitespace character, the value is whatever follows.
+		 */
+		for (cp = ptr; *cp; cp++)
+			if (isspace((unsigned char) *cp))
+				break;
+		if (*cp) {
+			*cp++ = 0;
+			while (*cp && isspace((unsigned char) *cp))
+				cp++;
+		}
+		/*
+		 * When the name ended at the terminator, cp is left on it
+		 * so the value is "".  Stepping past it would pick up
+		 * whatever follows in the buffer, e.g. the comment text of
+		 * a line such as "NAME#comment".
+		 */
+#if 0
+		printf("Substitute: '%s' for '%s'\n", ptr, cp ? cp : "<NULL>");
+#endif
+		if (add_subst(ptr, cp)) {
+			fprintf(stderr, "Memory error!  Exiting.\n");
+			exit(1);
+		}
+	}
+}
+
+/*
+ * Compare two streams chunk by chunk from their current positions.
+ * Returns 1 if both hit end-of-file together with every chunk equal,
+ * 0 as soon as a chunk differs or one stream ends before the other.
+ */
+static int compare_file(FILE *old_f, FILE *new_f)
+{
+	char old_chunk[2048], new_chunk[2048];
+	char *got_old, *got_new;
+
+	for (;;) {
+		got_old = fgets(old_chunk, sizeof(old_chunk), old_f);
+		got_new = fgets(new_chunk, sizeof(new_chunk), new_f);
+
+		/* Both exhausted at the same time: identical. */
+		if (got_old == NULL && got_new == NULL)
+			return 1;
+		/* Only one exhausted: different lengths. */
+		if (got_old == NULL || got_new == NULL)
+			return 0;
+		if (strcmp(old_chunk, new_chunk) != 0)
+			return 0;
+	}
+}
+
+/*
+ * Set the access and modification times of a file from times[0] and
+ * times[1], using futimes() on `fd`, utimes() on `filename`, or utime()
+ * on `filename` (whole seconds only), depending on which of HAVE_FUTIMES
+ * and HAVE_UTIMES is configured.  A failure is reported with perror()
+ * and otherwise ignored.
+ */
+void set_utimes(const char *filename, int fd, const struct timeval times[2])
+{
+#ifdef HAVE_FUTIMES
+	const char *call = "futimes";
+	int status = futimes(fd, times);
+#elif HAVE_UTIMES
+	const char *call = "utimes";
+	int status = utimes(filename, times);
+#else
+	const char *call = "utime";
+	struct utimbuf ut;
+	int status;
+
+	ut.actime = times[0].tv_sec;
+	ut.modtime = times[1].tv_sec;
+	status = utime(filename, &ut);
+#endif
+	if (status < 0)
+		perror(call);
+}
+
+
+/*
+ * subst [-f config-file]... [-t] [-v] [infile [outfile]]
+ *
+ * Copies infile (default stdin) to outfile (default stdout), expanding
+ * substitutions on every line.  With an outfile the result is first
+ * written to "<outfile>.new"; if it is identical to the existing outfile
+ * the new file is discarded (and, with -t, the outfile's modification
+ * time is refreshed), otherwise it is renamed over outfile.
+ *
+ * Returns 0 on success; exits with status 1 on fatal errors.
+ */
+int main(int argc, char **argv)
+{
+	char	line[2048];
+	int	c;
+	int	fd, ofd = -1;
+	FILE	*in, *out, *old = NULL;
+	char	*outfn = NULL, *newfn = NULL;
+	int	verbose = 0;
+	int	adjust_timestamp = 0;
+	int	got_atime = 0;
+	struct stat stbuf;
+	struct timeval tv[2];
+
+	while ((c = getopt (argc, argv, "f:tv")) != EOF) {
+		switch (c) {
+		case 'f':
+			in = fopen(optarg, "r");
+			if (!in) {
+				perror(optarg);
+				exit(1);
+			}
+			parse_config_file(in);
+			fclose(in);
+			break;
+		case 't':
+			adjust_timestamp++;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		default:
+			fprintf(stderr, "%s: [-f config-file] [file]\n",
+				argv[0]);
+			break;
+		}
+	}
+	if (optind < argc) {
+		in = fopen(argv[optind], "r");
+		if (!in) {
+			perror(argv[optind]);
+			exit(1);
+		}
+		optind++;
+	} else
+		in = stdin;
+
+	if (optind < argc) {
+		outfn = argv[optind];
+		newfn = (char *) malloc(strlen(outfn)+20);
+		if (!newfn) {
+			fprintf(stderr, "Memory error!  Exiting.\n");
+			exit(1);
+		}
+		strcpy(newfn, outfn);
+		strcat(newfn, ".new");
+		ofd = open(newfn, O_CREAT|O_TRUNC|O_RDWR, 0644);
+		if (ofd < 0) {
+			perror(newfn);
+			exit(1);
+		}
+		out = fdopen(ofd, "w+");
+		if (!out) {
+			perror("fdopen");
+			exit(1);
+		}
+
+		/*
+		 * A missing outfile is not an error: there is simply
+		 * nothing to compare against.  Descriptor 0 is a valid
+		 * result of open(), so only negative values mean failure.
+		 */
+		fd = open(outfn, O_RDONLY);
+		if (fd >= 0) {
+			/* save the original atime, if possible */
+			if (fstat(fd, &stbuf) == 0) {
+#if HAVE_STRUCT_STAT_ST_ATIM
+				tv[0].tv_sec = stbuf.st_atim.tv_sec;
+				tv[0].tv_usec = stbuf.st_atim.tv_nsec / 1000;
+#else
+				tv[0].tv_sec = stbuf.st_atime;
+				tv[0].tv_usec = 0;
+#endif
+				got_atime = 1;
+			}
+			old = fdopen(fd, "r");
+			if (!old)
+				close(fd);
+		}
+	} else {
+		out = stdout;
+		outfn = 0;
+	}
+
+	while (!feof(in)) {
+		if (fgets(line, sizeof(line), in) == NULL)
+			break;
+		substitute_line(line);
+		fputs(line, out);
+	}
+	fclose(in);
+	if (outfn) {
+		/*
+		 * Do not compare or install a file that could not be
+		 * written completely (e.g. the disk filled up).
+		 */
+		if (fflush(out) != 0 || ferror(out)) {
+			perror(newfn);
+			(void) unlink(newfn);
+			exit(1);
+		}
+		rewind(out);
+		if (old && compare_file(old, out)) {
+			if (verbose)
+				printf("No change, keeping %s.\n", outfn);
+			if (adjust_timestamp) {
+				if (verbose)
+					printf("Updating modtime for %s\n", outfn);
+				if (gettimeofday(&tv[1], NULL) < 0) {
+					perror("gettimeofday");
+					exit(1);
+				}
+				if (got_atime == 0)
+					tv[0] = tv[1];
+				else if (verbose)
+					printf("Using original atime\n");
+				set_utimes(outfn, fileno(old), tv);
+			}
+#ifndef _WIN32
+			if (ofd >= 0)
+				(void) fchmod(ofd, 0444);
+#endif
+			fclose(out);
+			if (unlink(newfn) < 0)
+				perror("unlink");
+		} else {
+			if (verbose)
+				printf("Creating or replacing %s.\n", outfn);
+#ifndef _WIN32
+			/* Generated files are installed read-only. */
+			if (ofd >= 0)
+				(void) fchmod(ofd, 0444);
+#endif
+			fclose(out);
+			if (old)
+				fclose(old);
+			old = NULL;
+			if (rename(newfn, outfn) < 0) {
+				perror("rename");
+				exit(1);
+			}
+		}
+	}
+	if (old)
+		fclose(old);
+	if (newfn)
+		free(newfn);
+	return (0);
+}
+
+
diff --git a/util/subst.conf.in b/util/subst.conf.in
new file mode 100644
index 0000000..0da4554
--- /dev/null
+++ b/util/subst.conf.in
@@ -0,0 +1,26 @@
+AWK @AWK@
+SED @SED@
+E2FSPROGS_MONTH @E2FSPROGS_MONTH@
+E2FSPROGS_YEAR @E2FSPROGS_YEAR@
+E2FSPROGS_DATE @E2FSPROGS_DATE@
+E2FSPROGS_VERSION @E2FSPROGS_VERSION@
+SIZEOF_LONG_LONG @SIZEOF_LONG_LONG@
+SIZEOF_LONG @SIZEOF_LONG@
+SIZEOF_INT @SIZEOF_INT@
+SIZEOF_SHORT @SIZEOF_SHORT@
+datarootdir @datarootdir@
+datadir @datadir@
+root_sysconfdir @root_sysconfdir@
+$datarootdir @datarootdir@
+$root_prefix @root_prefix@
+$prefix @prefix@
+# Enable the documentation for the journal device mke2fs, tune2fs, and
+# e2fsck's man page
+JDEV
+# Enable the documentation for the tdb profile in e2fsck.conf's man page
+TDB_MAN_COMMENT @TDB_MAN_COMMENT@
+root_sbindir @root_sbindir@
+root_bindir @root_bindir@
+libdir @libdir@
+$exec_prefix @exec_prefix@
+pkglibdir @libdir@/e2fsprogs
diff --git a/util/symlinks.c b/util/symlinks.c
new file mode 100644
index 0000000..e9d2b01
--- /dev/null
+++ b/util/symlinks.c
@@ -0,0 +1,391 @@
+#define _FILE_OFFSET_BITS 64
+#ifndef _LARGEFILE_SOURCE
+#define _LARGEFILE_SOURCE
+#endif
+#ifndef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE
+#endif
+
+#include <unistd.h>
+#ifndef _POSIX_SOURCE
+#define _POSIX_SOURCE
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#include <string.h>
+#include <fcntl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <time.h>
+#include <stddef.h>
+#include <errno.h>
+
+#ifndef S_ISLNK
+#define S_ISLNK(mode) (((mode) & (_S_IFMT)) == (_S_IFLNK))
+#endif
+
+#ifndef PATH_MAX
+#define PATH_MAX 1024
+#endif
+
+#define progver "%s: scan/change symbolic links - v1.3 - by Mark Lord\n\n" /* banner format; usage_error() passes progname for the %s */
+static char *progname; /* last path component of argv[0], set at the top of main() */
+static int verbose = 0, fix_links = 0, recurse = 0, delete = 0, shorten = 0,
+ testing = 0, single_fs = 1; /* option flags: main() sets them for -v -c -r -d -s -t, and clears single_fs for -o */
+
+/*
+ * tidy_path() (defined below) removes excess slashes and "." references
+ * from a path string; substr() is the in-place replace helper it uses.
+ */
+
+/*
+ * Replace every occurrence of `old` in the string `s` with `new`, in
+ * place, scanning left to right and continuing after each replacement.
+ * A NULL `new` deletes the occurrences.
+ *
+ * When `new` is longer than `old` the string grows; the caller must
+ * provide a buffer large enough for the result.
+ *
+ * Returns 1 if at least one replacement was made, 0 otherwise (including
+ * when `old` is empty).
+ *
+ * No scratch buffer is needed: memmove() shifts the tail in either
+ * direction.  The earlier version allocated strlen(s) bytes for a
+ * strcpy() that needs strlen(s) + 1, and resumed its search one byte
+ * past the terminating NUL.
+ */
+static int substr (char *s, char *old, char *new)
+{
+	size_t oldlen = strlen(old);
+	size_t newlen = new ? strlen(new) : 0;
+	int replaced = 0;
+
+	/* An empty pattern matches everywhere; treat it as "not found". */
+	if (oldlen == 0)
+		return 0;
+
+	while (NULL != (s = strstr(s, old))) {
+		/* Move the text after the match, terminator included. */
+		memmove(s + newlen, s + oldlen, strlen(s + oldlen) + 1);
+		if (newlen)
+			memcpy(s, new, newlen);
+		/* Continue after the inserted text, never inside it. */
+		s += newlen;
+		replaced = 1;
+	}
+	return replaced;
+}
+
+
+/*
+ * Clean up `path` in place: collapse "/./" and "//", resolve "dir/../"
+ * where the preceding component starts with a '/', drop trailing
+ * slashes and strip leading "./".  An empty path becomes "/".
+ *
+ * The buffer must have room for one extra character beyond the string,
+ * because a trailing '/' is appended temporarily.
+ *
+ * Returns 1 if the path was changed in a way that matters to the
+ * caller, 0 otherwise (removing a single trailing slash does not count).
+ */
+static int tidy_path (char *path)
+{
+	int tidied = 0;
+	char *s, *p;
+	size_t len = strlen(path);
+
+	/*
+	 * tmp trailing slash simplifies things.  The length is checked
+	 * first so that an empty path does not look at path[-1].
+	 */
+	if (len == 0 || path[len - 1] != '/') {
+		path[len] = '/';
+		path[len + 1] = '\0';
+	}
+	while (substr(path, "/./", "/"))
+		tidied = 1;
+	while (substr(path, "//", "/"))
+		tidied = 1;
+
+	while ((p = strstr(path,"/../")) != NULL) {
+		s = p+3;	/* the '/' that ends "/../" */
+		/*
+		 * Back up to the '/' that starts the preceding component.
+		 * The test on path comes before the decrement, so a path
+		 * that begins with "/../" stays inside the buffer; in that
+		 * case p remains on the leading '/' and "/.." collapses to
+		 * "/".
+		 */
+		while (p != path && *--p != '/')
+			;
+		if (*p != '/')
+			break;	/* relative path: nothing to back up over */
+		while ((*p++ = *s++));
+		tidied = 1;
+	}
+	if (*path == '\0')
+		strcpy(path,"/");
+	p = path + strlen(path) - 1;
+	if (p != path && *p == '/')
+		*p-- = '\0';	/* remove tmp trailing slash */
+	while (p != path && *p == '/') {	/* remove any others */
+		*p-- = '\0';
+		tidied = 1;
+	}
+	while (!strncmp(path,"./",2)) {
+		for (p = path, s = path+2; (*p++ = *s++););
+		tidied = 1;
+	}
+	return tidied;
+}
+
+static int shorten_path (char *path, char *abspath)
+{ /*
+   * Remove "../dir/" sequences from `path` (in place) when `dir` is
+   * the directory that the "../"s climb out of, judged by looking at
+   * the components at the end of `abspath`.  Returns 1 if anything
+   * was removed, 0 otherwise.  fix_symlink() passes the link's own
+   * path as `abspath`.
+   */
+ static char dir[PATH_MAX]; /* "/" + candidate component + "/" */
+ int shortened = 0;
+ char *p;
+
+ /* get rid of unnecessary "../dir" sequences */
+ while (abspath && strlen(abspath) > 1 && (p = strstr(path,"../"))) {
+ /* find innermost occurrence of "../dir", and save "dir" */
+ int slashes = 2; /* '/' characters to step back over in abspath; one more per extra "../" */
+ char *a, *s, *d = dir;
+ while ((s = strstr(p+3, "../"))) {
+ ++slashes;
+ p = s;
+ }
+ s = p+3; /* first character after the last "../" */
+ *d++ = '/';
+ while (*s && *s != '/')
+ *d++ = *s++;
+ *d++ = '/';
+ *d = '\0';
+ if (!strcmp(dir,"//")) /* nothing follows the "../": no component to compare */
+ break;
+ /* note: p still points at ../dir */
+ if (*s != '/' || !*++s) /* stop unless "dir" is followed by '/' and more text */
+ break;
+ a = abspath + strlen(abspath) - 1;
+ while (slashes-- > 0) { /* walk back `slashes` '/' characters from the end of abspath */
+ if (a <= abspath)
+ goto ughh;
+ while (*--a != '/') {
+ if (a <= abspath)
+ goto ughh; /* ran out of abspath before finding enough components */
+ }
+ }
+ if (strncmp(dir, a, strlen(dir))) /* abspath component differs from "dir": leave path alone */
+ break;
+ while ((*p++ = *s++)); /* delete the ../dir */
+ shortened = 1;
+ }
+ughh:
+ return shortened;
+}
+
+
+static void fix_symlink (char *path, dev_t my_dev)
+{ /*
+   * Examine the symbolic link at `path`: report it as dangling,
+   * other_fs, absolute, relative, messy or lengthy as appropriate,
+   * optionally delete it when dangling, and optionally replace it
+   * with a tidied / relative / shortened target.  `my_dev` is the
+   * device the walk started on, used for the single_fs test.
+   */
+ static char lpath[PATH_MAX], new[PATH_MAX], abspath[PATH_MAX]; /* NOTE(review): filled below with strcat()/strcpy() and no length checks, and tidy_path() appends a temporary '/' - confirm inputs are bounded */
+ char *p, *np, *lp, *tail, *msg;
+ struct stat stbuf, lstbuf;
+ int c, fix_abs = 0, fix_messy = 0, fix_long = 0;
+
+ if ((c = readlink(path, lpath, sizeof(lpath) - 1)) == -1) {
+ perror(path);
+ return;
+ }
+ lpath[c] = '\0'; /* readlink does not null terminate it */
+
+ /* construct the absolute address of the link */
+ abspath[0] = '\0';
+ if (lpath[0] != '/') { /* relative target: prefix it with the directory holding the link */
+ strcat(abspath,path);
+ c = strlen(abspath);
+ if ((c > 0) && (abspath[c-1] == '/'))
+ abspath[c-1] = '\0'; /* cut trailing / */
+ if ((p = strrchr(abspath,'/')) != NULL)
+ *p = '\0'; /* cut last component */
+ strcat(abspath,"/");
+ }
+ strcat(abspath,lpath);
+ (void) tidy_path(abspath);
+
+ /* check for various things */
+ if (stat(abspath, &stbuf) == -1) { /* target cannot be stat()ed: report as dangling */
+ printf("dangling: %s -> %s\n", path, lpath);
+ if (delete) {
+ if (unlink (path)) {
+ perror(path);
+ } else
+ printf("deleted: %s -> %s\n", path, lpath);
+ }
+ return;
+ }
+
+ if (single_fs)
+ lstat(abspath, &lstbuf); /* if the above didn't fail, then this shouldn't */ /* NOTE(review): return value unchecked; lstbuf is read on the next line */
+
+ if (single_fs && lstbuf.st_dev != my_dev) {
+ msg = "other_fs:";
+ } else if (lpath[0] == '/') {
+ msg = "absolute:";
+ fix_abs = 1;
+ } else if (verbose) {
+ msg = "relative:";
+ } else
+ msg = NULL;
+ fix_messy = tidy_path(strcpy(new,lpath)); /* new starts out as a tidied copy of the link target */
+ if (shorten)
+ fix_long = shorten_path(new, path);
+ if (!fix_abs) {
+ if (fix_messy)
+ msg = "messy: ";
+ else if (fix_long)
+ msg = "lengthy: ";
+ }
+ if (msg != NULL)
+ printf("%s %s -> %s\n", msg, path, lpath);
+ if (!(fix_links || testing) || !(fix_messy || fix_abs || fix_long)) /* nothing to change, or changes not requested */
+ return;
+
+ if (fix_abs) {
+ /* convert an absolute link to relative: */
+ /* point tail at first part of lpath that differs from path */
+ /* point p at first part of path that differs from lpath */
+ (void) tidy_path(lpath);
+ tail = lp = lpath;
+ p = path;
+ while (*p && (*p == *lp)) {
+ if (*lp++ == '/') {
+ tail = lp;
+ while (*++p == '/');
+ }
+ }
+
+ /* now create new, with "../"s followed by tail */
+ np = new;
+ while (*p) { /* one "../" for every remaining '/' in path */
+ if (*p++ == '/') {
+ *np++ = '.';
+ *np++ = '.';
+ *np++ = '/';
+ while (*p == '/') ++p;
+ }
+ }
+ strcpy (np, tail);
+ (void) tidy_path(new);
+ if (shorten) (void) shorten_path(new, path);
+ }
+ shorten_path(new,path); /* NOTE(review): runs even when the shorten flag is off - confirm this is intended */
+ if (!testing) { /* in testing mode only the "changed:" line below is printed */
+ if (unlink (path)) {
+ perror(path);
+ return;
+ }
+ if (symlink(new, path)) {
+ perror(path);
+ return;
+ }
+ }
+ printf("changed: %s -> %s\n", path, new);
+}
+
+/*
+ * Scan the directory `path` (a string of length `pathlen` held in a
+ * writable buffer) and call fix_symlink() on every symbolic link in it
+ * that lives on device `dev`.  With the recurse flag set,
+ * subdirectories on the same device are walked too.
+ *
+ * Entry names are appended to `path` in place and the buffer is cut
+ * back to `pathlen` before returning.  The buffer is main()'s, which
+ * is PATH_MAX+2 bytes; entries that would make the path PATH_MAX
+ * characters or longer are reported and skipped instead of being
+ * copied past the end of it.
+ */
+static void dirwalk (char *path, int pathlen, dev_t dev)
+{
+	char *name;
+	DIR *dfd;
+	static struct stat st;
+	static struct dirent *dp;
+
+	if ((dfd = opendir(path)) == NULL) {
+		perror(path);
+		return;
+	}
+
+	/* name is where each entry's file name gets written. */
+	name = path + pathlen;
+	if (*(name-1) != '/')
+		*name++ = '/';
+
+	while ((dp = readdir(dfd)) != NULL ) {
+		if ((size_t) (name - path) + strlen(dp->d_name) >= PATH_MAX) {
+			*name = '\0';
+			fprintf(stderr, "%s%s: %s\n", path, dp->d_name,
+				strerror(ENAMETOOLONG));
+			continue;
+		}
+		strcpy(name, dp->d_name);
+		if (strcmp(name, ".") && strcmp(name,"..")) {
+			if (lstat(path, &st) == -1) {
+				perror(path);
+			} else if (st.st_dev == dev) {
+				if (S_ISLNK(st.st_mode)) {
+					fix_symlink (path, dev);
+				} else if (recurse && S_ISDIR(st.st_mode)) {
+					dirwalk(path, strlen(path), dev);
+				}
+			}
+		}
+	}
+	closedir(dfd);
+	path[pathlen] = '\0';
+}
+
+/*
+ * Print the program banner, the usage line and the list of flags on
+ * stderr, then exit with status 1.
+ */
+static void usage_error (void)
+{
+	static const char flag_help[] =
+		"Flags:"
+		"\t-c == change absolute/messy links to relative\n"
+		"\t-d == delete dangling links\n"
+		"\t-o == warn about links across file systems\n"
+		"\t-r == recurse into subdirs\n"
+		"\t-s == shorten lengthy links (displayed in output only when -c not specified)\n"
+		"\t-t == show what would be done by -c\n"
+		"\t-v == verbose (show all symlinks)\n\n";
+
+	fprintf(stderr, progver, progname);
+	fprintf(stderr, "Usage:\t%s [-cdorstv] LINK|DIR ...\n\n", progname);
+	fputs(flag_help, stderr);
+	exit(1);
+}
+
+/*
+ * symlinks [-cdorstv] LINK|DIR ...
+ *
+ * Arguments are processed left to right: option letters set the global
+ * flags for the arguments that follow them; every other argument is
+ * made absolute against the current directory, tidied, and then either
+ * checked directly (if it is a symlink) or walked (otherwise).
+ *
+ * Exits with status 0 after processing, or via usage_error() when an
+ * option is invalid or no LINK/DIR argument was given.
+ */
+int main(int argc, char **argv)
+{
+#if defined (_GNU_SOURCE) && defined (__GLIBC__)
+	static char path[PATH_MAX+2];
+	char* cwd = get_current_dir_name();
+#else
+	static char path[PATH_MAX+2], cwd[PATH_MAX+2];
+#endif
+	int dircount = 0;
+	char c, *p;
+
+	if  ((progname = (char *) strrchr(*argv, '/')) == NULL)
+		progname = *argv;
+	else
+		progname++;
+
+#if defined (_GNU_SOURCE) && defined (__GLIBC__)
+	if (NULL == cwd) {
+		fprintf(stderr,"get_current_dir_name() failed\n");
+#else
+	if (NULL == getcwd(cwd,PATH_MAX)) {
+		fprintf(stderr,"getcwd() failed\n");
+#endif
+		exit (1);
+	}
+#if defined (_GNU_SOURCE) && defined (__GLIBC__)
+	/* Make room for the '/' that may be appended below. */
+	cwd = realloc(cwd, strlen(cwd)+2);
+	if (cwd == NULL) {
+		fprintf(stderr, "realloc() failed\n");
+		exit (1);
+	}
+#endif
+	if (!*cwd || cwd[strlen(cwd)-1] != '/')
+		strcat(cwd,"/");
+
+	while (--argc) {
+		p = *++argv;
+		if (*p == '-') {
+			if (*++p == '\0')
+				usage_error();
+			while ((c = *p++)) {
+				     if (c == 'c')	fix_links = 1;
+				else if (c == 'd')	delete    = 1;
+				else if (c == 'o')	single_fs = 0;
+				else if (c == 'r')	recurse   = 1;
+				else if (c == 's')	shorten   = 1;
+				else if (c == 't')	testing   = 1;
+				else if (c == 'v')	verbose   = 1;
+				else			usage_error();
+			}
+		} else {
+			struct stat st;
+			size_t need = strlen(p);
+
+			if (*p != '/')
+				need += strlen(cwd);
+			/*
+			 * path holds PATH_MAX characters plus the temporary
+			 * '/' added by tidy_path() and the terminator.
+			 * Refuse anything longer rather than writing past
+			 * the end of the buffer.
+			 */
+			if (need > PATH_MAX) {
+				fprintf(stderr, "%s: %s\n", p,
+					strerror(ENAMETOOLONG));
+				++dircount;
+				continue;
+			}
+			if (*p == '/')
+				*path = '\0';
+			else
+				strcpy(path,cwd);
+			tidy_path(strcat(path, p));
+			if (lstat(path, &st) == -1)
+				perror(path);
+			else if (S_ISLNK(st.st_mode))
+				fix_symlink(path, st.st_dev);
+			else
+				dirwalk(path, strlen(path), st.st_dev);
+			++dircount;
+		}
+	}
+	if (dircount == 0)
+		usage_error();
+	exit (0);
+}
diff --git a/util/ucd/README b/util/ucd/README
new file mode 100644
index 0000000..9fed514
--- /dev/null
+++ b/util/ucd/README
@@ -0,0 +1,37 @@
+The files in this directory are part of the Unicode Character Database
+for version 11.0.0 of the Unicode standard.
+
+The full set of UCD files is not distributed with e2fsprogs, since they
+are very large and only needed to regenerate lib/ext2fs/utf8data.h
+during a Unicode version update.  They can be found at the link below
+and also in the Linux kernel source tree:
+
+ http://www.unicode.org/Public/11.0.0/ucd/
+
+The latest released version of the UCD can be found here:
+
+ http://www.unicode.org/Public/UCD/latest/
+
+The files in this directory are identical, except that they have been
+renamed with a suffix indicating the unicode version.
+
+Individual source links:
+
+ http://www.unicode.org/Public/11.0.0/ucd/CaseFolding.txt
+ http://www.unicode.org/Public/11.0.0/ucd/DerivedAge.txt
+ http://www.unicode.org/Public/11.0.0/ucd/extracted/DerivedCombiningClass.txt
+ http://www.unicode.org/Public/11.0.0/ucd/DerivedCoreProperties.txt
+ http://www.unicode.org/Public/11.0.0/ucd/NormalizationCorrections.txt
+ http://www.unicode.org/Public/11.0.0/ucd/NormalizationTest.txt
+ http://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt
+
+md5sums
+
+ 414436796cf097df55f798e1585448ee CaseFolding-11.0.0.txt
+ 6032a595fbb782694456491d86eecfac DerivedAge-11.0.0.txt
+ 3240997d671297ac754ab0d27577acf7 DerivedCombiningClass-11.0.0.txt
+ d41d8cd98f00b204e9800998ecf8427e DerivedCombiningClass.txt
+ 2a4fe257d9d8184518e036194d2248ec DerivedCoreProperties-11.0.0.txt
+ 4e7d383fa0dd3cd9d49d64e5b7b7c9e0 NormalizationCorrections-11.0.0.txt
+ c9500c5b8b88e584469f056023ecc3f2 NormalizationTest-11.0.0.txt
+ acc291106c3758d2025f8d7bd5518bee UnicodeData-11.0.0.txt