summaryrefslogtreecommitdiffstats
path: root/lib/util/charset/charset_macosxfs.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/util/charset/charset_macosxfs.c')
-rw-r--r--lib/util/charset/charset_macosxfs.c605
1 files changed, 605 insertions, 0 deletions
diff --git a/lib/util/charset/charset_macosxfs.c b/lib/util/charset/charset_macosxfs.c
new file mode 100644
index 0000000..75dbb4b
--- /dev/null
+++ b/lib/util/charset/charset_macosxfs.c
@@ -0,0 +1,605 @@
+/*
+ Unix SMB/CIFS implementation.
+ Samba charset module for Mac OS X/Darwin
+ Copyright (C) Benjamin Riefenstahl 2003
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * modules/charset_macosxfs.c
+ *
+ * A Samba charset module to use on Mac OS X/Darwin as the filesystem
+ * and display encoding.
+ *
+ * Actually two implementations are provided here. The default
+ * implementation is based on the official CFString API. The other is
+ * based on internal CFString APIs as defined in the OpenDarwin
+ * source.
+ */
+
+#include "replace.h"
+#include "charset.h"
+#include "charset_proto.h"
+#include "lib/util/debug.h"
+#undef realloc
+
+#ifdef DARWINOS
+
+/*
+ * Include OS frameworks. These are only needed in this module.
+ */
+#include <CoreFoundation/CFString.h>
+
+/*
+ * See if autoconf has found us the internal headers in some form.
+ */
+#if defined(HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H)
+# include <CoreFoundation/CFStringEncodingConverter.h>
+# include <CoreFoundation/CFUnicodePrecomposition.h>
+# define USE_INTERNAL_API 1
+#elif defined(HAVE_CFSTRINGENCODINGCONVERTER_H)
+# include <CFStringEncodingConverter.h>
+# include <CFUnicodePrecomposition.h>
+# define USE_INTERNAL_API 1
+#endif
+
+/*
+ * Compile time configuration: Do we want debug output?
+ */
+/* #define DEBUG_STRINGS 1 */
+
+/*
+ * A simple, but efficient memory provider for our buffers.
+ */
+static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
+{
+ if (newsize > *size) {
+ *size = newsize + 128;
+ buffer = realloc(buffer, *size);
+ }
+ return buffer;
+}
+
+/*
+ * While there is a version of OpenDarwin for intel, the usual case is
+ * big-endian PPC. So we need byte swapping to handle the
+ * little-endian byte order of the network protocol. We also need an
+ * additional dynamic buffer to do this work for incoming data blocks,
+ * because we have to consider the original data as constant.
+ *
+ * We abstract the differences away by providing a simple facade with
+ * these functions/macros:
+ *
+ * le_to_native(dst,src,len)
+ * native_to_le(cp,len)
+ * set_ucbuffer_with_le(buffer,bufsize,data,size)
+ * set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
+ */
+#ifdef WORDS_BIGENDIAN
+
+static inline void swap_bytes (char * dst, const char * src, size_t len)
+{
+ const char *srcend = src + len;
+ while (src < srcend) {
+ dst[0] = src[1];
+ dst[1] = src[0];
+ dst += 2;
+ src += 2;
+ }
+}
+static inline void swap_bytes_inplace (char * cp, size_t len)
+{
+ char temp;
+ char *end = cp + len;
+ while (cp < end) {
+ temp = cp[1];
+ cp[1] = cp[0];
+ cp[0] = temp;
+ cp += 2;
+ }
+}
+
+#define le_to_native(dst,src,len) swap_bytes(dst,src,len)
+#define native_to_le(cp,len) swap_bytes_inplace(cp,len)
+#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
+ set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
+
+#else /* ! WORDS_BIGENDIAN */
+
+#define le_to_native(dst,src,len) memcpy(dst,src,len)
+#define native_to_le(cp,len) /* nothing */
+#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
+ (((void)(bufsize)),(UniChar*)(data))
+
+#endif
+
+static inline UniChar *set_ucbuffer_with_le_copy (
+ UniChar *buffer, size_t *bufsize,
+ const void *data, size_t size, size_t reserve)
+{
+ buffer = resize_buffer(buffer, bufsize, size+reserve);
+ le_to_native((char*)buffer,data,size);
+ return buffer;
+}
+
+
+/*
+ * A simple hexdump function for debugging error conditions.
+ */
+#define debug_out(s) DEBUG(0,(s))
+
+#ifdef DEBUG_STRINGS
+
+static void hexdump( const char * label, const char * s, size_t len )
+{
+ size_t restlen = len;
+ debug_out("<<<<<<<\n");
+ debug_out(label);
+ debug_out("\n");
+ while (restlen > 0) {
+ char line[100];
+ size_t i, j;
+ char * d = line;
+#undef sprintf
+ d += sprintf(d, "%04X ", (unsigned)(len-restlen));
+ *d++ = ' ';
+ for( i = 0; i<restlen && i<8; ++i ) {
+ d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
+ }
+ for( j = i; j<8; ++j ) {
+ d += sprintf(d, " ");
+ }
+ *d++ = ' ';
+ for( i = 8; i<restlen && i<16; ++i ) {
+ d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
+ }
+ for( j = i; j<16; ++j ) {
+ d += sprintf(d, " ");
+ }
+ *d++ = ' ';
+ for( i = 0; i<restlen && i<16; ++i ) {
+ if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
+ *d++ = '.';
+ else
+ *d++ = s[i];
+ }
+ *d++ = '\n';
+ *d = 0;
+ restlen -= i;
+ s += i;
+ debug_out(line);
+ }
+ debug_out(">>>>>>>\n");
+}
+
+#else /* !DEBUG_STRINGS */
+
+#define hexdump(label,s,len) /* nothing */
+
+#endif
+
+
+#if !USE_INTERNAL_API
+
+/*
+ * An implementation based on documented Mac OS X APIs.
+ *
+ * This does a certain amount of memory management, creating and
+ * manipulating CFString objects. We try to minimize the impact by
+ * keeping those objects around and re-using them. We also use
+ * external backing store for the CFStrings where this is possible and
+ * benficial.
+ *
+ * The Unicode normalizations forms available at this level are
+ * generic, not specifically for the file system. So they may not be
+ * perfect fits.
+ */
+size_t macosxfs_encoding_pull(
+ void *cd, /* Encoder handle */
+ const char **inbuf, size_t *inbytesleft, /* Script string */
+ char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */
+{
+ static const int script_code = kCFStringEncodingUTF8;
+ static CFMutableStringRef cfstring = NULL;
+ size_t outsize;
+ CFRange range;
+
+ (void) cd; /* UNUSED */
+
+ if (0 == *inbytesleft) {
+ return 0;
+ }
+
+ if (NULL == cfstring) {
+ /*
+ * A version with an external backing store as in the
+ * push function should have been more efficient, but
+ * testing shows, that it is actually slower (!).
+ * Maybe kCFAllocatorDefault gets shortcut evaluation
+ * internally, while kCFAllocatorNull doesn't.
+ */
+ cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
+ }
+
+ /*
+ * Three methods of appending to a CFString, choose the most
+ * efficient.
+ */
+ if (0 == (*inbuf)[*inbytesleft-1]) {
+ CFStringAppendCString(cfstring, *inbuf, script_code);
+ } else if (*inbytesleft <= 255) {
+ Str255 buffer;
+ buffer[0] = *inbytesleft;
+ memcpy(buffer+1, *inbuf, buffer[0]);
+ CFStringAppendPascalString(cfstring, buffer, script_code);
+ } else {
+ /*
+ * We would like to use a fixed buffer and a loop
+ * here, but than we can't garantee that the input is
+ * well-formed UTF-8, as we are supposed to do.
+ */
+ static char *buffer = NULL;
+ static size_t buflen = 0;
+ buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
+ memcpy(buffer, *inbuf, *inbytesleft);
+ buffer[*inbytesleft] = 0;
+ CFStringAppendCString(cfstring, *inbuf, script_code);
+ }
+
+ /*
+ * Compose characters, using the non-canonical composition
+ * form.
+ */
+ CFStringNormalize(cfstring, kCFStringNormalizationFormC);
+
+ outsize = CFStringGetLength(cfstring);
+ range = CFRangeMake(0,outsize);
+
+ if (outsize == 0) {
+ /*
+ * HACK: smbd/mangle_hash2.c:is_legal_name() expects
+ * errors here. That function will always pass 2
+ * characters. smbd/open.c:check_for_pipe() cuts a
+ * patchname to 10 characters blindly. Suppress the
+ * debug output in those cases.
+ */
+ if(2 != *inbytesleft && 10 != *inbytesleft) {
+ debug_out("String conversion: "
+ "An unknown error occurred\n");
+ hexdump("UTF8->UTF16LE (old) input",
+ *inbuf, *inbytesleft);
+ }
+ errno = EILSEQ; /* Not sure, but this is what we have
+ * actually seen. */
+ return -1;
+ }
+ if (outsize*2 > *outbytesleft) {
+ CFStringDelete(cfstring, range);
+ debug_out("String conversion: "
+ "Output buffer too small\n");
+ hexdump("UTF8->UTF16LE (old) input",
+ *inbuf, *inbytesleft);
+ errno = E2BIG;
+ return -1;
+ }
+
+ CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
+ CFStringDelete(cfstring, range);
+
+ native_to_le(*outbuf, outsize*2);
+
+ /*
+ * Add a converted null byte, if the CFString conversions
+ * prevented that until now.
+ */
+ if (0 == (*inbuf)[*inbytesleft-1] &&
+ (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
+
+ if ((outsize*2+2) > *outbytesleft) {
+ debug_out("String conversion: "
+ "Output buffer too small\n");
+ hexdump("UTF8->UTF16LE (old) input",
+ *inbuf, *inbytesleft);
+ errno = E2BIG;
+ return -1;
+ }
+
+ (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
+ outsize += 2;
+ }
+
+ *inbuf += *inbytesleft;
+ *inbytesleft = 0;
+ *outbuf += outsize*2;
+ *outbytesleft -= outsize*2;
+
+ return 0;
+}
+
+size_t macosxfs_encoding_push(
+ void *cd, /* Encoder handle */
+ const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
+ char **outbuf, size_t *outbytesleft) /* Script string */
+{
+ static const int script_code = kCFStringEncodingUTF8;
+ static CFMutableStringRef cfstring = NULL;
+ static UniChar *buffer = NULL;
+ static size_t buflen = 0;
+ CFIndex outsize, cfsize, charsconverted;
+
+ (void) cd; /* UNUSED */
+
+ if (0 == *inbytesleft) {
+ return 0;
+ }
+
+ /*
+ * We need a buffer that can hold 4 times the original data,
+ * because that is the theoretical maximum that decomposition
+ * can create currently (in Unicode 4.0).
+ */
+ buffer = set_ucbuffer_with_le_copy(
+ buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
+
+ if (NULL == cfstring) {
+ cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
+ kCFAllocatorDefault,
+ buffer, *inbytesleft/2, buflen/2,
+ kCFAllocatorNull);
+ } else {
+ CFStringSetExternalCharactersNoCopy(
+ cfstring,
+ buffer, *inbytesleft/2, buflen/2);
+ }
+
+ /*
+ * Decompose characters, using the non-canonical decomposition
+ * form.
+ *
+ * NB: This isn't exactly what HFS+ wants (see note on
+ * kCFStringEncodingUseHFSPlusCanonical in
+ * CFStringEncodingConverter.h), but AFAIK it's the best that
+ * the official API can do.
+ */
+ CFStringNormalize(cfstring, kCFStringNormalizationFormD);
+
+ cfsize = CFStringGetLength(cfstring);
+ charsconverted = CFStringGetBytes(
+ cfstring, CFRangeMake(0,cfsize),
+ script_code, 0, false,
+ *(UInt8 **)outbuf, *outbytesleft, &outsize);
+
+ if (0 == charsconverted) {
+ debug_out("String conversion: "
+ "Buffer too small or not convertable\n");
+ hexdump("UTF16LE->UTF8 (old) input",
+ *inbuf, *inbytesleft);
+ errno = EILSEQ; /* Probably more likely. */
+ return -1;
+ }
+
+ /*
+ * Add a converted null byte, if the CFString conversions
+ * prevented that until now.
+ */
+ if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
+ (0 != (*outbuf)[outsize-1])) {
+
+ if (((size_t)outsize+1) > *outbytesleft) {
+ debug_out("String conversion: "
+ "Output buffer too small\n");
+ hexdump("UTF16LE->UTF8 (old) input",
+ *inbuf, *inbytesleft);
+ errno = E2BIG;
+ return -1;
+ }
+
+ (*outbuf)[outsize] = 0;
+ ++outsize;
+ }
+
+ *inbuf += *inbytesleft;
+ *inbytesleft = 0;
+ *outbuf += outsize;
+ *outbytesleft -= outsize;
+
+ return 0;
+}
+
+#else /* USE_INTERNAL_API */
+
+/*
+ * An implementation based on internal code as known from the
+ * OpenDarwin CVS.
+ *
+ * This code doesn't need much memory management because it uses
+ * functions that operate on the raw memory directly.
+ *
+ * The push routine here is faster and more compatible with HFS+ than
+ * the other implementation above. The pull routine is only faster
+ * for some strings, slightly slower for others. The pull routine
+ * looses because it has to iterate over the data twice, once to
+ * decode UTF-8 and than to do the character composition required by
+ * Windows.
+ */
+static size_t macosxfs_encoding_pull(
+ void *cd, /* Encoder handle */
+ const char **inbuf, size_t *inbytesleft, /* Script string */
+ char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */
+{
+ static const int script_code = kCFStringEncodingUTF8;
+ UInt32 srcCharsUsed = 0;
+ UInt32 dstCharsUsed = 0;
+ UInt32 result;
+ uint32_t dstDecomposedUsed = 0;
+ uint32_t dstPrecomposedUsed = 0;
+
+ (void) cd; /* UNUSED */
+
+ if (0 == *inbytesleft) {
+ return 0;
+ }
+
+ result = CFStringEncodingBytesToUnicode(
+ script_code, kCFStringEncodingComposeCombinings,
+ *inbuf, *inbytesleft, &srcCharsUsed,
+ (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
+
+ switch(result) {
+ case kCFStringEncodingConversionSuccess:
+ if (*inbytesleft == srcCharsUsed) {
+ break;
+ }
+
+ FALL_THROUGH;
+ case kCFStringEncodingInsufficientOutputBufferLength:
+ debug_out("String conversion: "
+ "Output buffer too small\n");
+ hexdump("UTF8->UTF16LE (new) input",
+ *inbuf, *inbytesleft);
+ errno = E2BIG;
+ return -1;
+ case kCFStringEncodingInvalidInputStream:
+ /*
+ * HACK: smbd/mangle_hash2.c:is_legal_name() expects
+ * errors here. That function will always pass 2
+ * characters. smbd/open.c:check_for_pipe() cuts a
+ * patchname to 10 characters blindly. Suppress the
+ * debug output in those cases.
+ */
+ if(2 != *inbytesleft && 10 != *inbytesleft) {
+ debug_out("String conversion: "
+ "Invalid input sequence\n");
+ hexdump("UTF8->UTF16LE (new) input",
+ *inbuf, *inbytesleft);
+ }
+ errno = EILSEQ;
+ return -1;
+ case kCFStringEncodingConverterUnavailable:
+ debug_out("String conversion: "
+ "Unknown encoding\n");
+ hexdump("UTF8->UTF16LE (new) input",
+ *inbuf, *inbytesleft);
+ errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * It doesn't look like CFStringEncodingBytesToUnicode() can
+ * produce precomposed characters (flags=ComposeCombinings
+ * doesn't do it), so we need another pass over the data here.
+ * We can do this in-place, as the string can only get
+ * shorter.
+ *
+ * (Actually in theory there should be an internal
+ * decomposition and reordering before the actual composition
+ * step. But we should be able to rely on that we always get
+ * fully decomposed strings for input, so this can't create
+ * problems in reality.)
+ */
+ CFUniCharPrecompose(
+ (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
+ (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
+
+ native_to_le(*outbuf, dstPrecomposedUsed*2);
+
+ *inbuf += srcCharsUsed;
+ *inbytesleft -= srcCharsUsed;
+ *outbuf += dstPrecomposedUsed*2;
+ *outbytesleft -= dstPrecomposedUsed*2;
+
+ return 0;
+}
+
+static size_t macosxfs_encoding_push(
+ void *cd, /* Encoder handle */
+ const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
+ char **outbuf, size_t *outbytesleft) /* Script string */
+{
+ static const int script_code = kCFStringEncodingUTF8;
+ static UniChar *buffer = NULL;
+ static size_t buflen = 0;
+ UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
+
+ (void) cd; /* UNUSED */
+
+ if (0 == *inbytesleft) {
+ return 0;
+ }
+
+ buffer = set_ucbuffer_with_le(
+ buffer, &buflen, *inbuf, *inbytesleft);
+
+ result = CFStringEncodingUnicodeToBytes(
+ script_code, kCFStringEncodingUseHFSPlusCanonical,
+ buffer, *inbytesleft/2, &srcCharsUsed,
+ *outbuf, *outbytesleft, &dstCharsUsed);
+
+ switch(result) {
+ case kCFStringEncodingConversionSuccess:
+ if (*inbytesleft/2 == srcCharsUsed) {
+ break;
+ }
+
+ FALL_THROUGH;
+ case kCFStringEncodingInsufficientOutputBufferLength:
+ debug_out("String conversion: "
+ "Output buffer too small\n");
+ hexdump("UTF16LE->UTF8 (new) input",
+ *inbuf, *inbytesleft);
+ errno = E2BIG;
+ return -1;
+ case kCFStringEncodingInvalidInputStream:
+ /*
+ * HACK: smbd/open.c:check_for_pipe():is_legal_name()
+ * cuts a pathname to 10 characters blindly. Suppress
+ * the debug output in those cases.
+ */
+ if(10 != *inbytesleft) {
+ debug_out("String conversion: "
+ "Invalid input sequence\n");
+ hexdump("UTF16LE->UTF8 (new) input",
+ *inbuf, *inbytesleft);
+ }
+ errno = EILSEQ;
+ return -1;
+ case kCFStringEncodingConverterUnavailable:
+ debug_out("String conversion: "
+ "Unknown encoding\n");
+ hexdump("UTF16LE->UTF8 (new) input",
+ *inbuf, *inbytesleft);
+ errno = EINVAL;
+ return -1;
+ }
+
+ *inbuf += srcCharsUsed*2;
+ *inbytesleft -= srcCharsUsed*2;
+ *outbuf += dstCharsUsed;
+ *outbytesleft -= dstCharsUsed;
+
+ return 0;
+}
+
+#endif /* USE_INTERNAL_API */
+
+#else /* DARWIN */
+
+void charset_macosfs_dummy(void);
+void charset_macosfs_dummy(void)
+{
+ return;
+}
+
+#endif /* DARWIN */