diff options
Diffstat (limited to 'lib/util/charset/charset_macosxfs.c')
-rw-r--r-- | lib/util/charset/charset_macosxfs.c | 605 |
1 files changed, 605 insertions, 0 deletions
diff --git a/lib/util/charset/charset_macosxfs.c b/lib/util/charset/charset_macosxfs.c new file mode 100644 index 0000000..75dbb4b --- /dev/null +++ b/lib/util/charset/charset_macosxfs.c @@ -0,0 +1,605 @@ +/* + Unix SMB/CIFS implementation. + Samba charset module for Mac OS X/Darwin + Copyright (C) Benjamin Riefenstahl 2003 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* + * modules/charset_macosxfs.c + * + * A Samba charset module to use on Mac OS X/Darwin as the filesystem + * and display encoding. + * + * Actually two implementations are provided here. The default + * implementation is based on the official CFString API. The other is + * based on internal CFString APIs as defined in the OpenDarwin + * source. + */ + +#include "replace.h" +#include "charset.h" +#include "charset_proto.h" +#include "lib/util/debug.h" +#undef realloc + +#ifdef DARWINOS + +/* + * Include OS frameworks. These are only needed in this module. + */ +#include <CoreFoundation/CFString.h> + +/* + * See if autoconf has found us the internal headers in some form. + */ +#if defined(HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H) +# include <CoreFoundation/CFStringEncodingConverter.h> +# include <CoreFoundation/CFUnicodePrecomposition.h> +# define USE_INTERNAL_API 1 +#elif defined(HAVE_CFSTRINGENCODINGCONVERTER_H) +# include <CFStringEncodingConverter.h> +# include <CFUnicodePrecomposition.h> +# define USE_INTERNAL_API 1 +#endif + +/* + * Compile time configuration: Do we want debug output? + */ +/* #define DEBUG_STRINGS 1 */ + +/* + * A simple, but efficient memory provider for our buffers. + */ +static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize) +{ + if (newsize > *size) { + *size = newsize + 128; + buffer = realloc(buffer, *size); + } + return buffer; +} + +/* + * While there is a version of OpenDarwin for intel, the usual case is + * big-endian PPC. So we need byte swapping to handle the + * little-endian byte order of the network protocol. We also need an + * additional dynamic buffer to do this work for incoming data blocks, + * because we have to consider the original data as constant. + * + * We abstract the differences away by providing a simple facade with + * these functions/macros: + * + * le_to_native(dst,src,len) + * native_to_le(cp,len) + * set_ucbuffer_with_le(buffer,bufsize,data,size) + * set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve) + */ +#ifdef WORDS_BIGENDIAN + +static inline void swap_bytes (char * dst, const char * src, size_t len) +{ + const char *srcend = src + len; + while (src < srcend) { + dst[0] = src[1]; + dst[1] = src[0]; + dst += 2; + src += 2; + } +} +static inline void swap_bytes_inplace (char * cp, size_t len) +{ + char temp; + char *end = cp + len; + while (cp < end) { + temp = cp[1]; + cp[1] = cp[0]; + cp[0] = temp; + cp += 2; + } +} + +#define le_to_native(dst,src,len) swap_bytes(dst,src,len) +#define native_to_le(cp,len) swap_bytes_inplace(cp,len) +#define set_ucbuffer_with_le(buffer,bufsize,data,size) \ + set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0) + +#else /* ! WORDS_BIGENDIAN */ + +#define le_to_native(dst,src,len) memcpy(dst,src,len) +#define native_to_le(cp,len) /* nothing */ +#define set_ucbuffer_with_le(buffer,bufsize,data,size) \ + (((void)(bufsize)),(UniChar*)(data)) + +#endif + +static inline UniChar *set_ucbuffer_with_le_copy ( + UniChar *buffer, size_t *bufsize, + const void *data, size_t size, size_t reserve) +{ + buffer = resize_buffer(buffer, bufsize, size+reserve); + le_to_native((char*)buffer,data,size); + return buffer; +} + + +/* + * A simple hexdump function for debugging error conditions. + */ +#define debug_out(s) DEBUG(0,(s)) + +#ifdef DEBUG_STRINGS + +static void hexdump( const char * label, const char * s, size_t len ) +{ + size_t restlen = len; + debug_out("<<<<<<<\n"); + debug_out(label); + debug_out("\n"); + while (restlen > 0) { + char line[100]; + size_t i, j; + char * d = line; +#undef sprintf + d += sprintf(d, "%04X ", (unsigned)(len-restlen)); + *d++ = ' '; + for( i = 0; i<restlen && i<8; ++i ) { + d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF); + } + for( j = i; j<8; ++j ) { + d += sprintf(d, " "); + } + *d++ = ' '; + for( i = 8; i<restlen && i<16; ++i ) { + d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF); + } + for( j = i; j<16; ++j ) { + d += sprintf(d, " "); + } + *d++ = ' '; + for( i = 0; i<restlen && i<16; ++i ) { + if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i])) + *d++ = '.'; + else + *d++ = s[i]; + } + *d++ = '\n'; + *d = 0; + restlen -= i; + s += i; + debug_out(line); + } + debug_out(">>>>>>>\n"); +} + +#else /* !DEBUG_STRINGS */ + +#define hexdump(label,s,len) /* nothing */ + +#endif + + +#if !USE_INTERNAL_API + +/* + * An implementation based on documented Mac OS X APIs. + * + * This does a certain amount of memory management, creating and + * manipulating CFString objects. We try to minimize the impact by + * keeping those objects around and re-using them. We also use + * external backing store for the CFStrings where this is possible and + * benficial. + * + * The Unicode normalizations forms available at this level are + * generic, not specifically for the file system. So they may not be + * perfect fits. + */ +size_t macosxfs_encoding_pull( + void *cd, /* Encoder handle */ + const char **inbuf, size_t *inbytesleft, /* Script string */ + char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */ +{ + static const int script_code = kCFStringEncodingUTF8; + static CFMutableStringRef cfstring = NULL; + size_t outsize; + CFRange range; + + (void) cd; /* UNUSED */ + + if (0 == *inbytesleft) { + return 0; + } + + if (NULL == cfstring) { + /* + * A version with an external backing store as in the + * push function should have been more efficient, but + * testing shows, that it is actually slower (!). + * Maybe kCFAllocatorDefault gets shortcut evaluation + * internally, while kCFAllocatorNull doesn't. + */ + cfstring = CFStringCreateMutable(kCFAllocatorDefault,0); + } + + /* + * Three methods of appending to a CFString, choose the most + * efficient. + */ + if (0 == (*inbuf)[*inbytesleft-1]) { + CFStringAppendCString(cfstring, *inbuf, script_code); + } else if (*inbytesleft <= 255) { + Str255 buffer; + buffer[0] = *inbytesleft; + memcpy(buffer+1, *inbuf, buffer[0]); + CFStringAppendPascalString(cfstring, buffer, script_code); + } else { + /* + * We would like to use a fixed buffer and a loop + * here, but than we can't garantee that the input is + * well-formed UTF-8, as we are supposed to do. + */ + static char *buffer = NULL; + static size_t buflen = 0; + buffer = resize_buffer(buffer, &buflen, *inbytesleft+1); + memcpy(buffer, *inbuf, *inbytesleft); + buffer[*inbytesleft] = 0; + CFStringAppendCString(cfstring, *inbuf, script_code); + } + + /* + * Compose characters, using the non-canonical composition + * form. + */ + CFStringNormalize(cfstring, kCFStringNormalizationFormC); + + outsize = CFStringGetLength(cfstring); + range = CFRangeMake(0,outsize); + + if (outsize == 0) { + /* + * HACK: smbd/mangle_hash2.c:is_legal_name() expects + * errors here. That function will always pass 2 + * characters. smbd/open.c:check_for_pipe() cuts a + * patchname to 10 characters blindly. Suppress the + * debug output in those cases. + */ + if(2 != *inbytesleft && 10 != *inbytesleft) { + debug_out("String conversion: " + "An unknown error occurred\n"); + hexdump("UTF8->UTF16LE (old) input", + *inbuf, *inbytesleft); + } + errno = EILSEQ; /* Not sure, but this is what we have + * actually seen. */ + return -1; + } + if (outsize*2 > *outbytesleft) { + CFStringDelete(cfstring, range); + debug_out("String conversion: " + "Output buffer too small\n"); + hexdump("UTF8->UTF16LE (old) input", + *inbuf, *inbytesleft); + errno = E2BIG; + return -1; + } + + CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf); + CFStringDelete(cfstring, range); + + native_to_le(*outbuf, outsize*2); + + /* + * Add a converted null byte, if the CFString conversions + * prevented that until now. + */ + if (0 == (*inbuf)[*inbytesleft-1] && + (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) { + + if ((outsize*2+2) > *outbytesleft) { + debug_out("String conversion: " + "Output buffer too small\n"); + hexdump("UTF8->UTF16LE (old) input", + *inbuf, *inbytesleft); + errno = E2BIG; + return -1; + } + + (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0; + outsize += 2; + } + + *inbuf += *inbytesleft; + *inbytesleft = 0; + *outbuf += outsize*2; + *outbytesleft -= outsize*2; + + return 0; +} + +size_t macosxfs_encoding_push( + void *cd, /* Encoder handle */ + const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */ + char **outbuf, size_t *outbytesleft) /* Script string */ +{ + static const int script_code = kCFStringEncodingUTF8; + static CFMutableStringRef cfstring = NULL; + static UniChar *buffer = NULL; + static size_t buflen = 0; + CFIndex outsize, cfsize, charsconverted; + + (void) cd; /* UNUSED */ + + if (0 == *inbytesleft) { + return 0; + } + + /* + * We need a buffer that can hold 4 times the original data, + * because that is the theoretical maximum that decomposition + * can create currently (in Unicode 4.0). + */ + buffer = set_ucbuffer_with_le_copy( + buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft); + + if (NULL == cfstring) { + cfstring = CFStringCreateMutableWithExternalCharactersNoCopy( + kCFAllocatorDefault, + buffer, *inbytesleft/2, buflen/2, + kCFAllocatorNull); + } else { + CFStringSetExternalCharactersNoCopy( + cfstring, + buffer, *inbytesleft/2, buflen/2); + } + + /* + * Decompose characters, using the non-canonical decomposition + * form. + * + * NB: This isn't exactly what HFS+ wants (see note on + * kCFStringEncodingUseHFSPlusCanonical in + * CFStringEncodingConverter.h), but AFAIK it's the best that + * the official API can do. + */ + CFStringNormalize(cfstring, kCFStringNormalizationFormD); + + cfsize = CFStringGetLength(cfstring); + charsconverted = CFStringGetBytes( + cfstring, CFRangeMake(0,cfsize), + script_code, 0, false, + *(UInt8 **)outbuf, *outbytesleft, &outsize); + + if (0 == charsconverted) { + debug_out("String conversion: " + "Buffer too small or not convertable\n"); + hexdump("UTF16LE->UTF8 (old) input", + *inbuf, *inbytesleft); + errno = EILSEQ; /* Probably more likely. */ + return -1; + } + + /* + * Add a converted null byte, if the CFString conversions + * prevented that until now. + */ + if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] && + (0 != (*outbuf)[outsize-1])) { + + if (((size_t)outsize+1) > *outbytesleft) { + debug_out("String conversion: " + "Output buffer too small\n"); + hexdump("UTF16LE->UTF8 (old) input", + *inbuf, *inbytesleft); + errno = E2BIG; + return -1; + } + + (*outbuf)[outsize] = 0; + ++outsize; + } + + *inbuf += *inbytesleft; + *inbytesleft = 0; + *outbuf += outsize; + *outbytesleft -= outsize; + + return 0; +} + +#else /* USE_INTERNAL_API */ + +/* + * An implementation based on internal code as known from the + * OpenDarwin CVS. + * + * This code doesn't need much memory management because it uses + * functions that operate on the raw memory directly. + * + * The push routine here is faster and more compatible with HFS+ than + * the other implementation above. The pull routine is only faster + * for some strings, slightly slower for others. The pull routine + * looses because it has to iterate over the data twice, once to + * decode UTF-8 and than to do the character composition required by + * Windows. + */ +static size_t macosxfs_encoding_pull( + void *cd, /* Encoder handle */ + const char **inbuf, size_t *inbytesleft, /* Script string */ + char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */ +{ + static const int script_code = kCFStringEncodingUTF8; + UInt32 srcCharsUsed = 0; + UInt32 dstCharsUsed = 0; + UInt32 result; + uint32_t dstDecomposedUsed = 0; + uint32_t dstPrecomposedUsed = 0; + + (void) cd; /* UNUSED */ + + if (0 == *inbytesleft) { + return 0; + } + + result = CFStringEncodingBytesToUnicode( + script_code, kCFStringEncodingComposeCombinings, + *inbuf, *inbytesleft, &srcCharsUsed, + (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed); + + switch(result) { + case kCFStringEncodingConversionSuccess: + if (*inbytesleft == srcCharsUsed) { + break; + } + + FALL_THROUGH; + case kCFStringEncodingInsufficientOutputBufferLength: + debug_out("String conversion: " + "Output buffer too small\n"); + hexdump("UTF8->UTF16LE (new) input", + *inbuf, *inbytesleft); + errno = E2BIG; + return -1; + case kCFStringEncodingInvalidInputStream: + /* + * HACK: smbd/mangle_hash2.c:is_legal_name() expects + * errors here. That function will always pass 2 + * characters. smbd/open.c:check_for_pipe() cuts a + * patchname to 10 characters blindly. Suppress the + * debug output in those cases. + */ + if(2 != *inbytesleft && 10 != *inbytesleft) { + debug_out("String conversion: " + "Invalid input sequence\n"); + hexdump("UTF8->UTF16LE (new) input", + *inbuf, *inbytesleft); + } + errno = EILSEQ; + return -1; + case kCFStringEncodingConverterUnavailable: + debug_out("String conversion: " + "Unknown encoding\n"); + hexdump("UTF8->UTF16LE (new) input", + *inbuf, *inbytesleft); + errno = EINVAL; + return -1; + } + + /* + * It doesn't look like CFStringEncodingBytesToUnicode() can + * produce precomposed characters (flags=ComposeCombinings + * doesn't do it), so we need another pass over the data here. + * We can do this in-place, as the string can only get + * shorter. + * + * (Actually in theory there should be an internal + * decomposition and reordering before the actual composition + * step. But we should be able to rely on that we always get + * fully decomposed strings for input, so this can't create + * problems in reality.) + */ + CFUniCharPrecompose( + (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed, + (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed); + + native_to_le(*outbuf, dstPrecomposedUsed*2); + + *inbuf += srcCharsUsed; + *inbytesleft -= srcCharsUsed; + *outbuf += dstPrecomposedUsed*2; + *outbytesleft -= dstPrecomposedUsed*2; + + return 0; +} + +static size_t macosxfs_encoding_push( + void *cd, /* Encoder handle */ + const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */ + char **outbuf, size_t *outbytesleft) /* Script string */ +{ + static const int script_code = kCFStringEncodingUTF8; + static UniChar *buffer = NULL; + static size_t buflen = 0; + UInt32 srcCharsUsed=0, dstCharsUsed=0, result; + + (void) cd; /* UNUSED */ + + if (0 == *inbytesleft) { + return 0; + } + + buffer = set_ucbuffer_with_le( + buffer, &buflen, *inbuf, *inbytesleft); + + result = CFStringEncodingUnicodeToBytes( + script_code, kCFStringEncodingUseHFSPlusCanonical, + buffer, *inbytesleft/2, &srcCharsUsed, + *outbuf, *outbytesleft, &dstCharsUsed); + + switch(result) { + case kCFStringEncodingConversionSuccess: + if (*inbytesleft/2 == srcCharsUsed) { + break; + } + + FALL_THROUGH; + case kCFStringEncodingInsufficientOutputBufferLength: + debug_out("String conversion: " + "Output buffer too small\n"); + hexdump("UTF16LE->UTF8 (new) input", + *inbuf, *inbytesleft); + errno = E2BIG; + return -1; + case kCFStringEncodingInvalidInputStream: + /* + * HACK: smbd/open.c:check_for_pipe():is_legal_name() + * cuts a pathname to 10 characters blindly. Suppress + * the debug output in those cases. + */ + if(10 != *inbytesleft) { + debug_out("String conversion: " + "Invalid input sequence\n"); + hexdump("UTF16LE->UTF8 (new) input", + *inbuf, *inbytesleft); + } + errno = EILSEQ; + return -1; + case kCFStringEncodingConverterUnavailable: + debug_out("String conversion: " + "Unknown encoding\n"); + hexdump("UTF16LE->UTF8 (new) input", + *inbuf, *inbytesleft); + errno = EINVAL; + return -1; + } + + *inbuf += srcCharsUsed*2; + *inbytesleft -= srcCharsUsed*2; + *outbuf += dstCharsUsed; + *outbytesleft -= dstCharsUsed; + + return 0; +} + +#endif /* USE_INTERNAL_API */ + +#else /* DARWIN */ + +void charset_macosfs_dummy(void); +void charset_macosfs_dummy(void) +{ + return; +} + +#endif /* DARWIN */ |