diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000 |
commit | f8fe689a81f906d1b91bb3220acde2a4ecb14c5b (patch) | |
tree | 26484e9d7e2c67806c2d1760196ff01aaa858e8c /include/iprt/bldprog-strtab-template.cpp.h | |
parent | Initial commit. (diff) | |
download | virtualbox-upstream.tar.xz virtualbox-upstream.zip |
Adding upstream version 6.0.4-dfsg.upstream/6.0.4-dfsgupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | include/iprt/bldprog-strtab-template.cpp.h | 1046 |
1 files changed, 1046 insertions, 0 deletions
diff --git a/include/iprt/bldprog-strtab-template.cpp.h b/include/iprt/bldprog-strtab-template.cpp.h new file mode 100644 index 00000000..927a3243 --- /dev/null +++ b/include/iprt/bldprog-strtab-template.cpp.h @@ -0,0 +1,1046 @@ +/* $Id: bldprog-strtab-template.cpp.h $ */ +/** @file + * IPRT - Build Program - String Table Generator. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/* + * (Avoid include sanity checks as this is ring-3 only, C++ code.) + */ +#if defined(__cplusplus) && defined(IN_RING3) + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** @def BLDPROG_STRTAB_MAX_STRLEN + * The max length of strings in the table. */ +#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING) +# define BLDPROG_STRTAB_MAX_STRLEN 256 +#endif + +/** @def BLDPROG_STRTAB_WITH_COMPRESSION + * Enables very simple string compression. + */ +#if defined(DOXYGEN_RUNNING) +# define BLDPROG_STRTAB_WITH_COMPRESSION +#endif + +/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS + * Modifies the string compression to look for camel case words. + */ +#if defined(DOXYGEN_RUNNING) +# define BLDPROG_STRTAB_WITH_CAMEL_WORDS +#endif + +/** @def BLDPROG_STRTAB_PURE_ASCII + * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this + * is defined. Otherwise, the compression code will require IPRT to link. + */ +#if defined(DOXYGEN_RUNNING) +# define BLDPROG_STRTAB_PURE_ASCII +#endif + + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/stdarg.h> +#include <iprt/ctype.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION +# include <algorithm> +# include <map> +# include <string> +# include <vector> + +typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP; +typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR; + +#endif + +#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */ + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Build table string. + */ +typedef struct BLDPROGSTRING +{ + /** The string. + * @note This may be modified or replaced (allocated from heap) when + * compressing the string table. */ + char *pszString; + /** The string hash value. */ + uint32_t uHash; + /** The strint table offset. */ + uint32_t offStrTab; + /** The string length. */ + size_t cchString; + /** Pointer to the next string reference (same string table entry). */ + struct BLDPROGSTRING *pNextRef; + /** Pointer to the next string with the same hash value (collision). */ + struct BLDPROGSTRING *pNextCollision; + +} BLDPROGSTRING; +/** Pointer to a string table string. */ +typedef BLDPROGSTRING *PBLDPROGSTRING; + + +/** String table data. */ +typedef struct BLDPROGSTRTAB +{ + /** The size of g_papStrHash. */ + size_t cStrHash; + /** String hash table. */ + PBLDPROGSTRING *papStrHash; + /** Duplicate strings found by AddString. */ + size_t cDuplicateStrings; + /** Total length of the unique strings (no terminators). */ + size_t cchUniqueStrings; + /** Number of unique strings after AddString. */ + size_t cUniqueStrings; + /** Number of collisions. */ + size_t cCollisions; + + /** Number of entries in apSortedStrings. */ + size_t cSortedStrings; + /** The sorted string table. */ + PBLDPROGSTRING *papSortedStrings; + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + /** The 127 words we've picked to be indexed by reference. */ + BLDPROGSTRING aCompDict[127]; + /** Incoming strings pending compression. */ + PBLDPROGSTRING *papPendingStrings; + /** Current number of entries in papStrPending. */ + size_t cPendingStrings; + /** The allocated size of papPendingStrings. */ + size_t cMaxPendingStrings; + /** Work frequency map. + * @todo rewrite in plain C. */ + BLDPROGWORDFREQMAP Frequencies; +#endif + + /** The string table. */ + char *pachStrTab; + /** The actual string table size. */ + size_t cchStrTab; +} BLDPROGSTRTAB; +typedef BLDPROGSTRTAB *PBLDPROGSTRTAB; + +#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + + +/** + * Initializes the strint table compiler. + * + * @returns success indicator (out of memory if false). + * @param pThis The strint table compiler instance. + * @param cMaxStrings The max number of strings we'll be adding. + */ +static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings) +{ + pThis->cStrHash = 0; + pThis->papStrHash = NULL; + pThis->cDuplicateStrings = 0; + pThis->cchUniqueStrings = 0; + pThis->cUniqueStrings = 0; + pThis->cCollisions = 0; + pThis->cSortedStrings = 0; + pThis->papSortedStrings = NULL; + pThis->pachStrTab = NULL; + pThis->cchStrTab = 0; +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict)); + pThis->papPendingStrings = NULL; + pThis->cPendingStrings = 0; + pThis->cMaxPendingStrings = cMaxStrings; +#endif + + /* + * Allocate a hash table double the size of all strings (to avoid too + * many collisions). Add all strings to it, finding duplicates in the + * process. + */ +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + cMaxStrings += RT_ELEMENTS(pThis->aCompDict); +#endif + cMaxStrings *= 2; + pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings); + if (pThis->papStrHash) + { + pThis->cStrHash = cMaxStrings; +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings); + if (pThis->papPendingStrings) +#endif + return true; + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + free(pThis->papStrHash); + pThis->papStrHash = NULL; +#endif + } + return false; +} + + +#if 0 /* unused */ +static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis) +{ + free(pThis->papStrHash); + free(pThis->papSortedStrings); + free(pThis->pachStrTab); +# ifdef BLDPROG_STRTAB_WITH_COMPRESSION + free(pThis->papPendingStrings); +# endif + memset(pThis, 0, sizeof(*pThis)); +} +#endif + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + +DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc) +{ + /* + * Skip leading word separators. + */ +# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS + while ( ch == ' ' + || ch == '-' + || ch == '+' + || ch == '_') +# else + while (ch == ' ') +# endif + ch = *++pszSrc; + if (ch) + { + /* + * Find end of word. + */ + size_t cchWord = 1; +# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS + char chPrev = ch; + while ( (ch = pszSrc[cchWord]) != ' ' + && ch != '\0' + && ch != '-' + && ch != '+' + && ch != '_' + && ( ch == chPrev + || !RT_C_IS_UPPER(ch) + || RT_C_IS_UPPER(chPrev)) ) + { + chPrev = ch; + cchWord++; + } +# else + while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0') + cchWord++; +# endif + *ppszSrc = pszSrc; + return cchWord; + } + + *ppszSrc = pszSrc; + return 0; +} + + +/** + * Analyzes a string. + * + * @param pThis The strint table compiler instance. + * @param pStr The string to analyze. + */ +static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ + const char *psz = pStr->pszString; + + /* + * For now we just consider words. + */ + char ch; + while ((ch = *psz) != '\0') + { + size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz); + if (cchWord > 1) + { + std::string strWord(psz, cchWord); + BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord); + if (it != pThis->Frequencies.end()) + it->second += cchWord - 1; + else + pThis->Frequencies[strWord] = 0; + + /** @todo could gain hits by including the space after the word, but that + * has the same accounting problems as the two words scenario below. */ + +# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */ + /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */ + if (ch == ' ') + { + ch = psz[++cchWord]; + if (ch != ' ' && ch != '\0') + { + size_t const cchSaved = cchWord; + + do + cchWord++; + while ((ch = psz[cchWord]) != ' ' && ch != '\0'); + + strWord = std::string(psz, cchWord); + BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord); + if (it != pThis->Frequencies.end()) + it->second += cchWord - 1; + else + pThis->Frequencies[strWord] = 0; + + cchWord = cchSaved; + } + } +# endif + } + else if (!cchWord) + break; + + /* Advance. */ + psz += cchWord; + } + pStr->cchString = psz - pStr->pszString; + if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN) + { + fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString); + abort(); + } +} + +#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */ + +/** + * Adds a string to the hash table. + * @param pThis The strint table compiler instance. + * @param pStr The string. + */ +static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ + pStr->pNextRef = NULL; + pStr->pNextCollision = NULL; + pStr->offStrTab = 0; + pStr->uHash = sdbm(pStr->pszString, &pStr->cchString); + if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN) + { + fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString); + exit(RTEXITCODE_FAILURE); + } + + size_t idxHash = pStr->uHash % pThis->cStrHash; + PBLDPROGSTRING pCur = pThis->papStrHash[idxHash]; + if (!pCur) + pThis->papStrHash[idxHash] = pStr; + else + { + /* Look for matching string. */ + do + { + if ( pCur->uHash == pStr->uHash + && pCur->cchString == pStr->cchString + && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0) + { + pStr->pNextRef = pCur->pNextRef; + pCur->pNextRef = pStr; + pThis->cDuplicateStrings++; + return; + } + pCur = pCur->pNextCollision; + } while (pCur != NULL); + + /* No matching string, insert. */ + pThis->cCollisions++; + pStr->pNextCollision = pThis->papStrHash[idxHash]; + pThis->papStrHash[idxHash] = pStr; + } + + pThis->cUniqueStrings++; + pThis->cchUniqueStrings += pStr->cchString; +} + + +/** + * Adds a string to the string table. + * + * @param pThis The strint table compiler instance. + * @param pStr The string. + */ +static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + bldProgStrTab_compressorAnalyzeString(pThis, pStr); + if (pThis->cPendingStrings < pThis->cMaxPendingStrings) + pThis->papPendingStrings[pThis->cPendingStrings++] = pStr; + else + abort(); +#else + bldProgStrTab_AddStringToHashTab(pThis, pStr); +#endif +} + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + +/** + * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special + * sequences. + * + * @returns New @a pszDst position, NULL if invalid source encoding. + * @param pszDst The destination buffer. + * @param pszSrc The source buffer. + * @param cchSrc How much to copy. + */ +static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc) +{ + while (cchSrc-- > 0) + { + char ch = *pszSrc; + if (!((unsigned char)ch & 0x80)) + { + *pszDst++ = ch; + pszSrc++; + } + else + { +# ifdef BLDPROG_STRTAB_PURE_ASCII + fprintf(stderr, "error: unexpected char value %#x\n", ch); + return NULL; +# else + RTUNICP uc; + int rc = RTStrGetCpEx(&pszSrc, &uc); + if (RT_SUCCESS(rc)) + { + *pszDst++ = (unsigned char)0xff; /* escape single code point. */ + pszDst = RTStrPutCp(pszDst, uc); + } + else + { + fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc); + return NULL; + } +# endif + } + } + return pszDst; +} + + +/** + * Replaces the dictionary words and escapes non-ascii chars in a string. + * + * @param pThis The strint table compiler instance. + * @param pString The string to fixup. + */ +static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ + char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2]; + char *pszDst = szNew; + const char *pszSrc = pStr->pszString; + const char *pszSrcEnd = pszSrc + pStr->cchString; + + char ch; + while ((ch = *pszSrc) != '\0') + { + const char * const pszSrcUncompressed = pszSrc; + size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc); + size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed; + if (cchSrcUncompressed > 0) + { + pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed); + if (!pszDst) + return false; + } + if (!cchWord) + break; + + /* Check for g_aWord matches. */ + size_t cchMax = pszSrcEnd - pszSrc; + for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) + { + size_t cchLen = pThis->aCompDict[i].cchString; + if ( cchLen >= cchWord + && cchLen <= cchMax + && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0) + { + *pszDst++ = (unsigned char)(0x80 | i); + pszSrc += cchLen; + cchWord = 0; + break; + } + } + + if (cchWord) + { + /* Copy the current word. */ + pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord); + if (!pszDst) + return false; + pszSrc += cchWord; + } + } + + /* Just terminate it now. */ + *pszDst = '\0'; + + /* + * Update the string. + */ + size_t cchNew = pszDst - &szNew[0]; + if (cchNew > pStr->cchString) + { + pStr->pszString = (char *)malloc(cchNew + 1); + if (!pStr->pszString) + { + fprintf(stderr, "Out of memory!\n"); + return false; + } + } + memcpy(pStr->pszString, szNew, cchNew + 1); + pStr->cchString = cchNew; + + return true; +} + + +/** + * For sorting the frequency fidning in descending order. + * + * Comparison operators are put outside to make older gcc versions (like 4.1.1 + * on lnx64-rel) happy. + */ +class WordFreqSortEntry +{ +public: + BLDPROGWORDFREQPAIR const *m_pPair; + +public: + WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {} +}; + +bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight) +{ + return rLeft.m_pPair->second == rRight.m_pPair->second; +} + +bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight) +{ + return rLeft.m_pPair->second > rRight.m_pPair->second; +} + + +/** + * Compresses the vendor and product strings. + * + * This is very very simple (a lot less work than the string table for + * instance). + */ +static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose) +{ + /* + * Sort the frequency analyzis result and pick the top 127 ones. + */ + std::vector<WordFreqSortEntry> SortMap; + for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it) + { + BLDPROGWORDFREQPAIR const &rPair = *it; + SortMap.push_back(WordFreqSortEntry(&rPair)); + } + + sort(SortMap.begin(), SortMap.end()); + + size_t cb = 0; + size_t i = 0; + for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin(); + it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict); + ++it, i++) + { + pThis->aCompDict[i].cchString = it->m_pPair->first.length(); + pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1); + if (pThis->aCompDict[i].pszString) + memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1); + else + return false; + cb += it->m_pPair->second; + } + + if (fVerbose) + printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb); + + /* + * Rework the strings. + */ + size_t cchOld = 0; + size_t cchOldMax = 0; + size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN; + size_t cchNew = 0; + size_t cchNewMax = 0; + size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN; + i = pThis->cPendingStrings; + while (i-- > 0) + { + PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i]; + cchOld += pCurStr->cchString; + if (pCurStr->cchString > cchOldMax) + cchOldMax = pCurStr->cchString; + if (pCurStr->cchString < cchOldMin) + cchOldMin = pCurStr->cchString; + + if (!bldProgStrTab_compressorFixupString(pThis, pCurStr)) + return false; + + cchNew += pCurStr->cchString; + if (pCurStr->cchString > cchNewMax) + cchNewMax = pCurStr->cchString; + if (pCurStr->cchString < cchNewMin) + cchNewMin = pCurStr->cchString; + + bldProgStrTab_AddStringToHashTab(pThis, pCurStr); + } + + /* + * Do debug stats. + */ + if (fVerbose) + { + for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) + cchNew += pThis->aCompDict[i].cchString + 1; + + printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew); + if (cchNew < cchOld) + printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld)); + else + printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew)); + printf("debug: Original string lengths: average %u; min %u; max %u\n", + (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax); + printf("debug: Compressed string lengths: average %u; min %u; max %u\n", + (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax); + } + + return true; +} + +#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */ + +/** + * Inserts a string into g_apUniqueStrings. + * @param pThis The strint table compiler instance. + * @param pStr The string. + */ +static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ + size_t iIdx; + size_t iEnd = pThis->cSortedStrings; + if (iEnd) + { + size_t iStart = 0; + for (;;) + { + iIdx = iStart + (iEnd - iStart) / 2; + if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString) + { + if (iIdx <= iStart) + break; + iEnd = iIdx; + } + else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString) + { + if (++iIdx >= iEnd) + break; + iStart = iIdx; + } + else + break; + } + + if (iIdx != pThis->cSortedStrings) + memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx], + (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx])); + } + else + iIdx = 0; + + pThis->papSortedStrings[iIdx] = pStr; + pThis->cSortedStrings++; +} + + +/** + * Compiles the string table after the string has been added. + * + * This will save space by dropping string terminators, eliminating duplicates + * and try find strings that are sub-strings of others. + * + * Will initialize the StrRef of all StrTabString instances. + * + * @returns success indicator (error flagged). + * @param pThis The strint table compiler instance. + */ +static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose) +{ +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + /* + * Do the compression and add all the compressed strings to the table. + */ + if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose)) + return false; + + /* + * Add the dictionary strings. + */ + for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) + bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]); +#endif + if (fVerbose) + printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n", + (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings, + (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions); + + /* + * Create papSortedStrings from the hash table. The table is sorted by + * string length, with the longer strings first. + */ + pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings); + if (!pThis->papSortedStrings) + return false; + pThis->cSortedStrings = 0; + size_t idxHash = pThis->cStrHash; + while (idxHash-- > 0) + { + PBLDPROGSTRING pCur = pThis->papStrHash[idxHash]; + if (pCur) + { + do + { + bldProgStrTab_InsertUniqueString(pThis, pCur); + pCur = pCur->pNextCollision; + } while (pCur); + } + } + + /* + * Create the actual string table. + */ + pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1); + if (!pThis->pachStrTab) + return false; + pThis->cchStrTab = 0; + for (size_t i = 0; i < pThis->cSortedStrings; i++) + { + PBLDPROGSTRING pCur = pThis->papSortedStrings[i]; + const char * const pszCur = pCur->pszString; + size_t const cchCur = pCur->cchString; + size_t offStrTab = pThis->cchStrTab; + + /* + * See if the string is a substring already found in the string table. + * Excluding the zero terminator increases the chances for this. + */ + size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0; + const char *pchLeft = pThis->pachStrTab; + char const chFirst = *pszCur; + while (cchLeft > 0) + { + const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft); + if (!pchCandidate) + break; + if (memcmp(pchCandidate, pszCur, cchCur) == 0) + { + offStrTab = pchCandidate - pThis->pachStrTab; + break; + } + + cchLeft -= pchCandidate + 1 - pchLeft; + pchLeft = pchCandidate + 1; + } + + if (offStrTab == pThis->cchStrTab) + { + /* + * See if the start of the string overlaps the end of the string table. + */ + if (pThis->cchStrTab && cchCur > 1) + { + cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1); + pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft]; + while (cchLeft > 0) + { + const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft); + if (!pchCandidate) + break; + cchLeft -= pchCandidate - pchLeft; + pchLeft = pchCandidate; + if (memcmp(pchLeft, pszCur, cchLeft) == 0) + { + size_t cchToCopy = cchCur - cchLeft; + memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy); + pThis->cchStrTab += cchToCopy; + offStrTab = pchCandidate - pThis->pachStrTab; + break; + } + cchLeft--; + pchLeft++; + } + } + + /* + * If we didn't have any luck above, just append the string. + */ + if (offStrTab == pThis->cchStrTab) + { + memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur); + pThis->cchStrTab += cchCur; + } + } + + /* + * Set the string table offset for all the references to this string. + */ + do + { + pCur->offStrTab = (uint32_t)offStrTab; + pCur = pCur->pNextRef; + } while (pCur != NULL); + } + + if (fVerbose) + printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab); + return true; +} + + +#ifdef RT_STRICT +/** + * Sanity checks a string table string. + * @param pThis The strint table compiler instance. + * @param pStr The string to check. + */ +static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) +{ + if (pStr->cchString != strlen(pStr->pszString)) + abort(); + if (pStr->offStrTab >= pThis->cchStrTab) + abort(); + if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab) + abort(); + if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0) + abort(); +} +#endif + + +/** + * Output the string table string in C string litteral fashion. + * + * @param pThis The strint table instance. + * @param pString The string to print. + * @param pOut The output stream. + */ +static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut) +{ + const unsigned char *psz = (const unsigned char *)pString->pszString; + unsigned char uch; + while ((uch = *psz++) != '\0') + { + if (!(uch & 0x80)) + { + if (uch != '\'' && uch != '\\') + fputc((char)uch, pOut); + else + { + fputc('\\', pOut); + fputc((char)uch, pOut); + } + } +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + else if (uch != 0xff) + fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut); + else + { +# ifdef BLDPROG_STRTAB_PURE_ASCII + abort(); +# else + RTUNICP uc = RTStrGetCp((const char *)psz); + psz += RTStrCpSize(uc); + fprintf(pOut, "\\u%04x", uc); +# endif + } +#else + else + fprintf(pOut, "\\x%02x", (unsigned)uch); + NOREF(pThis); +#endif + } +} + + +/** + * Writes the string table code to the output stream. + * + * @param pThis The strint table compiler instance. + * @param pOut The output stream. + * @param pszScope The scoping ("static " or empty string), + * including trailing space. + * @param pszPrefix The variable prefix. Typically "g_" for C programs, + * whereas C++ classes normally use "class::s_g". + * @param pszBaseName The base name for the variables. The user + * accessible variable will end up as just the + * prefix and basename concatenated. + */ +static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut, + const char *pszScope, const char *pszPrefix, const char *pszBaseName) +{ +#ifdef RT_STRICT + /* + * Do some quick sanity checks while we're here. + */ +# ifdef BLDPROG_STRTAB_WITH_COMPRESSION + for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) + BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]); +# endif +#endif + + /* + * Create a table for speeding up the character categorization. + */ + uint8_t abCharCat[256]; + memset(abCharCat, 0, sizeof(abCharCat)); + abCharCat[(unsigned char)'\\'] = 1; + abCharCat[(unsigned char)'\''] = 1; + for (unsigned i = 0; i < 0x20; i++) + abCharCat[i] = 2; + for (unsigned i = 0x7f; i < 0x100; i++) + abCharCat[i] = 2; + + /* + * We follow the sorted string table, one string per line. + */ + fprintf(pOut, + "#include <iprt/bldprog-strtab.h>\n" + "\n" + "static const char g_achStrTab%s[] =\n" + "{\n", + pszBaseName); + + uint32_t off = 0; + for (uint32_t i = 0; i < pThis->cSortedStrings; i++) + { + PBLDPROGSTRING pCur = pThis->papSortedStrings[i]; + uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString; + if (offEnd > off) + { + /* Comment with a uncompressed and more readable version of the string. */ + fprintf(pOut, off == pCur->offStrTab ? "/* 0x%05x = \"" : "/* 0X%05x = \"", off); + BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut); + fputs("\" */\n", pOut); + + /* Must use char by char here or we may trigger the max string + length limit in the compiler, */ + fputs(" ", pOut); + for (; off < offEnd; off++) + { + unsigned char uch = pThis->pachStrTab[off]; + fputc('\'', pOut); + if (abCharCat[uch] == 0) + fputc(uch, pOut); + else if (abCharCat[uch] != 1) + fprintf(pOut, "\\x%02x", (unsigned)uch); + else + { + fputc('\\', pOut); + fputc(uch, pOut); + } + fputc('\'', pOut); + fputc(',', pOut); + } + fputc('\n', pOut); + } + } + + fprintf(pOut, + "};\n" + "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n", + pszBaseName, (unsigned)pThis->cchStrTab); + +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + /* + * Write the compression dictionary. + */ + fprintf(pOut, + "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n" + "{\n", + pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict)); + for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) + fprintf(pOut, " { %#08x, %#04x }, // %s\n", + pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString); + fprintf(pOut, "};\n\n"); +#endif + + + /* + * Write the string table data structure. + */ + fprintf(pOut, + "%sconst RTBLDPROGSTRTAB %s%s = \n" + "{\n" + " /*.pchStrTab = */ &g_achStrTab%s[0],\n" + " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n" + , + pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName); +#ifdef BLDPROG_STRTAB_WITH_COMPRESSION + fprintf(pOut, + " /*.cCompDict = */ %u,\n" + " /*.paCompDict = */ &g_aCompDict%s[0]\n" + "};\n" + , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName); +#else + fprintf(pOut, + " /*.cCompDict = */ 0,\n" + " /*.paCompDict = */ NULL\n" + "};\n"); +#endif +} + +#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6) +# pragma GCC diagnostic pop +#endif + +#endif /* __cplusplus && IN_RING3 */ + |