/* $Id: bldprog-strtab-template.cpp.h $ */ /** @file * IPRT - Build Program - String Table Generator. */ /* * Copyright (C) 2006-2022 Oracle and/or its affiliates. * * This file is part of VirtualBox base platform packages, as * available from https://www.virtualbox.org. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, in version 3 of the * License. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see . * * The contents of this file may alternatively be used under the terms * of the Common Development and Distribution License Version 1.0 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included * in the VirtualBox distribution, in which case the provisions of the * CDDL are applicable instead of those of the GPL. * * You may elect to license modified versions of this file under the * terms and conditions of either the GPL or the CDDL or both. * * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0 */ /* * (Avoid include sanity checks as this is ring-3 only, C++ code.) */ #if defined(__cplusplus) && defined(IN_RING3) /********************************************************************************************************************************* * Defined Constants And Macros * *********************************************************************************************************************************/ /** @def BLDPROG_STRTAB_MAX_STRLEN * The max length of strings in the table. */ #if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING) # define BLDPROG_STRTAB_MAX_STRLEN 256 #endif /** @def BLDPROG_STRTAB_WITH_COMPRESSION * Enables very simple string compression. */ #if defined(DOXYGEN_RUNNING) # define BLDPROG_STRTAB_WITH_COMPRESSION #endif /** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS * Modifies the string compression to look for camel case words. */ #if defined(DOXYGEN_RUNNING) # define BLDPROG_STRTAB_WITH_CAMEL_WORDS #endif /** @def BLDPROG_STRTAB_PURE_ASCII * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this * is defined. Otherwise, the compression code will require IPRT to link. */ #if defined(DOXYGEN_RUNNING) # define BLDPROG_STRTAB_PURE_ASCII #endif /********************************************************************************************************************************* * Header Files * *********************************************************************************************************************************/ #include #include #include #include #include #ifdef BLDPROG_STRTAB_WITH_COMPRESSION # include # include # define BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE typedef struct BLDPROGWORDFREQSTATS { uint32_t cWithoutSep; /**< Number of occurances without a separator. */ # ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE uint32_t cWithSep; /**< Number of occurance with a separator. */ char chSep; /**< The separator. First come basis. */ # endif } BLDPROGWORDFREQSTATS; typedef std::map BLDPROGWORDFREQMAP; #endif #include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */ /********************************************************************************************************************************* * Structures and Typedefs * *********************************************************************************************************************************/ /** * Build table string. */ typedef struct BLDPROGSTRING { /** The string. * @note This may be modified or replaced (allocated from heap) when * compressing the string table. */ char *pszString; /** The string hash value. */ uint32_t uHash; /** The strint table offset. */ uint32_t offStrTab; /** The string length. */ size_t cchString; /** Pointer to the next string reference (same string table entry). */ struct BLDPROGSTRING *pNextRef; /** Pointer to the next string with the same hash value (collision). */ struct BLDPROGSTRING *pNextCollision; } BLDPROGSTRING; /** Pointer to a string table string. */ typedef BLDPROGSTRING *PBLDPROGSTRING; /** String table data. */ typedef struct BLDPROGSTRTAB { /** The size of g_papStrHash. */ size_t cStrHash; /** String hash table. */ PBLDPROGSTRING *papStrHash; /** Duplicate strings found by AddString. */ size_t cDuplicateStrings; /** Total length of the unique strings (no terminators). */ size_t cchUniqueStrings; /** Number of unique strings after AddString. */ size_t cUniqueStrings; /** Number of collisions. */ size_t cCollisions; /** Number of entries in apSortedStrings. */ size_t cSortedStrings; /** The sorted string table. */ PBLDPROGSTRING *papSortedStrings; #ifdef BLDPROG_STRTAB_WITH_COMPRESSION /** The 256 words we've picked to be indexed by reference. */ BLDPROGSTRING aCompDict[256]; /** The frequency of the 256 dictionary entries. */ size_t auCompDictFreq[256]; /** Incoming strings pending compression. */ PBLDPROGSTRING *papPendingStrings; /** Current number of entries in papStrPending. */ size_t cPendingStrings; /** The allocated size of papPendingStrings. */ size_t cMaxPendingStrings; /** Work frequency map. * @todo rewrite in plain C. */ BLDPROGWORDFREQMAP Frequencies; /** Map of characters used by input strings. */ uint64_t bmUsedChars[256/64]; #endif /** The string table. */ char *pachStrTab; /** The actual string table size. */ size_t cchStrTab; } BLDPROGSTRTAB; typedef BLDPROGSTRTAB *PBLDPROGSTRTAB; #if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-function" #endif #ifdef BLDPROG_STRTAB_WITH_COMPRESSION /** * Same as ASMBitTest. * * We cannot safely use ASMBitTest here because it must be inline, as this code * is used to build RuntimeBldProg. */ DECLINLINE(bool) BldProgBitIsSet(uint64_t const *pbmBitmap, size_t iBit) { return RT_BOOL(pbmBitmap[iBit / 64] & RT_BIT_64(iBit % 64)); } /** * Same as ASMBitSet. * * We cannot safely use ASMBitSet here because it must be inline, as this code * is used to build RuntimeBldProg. */ DECLINLINE(void) BldProgBitSet(uint64_t *pbmBitmap, size_t iBit) { pbmBitmap[iBit / 64] |= RT_BIT_64(iBit % 64); } #endif /** * Initializes the strint table compiler. * * @returns success indicator (out of memory if false). * @param pThis The strint table compiler instance. * @param cMaxStrings The max number of strings we'll be adding. */ static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings) { pThis->cStrHash = 0; pThis->papStrHash = NULL; pThis->cDuplicateStrings = 0; pThis->cchUniqueStrings = 0; pThis->cUniqueStrings = 0; pThis->cCollisions = 0; pThis->cSortedStrings = 0; pThis->papSortedStrings = NULL; pThis->pachStrTab = NULL; pThis->cchStrTab = 0; #ifdef BLDPROG_STRTAB_WITH_COMPRESSION memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict)); pThis->papPendingStrings = NULL; pThis->cPendingStrings = 0; pThis->cMaxPendingStrings = cMaxStrings; memset(pThis->bmUsedChars, 0, sizeof(pThis->bmUsedChars)); BldProgBitSet(pThis->bmUsedChars, 0); /* Some parts of the code still thinks zero is a terminator, so don't use it for now. */ # ifndef BLDPROG_STRTAB_PURE_ASCII BldProgBitSet(pThis->bmUsedChars, 0xff); /* Reserve escape byte for codepoints above 127. */ # endif #endif /* * Allocate a hash table double the size of all strings (to avoid too * many collisions). Add all strings to it, finding duplicates in the * process. */ #ifdef BLDPROG_STRTAB_WITH_COMPRESSION cMaxStrings += RT_ELEMENTS(pThis->aCompDict); #endif cMaxStrings *= 2; pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings); if (pThis->papStrHash) { pThis->cStrHash = cMaxStrings; #ifdef BLDPROG_STRTAB_WITH_COMPRESSION pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings); if (pThis->papPendingStrings) #endif return true; #ifdef BLDPROG_STRTAB_WITH_COMPRESSION free(pThis->papStrHash); pThis->papStrHash = NULL; #endif } return false; } #if 0 /* unused */ static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis) { free(pThis->papStrHash); free(pThis->papSortedStrings); free(pThis->pachStrTab); # ifdef BLDPROG_STRTAB_WITH_COMPRESSION free(pThis->papPendingStrings); # endif memset(pThis, 0, sizeof(*pThis)); } #endif #ifdef BLDPROG_STRTAB_WITH_COMPRESSION DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc) { /* * Skip leading word separators. */ # ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS while ( ch == ' ' || ch == '-' || ch == '+' || ch == '_') # else while (ch == ' ') # endif ch = *++pszSrc; if (ch) { /* * Find end of word. */ size_t cchWord = 1; # ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS char chPrev = ch; while ( (ch = pszSrc[cchWord]) != ' ' && ch != '\0' && ch != '-' && ch != '+' && ch != '_' && ( ch == chPrev || !RT_C_IS_UPPER(ch) || RT_C_IS_UPPER(chPrev)) ) { chPrev = ch; cchWord++; } # else while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0') cchWord++; # endif *ppszSrc = pszSrc; return cchWord; } *ppszSrc = pszSrc; return 0; } /** * Analyzes a string. * * @param pThis The strint table compiler instance. * @param pStr The string to analyze. */ static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { /* * Mark all the string characters as used. */ const char *psz = pStr->pszString; char ch; while ((ch = *psz++) != '\0') BldProgBitSet(pThis->bmUsedChars, (uint8_t)ch); /* * For now we just consider words. */ psz = pStr->pszString; while ((ch = *psz) != '\0') { size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz); if (cchWord > 1) { std::string strWord(psz, cchWord); BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord); if (it != pThis->Frequencies.end()) { # ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE char const chSep = psz[cchWord]; if (chSep != '\0' && (it->second.chSep == chSep || it->second.chSep == '\0')) { it->second.chSep = chSep; it->second.cWithSep++; } else # endif it->second.cWithoutSep++; } else { # ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE char const chSep = psz[cchWord]; if (chSep != '\0') { BLDPROGWORDFREQSTATS const NewWord = { 0, 0, chSep }; pThis->Frequencies[strWord] = NewWord; } else { static BLDPROGWORDFREQSTATS const s_NewWord = { 0, 0, 0 }; pThis->Frequencies[strWord] = s_NewWord; } # else pThis->Frequencies[strWord].cWithoutSep = 0; # endif } # if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */ /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */ if (ch == ' ') { ch = psz[++cchWord]; if (ch != ' ' && ch != '\0') { size_t const cchSaved = cchWord; do cchWord++; while ((ch = psz[cchWord]) != ' ' && ch != '\0'); strWord = std::string(psz, cchWord); BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord); if (it != pThis->Frequencies.end()) it->second += cchWord - 1; else pThis->Frequencies[strWord] = 0; cchWord = cchSaved; } } # endif } else if (!cchWord) break; /* Advance. */ psz += cchWord; } pStr->cchString = psz - pStr->pszString; if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN) { fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString); abort(); } } #endif /* BLDPROG_STRTAB_WITH_COMPRESSION */ /** * Adds a string to the hash table. * @param pThis The strint table compiler instance. * @param pStr The string. */ static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { pStr->pNextRef = NULL; pStr->pNextCollision = NULL; pStr->offStrTab = 0; pStr->uHash = sdbm(pStr->pszString, &pStr->cchString); if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN) { fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString); exit(RTEXITCODE_FAILURE); } size_t idxHash = pStr->uHash % pThis->cStrHash; PBLDPROGSTRING pCur = pThis->papStrHash[idxHash]; if (!pCur) pThis->papStrHash[idxHash] = pStr; else { /* Look for matching string. */ do { if ( pCur->uHash == pStr->uHash && pCur->cchString == pStr->cchString && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0) { pStr->pNextRef = pCur->pNextRef; pCur->pNextRef = pStr; pThis->cDuplicateStrings++; return; } pCur = pCur->pNextCollision; } while (pCur != NULL); /* No matching string, insert. */ pThis->cCollisions++; pStr->pNextCollision = pThis->papStrHash[idxHash]; pThis->papStrHash[idxHash] = pStr; } pThis->cUniqueStrings++; pThis->cchUniqueStrings += pStr->cchString; } /** * Adds a string to the string table. * * @param pThis The strint table compiler instance. * @param pStr The string. */ static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { #ifdef BLDPROG_STRTAB_WITH_COMPRESSION bldProgStrTab_compressorAnalyzeString(pThis, pStr); if (pThis->cPendingStrings < pThis->cMaxPendingStrings) pThis->papPendingStrings[pThis->cPendingStrings++] = pStr; else abort(); #else bldProgStrTab_AddStringToHashTab(pThis, pStr); #endif } /** * Adds a string to the string table. * * @param pThis The strint table compiler instance. * @param pStr The string entry (uninitialized). * @param psz The string, will be duplicated if compression is enabled. */ DECLINLINE(void) BldProgStrTab_AddStringDup(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr, const char *psz) { #ifdef BLDPROG_STRTAB_WITH_COMPRESSION pStr->pszString = strdup(psz); if (!pStr->pszString) abort(); #else pStr->pszString = (char *)psz; #endif BldProgStrTab_AddString(pThis, pStr); } #ifdef BLDPROG_STRTAB_WITH_COMPRESSION /** * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special * sequences. * * @returns New @a pszDst position, NULL if invalid source encoding. * @param pszDst The destination buffer. * @param pszSrc The source buffer. * @param cchSrc How much to copy. */ static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc) { while (cchSrc-- > 0) { char ch = *pszSrc; if (!((unsigned char)ch & 0x80)) { *pszDst++ = ch; pszSrc++; } else { # ifdef BLDPROG_STRTAB_PURE_ASCII fprintf(stderr, "error: unexpected char value %#x\n", ch); return NULL; # else RTUNICP uc; int rc = RTStrGetCpEx(&pszSrc, &uc); if (RT_SUCCESS(rc)) { *pszDst++ = (unsigned char)0xff; /* escape single code point. */ pszDst = RTStrPutCp(pszDst, uc); } else { fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc); return NULL; } # endif } } return pszDst; } /** * Replaces the dictionary words and escapes non-ascii chars in a string. * * @param pThis The strint table compiler instance. * @param pString The string to fixup. */ static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2]; char *pszDst = szNew; const char *pszSrc = pStr->pszString; const char *pszSrcEnd = pszSrc + pStr->cchString; char ch; while ((ch = *pszSrc) != '\0') { const char * const pszSrcUncompressed = pszSrc; size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc); size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed; if (cchSrcUncompressed > 0) { pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed); if (!pszDst) return false; } if (!cchWord) break; /* Check for g_aWord matches. */ if (cchWord > 1) { size_t cchMax = pszSrcEnd - pszSrc; for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) { size_t cchLen = pThis->aCompDict[i].cchString; if ( cchLen >= cchWord && cchLen <= cchMax && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0) { *pszDst++ = (unsigned char)i; pszSrc += cchLen; cchWord = 0; break; } } } if (cchWord > 0) { /* Copy the current word. */ pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord); if (!pszDst) return false; pszSrc += cchWord; } } /* Just terminate it now. */ *pszDst = '\0'; /* * Update the string. */ size_t cchNew = pszDst - &szNew[0]; if (cchNew > pStr->cchString) { pStr->pszString = (char *)malloc(cchNew + 1); if (!pStr->pszString) { fprintf(stderr, "Out of memory!\n"); return false; } } memcpy(pStr->pszString, szNew, cchNew + 1); pStr->cchString = cchNew; return true; } /** * Entry in SortedDictionary. * * Uses variable length string member, so not class and allocated via malloc. */ struct SortedDictionaryEntry { size_t m_cchGain; size_t m_cchString; RT_FLEXIBLE_ARRAY_EXTENSION char m_szString[RT_FLEXIBLE_ARRAY]; /** Allocates and initializes a new entry. */ static SortedDictionaryEntry *allocate(const char *a_pch, size_t a_cch, size_t a_cchGain, char a_chSep) { size_t cbString = a_cch + !!a_chSep + 1; SortedDictionaryEntry *pNew = (SortedDictionaryEntry *)malloc(RT_UOFFSETOF(SortedDictionaryEntry, m_szString) + cbString); if (pNew) { pNew->m_cchGain = a_cchGain; memcpy(pNew->m_szString, a_pch, a_cch); if (a_chSep) pNew->m_szString[a_cch++] = a_chSep; pNew->m_szString[a_cch] = '\0'; pNew->m_cchString = a_cch; } return pNew; } /** Compares this dictionary entry with an incoming one. * @retval -1 if this entry is of less worth than the new one. * @retval 0 if this entry is of equal worth to the new one. * @retval +1 if this entry is of more worth than the new one. */ int compare(size_t a_cchGain, size_t a_cchString) { /* Higher gain is preferred of course: */ if (m_cchGain < a_cchGain) return -1; if (m_cchGain > a_cchGain) return 1; /* Gain is the same. Prefer the shorter string, as it will result in a shorter string table: */ if (m_cchString > a_cchString) return -1; if (m_cchString < a_cchString) return 1; return 0; } }; /** * Insertion sort dictionary that keeps the 256 best words. * * Used by bldProgStrTab_compressorDoStringCompression to pick the dictionary * words. */ class SortedDictionary { public: size_t m_cEntries; SortedDictionaryEntry *m_apEntries[256]; SortedDictionary() : m_cEntries(0) { for (size_t i = 0; i < RT_ELEMENTS(m_apEntries); i++) m_apEntries[i] = NULL; } ~SortedDictionary() { while (m_cEntries > 0) { free(m_apEntries[--m_cEntries]); m_apEntries[m_cEntries] = NULL; } } /** * Inserts a new entry, if it's worth it. * @returns true on succes, false if out of memory. */ bool insert(const char *a_pchString, size_t a_cchStringBase, size_t a_cchGain, char a_chSep = 0) { size_t const cchString = a_cchStringBase + (a_chSep + 1); /* * Drop the insert if the symbol table is full and the insert is less worth the last entry: */ if ( m_cEntries >= RT_ELEMENTS(m_apEntries) && m_apEntries[RT_ELEMENTS(m_apEntries) - 1]->compare(a_cchGain, cchString) >= 0) return true; /* * Create a new entry to insert. */ SortedDictionaryEntry *pNewEntry = SortedDictionaryEntry::allocate(a_pchString, a_cchStringBase, a_cchGain, a_chSep); if (!pNewEntry) return false; /* * Find the insert point. */ if (m_cEntries == 0) { m_apEntries[0] = pNewEntry; m_cEntries = 1; } else { /* If table is full, drop the last entry before we start (already made sure the incoming entry is preferable to the one were dropping): */ if (m_cEntries >= RT_ELEMENTS(m_apEntries)) { free(m_apEntries[RT_ELEMENTS(m_apEntries) - 1]); m_apEntries[RT_ELEMENTS(m_apEntries) - 1] = NULL; m_cEntries = RT_ELEMENTS(m_apEntries) - 1; } /* Find where to insert the new entry: */ /** @todo use binary search. */ size_t i = m_cEntries; while (i > 0 && m_apEntries[i - 1]->compare(a_cchGain, cchString) < 0) i--; /* Shift entries to make room and insert the new entry. */ if (i < m_cEntries) memmove(&m_apEntries[i + 1], &m_apEntries[i], (m_cEntries - i) * sizeof(m_apEntries[0])); m_apEntries[i] = pNewEntry; m_cEntries++; } return true; } }; /** * Compresses the vendor and product strings. * * This is very very simple (a lot less work than the string table for * instance). */ static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose) { /* * Sort the frequency analyzis result and pick the top entries for any * available dictionary slots. */ SortedDictionary SortedDict; for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it) { bool fInsert; size_t const cchString = it->first.length(); # ifndef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE size_t const cchGainWithout = it->second.cWithoutSep * cchString; # else size_t const cchGainWithout = (it->second.cWithoutSep + it->second.cWithSep) * cchString; size_t const cchGainWith = it->second.cWithSep * (cchString + 1); if (cchGainWith > cchGainWithout) fInsert = SortedDict.insert(it->first.c_str(), cchString, cchGainWith, it->second.chSep); else # endif fInsert = SortedDict.insert(it->first.c_str(), cchString, cchGainWithout); if (!fInsert) return false; } size_t cb = 0; size_t cWords = 0; size_t iDict = 0; for (size_t i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) { char szTmp[2] = { (char)i, '\0' }; const char *psz = szTmp; if ( BldProgBitIsSet(pThis->bmUsedChars, i) || iDict >= SortedDict.m_cEntries) { /* character entry */ pThis->auCompDictFreq[i] = 0; pThis->aCompDict[i].cchString = 1; } else { /* word entry */ cb += SortedDict.m_apEntries[iDict]->m_cchGain; pThis->auCompDictFreq[i] = SortedDict.m_apEntries[iDict]->m_cchGain; pThis->aCompDict[i].cchString = SortedDict.m_apEntries[iDict]->m_cchString; psz = SortedDict.m_apEntries[iDict]->m_szString; cWords++; iDict++; } pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1); if (pThis->aCompDict[i].pszString) memcpy(pThis->aCompDict[i].pszString, psz, pThis->aCompDict[i].cchString + 1); else return false; } if (fVerbose) printf("debug: Estimated string compression saving: %u bytes\n" "debug: %u words, %u characters\n" , (unsigned)cb, (unsigned)cWords, (unsigned)(RT_ELEMENTS(pThis->aCompDict) - cWords)); /* * Rework the strings. */ size_t cchOld = 0; size_t cchOldMax = 0; size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN; size_t cchNew = 0; size_t cchNewMax = 0; size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN; size_t i = pThis->cPendingStrings; while (i-- > 0) { PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i]; cchOld += pCurStr->cchString; if (pCurStr->cchString > cchOldMax) cchOldMax = pCurStr->cchString; if (pCurStr->cchString < cchOldMin) cchOldMin = pCurStr->cchString; if (!bldProgStrTab_compressorFixupString(pThis, pCurStr)) return false; cchNew += pCurStr->cchString; if (pCurStr->cchString > cchNewMax) cchNewMax = pCurStr->cchString; if (pCurStr->cchString < cchNewMin) cchNewMin = pCurStr->cchString; bldProgStrTab_AddStringToHashTab(pThis, pCurStr); } /* * Do debug stats. */ if (fVerbose) { for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) cchNew += pThis->aCompDict[i].cchString + 1; printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew); if (cchNew < cchOld) printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld)); else printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew)); printf("debug: Original string lengths: average %u; min %u; max %u\n", (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax); printf("debug: Compressed string lengths: average %u; min %u; max %u\n", (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax); } return true; } #endif /* BLDPROG_STRTAB_WITH_COMPRESSION */ /** * Inserts a string into g_apUniqueStrings. * @param pThis The strint table compiler instance. * @param pStr The string. */ static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { size_t iIdx; size_t iEnd = pThis->cSortedStrings; if (iEnd) { size_t iStart = 0; for (;;) { iIdx = iStart + (iEnd - iStart) / 2; if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString) { if (iIdx <= iStart) break; iEnd = iIdx; } else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString) { if (++iIdx >= iEnd) break; iStart = iIdx; } else break; } if (iIdx != pThis->cSortedStrings) memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx], (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx])); } else iIdx = 0; pThis->papSortedStrings[iIdx] = pStr; pThis->cSortedStrings++; } /** * Compiles the string table after the string has been added. * * This will save space by dropping string terminators, eliminating duplicates * and try find strings that are sub-strings of others. * * Will initialize the StrRef of all StrTabString instances. * * @returns success indicator (error flagged). * @param pThis The strint table compiler instance. */ static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose) { #ifdef BLDPROG_STRTAB_WITH_COMPRESSION /* * Do the compression and add all the compressed strings to the table. */ if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose)) return false; /* * Add the dictionary strings. */ for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) if (pThis->aCompDict[i].cchString > 1) bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]); # ifdef RT_STRICT else if (pThis->aCompDict[i].cchString != 1) abort(); # endif #endif if (fVerbose) printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n", (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings, (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions); /* * Create papSortedStrings from the hash table. The table is sorted by * string length, with the longer strings first, so that we increase our * chances of locating duplicate substrings. */ pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings); if (!pThis->papSortedStrings) return false; pThis->cSortedStrings = 0; size_t idxHash = pThis->cStrHash; while (idxHash-- > 0) { PBLDPROGSTRING pCur = pThis->papStrHash[idxHash]; if (pCur) { do { bldProgStrTab_InsertUniqueString(pThis, pCur); pCur = pCur->pNextCollision; } while (pCur); } } /* * Create the actual string table. */ pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1); if (!pThis->pachStrTab) return false; pThis->cchStrTab = 0; for (size_t i = 0; i < pThis->cSortedStrings; i++) { PBLDPROGSTRING pCur = pThis->papSortedStrings[i]; const char * const pszCur = pCur->pszString; size_t const cchCur = pCur->cchString; size_t offStrTab = pThis->cchStrTab; /* * See if the string is a substring already found in the string table. * Excluding the zero terminator increases the chances for this. */ size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0; const char *pchLeft = pThis->pachStrTab; char const chFirst = *pszCur; while (cchLeft > 0) { const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft); if (!pchCandidate) break; if (memcmp(pchCandidate, pszCur, cchCur) == 0) { offStrTab = pchCandidate - pThis->pachStrTab; break; } cchLeft -= pchCandidate + 1 - pchLeft; pchLeft = pchCandidate + 1; } if (offStrTab == pThis->cchStrTab) { /* * See if the start of the string overlaps the end of the string table. */ if (pThis->cchStrTab && cchCur > 1) { cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1); pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft]; while (cchLeft > 0) { const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft); if (!pchCandidate) break; cchLeft -= pchCandidate - pchLeft; pchLeft = pchCandidate; if (memcmp(pchLeft, pszCur, cchLeft) == 0) { size_t cchToCopy = cchCur - cchLeft; memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy); pThis->cchStrTab += cchToCopy; offStrTab = pchCandidate - pThis->pachStrTab; break; } cchLeft--; pchLeft++; } } /* * If we didn't have any luck above, just append the string. */ if (offStrTab == pThis->cchStrTab) { memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur); pThis->cchStrTab += cchCur; } } /* * Set the string table offset for all the references to this string. */ do { pCur->offStrTab = (uint32_t)offStrTab; pCur = pCur->pNextRef; } while (pCur != NULL); } if (fVerbose) printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab); return true; } #ifdef RT_STRICT /** * Sanity checks a string table string. * @param pThis The strint table compiler instance. * @param pStr The string to check. */ static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr) { if (pStr->cchString != strlen(pStr->pszString)) abort(); if (pStr->offStrTab >= pThis->cchStrTab) abort(); if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab) abort(); if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0) abort(); } #endif /** * Output the string table string in C string litteral fashion. * * @param pThis The strint table instance. * @param pString The string to print. * @param pOut The output stream. */ static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut) { const unsigned char *psz = (const unsigned char *)pString->pszString; unsigned char uch; while ((uch = *psz++) != '\0') { #ifdef BLDPROG_STRTAB_WITH_COMPRESSION if (pThis->aCompDict[uch].cchString == 1) #else if (!(uch & 0x80)) #endif { if (uch != '\'' && uch != '\\') fputc((char)uch, pOut); else { fputc('\\', pOut); fputc((char)uch, pOut); } } #ifdef BLDPROG_STRTAB_WITH_COMPRESSION # ifndef BLDPROG_STRTAB_PURE_ASCII else if (uch == 0xff) { RTUNICP uc = RTStrGetCp((const char *)psz); psz += RTStrCpSize(uc); fprintf(pOut, "\\u%04x", uc); } # else else fputs(pThis->aCompDict[uch].pszString, pOut); # endif #else else fprintf(pOut, "\\x%02x", (unsigned)uch); NOREF(pThis); #endif } } /** * Writes the string table code to the output stream. * * @param pThis The strint table compiler instance. * @param pOut The output stream. * @param pszScope The scoping ("static " or empty string), * including trailing space. * @param pszPrefix The variable prefix. Typically "g_" for C programs, * whereas C++ classes normally use "class::s_g". * @param pszBaseName The base name for the variables. The user * accessible variable will end up as just the * prefix and basename concatenated. */ static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut, const char *pszScope, const char *pszPrefix, const char *pszBaseName) { #ifdef RT_STRICT /* * Do some quick sanity checks while we're here. */ # ifdef BLDPROG_STRTAB_WITH_COMPRESSION for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) { if (BldProgBitIsSet(pThis->bmUsedChars, i) ? pThis->aCompDict[i].cchString != 1 : pThis->aCompDict[i].cchString < 1) abort(); if (pThis->aCompDict[i].cchString > 1) BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]); } # endif #endif /* * Create a table for speeding up the character categorization. */ uint8_t abCharCat[256]; memset(abCharCat, 0, sizeof(abCharCat)); abCharCat[(unsigned char)'\\'] = 1; abCharCat[(unsigned char)'\''] = 1; for (unsigned i = 0; i < 0x20; i++) abCharCat[i] = 2; for (unsigned i = 0x7f; i < 0x100; i++) abCharCat[i] = 2; #ifdef BLDPROG_STRTAB_WITH_COMPRESSION for (unsigned i = 0; i < 0x100; i++) if (!BldProgBitIsSet(pThis->bmUsedChars, i)) /* Encode table references using '\xYY'. */ abCharCat[i] = 2; #endif /* * We follow the sorted string table, one string per line. */ fprintf(pOut, "#include \n" "\n" "static const char g_achStrTab%s[] =\n" "{\n", pszBaseName); uint32_t off = 0; for (uint32_t i = 0; i < pThis->cSortedStrings; i++) { PBLDPROGSTRING pCur = pThis->papSortedStrings[i]; uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString; if (offEnd > off) { /* Comment with an uncompressed and more readable version of the string. */ if (off == pCur->offStrTab) fprintf(pOut, "/* 0x%05x = \"", off); else fprintf(pOut, "/* 0X%05x = \"", off); BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut); fputs("\" */\n", pOut); /* Must use char by char here or we may trigger the max string length limit in the compiler, */ fputs(" ", pOut); for (; off < offEnd; off++) { unsigned char uch = pThis->pachStrTab[off]; fputc('\'', pOut); if (abCharCat[uch] == 0) fputc(uch, pOut); else if (abCharCat[uch] != 1) fprintf(pOut, "\\x%02x", (unsigned)uch); else { fputc('\\', pOut); fputc(uch, pOut); } fputc('\'', pOut); fputc(',', pOut); } fputc('\n', pOut); } } fprintf(pOut, "};\n" "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n", pszBaseName, (unsigned)pThis->cchStrTab); #ifdef BLDPROG_STRTAB_WITH_COMPRESSION /* * Write the compression dictionary. */ fprintf(pOut, "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n" "{\n", pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict)); for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++) if (pThis->aCompDict[i].cchString > 1) fprintf(pOut, " /*[%3u]=*/ { %#08x, %#04x }, // %6lu - %s\n", i, pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, (unsigned long)pThis->auCompDictFreq[i], pThis->aCompDict[i].pszString); # ifndef BLDPROG_STRTAB_PURE_ASCII else if (i == 0xff) fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // UTF-8 escape\n", i); # endif else if (i == 0) fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // unused, because zero terminator\n", i); else if (i < 0x20) fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // %02x\n", i, i); else fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // '%c'\n", i, (char)i); fprintf(pOut, "};\n\n"); #endif /* * Write the string table data structure. */ fprintf(pOut, "%sconst RTBLDPROGSTRTAB %s%s = \n" "{\n" " /*.pchStrTab = */ &g_achStrTab%s[0],\n" " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n" , pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName); #ifdef BLDPROG_STRTAB_WITH_COMPRESSION fprintf(pOut, " /*.cCompDict = */ %u,\n" " /*.paCompDict = */ &g_aCompDict%s[0]\n" "};\n" # ifndef BLDPROG_STRTAB_PURE_ASCII /* 255 or 256 entries is how the decoder knows */ , (unsigned)RT_ELEMENTS(pThis->aCompDict) - 1, # else , (unsigned)RT_ELEMENTS(pThis->aCompDict), # endif pszBaseName); #else fprintf(pOut, " /*.cCompDict = */ 0,\n" " /*.paCompDict = */ NULL\n" "};\n"); #endif } #if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6) # pragma GCC diagnostic pop #endif #endif /* __cplusplus && IN_RING3 */