1
0
Fork 0
virtualbox/include/iprt/bldprog-strtab-template.cpp.h
Daniel Baumann df1bda4fe9
Adding upstream version 7.0.20-dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
2025-06-22 09:56:04 +02:00

1335 lines
42 KiB
C++

/* $Id: bldprog-strtab-template.cpp.h $ */
/** @file
* IPRT - Build Program - String Table Generator.
*/
/*
* Copyright (C) 2006-2023 Oracle and/or its affiliates.
*
* This file is part of VirtualBox base platform packages, as
* available from https://www.virtualbox.org.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, in version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses>.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
* in the VirtualBox distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*
* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
*/
/*
* (Avoid include sanity checks as this is ring-3 only, C++ code.)
*/
#if defined(__cplusplus) && defined(IN_RING3)
/*********************************************************************************************************************************
* Defined Constants And Macros *
*********************************************************************************************************************************/
/** @def BLDPROG_STRTAB_MAX_STRLEN
* The max length of strings in the table. */
#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
# define BLDPROG_STRTAB_MAX_STRLEN 256
#endif
/** @def BLDPROG_STRTAB_WITH_COMPRESSION
* Enables very simple string compression.
*/
#if defined(DOXYGEN_RUNNING)
# define BLDPROG_STRTAB_WITH_COMPRESSION
#endif
/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
* Modifies the string compression to look for camel case words.
*/
#if defined(DOXYGEN_RUNNING)
# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
#endif
/** @def BLDPROG_STRTAB_PURE_ASCII
* String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
* is defined. Otherwise, the compression code will require IPRT to link.
*/
#if defined(DOXYGEN_RUNNING)
# define BLDPROG_STRTAB_PURE_ASCII
#endif
/*********************************************************************************************************************************
* Header Files *
*********************************************************************************************************************************/
#include <iprt/stdarg.h>
#include <iprt/ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
# include <map>
# include <iprt/sanitized/string>
# define BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE
typedef struct BLDPROGWORDFREQSTATS
{
uint32_t cWithoutSep; /**< Number of occurances without a separator. */
# ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE
uint32_t cWithSep; /**< Number of occurance with a separator. */
char chSep; /**< The separator. First come basis. */
# endif
} BLDPROGWORDFREQSTATS;
typedef std::map<std::string, BLDPROGWORDFREQSTATS> BLDPROGWORDFREQMAP;
#endif
#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
/*********************************************************************************************************************************
* Structures and Typedefs *
*********************************************************************************************************************************/
/**
* Build table string.
*/
typedef struct BLDPROGSTRING
{
/** The string.
* @note This may be modified or replaced (allocated from heap) when
* compressing the string table. */
char *pszString;
/** The string hash value. */
uint32_t uHash;
/** The strint table offset. */
uint32_t offStrTab;
/** The string length. */
size_t cchString;
/** Pointer to the next string reference (same string table entry). */
struct BLDPROGSTRING *pNextRef;
/** Pointer to the next string with the same hash value (collision). */
struct BLDPROGSTRING *pNextCollision;
} BLDPROGSTRING;
/** Pointer to a string table string. */
typedef BLDPROGSTRING *PBLDPROGSTRING;
/** String table data. */
typedef struct BLDPROGSTRTAB
{
/** The size of g_papStrHash. */
size_t cStrHash;
/** String hash table. */
PBLDPROGSTRING *papStrHash;
/** Duplicate strings found by AddString. */
size_t cDuplicateStrings;
/** Total length of the unique strings (no terminators). */
size_t cchUniqueStrings;
/** Number of unique strings after AddString. */
size_t cUniqueStrings;
/** Number of collisions. */
size_t cCollisions;
/** Number of entries in apSortedStrings. */
size_t cSortedStrings;
/** The sorted string table. */
PBLDPROGSTRING *papSortedStrings;
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
/** The 256 words we've picked to be indexed by reference. */
BLDPROGSTRING aCompDict[256];
/** The frequency of the 256 dictionary entries. */
size_t auCompDictFreq[256];
/** Incoming strings pending compression. */
PBLDPROGSTRING *papPendingStrings;
/** Current number of entries in papStrPending. */
size_t cPendingStrings;
/** The allocated size of papPendingStrings. */
size_t cMaxPendingStrings;
/** Work frequency map.
* @todo rewrite in plain C. */
BLDPROGWORDFREQMAP Frequencies;
/** Map of characters used by input strings. */
uint64_t bmUsedChars[256/64];
#endif
/** The string table. */
char *pachStrTab;
/** The actual string table size. */
size_t cchStrTab;
} BLDPROGSTRTAB;
typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-function"
#endif
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
/**
* Same as ASMBitTest.
*
* We cannot safely use ASMBitTest here because it must be inline, as this code
* is used to build RuntimeBldProg. */
DECLINLINE(bool) BldProgBitIsSet(uint64_t const *pbmBitmap, size_t iBit)
{
return RT_BOOL(pbmBitmap[iBit / 64] & RT_BIT_64(iBit % 64));
}
/**
* Same as ASMBitSet.
*
* We cannot safely use ASMBitSet here because it must be inline, as this code
* is used to build RuntimeBldProg.
*/
DECLINLINE(void) BldProgBitSet(uint64_t *pbmBitmap, size_t iBit)
{
pbmBitmap[iBit / 64] |= RT_BIT_64(iBit % 64);
}
#endif
/**
* Initializes the strint table compiler.
*
* @returns success indicator (out of memory if false).
* @param pThis The strint table compiler instance.
* @param cMaxStrings The max number of strings we'll be adding.
*/
static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
{
pThis->cStrHash = 0;
pThis->papStrHash = NULL;
pThis->cDuplicateStrings = 0;
pThis->cchUniqueStrings = 0;
pThis->cUniqueStrings = 0;
pThis->cCollisions = 0;
pThis->cSortedStrings = 0;
pThis->papSortedStrings = NULL;
pThis->pachStrTab = NULL;
pThis->cchStrTab = 0;
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
pThis->papPendingStrings = NULL;
pThis->cPendingStrings = 0;
pThis->cMaxPendingStrings = cMaxStrings;
memset(pThis->bmUsedChars, 0, sizeof(pThis->bmUsedChars));
BldProgBitSet(pThis->bmUsedChars, 0); /* Some parts of the code still thinks zero is a terminator, so don't use it for now. */
# ifndef BLDPROG_STRTAB_PURE_ASCII
BldProgBitSet(pThis->bmUsedChars, 0xff); /* Reserve escape byte for codepoints above 127. */
# endif
#endif
/*
* Allocate a hash table double the size of all strings (to avoid too
* many collisions). Add all strings to it, finding duplicates in the
* process.
*/
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
#endif
cMaxStrings *= 2;
pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
if (pThis->papStrHash)
{
pThis->cStrHash = cMaxStrings;
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
if (pThis->papPendingStrings)
#endif
return true;
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
free(pThis->papStrHash);
pThis->papStrHash = NULL;
#endif
}
return false;
}
#if 0 /* unused */
static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
{
free(pThis->papStrHash);
free(pThis->papSortedStrings);
free(pThis->pachStrTab);
# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
free(pThis->papPendingStrings);
# endif
memset(pThis, 0, sizeof(*pThis));
}
#endif
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
{
/*
* Skip leading word separators.
*/
# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
while ( ch == ' '
|| ch == '-'
|| ch == '+'
|| ch == '_')
# else
while (ch == ' ')
# endif
ch = *++pszSrc;
if (ch)
{
/*
* Find end of word.
*/
size_t cchWord = 1;
# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
char chPrev = ch;
while ( (ch = pszSrc[cchWord]) != ' '
&& ch != '\0'
&& ch != '-'
&& ch != '+'
&& ch != '_'
&& ( ch == chPrev
|| !RT_C_IS_UPPER(ch)
|| RT_C_IS_UPPER(chPrev)) )
{
chPrev = ch;
cchWord++;
}
# else
while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
cchWord++;
# endif
*ppszSrc = pszSrc;
return cchWord;
}
*ppszSrc = pszSrc;
return 0;
}
/**
* Analyzes a string.
*
* @param pThis The strint table compiler instance.
* @param pStr The string to analyze.
*/
static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
/*
* Mark all the string characters as used.
*/
const char *psz = pStr->pszString;
char ch;
while ((ch = *psz++) != '\0')
BldProgBitSet(pThis->bmUsedChars, (uint8_t)ch);
/*
* For now we just consider words.
*/
psz = pStr->pszString;
while ((ch = *psz) != '\0')
{
size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
if (cchWord > 1)
{
std::string strWord(psz, cchWord);
BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
if (it != pThis->Frequencies.end())
{
# ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE
char const chSep = psz[cchWord];
if (chSep != '\0' && (it->second.chSep == chSep || it->second.chSep == '\0'))
{
it->second.chSep = chSep;
it->second.cWithSep++;
}
else
# endif
it->second.cWithoutSep++;
}
else
{
# ifdef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE
char const chSep = psz[cchWord];
if (chSep != '\0')
{
BLDPROGWORDFREQSTATS const NewWord = { 0, 0, chSep };
pThis->Frequencies[strWord] = NewWord;
}
else
{
static BLDPROGWORDFREQSTATS const s_NewWord = { 0, 0, 0 };
pThis->Frequencies[strWord] = s_NewWord;
}
# else
pThis->Frequencies[strWord].cWithoutSep = 0;
# endif
}
# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
/* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
if (ch == ' ')
{
ch = psz[++cchWord];
if (ch != ' ' && ch != '\0')
{
size_t const cchSaved = cchWord;
do
cchWord++;
while ((ch = psz[cchWord]) != ' ' && ch != '\0');
strWord = std::string(psz, cchWord);
BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
if (it != pThis->Frequencies.end())
it->second += cchWord - 1;
else
pThis->Frequencies[strWord] = 0;
cchWord = cchSaved;
}
}
# endif
}
else if (!cchWord)
break;
/* Advance. */
psz += cchWord;
}
pStr->cchString = psz - pStr->pszString;
if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
{
fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
abort();
}
}
#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
/**
* Adds a string to the hash table.
* @param pThis The strint table compiler instance.
* @param pStr The string.
*/
static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
pStr->pNextRef = NULL;
pStr->pNextCollision = NULL;
pStr->offStrTab = 0;
pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
{
fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
exit(RTEXITCODE_FAILURE);
}
size_t idxHash = pStr->uHash % pThis->cStrHash;
PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
if (!pCur)
pThis->papStrHash[idxHash] = pStr;
else
{
/* Look for matching string. */
do
{
if ( pCur->uHash == pStr->uHash
&& pCur->cchString == pStr->cchString
&& memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
{
pStr->pNextRef = pCur->pNextRef;
pCur->pNextRef = pStr;
pThis->cDuplicateStrings++;
return;
}
pCur = pCur->pNextCollision;
} while (pCur != NULL);
/* No matching string, insert. */
pThis->cCollisions++;
pStr->pNextCollision = pThis->papStrHash[idxHash];
pThis->papStrHash[idxHash] = pStr;
}
pThis->cUniqueStrings++;
pThis->cchUniqueStrings += pStr->cchString;
}
/**
* Adds a string to the string table.
*
* @param pThis The strint table compiler instance.
* @param pStr The string.
*/
static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
bldProgStrTab_compressorAnalyzeString(pThis, pStr);
if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
else
abort();
#else
bldProgStrTab_AddStringToHashTab(pThis, pStr);
#endif
}
/**
* Adds a string to the string table.
*
* @param pThis The strint table compiler instance.
* @param pStr The string entry (uninitialized).
* @param psz The string, will be duplicated if compression is enabled.
*/
DECLINLINE(void) BldProgStrTab_AddStringDup(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr, const char *psz)
{
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
pStr->pszString = strdup(psz);
if (!pStr->pszString)
abort();
#else
pStr->pszString = (char *)psz;
#endif
BldProgStrTab_AddString(pThis, pStr);
}
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
/**
* Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
* sequences.
*
* @returns New @a pszDst position, NULL if invalid source encoding.
* @param pszDst The destination buffer.
* @param pszSrc The source buffer.
* @param cchSrc How much to copy.
*/
static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
{
while (cchSrc-- > 0)
{
char ch = *pszSrc;
if (!((unsigned char)ch & 0x80))
{
*pszDst++ = ch;
pszSrc++;
}
else
{
# ifdef BLDPROG_STRTAB_PURE_ASCII
fprintf(stderr, "error: unexpected char value %#x\n", ch);
return NULL;
# else
RTUNICP uc;
int rc = RTStrGetCpEx(&pszSrc, &uc);
if (RT_SUCCESS(rc))
{
*pszDst++ = (unsigned char)0xff; /* escape single code point. */
pszDst = RTStrPutCp(pszDst, uc);
}
else
{
fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
return NULL;
}
# endif
}
}
return pszDst;
}
/**
* Replaces the dictionary words and escapes non-ascii chars in a string.
*
* @param pThis The strint table compiler instance.
* @param pString The string to fixup.
*/
static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
char *pszDst = szNew;
const char *pszSrc = pStr->pszString;
const char *pszSrcEnd = pszSrc + pStr->cchString;
char ch;
while ((ch = *pszSrc) != '\0')
{
const char * const pszSrcUncompressed = pszSrc;
size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
if (cchSrcUncompressed > 0)
{
pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
if (!pszDst)
return false;
}
if (!cchWord)
break;
/* Check for g_aWord matches. */
if (cchWord > 1)
{
size_t cchMax = pszSrcEnd - pszSrc;
for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
{
size_t cchLen = pThis->aCompDict[i].cchString;
if ( cchLen >= cchWord
&& cchLen <= cchMax
&& memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
{
*pszDst++ = (unsigned char)i;
pszSrc += cchLen;
cchWord = 0;
break;
}
}
}
if (cchWord > 0)
{
/* Copy the current word. */
pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
if (!pszDst)
return false;
pszSrc += cchWord;
}
}
/* Just terminate it now. */
*pszDst = '\0';
/*
* Update the string.
*/
size_t cchNew = pszDst - &szNew[0];
if (cchNew > pStr->cchString)
{
pStr->pszString = (char *)malloc(cchNew + 1);
if (!pStr->pszString)
{
fprintf(stderr, "Out of memory!\n");
return false;
}
}
memcpy(pStr->pszString, szNew, cchNew + 1);
pStr->cchString = cchNew;
return true;
}
/**
* Entry in SortedDictionary.
*
* Uses variable length string member, so not class and allocated via malloc.
*/
struct SortedDictionaryEntry
{
size_t m_cchGain;
size_t m_cchString;
RT_FLEXIBLE_ARRAY_EXTENSION
char m_szString[RT_FLEXIBLE_ARRAY];
/** Allocates and initializes a new entry. */
static SortedDictionaryEntry *allocate(const char *a_pch, size_t a_cch, size_t a_cchGain, char a_chSep)
{
size_t cbString = a_cch + !!a_chSep + 1;
SortedDictionaryEntry *pNew = (SortedDictionaryEntry *)malloc(RT_UOFFSETOF(SortedDictionaryEntry, m_szString) + cbString);
if (pNew)
{
pNew->m_cchGain = a_cchGain;
memcpy(pNew->m_szString, a_pch, a_cch);
if (a_chSep)
pNew->m_szString[a_cch++] = a_chSep;
pNew->m_szString[a_cch] = '\0';
pNew->m_cchString = a_cch;
}
return pNew;
}
/** Compares this dictionary entry with an incoming one.
* @retval -1 if this entry is of less worth than the new one.
* @retval 0 if this entry is of equal worth to the new one.
* @retval +1 if this entry is of more worth than the new one.
*/
int compare(size_t a_cchGain, size_t a_cchString)
{
/* Higher gain is preferred of course: */
if (m_cchGain < a_cchGain)
return -1;
if (m_cchGain > a_cchGain)
return 1;
/* Gain is the same. Prefer the shorter string, as it will result in a shorter string table: */
if (m_cchString > a_cchString)
return -1;
if (m_cchString < a_cchString)
return 1;
return 0;
}
};
/**
* Insertion sort dictionary that keeps the 256 best words.
*
* Used by bldProgStrTab_compressorDoStringCompression to pick the dictionary
* words.
*/
class SortedDictionary
{
public:
size_t m_cEntries;
SortedDictionaryEntry *m_apEntries[256];
SortedDictionary()
: m_cEntries(0)
{
for (size_t i = 0; i < RT_ELEMENTS(m_apEntries); i++)
m_apEntries[i] = NULL;
}
~SortedDictionary()
{
while (m_cEntries > 0)
{
free(m_apEntries[--m_cEntries]);
m_apEntries[m_cEntries] = NULL;
}
}
/**
* Inserts a new entry, if it's worth it.
* @returns true on succes, false if out of memory.
*/
bool insert(const char *a_pchString, size_t a_cchStringBase, size_t a_cchGain, char a_chSep = 0)
{
size_t const cchString = a_cchStringBase + (a_chSep + 1);
/*
* Drop the insert if the symbol table is full and the insert is less worth the last entry:
*/
if ( m_cEntries >= RT_ELEMENTS(m_apEntries)
&& m_apEntries[RT_ELEMENTS(m_apEntries) - 1]->compare(a_cchGain, cchString) >= 0)
return true;
/*
* Create a new entry to insert.
*/
SortedDictionaryEntry *pNewEntry = SortedDictionaryEntry::allocate(a_pchString, a_cchStringBase, a_cchGain, a_chSep);
if (!pNewEntry)
return false;
/*
* Find the insert point.
*/
if (m_cEntries == 0)
{
m_apEntries[0] = pNewEntry;
m_cEntries = 1;
}
else
{
/* If table is full, drop the last entry before we start (already made
sure the incoming entry is preferable to the one were dropping): */
if (m_cEntries >= RT_ELEMENTS(m_apEntries))
{
free(m_apEntries[RT_ELEMENTS(m_apEntries) - 1]);
m_apEntries[RT_ELEMENTS(m_apEntries) - 1] = NULL;
m_cEntries = RT_ELEMENTS(m_apEntries) - 1;
}
/* Find where to insert the new entry: */
/** @todo use binary search. */
size_t i = m_cEntries;
while (i > 0 && m_apEntries[i - 1]->compare(a_cchGain, cchString) < 0)
i--;
/* Shift entries to make room and insert the new entry. */
if (i < m_cEntries)
memmove(&m_apEntries[i + 1], &m_apEntries[i], (m_cEntries - i) * sizeof(m_apEntries[0]));
m_apEntries[i] = pNewEntry;
m_cEntries++;
}
return true;
}
};
/**
* Compresses the vendor and product strings.
*
* This is very very simple (a lot less work than the string table for
* instance).
*/
static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
{
/*
* Sort the frequency analyzis result and pick the top entries for any
* available dictionary slots.
*/
SortedDictionary SortedDict;
for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
{
bool fInsert;
size_t const cchString = it->first.length();
# ifndef BLDPROG_STRTAB_WITH_WORD_SEP_ALTERNATIVE
size_t const cchGainWithout = it->second.cWithoutSep * cchString;
# else
size_t const cchGainWithout = (it->second.cWithoutSep + it->second.cWithSep) * cchString;
size_t const cchGainWith = it->second.cWithSep * (cchString + 1);
if (cchGainWith > cchGainWithout)
fInsert = SortedDict.insert(it->first.c_str(), cchString, cchGainWith, it->second.chSep);
else
# endif
fInsert = SortedDict.insert(it->first.c_str(), cchString, cchGainWithout);
if (!fInsert)
return false;
}
size_t cb = 0;
size_t cWords = 0;
size_t iDict = 0;
for (size_t i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
{
char szTmp[2] = { (char)i, '\0' };
const char *psz = szTmp;
if ( BldProgBitIsSet(pThis->bmUsedChars, i)
|| iDict >= SortedDict.m_cEntries)
{
/* character entry */
pThis->auCompDictFreq[i] = 0;
pThis->aCompDict[i].cchString = 1;
}
else
{
/* word entry */
cb += SortedDict.m_apEntries[iDict]->m_cchGain;
pThis->auCompDictFreq[i] = SortedDict.m_apEntries[iDict]->m_cchGain;
pThis->aCompDict[i].cchString = SortedDict.m_apEntries[iDict]->m_cchString;
psz = SortedDict.m_apEntries[iDict]->m_szString;
cWords++;
iDict++;
}
pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
if (pThis->aCompDict[i].pszString)
memcpy(pThis->aCompDict[i].pszString, psz, pThis->aCompDict[i].cchString + 1);
else
return false;
}
if (fVerbose)
printf("debug: Estimated string compression saving: %u bytes\n"
"debug: %u words, %u characters\n"
, (unsigned)cb, (unsigned)cWords, (unsigned)(RT_ELEMENTS(pThis->aCompDict) - cWords));
/*
* Rework the strings.
*/
size_t cchOld = 0;
size_t cchOldMax = 0;
size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
size_t cchNew = 0;
size_t cchNewMax = 0;
size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
size_t i = pThis->cPendingStrings;
while (i-- > 0)
{
PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
cchOld += pCurStr->cchString;
if (pCurStr->cchString > cchOldMax)
cchOldMax = pCurStr->cchString;
if (pCurStr->cchString < cchOldMin)
cchOldMin = pCurStr->cchString;
if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
return false;
cchNew += pCurStr->cchString;
if (pCurStr->cchString > cchNewMax)
cchNewMax = pCurStr->cchString;
if (pCurStr->cchString < cchNewMin)
cchNewMin = pCurStr->cchString;
bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
}
/*
* Do debug stats.
*/
if (fVerbose)
{
for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
cchNew += pThis->aCompDict[i].cchString + 1;
printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
if (cchNew < cchOld)
printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
else
printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
printf("debug: Original string lengths: average %u; min %u; max %u\n",
(unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
(unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
}
return true;
}
#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
/**
* Inserts a string into g_apUniqueStrings.
* @param pThis The strint table compiler instance.
* @param pStr The string.
*/
static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
size_t iIdx;
size_t iEnd = pThis->cSortedStrings;
if (iEnd)
{
size_t iStart = 0;
for (;;)
{
iIdx = iStart + (iEnd - iStart) / 2;
if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
{
if (iIdx <= iStart)
break;
iEnd = iIdx;
}
else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
{
if (++iIdx >= iEnd)
break;
iStart = iIdx;
}
else
break;
}
if (iIdx != pThis->cSortedStrings)
memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
(pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
}
else
iIdx = 0;
pThis->papSortedStrings[iIdx] = pStr;
pThis->cSortedStrings++;
}
/**
* Compiles the string table after the string has been added.
*
* This will save space by dropping string terminators, eliminating duplicates
* and try find strings that are sub-strings of others.
*
* Will initialize the StrRef of all StrTabString instances.
*
* @returns success indicator (error flagged).
* @param pThis The strint table compiler instance.
*/
static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
{
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
/*
* Do the compression and add all the compressed strings to the table.
*/
if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
return false;
/*
* Add the dictionary strings.
*/
for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
if (pThis->aCompDict[i].cchString > 1)
bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
# ifdef RT_STRICT
else if (pThis->aCompDict[i].cchString != 1)
abort();
# endif
#endif
if (fVerbose)
printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
(unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
(unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
/*
* Create papSortedStrings from the hash table. The table is sorted by
* string length, with the longer strings first, so that we increase our
* chances of locating duplicate substrings.
*/
pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
if (!pThis->papSortedStrings)
return false;
pThis->cSortedStrings = 0;
size_t idxHash = pThis->cStrHash;
while (idxHash-- > 0)
{
PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
if (pCur)
{
do
{
bldProgStrTab_InsertUniqueString(pThis, pCur);
pCur = pCur->pNextCollision;
} while (pCur);
}
}
/*
* Create the actual string table.
*/
pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
if (!pThis->pachStrTab)
return false;
pThis->cchStrTab = 0;
for (size_t i = 0; i < pThis->cSortedStrings; i++)
{
PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
const char * const pszCur = pCur->pszString;
size_t const cchCur = pCur->cchString;
size_t offStrTab = pThis->cchStrTab;
/*
* See if the string is a substring already found in the string table.
* Excluding the zero terminator increases the chances for this.
*/
size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
const char *pchLeft = pThis->pachStrTab;
char const chFirst = *pszCur;
while (cchLeft > 0)
{
const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
if (!pchCandidate)
break;
if (memcmp(pchCandidate, pszCur, cchCur) == 0)
{
offStrTab = pchCandidate - pThis->pachStrTab;
break;
}
cchLeft -= pchCandidate + 1 - pchLeft;
pchLeft = pchCandidate + 1;
}
if (offStrTab == pThis->cchStrTab)
{
/*
* See if the start of the string overlaps the end of the string table.
*/
if (pThis->cchStrTab && cchCur > 1)
{
cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
while (cchLeft > 0)
{
const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
if (!pchCandidate)
break;
cchLeft -= pchCandidate - pchLeft;
pchLeft = pchCandidate;
if (memcmp(pchLeft, pszCur, cchLeft) == 0)
{
size_t cchToCopy = cchCur - cchLeft;
memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
pThis->cchStrTab += cchToCopy;
offStrTab = pchCandidate - pThis->pachStrTab;
break;
}
cchLeft--;
pchLeft++;
}
}
/*
* If we didn't have any luck above, just append the string.
*/
if (offStrTab == pThis->cchStrTab)
{
memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
pThis->cchStrTab += cchCur;
}
}
/*
* Set the string table offset for all the references to this string.
*/
do
{
pCur->offStrTab = (uint32_t)offStrTab;
pCur = pCur->pNextRef;
} while (pCur != NULL);
}
if (fVerbose)
printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
return true;
}
#ifdef RT_STRICT
/**
* Sanity checks a string table string.
* @param pThis The strint table compiler instance.
* @param pStr The string to check.
*/
static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
{
if (pStr->cchString != strlen(pStr->pszString))
abort();
if (pStr->offStrTab >= pThis->cchStrTab)
abort();
if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
abort();
if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
abort();
}
#endif
/**
* Output the string table string in C string litteral fashion.
*
* @param pThis The strint table instance.
* @param pString The string to print.
* @param pOut The output stream.
*/
static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
{
const unsigned char *psz = (const unsigned char *)pString->pszString;
unsigned char uch;
while ((uch = *psz++) != '\0')
{
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
if (pThis->aCompDict[uch].cchString == 1)
#else
if (!(uch & 0x80))
#endif
{
if (uch != '\'' && uch != '\\')
fputc((char)uch, pOut);
else
{
fputc('\\', pOut);
fputc((char)uch, pOut);
}
}
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
# ifndef BLDPROG_STRTAB_PURE_ASCII
else if (uch == 0xff)
{
RTUNICP uc = RTStrGetCp((const char *)psz);
psz += RTStrCpSize(uc);
fprintf(pOut, "\\u%04x", uc);
}
# else
else
fputs(pThis->aCompDict[uch].pszString, pOut);
# endif
#else
else
fprintf(pOut, "\\x%02x", (unsigned)uch);
NOREF(pThis);
#endif
}
}
/**
* Writes the string table code to the output stream.
*
* @param pThis The strint table compiler instance.
* @param pOut The output stream.
* @param pszScope The scoping ("static " or empty string),
* including trailing space.
* @param pszPrefix The variable prefix. Typically "g_" for C programs,
* whereas C++ classes normally use "class::s_g".
* @param pszBaseName The base name for the variables. The user
* accessible variable will end up as just the
* prefix and basename concatenated.
*/
static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
const char *pszScope, const char *pszPrefix, const char *pszBaseName)
{
#ifdef RT_STRICT
/*
* Do some quick sanity checks while we're here.
*/
# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
{
if (BldProgBitIsSet(pThis->bmUsedChars, i)
? pThis->aCompDict[i].cchString != 1 : pThis->aCompDict[i].cchString < 1)
abort();
if (pThis->aCompDict[i].cchString > 1)
BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
}
# endif
#endif
/*
* Create a table for speeding up the character categorization.
*/
uint8_t abCharCat[256];
memset(abCharCat, 0, sizeof(abCharCat));
abCharCat[(unsigned char)'\\'] = 1;
abCharCat[(unsigned char)'\''] = 1;
for (unsigned i = 0; i < 0x20; i++)
abCharCat[i] = 2;
for (unsigned i = 0x7f; i < 0x100; i++)
abCharCat[i] = 2;
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
for (unsigned i = 0; i < 0x100; i++)
if (!BldProgBitIsSet(pThis->bmUsedChars, i)) /* Encode table references using '\xYY'. */
abCharCat[i] = 2;
#endif
/*
* We follow the sorted string table, one string per line.
*/
fprintf(pOut,
"#include <iprt/bldprog-strtab.h>\n"
"\n"
"static const char g_achStrTab%s[] =\n"
"{\n",
pszBaseName);
uint32_t off = 0;
for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
{
PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
if (offEnd > off)
{
/* Comment with an uncompressed and more readable version of the string. */
if (off == pCur->offStrTab)
fprintf(pOut, "/* 0x%05x = \"", off);
else
fprintf(pOut, "/* 0X%05x = \"", off);
BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
fputs("\" */\n", pOut);
/* Must use char by char here or we may trigger the max string
length limit in the compiler, */
fputs(" ", pOut);
for (; off < offEnd; off++)
{
unsigned char uch = pThis->pachStrTab[off];
fputc('\'', pOut);
if (abCharCat[uch] == 0)
fputc(uch, pOut);
else if (abCharCat[uch] != 1)
fprintf(pOut, "\\x%02x", (unsigned)uch);
else
{
fputc('\\', pOut);
fputc(uch, pOut);
}
fputc('\'', pOut);
fputc(',', pOut);
}
fputc('\n', pOut);
}
}
fprintf(pOut,
"};\n"
"AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
pszBaseName, (unsigned)pThis->cchStrTab);
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
/*
* Write the compression dictionary.
*/
fprintf(pOut,
"static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
"{\n",
pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
if (pThis->aCompDict[i].cchString > 1)
fprintf(pOut, " /*[%3u]=*/ { %#08x, %#04x }, // %6lu - %s\n", i,
pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString,
(unsigned long)pThis->auCompDictFreq[i], pThis->aCompDict[i].pszString);
# ifndef BLDPROG_STRTAB_PURE_ASCII
else if (i == 0xff)
fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // UTF-8 escape\n", i);
# endif
else if (i == 0)
fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // unused, because zero terminator\n", i);
else if (i < 0x20)
fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // %02x\n", i, i);
else
fprintf(pOut, " /*[%3u]=*/ { 0x000000, 0x00 }, // '%c'\n", i, (char)i);
fprintf(pOut, "};\n\n");
#endif
/*
* Write the string table data structure.
*/
fprintf(pOut,
"%sconst RTBLDPROGSTRTAB %s%s = \n"
"{\n"
" /*.pchStrTab = */ &g_achStrTab%s[0],\n"
" /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
,
pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
fprintf(pOut,
" /*.cCompDict = */ %u,\n"
" /*.paCompDict = */ &g_aCompDict%s[0]\n"
"};\n"
# ifndef BLDPROG_STRTAB_PURE_ASCII /* 255 or 256 entries is how the decoder knows */
, (unsigned)RT_ELEMENTS(pThis->aCompDict) - 1,
# else
, (unsigned)RT_ELEMENTS(pThis->aCompDict),
# endif
pszBaseName);
#else
fprintf(pOut,
" /*.cCompDict = */ 0,\n"
" /*.paCompDict = */ NULL\n"
"};\n");
#endif
}
#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
# pragma GCC diagnostic pop
#endif
#endif /* __cplusplus && IN_RING3 */