diff --git a/extensions/spellcheck/hunspell/src/hashmgr.cxx b/extensions/spellcheck/hunspell/src/hashmgr.cxx --- a/extensions/spellcheck/hunspell/src/hashmgr.cxx +++ b/extensions/spellcheck/hunspell/src/hashmgr.cxx @@ -63,16 +63,17 @@ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +#include #include #include #include #include #include #include #include "hashmgr.hxx" @@ -118,52 +119,54 @@ HashMgr::~HashMgr() { // go through column by column of the table for (int i = 0; i < tablesize; i++) { struct hentry* pt = tableptr[i]; struct hentry* nt = NULL; while (pt) { nt = pt->next; if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) - free(pt->astr); - free(pt); + arena_free(pt->astr); + arena_free(pt); pt = nt; } } free(tableptr); } tablesize = 0; if (aliasf) { for (int j = 0; j < (numaliasf); j++) - free(aliasf[j]); - free(aliasf); + arena_free(aliasf[j]); + arena_free(aliasf); aliasf = NULL; if (aliasflen) { - free(aliasflen); + arena_free(aliasflen); aliasflen = NULL; } } if (aliasm) { for (int j = 0; j < (numaliasm); j++) - free(aliasm[j]); - free(aliasm); + arena_free(aliasm[j]); + arena_free(aliasm); aliasm = NULL; } #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT if (utf8) free_utf_tbl(); #endif #endif #ifdef MOZILLA_CLIENT delete[] csconv; #endif + + assert(outstanding_arena_allocations == 0); } // lookup a root word in the hashtable struct hentry* HashMgr::lookup(const char* word) const { struct hentry* dp; if (tableptr) { dp = tableptr[hash(word)]; @@ -222,17 +225,17 @@ int HashMgr::add_word(const std::string& word = word_copy; } bool upcasehomonym = false; int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0; // variable-length hash record with word and optional fields struct hentry* hp = - (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl); + (struct hentry*)arena_alloc(sizeof(struct hentry) + word->size() + descl); if (!hp) { delete desc_copy; delete word_copy; return 1; } char* hpw = hp->word; strcpy(hpw, word->c_str()); @@ -366,57 +369,57 @@ int HashMgr::add_word(const std::string& delete word_copy; return 0; } while (dp->next != NULL) { if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { // remove hidden onlyupcase homonym if (!onlyupcase) { if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { - free(dp->astr); + arena_free(dp->astr); dp->astr = hp->astr; dp->alen = hp->alen; - free(hp); + arena_free(hp); delete desc_copy; delete word_copy; return 0; } else { dp->next_homonym = hp; } } else { upcasehomonym = true; } } dp = dp->next; } if (strcmp(hp->word, dp->word) == 0) { // remove hidden onlyupcase homonym if (!onlyupcase) { if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { - free(dp->astr); + arena_free(dp->astr); dp->astr = hp->astr; dp->alen = hp->alen; - free(hp); + arena_free(hp); delete desc_copy; delete word_copy; return 0; } else { dp->next_homonym = hp; } } else { upcasehomonym = true; } } if (!upcasehomonym) { dp->next = hp; } else { // remove hidden onlyupcase homonym if (hp->astr) - free(hp->astr); - free(hp); + arena_free(hp->astr); + arena_free(hp); } delete desc_copy; delete word_copy; return 0; } int HashMgr::add_hidden_capitalized_word(const std::string& word, @@ -430,17 +433,17 @@ int HashMgr::add_hidden_capitalized_word // add inner capitalized forms to handle the following allcap forms: // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG // Allcaps with suffixes: CIA's -> CIA'S if (((captype == HUHCAP) || (captype == HUHINITCAP) || ((captype == ALLCAP) && (flagslen != 0))) && !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) { unsigned short* flags2 = - (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1)); + (unsigned short*)arena_alloc(sizeof(unsigned short) * (flagslen + 1)); if (!flags2) return 1; if (flagslen) memcpy(flags2, flags, flagslen * sizeof(unsigned short)); flags2[flagslen] = ONLYUPCASEFLAG; if (utf8) { std::string st; std::vector w; @@ -479,23 +482,23 @@ int HashMgr::get_clen_and_captype(const } // remove word (personal dictionary function for standalone applications) int HashMgr::remove(const std::string& word) { struct hentry* dp = lookup(word.c_str()); while (dp) { if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { unsigned short* flags = - (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen + 1)); + (unsigned short*)arena_alloc(sizeof(unsigned short) * (dp->alen + 1)); if (!flags) return 1; for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; flags[dp->alen] = forbiddenword; - free(dp->astr); + arena_free(dp->astr); dp->astr = flags; dp->alen++; std::sort(flags, flags + dp->alen); } dp = dp->next_homonym; } return 0; } @@ -533,17 +536,17 @@ int HashMgr::add_with_affix(const std::s remove_forbidden_flag(word); if (dp && dp->astr) { int captype; int wcl = get_clen_and_captype(word, &captype); if (aliasf) { add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype); } else { unsigned short* flags = - (unsigned short*)malloc(dp->alen * sizeof(unsigned short)); + (unsigned short*) arena_alloc(dp->alen * sizeof(unsigned short)); if (flags) { memcpy((void*)flags, (void*)dp->astr, dp->alen * sizeof(unsigned short)); add_word(word, wcl, flags, dp->alen, NULL, false, captype); } else return 1; } return add_hidden_capitalized_word(word, wcl, dp->astr, @@ -668,17 +671,17 @@ int HashMgr::load_tables(const char* tpa if (aliasf) { int index = atoi(ap.c_str()); al = get_aliasf(index, &flags, dict); if (!al) { HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); } } else { - al = decode_flags(&flags, ap.c_str(), dict); + al = decode_flags(&flags, ap.c_str(), dict, /* arena = */ true); if (al == -1) { HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); delete dict; return 6; } std::sort(flags, flags + al); } } else { @@ -709,47 +712,48 @@ int HashMgr::hash(const char* word) cons hv = (hv << 8) | (*word++); while (*word != 0) { ROTATE(hv, ROTATE_LEN); hv ^= (*word++); } return (unsigned long)hv % tablesize; } -int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const { +int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af, bool arena) const { + auto alloc = [arena, this](int n) { return arena ? this->arena_alloc(n) : malloc(n); }; int len; if (flags.empty()) { *result = NULL; return 0; } switch (flag_mode) { case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) len = flags.size(); if (len % 2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum()); len /= 2; - *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); if (!*result) return -1; for (int i = 0; i < len; i++) { (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) + (unsigned char)flags[i * 2 + 1]; } break; } case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 // 23 233) len = 1; unsigned short* dest; for (size_t i = 0; i < flags.size(); ++i) { if (flags[i] == ',') len++; } - *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); if (!*result) return -1; dest = *result; const char* src = flags.c_str(); for (const char* p = src; *p; p++) { if (*p == ',') { int i = atoi(src); if (i >= DEFAULTFLAGS) @@ -774,26 +778,26 @@ int HashMgr::decode_flags(unsigned short HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum()); break; } case FLAG_UNI: { // UTF-8 characters std::vector w; u8_u16(w, flags); len = w.size(); - *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); if (!*result) return -1; memcpy(*result, w.data(), len * sizeof(short)); break; } default: { // Ispell's one-character flags (erfg -> e r f g) unsigned short* dest; len = flags.size(); - *result = (unsigned short*)malloc(len * sizeof(unsigned short)); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); if (!*result) return -1; dest = *result; for (size_t i = 0; i < flags.size(); ++i) { *dest = (unsigned char)flags[i]; dest++; } } @@ -890,16 +894,18 @@ unsigned short HashMgr::decode_flag(cons default: s = *(unsigned char*)f; } if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); return s; } +// This function is only called by external consumers, and so using the default +// allocator with mystrdup is correct. char* HashMgr::encode_flag(unsigned short f) const { if (f == 0) return mystrdup("(NULL)"); std::string ch; if (flag_mode == FLAG_LONG) { ch.push_back((unsigned char)(f >> 8)); ch.push_back((unsigned char)(f - ((f >> 8) << 8))); } else if (flag_mode == FLAG_NUM) { @@ -1070,42 +1076,42 @@ bool HashMgr::parse_aliasf(const std::st numaliasf = 0; aliasf = NULL; aliasflen = NULL; HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); return false; } aliasf = - (unsigned short**)malloc(numaliasf * sizeof(unsigned short*)); + (unsigned short**)arena_alloc(numaliasf * sizeof(unsigned short*)); aliasflen = - (unsigned short*)malloc(numaliasf * sizeof(unsigned short)); + (unsigned short*)arena_alloc(numaliasf * sizeof(unsigned short)); if (!aliasf || !aliasflen) { numaliasf = 0; if (aliasf) - free(aliasf); + arena_free(aliasf); if (aliasflen) - free(aliasflen); + arena_free(aliasflen); aliasf = NULL; aliasflen = NULL; return false; } np++; break; } default: break; } ++i; start_piece = mystrsep(line, iter); } if (np != 2) { numaliasf = 0; - free(aliasf); - free(aliasflen); + arena_free(aliasf); + arena_free(aliasflen); aliasf = NULL; aliasflen = NULL; HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); return false; } /* now parse the numaliasf lines to read in the remainder of the table */ @@ -1126,33 +1132,33 @@ bool HashMgr::parse_aliasf(const std::st errored = true; break; } break; } case 1: { std::string piece(start_piece, iter); aliasflen[j] = - (unsigned short)decode_flags(&(aliasf[j]), piece, af); + (unsigned short)decode_flags(&(aliasf[j]), piece, af, /* arena = */ true); std::sort(aliasf[j], aliasf[j] + aliasflen[j]); break; } default: break; } ++i; start_piece = mystrsep(nl, iter); } } if (!aliasf[j]) { for (int k = 0; k < j; ++k) { - free(aliasf[k]); + arena_free(aliasf[k]); } - free(aliasf); - free(aliasflen); + arena_free(aliasf); + arena_free(aliasflen); aliasf = NULL; aliasflen = NULL; numaliasf = 0; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); return false; } } @@ -1193,33 +1199,33 @@ bool HashMgr::parse_aliasm(const std::st } case 1: { numaliasm = atoi(std::string(start_piece, iter).c_str()); if (numaliasm < 1) { HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); return false; } - aliasm = (char**)malloc(numaliasm * sizeof(char*)); + aliasm = (char**)arena_alloc(numaliasm * sizeof(char*)); if (!aliasm) { numaliasm = 0; return false; } np++; break; } default: break; } ++i; start_piece = mystrsep(line, iter); } if (np != 2) { numaliasm = 0; - free(aliasm); + arena_free(aliasm); aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); return false; } /* now parse the numaliasm lines to read in the remainder of the table */ for (int j = 0; j < numaliasm; j++) { @@ -1245,32 +1251,36 @@ bool HashMgr::parse_aliasm(const std::st std::string::const_iterator end = nl.end(); std::string chunk(start_piece, end); if (complexprefixes) { if (utf8) reverseword_utf(chunk); else reverseword(chunk); } - aliasm[j] = mystrdup(chunk.c_str()); + size_t sl = chunk.length() + 1; + aliasm[j] = (char*)arena_alloc(sl); + if (aliasm[j]) { + memcpy(aliasm[j], chunk.c_str(), sl); + } break; } default: break; } ++i; start_piece = mystrsep(nl, iter); } } if (!aliasm[j]) { numaliasm = 0; for (int k = 0; k < j; ++k) { - free(aliasm[k]); + arena_free(aliasm[k]); } - free(aliasm); + arena_free(aliasm); aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); return false; } } return true; } @@ -1379,8 +1389,27 @@ bool HashMgr::parse_reptable(const std:: } return true; } // return replacing table const std::vector& HashMgr::get_reptable() const { return reptable; } + +void* HashMgr::arena_alloc(int num_bytes) { + static const int MIN_CHUNK_SIZE = 4096; + if (arena.empty() || (current_chunk_size - current_chunk_offset < num_bytes)) { + current_chunk_size = std::max(MIN_CHUNK_SIZE, num_bytes); + arena.push_back(std::make_unique(current_chunk_size)); + current_chunk_offset = 0; + } + + uint8_t* ptr = &arena.back()[current_chunk_offset]; + current_chunk_offset += num_bytes; + outstanding_arena_allocations++; + return ptr; +} + +void HashMgr::arena_free(void* ptr) { + --outstanding_arena_allocations; + assert(outstanding_arena_allocations >= 0); +} diff --git a/extensions/spellcheck/hunspell/src/hashmgr.hxx b/extensions/spellcheck/hunspell/src/hashmgr.hxx --- a/extensions/spellcheck/hunspell/src/hashmgr.hxx +++ b/extensions/spellcheck/hunspell/src/hashmgr.hxx @@ -67,16 +67,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef HASHMGR_HXX_ #define HASHMGR_HXX_ #include +#include +#include #include #include #include "htypes.hxx" #include "filemgr.hxx" #include "w_char.hxx" enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; @@ -116,17 +118,23 @@ class HashMgr { struct hentry* lookup(const char*) const; int hash(const char*) const; struct hentry* walk_hashtable(int& col, struct hentry* hp) const; int add(const std::string& word); int add_with_affix(const std::string& word, const std::string& pattern); int remove(const std::string& word); - int decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const; +private: + // Only internal consumers are allowed to arena-allocate. + int decode_flags(unsigned short** result, const std::string& flags, FileMgr* af, bool arena) const; +public: + int decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const { + return decode_flags(result, flags, af, /* arena = */ false); + } bool decode_flags(std::vector& result, const std::string& flags, FileMgr* af) const; unsigned short decode_flag(const char* flag) const; char* encode_flag(unsigned short flag) const; int is_aliasf() const; int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const; int is_aliasm() const; char* get_aliasm(int index) const; const std::vector& get_reptable() const; @@ -148,11 +156,27 @@ class HashMgr { int wcl, unsigned short* flags, int al, const std::string* dp, int captype); bool parse_aliasm(const std::string& line, FileMgr* af); bool parse_reptable(const std::string& line, FileMgr* af); int remove_forbidden_flag(const std::string& word); + + // Our Mozilla fork uses a simple arena allocator for certain strings which + // persist for the lifetime of the HashMgr in order to avoid heap fragmentation. + // It's a simple bump-allocator, so we can't actually free() memory midway + // through the lifecycle, but we have a dummy free() implementation to ensure + // that our calls to arena_alloc() and arena_free() are balanced. + void* arena_alloc(int num_bytes); + void* arena_alloc(int num_bytes) const { + return const_cast(this)->arena_alloc(num_bytes); + } + void arena_free(void* ptr); + + std::vector> arena; + int current_chunk_size = 0; + int current_chunk_offset = 0; + int outstanding_arena_allocations = 0; }; #endif