diff options
Diffstat (limited to 'extensions/spellcheck/hunspell/src/hashmgr.cxx')
-rw-r--r-- | extensions/spellcheck/hunspell/src/hashmgr.cxx | 1415 |
1 files changed, 1415 insertions, 0 deletions
diff --git a/extensions/spellcheck/hunspell/src/hashmgr.cxx b/extensions/spellcheck/hunspell/src/hashmgr.cxx new file mode 100644 index 0000000000..d22f2e7b7d --- /dev/null +++ b/extensions/spellcheck/hunspell/src/hashmgr.cxx @@ -0,0 +1,1415 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * Copyright (C) 2002-2022 Németh László + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. + * + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +/* + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#include <limits> +#include <sstream> + +#include "hashmgr.hxx" +#include "csutil.hxx" +#include "atypes.hxx" +#include "langnum.hxx" + +// build a hash table from a munched word list + +HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) + : tablesize(0), + tableptr(NULL), + flag_mode(FLAG_CHAR), + complexprefixes(0), + utf8(0), + forbiddenword(FORBIDDENWORD) // forbidden word signing flag + , + numaliasf(0), + aliasf(NULL), + aliasflen(0), + numaliasm(0), + aliasm(NULL) { + langnum = 0; + csconv = 0; + load_config(apath, key); + int ec = load_tables(tpath, key); + if (ec) { + /* error condition - what should we do here */ + HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec); + free(tableptr); + //keep tablesize to 1 to fix possible division with zero + tablesize = 1; + tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*)); + if (!tableptr) { + tablesize = 0; + } + } +} + +HashMgr::~HashMgr() { + if (tableptr) { + // now pass through hash table freeing up everything + // go through column by column of the table + for (int i = 0; i < tablesize; i++) { + struct hentry* pt = tableptr[i]; + struct hentry* nt = NULL; + while (pt) { + nt = pt->next; + if (pt->astr && + (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) + arena_free(pt->astr); + arena_free(pt); + pt = nt; + } + } + free(tableptr); + } + tablesize = 0; + + if (aliasf) { + for (int j = 0; j < (numaliasf); j++) + arena_free(aliasf[j]); + arena_free(aliasf); + aliasf = NULL; + if (aliasflen) { + arena_free(aliasflen); + aliasflen = NULL; + } + } + if (aliasm) { + for (int j = 0; j < (numaliasm); j++) + arena_free(aliasm[j]); + arena_free(aliasm); + aliasm = NULL; + } + +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + if (utf8) + free_utf_tbl(); +#endif +#endif + +#ifdef MOZILLA_CLIENT + delete[] csconv; +#endif + + assert(outstanding_arena_allocations == 0); +} + +// lookup a root word in the hashtable + +struct hentry* HashMgr::lookup(const char* word) const { + struct hentry* dp; + if (tableptr) { + dp = tableptr[hash(word)]; + if (!dp) + return NULL; + for (; dp != NULL; dp = dp->next) { + if (strcmp(word, dp->word) == 0) + return dp; + } + } + return NULL; +} + +// add a word to the hash table (private) +int HashMgr::add_word(const std::string& in_word, + int wcl, + unsigned short* aff, + int al, + const std::string* in_desc, + bool onlyupcase, + int captype) { + const std::string* word = &in_word; + const std::string* desc = in_desc; + + std::string *word_copy = NULL; + std::string *desc_copy = NULL; + if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) { + word_copy = new std::string(in_word); + + if (!ignorechars.empty()) { + if (utf8) { + wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16); + } else { + remove_ignored_chars(*word_copy, ignorechars); + } + } + + if (complexprefixes) { + if (utf8) + wcl = reverseword_utf(*word_copy); + else + reverseword(*word_copy); + + if (in_desc && !aliasm) { + desc_copy = new std::string(*in_desc); + + if (complexprefixes) { + if (utf8) + reverseword_utf(*desc_copy); + else + reverseword(*desc_copy); + } + desc = desc_copy; + } + } + + word = word_copy; + } + + bool upcasehomonym = false; + int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0; + // variable-length hash record with word and optional fields + struct hentry* hp = + (struct hentry*)arena_alloc(sizeof(struct hentry) + word->size() + descl); + if (!hp) { + delete desc_copy; + delete word_copy; + return 1; + } + + char* hpw = hp->word; + strcpy(hpw, word->c_str()); + + int i = hash(hpw); + + hp->blen = (unsigned char)word->size(); + hp->clen = (unsigned char)wcl; + hp->alen = (short)al; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; + hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0; + + // store the description string or its pointer + if (desc) { + hp->var |= H_OPT; + if (aliasm) { + hp->var |= H_OPT_ALIASM; + store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str()))); + } else { + strcpy(hpw + word->size() + 1, desc->c_str()); + } + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) { + hp->var |= H_OPT_PHON; + // store ph: fields (pronounciation, misspellings, old orthography etc.) + // of a morphological description in reptable to use in REP replacements. + if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO)) + reptable.reserve(tablesize/MORPH_PHON_RATIO); + std::string fields = HENTRY_DATA(hp); + std::string::const_iterator iter = fields.begin(); + std::string::const_iterator start_piece = mystrsep(fields, iter); + while (start_piece != fields.end()) { + if (std::string(start_piece, iter).find(MORPH_PHON) == 0) { + std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1); + if (ph.size() > 0) { + std::vector<w_char> w; + size_t strippatt; + std::string wordpart; + // dictionary based REP replacement, separated by "->" + // for example "pretty ph:prity ph:priti->pretti" to handle + // both prity -> pretty and pritier -> prettiest suggestions. + if (((strippatt = ph.find("->")) != std::string::npos) && + (strippatt > 0) && (strippatt < ph.size() - 2)) { + wordpart = ph.substr(strippatt + 2); + ph.erase(ph.begin() + strippatt, ph.end()); + } else + wordpart = in_word; + // when the ph: field ends with the character *, + // strip last character of the pattern and the replacement + // to match in REP suggestions also at character changes, + // for example, "pretty ph:prity*" results "prit->prett" + // REP replacement instead of "prity->pretty", to get + // prity->pretty and pritiest->prettiest suggestions. + if (ph.at(ph.size()-1) == '*') { + strippatt = 1; + size_t stripword = 0; + if (utf8) { + while ((strippatt < ph.size()) && + ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80)) + ++strippatt; + while ((stripword < wordpart.size()) && + ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80)) + ++stripword; + } + ++strippatt; + ++stripword; + if ((ph.size() > strippatt) && (wordpart.size() > stripword)) { + ph.erase(ph.size()-strippatt, strippatt); + wordpart.erase(in_word.size()-stripword, stripword); + } + } + // capitalize lowercase pattern for capitalized words to support + // good suggestions also for capitalized misspellings, eg. + // Wednesday ph:wendsay + // results wendsay -> Wednesday and Wendsay -> Wednesday, too. + if (captype==INITCAP) { + std::string ph_capitalized; + if (utf8) { + u8_u16(w, ph); + if (get_captype_utf8(w, langnum) == NOCAP) { + mkinitcap_utf(w, langnum); + u16_u8(ph_capitalized, w); + } + } else if (get_captype(ph, csconv) == NOCAP) + mkinitcap(ph_capitalized, csconv); + + if (ph_capitalized.size() > 0) { + // add also lowercase word in the case of German or + // Hungarian to support lowercase suggestions lowercased by + // compound word generation or derivational suffixes + // (for example by adjectival suffix "-i" of geographical + // names in Hungarian: + // Massachusetts ph:messzecsuzec + // messzecsuzeci -> massachusettsi (adjective) + // For lowercasing by conditional PFX rules, see + // tests/germancompounding test example or the + // Hungarian dictionary.) + if (langnum == LANG_de || langnum == LANG_hu) { + std::string wordpart_lower(wordpart); + if (utf8) { + u8_u16(w, wordpart_lower); + mkallsmall_utf(w, langnum); + u16_u8(wordpart_lower, w); + } else { + mkallsmall(wordpart_lower, csconv); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart_lower); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph_capitalized); + reptable.back().outstrings[0].assign(wordpart); + } + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart); + } + } + start_piece = mystrsep(fields, iter); + } + } + } + + struct hentry* dp = tableptr[i]; + if (!dp) { + tableptr[i] = hp; + delete desc_copy; + delete word_copy; + return 0; + } + while (dp->next != NULL) { + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + arena_free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + arena_free(hp); + delete desc_copy; + delete word_copy; + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + dp = dp->next; + } + if (strcmp(hp->word, dp->word) == 0) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + arena_free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + arena_free(hp); + delete desc_copy; + delete word_copy; + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + if (!upcasehomonym) { + dp->next = hp; + } else { + // remove hidden onlyupcase homonym + if (hp->astr) + arena_free(hp->astr); + arena_free(hp); + } + + delete desc_copy; + delete word_copy; + return 0; +} + +int HashMgr::add_hidden_capitalized_word(const std::string& word, + int wcl, + unsigned short* flags, + int flagslen, + const std::string* dp, + int captype) { + if (flags == NULL) + flagslen = 0; + + // add inner capitalized forms to handle the following allcap forms: + // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG + // Allcaps with suffixes: CIA's -> CIA'S + if (((captype == HUHCAP) || (captype == HUHINITCAP) || + ((captype == ALLCAP) && (flagslen != 0))) && + !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) { + unsigned short* flags2 = + (unsigned short*)arena_alloc(sizeof(unsigned short) * (flagslen + 1)); + if (!flags2) + return 1; + if (flagslen) + memcpy(flags2, flags, flagslen * sizeof(unsigned short)); + flags2[flagslen] = ONLYUPCASEFLAG; + if (utf8) { + std::string st; + std::vector<w_char> w; + u8_u16(w, word); + mkallsmall_utf(w, langnum); + mkinitcap_utf(w, langnum); + u16_u8(st, w); + return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP); + } else { + std::string new_word(word); + mkallsmall(new_word, csconv); + mkinitcap(new_word, csconv); + int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP); + return ret; + } + } + return 0; +} + +// detect captype and modify word length for UTF-8 encoding +int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) { + int len; + if (utf8) { + len = u8_u16(workbuf, word); + *captype = get_captype_utf8(workbuf, langnum); + } else { + len = word.size(); + *captype = get_captype(word, csconv); + } + return len; +} + +int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { + std::vector<w_char> workbuf; + return get_clen_and_captype(word, captype, workbuf); +} + +// remove word (personal dictionary function for standalone applications) +int HashMgr::remove(const std::string& word) { + struct hentry* dp = lookup(word.c_str()); + while (dp) { + if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { + unsigned short* flags = + (unsigned short*)arena_alloc(sizeof(unsigned short) * (dp->alen + 1)); + if (!flags) + return 1; + for (int i = 0; i < dp->alen; i++) + flags[i] = dp->astr[i]; + flags[dp->alen] = forbiddenword; + arena_free(dp->astr); + dp->astr = flags; + dp->alen++; + std::sort(flags, flags + dp->alen); + } + dp = dp->next_homonym; + } + return 0; +} + +/* remove forbidden flag to add a personal word to the hash */ +int HashMgr::remove_forbidden_flag(const std::string& word) { + struct hentry* dp = lookup(word.c_str()); + if (!dp) + return 1; + while (dp) { + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) + dp->alen = 0; // XXX forbidden words of personal dic. + dp = dp->next_homonym; + } + return 0; +} + +// add a custom dic. word to the hash table (public) +int HashMgr::add(const std::string& word) { + if (remove_forbidden_flag(word)) { + int captype; + int al = 0; + unsigned short* flags = NULL; + int wcl = get_clen_and_captype(word, &captype); + add_word(word, wcl, flags, al, NULL, false, captype); + return add_hidden_capitalized_word(word, wcl, flags, al, NULL, + captype); + } + return 0; +} + +int HashMgr::add_with_affix(const std::string& word, const std::string& example) { + // detect captype and modify word length for UTF-8 encoding + struct hentry* dp = lookup(example.c_str()); + remove_forbidden_flag(word); + if (dp && dp->astr) { + int captype; + int wcl = get_clen_and_captype(word, &captype); + if (aliasf) { + add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype); + } else { + unsigned short* flags = + (unsigned short*) arena_alloc(dp->alen * sizeof(unsigned short)); + if (flags) { + memcpy((void*)flags, (void*)dp->astr, + dp->alen * sizeof(unsigned short)); + add_word(word, wcl, flags, dp->alen, NULL, false, captype); + } else + return 1; + } + return add_hidden_capitalized_word(word, wcl, dp->astr, + dp->alen, NULL, captype); + } + return 1; +} + +// walk the hash table entry by entry - null at end +// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); +struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { + if (hp && hp->next != NULL) + return hp->next; + for (col++; col < tablesize; col++) { + if (tableptr[col]) + return tableptr[col]; + } + // null at end and reset to start + col = -1; + return NULL; +} + +// load a munched word list and build a hash table on the fly +int HashMgr::load_tables(const char* tpath, const char* key) { + // open dictionary file + FileMgr* dict = new FileMgr(tpath, key); + if (dict == NULL) + return 1; + + // first read the first line of file to get hash table size */ + std::string ts; + if (!dict->getline(ts)) { + HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath); + delete dict; + return 2; + } + mychomp(ts); + + /* remove byte order mark */ + if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { + ts.erase(0, 3); + } + + tablesize = atoi(ts.c_str()); + + int nExtra = 5 + USERWORD; + + if (tablesize <= 0 || + (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) / + int(sizeof(struct hentry*)))) { + HUNSPELL_WARNING( + stderr, "error: line 1: missing or bad word count in the dic file\n"); + delete dict; + return 4; + } + tablesize += nExtra; + if ((tablesize % 2) == 0) + tablesize++; + + // allocate the hash table + tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*)); + if (!tableptr) { + delete dict; + return 3; + } + + // loop through all words on much list and add to hash + // table and create word and affix strings + + std::vector<w_char> workbuf; + + while (dict->getline(ts)) { + mychomp(ts); + // split each line into word and morphological description + size_t dp_pos = 0; + while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) { + if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) { + for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos) + ; + if (dp_pos == 0) { // missing word + dp_pos = std::string::npos; + } else { + ++dp_pos; + } + break; + } + ++dp_pos; + } + + // tabulator is the old morphological field separator + size_t dp2_pos = ts.find('\t'); + if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) { + dp_pos = dp2_pos + 1; + } + + std::string dp; + if (dp_pos != std::string::npos) { + dp.assign(ts.substr(dp_pos)); + ts.resize(dp_pos - 1); + } + + // split each line into word and affix char strings + // "\/" signs slash in words (not affix separator) + // "/" at beginning of the line is word character (not affix separator) + size_t ap_pos = ts.find('/'); + while (ap_pos != std::string::npos) { + if (ap_pos == 0) { + ++ap_pos; + continue; + } else if (ts[ap_pos - 1] != '\\') + break; + // replace "\/" with "/" + ts.erase(ap_pos - 1, 1); + ap_pos = ts.find('/', ap_pos); + } + + unsigned short* flags; + int al; + if (ap_pos != std::string::npos && ap_pos != ts.size()) { + std::string ap(ts.substr(ap_pos + 1)); + ts.resize(ap_pos); + if (aliasf) { + int index = atoi(ap.c_str()); + al = get_aliasf(index, &flags, dict); + if (!al) { + HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", + dict->getlinenum()); + } + } else { + al = decode_flags(&flags, ap.c_str(), dict, /* arena = */ true); + if (al == -1) { + HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); + delete dict; + return 6; + } + std::sort(flags, flags + al); + } + } else { + al = 0; + flags = NULL; + } + + int captype; + int wcl = get_clen_and_captype(ts, &captype, workbuf); + const std::string *dp_str = dp.empty() ? NULL : &dp; + // add the word and its index plus its capitalized form optionally + if (add_word(ts, wcl, flags, al, dp_str, false, captype) || + add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) { + delete dict; + return 5; + } + } + + delete dict; + return 0; +} + +// the hash function is a simple load and rotate +// algorithm borrowed +int HashMgr::hash(const char* word) const { + unsigned long hv = 0; + for (int i = 0; i < 4 && *word != 0; i++) + hv = (hv << 8) | (*word++); + while (*word != 0) { + ROTATE(hv, ROTATE_LEN); + hv ^= (*word++); + } + return (unsigned long)hv % tablesize; +} + +int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af, bool arena) const { + auto alloc = [arena, this](int n) { return arena ? this->arena_alloc(n) : malloc(n); }; + int len; + if (flags.empty()) { + *result = NULL; + return 0; + } + switch (flag_mode) { + case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) + len = flags.size(); + if (len % 2 == 1) + HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", + af->getlinenum()); + len /= 2; + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + for (int i = 0; i < len; i++) { + (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) + + (unsigned char)flags[i * 2 + 1]; + } + break; + } + case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 + // 23 233) + len = 1; + unsigned short* dest; + for (size_t i = 0; i < flags.size(); ++i) { + if (flags[i] == ',') + len++; + } + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + dest = *result; + const char* src = flags.c_str(); + for (const char* p = src; *p; p++) { + if (*p == ',') { + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING( + stderr, "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + *dest = (unsigned short)i; + if (*dest == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + src = p + 1; + dest++; + } + } + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, + "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + *dest = (unsigned short)i; + if (*dest == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + break; + } + case FLAG_UNI: { // UTF-8 characters + std::vector<w_char> w; + u8_u16(w, flags); + len = w.size(); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + memcpy(*result, w.data(), len * sizeof(short)); + break; + } + default: { // Ispell's one-character flags (erfg -> e r f g) + unsigned short* dest; + len = flags.size(); + *result = (unsigned short*)alloc(len * sizeof(unsigned short)); + if (!*result) + return -1; + dest = *result; + for (size_t i = 0; i < flags.size(); ++i) { + *dest = (unsigned char)flags[i]; + dest++; + } + } + } + return len; +} + +bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const { + if (flags.empty()) { + return false; + } + switch (flag_mode) { + case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) + size_t len = flags.size(); + if (len % 2 == 1) + HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", + af->getlinenum()); + len /= 2; + result.reserve(result.size() + len); + for (size_t i = 0; i < len; ++i) { + result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) + + (unsigned char)flags[i * 2 + 1]); + } + break; + } + case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 + // 23 233) + const char* src = flags.c_str(); + for (const char* p = src; *p; p++) { + if (*p == ',') { + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING( + stderr, "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + result.push_back((unsigned short)i); + if (result.back() == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + src = p + 1; + } + } + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, + "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + result.push_back((unsigned short)i); + if (result.back() == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + break; + } + case FLAG_UNI: { // UTF-8 characters + std::vector<w_char> w; + u8_u16(w, flags); + size_t len = w.size(); + size_t origsize = result.size(); + result.resize(origsize + len); + memcpy(result.data() + origsize, w.data(), len * sizeof(short)); + break; + } + default: { // Ispell's one-character flags (erfg -> e r f g) + result.reserve(flags.size()); + for (size_t i = 0; i < flags.size(); ++i) { + result.push_back((unsigned char)flags[i]); + } + } + } + return true; +} + +unsigned short HashMgr::decode_flag(const char* f) const { + unsigned short s = 0; + int i; + switch (flag_mode) { + case FLAG_LONG: + s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1]; + break; + case FLAG_NUM: + i = atoi(f); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", + i, DEFAULTFLAGS - 1); + s = (unsigned short)i; + break; + case FLAG_UNI: { + std::vector<w_char> w; + u8_u16(w, f); + if (!w.empty()) + memcpy(&s, w.data(), 1 * sizeof(short)); + break; + } + default: + s = *(unsigned char*)f; + } + if (s == 0) + HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); + return s; +} + +// This function is only called by external consumers, and so using the default +// allocator with mystrdup is correct. +char* HashMgr::encode_flag(unsigned short f) const { + if (f == 0) + return mystrdup("(NULL)"); + std::string ch; + if (flag_mode == FLAG_LONG) { + ch.push_back((unsigned char)(f >> 8)); + ch.push_back((unsigned char)(f - ((f >> 8) << 8))); + } else if (flag_mode == FLAG_NUM) { + std::ostringstream stream; + stream << f; + ch = stream.str(); + } else if (flag_mode == FLAG_UNI) { + const w_char* w_c = (const w_char*)&f; + std::vector<w_char> w(w_c, w_c + 1); + u16_u8(ch, w); + } else { + ch.push_back((unsigned char)(f)); + } + return mystrdup(ch.c_str()); +} + +// read in aff file and set flag mode +int HashMgr::load_config(const char* affpath, const char* key) { + int firstline = 1; + + // open the affix file + FileMgr* afflst = new FileMgr(affpath, key); + if (!afflst) { + HUNSPELL_WARNING( + stderr, "Error - could not open affix description file %s\n", affpath); + return 1; + } + + // read in each line ignoring any that do not + // start with a known line type indicator + + std::string line; + while (afflst->getline(line)) { + mychomp(line); + + /* remove byte order mark */ + if (firstline) { + firstline = 0; + if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { + line.erase(0, 3); + } + } + + /* parse in the try string */ + if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) { + if (flag_mode != FLAG_CHAR) { + HUNSPELL_WARNING(stderr, + "error: line %d: multiple definitions of the FLAG " + "affix file parameter\n", + afflst->getlinenum()); + } + if (line.find("long") != std::string::npos) + flag_mode = FLAG_LONG; + if (line.find("num") != std::string::npos) + flag_mode = FLAG_NUM; + if (line.find("UTF-8") != std::string::npos) + flag_mode = FLAG_UNI; + if (flag_mode == FLAG_CHAR) { + HUNSPELL_WARNING( + stderr, + "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", + afflst->getlinenum()); + } + } + + if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { + std::string st; + if (!parse_string(line, st, afflst->getlinenum())) { + delete afflst; + return 1; + } + forbiddenword = decode_flag(st.c_str()); + } + + if (line.compare(0, 3, "SET", 3) == 0) { + if (!parse_string(line, enc, afflst->getlinenum())) { + delete afflst; + return 1; + } + if (enc == "UTF-8") { + utf8 = 1; +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + initialize_utf_tbl(); +#endif +#endif + } else + csconv = get_current_cs(enc); + } + + if (line.compare(0, 4, "LANG", 4) == 0) { + if (!parse_string(line, lang, afflst->getlinenum())) { + delete afflst; + return 1; + } + langnum = get_lang_num(lang); + } + + /* parse in the ignored characters (for example, Arabic optional diacritics + * characters */ + if (line.compare(0, 6, "IGNORE", 6) == 0) { + if (!parse_array(line, ignorechars, ignorechars_utf16, + utf8, afflst->getlinenum())) { + delete afflst; + return 1; + } + } + + if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) { + if (!parse_aliasf(line, afflst)) { + delete afflst; + return 1; + } + } + + if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) { + if (!parse_aliasm(line, afflst)) { + delete afflst; + return 1; + } + } + + if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) + complexprefixes = 1; + + /* parse in the typical fault correcting table */ + if (line.compare(0, 3, "REP", 3) == 0) { + if (!parse_reptable(line, afflst)) { + delete afflst; + return 1; + } + } + + // don't check the full affix file, yet + if (((line.compare(0, 3, "SFX", 3) == 0) || + (line.compare(0, 3, "PFX", 3) == 0)) && + line.size() > 3 && isspace(line[3]) && + !reptable.empty()) // (REP table is in the end of Afrikaans aff file) + break; + } + + if (csconv == NULL) + csconv = get_current_cs(SPELL_ENCODING); + delete afflst; + return 0; +} + +/* parse in the ALIAS table */ +bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { + if (numaliasf != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return false; + } + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasf = atoi(std::string(start_piece, iter).c_str()); + if (numaliasf < 1) { + numaliasf = 0; + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; + } + aliasf = + (unsigned short**)arena_alloc(numaliasf * sizeof(unsigned short*)); + aliasflen = + (unsigned short*)arena_alloc(numaliasf * sizeof(unsigned short)); + if (!aliasf || !aliasflen) { + numaliasf = 0; + if (aliasf) + arena_free(aliasf); + if (aliasflen) + arena_free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + return false; + } + np++; + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(line, iter); + } + if (np != 2) { + numaliasf = 0; + arena_free(aliasf); + arena_free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return false; + } + + /* now parse the numaliasf lines to read in the remainder of the table */ + for (int j = 0; j < numaliasf; j++) { + std::string nl; + aliasf[j] = NULL; + aliasflen[j] = 0; + i = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { + errored = true; + break; + } + break; + } + case 1: { + std::string piece(start_piece, iter); + aliasflen[j] = + (unsigned short)decode_flags(&(aliasf[j]), piece, af, /* arena = */ true); + std::sort(aliasf[j], aliasf[j] + aliasflen[j]); + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(nl, iter); + } + } + if (!aliasf[j]) { + for (int k = 0; k < j; ++k) { + arena_free(aliasf[k]); + } + arena_free(aliasf); + arena_free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + numaliasf = 0; + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return false; + } + } + return true; +} + +int HashMgr::is_aliasf() const { + return (aliasf != NULL); +} + +int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const { + if ((index > 0) && (index <= numaliasf)) { + *fvec = aliasf[index - 1]; + return aliasflen[index - 1]; + } + HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", + af->getlinenum(), index); + *fvec = NULL; + return 0; +} + +/* parse morph alias definitions */ +bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) { + if (numaliasm != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return false; + } + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasm = atoi(std::string(start_piece, iter).c_str()); + if (numaliasm < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; + } + aliasm = (char**)arena_alloc(numaliasm * sizeof(char*)); + if (!aliasm) { + numaliasm = 0; + return false; + } + np++; + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(line, iter); + } + if (np != 2) { + numaliasm = 0; + arena_free(aliasm); + aliasm = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return false; + } + + /* now parse the numaliasm lines to read in the remainder of the table */ + for (int j = 0; j < numaliasm; j++) { + std::string nl; + aliasm[j] = NULL; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { + errored = true; + break; + } + break; + } + case 1: { + // add the remaining of the line + std::string::const_iterator end = nl.end(); + std::string chunk(start_piece, end); + if (complexprefixes) { + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); + } + size_t sl = chunk.length() + 1; + aliasm[j] = (char*)arena_alloc(sl); + if (aliasm[j]) { + memcpy(aliasm[j], chunk.c_str(), sl); + } + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(nl, iter); + } + } + if (!aliasm[j]) { + numaliasm = 0; + for (int k = 0; k < j; ++k) { + arena_free(aliasm[k]); + } + arena_free(aliasm); + aliasm = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return false; + } + } + return true; +} + +int HashMgr::is_aliasm() const { + return (aliasm != NULL); +} + +char* HashMgr::get_aliasm(int index) const { + if ((index > 0) && (index <= numaliasm)) + return aliasm[index - 1]; + HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); + return NULL; +} + +/* parse in the typical fault correcting table */ +bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) { + if (!reptable.empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return false; + } + int numrep = -1; + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrep = atoi(std::string(start_piece, iter).c_str()); + if (numrep < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return false; + } + reptable.reserve(numrep); + np++; + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(line, iter); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return false; + } + + /* now parse the numrep lines to read in the remainder of the table */ + for (int j = 0; j < numrep; ++j) { + std::string nl; + reptable.push_back(replentry()); + int type = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { + errored = true; + break; + } + break; + } + case 1: { + if (*start_piece == '^') + type = 1; + reptable.back().pattern.assign(start_piece + type, iter); + mystrrep(reptable.back().pattern, "_", " "); + if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { + type += 2; + reptable.back().pattern.resize(reptable.back().pattern.size() - 1); + } + break; + } + case 2: { + reptable.back().outstrings[type].assign(start_piece, iter); + mystrrep(reptable.back().outstrings[type], "_", " "); + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(nl, iter); + } + } + if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + reptable.clear(); + return false; + } + } + return true; +} + +// return replacing table +const std::vector<replentry>& HashMgr::get_reptable() const { + return reptable; +} + +void* HashMgr::arena_alloc(int num_bytes) { + static const int MIN_CHUNK_SIZE = 4096; + if (arena.empty() || (current_chunk_size - current_chunk_offset < num_bytes)) { + current_chunk_size = std::max(MIN_CHUNK_SIZE, num_bytes); + arena.push_back(std::make_unique<uint8_t[]>(current_chunk_size)); + current_chunk_offset = 0; + } + + uint8_t* ptr = &arena.back()[current_chunk_offset]; + current_chunk_offset += num_bytes; + outstanding_arena_allocations++; + return ptr; +} + +void HashMgr::arena_free(void* ptr) { + --outstanding_arena_allocations; + assert(outstanding_arena_allocations >= 0); +} |