diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:07:11 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:07:11 +0000 |
commit | 63847496f14c813a5d80efd5b7de0f1294ffe1e3 (patch) | |
tree | 01c7571c7c762ceee70638549a99834fdd7c411b /www/spellfix1.html | |
parent | Initial commit. (diff) | |
download | sqlite3-63847496f14c813a5d80efd5b7de0f1294ffe1e3.tar.xz sqlite3-63847496f14c813a5d80efd5b7de0f1294ffe1e3.zip |
Adding upstream version 3.45.1.upstream/3.45.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'www/spellfix1.html')
-rw-r--r-- | www/spellfix1.html | 754 |
1 files changed, 754 insertions, 0 deletions
diff --git a/www/spellfix1.html b/www/spellfix1.html new file mode 100644 index 0000000..f4a06d9 --- /dev/null +++ b/www/spellfix1.html @@ -0,0 +1,754 @@ +<!DOCTYPE html> +<html><head> +<meta name="viewport" content="width=device-width, initial-scale=1.0"> +<meta http-equiv="content-type" content="text/html; charset=UTF-8"> +<link href="sqlite.css" rel="stylesheet"> +<title>The Spellfix1 Virtual Table</title> +<!-- path= --> +</head> +<body> +<div class=nosearch> +<a href="index.html"> +<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0"> +</a> +<div><!-- IE hack to prevent disappearing logo --></div> +<div class="tagline desktoponly"> +Small. Fast. Reliable.<br>Choose any three. +</div> +<div class="menu mainmenu"> +<ul> +<li><a href="index.html">Home</a> +<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a> +<li class='wideonly'><a href='about.html'>About</a> +<li class='desktoponly'><a href="docs.html">Documentation</a> +<li class='desktoponly'><a href="download.html">Download</a> +<li class='wideonly'><a href='copyright.html'>License</a> +<li class='desktoponly'><a href="support.html">Support</a> +<li class='desktoponly'><a href="prosupport.html">Purchase</a> +<li class='search' id='search_menubutton'> +<a href="javascript:void(0)" onclick='toggle_search()'>Search</a> +</ul> +</div> +<div class="menu submenu" id="submenu"> +<ul> +<li><a href='about.html'>About</a> +<li><a href='docs.html'>Documentation</a> +<li><a href='download.html'>Download</a> +<li><a href='support.html'>Support</a> +<li><a href='prosupport.html'>Purchase</a> +</ul> +</div> +<div class="searchmenu" id="searchmenu"> +<form method="GET" action="search"> +<select name="s" id="searchtype"> +<option value="d">Search Documentation</option> +<option value="c">Search Changelog</option> +</select> +<input type="text" name="q" id="searchbox" value=""> +<input type="submit" value="Go"> +</form> +</div> +</div> +<script> +function toggle_div(nm) { +var w = document.getElementById(nm); +if( w.style.display=="block" ){ +w.style.display = "none"; +}else{ +w.style.display = "block"; +} +} +function toggle_search() { +var w = document.getElementById("searchmenu"); +if( w.style.display=="block" ){ +w.style.display = "none"; +} else { +w.style.display = "block"; +setTimeout(function(){ +document.getElementById("searchbox").focus() +}, 30); +} +} +function div_off(nm){document.getElementById(nm).style.display="none";} +window.onbeforeunload = function(e){div_off("submenu");} +/* Disable the Search feature if we are not operating from CGI, since */ +/* Search is accomplished using CGI and will not work without it. */ +if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){ +document.getElementById("search_menubutton").style.display = "none"; +} +/* Used by the Hide/Show button beside syntax diagrams, to toggle the */ +function hideorshow(btn,obj){ +var x = document.getElementById(obj); +var b = document.getElementById(btn); +if( x.style.display!='none' ){ +x.style.display = 'none'; +b.innerHTML='show'; +}else{ +x.style.display = ''; +b.innerHTML='hide'; +} +return false; +} +var antiRobot = 0; +function antiRobotGo(){ +if( antiRobot!=3 ) return; +antiRobot = 7; +var j = document.getElementById("mtimelink"); +if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href"); +} +function antiRobotDefense(){ +document.body.onmousedown=function(){ +antiRobot |= 2; +antiRobotGo(); +document.body.onmousedown=null; +} +document.body.onmousemove=function(){ +antiRobot |= 2; +antiRobotGo(); +document.body.onmousemove=null; +} +setTimeout(function(){ +antiRobot |= 1; +antiRobotGo(); +}, 100) +antiRobotGo(); +} +antiRobotDefense(); +</script> +<div class=fancy> +<div class=nosearch> +<div class="fancy_title"> +The Spellfix1 Virtual Table +</div> +<div class="fancy_toc"> +<a onclick="toggle_toc()"> +<span class="fancy_toc_mark" id="toc_mk">►</span> +Table Of Contents +</a> +<div id="toc_sub"><div class="fancy-toc1"><a href="#overview">1. Overview</a></div> +<div class="fancy-toc1"><a href="#search_refinements">2. Search Refinements</a></div> +<div class="fancy-toc1"><a href="#virtual_table_details">3. Virtual Table Details</a></div> +<div class="fancy-toc1"><a href="#algorithm">4. Algorithm</a></div> +<div class="fancy-toc1"><a href="#configurable_edit_distance">5. Configurable Edit Distance</a></div> +<div class="fancy-toc1"><a href="#dealing_with_unusual_and_difficult_spellings">6. Dealing With Unusual And Difficult Spellings</a></div> +<div class="fancy-toc1"><a href="#auxiliary_functions">7. Auxiliary Functions</a></div> +<div class="fancy-toc1"><a href="#the_editdist3_function">8. The editdist3 function</a></div> +<div class="fancy-toc1"><a href="#the_editdist3_cost_table">9. The editdist3 COST table</a></div> +<div class="fancy-toc1"><a href="#experimenting_with_the_editcost3_function">10. Experimenting with the editcost3() function</a></div> +</div> +</div> +<script> +function toggle_toc(){ +var sub = document.getElementById("toc_sub") +var mk = document.getElementById("toc_mk") +if( sub.style.display!="block" ){ +sub.style.display = "block"; +mk.innerHTML = "▼"; +} else { +sub.style.display = "none"; +mk.innerHTML = "►"; +} +} +</script> +</div> + + + + +<h1 id="overview"><span>1. </span>Overview</h1> + +<p>This spellfix1 <a href="vtab.html">virtual table</a> can be used to search +a large vocabulary for close matches. For example, spellfix1 +can be used to suggest corrections to misspelled words. Or, +it could be used with <a href="fts3.html#fts4">FTS4</a> to do full-text search using potentially +misspelled words. + +</p><p>The implementation for the spellfix1 virtual table is held in the +SQLite source tree in the miscellaneous extensions folder and in +particular in the file +<a href="https://www.sqlite.org/src/finfo?name=ext/misc/spellfix.c">ext/misc/spellfix1.c</a>. +The spellfix1 virtual table is not included in the SQLite <a href="amalgamation.html">amalgamation</a> +and is not a part of any standard SQLite build. It is a <a href="loadext.html">loadable extension</a>. + +</p><p>Once the spellfix1 extension is loaded, an instance of the spellfix1 +virtual table is created like this: + +</p><blockquote><pre> +CREATE VIRTUAL TABLE demo USING spellfix1; +</pre></blockquote> + +<p>The "spellfix1" term is the name of the spellfix module and must be +entered as shown. The "demo" term is the +name of the virtual table you will be creating and can be altered +to suit the needs of your application. The virtual table is initially +empty. In order for the virtual table to be useful, you will need to +populate it with your vocabulary. Suppose you +have a list of words in a table named "big_vocabulary". Then do this: + +</p><blockquote><pre> +INSERT INTO demo(word) SELECT word FROM big_vocabulary; +</pre></blockquote> + +<p>If you intend to use this virtual table in cooperation with an <a href="fts3.html#fts4">FTS4</a> +table (for spelling correction of search terms) then you might extract +the vocabulary using an <a href="fts3.html#fts4aux">fts4aux</a> table: + +</p><blockquote><pre> +INSERT INTO demo(word) SELECT term FROM search_aux WHERE col='*'; +</pre></blockquote> + +<p>You can also provide the virtual table with a "rank" for each word. +The "rank" is an estimate of how common the word is. Larger numbers +mean the word is more common. If you omit the rank when populating +the table, then a rank of 1 is assumed. But if you have rank +information, you can supply it and the virtual table will show a +slight preference for selecting more commonly used terms. To +populate the rank from an fts4aux table "search_aux" do something +like this: + +</p><blockquote><pre> +INSERT INTO demo(word,rank) + SELECT term, documents FROM search_aux WHERE col='*'; +</pre></blockquote> + +<p>To query the virtual table, include a MATCH operator in the WHERE +clause. For example: + +</p><blockquote><pre> +SELECT word FROM demo WHERE word MATCH 'kennasaw'; +</pre></blockquote> + +<p>Using a dataset of American place names (derived from +<a href="http://geonames.usgs.gov/domestic/download_data.htm">http://geonames.usgs.gov/domestic/download_data.htm</a>) the query above +returns 20 results beginning with: + +</p><blockquote><pre> +kennesaw +kenosha +kenesaw +kenaga +keanak +</pre></blockquote> + +<p>If you append the character '*' to the end of the pattern, then +a prefix search is performed. For example: + +</p><blockquote><pre> +SELECT word FROM demo WHERE word MATCH 'kennes*'; +</pre></blockquote> + +<p>Yields 20 results beginning with: + +</p><blockquote><pre> +kennesaw +kennestone +kenneson +kenneys +keanes +keenes +</pre></blockquote> + +<h1 id="search_refinements"><span>2. </span>Search Refinements</h1> + +<p>By default, the spellfix1 table returns no more than 20 results. +(It might return less than 20 if there were fewer good matches.) +You can change the upper bound on the number of returned rows by +adding a "top=N" term to the WHERE clause of your query, where N +is the new maximum. For example, to see the 5 best matches: + +</p><blockquote><pre> +SELECT word FROM demo WHERE word MATCH 'kennes*' AND top=5; +</pre></blockquote> + +<p>Each entry in the spellfix1 virtual table is associated with +a particular language, identified by the integer "langid" column. +The default langid is 0 and if no other actions are taken, the +entire vocabulary is a part of the 0 language. But if your application +needs to operate in multiple languages, then you can specify different +vocabulary items for each language by specifying the langid field +when populating the table. For example: + +</p><blockquote><pre> +INSERT INTO demo(word,langid) SELECT word, 0 FROM en_vocabulary; +INSERT INTO demo(word,langid) SELECT word, 1 FROM de_vocabulary; +INSERT INTO demo(word,langid) SELECT word, 2 FROM fr_vocabulary; +INSERT INTO demo(word,langid) SELECT word, 3 FROM ru_vocabulary; +INSERT INTO demo(word,langid) SELECT word, 4 FROM cn_vocabulary; +</pre></blockquote> + +<p>After the virtual table has been populated with items from multiple +languages, specify the language of interest using a "langid=N" term +in the WHERE clause of the query: + +</p><blockquote><pre> +SELECT word FROM demo WHERE word MATCH 'hildes*' AND langid=1; +</pre></blockquote> + +<p>Note that if you do not include the "langid=N" term in the WHERE clause, +the search will be against language 0 (English in the example above.) +All spellfix1 searches are against a single language id. There is no +way to search all languages at once. + + +</p><h1 id="virtual_table_details"><span>3. </span>Virtual Table Details</h1> + +<p>Each row in the spellfix1 virtual table has a unique rowid +with seven columns plus five extra hidden columns. +The columns are as follows: + +</p><dl> +<dt><p><b>rowid</b></p></dt><dd> +A unique integer number associated with each +vocabulary item in the table. This can be used +as a foreign key on other tables in the database. + +</dd><dt><p><b>word</b></p></dt><dd> +The text of the word that matches the pattern. +Both word and pattern can contain unicode characters +and can be mixed case. + +</dd><dt><p><b>rank</b></p></dt><dd> +This is the rank of the word, as specified in the +original INSERT statement. + + +</dd><dt><p><b>distance</b></p></dt><dd> +This is an edit distance or Levenshtein distance going +from the pattern to the word. + +</dd><dt><p><b>langid</b></p></dt><dd> +This is the language-id of the word. All queries are +against a single language-id, which defaults to 0. +For any given query this value is the same on all rows. + +</dd><dt><p><b>score</b></p></dt><dd> +The score is a combination of rank and distance. The +idea is that a lower score is better. The virtual table +attempts to find words with the lowest score and +by default (unless overridden by ORDER BY) returns +results in order of increasing score. + +</dd><dt><p><b>matchlen</b></p></dt><dd> +In a prefix search, the matchlen is the number of characters in +the string that match against the prefix. For a non-prefix search, +this is the same as length(word). + +</dd><dt><p><b>phonehash</b></p></dt><dd> +This column shows the phonetic hash prefix that was used to restrict +the search. For any given query, this column should be the same for +every row. This information is available for diagnostic purposes and +is not normally considered useful in real applications. + +</dd><dt><p><b>top</b></p></dt><dd> +(HIDDEN) For any query, this value is the same on all +rows. It is an integer which is the maximum number of +rows that will be output. The actually number of rows +output might be less than this number, but it will never +be greater. The default value for top is 20, but that +can be changed for each query by including a term of +the form "top=N" in the WHERE clause of the query. + +</dd><dt><p><b>scope</b></p></dt><dd> +(HIDDEN) For any query, this value is the same on all +rows. The scope is a measure of how widely the virtual +table looks for matching words. Smaller values of +scope cause a broader search. The scope is normally +chosen automatically and is capped at 4. Applications +can change the scope by including a term of the form +"scope=N" in the WHERE clause of the query. Increasing +the scope will make the query run faster, but will reduce +the possible corrections. + +</dd><dt><p><b>srchcnt</b></p></dt><dd> +(HIDDEN) For any query, this value is the same on all +rows. This value is an integer which is the number of +words examined using the edit-distance algorithm to +find the top matches that are ultimately displayed. This +value is for diagnostic use only. + +</dd><dt><p><b>soundslike</b></p></dt><dd> +(HIDDEN) When inserting vocabulary entries, this field +can be set to a spelling that matches what the word +sounds like. See the DEALING WITH UNUSUAL AND DIFFICULT +SPELLINGS section below for details. + +</dd><dt><p><b>command</b></p></dt><dd> +(HIDDEN) The value of the "command" column is always NULL. However, +applications can insert special strings into the "command" column in order +to provoke certain behaviors in the spellfix1 virtual table. +For example, inserting the string 'reset' into the "command" column +will cause the virtual table to reread its edit distance weights +(if there are any). +</dd></dl> + +<h1 id="algorithm"><span>4. </span>Algorithm</h1> + +<p>The spellfix1 virtual table creates a single +shadow table named "%_vocab" (where the % is replaced by the name of +the virtual table; Ex: "demo_vocab" for the "demo" virtual table). +the shadow table contains the following columns: + +</p><dl> +<dt><p><b>id</b></p></dt><dd> +The unique id (INTEGER PRIMARY KEY) + +</dd><dt><p><b>rank</b></p></dt><dd> +The rank of word. + +</dd><dt><p><b>langid</b></p></dt><dd> +The language id for this entry. + +</dd><dt><p><b>word</b></p></dt><dd> +The original UTF8 text of the vocabulary word + +</dd><dt><p><b>k1</b></p></dt><dd> +The word transliterated into lower-case ASCII. +There is a standard table of mappings from non-ASCII +characters into ASCII. Examples: "æ" -> "ae", +"þ" -> "th", "ß" -> "ss", "á" -> "a", ... The +accessory function spellfix1_translit(X) will do +the non-ASCII to ASCII mapping. The built-in lower(X) +function will convert to lower-case. Thus: +k1 = lower(spellfix1_translit(word)). + +If the word is already all lower-case ASCII, then the k1 column +will contain a NULL. This reduces the storage requirements for +the %_vocab table and helps spellfix to run a little faster. +Therefore, it is advantageous to populate as much of the spellfix +table as possible using lower-case ASCII vocabulary. + +</dd><dt><p><b>k2</b></p></dt><dd> +This field holds a phonetic code derived from coalesce(k1,word). +Letters that have similar sounds are mapped into the same symbol. +For example, all vowels and vowel clusters become the +single symbol "A". And the letters "p", "b", "f", and +"v" all become "B". All nasal sounds are represented +as "N". And so forth. The mapping is based on +ideas found in Soundex, Metaphone, and other +long-standing phonetic matching systems. This key can +be generated by the function spellfix1_phonehash(X). +Hence: k2 = spellfix1_phonehash(coalesce(k1,word)) +</dd></dl> + +<p>There is also a function for computing the Wagner edit distance or the +Levenshtein distance between a pattern and a word. This function +is exposed as spellfix1_editdist(X,Y). The edit distance function +returns the "cost" of converting X into Y. Some transformations +cost more than others. Changing one vowel into a different vowel, +for example is relatively cheap, as is doubling a constant, or +omitting the second character of a double-constant. Other transformations +or more expensive. The idea is that the edit distance function returns +a low cost for words that are similar and a higher cost for words +that are further apart. In this implementation, the maximum cost +of any single-character edit (delete, insert, or substitute) is 100, +with lower costs for some edits (such as transforming vowels). + +</p><p>The "score" for a comparison is the edit distance between the pattern +and the word, adjusted down by the base-2 logarithm of the word rank. +For example, a match with distance 100 but rank 1000 would have a +score of 122 (= 100 - log2(1000) + 32) whereas a match with distance +100 with a rank of 1 would have a score of 131 (100 - log2(1) + 32). +(NB: The constant 32 is added to each score to keep it from going +negative in case the edit distance is zero.) In this way, frequently +used words get a slightly lower cost which tends to move them toward +the top of the list of alternative spellings. + +</p><p>A straightforward implementation of a spelling corrector would be +to compare the search term against every word in the vocabulary +and select the 20 with the lowest scores. However, there will +typically be hundreds of thousands or millions of words in the +vocabulary, and so this approach is not fast enough. + +</p><p>Suppose the term that is being spell-corrected is X. To limit +the search space, X is converted to a k2-like key using the +equivalent of: + +</p><blockquote><pre> + key = spellfix1_phonehash(lower(spellfix1_translit(X))) +</pre></blockquote> + +<p>This key is then limited to "scope" characters. The default scope +value is 4, but an alternative scope can be specified using the +"scope=N" term in the WHERE clause. After the key has been truncated, +the edit distance is run against every term in the vocabulary that +has a k2 value that begins with the abbreviated key. + +</p><p>For example, suppose the input word is "Paskagula". The phonetic +key is "BACACALA" which is then truncated to 4 characters "BACA". +The edit distance is then run on the 4980 entries (out of +272,597 entries total) of the vocabulary whose k2 values begin with +BACA, yielding "Pascagoula" as the best match. + +</p><p>Only terms of the vocabulary with a matching langid are searched. +Hence, the same table can contain entries from multiple languages +and only the requested language will be used. The default langid +is 0. + +<a name="configeditdist"></a> + +</p><h1 id="configurable_edit_distance"><span>5. </span>Configurable Edit Distance</h1> + +<p>The built-in Wagner edit-distance function with fixed weights can be +replaced by the <a href="spellfix1.html#editdist3">editdist3()</a> edit-distance function +with application-defined weights and support for unicode, by specifying +the "edit_cost_table=<i>TABLENAME</i>" parameter to the spellfix1 module +when the virtual table is created. +For example: + +</p><blockquote><pre> +CREATE VIRTUAL TABLE demo2 USING spellfix1(edit_cost_table=APPCOST); +</pre></blockquote> + +<p>The <a href="spellfix1.html#editdist3">editdist3()</a> edit-distance function can also be selected or +deselected at run-time by inserting an appropriate string into the +"command" column of the virtual table:</p> + +<blockquote><pre> +INSERT INTO demo2(command) VALUES('edit_cost_table=APPCOST'); +</pre></blockquote> + + +<p>In the examples above, the APPCOST table would be interrogated to find +the edit distance coefficients. It is the presence of the "edit_cost_table=" +parameter to the spellfix1 module name that causes editdist3() to be used +in place of the built-in edit distance function. If APPCOST is an empty +string, then the built-in Wagner edit-distance function is used. + +</p><p>The edit distance coefficients are normally read from the APPCOST table +once and there after stored in memory. Hence, run-time changes to the +APPCOST table will not normally affect the edit distance results. +However, inserting the special string 'reset' into the "command" column of the +virtual table causes the edit distance coefficients to be reread the +APPCOST table. Hence, applications should run a SQL statement similar +to the following when changes to the APPCOST table occur: + +</p><blockquote> +INSERT INTO demo2(command) VALUES("reset"); +</blockquote> + +<h1 id="dealing_with_unusual_and_difficult_spellings"><span>6. </span>Dealing With Unusual And Difficult Spellings</h1> + +<p>The algorithm above works quite well for most cases, but there are +exceptions. These exceptions can be dealt with by making additional +entries in the virtual table using the "soundslike" column. + +</p><p>For example, many words of Greek origin begin with letters "ps" where +the "p" is silent. Ex: psalm, pseudonym, psoriasis, psyche. In +another example, many Scottish surnames can be spelled with an +initial "Mac" or "Mc". Thus, "MacKay" and "McKay" are both pronounced +the same. + +</p><p>Accommodation can be made for words that are not spelled as they +sound by making additional entries into the virtual table for the +same word, but adding an alternative spelling in the "soundslike" +column. For example, the canonical entry for "psalm" would be this: + +</p><blockquote><pre> + INSERT INTO demo(word) VALUES('psalm'); +</pre></blockquote> + +<p>To enhance the ability to correct the spelling of "salm" into +"psalm", make an addition entry like this: + +</p><blockquote><pre> + INSERT INTO demo(word,soundslike) VALUES('psalm','salm'); +</pre></blockquote> + +<p>It is ok to make multiple entries for the same word as long as +each entry has a different soundslike value. Note that if no +soundslike value is specified, the soundslike defaults to the word +itself. + +</p><p>Listed below are some cases where it might make sense to add additional +soundslike entries. The specific entries will depend on the application +and the target language. + +</p><ul> +<li>Silent "p" in words beginning with "ps": psalm, psyche +</li><li>Silent "p" in words beginning with "pn": pneumonia, pneumatic +</li><li>Silent "p" in words beginning with "pt": pterodactyl, ptolemaic +</li><li>Silent "d" in words beginning with "dj": djinn, Djikarta +</li><li>Silent "k" in words beginning with "kn": knight, Knuthson +</li><li>Silent "g" in words beginning with "gn": gnarly, gnome, gnat +</li><li>"Mac" versus "Mc" beginning Scottish surnames +</li><li>"Tch" sounds in Slavic words: Tchaikovsky vs. Chaykovsky +</li><li>The letter "j" pronounced like "h" in Spanish: LaJolla +</li><li>Words beginning with "wr" versus "r": write vs. rite +</li><li>Miscellaneous problem words such as "debt", "tsetse", + "Nguyen", "Van Nuyes". +</li></ul> + +<h1 id="auxiliary_functions"><span>7. </span>Auxiliary Functions</h1> + +<p>The source code module that implements the spellfix1 virtual table also +implements several SQL functions that might be useful to applications +that employ spellfix1 or for testing or diagnostic work while developing +applications that use spellfix1. The following auxiliary functions are +available: + +</p><dl> +<dt><p><b>editdist3(P,W)<br>editdist3(P,W,L)<br>editdist3(T)</b></p></dt><dd> +These routines provide direct access to the version of the Wagner +edit-distance function that allows for application-defined weights +on edit operations. The first two forms of this function compare +pattern P against word W and return the edit distance. In the first +function, the langid is assumed to be 0 and in the second, the +langid is given by the L parameter. The third form of this function +reloads edit distance coefficients from the table named by T. + +</dd><dt><p><b>spellfix1_editdist(P,W)</b></p></dt><dd> +This routine provides access to the built-in Wagner edit-distance +function that uses default, fixed costs. The value returned is +the edit distance needed to transform W into P. + +</dd><dt><p><b>spellfix1_phonehash(X)</b></p></dt><dd> +This routine constructs a phonetic hash of the pure ascii input word X +and returns that hash. This routine is used internally by spellfix1 in +order to transform the K1 column of the shadow table into the K2 +column. + +</dd><dt><p><b>spellfix1_scriptcode(X)</b></p></dt><dd> +Given an input string X, this routine attempts to determine the dominant +script of that input and returns the ISO-15924 numeric code for that +script. The current implementation understands the following scripts: +<ul> +<li> 215 - Latin +</li><li> 220 - Cyrillic +</li><li> 200 - Greek +</li></ul> +Additional language codes might be added in future releases. + +</dd><dt><p><b>spellfix1_translit(X)</b></p></dt><dd> +This routine transliterates unicode text into pure ascii, returning +the pure ascii representation of the input text X. This is the function +that is used internally to transform vocabulary words into the K1 +column of the shadow table. + +</dd></dl> + +<a name="editdist3"></a> + +<h1 id="the_editdist3_function"><span>8. </span>The editdist3 function</h1> + +<p>The editdist3 algorithm is a function that computes the minimum edit +distance (a.k.a. the Levenshtein distance) between two input strings. +The editdist3 algorithm is a configurable alternative to the default +edit distance function of spellfix1. +Features of editdist3 include: + +</p><ul> +<li><p>It works with unicode (UTF8) text. + +</p></li><li><p>A table of insertion, deletion, and substitution costs can be + provided by the application. + +</p></li><li><p>Multi-character insertions, deletions, and substitutions can be + enumerated in the cost table. +</p></li></ul> + +<h1 id="the_editdist3_cost_table"><span>9. </span>The editdist3 COST table</h1> + +<p>To program the costs of editdist3, create a table such as the following: + +</p><blockquote><pre> +CREATE TABLE editcost( + iLang INT, -- The language ID + cFrom TEXT, -- Convert text from this + cTo TEXT, -- Convert text into this + iCost INT -- The cost of doing the conversion +); +</pre></blockquote> + +<p>The cost table can be named anything you want - it does not have to be +called "editcost". And the table can contain additional columns. +The only requirement is that the +table must contain the four columns show above, with exactly the names shown. + +</p><p>The iLang column is a non-negative integer that identifies a set of costs +appropriate for a particular language. The editdist3 function will only use +a single iLang value for any given edit-distance computation. The default +value is 0. It is recommended that applications that only need to use a +single language always use iLang==0 for all entries. + +</p><p>The iCost column is the numeric cost of transforming cFrom into cTo. This +value should be a non-negative integer, and should probably be less than 100. +The default single-character insertion and deletion costs are 100 and the +default single-character to single-character substitution cost is 150. A +cost of 10000 or more is considered "infinite" and causes the rule to be +ignored. + +</p><p>The cFrom and cTo columns show edit transformation strings. Either or both +columns may contain more than one character. Or either column (but not both) +may hold an empty string. When cFrom is empty, that is the cost of inserting +cTo. When cTo is empty, that is the cost of deleting cFrom. + +</p><p>In the spellfix1 algorithm, cFrom is the text as the user entered it and +cTo is the correctly spelled text as it exists in the database. The goal +of the editdist3 algorithm is to determine how close the user-entered text is +to the dictionary text. + +</p><p>There are three special-case entries in the cost table: + +</p><table border="1" align="center"> +<tr><th>cFrom</th><th>cTo</th><th>Meaning</th></tr> +<tr><td>''</td><td>'?'</td><td>The default insertion cost</td></tr> +<tr><td>'?'</td><td>''</td><td>The default deletion cost</td></tr> +<tr><td>'?'</td><td>'?'</td><td>The default substitution cost</td></tr> +</table> + +<p>If any of the special-case entries shows above are omitted, then the +value of 100 is used for insertion and deletion and 150 is used for +substitution. To disable the default insertion, deletion, and/or substitution +set their respective cost to 10000 or more. + +</p><p>Other entries in the cost table specific transforms for particular +characters. +The cost of specific transforms should be less than the default costs, or else +the default costs will take precedence and the specific transforms will never +be used. + +</p><p>Some example, cost table entries: + +</p><blockquote><pre> +INSERT INTO editcost(iLang, cFrom, cTo, iCost) +VALUES(0, 'a', 'ä', 5); +</pre></blockquote> + +<p>The rule above says that the letter "a" in user input can be matched against +the letter "ä" in the dictionary with a penalty of 5. + +</p><blockquote><pre> +INSERT INTO editcost(iLang, cFrom, cTo, iCost) +VALUES(0, 'ss', 'ß', 8); +</pre></blockquote> + +<p>The number of characters in cFrom and cTo do not need to be the same. The +rule above says that "ss" on user input will match "ß" with a penalty of 8. + +</p><h1 id="experimenting_with_the_editcost3_function"><span>10. </span>Experimenting with the editcost3() function</h1> + +<p>The spellfix1 virtual table +uses editdist3 if the "edit_cost_table=TABLE" option +is specified as an argument when the spellfix1 virtual table is created. +But editdist3 can also be tested directly using the built-in "editdist3()" +SQL function. The editdist3() SQL function has 3 forms: + +</p><ol> +<li> editdist3('TABLENAME'); +</li><li> editdist3('string1', 'string2'); +</li><li> editdist3('string1', 'string2', langid); +</li></ol> + +<p>The first form loads the edit distance coefficients from a table called +'TABLENAME'. Any prior coefficients are discarded. So when experimenting +with weights and the weight table changes, simply rerun the single-argument +form of editdist3() to reload revised coefficients. Note that the +edit distance +weights used by the editdist3() SQL function are independent from the +weights used by the spellfix1 virtual table. + +</p><p>The second and third forms return the computed edit distance between strings +'string1' and "string2'. In the second form, a language id of 0 is used. +The language id is specified in the third form. +</p> + |