Diffstat (limited to 'sphinx/search/zh.py')
-rw-r--r--  sphinx/search/zh.py  262
 1 file changed, 262 insertions(+), 0 deletions(-)
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
new file mode 100644
index 0000000..2a3a6e7
--- /dev/null
+++ b/sphinx/search/zh.py
@@ -0,0 +1,262 @@
+"""Chinese search language: includes routine to split words."""
+
+from __future__ import annotations
+
+import os
+import re
+
+import snowballstemmer
+
+from sphinx.search import SearchLanguage
+
+try:
+    import jieba
+    JIEBA = True
+except ImportError:
+    JIEBA = False
+
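+# Chinese documentation routinely mixes Latin-script terms (identifiers,
+# acronyms, English words) in with CJK text; those terms are stemmed with the
+# English stemmer configured below, so the usual English stopwords still apply.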
+english_stopwords = set("""
+a and are as at
+be but by
+for
+if in into is it
+near no not
+of on or
+such
+that the their then there these they this to
+was will with
+""".split())
+
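+# JavaScript counterpart of the stemmer configured in init(): the classic
+# Porter algorithm, shipped to the browser so query terms are stemmed at
+# search time.  (snowballstemmer's 'english' algorithm is Porter2, which can
+# differ from classic Porter on a few words.)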
+js_porter_stemmer = """
+/**
+ * Porter Stemmer
+ */
+var Stemmer = function() {
+
+  var step2list = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  };
+
+  var step3list = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  };
+
+  var c = "[^aeiou]"; // consonant
+  var v = "[aeiouy]"; // vowel
+  var C = c + "[^aeiouy]*"; // consonant sequence
+  var V = v + "[aeiou]*"; // vowel sequence
+
+  var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
+  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
+  var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
+  var s_v = "^(" + C + ")?" + v; // vowel in stem
+
+  this.stemWord = function (w) {
+    var stem;
+    var suffix;
+    var firstch;
+    var origword = w;
+
+    if (w.length < 3)
+      return w;
+
+    var re;
+    var re2;
+    var re3;
+    var re4;
+
+    firstch = w.substr(0,1);
+    if (firstch == "y")
+      w = firstch.toUpperCase() + w.substr(1);
+
+    // Step 1a
+    re = /^(.+?)(ss|i)es$/;
+    re2 = /^(.+?)([^s])s$/;
+
+    if (re.test(w))
+      w = w.replace(re,"$1$2");
+    else if (re2.test(w))
+      w = w.replace(re2,"$1$2");
+
+    // Step 1b
+    re = /^(.+?)eed$/;
+    re2 = /^(.+?)(ed|ing)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      re = new RegExp(mgr0);
+      if (re.test(fp[1])) {
+        re = /.$/;
+        w = w.replace(re,"");
+      }
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1];
+      re2 = new RegExp(s_v);
+      if (re2.test(stem)) {
+        w = stem;
+        re2 = /(at|bl|iz)$/;
+        re3 = new RegExp("([^aeiouylsz])\\\\1$");
+        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+        if (re2.test(w))
+          w = w + "e";
+        else if (re3.test(w)) {
+          re = /.$/;
+          w = w.replace(re,"");
+        }
+        else if (re4.test(w))
+          w = w + "e";
+      }
+    }
+
+    // Step 1c
+    re = /^(.+?)y$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(s_v);
+      if (re.test(stem))
+        w = stem + "i";
+    }
+
+    // Step 2
+    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
+ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step2list[suffix];
+    }
+
+    // Step 3
+    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step3list[suffix];
+    }
+
+    // Step 4
+    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
+iti|ous|ive|ize)$/;
+    re2 = /^(.+?)(s|t)(ion)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      if (re.test(stem))
+        w = stem;
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1] + fp[2];
+      re2 = new RegExp(mgr1);
+      if (re2.test(stem))
+        w = stem;
+    }
+
+    // Step 5
+    re = /^(.+?)e$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      re2 = new RegExp(meq1);
+      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
+        w = stem;
+    }
+    re = /ll$/;
+    re2 = new RegExp(mgr1);
+    if (re.test(w) && re2.test(w)) {
+      re = /.$/;
+      w = w.replace(re,"");
+    }
+
+    // and turn initial Y back to y
+    if (firstch == "y")
+      w = firstch.toLowerCase() + w.substr(1);
+    return w;
+  }
+}
+"""
+
+
+class SearchChinese(SearchLanguage):
+    """
+    Chinese search implementation
+    """
+
+    lang = 'zh'
+    language_name = 'Chinese'
+    js_stemmer_code = js_porter_stemmer
+    stopwords = english_stopwords
+    latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
+
+    def init(self, options: dict) -> None:
+        # Per-instance state: a mutable class attribute would be shared by
+        # every build using this language.
+        self.latin_terms: list[str] = []
+        if JIEBA:
+            dict_path = options.get('dict')
+            if dict_path and os.path.isfile(dict_path):
+                jieba.load_userdict(dict_path)
+
+        self.stemmer = snowballstemmer.stemmer('english')
+
+    def split(self, input: str) -> list[str]:
+        chinese: list[str] = []
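+        # jieba's search-engine mode segments long compounds into overlapping
+        # sub-words as well, which raises recall for shorter query terms.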
+        if JIEBA:
+            chinese = list(jieba.cut_for_search(input))
+
+        latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
+        self.latin_terms.extend(latin1)
+        return chinese + latin1
+
+    def word_filter(self, stemmed_word: str) -> bool:
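+        # Reject single-character terms: in Chinese they are far too common
+        # to be useful search terms.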
+        return len(stemmed_word) > 1
+
+    def stem(self, word: str) -> str:
+        # Don't stem Latin words that are long enough to be relevant for
+        # search as-is but would become too short once stemmed; this avoids
+        # some issues with acronyms.
+        lowered = word.lower()
+        stemmed = self.stemmer.stemWord(lowered)
+        should_not_be_stemmed = (
+            word in self.latin_terms and
+            len(word) >= 3 and
+            len(stemmed) < 3
+        )
+        return lowered if should_not_be_stemmed else stemmed
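
For reference, this language is selected through a project's conf.py; the
'dict' key below is the option read by init() above, and the dictionary path
is only an illustration:

    # conf.py -- minimal sketch
    language = 'zh'
    # Optional: a custom jieba user dictionary (requires the jieba package);
    # the path is hypothetical.
    html_search_options = {'dict': 'path/to/user_dict.txt'}

With jieba installed, split() interleaves the segmented CJK terms with any
Latin tokens it finds. Assuming SearchLanguage.__init__() takes the options
dict and calls init(), a rough interactive sketch (the exact segmentation
depends on jieba's dictionary, so the output is indicative only):

    >>> from sphinx.search.zh import SearchChinese
    >>> lang = SearchChinese({})
    >>> lang.split('Sphinx 全文检索')
    ['全文', '检索', '全文检索', 'Sphinx']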