Diffstat (limited to 'sphinx/search/__init__.py')
-rw-r--r--  sphinx/search/__init__.py  556
1 file changed, 556 insertions, 0 deletions
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
new file mode 100644
index 0000000..21758d3
--- /dev/null
+++ b/sphinx/search/__init__.py
@@ -0,0 +1,556 @@
+"""Create a full-text search index for offline search."""
+from __future__ import annotations
+
+import dataclasses
+import functools
+import html
+import json
+import pickle
+import re
+from importlib import import_module
+from os import path
+from typing import IO, TYPE_CHECKING, Any
+
+from docutils import nodes
+from docutils.nodes import Element, Node
+
+from sphinx import addnodes, package_dir
+from sphinx.environment import BuildEnvironment
+from sphinx.util.index_entries import split_index_msg
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+
+
+class SearchLanguage:
+ """
+ This class is the base class for search natural language preprocessors. If
+ you want to add support for a new language, you should override the methods
+ of this class.
+
+ You should override `lang` class property too (e.g. 'en', 'fr' and so on).
+
+ .. attribute:: stopwords
+
+ This is a set of stop words of the target language. Default `stopwords`
+ is empty. This word is used for building index and embedded in JS.
+
+ .. attribute:: js_splitter_code
+
+ Return splitter function of JavaScript version. The function should be
+ named as ``splitQuery``. And it should take a string and return list of
+ strings.
+
+ .. versionadded:: 3.0
+
+ .. attribute:: js_stemmer_code
+
+ Return stemmer class of JavaScript version. This class' name should be
+ ``Stemmer`` and this class must have ``stemWord`` method. This string is
+ embedded as-is in searchtools.js.
+
+ This class is used to preprocess search word which Sphinx HTML readers
+ type, before searching index. Default implementation does nothing.
+ """
+ lang: str | None = None
+ language_name: str | None = None
+ stopwords: set[str] = set()
+ js_splitter_code: str = ""
+ js_stemmer_rawcode: str | None = None
+ js_stemmer_code = """
+/**
+ * Dummy stemmer for languages without stemming rules.
+ */
+var Stemmer = function() {
+ this.stemWord = function(w) {
+ return w;
+ }
+}
+"""
+
+ _word_re = re.compile(r'\w+')
+
+ def __init__(self, options: dict) -> None:
+ self.options = options
+ self.init(options)
+
+ def init(self, options: dict) -> None:
+ """
+ Initialize the class with the options the user has given.
+ """
+
+ def split(self, input: str) -> list[str]:
+ """
+        Split a sentence into words.  The default implementation extracts
+        runs of word characters, which should be enough for most languages
+        except CJK languages.
+ """
+ return self._word_re.findall(input)
+
+ def stem(self, word: str) -> str:
+ """
+        Implement the Python version of the stemming algorithm.
+
+        The default implementation does nothing.  You should implement this
+        if the language has any stemming rules.
+
+        This method is used to preprocess search words before registering
+        them in the search index.  The stemming of the Python version and of
+        the JS version (given in the js_stemmer_code attribute) must be
+        compatible.
+ """
+ return word
+
+ def word_filter(self, word: str) -> bool:
+ """
+        Return True if the target word should be registered in the search
+        index.  This method is called after stemming.
+        """
+        return (
+            len(word) == 0 or not (
+                # skip short words starting with a hiragana character
+                # (U+3042..U+3093)
+                ((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
+                # skip stop words that start with a Latin-1 character
+                (ord(word[0]) < 256 and (
+                    word in self.stopwords
+                ))))
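+
+# A minimal sketch of adding a new language (hypothetical ``SearchExample``
+# class and 'xx' code, for illustration only; real implementations live in
+# the sphinx.search.* modules and usually also supply js_stemmer_code):
+#
+#     class SearchExample(SearchLanguage):
+#         lang = 'xx'
+#         language_name = 'Example'
+#         stopwords = {'the', 'a'}
+#
+#         def stem(self, word: str) -> str:
+#             # toy rule: strip a plural 's'
+#             return word[:-1] if word.endswith('s') else word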
+
+
+# SearchEnglish imported after SearchLanguage is defined due to circular import
+from sphinx.search.en import SearchEnglish
+
+
+def parse_stop_word(source: str) -> set[str]:
+ """
+    Parse a Snowball-style stop word list, such as:
+
+ * http://snowball.tartarus.org/algorithms/finnish/stop.txt
+ """
+ result: set[str] = set()
+ for line in source.splitlines():
+ line = line.split('|')[0] # remove comment
+ result.update(line.split())
+ return result
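+
+# For example, a Snowball stop word list marks comments with '|' and may put
+# several words on one line:
+#
+#     >>> sorted(parse_stop_word('olla   | to be\non ovat  | is are\n'))
+#     ['olla', 'on', 'ovat']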
+
+
+# maps language name to module.class or directly a class
+languages: dict[str, str | type[SearchLanguage]] = {
+ 'da': 'sphinx.search.da.SearchDanish',
+ 'de': 'sphinx.search.de.SearchGerman',
+ 'en': SearchEnglish,
+ 'es': 'sphinx.search.es.SearchSpanish',
+ 'fi': 'sphinx.search.fi.SearchFinnish',
+ 'fr': 'sphinx.search.fr.SearchFrench',
+ 'hu': 'sphinx.search.hu.SearchHungarian',
+ 'it': 'sphinx.search.it.SearchItalian',
+ 'ja': 'sphinx.search.ja.SearchJapanese',
+ 'nl': 'sphinx.search.nl.SearchDutch',
+ 'no': 'sphinx.search.no.SearchNorwegian',
+ 'pt': 'sphinx.search.pt.SearchPortuguese',
+ 'ro': 'sphinx.search.ro.SearchRomanian',
+ 'ru': 'sphinx.search.ru.SearchRussian',
+ 'sv': 'sphinx.search.sv.SearchSwedish',
+ 'tr': 'sphinx.search.tr.SearchTurkish',
+ 'zh': 'sphinx.search.zh.SearchChinese',
+}
+
+
+class _JavaScriptIndex:
+ """
+ The search index as JavaScript file that calls a function
+ on the documentation search object to register the index.
+ """
+
+ PREFIX = 'Search.setIndex('
+ SUFFIX = ')'
+
+ def dumps(self, data: Any) -> str:
+ return self.PREFIX + json.dumps(data) + self.SUFFIX
+
+ def loads(self, s: str) -> Any:
+ data = s[len(self.PREFIX):-len(self.SUFFIX)]
+        if not (data and s.startswith(self.PREFIX) and s.endswith(self.SUFFIX)):
+            raise ValueError('invalid data')
+ return json.loads(data)
+
+ def dump(self, data: Any, f: IO) -> None:
+ f.write(self.dumps(data))
+
+ def load(self, f: IO) -> Any:
+ return self.loads(f.read())
+
+
+js_index = _JavaScriptIndex()
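+
+# The serialised form wraps the JSON-encoded index in a JavaScript call:
+#
+#     >>> js_index.dumps({'docnames': ['index']})
+#     'Search.setIndex({"docnames": ["index"]})'
+#     >>> js_index.loads('Search.setIndex({"docnames": ["index"]})')
+#     {'docnames': ['index']}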
+
+
+def _is_meta_keywords(
+ node: nodes.meta, # type: ignore[name-defined]
+ lang: str | None,
+) -> bool:
+ if node.get('name') == 'keywords':
+ meta_lang = node.get('lang')
+ if meta_lang is None: # lang not specified
+ return True
+ elif meta_lang == lang: # matched to html_search_language
+ return True
+
+ return False
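+
+# For example, ``<meta name="keywords" content="...">`` (no lang attribute)
+# matches for any search language, while
+# ``<meta name="keywords" lang="fr" content="...">`` matches only when the
+# configured html_search_language is 'fr'.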
+
+
+@dataclasses.dataclass
+class WordStore:
+ words: list[str] = dataclasses.field(default_factory=list)
+ titles: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+ title_words: list[str] = dataclasses.field(default_factory=list)
+
+
+class WordCollector(nodes.NodeVisitor):
+ """
+ A special visitor that collects words for the `IndexBuilder`.
+ """
+
+ def __init__(self, document: nodes.document, lang: SearchLanguage) -> None:
+ super().__init__(document)
+ self.found_words: list[str] = []
+ self.found_titles: list[tuple[str, str]] = []
+ self.found_title_words: list[str] = []
+ self.lang = lang
+
+ def dispatch_visit(self, node: Node) -> None:
+ if isinstance(node, nodes.comment):
+ raise nodes.SkipNode
+ elif isinstance(node, nodes.raw):
+ if 'html' in node.get('format', '').split():
+ # Some people might put content in raw HTML that should be searched,
+ # so we just amateurishly strip HTML tags and index the remaining
+ # content
+ nodetext = re.sub(r'<style.*?</style>', '', node.astext(), flags=re.IGNORECASE|re.DOTALL)
+ nodetext = re.sub(r'<script.*?</script>', '', nodetext, flags=re.IGNORECASE|re.DOTALL)
+ nodetext = re.sub(r'<[^<]+?>', '', nodetext)
+ self.found_words.extend(self.lang.split(nodetext))
+ raise nodes.SkipNode
+ elif isinstance(node, nodes.Text):
+ self.found_words.extend(self.lang.split(node.astext()))
+ elif isinstance(node, nodes.title):
+ title = node.astext()
+ ids = node.parent['ids']
+ self.found_titles.append((title, ids[0] if ids else None))
+ self.found_title_words.extend(self.lang.split(title))
+ elif isinstance(node, Element) and _is_meta_keywords(node, self.lang.lang):
+ keywords = node['content']
+ keywords = [keyword.strip() for keyword in keywords.split(',')]
+ self.found_words.extend(keywords)
+
+
+class IndexBuilder:
+ """
+ Helper class that creates a search index based on the doctrees
+ passed to the `feed` method.
+ """
+ formats = {
+ 'json': json,
+ 'pickle': pickle
+ }
+
+ def __init__(self, env: BuildEnvironment, lang: str, options: dict, scoring: str) -> None:
+ self.env = env
+ # docname -> title
+ self._titles: dict[str, str] = env._search_index_titles
+ # docname -> filename
+ self._filenames: dict[str, str] = env._search_index_filenames
+ # stemmed words -> set(docname)
+ self._mapping: dict[str, set[str]] = env._search_index_mapping
+ # stemmed words in titles -> set(docname)
+ self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
+ # docname -> all titles in document
+ self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles
+ # docname -> list(index entry)
+ self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
+ # objtype -> index
+ self._objtypes: dict[tuple[str, str], int] = env._search_index_objtypes
+ # objtype index -> (domain, type, objname (localized))
+ self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
+ # add language-specific SearchLanguage instance
+ lang_class = languages.get(lang)
+
+        # fallback; try again with the bare language code (e.g. 'pt' for 'pt_BR')
+ if lang_class is None and '_' in lang:
+ lang_class = languages.get(lang.split('_')[0])
+
+ if lang_class is None:
+ self.lang: SearchLanguage = SearchEnglish(options)
+ elif isinstance(lang_class, str):
+ module, classname = lang_class.rsplit('.', 1)
+ lang_class: type[SearchLanguage] = getattr(import_module(module), classname) # type: ignore[no-redef]
+ self.lang = lang_class(options) # type: ignore[operator]
+ else:
+ # it's directly a class (e.g. added by app.add_search_language)
+ self.lang = lang_class(options)
+
+ if scoring:
+ with open(scoring, 'rb') as fp:
+ self.js_scorer_code = fp.read().decode()
+ else:
+ self.js_scorer_code = ''
+ self.js_splitter_code = ""
+
+ def load(self, stream: IO, format: Any) -> None:
+ """Reconstruct from frozen data."""
+ if isinstance(format, str):
+ format = self.formats[format]
+ frozen = format.load(stream)
+ # if an old index is present, we treat it as not existing.
+ if not isinstance(frozen, dict) or \
+ frozen.get('envversion') != self.env.version:
+ raise ValueError('old format')
+ index2fn = frozen['docnames']
+ self._filenames = dict(zip(index2fn, frozen['filenames']))
+ self._titles = dict(zip(index2fn, frozen['titles']))
+ self._all_titles = {}
+
+ for docname in self._titles.keys():
+ self._all_titles[docname] = []
+ for title, doc_tuples in frozen['alltitles'].items():
+ for doc, titleid in doc_tuples:
+ self._all_titles[index2fn[doc]].append((title, titleid))
+
+ def load_terms(mapping: dict[str, Any]) -> dict[str, set[str]]:
+ rv = {}
+ for k, v in mapping.items():
+ if isinstance(v, int):
+ rv[k] = {index2fn[v]}
+ else:
+ rv[k] = {index2fn[i] for i in v}
+ return rv
+
+ self._mapping = load_terms(frozen['terms'])
+ self._title_mapping = load_terms(frozen['titleterms'])
+ # no need to load keywords/objtypes
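+
+    # NB: when load() raises ValueError('old format'), the caller treats the
+    # existing index as missing and rebuilds it from scratch (see the
+    # envversion check above).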
+
+ def dump(self, stream: IO, format: Any) -> None:
+ """Dump the frozen index to a stream."""
+ if isinstance(format, str):
+ format = self.formats[format]
+ format.dump(self.freeze(), stream)
+
+ def get_objects(self, fn2index: dict[str, int]
+ ) -> dict[str, list[tuple[int, int, int, str, str]]]:
+ rv: dict[str, list[tuple[int, int, int, str, str]]] = {}
+ otypes = self._objtypes
+ onames = self._objnames
+ for domainname, domain in sorted(self.env.domains.items()):
+ for fullname, dispname, type, docname, anchor, prio in \
+ sorted(domain.get_objects()):
+ if docname not in fn2index:
+ continue
+ if prio < 0:
+ continue
+ fullname = html.escape(fullname)
+ dispname = html.escape(dispname)
+ prefix, _, name = dispname.rpartition('.')
+ plist = rv.setdefault(prefix, [])
+ try:
+ typeindex = otypes[domainname, type]
+ except KeyError:
+ typeindex = len(otypes)
+ otypes[domainname, type] = typeindex
+ otype = domain.object_types.get(type)
+ if otype:
+ # use str() to fire translation proxies
+ onames[typeindex] = (domainname, type,
+ str(domain.get_type_name(otype)))
+ else:
+ onames[typeindex] = (domainname, type, type)
+                if anchor == fullname:
+                    # encoded as '': the anchor equals the object name
+                    shortanchor = ''
+                elif anchor == type + '-' + fullname:
+                    # encoded as '-': the anchor is '<type>-<fullname>'
+                    shortanchor = '-'
+                else:
+                    shortanchor = anchor
+ plist.append((fn2index[docname], typeindex, prio, shortanchor, name))
+ return rv
+
+ def get_terms(self, fn2index: dict) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
+ rvs: tuple[dict[str, list[str]], dict[str, list[str]]] = ({}, {})
+ for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
+ for k, v in mapping.items():
+ if len(v) == 1:
+ fn, = v
+ if fn in fn2index:
+ rv[k] = fn2index[fn]
+ else:
+ rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
+ return rvs
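+
+    # get_terms compresses the mapping: a term that occurs in exactly one
+    # document is stored as a bare int rather than a one-element list, which
+    # shrinks searchindex.js; load_terms() above reverses this.  With
+    # hypothetical data:
+    #
+    #     >>> builder._mapping = {'sphinx': {'index'}, 'build': {'index', 'usage'}}
+    #     >>> builder._title_mapping = {}
+    #     >>> builder.get_terms({'index': 0, 'usage': 1})
+    #     ({'sphinx': 0, 'build': [0, 1]}, {})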
+
+ def freeze(self) -> dict[str, Any]:
+ """Create a usable data structure for serializing."""
+ docnames, titles = zip(*sorted(self._titles.items()))
+ filenames = [self._filenames.get(docname) for docname in docnames]
+ fn2index = {f: i for (i, f) in enumerate(docnames)}
+ terms, title_terms = self.get_terms(fn2index)
+
+ objects = self.get_objects(fn2index) # populates _objtypes
+ objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
+ objnames = self._objnames
+
+ alltitles: dict[str, list[tuple[int, str]]] = {}
+ for docname, titlelist in self._all_titles.items():
+ for title, titleid in titlelist:
+ alltitles.setdefault(title, []).append((fn2index[docname], titleid))
+
+ index_entries: dict[str, list[tuple[int, str]]] = {}
+ for docname, entries in self._index_entries.items():
+ for entry, entry_id, main_entry in entries:
+ index_entries.setdefault(entry.lower(), []).append((fn2index[docname], entry_id))
+
+ return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
+ objects=objects, objtypes=objtypes, objnames=objnames,
+ titleterms=title_terms, envversion=self.env.version,
+ alltitles=alltitles, indexentries=index_entries)
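+
+    # The frozen index is a plain dict which js_index serialises to JSON.
+    # Abbreviated shape (illustrative values):
+    #
+    #     {'docnames': ('index',), 'filenames': ['index.rst'],
+    #      'titles': ('Overview',), 'terms': {'sphinx': 0}, 'titleterms': {},
+    #      'objects': {}, 'objtypes': {}, 'objnames': {},
+    #      'alltitles': {'Overview': [(0, 'overview')]}, 'indexentries': {},
+    #      'envversion': ...}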
+
+ def label(self) -> str:
+ return f"{self.lang.language_name} (code: {self.lang.lang})"
+
+ def prune(self, docnames: Iterable[str]) -> None:
+ """Remove data for all docnames not in the list."""
+ new_titles = {}
+ new_alltitles = {}
+ new_filenames = {}
+ for docname in docnames:
+ if docname in self._titles:
+ new_titles[docname] = self._titles[docname]
+ new_alltitles[docname] = self._all_titles[docname]
+ new_filenames[docname] = self._filenames[docname]
+ self._titles = new_titles
+ self._filenames = new_filenames
+ self._all_titles = new_alltitles
+ for wordnames in self._mapping.values():
+ wordnames.intersection_update(docnames)
+ for wordnames in self._title_mapping.values():
+ wordnames.intersection_update(docnames)
+
+ def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
+ """Feed a doctree to the index."""
+ self._titles[docname] = title
+ self._filenames[docname] = filename
+
+ word_store = self._word_collector(doctree)
+
+ _filter = self.lang.word_filter
+ _stem = self.lang.stem
+
+ # memoise self.lang.stem
+ @functools.lru_cache(maxsize=None)
+ def stem(word_to_stem: str) -> str:
+ return _stem(word_to_stem).lower()
+
+ self._all_titles[docname] = word_store.titles
+
+ for word in word_store.title_words:
+            # prefer the stemmed form, but fall back to the unstemmed word:
+            # stemming must never remove a word from the search index.
+ stemmed_word = stem(word)
+ if _filter(stemmed_word):
+ self._title_mapping.setdefault(stemmed_word, set()).add(docname)
+ elif _filter(word):
+ self._title_mapping.setdefault(word, set()).add(docname)
+
+ for word in word_store.words:
+            # prefer the stemmed form, but fall back to the unstemmed word:
+            # stemming must never remove a word from the search index.
+ stemmed_word = stem(word)
+ if not _filter(stemmed_word) and _filter(word):
+ stemmed_word = word
+ already_indexed = docname in self._title_mapping.get(stemmed_word, ())
+ if _filter(stemmed_word) and not already_indexed:
+ self._mapping.setdefault(stemmed_word, set()).add(docname)
+
+ # find explicit entries within index directives
+ _index_entries: set[tuple[str, str, str]] = set()
+ for node in doctree.findall(addnodes.index):
+ for entry_type, value, target_id, main, _category_key in node['entries']:
+ try:
+ result = split_index_msg(entry_type, value)
+ except ValueError:
+ pass
+ else:
+ target_id = target_id or ''
+ if entry_type in {'see', 'seealso'}:
+ _index_entries.add((result[0], target_id, main))
+ _index_entries |= {(x, target_id, main) for x in result}
+
+ self._index_entries[docname] = sorted(_index_entries)
+
+ def _word_collector(self, doctree: nodes.document) -> WordStore:
+ def _visit_nodes(node):
+ if isinstance(node, nodes.comment):
+ return
+ elif isinstance(node, nodes.raw):
+ if 'html' in node.get('format', '').split():
+ # Some people might put content in raw HTML that should be searched,
+ # so we just amateurishly strip HTML tags and index the remaining
+ # content
+ nodetext = re.sub(r'<style.*?</style>', '', node.astext(),
+ flags=re.IGNORECASE | re.DOTALL)
+ nodetext = re.sub(r'<script.*?</script>', '', nodetext,
+ flags=re.IGNORECASE | re.DOTALL)
+ nodetext = re.sub(r'<[^<]+?>', '', nodetext)
+ word_store.words.extend(split(nodetext))
+ return
+ elif (isinstance(node, nodes.meta) # type: ignore[attr-defined]
+ and _is_meta_keywords(node, language)):
+ keywords = [keyword.strip() for keyword in node['content'].split(',')]
+ word_store.words.extend(keywords)
+ elif isinstance(node, nodes.Text):
+ word_store.words.extend(split(node.astext()))
+ elif isinstance(node, nodes.title):
+ title = node.astext()
+ ids = node.parent['ids']
+ word_store.titles.append((title, ids[0] if ids else None))
+ word_store.title_words.extend(split(title))
+ for child in node.children:
+ _visit_nodes(child)
+ return
+
+ word_store = WordStore()
+ split = self.lang.split
+ language = self.lang.lang
+ _visit_nodes(doctree)
+ return word_store
+
+ def context_for_searchtool(self) -> dict[str, Any]:
+ if self.lang.js_splitter_code:
+ js_splitter_code = self.lang.js_splitter_code
+ else:
+ js_splitter_code = self.js_splitter_code
+
+ return {
+ 'search_language_stemming_code': self.get_js_stemmer_code(),
+ 'search_language_stop_words': json.dumps(sorted(self.lang.stopwords)),
+ 'search_scorer_tool': self.js_scorer_code,
+ 'search_word_splitter_code': js_splitter_code,
+ }
+
+ def get_js_stemmer_rawcodes(self) -> list[str]:
+ """Returns a list of non-minified stemmer JS files to copy."""
+ if self.lang.js_stemmer_rawcode:
+ return [
+ path.join(package_dir, 'search', 'non-minified-js', fname)
+ for fname in ('base-stemmer.js', self.lang.js_stemmer_rawcode)
+ ]
+ else:
+ return []
+
+ def get_js_stemmer_rawcode(self) -> str | None:
+ return None
+
+ def get_js_stemmer_code(self) -> str:
+ """Returns JS code that will be inserted into language_data.js."""
+ if self.lang.js_stemmer_rawcode:
+ js_dir = path.join(package_dir, 'search', 'minified-js')
+ with open(path.join(js_dir, 'base-stemmer.js'), encoding='utf-8') as js_file:
+ base_js = js_file.read()
+ with open(path.join(js_dir, self.lang.js_stemmer_rawcode), encoding='utf-8') as js_file:
+ language_js = js_file.read()
+ return ('%s\n%s\nStemmer = %sStemmer;' %
+ (base_js, language_js, self.lang.language_name))
+ else:
+ return self.lang.js_stemmer_code
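+
+# A minimal end-to-end sketch (assuming a configured BuildEnvironment ``env``
+# and a parsed doctree ``tree``, both produced by a normal Sphinx build;
+# roughly what the HTML builder does when writing searchindex.js):
+#
+#     builder = IndexBuilder(env, 'en', {}, scoring='')
+#     builder.feed('index', 'index.rst', 'Overview', tree)
+#     with open('searchindex.js', 'w', encoding='utf-8') as f:
+#         js_index.dump(builder.freeze(), f)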