toolkit/components/translation/LanguageDetector.sys.mjs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

// workerManager is exported for tests.
import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";

// URL of the script loaded into the Worker that workerManager creates to run
// language detection.
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";

/**
 * The length of the substring to pull from the document's text for language
 * identification.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 *
 * For the moment, while we investigate which language identification library
 * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;

/**
 * Owns the lazily-created language detection worker, the queue of requests
 * awaiting a reply from it, and the idle timer used to tear it down.
 *
 * Exported for tests.
 */
export var workerManager = {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we
  // need to refresh the worker after we've processed a particularly large
  // string in order to prevent unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout
  // (in milliseconds) before destroying a worker. Once a string of the
  // maximum size has been processed, the worker is marked for
  // destruction, and is terminated as soon as it has been idle for the
  // given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth
  // for a 2MB heap.
  LARGE_STRING: 1.5 * 1024 * 1024,
  IDLE_TIMEOUT: 10 * 1000,

  // Requests that have been posted to the worker and are awaiting a reply,
  // oldest first. Each non-"ready" message from the worker resolves the
  // entry at the front, so replies are matched to requests purely by
  // position: every entry must correspond to exactly one message that was
  // actually posted.
  detectionQueue: [],

  /**
   * Post a detection request to the worker and wait for its reply.
   *
   * @param {object} aParams
   *   The message posted verbatim to the worker. The length of its `text`
   *   property is inspected once the reply arrives.
   * @returns {Promise<*>}
   *   Resolves with the data of the worker's reply. Rejects if the request
   *   could not be posted to the worker.
   */
  detectLanguage(aParams) {
    return this.workerReady
      .then(worker => {
        return new Promise((resolve, reject) => {
          const pending = { resolve };
          this.detectionQueue.push(pending);
          try {
            worker.postMessage(aParams);
          } catch (ex) {
            // Nothing was sent, so no reply will ever arrive for this entry.
            // Leaving it queued would hand the next reply to the wrong
            // request and keep the queue permanently non-empty, which
            // prevents _flushWorker from ever terminating the worker.
            const index = this.detectionQueue.indexOf(pending);
            if (index !== -1) {
              this.detectionQueue.splice(index, 1);
            }
            reject(ex);
          }
        });
      })
      .then(result => {
        // We have our asynchronous result from the worker.
        //
        // Determine if our input was large enough to trigger heap growth,
        // or if we're already waiting to destroy the worker when it's
        // idle. If so, schedule termination after the idle timeout.
        if (
          aParams.text.length >= this.LARGE_STRING ||
          this._idleTimeout != null
        ) {
          this.flushWorker();
        }

        return result;
      });
  },

  // The current Worker instance, or null when none has been created (or the
  // last one has been terminated).
  _worker: null,
  // Cached promise returned by `workerReady`; reset when the worker is
  // terminated so that the next access creates a fresh worker.
  _workerReadyPromise: null,

  /**
   * A promise for the worker, created on first access. It resolves with the
   * Worker once the worker has posted the string "ready". Any other message
   * from the worker is treated as the reply to the oldest queued request.
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      this._workerReadyPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = aMsg => {
          if (aMsg.data == "ready") {
            resolve(worker);
          } else {
            this.detectionQueue.shift().resolve(aMsg.data);
          }
        };
        this._worker = worker;
      });
    }

    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  // Any previously scheduled termination is cancelled first, so calling this
  // repeatedly keeps pushing the deadline back.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }

    this._idleTimeout = setTimeout(
      this._flushWorker.bind(this),
      this.IDLE_TIMEOUT
    );
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
    } else {
      if (this._worker) {
        this._worker.terminate();
      }

      this._worker = null;
      this._workerReadyPromise = null;
      this._idleTimeout = null;
    }
  },
};

export var LanguageDetector = {
  /**
   * Detect the language of a given string.
   *
   * The argument is either the text to analyze as a plain string, or an
   * object with these properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean; when true the text is analyzed as
   *      HTML rather than as plain text.
   *
   *  - 'language' (optional) A string naming the expected language. For
   *      text extracted from HTTP documents, this is expected to come
   *      from the Content-Language header.
   *
   *  - 'tld' (optional) A string naming the top-level domain of the
   *      document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the string was extracted from. Regardless of this
   *      value, the 'text' property must be a UTF-16 JavaScript string.
   *
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. Each element has a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    // A bare string is shorthand for a params object carrying only 'text'.
    const request = typeof aParams === "string" ? { text: aParams } : aParams;
    return workerManager.detectLanguage(request);
  },

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * For the moment, while we investigate which language identification library
   * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
   *
   * @param aDocument The document whose text content is sampled.
   * @returns {Promise<string | null>} The detected language code, or null
   *   when the detector is not confident in its result.
   */
  async detectLanguageFromDocument(aDocument) {
    // Serialize a bounded sample of the document as plain text.
    const docEncoder = Cu.createDocumentEncoder("text/plain");
    docEncoder.init(aDocument, "text/plain", docEncoder.SkipInvisibleContent);
    const sample = docEncoder.encodeToStringWithMaxLength(
      DOC_TEXT_TO_IDENTIFY_LENGTH
    );

    // Flatten the sample onto one line: drop carriage returns and turn
    // newlines into spaces.
    const text = sample.replaceAll("\r", "").replaceAll("\n", " ");

    const { language: detected, confident: isConfident } =
      await workerManager.detectLanguage({ text });

    // Schedule the worker for termination once it has been idle.
    workerManager.flushWorker();

    if (isConfident) {
      return detected;
    }
    return null;
  },
};