firefox/toolkit/components/ml/content/HttpInference.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

// Grammar text sent along with requests to steer models toward readable,
// parseable JSON-ish output: one object whose values are strings or arrays of
// strings. Some whitespace is allowed to avoid token resampling, and string
// contents must be escaped. One rule per entry; the joined text ends with a
// trailing newline.
const JSON_GRAMMAR = [
  'root ::= ws? "{" ws* key ( "," ws* key )* ws* "}" ws?',
  'arr ::= "[" ws* str ( "," ws* str )* ws* "]" ws?',
  'key ::= str ":" ws? val',
  'str ::= ["] ( "\\\\" ["n] | [^\\\\"\\n] )+ ["] ws?',
  "val ::= arr | str",
  // The whitespace class holds a literal space and a literal newline
  "ws  ::= [ \n]",
  "",
].join("\n");

/**
 * Handle various kinds of ai/ml http inference APIs.
 */
export const HttpInference = {
  /**
   * GenAI prompt completion
   *
   * The request body format is picked from the suffix of `endpoint`
   * ("/v1/chat/completions", ":predict", ":streamGenerateContent",
   * "/completion", or a generic `{ model, prompt }` body otherwise). Streaming
   * is only used for the chat-completions and "/completion" formats.
   *
   * Request, HTTP status and response-parsing failures are not thrown: the
   * returned string is then a diagnostic made of the endpoint, HTTP status,
   * error and last parsed response, separated by blank lines.
   *
   * @param {object} config options for the request
   * @param {string} config.endpoint http completion API
   * @param {string} config.prompt to send; a whole-word "json" in it (any
   *   case) requests JSON output, and `%key%` placeholders are filled from
   *   `context`
   * @param {string} config.bearer optional token for some endpoints
   * @param {string} config.model optional for some endpoints
   * @param {Function} config.onStream optional callback for streaming
   *   response; called with the text accumulated so far after each chunk
   * @param {object} context optional placeholder values; each value is
   *   inserted JSON-stringified wherever `%key%` appears in the prompt
   * @returns {Promise<string>} response of the completion request, or the
   *   diagnostic text described above on failure
   */
  async completion(
    { bearer, endpoint, model, prompt, onStream },
    context = {}
  ) {
    let request, response;

    // Diagnostic text returned instead of a completion when something fails
    const describeFailure = detail =>
      [endpoint, request?.status, detail, JSON.stringify(response)].join(
        "\n\n"
      );

    // Try to get JSON response if prompt includes "json". This looks at the
    // prompt before context values are substituted in.
    const expectJSON = prompt.search(/\bjson\b/i) >= 0;

    // Conditionally add prompt context if needed and allowed. Every occurrence
    // of a placeholder is filled, and the replacer function keeps "$&"-style
    // sequences inside a value from being treated as replacement patterns.
    for (const [key, val] of Object.entries(context)) {
      prompt = prompt.replaceAll(`%${key}%`, () => JSON.stringify(val));
    }

    let streaming = !!onStream;

    // TODO: Pick a body format in a smarter way
    const body = {};
    if (endpoint.endsWith("/v1/chat/completions")) {
      body.messages = [{ content: prompt, role: "user" }];
      body.max_tokens = 1024;
      body.model = model;
      if (streaming) {
        body.stream = true;
      }
      if (expectJSON) {
        // TODO: Better deciding when to include grammar
        if (endpoint.includes("localhost")) {
          body.grammar = JSON_GRAMMAR;
        }
        body.response_format = { type: "json_object" };
      }
    } else if (endpoint.endsWith(":predict")) {
      body.instances = [{ content: prompt }];
      body.parameters = { maxOutputTokens: 1024 };
      streaming = false;
    } else if (endpoint.endsWith(":streamGenerateContent")) {
      body.contents = [{ parts: [{ text: prompt }], role: "user" }];
      body.generation_config = { maxOutputTokens: 1024 };
      // This endpoint doesn't do server-sent events format
      streaming = false;
    } else if (endpoint.endsWith("/completion")) {
      body.prompt = prompt;
      if (streaming) {
        body.stream = true;
      }
      if (expectJSON) {
        body.grammar = JSON_GRAMMAR;
      }
    } else {
      body.model = model;
      body.prompt = prompt;
      streaming = false;
    }

    const headers = {
      "Content-Type": "application/json",
    };
    if (bearer) {
      headers.Authorization = `Bearer ${bearer}`;
    }

    let ret = "";
    try {
      request = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers,
        method: "POST",
      });

      if (request.status !== 200) {
        // Report the response body as the failure detail
        return describeFailure(await request.text());
      }

      if (streaming) {
        const reader = request.body.getReader();
        const decoder = new TextDecoder();
        // Text of a line whose end has not arrived yet
        let pending = "";
        // eslint-disable-next-line no-constant-condition
        while (true) {
          const { done, value } = await reader.read();

          // Streaming decode keeps a multi-byte character that is split across
          // two reads intact; the argument-less call flushes at end of stream.
          pending += done
            ? decoder.decode()
            : decoder.decode(value, { stream: true });

          // Only complete lines are parsed; the trailing partial line waits for
          // the next read so an event split across reads is not lost.
          const lines = pending.split("\n");
          pending = done ? "" : lines.pop();

          // Read the JSON data of each server-sent event
          for (const line of lines) {
            if (!line) {
              continue;
            }
            try {
              response = JSON.parse(line.replace(/^data: /, ""));
              const chunk =
                response.content ?? response.choices?.[0].delta.content;
              if (chunk?.length) {
                // Accumulate chunks for partial and final value
                ret += chunk;
                onStream(ret);
              }
            } catch (ex) {
              // Best effort: lines that are not JSON events (or have an
              // unexpected shape) are skipped without failing the request.
            }
          }

          if (done) {
            break;
          }
        }
      } else {
        response = await request.json();
        // Take the text from whichever known response shape is present; the
        // final fallback expects an array of candidate-bearing items.
        ret =
          response.response ??
          response.content ??
          response.choices?.[0].message.content ??
          response.predictions?.[0].content ??
          response.map(r => r.candidates[0].content.parts[0].text).join("");

        // Some wrap JSON responses in code block
        if (expectJSON) {
          ret = ret.replace(/^\s*```\s*(json)?/i, "").replace(/```\s*$/, "");
        }
      }
    } catch (ex) {
      ret = describeFailure(ex);
    }

    return ret;
  },
};