summaryrefslogtreecommitdiffstats
path: root/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
commit43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree620249daf56c0258faa40cbdcf9cfba06de2a846 /dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
parentInitial commit. (diff)
downloadfirefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp')
-rw-r--r--dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp462
1 files changed, 462 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
new file mode 100644
index 0000000000..e68ccc417e
--- /dev/null
+++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
@@ -0,0 +1,462 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsThreadUtils.h"
+#include "nsXPCOMCIDInternal.h"
+#include "OnlineSpeechRecognitionService.h"
+#include "nsIFile.h"
+#include "SpeechGrammar.h"
+#include "SpeechRecognition.h"
+#include "SpeechRecognitionAlternative.h"
+#include "SpeechRecognitionResult.h"
+#include "SpeechRecognitionResultList.h"
+#include "nsIObserverService.h"
+#include "mozilla/dom/Document.h"
+#include "mozilla/Preferences.h"
+#include "mozilla/ScopeExit.h"
+#include "mozilla/StaticPrefs_media.h"
+#include "mozilla/Services.h"
+#include "nsDirectoryServiceDefs.h"
+#include "nsDirectoryServiceUtils.h"
+#include "nsNetUtil.h"
+#include "nsContentUtils.h"
+#include "nsIChannel.h"
+#include "nsIHttpChannel.h"
+#include "nsIPrincipal.h"
+#include "nsIStreamListener.h"
+#include "nsIUploadChannel2.h"
+#include "mozilla/dom/ClientIPCTypes.h"
+#include "nsStringStream.h"
+#include "nsIOutputStream.h"
+#include "nsStreamUtils.h"
+#include "OpusTrackEncoder.h"
+#include "OggWriter.h"
+#include "nsIClassOfService.h"
+#include <json/json.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace mozilla {
+
+using namespace dom;
+
+#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \
+ "media.webspeech.service.endpoint"
+#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/"
+#define MAX_LISTENING_TIME_MS 10000
+
+NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService,
+ nsIStreamListener)
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) {
+ MOZ_ASSERT(NS_IsMainThread());
+ return NS_OK;
+}
+
+static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure,
+ const char* aFromRawSegment,
+ uint32_t aToOffset, uint32_t aCount,
+ uint32_t* aWriteCount) {
+ nsCString* buf = static_cast<nsCString*>(aClosure);
+ buf->Append(aFromRawSegment, aCount);
+ *aWriteCount = aCount;
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest,
+ nsIInputStream* aInputStream,
+ uint64_t aOffset,
+ uint32_t aCount) {
+ MOZ_ASSERT(NS_IsMainThread());
+ nsresult rv;
+ uint32_t readCount;
+ rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount,
+ &readCount);
+ NS_ENSURE_SUCCESS(rv, rv);
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest,
+ nsresult aStatusCode) {
+ MOZ_ASSERT(NS_IsMainThread());
+
+ auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); });
+
+ if (mAborted) {
+ return NS_OK;
+ }
+
+ bool success;
+ float confidence = 0;
+ Json::Value root;
+ Json::CharReaderBuilder builder;
+ bool parsingSuccessful;
+ nsAutoCString result;
+ nsAutoCString hypoValue;
+ nsAutoCString errorMsg;
+ SpeechRecognitionErrorCode errorCode;
+
+ SR_LOG("STT Result: %s", mBuf.get());
+
+ if (NS_FAILED(aStatusCode)) {
+ success = false;
+ errorMsg.AssignLiteral("Error connecting to the service.");
+ errorCode = SpeechRecognitionErrorCode::Network;
+ } else {
+ success = true;
+ UniquePtr<Json::CharReader> const reader(builder.newCharReader());
+ parsingSuccessful =
+ reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr);
+ if (!parsingSuccessful) {
+ // there's an internal server error
+ success = false;
+ errorMsg.AssignLiteral("Internal server error");
+ errorCode = SpeechRecognitionErrorCode::Network;
+ } else {
+ result.Assign(root.get("status", "error").asString().c_str());
+ if (result.EqualsLiteral("ok")) {
+ // ok, we have a result
+ if (!root["data"].empty()) {
+ hypoValue.Assign(root["data"][0].get("text", "").asString().c_str());
+ confidence = root["data"][0].get("confidence", "0").asFloat();
+ } else {
+ success = false;
+ errorMsg.AssignLiteral("Error reading result data.");
+ errorCode = SpeechRecognitionErrorCode::Network;
+ }
+ } else {
+ success = false;
+ errorMsg.Assign(root.get("message", "").asString().c_str());
+ errorCode = SpeechRecognitionErrorCode::No_speech;
+ }
+ }
+ }
+
+ if (!success) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg);
+ } else {
+ // Declare javascript result events
+ RefPtr<SpeechEvent> event = new SpeechEvent(
+ mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
+ SpeechRecognitionResultList* resultList =
+ new SpeechRecognitionResultList(mRecognition);
+ SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition);
+
+ if (mRecognition->MaxAlternatives() > 0) {
+ SpeechRecognitionAlternative* alternative =
+ new SpeechRecognitionAlternative(mRecognition);
+
+ alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue);
+ alternative->mConfidence = confidence;
+
+ result->mItems.AppendElement(alternative);
+ }
+ resultList->mItems.AppendElement(result);
+
+ event->mRecognitionResultList = resultList;
+ NS_DispatchToMainThread(event);
+ }
+
+ return NS_OK;
+}
+
+OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default;
+OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default;
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Initialize(
+ WeakPtr<SpeechRecognition> aSpeechRecognition) {
+ MOZ_ASSERT(NS_IsMainThread());
+ mWriter = MakeUnique<OggWriter>();
+ mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>(
+ "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition);
+ mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding();
+ MOZ_ASSERT(mEncodeTaskQueue);
+ return NS_OK;
+}
+
+void OnlineSpeechRecognitionService::EncoderFinished() {
+ MOZ_ASSERT(!NS_IsMainThread());
+ MOZ_ASSERT(mEncodedAudioQueue.IsFinished());
+
+ while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) {
+ AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame});
+ DebugOnly<nsresult> rv =
+ mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream()
+ ? ContainerWriter::END_OF_STREAM
+ : 0);
+ MOZ_ASSERT(NS_SUCCEEDED(rv));
+ }
+
+ mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED);
+ MOZ_ASSERT(mWriter->IsWritingComplete());
+
+ NS_DispatchToMainThread(
+ NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this,
+ &OnlineSpeechRecognitionService::DoSTT));
+}
+
+void OnlineSpeechRecognitionService::EncoderInitialized() {
+ MOZ_ASSERT(!NS_IsMainThread());
+ AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata;
+ metadata.AppendElement(mAudioEncoder->GetMetadata());
+ if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) {
+ SR_LOG("wrong meta data type!");
+ MOZ_ASSERT_UNREACHABLE();
+ }
+
+ nsresult rv = mWriter->SetMetadata(metadata);
+ MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+
+ rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER);
+ MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+
+ Unused << rv;
+}
+
+void OnlineSpeechRecognitionService::EncoderError() {
+ MOZ_ASSERT(!NS_IsMainThread());
+ SR_LOG("Error encoding frames.");
+ mEncodedData.Clear();
+ NS_DispatchToMainThread(NS_NewRunnableFunction(
+ "SpeechRecognition::DispatchError",
+ [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+ if (!mRecognition) {
+ return;
+ }
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Audio_capture, "Encoder error");
+ }));
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
+ int32_t aSampleRate) {
+ MOZ_ASSERT(!NS_IsMainThread());
+ int64_t duration = aAudioSegment->GetDuration();
+ if (duration <= 0) {
+ return NS_OK;
+ }
+
+ if (!mAudioEncoder) {
+ mSpeechEncoderListener = new SpeechEncoderListener(this);
+ mAudioEncoder =
+ MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue);
+ RefPtr<AbstractThread> mEncoderThread = AbstractThread::GetCurrent();
+ mAudioEncoder->SetWorkerThread(mEncoderThread);
+ mAudioEncoder->RegisterListener(mSpeechEncoderListener);
+ }
+
+ mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment));
+
+ TimeStamp now = TimeStamp::Now();
+ if (mFirstIteration.IsNull()) {
+ mFirstIteration = now;
+ }
+
+ if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) {
+ NS_DispatchToMainThread(NS_NewRunnableFunction(
+ "SpeechRecognition::Stop",
+ [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+ if (!mRecognition) {
+ return;
+ }
+ mRecognition->Stop();
+ }));
+
+ return NS_OK;
+ }
+
+ return NS_OK;
+}
+
+void OnlineSpeechRecognitionService::DoSTT() {
+ MOZ_ASSERT(NS_IsMainThread());
+
+ if (mAborted) {
+ return;
+ }
+
+ nsresult rv;
+ nsCOMPtr<nsIChannel> chan;
+ nsCOMPtr<nsIURI> uri;
+ nsAutoCString speechRecognitionEndpoint;
+ nsAutoCString prefEndpoint;
+ nsAutoString language;
+
+ Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT,
+ prefEndpoint);
+
+ if (!prefEndpoint.IsEmpty()) {
+ speechRecognitionEndpoint = prefEndpoint;
+ } else {
+ speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT;
+ }
+
+ rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr,
+ nullptr);
+ if (NS_WARN_IF(NS_FAILED(rv))) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Network, "Unknown URI");
+ return;
+ }
+
+ nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT;
+ nsLoadFlags loadFlags =
+ nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER;
+ nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER;
+
+ nsPIDOMWindowInner* window = mRecognition->GetOwner();
+ if (NS_WARN_IF(!window)) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Aborted, "No window");
+ return;
+ }
+
+ Document* doc = window->GetExtantDoc();
+ if (NS_WARN_IF(!doc)) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Aborted, "No document");
+ }
+ rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags,
+ contentPolicy, nullptr, nullptr, nullptr, nullptr,
+ loadFlags);
+ if (NS_WARN_IF(NS_FAILED(rv))) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Network, "Failed to open channel");
+ return;
+ }
+
+ nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan);
+ if (httpChan) {
+ rv = httpChan->SetRequestMethod("POST"_ns);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ }
+
+ if (httpChan) {
+ mRecognition->GetLang(language);
+ // Accept-Language-STT is a custom header of our backend server used to set
+ // the language of the speech sample being submitted by the client
+ rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns,
+ NS_ConvertUTF16toUTF8(language), false);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ // Tell the server to not store the transcription by default
+ rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ // Tell the server to not store the sample by default
+ rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ // Set the product tag as teh web speech api
+ rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ }
+
+ nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan));
+ if (cos) {
+ cos->AddClassFlags(nsIClassOfService::UrgentStart);
+ }
+
+ nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan);
+ if (uploadChan) {
+ nsCOMPtr<nsIInputStream> bodyStream;
+ uint32_t length = 0;
+ for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+ length += chunk.Length();
+ }
+
+ nsTArray<uint8_t> audio;
+ if (!audio.SetCapacity(length, fallible)) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Audio_capture, "Allocation error");
+ return;
+ }
+
+ for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+ audio.AppendElements(chunk);
+ }
+
+ mEncodedData.Clear();
+
+ rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio));
+ if (NS_WARN_IF(NS_FAILED(rv))) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Network, "Failed to open stream");
+ return;
+ }
+ if (bodyStream) {
+ rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns,
+ length, "POST"_ns, false);
+ MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+ }
+ }
+
+ rv = chan->AsyncOpen(this);
+ if (NS_WARN_IF(NS_FAILED(rv))) {
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Network, "Internal server error");
+ }
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::SoundEnd() {
+ MOZ_ASSERT(NS_IsMainThread());
+
+ if (!mEncodeTaskQueue) {
+ // Not initialized
+ return NS_OK;
+ }
+
+ nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction(
+ "OnlineSpeechRecognitionService::SoundEnd",
+ [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+ if (mAudioEncoder) {
+ mAudioEncoder->NotifyEndOfStream();
+ mAudioEncoder->UnregisterListener(mSpeechEncoderListener);
+ mSpeechEncoderListener = nullptr;
+ mAudioEncoder = nullptr;
+ EncoderFinished();
+ }
+ }));
+ MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+ Unused << rv;
+
+ mEncodeTaskQueue = nullptr;
+
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ValidateAndSetGrammarList(
+ SpeechGrammar* aSpeechGrammar,
+ nsISpeechGrammarCompilationCallback* aCallback) {
+ // This is an online LVCSR (STT) service,
+ // so we don't need to set a grammar
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Abort() {
+ MOZ_ASSERT(NS_IsMainThread());
+ if (mAborted) {
+ return NS_OK;
+ }
+ mAborted = true;
+ return SoundEnd();
+}
+} // namespace mozilla