diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp')
-rw-r--r-- | dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp new file mode 100644 index 0000000000..e68ccc417e --- /dev/null +++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp @@ -0,0 +1,462 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsThreadUtils.h" +#include "nsXPCOMCIDInternal.h" +#include "OnlineSpeechRecognitionService.h" +#include "nsIFile.h" +#include "SpeechGrammar.h" +#include "SpeechRecognition.h" +#include "SpeechRecognitionAlternative.h" +#include "SpeechRecognitionResult.h" +#include "SpeechRecognitionResultList.h" +#include "nsIObserverService.h" +#include "mozilla/dom/Document.h" +#include "mozilla/Preferences.h" +#include "mozilla/ScopeExit.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/Services.h" +#include "nsDirectoryServiceDefs.h" +#include "nsDirectoryServiceUtils.h" +#include "nsNetUtil.h" +#include "nsContentUtils.h" +#include "nsIChannel.h" +#include "nsIHttpChannel.h" +#include "nsIPrincipal.h" +#include "nsIStreamListener.h" +#include "nsIUploadChannel2.h" +#include "mozilla/dom/ClientIPCTypes.h" +#include "nsStringStream.h" +#include "nsIOutputStream.h" +#include "nsStreamUtils.h" +#include "OpusTrackEncoder.h" +#include "OggWriter.h" +#include "nsIClassOfService.h" +#include <json/json.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +namespace mozilla { + +using namespace dom; + +#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \ + "media.webspeech.service.endpoint" +#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/" +#define MAX_LISTENING_TIME_MS 10000 + +NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService, + nsIStreamListener) + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure, + const char* aFromRawSegment, + uint32_t aToOffset, uint32_t aCount, + uint32_t* aWriteCount) { + nsCString* buf = static_cast<nsCString*>(aClosure); + buf->Append(aFromRawSegment, aCount); + *aWriteCount = aCount; + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest, + nsIInputStream* aInputStream, + uint64_t aOffset, + uint32_t aCount) { + MOZ_ASSERT(NS_IsMainThread()); + nsresult rv; + uint32_t readCount; + rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount, + &readCount); + NS_ENSURE_SUCCESS(rv, rv); + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest, + nsresult aStatusCode) { + MOZ_ASSERT(NS_IsMainThread()); + + auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); }); + + if (mAborted) { + return NS_OK; + } + + bool success; + float confidence = 0; + Json::Value root; + Json::CharReaderBuilder builder; + bool parsingSuccessful; + nsAutoCString result; + nsAutoCString hypoValue; + nsAutoCString errorMsg; + SpeechRecognitionErrorCode errorCode; + + SR_LOG("STT Result: %s", mBuf.get()); + + if (NS_FAILED(aStatusCode)) { + success = false; + errorMsg.AssignLiteral("Error connecting to the service."); + errorCode = SpeechRecognitionErrorCode::Network; + } else { + success = true; + UniquePtr<Json::CharReader> const reader(builder.newCharReader()); + parsingSuccessful = + reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr); + if (!parsingSuccessful) { + // there's an internal server error + success = false; + errorMsg.AssignLiteral("Internal server error"); + errorCode = SpeechRecognitionErrorCode::Network; + } else { + result.Assign(root.get("status", "error").asString().c_str()); + if (result.EqualsLiteral("ok")) { + // ok, we have a result + if (!root["data"].empty()) { + hypoValue.Assign(root["data"][0].get("text", "").asString().c_str()); + confidence = root["data"][0].get("confidence", "0").asFloat(); + } else { + success = false; + errorMsg.AssignLiteral("Error reading result data."); + errorCode = SpeechRecognitionErrorCode::Network; + } + } else { + success = false; + errorMsg.Assign(root.get("message", "").asString().c_str()); + errorCode = SpeechRecognitionErrorCode::No_speech; + } + } + } + + if (!success) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg); + } else { + // Declare javascript result events + RefPtr<SpeechEvent> event = new SpeechEvent( + mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); + SpeechRecognitionResultList* resultList = + new SpeechRecognitionResultList(mRecognition); + SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); + + if (mRecognition->MaxAlternatives() > 0) { + SpeechRecognitionAlternative* alternative = + new SpeechRecognitionAlternative(mRecognition); + + alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue); + alternative->mConfidence = confidence; + + result->mItems.AppendElement(alternative); + } + resultList->mItems.AppendElement(result); + + event->mRecognitionResultList = resultList; + NS_DispatchToMainThread(event); + } + + return NS_OK; +} + +OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default; +OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default; + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Initialize( + WeakPtr<SpeechRecognition> aSpeechRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + mWriter = MakeUnique<OggWriter>(); + mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>( + "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition); + mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding(); + MOZ_ASSERT(mEncodeTaskQueue); + return NS_OK; +} + +void OnlineSpeechRecognitionService::EncoderFinished() { + MOZ_ASSERT(!NS_IsMainThread()); + MOZ_ASSERT(mEncodedAudioQueue.IsFinished()); + + while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) { + AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame}); + DebugOnly<nsresult> rv = + mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream() + ? ContainerWriter::END_OF_STREAM + : 0); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + } + + mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED); + MOZ_ASSERT(mWriter->IsWritingComplete()); + + NS_DispatchToMainThread( + NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this, + &OnlineSpeechRecognitionService::DoSTT)); +} + +void OnlineSpeechRecognitionService::EncoderInitialized() { + MOZ_ASSERT(!NS_IsMainThread()); + AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata; + metadata.AppendElement(mAudioEncoder->GetMetadata()); + if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { + SR_LOG("wrong meta data type!"); + MOZ_ASSERT_UNREACHABLE(); + } + + nsresult rv = mWriter->SetMetadata(metadata); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + Unused << rv; +} + +void OnlineSpeechRecognitionService::EncoderError() { + MOZ_ASSERT(!NS_IsMainThread()); + SR_LOG("Error encoding frames."); + mEncodedData.Clear(); + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::DispatchError", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (!mRecognition) { + return; + } + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Encoder error"); + })); +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, + int32_t aSampleRate) { + MOZ_ASSERT(!NS_IsMainThread()); + int64_t duration = aAudioSegment->GetDuration(); + if (duration <= 0) { + return NS_OK; + } + + if (!mAudioEncoder) { + mSpeechEncoderListener = new SpeechEncoderListener(this); + mAudioEncoder = + MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue); + RefPtr<AbstractThread> mEncoderThread = AbstractThread::GetCurrent(); + mAudioEncoder->SetWorkerThread(mEncoderThread); + mAudioEncoder->RegisterListener(mSpeechEncoderListener); + } + + mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment)); + + TimeStamp now = TimeStamp::Now(); + if (mFirstIteration.IsNull()) { + mFirstIteration = now; + } + + if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) { + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::Stop", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (!mRecognition) { + return; + } + mRecognition->Stop(); + })); + + return NS_OK; + } + + return NS_OK; +} + +void OnlineSpeechRecognitionService::DoSTT() { + MOZ_ASSERT(NS_IsMainThread()); + + if (mAborted) { + return; + } + + nsresult rv; + nsCOMPtr<nsIChannel> chan; + nsCOMPtr<nsIURI> uri; + nsAutoCString speechRecognitionEndpoint; + nsAutoCString prefEndpoint; + nsAutoString language; + + Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT, + prefEndpoint); + + if (!prefEndpoint.IsEmpty()) { + speechRecognitionEndpoint = prefEndpoint; + } else { + speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT; + } + + rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr, + nullptr); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Unknown URI"); + return; + } + + nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT; + nsLoadFlags loadFlags = + nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER; + nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER; + + nsPIDOMWindowInner* window = mRecognition->GetOwner(); + if (NS_WARN_IF(!window)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No window"); + return; + } + + Document* doc = window->GetExtantDoc(); + if (NS_WARN_IF(!doc)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No document"); + } + rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags, + contentPolicy, nullptr, nullptr, nullptr, nullptr, + loadFlags); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open channel"); + return; + } + + nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan); + if (httpChan) { + rv = httpChan->SetRequestMethod("POST"_ns); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + if (httpChan) { + mRecognition->GetLang(language); + // Accept-Language-STT is a custom header of our backend server used to set + // the language of the speech sample being submitted by the client + rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns, + NS_ConvertUTF16toUTF8(language), false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the transcription by default + rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the sample by default + rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Set the product tag as teh web speech api + rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan)); + if (cos) { + cos->AddClassFlags(nsIClassOfService::UrgentStart); + } + + nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan); + if (uploadChan) { + nsCOMPtr<nsIInputStream> bodyStream; + uint32_t length = 0; + for (const nsTArray<uint8_t>& chunk : mEncodedData) { + length += chunk.Length(); + } + + nsTArray<uint8_t> audio; + if (!audio.SetCapacity(length, fallible)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Allocation error"); + return; + } + + for (const nsTArray<uint8_t>& chunk : mEncodedData) { + audio.AppendElements(chunk); + } + + mEncodedData.Clear(); + + rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio)); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open stream"); + return; + } + if (bodyStream) { + rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns, + length, "POST"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + } + + rv = chan->AsyncOpen(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Internal server error"); + } +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::SoundEnd() { + MOZ_ASSERT(NS_IsMainThread()); + + if (!mEncodeTaskQueue) { + // Not initialized + return NS_OK; + } + + nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( + "OnlineSpeechRecognitionService::SoundEnd", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (mAudioEncoder) { + mAudioEncoder->NotifyEndOfStream(); + mAudioEncoder->UnregisterListener(mSpeechEncoderListener); + mSpeechEncoderListener = nullptr; + mAudioEncoder = nullptr; + EncoderFinished(); + } + })); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + Unused << rv; + + mEncodeTaskQueue = nullptr; + + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::ValidateAndSetGrammarList( + SpeechGrammar* aSpeechGrammar, + nsISpeechGrammarCompilationCallback* aCallback) { + // This is an online LVCSR (STT) service, + // so we don't need to set a grammar + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Abort() { + MOZ_ASSERT(NS_IsMainThread()); + if (mAborted) { + return NS_OK; + } + mAborted = true; + return SoundEnd(); +} +} // namespace mozilla |