/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim:set ts=2 sw=2 sts=2 et cindent: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "nsThreadUtils.h" #include "nsXPCOMCIDInternal.h" #include "OnlineSpeechRecognitionService.h" #include "nsIFile.h" #include "SpeechGrammar.h" #include "SpeechRecognition.h" #include "SpeechRecognitionAlternative.h" #include "SpeechRecognitionResult.h" #include "SpeechRecognitionResultList.h" #include "nsIObserverService.h" #include "mozilla/dom/Document.h" #include "mozilla/Preferences.h" #include "mozilla/ScopeExit.h" #include "mozilla/StaticPrefs_media.h" #include "mozilla/Services.h" #include "nsDirectoryServiceDefs.h" #include "nsDirectoryServiceUtils.h" #include "nsNetUtil.h" #include "nsContentUtils.h" #include "nsIChannel.h" #include "nsIHttpChannel.h" #include "nsIPrincipal.h" #include "nsIStreamListener.h" #include "nsIUploadChannel2.h" #include "mozilla/dom/ClientIPCTypes.h" #include "nsStringStream.h" #include "nsIOutputStream.h" #include "nsStreamUtils.h" #include "OpusTrackEncoder.h" #include "OggWriter.h" #include "nsIClassOfService.h" #include #include #include #include namespace mozilla { using namespace dom; #define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \ "media.webspeech.service.endpoint" #define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/" #define MAX_LISTENING_TIME_MS 10000 NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService, nsIStreamListener) NS_IMETHODIMP OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) { MOZ_ASSERT(NS_IsMainThread()); return NS_OK; } static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure, const char* aFromRawSegment, uint32_t aToOffset, uint32_t aCount, uint32_t* aWriteCount) { nsCString* buf = static_cast(aClosure); buf->Append(aFromRawSegment, aCount); *aWriteCount = aCount; return NS_OK; } NS_IMETHODIMP OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInputStream, uint64_t aOffset, uint32_t aCount) { MOZ_ASSERT(NS_IsMainThread()); nsresult rv; uint32_t readCount; rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount, &readCount); NS_ENSURE_SUCCESS(rv, rv); return NS_OK; } NS_IMETHODIMP OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest, nsresult aStatusCode) { MOZ_ASSERT(NS_IsMainThread()); auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); }); if (mAborted) { return NS_OK; } bool success; float confidence = 0; Json::Value root; Json::CharReaderBuilder builder; bool parsingSuccessful; nsAutoCString result; nsAutoCString hypoValue; nsAutoCString errorMsg; SpeechRecognitionErrorCode errorCode; SR_LOG("STT Result: %s", mBuf.get()); if (NS_FAILED(aStatusCode)) { success = false; errorMsg.AssignLiteral("Error connecting to the service."); errorCode = SpeechRecognitionErrorCode::Network; } else { success = true; UniquePtr const reader(builder.newCharReader()); parsingSuccessful = reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr); if (!parsingSuccessful) { // there's an internal server error success = false; errorMsg.AssignLiteral("Internal server error"); errorCode = SpeechRecognitionErrorCode::Network; } else { result.Assign(root.get("status", "error").asString().c_str()); if (result.EqualsLiteral("ok")) { // ok, we have a result if (!root["data"].empty()) { hypoValue.Assign(root["data"][0].get("text", "").asString().c_str()); confidence = root["data"][0].get("confidence", "0").asFloat(); } else { success = false; errorMsg.AssignLiteral("Error reading result data."); errorCode = SpeechRecognitionErrorCode::Network; } } else { success = false; errorMsg.Assign(root.get("message", "").asString().c_str()); errorCode = SpeechRecognitionErrorCode::No_speech; } } } if (!success) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg); } else { // Declare javascript result events RefPtr event = new SpeechEvent( mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); SpeechRecognitionResultList* resultList = new SpeechRecognitionResultList(mRecognition); SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); if (mRecognition->MaxAlternatives() > 0) { SpeechRecognitionAlternative* alternative = new SpeechRecognitionAlternative(mRecognition); alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue); alternative->mConfidence = confidence; result->mItems.AppendElement(alternative); } resultList->mItems.AppendElement(result); event->mRecognitionResultList = resultList; NS_DispatchToMainThread(event); } return NS_OK; } OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default; OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default; NS_IMETHODIMP OnlineSpeechRecognitionService::Initialize( WeakPtr aSpeechRecognition) { MOZ_ASSERT(NS_IsMainThread()); mWriter = MakeUnique(); mRecognition = new nsMainThreadPtrHolder( "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition); mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding(); MOZ_ASSERT(mEncodeTaskQueue); return NS_OK; } void OnlineSpeechRecognitionService::EncoderFinished() { MOZ_ASSERT(!NS_IsMainThread()); MOZ_ASSERT(mEncodedAudioQueue.IsFinished()); while (RefPtr frame = mEncodedAudioQueue.PopFront()) { AutoTArray, 1> frames({frame}); DebugOnly rv = mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream() ? ContainerWriter::END_OF_STREAM : 0); MOZ_ASSERT(NS_SUCCEEDED(rv)); } mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED); MOZ_ASSERT(mWriter->IsWritingComplete()); NS_DispatchToMainThread( NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this, &OnlineSpeechRecognitionService::DoSTT)); } void OnlineSpeechRecognitionService::EncoderInitialized() { MOZ_ASSERT(!NS_IsMainThread()); AutoTArray, 1> metadata; metadata.AppendElement(mAudioEncoder->GetMetadata()); if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { SR_LOG("wrong meta data type!"); MOZ_ASSERT_UNREACHABLE(); } nsresult rv = mWriter->SetMetadata(metadata); MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER); MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); Unused << rv; } void OnlineSpeechRecognitionService::EncoderError() { MOZ_ASSERT(!NS_IsMainThread()); SR_LOG("Error encoding frames."); mEncodedData.Clear(); NS_DispatchToMainThread(NS_NewRunnableFunction( "SpeechRecognition::DispatchError", [this, self = RefPtr(this)]() { if (!mRecognition) { return; } mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Audio_capture, "Encoder error"); })); } NS_IMETHODIMP OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, int32_t aSampleRate) { MOZ_ASSERT(!NS_IsMainThread()); int64_t duration = aAudioSegment->GetDuration(); if (duration <= 0) { return NS_OK; } if (!mAudioEncoder) { mSpeechEncoderListener = new SpeechEncoderListener(this); mAudioEncoder = MakeUnique(aSampleRate, mEncodedAudioQueue); RefPtr mEncoderThread = AbstractThread::GetCurrent(); mAudioEncoder->SetWorkerThread(mEncoderThread); mAudioEncoder->RegisterListener(mSpeechEncoderListener); } mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment)); TimeStamp now = TimeStamp::Now(); if (mFirstIteration.IsNull()) { mFirstIteration = now; } if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) { NS_DispatchToMainThread(NS_NewRunnableFunction( "SpeechRecognition::Stop", [this, self = RefPtr(this)]() { if (!mRecognition) { return; } mRecognition->Stop(); })); return NS_OK; } return NS_OK; } void OnlineSpeechRecognitionService::DoSTT() { MOZ_ASSERT(NS_IsMainThread()); if (mAborted) { return; } nsresult rv; nsCOMPtr chan; nsCOMPtr uri; nsAutoCString speechRecognitionEndpoint; nsAutoCString prefEndpoint; nsAutoString language; Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT, prefEndpoint); if (!prefEndpoint.IsEmpty()) { speechRecognitionEndpoint = prefEndpoint; } else { speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT; } rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr, nullptr); if (NS_WARN_IF(NS_FAILED(rv))) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Network, "Unknown URI"); return; } nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT; nsLoadFlags loadFlags = nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER; nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER; nsPIDOMWindowInner* window = mRecognition->GetOwner(); if (NS_WARN_IF(!window)) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Aborted, "No window"); return; } Document* doc = window->GetExtantDoc(); if (NS_WARN_IF(!doc)) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Aborted, "No document"); } rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags, contentPolicy, nullptr, nullptr, nullptr, nullptr, loadFlags); if (NS_WARN_IF(NS_FAILED(rv))) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Network, "Failed to open channel"); return; } nsCOMPtr httpChan = do_QueryInterface(chan); if (httpChan) { rv = httpChan->SetRequestMethod("POST"_ns); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); } if (httpChan) { mRecognition->GetLang(language); // Accept-Language-STT is a custom header of our backend server used to set // the language of the speech sample being submitted by the client rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns, NS_ConvertUTF16toUTF8(language), false); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); // Tell the server to not store the transcription by default rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); // Tell the server to not store the sample by default rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); // Set the product tag as teh web speech api rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); } nsCOMPtr cos(do_QueryInterface(chan)); if (cos) { cos->AddClassFlags(nsIClassOfService::UrgentStart); } nsCOMPtr uploadChan = do_QueryInterface(chan); if (uploadChan) { nsCOMPtr bodyStream; uint32_t length = 0; for (const nsTArray& chunk : mEncodedData) { length += chunk.Length(); } nsTArray audio; if (!audio.SetCapacity(length, fallible)) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Audio_capture, "Allocation error"); return; } for (const nsTArray& chunk : mEncodedData) { audio.AppendElements(chunk); } mEncodedData.Clear(); rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio)); if (NS_WARN_IF(NS_FAILED(rv))) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Network, "Failed to open stream"); return; } if (bodyStream) { rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns, length, "POST"_ns, false); MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); } } rv = chan->AsyncOpen(this); if (NS_WARN_IF(NS_FAILED(rv))) { mRecognition->DispatchError( SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, SpeechRecognitionErrorCode::Network, "Internal server error"); } } NS_IMETHODIMP OnlineSpeechRecognitionService::SoundEnd() { MOZ_ASSERT(NS_IsMainThread()); if (!mEncodeTaskQueue) { // Not initialized return NS_OK; } nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( "OnlineSpeechRecognitionService::SoundEnd", [this, self = RefPtr(this)]() { if (mAudioEncoder) { mAudioEncoder->NotifyEndOfStream(); mAudioEncoder->UnregisterListener(mSpeechEncoderListener); mSpeechEncoderListener = nullptr; mAudioEncoder = nullptr; EncoderFinished(); } })); MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); Unused << rv; mEncodeTaskQueue = nullptr; return NS_OK; } NS_IMETHODIMP OnlineSpeechRecognitionService::ValidateAndSetGrammarList( SpeechGrammar* aSpeechGrammar, nsISpeechGrammarCompilationCallback* aCallback) { // This is an online LVCSR (STT) service, // so we don't need to set a grammar return NS_OK; } NS_IMETHODIMP OnlineSpeechRecognitionService::Abort() { MOZ_ASSERT(NS_IsMainThread()); if (mAborted) { return NS_OK; } mAborted = true; return SoundEnd(); } } // namespace mozilla