Diffstat (limited to 'dom/media/webspeech/recognition/SpeechRecognition.cpp')
-rw-r--r-- | dom/media/webspeech/recognition/SpeechRecognition.cpp | 1170
1 file changed, 1170 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp
new file mode 100644
index 0000000000..75d1ba7709
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp
@@ -0,0 +1,1170 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/AudioStreamTrack.h"
#include "mozilla/dom/BindingUtils.h"
#include "mozilla/dom/Element.h"
#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/dom/MediaStreamError.h"
#include "mozilla/dom/RootedDictionary.h"
#include "mozilla/dom/SpeechGrammar.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Preferences.h"
#include "mozilla/ResultVariant.h"
#include "mozilla/Services.h"
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/AbstractThread.h"
#include "VideoUtils.h"
#include "AudioSegment.h"
#include "MediaEnginePrefs.h"
#include "endpointer.h"

#include "mozilla/dom/SpeechRecognitionEvent.h"
#include "nsComponentManagerUtils.h"
#include "nsContentUtils.h"
#include "mozilla/dom/Document.h"
#include "nsIObserverService.h"
#include "nsIPermissionManager.h"
#include "nsIPrincipal.h"
#include "nsPIDOMWindow.h"
#include "nsServiceManagerUtils.h"
#include "nsQueryObject.h"
#include "SpeechTrackListener.h"

#include <algorithm>

// Undo the windows.h damage
#if defined(XP_WIN) && defined(GetMessage)
#  undef GetMessage
#endif

namespace mozilla::dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "online"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \
  "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \
  "media.webspeech.long_speech_length"
#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \
  "media.webspeech.recognition.timeout"

static const uint32_t kSAMPLE_RATE = 16000;

// Number of samples corresponding to 300ms of audio, sent to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, so kESTIMATION_SAMPLES = 300 * 16000 / 1000 =
// 4800 samples = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

LogModule* GetSpeechRecognitionLog() {
  static LazyLogModule sLog("SpeechRecognition");
  return sLog;
}
#define SR_LOG(...) \
  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

namespace {
class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
 public:
  SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition,
                                   const nsString& aName)
      : media::ShutdownBlocker(aName), mRecognition(aRecognition) {}

  NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
    MOZ_ASSERT(NS_IsMainThread());
    // AbortSilently will eventually clear the blocker.
    mRecognition->Abort();
    return NS_OK;
  }

 private:
  const RefPtr<SpeechRecognition> mRecognition;
};
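
// The blocker above keeps Gecko shutdown waiting until audio capture is torn
// down: BlockShutdown() calls Abort(), which runs AbortSilently() ->
// StopRecording(), and StopRecording() only removes the blocker once the
// track listener is gone and the encode task queue has finished shutting
// down (see StartRecording() and StopRecording() below).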

enum class ServiceCreationError {
  ServiceNotFound,
};

Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError>
CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow,
                               SpeechRecognition* aRecognition,
                               const nsAString& aLang) {
  nsAutoCString speechRecognitionServiceCID;

  nsAutoCString prefValue;
  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue);
  nsAutoCString speechRecognitionService;

  if (!prefValue.IsEmpty()) {
    speechRecognitionService = prefValue;
  } else {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  }

  if (StaticPrefs::media_webspeech_test_fake_recognition_service()) {
    speechRecognitionServiceCID =
        NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
  } else {
    speechRecognitionServiceCID =
        nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
        speechRecognitionService;
  }

  nsresult rv;
  nsCOMPtr<nsISpeechRecognitionService> recognitionService;
  recognitionService =
      do_CreateInstance(speechRecognitionServiceCID.get(), &rv);
  if (!recognitionService) {
    return Err(ServiceCreationError::ServiceNotFound);
  }

  return recognitionService;
}
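
// For concreteness (an illustration; NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX
// is assumed here to expand to "@mozilla.org/webspeech/service;1?name="):
// with the default pref value "online", CreateSpeechRecognitionService()
// instantiates the component registered under
// "@mozilla.org/webspeech/service;1?name=online", while setting
// media.webspeech.test.fake_recognition_service=true forces
// "@mozilla.org/webspeech/service;1?name=fake" regardless of the pref.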
}  // namespace

NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition,
                                            DOMEventTargetHelper, mStream,
                                            mTrack, mRecognitionService,
                                            mSpeechGrammarList)

NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
    : DOMEventTargetHelper(aOwnerWindow),
      mEndpointer(kSAMPLE_RATE),
      mAudioSamplesPerChunk(mEndpointer.FrameSize()),
      mSpeechDetectionTimer(NS_NewTimer()),
      mSpeechGrammarList(new SpeechGrammarList(GetOwner())),
      mContinuous(false),
      mInterimResults(false),
      mMaxAlternatives(1) {
  SR_LOG("created SpeechRecognition");

  if (StaticPrefs::media_webspeech_test_enable()) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH,
                          3 * 1000000));

  mSpeechDetectionTimeoutMs =
      Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000);

  Reset();
}

SpeechRecognition::~SpeechRecognition() = default;

bool SpeechRecognition::StateBetween(FSMState begin, FSMState end) {
  return mCurrentState >= begin && mCurrentState <= end;
}

void SpeechRecognition::SetState(FSMState state) {
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject* SpeechRecognition::WrapObject(JSContext* aCx,
                                        JS::Handle<JSObject*> aGivenProto) {
  return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto);
}

bool SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal) {
  nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);

  nsresult rv;
  nsCOMPtr<nsIPermissionManager> mgr =
      do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION;
  rv = mgr->TestExactPermissionFromPrincipal(principal, "speech-recognition"_ns,
                                             &speechRecognition);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  bool hasPermission =
      (speechRecognition == nsIPermissionManager::ALLOW_ACTION);

  return (hasPermission ||
          StaticPrefs::media_webspeech_recognition_force_enable() ||
          StaticPrefs::media_webspeech_test_enable()) &&
         StaticPrefs::media_webspeech_recognition_enable();
}

already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor(
    const GlobalObject& aGlobal, ErrorResult& aRv) {
  nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  RefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) {
  SR_LOG("Processing %s, current state is %s", GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // Ignore all events while aborting.
    return;
  }

  Transition(aEvent);
}
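
// The transition table below encodes the lifecycle of a session:
// STATE_IDLE -> STATE_STARTING (EVENT_START) -> STATE_ESTIMATING (first
// EVENT_AUDIO_DATA) -> STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING ->
// STATE_WAITING_FOR_RESULT -> STATE_IDLE again on
// EVENT_RECOGNITIONSERVICE_FINAL_RESULT, with errors and abort() funneling
// through AbortError()/AbortSilently() into STATE_ABORTING.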

void SpeechRecognition::Transition(SpeechEvent* aEvent) {
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          ResetAndEnd();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s",
                 GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s",
                 GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_ABORTING:
      switch (aEvent->mType) {
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent));
          MOZ_CRASH();
        default:
          MOZ_CRASH("Invalid event");
      }
      break;
    default:
      MOZ_CRASH("Invalid state");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
                                                TrackRate aTrackRate) {
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  // We need to call nsISpeechRecognitionService::ProcessAudioSegment
  // on a separate thread so that any eventual encoding or pre-processing
  // of the audio does not block the main thread.
  nsresult rv = mEncodeTaskQueue->Dispatch(
      NewRunnableMethod<StoreCopyPassByPtr<AudioSegment>, TrackRate>(
          "nsISpeechRecognitionService::ProcessAudioSegment",
          mRecognitionService,
          &nsISpeechRecognitionService::ProcessAudioSegment,
          std::move(*aSegment), aTrackRate));
  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
  Unused << rv;
  return samples;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
 ****************************************************************************/
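
// Rule 3 in practice: StartedAudioCapture() below dispatches "audiostart"
// and then re-checks mCurrentState before dispatching "start", because the
// "audiostart" handler may have spun the event loop and aborted the
// session, moving the FSM out of STATE_ESTIMATING in the meantime.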

void SpeechRecognition::Reset() {
  SetState(STATE_IDLE);

  // This breaks potential ref-cycles.
  mRecognitionService = nullptr;

  ++mStreamGeneration;
  if (mStream) {
    mStream->UnregisterTrackListener(this);
    mStream = nullptr;
  }
  mTrack = nullptr;
  mTrackIsOwned = false;
  mStopRecordingPromise = nullptr;
  mEncodeTaskQueue = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void SpeechRecognition::ResetAndEnd() {
  Reset();
  DispatchTrustedEvent(u"end"_ns);
}

void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) {
  SetState(STATE_STARTING);
}

void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) {
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples +=
      ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  DispatchTrustedEvent(u"audiostart"_ns);
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(u"start"_ns);
  }
}

void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) {
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");

  // This will run SoundEnd on the service just before StopRecording begins
  // shutting the encode thread down.
  mSpeechListener->mRemovedPromise->Then(
      GetCurrentSerialEventTarget(), __func__,
      [service = mRecognitionService] { service->SoundEnd(); });

  StopRecording();
}

void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) {
  SetState(STATE_ESTIMATING);

  mEstimationSamples +=
      ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) {
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(u"speechstart"_ns);
  }
}

void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) {
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(u"speechend"_ns);

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just inform
      // the service.
      StopRecordingAndRecognize(aEvent);
    }
  }
}
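
// Taken together, the handlers above implement the endpointer-driven
// pipeline: roughly 300ms of audio (kESTIMATION_SAMPLES) feeds environment
// estimation, the endpointer then switches to user-input mode, speech onset
// fires "speechstart", and once the endpointer reports
// speech_input_complete() the session winds down through
// StopRecordingAndRecognize().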

void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) {
  ResetAndEnd();

  RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
  init.mBubbles = true;
  init.mCancelable = false;
  // init.mResultIndex = 0;
  init.mResults = aEvent->mRecognitionResultList;
  init.mInterpretation = JS::NullValue();
  // init.mEmma = nullptr;

  RefPtr<SpeechRecognitionEvent> event =
      SpeechRecognitionEvent::Constructor(this, u"result"_ns, init);
  event->SetTrusted(true);

  DispatchEvent(*event);
}

void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {}

void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) {
  if (mRecognitionService) {
    if (mTrack) {
      // This will run Abort on the service just before StopRecording begins
      // shutting the encode thread down.
      mSpeechListener->mRemovedPromise->Then(
          GetCurrentSerialEventTarget(), __func__,
          [service = mRecognitionService] { service->Abort(); });
    } else {
      // Recording hasn't started yet. We can just call Abort().
      mRecognitionService->Abort();
    }
  }

  StopRecording()->Then(
      GetCurrentSerialEventTarget(), __func__,
      [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); });

  SetState(STATE_ABORTING);
}

void SpeechRecognition::AbortError(SpeechEvent* aEvent) {
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void SpeechRecognition::NotifyError(SpeechEvent* aEvent) {
  aEvent->mError->SetTrusted(true);

  DispatchEvent(*aEvent->mError);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) {
  // Hold a reference so that the underlying track doesn't get collected.
  mTrack = aTrack;
  MOZ_ASSERT(!mTrack->Ended());

  mSpeechListener = SpeechTrackListener::Create(this);
  mTrack->AddListener(mSpeechListener);

  nsString blockerName;
  blockerName.AppendPrintf("SpeechRecognition %p shutdown", this);
  mShutdownBlocker =
      MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName);
  media::MustGetShutdownBarrier()->AddBlocker(
      mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__,
      u"SpeechRecognition shutdown"_ns);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs,
                                     nsITimer::TYPE_ONE_SHOT);
}

RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() {
  if (!mTrack) {
    // Recording wasn't started, or has already been stopped.
    if (mStream) {
      // Ensure we don't start recording because a track became available
      // before we get reset.
      mStream->UnregisterTrackListener(this);
    }
    return GenericNonExclusivePromise::CreateAndResolve(true, __func__);
  }

  if (mStopRecordingPromise) {
    return mStopRecordingPromise;
  }

  mTrack->RemoveListener(mSpeechListener);
  if (mTrackIsOwned) {
    mTrack->Stop();
  }

  mEndpointer.EndSession();
  DispatchTrustedEvent(u"audioend"_ns);

  // Block shutdown until the speech track listener has been removed from the
  // MSG, as it holds a reference to us, and we reference the world, which we
  // don't want to leak.
  mStopRecordingPromise =
      mSpeechListener->mRemovedPromise
          ->Then(
              GetCurrentSerialEventTarget(), __func__,
              [self = RefPtr<SpeechRecognition>(this), this] {
                SR_LOG("Shutting down encoding thread");
                return mEncodeTaskQueue->BeginShutdown();
              },
              [] {
                MOZ_CRASH("Unexpected rejection");
                return ShutdownPromise::CreateAndResolve(false, __func__);
              })
          ->Then(
              GetCurrentSerialEventTarget(), __func__,
              [self = RefPtr<SpeechRecognition>(this), this] {
                media::MustGetShutdownBarrier()->RemoveBlocker(
                    mShutdownBlocker);
                mShutdownBlocker = nullptr;

                MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE);
                return GenericNonExclusivePromise::CreateAndResolve(true,
                                                                    __func__);
              },
              [] {
                MOZ_CRASH("Unexpected rejection");
                return GenericNonExclusivePromise::CreateAndResolve(false,
                                                                    __func__);
              });
  return mStopRecordingPromise;
}
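
// The two Then() stages above give teardown a strict order: the track
// listener is removed from the graph before the encode task queue shuts
// down (the listener feeds it), and the shutdown blocker is only released
// after both steps complete, so shutdown cannot proceed while a
// SpeechTrackListener still holds a reference to us.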
+ "AUDIO_ERROR test event"); + } else { + NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(), + "Got request for fake recognition service event, but " + "media.webspeech.test.fake_recognition_service is unset"); + + // let the fake recognition service handle the request + } +} + +already_AddRefed<SpeechGrammarList> SpeechRecognition::Grammars() const { + RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList; + return speechGrammarList.forget(); +} + +void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) { + mSpeechGrammarList = &aArg; +} + +void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; } + +void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; } + +bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const { + return mContinuous; +} + +void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) { + mContinuous = aArg; +} + +bool SpeechRecognition::InterimResults() const { return mInterimResults; } + +void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; } + +uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; } + +void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) { + mMaxAlternatives = aArg; +} + +void SpeechRecognition::GetServiceURI(nsString& aRetVal, + ErrorResult& aRv) const { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, + CallerType aCallerType, ErrorResult& aRv) { + if (mCurrentState != STATE_IDLE) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return; + } + + if (!SetRecognitionService(aRv)) { + return; + } + + if (!ValidateAndSetGrammarList(aRv)) { + return; + } + + mEncodeTaskQueue = + TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER), + "WebSpeechEncoderThread"); + + nsresult rv; + rv = mRecognitionService->Initialize(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + MediaStreamConstraints constraints; + constraints.mAudio.SetAsBoolean() = true; + + if (aStream.WasPassed()) { + mStream = &aStream.Value(); + mTrackIsOwned = false; + mStream->RegisterTrackListener(this); + nsTArray<RefPtr<AudioStreamTrack>> tracks; + mStream->GetAudioTracks(tracks); + for (const RefPtr<AudioStreamTrack>& track : tracks) { + if (!track->Ended()) { + NotifyTrackAdded(track); + break; + } + } + } else { + mTrackIsOwned = true; + nsPIDOMWindowInner* win = GetOwner(); + if (!win || !win->IsFullyActive()) { + aRv.ThrowInvalidStateError("The document is not fully active."); + return; + } + AutoNoJSAPI nojsapi; + RefPtr<SpeechRecognition> self(this); + MediaManager::Get() + ->GetUserMedia(win, constraints, aCallerType) + ->Then( + GetCurrentSerialEventTarget(), __func__, + [this, self, + generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) { + nsTArray<RefPtr<AudioStreamTrack>> tracks; + aStream->GetAudioTracks(tracks); + if (mAborted || mCurrentState != STATE_STARTING || + mStreamGeneration != generation) { + // We were probably aborted. Exit early. 

void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
                              CallerType aCallerType, ErrorResult& aRv) {
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  if (!SetRecognitionService(aRv)) {
    return;
  }

  if (!ValidateAndSetGrammarList(aRv)) {
    return;
  }

  mEncodeTaskQueue =
      TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER),
                        "WebSpeechEncoderThread");

  nsresult rv = mRecognitionService->Initialize(this);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return;
  }

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (aStream.WasPassed()) {
    mStream = &aStream.Value();
    mTrackIsOwned = false;
    mStream->RegisterTrackListener(this);
    nsTArray<RefPtr<AudioStreamTrack>> tracks;
    mStream->GetAudioTracks(tracks);
    for (const RefPtr<AudioStreamTrack>& track : tracks) {
      if (!track->Ended()) {
        NotifyTrackAdded(track);
        break;
      }
    }
  } else {
    mTrackIsOwned = true;
    nsPIDOMWindowInner* win = GetOwner();
    if (!win || !win->IsFullyActive()) {
      aRv.ThrowInvalidStateError("The document is not fully active.");
      return;
    }
    AutoNoJSAPI nojsapi;
    RefPtr<SpeechRecognition> self(this);
    MediaManager::Get()
        ->GetUserMedia(win, constraints, aCallerType)
        ->Then(
            GetCurrentSerialEventTarget(), __func__,
            [this, self,
             generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) {
              nsTArray<RefPtr<AudioStreamTrack>> tracks;
              aStream->GetAudioTracks(tracks);
              if (mAborted || mCurrentState != STATE_STARTING ||
                  mStreamGeneration != generation) {
                // We were probably aborted. Exit early.
                for (const RefPtr<AudioStreamTrack>& track : tracks) {
                  track->Stop();
                }
                return;
              }
              mStream = std::move(aStream);
              mStream->RegisterTrackListener(this);
              for (const RefPtr<AudioStreamTrack>& track : tracks) {
                if (!track->Ended()) {
                  NotifyTrackAdded(track);
                }
              }
            },
            [this, self,
             generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) {
              if (mAborted || mCurrentState != STATE_STARTING ||
                  mStreamGeneration != generation) {
                // We were probably aborted. Exit early.
                return;
              }
              SpeechRecognitionErrorCode errorCode;

              if (error->mName == MediaMgrError::Name::NotAllowedError) {
                errorCode = SpeechRecognitionErrorCode::Not_allowed;
              } else {
                errorCode = SpeechRecognitionErrorCode::Audio_capture;
              }
              DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                            error->mMessage);
            });
  }

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {
  if (!GetOwner()) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  // See:
  // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
  nsAutoString lang;
  if (!mLang.IsEmpty()) {
    lang = mLang;
  } else {
    nsCOMPtr<Document> document = GetOwner()->GetExtantDoc();
    if (!document) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }
    nsCOMPtr<Element> element = document->GetRootElement();
    if (!element) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }

    // Fall back to the root element's lang attribute.
    element->GetLang(lang);
  }

  auto result = CreateSpeechRecognitionService(GetOwner(), this, lang);

  if (result.isErr()) {
    switch (result.unwrapErr()) {
      case ServiceCreationError::ServiceNotFound:
        aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
        break;
      default:
        MOZ_CRASH("Unknown error");
    }
    return false;
  }

  mRecognitionService = result.unwrap();
  MOZ_DIAGNOSTIC_ASSERT(mRecognitionService);
  return true;
}

bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) {
  if (!mSpeechGrammarList) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  uint32_t grammarListLength = mSpeechGrammarList->Length();
  for (uint32_t count = 0; count < grammarListLength; ++count) {
    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
    if (aRv.Failed()) {
      return false;
    }
    if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(
            speechGrammar.get(), nullptr))) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }
  }

  return true;
}

void SpeechRecognition::Stop() {
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void SpeechRecognition::Abort() {
  if (mAborted) {
    return;
  }

  mAborted = true;

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}
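
// Note that Stop() and Abort() are asynchronous: each merely enqueues a
// runnable carrying EVENT_STOP or EVENT_ABORT to the main thread, and
// Transition() decides how to react based on the state the session is in
// when the event is actually processed. Abort() additionally latches
// mAborted so that ProcessEvent() ignores everything else in the meantime.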

void SpeechRecognition::NotifyTrackAdded(
    const RefPtr<MediaStreamTrack>& aTrack) {
  if (mTrack) {
    return;
  }

  RefPtr<AudioStreamTrack> audioTrack = aTrack->AsAudioStreamTrack();
  if (!audioTrack) {
    return;
  }

  if (audioTrack->Ended()) {
    return;
  }

  StartRecording(audioTrack);
}

void SpeechRecognition::DispatchError(EventType aErrorType,
                                      SpeechRecognitionErrorCode aErrorCode,
                                      const nsACString& aMessage) {
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
                 aErrorType == EVENT_AUDIO_ERROR,
             "Invalid error type!");

  RefPtr<SpeechRecognitionError> srError =
      new SpeechRecognitionError(nullptr, nullptr, nullptr);

  srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode,
                                      aMessage);

  RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until mAudioSamplesPerChunk
 * samples are buffered.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                              uint32_t aSampleCount) {
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer);

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy =
      std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples);

  PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy);

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples
 * each. The chunks are stored in the array received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t SpeechRecognition::SplitSamplesBuffer(
    const int16_t* aSamplesBuffer, uint32_t aSampleCount,
    nsTArray<RefPtr<SharedBuffer>>& aResult) {
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    CheckedInt<size_t> bufferSize(sizeof(int16_t));
    bufferSize *= mAudioSamplesPerChunk;
    RefPtr<SharedBuffer> chunk = SharedBuffer::Create(bufferSize);

    PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart,
            mAudioSamplesPerChunk);

    aResult.AppendElement(chunk.forget());
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment* SpeechRecognition::CreateAudioSegment(
    nsTArray<RefPtr<SharedBuffer>>& aChunks) {
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    RefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    AutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
                          PRINCIPAL_HANDLE_NONE);
  }

  return segment;
}
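
// Worked example for the helpers above (hypothetical numbers; the real
// chunk size comes from Endpointer::FrameSize() at runtime): with
// mAudioSamplesPerChunk = 160, 100 samples already buffered, and 1000 new
// samples arriving, FillSamplesBuffer() copies 60 samples to complete the
// pending chunk, SplitSamplesBuffer() then cuts five full chunks (800
// samples), and the remaining 140 samples are buffered for the next call.
// FeedAudioData() below drives exactly this sequence.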

void SpeechRecognition::FeedAudioData(
    nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
    already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration,
    MediaTrackListener* aProvider, TrackRate aTrackRate) {
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // Ensure aSamples is deleted.
  RefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;

  // Fill up our buffer and make a chunk out of it, if possible.
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
      mBufferedSamples = 0;
    }
  }

  // Create sample chunks of correct size.
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex, chunksToSend);
  }

  // Buffer remaining samples.
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    CheckedInt<size_t> bufferSize(sizeof(int16_t));
    bufferSize *= mAudioSamplesPerChunk;
    mAudioSamplesBuffer = SharedBuffer::Create(bufferSize);

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  RefPtr<SpeechEvent> event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  event->mTrackRate = aTrackRate;
  NS_DispatchToMainThread(event);
}

const char* SpeechRecognition::GetName(FSMState aId) {
  static const char* names[] = {
      "STATE_IDLE",        "STATE_STARTING",
      "STATE_ESTIMATING",  "STATE_WAITING_FOR_SPEECH",
      "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT",
      "STATE_ABORTING",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char* SpeechRecognition::GetName(SpeechEvent* aEvent) {
  static const char* names[] = {"EVENT_START",
                                "EVENT_STOP",
                                "EVENT_ABORT",
                                "EVENT_AUDIO_DATA",
                                "EVENT_AUDIO_ERROR",
                                "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
                                "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
                                "EVENT_RECOGNITIONSERVICE_ERROR"};

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const {
  MOZ_ASSERT(NS_IsMainThread());
  return mEncodeTaskQueue;
}

SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition,
                         SpeechRecognition::EventType aType)
    : Runnable("dom::SpeechEvent"),
      mAudioSegment(nullptr),
      mRecognitionResultList(nullptr),
      mError(nullptr),
      mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>(
          "SpeechEvent::SpeechEvent", aRecognition)),
      mType(aType),
      mTrackRate(0) {}

SpeechEvent::SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
                         SpeechRecognition::EventType aType)
    : Runnable("dom::SpeechEvent"),
      mAudioSegment(nullptr),
      mRecognitionResultList(nullptr),
      mError(nullptr),
      mRecognition(aRecognition),
      mType(aType),
      mTrackRate(0) {}

SpeechEvent::~SpeechEvent() { delete mAudioSegment; }

NS_IMETHODIMP
SpeechEvent::Run() {
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

}  // namespace mozilla::dom
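
// Minimal sketch of driving this class from C++ (a hypothetical caller;
// real consumers reach it through the WebIDL bindings as
// `new SpeechRecognition()` in JS):
//
//   IgnoredErrorResult rv;
//   RefPtr<SpeechRecognition> sr = SpeechRecognition::Constructor(global, rv);
//   sr->SetLang(u"en-US"_ns);
//   // No stream passed: Start() requests microphone access itself.
//   sr->Start(Optional<NonNull<DOMMediaStream>>(), CallerType::System, rv);
//   // ... later, to get a final result:
//   sr->Stop();  // enqueues EVENT_STOP; "result" and "end" follow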