/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim:set ts=2 sw=2 sts=2 et cindent: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef mozilla_dom_SpeechRecognition_h #define mozilla_dom_SpeechRecognition_h #include "mozilla/Attributes.h" #include "mozilla/DOMEventTargetHelper.h" #include "nsCOMPtr.h" #include "nsString.h" #include "nsWrapperCache.h" #include "nsTArray.h" #include "js/TypeDecls.h" #include "nsProxyRelease.h" #include "DOMMediaStream.h" #include "nsITimer.h" #include "MediaTrackGraph.h" #include "AudioSegment.h" #include "mozilla/WeakPtr.h" #include "SpeechGrammarList.h" #include "SpeechRecognitionResultList.h" #include "nsISpeechRecognitionService.h" #include "endpointer.h" #include "mozilla/dom/BindingDeclarations.h" #include "mozilla/dom/SpeechRecognitionError.h" namespace mozilla { namespace media { class ShutdownBlocker; } namespace dom { #define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \ "SpeechRecognitionTest:RequestEvent" #define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End" class GlobalObject; class AudioStreamTrack; class SpeechEvent; class SpeechTrackListener; LogModule* GetSpeechRecognitionLog(); #define SR_LOG(...) \ MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) class SpeechRecognition final : public DOMEventTargetHelper, public nsIObserver, public DOMMediaStream::TrackListener, public SupportsWeakPtr { public: explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow); NS_DECL_ISUPPORTS_INHERITED NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper) NS_DECL_NSIOBSERVER JSObject* WrapObject(JSContext* aCx, JS::Handle aGivenProto) override; static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal); static already_AddRefed Constructor( const GlobalObject& aGlobal, ErrorResult& aRv); static already_AddRefed WebkitSpeechRecognition( const GlobalObject& aGlobal, ErrorResult& aRv) { return Constructor(aGlobal, aRv); } already_AddRefed Grammars() const; void SetGrammars(mozilla::dom::SpeechGrammarList& aArg); void GetLang(nsString& aRetVal) const; void SetLang(const nsAString& aArg); bool GetContinuous(ErrorResult& aRv) const; void SetContinuous(bool aArg, ErrorResult& aRv); bool InterimResults() const; void SetInterimResults(bool aArg); uint32_t MaxAlternatives() const; TaskQueue* GetTaskQueueForEncoding() const; void SetMaxAlternatives(uint32_t aArg); void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const; void SetServiceURI(const nsAString& aArg, ErrorResult& aRv); void Start(const Optional>& aStream, CallerType aCallerType, ErrorResult& aRv); void Stop(); void Abort(); IMPL_EVENT_HANDLER(audiostart) IMPL_EVENT_HANDLER(soundstart) IMPL_EVENT_HANDLER(speechstart) IMPL_EVENT_HANDLER(speechend) IMPL_EVENT_HANDLER(soundend) IMPL_EVENT_HANDLER(audioend) IMPL_EVENT_HANDLER(result) IMPL_EVENT_HANDLER(nomatch) IMPL_EVENT_HANDLER(error) IMPL_EVENT_HANDLER(start) IMPL_EVENT_HANDLER(end) enum EventType { EVENT_START, EVENT_STOP, EVENT_ABORT, EVENT_AUDIO_DATA, EVENT_AUDIO_ERROR, EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT, EVENT_RECOGNITIONSERVICE_FINAL_RESULT, EVENT_RECOGNITIONSERVICE_ERROR, EVENT_COUNT }; void NotifyTrackAdded(const RefPtr& aTrack) override; // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16. void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsACString& aMessage); template void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const char (&aMessage)[N]) { DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage)); } uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount); uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray>& aResult); AudioSegment* CreateAudioSegment(nsTArray>& aChunks); void FeedAudioData(nsMainThreadPtrHandle& aRecognition, already_AddRefed aSamples, uint32_t aDuration, MediaTrackListener* aProvider, TrackRate aTrackRate); friend class SpeechEvent; private: virtual ~SpeechRecognition(); enum FSMState { STATE_IDLE, STATE_STARTING, STATE_ESTIMATING, STATE_WAITING_FOR_SPEECH, STATE_RECOGNIZING, STATE_WAITING_FOR_RESULT, STATE_ABORTING, STATE_COUNT }; void SetState(FSMState state); bool StateBetween(FSMState begin, FSMState end); bool SetRecognitionService(ErrorResult& aRv); bool ValidateAndSetGrammarList(ErrorResult& aRv); NS_IMETHOD StartRecording(RefPtr& aDOMStream); RefPtr StopRecording(); uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate); void NotifyError(SpeechEvent* aEvent); void ProcessEvent(SpeechEvent* aEvent); void Transition(SpeechEvent* aEvent); void Reset(); void ResetAndEnd(); void WaitForAudioData(SpeechEvent* aEvent); void StartedAudioCapture(SpeechEvent* aEvent); void StopRecordingAndRecognize(SpeechEvent* aEvent); void WaitForEstimation(SpeechEvent* aEvent); void DetectSpeech(SpeechEvent* aEvent); void WaitForSpeechEnd(SpeechEvent* aEvent); void NotifyFinalResult(SpeechEvent* aEvent); void DoNothing(SpeechEvent* aEvent); void AbortSilently(SpeechEvent* aEvent); void AbortError(SpeechEvent* aEvent); RefPtr mStream; RefPtr mTrack; bool mTrackIsOwned = false; RefPtr mStopRecordingPromise; RefPtr mSpeechListener; nsCOMPtr mRecognitionService; RefPtr mShutdownBlocker; // TaskQueue responsible for pre-processing the samples by the service // it runs in a separate thread from the main thread RefPtr mEncodeTaskQueue; // A generation ID of the MediaStream a started session is for, so that // a gUM request that resolves after the session has stopped, and a new // one has started, can exit early. Main thread only. Can wrap. uint8_t mStreamGeneration = 0; FSMState mCurrentState; Endpointer mEndpointer; uint32_t mEstimationSamples; uint32_t mAudioSamplesPerChunk; // maximum amount of seconds the engine will wait for voice // until returning a 'no speech detected' error uint32_t mSpeechDetectionTimeoutMs; // buffer holds one chunk of mAudioSamplesPerChunk // samples before feeding it to mEndpointer RefPtr mAudioSamplesBuffer; uint32_t mBufferedSamples; nsCOMPtr mSpeechDetectionTimer; bool mAborted; nsString mLang; RefPtr mSpeechGrammarList; // private flag used to hold if the user called the setContinuous() method // of the API bool mContinuous; // WebSpeechAPI (http://bit.ly/1gIl7DC) states: // // 1. Default value MUST be false // 2. If true, interim results SHOULD be returned // 3. If false, interim results MUST NOT be returned // // Pocketsphinx does not return interm results; so, defaulting // mInterimResults to false, then ignoring its subsequent value // is a conforming implementation. bool mInterimResults; // WebSpeechAPI (http://bit.ly/1JAiqeo) states: // // 1. Default value is 1 // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives // per result" // // Pocketsphinx can only return at maximum a single // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives // while for a 0 value returning no SpeechRecognitionAlternative per result is // a conforming implementation. uint32_t mMaxAlternatives; void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName); const char* GetName(FSMState aId); const char* GetName(SpeechEvent* aEvent); }; class SpeechEvent : public Runnable { public: SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType); SpeechEvent(nsMainThreadPtrHandle& aRecognition, SpeechRecognition::EventType aType); ~SpeechEvent(); NS_IMETHOD Run() override; AudioSegment* mAudioSegment; RefPtr mRecognitionResultList; // TODO: make this a session being passed which // also has index and stuff RefPtr mError; friend class SpeechRecognition; private: nsMainThreadPtrHandle mRecognition; // for AUDIO_DATA events, keep a reference to the provider // of the data (i.e., the SpeechTrackListener) to ensure it // is kept alive (and keeps SpeechRecognition alive) until this // event gets processed. RefPtr mProvider; SpeechRecognition::EventType mType; TrackRate mTrackRate; }; } // namespace dom inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) { return ToSupports(static_cast(aRec)); } } // namespace mozilla #endif