diff options
Diffstat (limited to 'dom/media/webspeech/recognition')
49 files changed, 5440 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp new file mode 100644 index 0000000000..e68ccc417e --- /dev/null +++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp @@ -0,0 +1,462 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsThreadUtils.h" +#include "nsXPCOMCIDInternal.h" +#include "OnlineSpeechRecognitionService.h" +#include "nsIFile.h" +#include "SpeechGrammar.h" +#include "SpeechRecognition.h" +#include "SpeechRecognitionAlternative.h" +#include "SpeechRecognitionResult.h" +#include "SpeechRecognitionResultList.h" +#include "nsIObserverService.h" +#include "mozilla/dom/Document.h" +#include "mozilla/Preferences.h" +#include "mozilla/ScopeExit.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/Services.h" +#include "nsDirectoryServiceDefs.h" +#include "nsDirectoryServiceUtils.h" +#include "nsNetUtil.h" +#include "nsContentUtils.h" +#include "nsIChannel.h" +#include "nsIHttpChannel.h" +#include "nsIPrincipal.h" +#include "nsIStreamListener.h" +#include "nsIUploadChannel2.h" +#include "mozilla/dom/ClientIPCTypes.h" +#include "nsStringStream.h" +#include "nsIOutputStream.h" +#include "nsStreamUtils.h" +#include "OpusTrackEncoder.h" +#include "OggWriter.h" +#include "nsIClassOfService.h" +#include <json/json.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +namespace mozilla { + +using namespace dom; + +#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \ + "media.webspeech.service.endpoint" +#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/" +#define MAX_LISTENING_TIME_MS 10000 + 
+NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService, + nsIStreamListener) + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure, + const char* aFromRawSegment, + uint32_t aToOffset, uint32_t aCount, + uint32_t* aWriteCount) { + nsCString* buf = static_cast<nsCString*>(aClosure); + buf->Append(aFromRawSegment, aCount); + *aWriteCount = aCount; + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest, + nsIInputStream* aInputStream, + uint64_t aOffset, + uint32_t aCount) { + MOZ_ASSERT(NS_IsMainThread()); + nsresult rv; + uint32_t readCount; + rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount, + &readCount); + NS_ENSURE_SUCCESS(rv, rv); + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest, + nsresult aStatusCode) { + MOZ_ASSERT(NS_IsMainThread()); + + auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); }); + + if (mAborted) { + return NS_OK; + } + + bool success; + float confidence = 0; + Json::Value root; + Json::CharReaderBuilder builder; + bool parsingSuccessful; + nsAutoCString result; + nsAutoCString hypoValue; + nsAutoCString errorMsg; + SpeechRecognitionErrorCode errorCode; + + SR_LOG("STT Result: %s", mBuf.get()); + + if (NS_FAILED(aStatusCode)) { + success = false; + errorMsg.AssignLiteral("Error connecting to the service."); + errorCode = SpeechRecognitionErrorCode::Network; + } else { + success = true; + UniquePtr<Json::CharReader> const reader(builder.newCharReader()); + parsingSuccessful = + reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr); + if (!parsingSuccessful) { + // there's an internal server error + success = false; + errorMsg.AssignLiteral("Internal server error"); + errorCode = SpeechRecognitionErrorCode::Network; + 
} else { + result.Assign(root.get("status", "error").asString().c_str()); + if (result.EqualsLiteral("ok")) { + // ok, we have a result + if (!root["data"].empty()) { + hypoValue.Assign(root["data"][0].get("text", "").asString().c_str()); + confidence = root["data"][0].get("confidence", "0").asFloat(); + } else { + success = false; + errorMsg.AssignLiteral("Error reading result data."); + errorCode = SpeechRecognitionErrorCode::Network; + } + } else { + success = false; + errorMsg.Assign(root.get("message", "").asString().c_str()); + errorCode = SpeechRecognitionErrorCode::No_speech; + } + } + } + + if (!success) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg); + } else { + // Declare javascript result events + RefPtr<SpeechEvent> event = new SpeechEvent( + mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); + SpeechRecognitionResultList* resultList = + new SpeechRecognitionResultList(mRecognition); + SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); + + if (mRecognition->MaxAlternatives() > 0) { + SpeechRecognitionAlternative* alternative = + new SpeechRecognitionAlternative(mRecognition); + + alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue); + alternative->mConfidence = confidence; + + result->mItems.AppendElement(alternative); + } + resultList->mItems.AppendElement(result); + + event->mRecognitionResultList = resultList; + NS_DispatchToMainThread(event); + } + + return NS_OK; +} + +OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default; +OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default; + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Initialize( + WeakPtr<SpeechRecognition> aSpeechRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + mWriter = MakeUnique<OggWriter>(); + mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>( + "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition); + 
mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding(); + MOZ_ASSERT(mEncodeTaskQueue); + return NS_OK; +} + +void OnlineSpeechRecognitionService::EncoderFinished() { + MOZ_ASSERT(!NS_IsMainThread()); + MOZ_ASSERT(mEncodedAudioQueue.IsFinished()); + + while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) { + AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame}); + DebugOnly<nsresult> rv = + mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream() + ? ContainerWriter::END_OF_STREAM + : 0); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + } + + mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED); + MOZ_ASSERT(mWriter->IsWritingComplete()); + + NS_DispatchToMainThread( + NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this, + &OnlineSpeechRecognitionService::DoSTT)); +} + +void OnlineSpeechRecognitionService::EncoderInitialized() { + MOZ_ASSERT(!NS_IsMainThread()); + AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata; + metadata.AppendElement(mAudioEncoder->GetMetadata()); + if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { + SR_LOG("wrong meta data type!"); + MOZ_ASSERT_UNREACHABLE(); + } + + nsresult rv = mWriter->SetMetadata(metadata); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + Unused << rv; +} + +void OnlineSpeechRecognitionService::EncoderError() { + MOZ_ASSERT(!NS_IsMainThread()); + SR_LOG("Error encoding frames."); + mEncodedData.Clear(); + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::DispatchError", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (!mRecognition) { + return; + } + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Encoder error"); + })); +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, + 
int32_t aSampleRate) { + MOZ_ASSERT(!NS_IsMainThread()); + int64_t duration = aAudioSegment->GetDuration(); + if (duration <= 0) { + return NS_OK; + } + + if (!mAudioEncoder) { + mSpeechEncoderListener = new SpeechEncoderListener(this); + mAudioEncoder = + MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue); + RefPtr<AbstractThread> mEncoderThread = AbstractThread::GetCurrent(); + mAudioEncoder->SetWorkerThread(mEncoderThread); + mAudioEncoder->RegisterListener(mSpeechEncoderListener); + } + + mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment)); + + TimeStamp now = TimeStamp::Now(); + if (mFirstIteration.IsNull()) { + mFirstIteration = now; + } + + if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) { + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::Stop", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (!mRecognition) { + return; + } + mRecognition->Stop(); + })); + + return NS_OK; + } + + return NS_OK; +} + +void OnlineSpeechRecognitionService::DoSTT() { + MOZ_ASSERT(NS_IsMainThread()); + + if (mAborted) { + return; + } + + nsresult rv; + nsCOMPtr<nsIChannel> chan; + nsCOMPtr<nsIURI> uri; + nsAutoCString speechRecognitionEndpoint; + nsAutoCString prefEndpoint; + nsAutoString language; + + Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT, + prefEndpoint); + + if (!prefEndpoint.IsEmpty()) { + speechRecognitionEndpoint = prefEndpoint; + } else { + speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT; + } + + rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr, + nullptr); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Unknown URI"); + return; + } + + nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT; + nsLoadFlags loadFlags = + nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER; + 
nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER; + + nsPIDOMWindowInner* window = mRecognition->GetOwner(); + if (NS_WARN_IF(!window)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No window"); + return; + } + + Document* doc = window->GetExtantDoc(); + if (NS_WARN_IF(!doc)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No document"); + } + rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags, + contentPolicy, nullptr, nullptr, nullptr, nullptr, + loadFlags); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open channel"); + return; + } + + nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan); + if (httpChan) { + rv = httpChan->SetRequestMethod("POST"_ns); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + if (httpChan) { + mRecognition->GetLang(language); + // Accept-Language-STT is a custom header of our backend server used to set + // the language of the speech sample being submitted by the client + rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns, + NS_ConvertUTF16toUTF8(language), false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the transcription by default + rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the sample by default + rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Set the product tag as teh web speech api + rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan)); + if (cos) { + 
cos->AddClassFlags(nsIClassOfService::UrgentStart); + } + + nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan); + if (uploadChan) { + nsCOMPtr<nsIInputStream> bodyStream; + uint32_t length = 0; + for (const nsTArray<uint8_t>& chunk : mEncodedData) { + length += chunk.Length(); + } + + nsTArray<uint8_t> audio; + if (!audio.SetCapacity(length, fallible)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Allocation error"); + return; + } + + for (const nsTArray<uint8_t>& chunk : mEncodedData) { + audio.AppendElements(chunk); + } + + mEncodedData.Clear(); + + rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio)); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open stream"); + return; + } + if (bodyStream) { + rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns, + length, "POST"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + } + + rv = chan->AsyncOpen(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Internal server error"); + } +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::SoundEnd() { + MOZ_ASSERT(NS_IsMainThread()); + + if (!mEncodeTaskQueue) { + // Not initialized + return NS_OK; + } + + nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( + "OnlineSpeechRecognitionService::SoundEnd", + [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { + if (mAudioEncoder) { + mAudioEncoder->NotifyEndOfStream(); + mAudioEncoder->UnregisterListener(mSpeechEncoderListener); + mSpeechEncoderListener = nullptr; + mAudioEncoder = nullptr; + EncoderFinished(); + } + })); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + Unused << rv; + + mEncodeTaskQueue = nullptr; + + return NS_OK; +} + 
+NS_IMETHODIMP +OnlineSpeechRecognitionService::ValidateAndSetGrammarList( + SpeechGrammar* aSpeechGrammar, + nsISpeechGrammarCompilationCallback* aCallback) { + // This is an online LVCSR (STT) service, + // so we don't need to set a grammar + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Abort() { + MOZ_ASSERT(NS_IsMainThread()); + if (mAborted) { + return NS_OK; + } + mAborted = true; + return SoundEnd(); +} +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h new file mode 100644 index 0000000000..c049e5046a --- /dev/null +++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h @@ -0,0 +1,132 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_OnlineRecognitionService_h +#define mozilla_dom_OnlineRecognitionService_h + +#include "nsCOMPtr.h" +#include "nsTArray.h" +#include "nsISpeechRecognitionService.h" +#include "speex/speex_resampler.h" +#include "nsIStreamListener.h" +#include "OpusTrackEncoder.h" +#include "ContainerWriter.h" + +#define NS_ONLINE_SPEECH_RECOGNITION_SERVICE_CID \ + {0x0ff5ce56, \ + 0x5b09, \ + 0x4db8, \ + {0xad, 0xc6, 0x82, 0x66, 0xaf, 0x95, 0xf8, 0x64}}; + +namespace mozilla { + +namespace ipc { +class PrincipalInfo; +} // namespace ipc + +/** + * Online implementation of the nsISpeechRecognitionService interface + */ +class OnlineSpeechRecognitionService : public nsISpeechRecognitionService, + public nsIStreamListener { + public: + // Add XPCOM glue code + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSISPEECHRECOGNITIONSERVICE + NS_DECL_NSIREQUESTOBSERVER + NS_DECL_NSISTREAMLISTENER + + /** + * Listener responsible for handling the events raised by the TrackEncoder + */ + class SpeechEncoderListener : public TrackEncoderListener { + public: + explicit SpeechEncoderListener(OnlineSpeechRecognitionService* aService) + : mService(aService), mOwningThread(AbstractThread::GetCurrent()) {} + + void Started(TrackEncoder* aEncoder) override {} + + void Initialized(TrackEncoder* aEncoder) override { + MOZ_ASSERT(mOwningThread->IsCurrentThreadIn()); + mService->EncoderInitialized(); + } + + void Error(TrackEncoder* aEncoder) override { + MOZ_ASSERT(mOwningThread->IsCurrentThreadIn()); + mService->EncoderError(); + } + + private: + const RefPtr<OnlineSpeechRecognitionService> mService; + const RefPtr<AbstractThread> mOwningThread; + }; + + /** + * Default constructs a OnlineSpeechRecognitionService + */ + OnlineSpeechRecognitionService(); + + /** + * Called by SpeechEncoderListener when the AudioTrackEncoder has been + * initialized. 
+ */ + void EncoderInitialized(); + + /** + * Called after the AudioTrackEncoder has encoded all data for us to wrap in a + * container and pass along. + */ + void EncoderFinished(); + + /** + * Called by SpeechEncoderListener when the AudioTrackEncoder has + * encountered an error. + */ + void EncoderError(); + + private: + /** + * Private destructor to prevent bypassing of reference counting + */ + virtual ~OnlineSpeechRecognitionService(); + + /** The associated SpeechRecognition */ + nsMainThreadPtrHandle<dom::SpeechRecognition> mRecognition; + + /** + * Builds a mock SpeechRecognitionResultList + */ + dom::SpeechRecognitionResultList* BuildMockResultList(); + + /** + * Method responsible for uploading the audio to the remote endpoint + */ + void DoSTT(); + + // Encoded and packaged ogg audio data + nsTArray<nsTArray<uint8_t>> mEncodedData; + // Member responsible for holding a reference to the TrackEncoderListener + RefPtr<SpeechEncoderListener> mSpeechEncoderListener; + // MediaQueue fed encoded data by mAudioEncoder + MediaQueue<EncodedFrame> mEncodedAudioQueue; + // Encoder responsible for encoding the frames from pcm to opus which is the + // format supported by our backend + UniquePtr<AudioTrackEncoder> mAudioEncoder; + // Object responsible for wrapping the opus frames into an ogg container + UniquePtr<ContainerWriter> mWriter; + // Member responsible for storing the json string returned by the endpoint + nsCString mBuf; + // Used to calculate a ceiling on the time spent listening. 
+ TimeStamp mFirstIteration; + // flag responsible to control if the user choose to abort + bool mAborted = false; + // reference to the audio encoder queue + RefPtr<TaskQueue> mEncodeTaskQueue; +}; + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechGrammar.cpp b/dom/media/webspeech/recognition/SpeechGrammar.cpp new file mode 100644 index 0000000000..de6e9fa30f --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammar.cpp @@ -0,0 +1,57 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechGrammar.h" + +#include "mozilla/ErrorResult.h" +#include "mozilla/dom/SpeechGrammarBinding.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammar, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammar) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammar) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammar) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechGrammar::SpeechGrammar(nsISupports* aParent) : mParent(aParent) {} + +SpeechGrammar::~SpeechGrammar() = default; + +already_AddRefed<SpeechGrammar> SpeechGrammar::Constructor( + const GlobalObject& aGlobal) { + RefPtr<SpeechGrammar> speechGrammar = + new SpeechGrammar(aGlobal.GetAsSupports()); + return speechGrammar.forget(); +} + +nsISupports* SpeechGrammar::GetParentObject() const { return mParent; } + +JSObject* SpeechGrammar::WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) { + return SpeechGrammar_Binding::Wrap(aCx, this, aGivenProto); +} + +void SpeechGrammar::GetSrc(nsString& aRetVal, ErrorResult& aRv) const { + aRetVal = mSrc; +} + +void SpeechGrammar::SetSrc(const nsAString& aArg, 
ErrorResult& aRv) { + mSrc = aArg; +} + +float SpeechGrammar::GetWeight(ErrorResult& aRv) const { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return 0; +} + +void SpeechGrammar::SetWeight(float aArg, ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechGrammar.h b/dom/media/webspeech/recognition/SpeechGrammar.h new file mode 100644 index 0000000000..0dee1e9792 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammar.h @@ -0,0 +1,64 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechGrammar_h +#define mozilla_dom_SpeechGrammar_h + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +namespace mozilla { +class ErrorResult; + +namespace dom { + +class GlobalObject; + +class SpeechGrammar final : public nsISupports, public nsWrapperCache { + public: + explicit SpeechGrammar(nsISupports* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammar) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + static already_AddRefed<SpeechGrammar> Constructor( + const GlobalObject& aGlobal); + + static already_AddRefed<SpeechGrammar> WebkitSpeechGrammar( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal); + } + + void GetSrc(nsString& aRetVal, ErrorResult& aRv) const; + + void SetSrc(const nsAString& aArg, ErrorResult& aRv); + + float GetWeight(ErrorResult& aRv) const; + + void SetWeight(float 
aArg, ErrorResult& aRv); + + private: + ~SpeechGrammar(); + + nsCOMPtr<nsISupports> mParent; + + nsString mSrc; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.cpp b/dom/media/webspeech/recognition/SpeechGrammarList.cpp new file mode 100644 index 0000000000..4317452057 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammarList.cpp @@ -0,0 +1,76 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechGrammarList.h" + +#include "mozilla/dom/SpeechGrammar.h" +#include "mozilla/dom/SpeechGrammarListBinding.h" +#include "mozilla/ErrorResult.h" +#include "nsCOMPtr.h" +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammarList, mParent, mItems) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammarList) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammarList) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammarList) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechGrammarList::SpeechGrammarList(nsISupports* aParent) : mParent(aParent) {} + +SpeechGrammarList::~SpeechGrammarList() = default; + +already_AddRefed<SpeechGrammarList> SpeechGrammarList::Constructor( + const GlobalObject& aGlobal) { + RefPtr<SpeechGrammarList> speechGrammarList = + new SpeechGrammarList(aGlobal.GetAsSupports()); + return speechGrammarList.forget(); +} + +JSObject* SpeechGrammarList::WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) { + return SpeechGrammarList_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechGrammarList::GetParentObject() const { return mParent; } + +uint32_t 
SpeechGrammarList::Length() const { return mItems.Length(); } + +already_AddRefed<SpeechGrammar> SpeechGrammarList::Item(uint32_t aIndex, + ErrorResult& aRv) { + RefPtr<SpeechGrammar> result = mItems.ElementAt(aIndex); + return result.forget(); +} + +void SpeechGrammarList::AddFromURI(const nsAString& aSrc, + const Optional<float>& aWeight, + ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechGrammarList::AddFromString(const nsAString& aString, + const Optional<float>& aWeight, + ErrorResult& aRv) { + SpeechGrammar* speechGrammar = new SpeechGrammar(mParent); + speechGrammar->SetSrc(aString, aRv); + mItems.AppendElement(speechGrammar); +} + +already_AddRefed<SpeechGrammar> SpeechGrammarList::IndexedGetter( + uint32_t aIndex, bool& aPresent, ErrorResult& aRv) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + ErrorResult rv; + aPresent = true; + return Item(aIndex, rv); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.h b/dom/media/webspeech/recognition/SpeechGrammarList.h new file mode 100644 index 0000000000..7f1e09cd9e --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammarList.h @@ -0,0 +1,73 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechGrammarList_h +#define mozilla_dom_SpeechGrammarList_h + +#include "mozilla/Attributes.h" +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsTArray.h" +#include "nsWrapperCache.h" + +struct JSContext; + +namespace mozilla { + +class ErrorResult; + +namespace dom { + +class GlobalObject; +class SpeechGrammar; +template <typename> +class Optional; + +class SpeechGrammarList final : public nsISupports, public nsWrapperCache { + public: + explicit SpeechGrammarList(nsISupports* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammarList) + + static already_AddRefed<SpeechGrammarList> Constructor( + const GlobalObject& aGlobal); + + static already_AddRefed<SpeechGrammarList> WebkitSpeechGrammarList( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal); + } + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed<SpeechGrammar> Item(uint32_t aIndex, ErrorResult& aRv); + + void AddFromURI(const nsAString& aSrc, const Optional<float>& aWeight, + ErrorResult& aRv); + + void AddFromString(const nsAString& aString, const Optional<float>& aWeight, + ErrorResult& aRv); + + already_AddRefed<SpeechGrammar> IndexedGetter(uint32_t aIndex, bool& aPresent, + ErrorResult& aRv); + + private: + ~SpeechGrammarList(); + + nsCOMPtr<nsISupports> mParent; + + nsTArray<RefPtr<SpeechGrammar>> mItems; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp new file mode 100644 index 0000000000..75d1ba7709 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp @@ -0,0 +1,1170 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et 
cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognition.h" + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" + +#include "mozilla/dom/AudioStreamTrack.h" +#include "mozilla/dom/BindingUtils.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/SpeechRecognitionBinding.h" +#include "mozilla/dom/MediaStreamTrackBinding.h" +#include "mozilla/dom/MediaStreamError.h" +#include "mozilla/dom/RootedDictionary.h" +#include "mozilla/dom/SpeechGrammar.h" +#include "mozilla/MediaManager.h" +#include "mozilla/Preferences.h" +#include "mozilla/ResultVariant.h" +#include "mozilla/Services.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/AbstractThread.h" +#include "VideoUtils.h" +#include "AudioSegment.h" +#include "MediaEnginePrefs.h" +#include "endpointer.h" + +#include "mozilla/dom/SpeechRecognitionEvent.h" +#include "nsComponentManagerUtils.h" +#include "nsContentUtils.h" +#include "mozilla/dom/Document.h" +#include "nsIObserverService.h" +#include "nsIPermissionManager.h" +#include "nsIPrincipal.h" +#include "nsPIDOMWindow.h" +#include "nsServiceManagerUtils.h" +#include "nsQueryObject.h" +#include "SpeechTrackListener.h" + +#include <algorithm> + +// Undo the windows.h damage +#if defined(XP_WIN) && defined(GetMessage) +# undef GetMessage +#endif + +namespace mozilla::dom { + +#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default" +#define DEFAULT_RECOGNITION_SERVICE "online" + +#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length" +#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \ + "media.webspeech.long_silence_length" +#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \ + "media.webspeech.long_speech_length" +#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \ + 
"media.webspeech.recognition.timeout" + +static const uint32_t kSAMPLE_RATE = 16000; + +// number of frames corresponding to 300ms of audio to send to endpointer while +// it's in environment estimation mode +// kSAMPLE_RATE frames = 1s, kESTIMATION_FRAMES frames = 300ms +static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000; + +LogModule* GetSpeechRecognitionLog() { + static LazyLogModule sLog("SpeechRecognition"); + return sLog; +} +#define SR_LOG(...) \ + MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) + +namespace { +class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker { + public: + SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition, + const nsString& aName) + : media::ShutdownBlocker(aName), mRecognition(aRecognition) {} + + NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override { + MOZ_ASSERT(NS_IsMainThread()); + // AbortSilently will eventually clear the blocker. + mRecognition->Abort(); + return NS_OK; + } + + private: + const RefPtr<SpeechRecognition> mRecognition; +}; + +enum class ServiceCreationError { + ServiceNotFound, +}; + +Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError> +CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow, + SpeechRecognition* aRecognition, + const nsAString& aLang) { + nsAutoCString speechRecognitionServiceCID; + + nsAutoCString prefValue; + Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue); + nsAutoCString speechRecognitionService; + + if (!prefValue.IsEmpty()) { + speechRecognitionService = prefValue; + } else { + speechRecognitionService = DEFAULT_RECOGNITION_SERVICE; + } + + if (StaticPrefs::media_webspeech_test_fake_recognition_service()) { + speechRecognitionServiceCID = + NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake"; + } else { + speechRecognitionServiceCID = + nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) + + speechRecognitionService; + } + + nsresult rv; + 
nsCOMPtr<nsISpeechRecognitionService> recognitionService; + recognitionService = + do_CreateInstance(speechRecognitionServiceCID.get(), &rv); + if (!recognitionService) { + return Err(ServiceCreationError::ServiceNotFound); + } + + return recognitionService; +} +} // namespace + +NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition, + DOMEventTargetHelper, mStream, + mTrack, mRecognitionService, + mSpeechGrammarList) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition) + NS_INTERFACE_MAP_ENTRY(nsIObserver) +NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper) + +NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper) +NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper) + +SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow) + : DOMEventTargetHelper(aOwnerWindow), + mEndpointer(kSAMPLE_RATE), + mAudioSamplesPerChunk(mEndpointer.FrameSize()), + mSpeechDetectionTimer(NS_NewTimer()), + mSpeechGrammarList(new SpeechGrammarList(GetOwner())), + mContinuous(false), + mInterimResults(false), + mMaxAlternatives(1) { + SR_LOG("created SpeechRecognition"); + + if (StaticPrefs::media_webspeech_test_enable()) { + nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); + } + + mEndpointer.set_speech_input_complete_silence_length( + Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000)); + mEndpointer.set_long_speech_input_complete_silence_length( + Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000)); + mEndpointer.set_long_speech_length( + Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000)); + + mSpeechDetectionTimeoutMs = + Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000); + + Reset(); +} + +SpeechRecognition::~SpeechRecognition() = default; + +bool SpeechRecognition::StateBetween(FSMState 
begin, FSMState end) { + return mCurrentState >= begin && mCurrentState <= end; +} + +void SpeechRecognition::SetState(FSMState state) { + mCurrentState = state; + SR_LOG("Transitioned to state %s", GetName(mCurrentState)); +} + +JSObject* SpeechRecognition::WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) { + return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto); +} + +bool SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal) { + nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal); + + nsresult rv; + nsCOMPtr<nsIPermissionManager> mgr = + do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + + uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION; + rv = mgr->TestExactPermissionFromPrincipal(principal, "speech-recognition"_ns, + &speechRecognition); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + + bool hasPermission = + (speechRecognition == nsIPermissionManager::ALLOW_ACTION); + + return (hasPermission || + StaticPrefs::media_webspeech_recognition_force_enable() || + StaticPrefs::media_webspeech_test_enable()) && + StaticPrefs::media_webspeech_recognition_enable(); +} + +already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor( + const GlobalObject& aGlobal, ErrorResult& aRv) { + nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports()); + if (!win) { + aRv.Throw(NS_ERROR_FAILURE); + return nullptr; + } + + RefPtr<SpeechRecognition> object = new SpeechRecognition(win); + return object.forget(); +} + +void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) { + SR_LOG("Processing %s, current state is %s", GetName(aEvent), + GetName(mCurrentState)); + + if (mAborted && aEvent->mType != EVENT_ABORT) { + // ignore all events while aborting + return; + } + + Transition(aEvent); +} + +void SpeechRecognition::Transition(SpeechEvent* aEvent) { + switch (mCurrentState) { + case STATE_IDLE: + switch 
(aEvent->mType) { + case EVENT_START: + // TODO: may want to time out if we wait too long + // for user to approve + WaitForAudioData(aEvent); + break; + case EVENT_STOP: + case EVENT_ABORT: + case EVENT_AUDIO_DATA: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_STARTING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + StartedAudioCapture(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_STOP: + ResetAndEnd(); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_ESTIMATING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + WaitForEstimation(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_WAITING_FOR_SPEECH: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + DetectSpeech(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + 
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_RECOGNIZING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + WaitForSpeechEnd(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_WAITING_FOR_RESULT: + switch (aEvent->mType) { + case EVENT_STOP: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + NotifyFinalResult(aEvent); + break; + case EVENT_AUDIO_DATA: + DoNothing(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_START: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", + GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_ABORTING: + switch (aEvent->mType) { + case EVENT_STOP: + case EVENT_ABORT: + case EVENT_AUDIO_DATA: + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid 
event"); + } + break; + default: + MOZ_CRASH("Invalid state"); + } +} + +/* + * Handle a segment of recorded audio data. + * Returns the number of samples that were processed. + */ +uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, + TrackRate aTrackRate) { + AudioSegment::ChunkIterator iterator(*aSegment); + uint32_t samples = 0; + while (!iterator.IsEnded()) { + float out; + mEndpointer.ProcessAudio(*iterator, &out); + samples += iterator->GetDuration(); + iterator.Next(); + } + + // we need to call the nsISpeechRecognitionService::ProcessAudioSegment + // in a separate thread so that any eventual encoding or pre-processing + // of the audio does not block the main thread + nsresult rv = mEncodeTaskQueue->Dispatch( + NewRunnableMethod<StoreCopyPassByPtr<AudioSegment>, TrackRate>( + "nsISpeechRecognitionService::ProcessAudioSegment", + mRecognitionService, + &nsISpeechRecognitionService::ProcessAudioSegment, + std::move(*aSegment), aTrackRate)); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + Unused << rv; + return samples; +} + +/**************************************************************************** + * FSM Transition functions + * + * If a transition function may cause a DOM event to be fired, + * it may also be re-entered, since the event handler may cause the + * event loop to spin and new SpeechEvents to be processed. + * + * Rules: + * 1) These methods should call SetState as soon as possible. + * 2) If these methods dispatch DOM events, or call methods that dispatch + * DOM events, that should be done as late as possible. + * 3) If anything must happen after dispatching a DOM event, make sure + * the state is still what the method expected it to be. + ****************************************************************************/ + +void SpeechRecognition::Reset() { + SetState(STATE_IDLE); + + // This breaks potential ref-cycles. 
+ mRecognitionService = nullptr; + + ++mStreamGeneration; + if (mStream) { + mStream->UnregisterTrackListener(this); + mStream = nullptr; + } + mTrack = nullptr; + mTrackIsOwned = false; + mStopRecordingPromise = nullptr; + mEncodeTaskQueue = nullptr; + mEstimationSamples = 0; + mBufferedSamples = 0; + mSpeechDetectionTimer->Cancel(); + mAborted = false; +} + +void SpeechRecognition::ResetAndEnd() { + Reset(); + DispatchTrustedEvent(u"end"_ns); +} + +void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) { + SetState(STATE_STARTING); +} + +void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) { + SetState(STATE_ESTIMATING); + + mEndpointer.SetEnvironmentEstimationMode(); + mEstimationSamples += + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + + DispatchTrustedEvent(u"audiostart"_ns); + if (mCurrentState == STATE_ESTIMATING) { + DispatchTrustedEvent(u"start"_ns); + } +} + +void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) { + SetState(STATE_WAITING_FOR_RESULT); + + MOZ_ASSERT(mRecognitionService, "Service deleted before recording done"); + + // This will run SoundEnd on the service just before StopRecording begins + // shutting the encode thread down. 
+ mSpeechListener->mRemovedPromise->Then( + GetCurrentSerialEventTarget(), __func__, + [service = mRecognitionService] { service->SoundEnd(); }); + + StopRecording(); +} + +void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) { + SetState(STATE_ESTIMATING); + + mEstimationSamples += + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEstimationSamples > kESTIMATION_SAMPLES) { + mEndpointer.SetUserInputMode(); + SetState(STATE_WAITING_FOR_SPEECH); + } +} + +void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) { + SetState(STATE_WAITING_FOR_SPEECH); + + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEndpointer.DidStartReceivingSpeech()) { + mSpeechDetectionTimer->Cancel(); + SetState(STATE_RECOGNIZING); + DispatchTrustedEvent(u"speechstart"_ns); + } +} + +void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) { + SetState(STATE_RECOGNIZING); + + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEndpointer.speech_input_complete()) { + DispatchTrustedEvent(u"speechend"_ns); + + if (mCurrentState == STATE_RECOGNIZING) { + // FIXME: StopRecordingAndRecognize should only be called for single + // shot services for continuous we should just inform the service + StopRecordingAndRecognize(aEvent); + } + } +} + +void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) { + ResetAndEnd(); + + RootedDictionary<SpeechRecognitionEventInit> init(RootingCx()); + init.mBubbles = true; + init.mCancelable = false; + // init.mResultIndex = 0; + init.mResults = aEvent->mRecognitionResultList; + init.mInterpretation = JS::NullValue(); + // init.mEmma = nullptr; + + RefPtr<SpeechRecognitionEvent> event = + SpeechRecognitionEvent::Constructor(this, u"result"_ns, init); + event->SetTrusted(true); + + DispatchEvent(*event); +} + +void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {} + +void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) { + if (mRecognitionService) { + if (mTrack) { + // 
This will run Abort on the service just before StopRecording begins + // shutting the encode thread down. + mSpeechListener->mRemovedPromise->Then( + GetCurrentSerialEventTarget(), __func__, + [service = mRecognitionService] { service->Abort(); }); + } else { + // Recording hasn't started yet. We can just call Abort(). + mRecognitionService->Abort(); + } + } + + StopRecording()->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); }); + + SetState(STATE_ABORTING); +} + +void SpeechRecognition::AbortError(SpeechEvent* aEvent) { + AbortSilently(aEvent); + NotifyError(aEvent); +} + +void SpeechRecognition::NotifyError(SpeechEvent* aEvent) { + aEvent->mError->SetTrusted(true); + + DispatchEvent(*aEvent->mError); +} + +/************************************** + * Event triggers and other functions * + **************************************/ +NS_IMETHODIMP +SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) { + // hold a reference so that the underlying track doesn't get collected. + mTrack = aTrack; + MOZ_ASSERT(!mTrack->Ended()); + + mSpeechListener = SpeechTrackListener::Create(this); + mTrack->AddListener(mSpeechListener); + + nsString blockerName; + blockerName.AppendPrintf("SpeechRecognition %p shutdown", this); + mShutdownBlocker = + MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName); + media::MustGetShutdownBarrier()->AddBlocker( + mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__, + u"SpeechRecognition shutdown"_ns); + + mEndpointer.StartSession(); + + return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs, + nsITimer::TYPE_ONE_SHOT); +} + +RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() { + if (!mTrack) { + // Recording wasn't started, or has already been stopped. + if (mStream) { + // Ensure we don't start recording because a track became available + // before we get reset. 
+ mStream->UnregisterTrackListener(this); + } + return GenericNonExclusivePromise::CreateAndResolve(true, __func__); + } + + if (mStopRecordingPromise) { + return mStopRecordingPromise; + } + + mTrack->RemoveListener(mSpeechListener); + if (mTrackIsOwned) { + mTrack->Stop(); + } + + mEndpointer.EndSession(); + DispatchTrustedEvent(u"audioend"_ns); + + // Block shutdown until the speech track listener has been removed from the + // MSG, as it holds a reference to us, and we reference the world, which we + // don't want to leak. + mStopRecordingPromise = + mSpeechListener->mRemovedPromise + ->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr<SpeechRecognition>(this), this] { + SR_LOG("Shutting down encoding thread"); + return mEncodeTaskQueue->BeginShutdown(); + }, + [] { + MOZ_CRASH("Unexpected rejection"); + return ShutdownPromise::CreateAndResolve(false, __func__); + }) + ->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr<SpeechRecognition>(this), this] { + media::MustGetShutdownBarrier()->RemoveBlocker( + mShutdownBlocker); + mShutdownBlocker = nullptr; + + MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE); + return GenericNonExclusivePromise::CreateAndResolve(true, + __func__); + }, + [] { + MOZ_CRASH("Unexpected rejection"); + return GenericNonExclusivePromise::CreateAndResolve(false, + __func__); + }); + return mStopRecordingPromise; +} + +NS_IMETHODIMP +SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread"); + + if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) && + StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) { + DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, + SpeechRecognitionErrorCode::No_speech, + "No speech detected (timeout)"); + } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { + nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); + obs->RemoveObserver(this, 
SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); + } else if (StaticPrefs::media_webspeech_test_fake_fsm_events() && + !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) { + ProcessTestEventRequest(aSubject, nsDependentString(aData)); + } + + return NS_OK; +} + +void SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, + const nsAString& aEventName) { + if (aEventName.EqualsLiteral("EVENT_ABORT")) { + Abort(); + } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) { + DispatchError( + SpeechRecognition::EVENT_AUDIO_ERROR, + SpeechRecognitionErrorCode::Audio_capture, // TODO different codes? + "AUDIO_ERROR test event"); + } else { + NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(), + "Got request for fake recognition service event, but " + "media.webspeech.test.fake_recognition_service is unset"); + + // let the fake recognition service handle the request + } +} + +already_AddRefed<SpeechGrammarList> SpeechRecognition::Grammars() const { + RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList; + return speechGrammarList.forget(); +} + +void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) { + mSpeechGrammarList = &aArg; +} + +void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; } + +void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; } + +bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const { + return mContinuous; +} + +void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) { + mContinuous = aArg; +} + +bool SpeechRecognition::InterimResults() const { return mInterimResults; } + +void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; } + +uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; } + +void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) { + mMaxAlternatives = aArg; +} + +void 
SpeechRecognition::GetServiceURI(nsString& aRetVal, + ErrorResult& aRv) const { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, + CallerType aCallerType, ErrorResult& aRv) { + if (mCurrentState != STATE_IDLE) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return; + } + + if (!SetRecognitionService(aRv)) { + return; + } + + if (!ValidateAndSetGrammarList(aRv)) { + return; + } + + mEncodeTaskQueue = + TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER), + "WebSpeechEncoderThread"); + + nsresult rv; + rv = mRecognitionService->Initialize(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + MediaStreamConstraints constraints; + constraints.mAudio.SetAsBoolean() = true; + + if (aStream.WasPassed()) { + mStream = &aStream.Value(); + mTrackIsOwned = false; + mStream->RegisterTrackListener(this); + nsTArray<RefPtr<AudioStreamTrack>> tracks; + mStream->GetAudioTracks(tracks); + for (const RefPtr<AudioStreamTrack>& track : tracks) { + if (!track->Ended()) { + NotifyTrackAdded(track); + break; + } + } + } else { + mTrackIsOwned = true; + nsPIDOMWindowInner* win = GetOwner(); + if (!win || !win->IsFullyActive()) { + aRv.ThrowInvalidStateError("The document is not fully active."); + return; + } + AutoNoJSAPI nojsapi; + RefPtr<SpeechRecognition> self(this); + MediaManager::Get() + ->GetUserMedia(win, constraints, aCallerType) + ->Then( + GetCurrentSerialEventTarget(), __func__, + [this, self, + generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) { + nsTArray<RefPtr<AudioStreamTrack>> tracks; + aStream->GetAudioTracks(tracks); + if (mAborted || mCurrentState != STATE_STARTING || + mStreamGeneration != generation) { + // We were probably aborted. Exit early. 
+ for (const RefPtr<AudioStreamTrack>& track : tracks) { + track->Stop(); + } + return; + } + mStream = std::move(aStream); + mStream->RegisterTrackListener(this); + for (const RefPtr<AudioStreamTrack>& track : tracks) { + if (!track->Ended()) { + NotifyTrackAdded(track); + } + } + }, + [this, self, + generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) { + if (mAborted || mCurrentState != STATE_STARTING || + mStreamGeneration != generation) { + // We were probably aborted. Exit early. + return; + } + SpeechRecognitionErrorCode errorCode; + + if (error->mName == MediaMgrError::Name::NotAllowedError) { + errorCode = SpeechRecognitionErrorCode::Not_allowed; + } else { + errorCode = SpeechRecognitionErrorCode::Audio_capture; + } + DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode, + error->mMessage); + }); + } + + RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START); + NS_DispatchToMainThread(event); +} + +bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) { + if (!GetOwner()) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + // See: + // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang + nsAutoString lang; + if (!mLang.IsEmpty()) { + lang = mLang; + } else { + nsCOMPtr<Document> document = GetOwner()->GetExtantDoc(); + if (!document) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + nsCOMPtr<Element> element = document->GetRootElement(); + if (!element) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + nsAutoString lang; + element->GetLang(lang); + } + + auto result = CreateSpeechRecognitionService(GetOwner(), this, lang); + + if (result.isErr()) { + switch (result.unwrapErr()) { + case ServiceCreationError::ServiceNotFound: + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + break; + default: + MOZ_CRASH("Unknown error"); + } + return false; + } + + mRecognitionService = result.unwrap(); + MOZ_DIAGNOSTIC_ASSERT(mRecognitionService); + return 
true; +} + +bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) { + if (!mSpeechGrammarList) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + uint32_t grammarListLength = mSpeechGrammarList->Length(); + for (uint32_t count = 0; count < grammarListLength; ++count) { + RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv); + if (aRv.Failed()) { + return false; + } + if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList( + speechGrammar.get(), nullptr))) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + } + + return true; +} + +void SpeechRecognition::Stop() { + RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP); + NS_DispatchToMainThread(event); +} + +void SpeechRecognition::Abort() { + if (mAborted) { + return; + } + + mAborted = true; + + RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT); + NS_DispatchToMainThread(event); +} + +void SpeechRecognition::NotifyTrackAdded( + const RefPtr<MediaStreamTrack>& aTrack) { + if (mTrack) { + return; + } + + RefPtr<AudioStreamTrack> audioTrack = aTrack->AsAudioStreamTrack(); + if (!audioTrack) { + return; + } + + if (audioTrack->Ended()) { + return; + } + + StartRecording(audioTrack); +} + +void SpeechRecognition::DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const nsACString& aMessage) { + MOZ_ASSERT(NS_IsMainThread()); + MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR || + aErrorType == EVENT_AUDIO_ERROR, + "Invalid error type!"); + + RefPtr<SpeechRecognitionError> srError = + new SpeechRecognitionError(nullptr, nullptr, nullptr); + + srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode, + aMessage); + + RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType); + event->mError = srError; + NS_DispatchToMainThread(event); +} + +/* + * Buffer audio samples into mAudioSamplesBuffer until aBufferSize. 
+ * Updates mBufferedSamples and returns the number of samples that were + * buffered. + */ +uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples, + uint32_t aSampleCount) { + MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk); + MOZ_ASSERT(mAudioSamplesBuffer); + + int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data()); + size_t samplesToCopy = + std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples); + + PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy); + + mBufferedSamples += samplesToCopy; + return samplesToCopy; +} + +/* + * Split a samples buffer starting of a given size into + * chunks of equal size. The chunks are stored in the array + * received as argument. + * Returns the offset of the end of the last chunk that was + * created. + */ +uint32_t SpeechRecognition::SplitSamplesBuffer( + const int16_t* aSamplesBuffer, uint32_t aSampleCount, + nsTArray<RefPtr<SharedBuffer>>& aResult) { + uint32_t chunkStart = 0; + + while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) { + CheckedInt<size_t> bufferSize(sizeof(int16_t)); + bufferSize *= mAudioSamplesPerChunk; + RefPtr<SharedBuffer> chunk = SharedBuffer::Create(bufferSize); + + PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart, + mAudioSamplesPerChunk); + + aResult.AppendElement(chunk.forget()); + chunkStart += mAudioSamplesPerChunk; + } + + return chunkStart; +} + +AudioSegment* SpeechRecognition::CreateAudioSegment( + nsTArray<RefPtr<SharedBuffer>>& aChunks) { + AudioSegment* segment = new AudioSegment(); + for (uint32_t i = 0; i < aChunks.Length(); ++i) { + RefPtr<SharedBuffer> buffer = aChunks[i]; + const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data()); + + AutoTArray<const int16_t*, 1> channels; + channels.AppendElement(chunkData); + segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk, + PRINCIPAL_HANDLE_NONE); + } + + return segment; +} + +void 
SpeechRecognition::FeedAudioData( + nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, + already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, + MediaTrackListener* aProvider, TrackRate aTrackRate) { + NS_ASSERTION(!NS_IsMainThread(), + "FeedAudioData should not be called in the main thread"); + + // Endpointer expects to receive samples in chunks whose size is a + // multiple of its frame size. + // Since we can't assume we will receive the frames in appropriate-sized + // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk + // (a multiple of Endpointer's frame size) before feeding to Endpointer. + + // ensure aSamples is deleted + RefPtr<SharedBuffer> refSamples = aSamples; + + uint32_t samplesIndex = 0; + const int16_t* samples = static_cast<int16_t*>(refSamples->Data()); + AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend; + + // fill up our buffer and make a chunk out of it, if possible + if (mBufferedSamples > 0) { + samplesIndex += FillSamplesBuffer(samples, aDuration); + + if (mBufferedSamples == mAudioSamplesPerChunk) { + chunksToSend.AppendElement(mAudioSamplesBuffer.forget()); + mBufferedSamples = 0; + } + } + + // create sample chunks of correct size + if (samplesIndex < aDuration) { + samplesIndex += SplitSamplesBuffer(samples + samplesIndex, + aDuration - samplesIndex, chunksToSend); + } + + // buffer remaining samples + if (samplesIndex < aDuration) { + mBufferedSamples = 0; + CheckedInt<size_t> bufferSize(sizeof(int16_t)); + bufferSize *= mAudioSamplesPerChunk; + mAudioSamplesBuffer = SharedBuffer::Create(bufferSize); + + FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex); + } + + AudioSegment* segment = CreateAudioSegment(chunksToSend); + RefPtr<SpeechEvent> event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA); + event->mAudioSegment = segment; + event->mProvider = aProvider; + event->mTrackRate = aTrackRate; + NS_DispatchToMainThread(event); +} + +const char* SpeechRecognition::GetName(FSMState 
aId) { + static const char* names[] = { + "STATE_IDLE", "STATE_STARTING", + "STATE_ESTIMATING", "STATE_WAITING_FOR_SPEECH", + "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT", + "STATE_ABORTING", + }; + + MOZ_ASSERT(aId < STATE_COUNT); + MOZ_ASSERT(ArrayLength(names) == STATE_COUNT); + return names[aId]; +} + +const char* SpeechRecognition::GetName(SpeechEvent* aEvent) { + static const char* names[] = {"EVENT_START", + "EVENT_STOP", + "EVENT_ABORT", + "EVENT_AUDIO_DATA", + "EVENT_AUDIO_ERROR", + "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT", + "EVENT_RECOGNITIONSERVICE_FINAL_RESULT", + "EVENT_RECOGNITIONSERVICE_ERROR"}; + + MOZ_ASSERT(aEvent->mType < EVENT_COUNT); + MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT); + return names[aEvent->mType]; +} + +TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const { + MOZ_ASSERT(NS_IsMainThread()); + return mEncodeTaskQueue; +} + +SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition, + SpeechRecognition::EventType aType) + : Runnable("dom::SpeechEvent"), + mAudioSegment(nullptr), + mRecognitionResultList(nullptr), + mError(nullptr), + mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>( + "SpeechEvent::SpeechEvent", aRecognition)), + mType(aType), + mTrackRate(0) {} + +SpeechEvent::SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, + SpeechRecognition::EventType aType) + : Runnable("dom::SpeechEvent"), + mAudioSegment(nullptr), + mRecognitionResultList(nullptr), + mError(nullptr), + mRecognition(aRecognition), + mType(aType), + mTrackRate(0) {} + +SpeechEvent::~SpeechEvent() { delete mAudioSegment; } + +NS_IMETHODIMP +SpeechEvent::Run() { + mRecognition->ProcessEvent(this); + return NS_OK; +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognition.h b/dom/media/webspeech/recognition/SpeechRecognition.h new file mode 100644 index 0000000000..687f38041e --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognition.h @@ -0,0 +1,314 @@ +/* -*- Mode: 
C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognition_h +#define mozilla_dom_SpeechRecognition_h + +#include "mozilla/Attributes.h" +#include "mozilla/DOMEventTargetHelper.h" +#include "nsCOMPtr.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" +#include "nsProxyRelease.h" +#include "DOMMediaStream.h" +#include "nsITimer.h" +#include "MediaTrackGraph.h" +#include "AudioSegment.h" +#include "mozilla/WeakPtr.h" + +#include "SpeechGrammarList.h" +#include "SpeechRecognitionResultList.h" +#include "nsISpeechRecognitionService.h" +#include "endpointer.h" + +#include "mozilla/dom/BindingDeclarations.h" +#include "mozilla/dom/SpeechRecognitionError.h" + +namespace mozilla { + +namespace media { +class ShutdownBlocker; +} + +namespace dom { + +#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \ + "SpeechRecognitionTest:RequestEvent" +#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End" + +class GlobalObject; +class AudioStreamTrack; +class SpeechEvent; +class SpeechTrackListener; + +LogModule* GetSpeechRecognitionLog(); +#define SR_LOG(...) 
\ + MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) + +class SpeechRecognition final : public DOMEventTargetHelper, + public nsIObserver, + public DOMMediaStream::TrackListener, + public SupportsWeakPtr { + public: + explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow); + + NS_DECL_ISUPPORTS_INHERITED + NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, + DOMEventTargetHelper) + + NS_DECL_NSIOBSERVER + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal); + + static already_AddRefed<SpeechRecognition> Constructor( + const GlobalObject& aGlobal, ErrorResult& aRv); + + static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal, aRv); + } + + already_AddRefed<SpeechGrammarList> Grammars() const; + + void SetGrammars(mozilla::dom::SpeechGrammarList& aArg); + + void GetLang(nsString& aRetVal) const; + + void SetLang(const nsAString& aArg); + + bool GetContinuous(ErrorResult& aRv) const; + + void SetContinuous(bool aArg, ErrorResult& aRv); + + bool InterimResults() const; + + void SetInterimResults(bool aArg); + + uint32_t MaxAlternatives() const; + + TaskQueue* GetTaskQueueForEncoding() const; + + void SetMaxAlternatives(uint32_t aArg); + + void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const; + + void SetServiceURI(const nsAString& aArg, ErrorResult& aRv); + + void Start(const Optional<NonNull<DOMMediaStream>>& aStream, + CallerType aCallerType, ErrorResult& aRv); + + void Stop(); + + void Abort(); + + IMPL_EVENT_HANDLER(audiostart) + IMPL_EVENT_HANDLER(soundstart) + IMPL_EVENT_HANDLER(speechstart) + IMPL_EVENT_HANDLER(speechend) + IMPL_EVENT_HANDLER(soundend) + IMPL_EVENT_HANDLER(audioend) + IMPL_EVENT_HANDLER(result) + IMPL_EVENT_HANDLER(nomatch) + IMPL_EVENT_HANDLER(error) + IMPL_EVENT_HANDLER(start) + IMPL_EVENT_HANDLER(end) + + 
enum EventType { + EVENT_START, + EVENT_STOP, + EVENT_ABORT, + EVENT_AUDIO_DATA, + EVENT_AUDIO_ERROR, + EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT, + EVENT_RECOGNITIONSERVICE_FINAL_RESULT, + EVENT_RECOGNITIONSERVICE_ERROR, + EVENT_COUNT + }; + + void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack) override; + // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are + // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16. + void DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const nsACString& aMessage); + template <int N> + void DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const char (&aMessage)[N]) { + DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage)); + } + uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount); + uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, + uint32_t aSampleCount, + nsTArray<RefPtr<SharedBuffer>>& aResult); + AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks); + void FeedAudioData(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, + already_AddRefed<SharedBuffer> aSamples, + uint32_t aDuration, MediaTrackListener* aProvider, + TrackRate aTrackRate); + + friend class SpeechEvent; + + private: + virtual ~SpeechRecognition(); + + enum FSMState { + STATE_IDLE, + STATE_STARTING, + STATE_ESTIMATING, + STATE_WAITING_FOR_SPEECH, + STATE_RECOGNIZING, + STATE_WAITING_FOR_RESULT, + STATE_ABORTING, + STATE_COUNT + }; + + void SetState(FSMState state); + bool StateBetween(FSMState begin, FSMState end); + + bool SetRecognitionService(ErrorResult& aRv); + bool ValidateAndSetGrammarList(ErrorResult& aRv); + + NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream); + RefPtr<GenericNonExclusivePromise> StopRecording(); + + uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate); + void NotifyError(SpeechEvent* aEvent); + + void ProcessEvent(SpeechEvent* 
aEvent); + void Transition(SpeechEvent* aEvent); + + void Reset(); + void ResetAndEnd(); + void WaitForAudioData(SpeechEvent* aEvent); + void StartedAudioCapture(SpeechEvent* aEvent); + void StopRecordingAndRecognize(SpeechEvent* aEvent); + void WaitForEstimation(SpeechEvent* aEvent); + void DetectSpeech(SpeechEvent* aEvent); + void WaitForSpeechEnd(SpeechEvent* aEvent); + void NotifyFinalResult(SpeechEvent* aEvent); + void DoNothing(SpeechEvent* aEvent); + void AbortSilently(SpeechEvent* aEvent); + void AbortError(SpeechEvent* aEvent); + + RefPtr<DOMMediaStream> mStream; + RefPtr<AudioStreamTrack> mTrack; + bool mTrackIsOwned = false; + RefPtr<GenericNonExclusivePromise> mStopRecordingPromise; + RefPtr<SpeechTrackListener> mSpeechListener; + nsCOMPtr<nsISpeechRecognitionService> mRecognitionService; + RefPtr<media::ShutdownBlocker> mShutdownBlocker; + // TaskQueue responsible for pre-processing the samples by the service + // it runs in a separate thread from the main thread + RefPtr<TaskQueue> mEncodeTaskQueue; + + // A generation ID of the MediaStream a started session is for, so that + // a gUM request that resolves after the session has stopped, and a new + // one has started, can exit early. Main thread only. Can wrap. 
+ uint8_t mStreamGeneration = 0; + + FSMState mCurrentState; + + Endpointer mEndpointer; + uint32_t mEstimationSamples; + + uint32_t mAudioSamplesPerChunk; + + // maximum amount of seconds the engine will wait for voice + // until returning a 'no speech detected' error + uint32_t mSpeechDetectionTimeoutMs; + + // buffer holds one chunk of mAudioSamplesPerChunk + // samples before feeding it to mEndpointer + RefPtr<SharedBuffer> mAudioSamplesBuffer; + uint32_t mBufferedSamples; + + nsCOMPtr<nsITimer> mSpeechDetectionTimer; + bool mAborted; + + nsString mLang; + + RefPtr<SpeechGrammarList> mSpeechGrammarList; + + // private flag used to hold if the user called the setContinuous() method + // of the API + bool mContinuous; + + // WebSpeechAPI (http://bit.ly/1gIl7DC) states: + // + // 1. Default value MUST be false + // 2. If true, interim results SHOULD be returned + // 3. If false, interim results MUST NOT be returned + // + // Pocketsphinx does not return interm results; so, defaulting + // mInterimResults to false, then ignoring its subsequent value + // is a conforming implementation. + bool mInterimResults; + + // WebSpeechAPI (http://bit.ly/1JAiqeo) states: + // + // 1. Default value is 1 + // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives + // per result" + // + // Pocketsphinx can only return at maximum a single + // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting + // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives + // while for a 0 value returning no SpeechRecognitionAlternative per result is + // a conforming implementation. 
+ uint32_t mMaxAlternatives; + + void ProcessTestEventRequest(nsISupports* aSubject, + const nsAString& aEventName); + + const char* GetName(FSMState aId); + const char* GetName(SpeechEvent* aEvent); +}; + +class SpeechEvent : public Runnable { + public: + SpeechEvent(SpeechRecognition* aRecognition, + SpeechRecognition::EventType aType); + SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, + SpeechRecognition::EventType aType); + + ~SpeechEvent(); + + NS_IMETHOD Run() override; + AudioSegment* mAudioSegment; + RefPtr<SpeechRecognitionResultList> + mRecognitionResultList; // TODO: make this a session being passed which + // also has index and stuff + RefPtr<SpeechRecognitionError> mError; + + friend class SpeechRecognition; + + private: + nsMainThreadPtrHandle<SpeechRecognition> mRecognition; + + // for AUDIO_DATA events, keep a reference to the provider + // of the data (i.e., the SpeechTrackListener) to ensure it + // is kept alive (and keeps SpeechRecognition alive) until this + // event gets processed. + RefPtr<MediaTrackListener> mProvider; + SpeechRecognition::EventType mType; + TrackRate mTrackRate; +}; + +} // namespace dom + +inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) { + return ToSupports(static_cast<DOMEventTargetHelper*>(aRec)); +} + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp new file mode 100644 index 0000000000..4dee9090a7 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp @@ -0,0 +1,44 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "SpeechRecognitionAlternative.h" + +#include "mozilla/dom/SpeechRecognitionAlternativeBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionAlternative, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionAlternative) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionAlternative) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionAlternative) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionAlternative::SpeechRecognitionAlternative( + SpeechRecognition* aParent) + : mConfidence(0), mParent(aParent) {} + +SpeechRecognitionAlternative::~SpeechRecognitionAlternative() = default; + +JSObject* SpeechRecognitionAlternative::WrapObject( + JSContext* aCx, JS::Handle<JSObject*> aGivenProto) { + return SpeechRecognitionAlternative_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechRecognitionAlternative::GetParentObject() const { + return static_cast<EventTarget*>(mParent.get()); +} + +void SpeechRecognitionAlternative::GetTranscript(nsString& aRetVal) const { + aRetVal = mTranscript; +} + +float SpeechRecognitionAlternative::Confidence() const { return mConfidence; } + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h new file mode 100644 index 0000000000..017d869943 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h @@ -0,0 +1,49 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechRecognitionAlternative_h +#define mozilla_dom_SpeechRecognitionAlternative_h + +#include "nsCycleCollectionParticipant.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +namespace mozilla::dom { + +class SpeechRecognition; + +class SpeechRecognitionAlternative final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionAlternative(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionAlternative) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + void GetTranscript(nsString& aRetVal) const; + + float Confidence() const; + + nsString mTranscript; + float mConfidence; + + private: + ~SpeechRecognitionAlternative(); + + RefPtr<SpeechRecognition> mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp new file mode 100644 index 0000000000..009281b234 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp @@ -0,0 +1,59 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "SpeechRecognitionResult.h" +#include "mozilla/dom/SpeechRecognitionResultBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResult, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResult) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResult) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResult) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionResult::SpeechRecognitionResult(SpeechRecognition* aParent) + : mParent(aParent) {} + +SpeechRecognitionResult::~SpeechRecognitionResult() = default; + +JSObject* SpeechRecognitionResult::WrapObject( + JSContext* aCx, JS::Handle<JSObject*> aGivenProto) { + return SpeechRecognitionResult_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechRecognitionResult::GetParentObject() const { + return static_cast<EventTarget*>(mParent.get()); +} + +already_AddRefed<SpeechRecognitionAlternative> +SpeechRecognitionResult::IndexedGetter(uint32_t aIndex, bool& aPresent) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + + aPresent = true; + return Item(aIndex); +} + +uint32_t SpeechRecognitionResult::Length() const { return mItems.Length(); } + +already_AddRefed<SpeechRecognitionAlternative> SpeechRecognitionResult::Item( + uint32_t aIndex) { + RefPtr<SpeechRecognitionAlternative> alternative = mItems.ElementAt(aIndex); + return alternative.forget(); +} + +bool SpeechRecognitionResult::IsFinal() const { + return true; // TODO +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.h b/dom/media/webspeech/recognition/SpeechRecognitionResult.h new file mode 100644 index 0000000000..fc9e8fd660 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResult.h @@ -0,0 +1,54 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* 
vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognitionResult_h +#define mozilla_dom_SpeechRecognitionResult_h + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +#include "SpeechRecognitionAlternative.h" + +namespace mozilla::dom { + +class SpeechRecognitionResult final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionResult(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResult) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed<SpeechRecognitionAlternative> Item(uint32_t aIndex); + + bool IsFinal() const; + + already_AddRefed<SpeechRecognitionAlternative> IndexedGetter(uint32_t aIndex, + bool& aPresent); + + nsTArray<RefPtr<SpeechRecognitionAlternative>> mItems; + + private: + ~SpeechRecognitionResult(); + + RefPtr<SpeechRecognition> mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp new file mode 100644 index 0000000000..2aa81a5982 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognitionResultList.h" + +#include "mozilla/dom/SpeechRecognitionResultListBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResultList, mParent, + mItems) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResultList) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResultList) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResultList) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionResultList::SpeechRecognitionResultList( + SpeechRecognition* aParent) + : mParent(aParent) {} + +SpeechRecognitionResultList::~SpeechRecognitionResultList() = default; + +nsISupports* SpeechRecognitionResultList::GetParentObject() const { + return static_cast<EventTarget*>(mParent.get()); +} + +JSObject* SpeechRecognitionResultList::WrapObject( + JSContext* aCx, JS::Handle<JSObject*> aGivenProto) { + return SpeechRecognitionResultList_Binding::Wrap(aCx, this, aGivenProto); +} + +already_AddRefed<SpeechRecognitionResult> +SpeechRecognitionResultList::IndexedGetter(uint32_t aIndex, bool& aPresent) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + + aPresent = true; + return Item(aIndex); +} + +uint32_t SpeechRecognitionResultList::Length() const { return mItems.Length(); } + +already_AddRefed<SpeechRecognitionResult> SpeechRecognitionResultList::Item( + uint32_t aIndex) { + RefPtr<SpeechRecognitionResult> result = mItems.ElementAt(aIndex); + return result.forget(); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.h b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h new file mode 100644 index 0000000000..b45659564b --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h @@ 
-0,0 +1,53 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognitionResultList_h +#define mozilla_dom_SpeechRecognitionResultList_h + +#include "nsCycleCollectionParticipant.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +#include "SpeechRecognitionResult.h" + +namespace mozilla::dom { + +class SpeechRecognition; + +class SpeechRecognitionResultList final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionResultList(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResultList) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle<JSObject*> aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed<SpeechRecognitionResult> Item(uint32_t aIndex); + + already_AddRefed<SpeechRecognitionResult> IndexedGetter(uint32_t aIndex, + bool& aPresent); + + nsTArray<RefPtr<SpeechRecognitionResult>> mItems; + + private: + ~SpeechRecognitionResultList(); + + RefPtr<SpeechRecognition> mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.cpp b/dom/media/webspeech/recognition/SpeechTrackListener.cpp new file mode 100644 index 0000000000..6e3dcbac85 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechTrackListener.cpp @@ -0,0 +1,100 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechTrackListener.h" + +#include "SpeechRecognition.h" +#include "nsProxyRelease.h" + +namespace mozilla::dom { + +SpeechTrackListener::SpeechTrackListener(SpeechRecognition* aRecognition) + : mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>( + "SpeechTrackListener::SpeechTrackListener", aRecognition, false)), + mRemovedPromise( + mRemovedHolder.Ensure("SpeechTrackListener::mRemovedPromise")) { + MOZ_ASSERT(NS_IsMainThread()); +} + +already_AddRefed<SpeechTrackListener> SpeechTrackListener::Create( + SpeechRecognition* aRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + RefPtr<SpeechTrackListener> listener = new SpeechTrackListener(aRecognition); + + listener->mRemovedPromise->Then( + GetCurrentSerialEventTarget(), __func__, + [listener] { listener->mRecognition = nullptr; }); + + return listener.forget(); +} + +void SpeechTrackListener::NotifyQueuedChanges( + MediaTrackGraph* aGraph, TrackTime aTrackOffset, + const MediaSegment& aQueuedMedia) { + AudioSegment* audio = const_cast<AudioSegment*>( + static_cast<const AudioSegment*>(&aQueuedMedia)); + + AudioSegment::ChunkIterator iterator(*audio); + while (!iterator.IsEnded()) { + // Skip over-large chunks so we don't crash! 
+ if (iterator->GetDuration() > INT_MAX) { + continue; + } + int duration = int(iterator->GetDuration()); + + if (iterator->IsNull()) { + nsTArray<int16_t> nullData; + PodZero(nullData.AppendElements(duration), duration); + ConvertAndDispatchAudioChunk(duration, iterator->mVolume, + nullData.Elements(), aGraph->GraphRate()); + } else { + AudioSampleFormat format = iterator->mBufferFormat; + + MOZ_ASSERT(format == AUDIO_FORMAT_S16 || format == AUDIO_FORMAT_FLOAT32); + + if (format == AUDIO_FORMAT_S16) { + ConvertAndDispatchAudioChunk( + duration, iterator->mVolume, + static_cast<const int16_t*>(iterator->mChannelData[0]), + aGraph->GraphRate()); + } else if (format == AUDIO_FORMAT_FLOAT32) { + ConvertAndDispatchAudioChunk( + duration, iterator->mVolume, + static_cast<const float*>(iterator->mChannelData[0]), + aGraph->GraphRate()); + } + } + + iterator.Next(); + } +} + +template <typename SampleFormatType> +void SpeechTrackListener::ConvertAndDispatchAudioChunk(int aDuration, + float aVolume, + SampleFormatType* aData, + TrackRate aTrackRate) { + CheckedInt<size_t> bufferSize(sizeof(int16_t)); + bufferSize *= aDuration; + bufferSize *= 1; // channel + RefPtr<SharedBuffer> samples(SharedBuffer::Create(bufferSize)); + + int16_t* to = static_cast<int16_t*>(samples->Data()); + ConvertAudioSamplesWithScale(aData, to, aDuration, aVolume); + + mRecognition->FeedAudioData(mRecognition, samples.forget(), aDuration, this, + aTrackRate); +} + +void SpeechTrackListener::NotifyEnded(MediaTrackGraph* aGraph) { + // TODO dispatch SpeechEnd event so services can be informed +} + +void SpeechTrackListener::NotifyRemoved(MediaTrackGraph* aGraph) { + mRemovedHolder.ResolveIfExists(true, __func__); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.h b/dom/media/webspeech/recognition/SpeechTrackListener.h new file mode 100644 index 0000000000..aa41be379c --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechTrackListener.h @@ -0,0 
+1,55 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechStreamListener_h +#define mozilla_dom_SpeechStreamListener_h + +#include "MediaTrackGraph.h" +#include "MediaTrackListener.h" +#include "AudioSegment.h" +#include "mozilla/MozPromise.h" + +namespace mozilla { + +class AudioSegment; + +namespace dom { + +class SpeechRecognition; + +class SpeechTrackListener : public MediaTrackListener { + private: + explicit SpeechTrackListener(SpeechRecognition* aRecognition); + + public: + static already_AddRefed<SpeechTrackListener> Create( + SpeechRecognition* aRecognition); + + ~SpeechTrackListener() = default; + + void NotifyQueuedChanges(MediaTrackGraph* aGraph, TrackTime aTrackOffset, + const MediaSegment& aQueuedMedia) override; + + void NotifyEnded(MediaTrackGraph* aGraph) override; + + void NotifyRemoved(MediaTrackGraph* aGraph) override; + + private: + template <typename SampleFormatType> + void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, + SampleFormatType* aData, + TrackRate aTrackRate); + nsMainThreadPtrHandle<SpeechRecognition> mRecognition; + MozPromiseHolder<GenericNonExclusivePromise> mRemovedHolder; + + public: + const RefPtr<GenericNonExclusivePromise> mRemovedPromise; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/endpointer.cc b/dom/media/webspeech/recognition/endpointer.cc new file mode 100644 index 0000000000..2347043d4b --- /dev/null +++ b/dom/media/webspeech/recognition/endpointer.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "endpointer.h" + +#include "AudioSegment.h" + +namespace { +const int kFrameRate = 200; // 1 frame = 5ms of audio. 
+} + +namespace mozilla { + +Endpointer::Endpointer(int sample_rate) + : speech_input_possibly_complete_silence_length_us_(-1), + speech_input_complete_silence_length_us_(-1), + audio_frame_time_us_(0), + sample_rate_(sample_rate), + frame_size_(0) { + Reset(); + + frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); + + speech_input_minimum_length_us_ = + static_cast<int64_t>(1.7 * 1000000); + speech_input_complete_silence_length_us_ = + static_cast<int64_t>(0.5 * 1000000); + long_speech_input_complete_silence_length_us_ = -1; + long_speech_length_us_ = -1; + speech_input_possibly_complete_silence_length_us_ = + 1 * 1000000; + + // Set the default configuration for Push To Talk mode. + EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(1000.0f); + ep_config.set_min_decision_threshold(50.0f); + ep_config.set_fast_update_dur(0.2f); + ep_config.set_sample_rate(static_cast<float>(sample_rate)); + ep_config.set_min_fundamental_frequency(57.143f); + ep_config.set_max_fundamental_frequency(400.0f); + ep_config.set_contamination_rejection_period(0.25f); + energy_endpointer_.Init(ep_config); +} + +void Endpointer::Reset() { + old_ep_status_ = EP_PRE_SPEECH; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + speech_previously_detected_ = false; + speech_input_complete_ = false; + audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. 
+ speech_end_time_us_ = -1; + speech_start_time_us_ = -1; +} + +void Endpointer::StartSession() { + Reset(); + energy_endpointer_.StartSession(); +} + +void Endpointer::EndSession() { + energy_endpointer_.EndSession(); +} + +void Endpointer::SetEnvironmentEstimationMode() { + Reset(); + energy_endpointer_.SetEnvironmentEstimationMode(); +} + +void Endpointer::SetUserInputMode() { + energy_endpointer_.SetUserInputMode(); +} + +EpStatus Endpointer::Status(int64_t *time) { + return energy_endpointer_.Status(time); +} + +EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { + MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format"); + const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]); + const int num_samples = raw_audio.mDuration; + EpStatus ep_status = EP_PRE_SPEECH; + + // Process the input data in blocks of frame_size_, dropping any incomplete + // frames at the end (which is ok since typically the caller will be recording + // audio in multiples of our frame size). + int sample_index = 0; + while (sample_index + frame_size_ <= num_samples) { + // Have the endpointer process the frame. + energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, + audio_data + sample_index, + frame_size_, + rms_out); + sample_index += frame_size_; + audio_frame_time_us_ += (frame_size_ * 1000000) / + sample_rate_; + + // Get the status of the endpointer. + int64_t ep_time; + ep_status = energy_endpointer_.Status(&ep_time); + if (old_ep_status_ != ep_status) + fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status); + + // Handle state changes. + if ((EP_SPEECH_PRESENT == ep_status) && + (EP_POSSIBLE_ONSET == old_ep_status_)) { + speech_end_time_us_ = -1; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + // Trigger SpeechInputDidStart event on first detection. 
+ if (false == speech_previously_detected_) { + speech_previously_detected_ = true; + speech_start_time_us_ = ep_time; + } + } + if ((EP_PRE_SPEECH == ep_status) && + (EP_POSSIBLE_OFFSET == old_ep_status_)) { + speech_end_time_us_ = ep_time; + waiting_for_speech_possibly_complete_timeout_ = true; + waiting_for_speech_complete_timeout_ = true; + } + if (ep_time > speech_input_minimum_length_us_) { + // Speech possibly complete timeout. + if ((waiting_for_speech_possibly_complete_timeout_) && + (ep_time - speech_end_time_us_ > + speech_input_possibly_complete_silence_length_us_)) { + waiting_for_speech_possibly_complete_timeout_ = false; + } + if (waiting_for_speech_complete_timeout_) { + // The length of the silence timeout period can be held constant, or it + // can be changed after a fixed amount of time from the beginning of + // speech. + bool has_stepped_silence = + (long_speech_length_us_ > 0) && + (long_speech_input_complete_silence_length_us_ > 0); + int64_t requested_silence_length; + if (has_stepped_silence && + (ep_time - speech_start_time_us_) > long_speech_length_us_) { + requested_silence_length = + long_speech_input_complete_silence_length_us_; + } else { + requested_silence_length = + speech_input_complete_silence_length_us_; + } + + // Speech complete timeout. + if ((ep_time - speech_end_time_us_) > requested_silence_length) { + waiting_for_speech_complete_timeout_ = false; + speech_input_complete_ = true; + } + } + } + old_ep_status_ = ep_status; + } + return ep_status; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/endpointer.h b/dom/media/webspeech/recognition/endpointer.h new file mode 100644 index 0000000000..7879d6b9f3 --- /dev/null +++ b/dom/media/webspeech/recognition/endpointer.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "energy_endpointer.h" + +namespace mozilla { + +struct AudioChunk; + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. 
+// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. This is motivated by the expectation that there +// will be two distinct types of inputs: short search queries and longer +// dictation style input. +// +// Three parameters are used to define the piecewise constant timeout function. +// The timeout length is speech_input_complete_silence_length until +// long_speech_length, when it changes to +// long_speech_input_complete_silence_length. +class Endpointer { + public: + explicit Endpointer(int sample_rate); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Process a segment of audio, which may be more than one frame. + // The status of the last frame will be returned. + EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); + + // Get the status of the endpointer. 
+ EpStatus Status(int64_t *time_us); + + // Get the expected frame size for audio chunks. Audio chunks are expected + // to contain a number of samples that is a multiple of this number, and extra + // samples will be dropped. + int32_t FrameSize() const { + return frame_size_; + } + + // Returns true if the endpointer detected reasonable audio levels above + // background noise which could be user speech, false if not. + bool DidStartReceivingSpeech() const { + return speech_previously_detected_; + } + + bool IsEstimatingEnvironment() const { + return energy_endpointer_.estimating_environment(); + } + + void set_speech_input_complete_silence_length(int64_t time_us) { + speech_input_complete_silence_length_us_ = time_us; + } + + void set_long_speech_input_complete_silence_length(int64_t time_us) { + long_speech_input_complete_silence_length_us_ = time_us; + } + + void set_speech_input_possibly_complete_silence_length(int64_t time_us) { + speech_input_possibly_complete_silence_length_us_ = time_us; + } + + void set_long_speech_length(int64_t time_us) { + long_speech_length_us_ = time_us; + } + + bool speech_input_complete() const { + return speech_input_complete_; + } + + // RMS background noise level in dB. + float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } + + private: + // Reset internal states. Helper method common to initial input utterance + // and following input utterances. + void Reset(); + + // Minimum allowable length of speech input. + int64_t speech_input_minimum_length_us_; + + // The speechInputPossiblyComplete event signals that silence/noise has been + // detected for a *short* amount of time after some speech has been detected. + // This property specifies the time period. + int64_t speech_input_possibly_complete_silence_length_us_; + + // The speechInputComplete event signals that silence/noise has been + // detected for a *long* amount of time after some speech has been detected. 
+ // This property specifies the time period. + int64_t speech_input_complete_silence_length_us_; + + // Same as above, this specifies the required silence period after speech + // detection. This period is used instead of + // speech_input_complete_silence_length_ when the utterance is longer than + // long_speech_length_. This parameter is optional. + int64_t long_speech_input_complete_silence_length_us_; + + // The period of time after which the endpointer should consider + // long_speech_input_complete_silence_length_ as a valid silence period + // instead of speech_input_complete_silence_length_. This parameter is + // optional. + int64_t long_speech_length_us_; + + // First speech onset time, used in determination of speech complete timeout. + int64_t speech_start_time_us_; + + // Most recent end time, used in determination of speech complete timeout. + int64_t speech_end_time_us_; + + int64_t audio_frame_time_us_; + EpStatus old_ep_status_; + bool waiting_for_speech_possibly_complete_timeout_; + bool waiting_for_speech_complete_timeout_; + bool speech_previously_detected_; + bool speech_input_complete_; + EnergyEndpointer energy_endpointer_; + int sample_rate_; + int32_t frame_size_; +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ diff --git a/dom/media/webspeech/recognition/energy_endpointer.cc b/dom/media/webspeech/recognition/energy_endpointer.cc new file mode 100644 index 0000000000..b1c1ee0bcf --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "energy_endpointer.h" + +#include <math.h> + +namespace { + +// Returns the RMS (quadratic mean) of the input signal. +float RMS(const int16_t* samples, int num_samples) { + int64_t ssq_int64_t = 0; + int64_t sum_int64_t = 0; + for (int i = 0; i < num_samples; ++i) { + sum_int64_t += samples[i]; + ssq_int64_t += samples[i] * samples[i]; + } + // now convert to floats. 
+ double sum = static_cast<double>(sum_int64_t); + sum /= num_samples; + double ssq = static_cast<double>(ssq_int64_t); + return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); +} + +int64_t Secs2Usecs(float seconds) { + return static_cast<int64_t>(0.5 + (1.0e6 * seconds)); +} + +float GetDecibel(float value) { + if (value > 1.0e-100) + return 20 * log10(value); + return -2000.0; +} + +} // namespace + +namespace mozilla { + +// Stores threshold-crossing histories for making decisions about the speech +// state. +class EnergyEndpointer::HistoryRing { + public: + HistoryRing() : insertion_index_(0) {} + + // Resets the ring to |size| elements each with state |initial_state| + void SetRing(int size, bool initial_state); + + // Inserts a new entry into the ring and drops the oldest entry. + void Insert(int64_t time_us, bool decision); + + // Returns the time in microseconds of the most recently added entry. + int64_t EndTime() const; + + // Returns the sum of all intervals during which 'decision' is true within + // the time in seconds specified by 'duration'. The returned interval is + // in seconds. + float RingSum(float duration_sec); + + private: + struct DecisionPoint { + int64_t time_us; + bool decision; + }; + + std::vector<DecisionPoint> decision_points_; + int insertion_index_; // Index at which the next item gets added/inserted. 
+ + HistoryRing(const HistoryRing&); + void operator=(const HistoryRing&); +}; + +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { + insertion_index_ = 0; + decision_points_.clear(); + DecisionPoint init = { -1, initial_state }; + decision_points_.resize(size, init); +} + +void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { + decision_points_[insertion_index_].time_us = time_us; + decision_points_[insertion_index_].decision = decision; + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); +} + +int64_t EnergyEndpointer::HistoryRing::EndTime() const { + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + return decision_points_[ind].time_us; +} + +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { + if (decision_points_.empty()) + return 0.0; + + int64_t sum_us = 0; + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + int64_t end_us = decision_points_[ind].time_us; + bool is_on = decision_points_[ind].decision; + int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); + if (start_us < 0) + start_us = 0; + size_t n_summed = 1; // n points ==> (n-1) intervals + while ((decision_points_[ind].time_us > start_us) && + (n_summed < decision_points_.size())) { + --ind; + if (ind < 0) + ind = decision_points_.size() - 1; + if (is_on) + sum_us += end_us - decision_points_[ind].time_us; + is_on = decision_points_[ind].decision; + end_us = decision_points_[ind].time_us; + n_summed++; + } + + return 1.0e-6f * sum_us; // Returns total time that was super threshold. 
+} + +EnergyEndpointer::EnergyEndpointer() + : status_(EP_PRE_SPEECH), + offset_confirm_dur_sec_(0), + endpointer_time_us_(0), + fast_update_frames_(0), + frame_counter_(0), + max_window_dur_(4.0), + sample_rate_(0), + history_(new HistoryRing()), + decision_threshold_(0), + estimating_environment_(false), + noise_level_(0), + rms_adapt_(0), + start_lag_(0), + end_lag_(0), + user_input_start_time_us_(0) { +} + +EnergyEndpointer::~EnergyEndpointer() { +} + +int EnergyEndpointer::TimeToFrame(float time) const { + return static_cast<int32_t>(0.5 + (time / params_.frame_period())); +} + +void EnergyEndpointer::Restart(bool reset_threshold) { + status_ = EP_PRE_SPEECH; + user_input_start_time_us_ = 0; + + if (reset_threshold) { + decision_threshold_ = params_.decision_threshold(); + rms_adapt_ = decision_threshold_; + noise_level_ = params_.decision_threshold() / 2.0f; + frame_counter_ = 0; // Used for rapid initial update of levels. + } + + // Set up the memories to hold the history windows. + history_->SetRing(TimeToFrame(max_window_dur_), false); + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; +} + +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { + params_ = params; + + // Find the longest history interval to be used, and make the ring + // large enough to accommodate that number of frames. NOTE: This + // depends upon ep_frame_period being set correctly in the factory + // that did this instantiation. 
+ max_window_dur_ = params_.onset_window(); + if (params_.speech_on_window() > max_window_dur_) + max_window_dur_ = params_.speech_on_window(); + if (params_.offset_window() > max_window_dur_) + max_window_dur_ = params_.offset_window(); + Restart(true); + + offset_confirm_dur_sec_ = params_.offset_window() - + params_.offset_confirm_dur(); + if (offset_confirm_dur_sec_ < 0.0) + offset_confirm_dur_sec_ = 0.0; + + user_input_start_time_us_ = 0; + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; + // The initial value of the noise and speech levels is inconsequential. + // The level of the first frame will overwrite these values. + noise_level_ = params_.decision_threshold() / 2.0f; + fast_update_frames_ = + static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); + + frame_counter_ = 0; // Used for rapid initial update of levels. + + sample_rate_ = params_.sample_rate(); + start_lag_ = static_cast<int>(sample_rate_ / + params_.max_fundamental_frequency()); + end_lag_ = static_cast<int>(sample_rate_ / + params_.min_fundamental_frequency()); +} + +void EnergyEndpointer::StartSession() { + Restart(true); +} + +void EnergyEndpointer::EndSession() { + status_ = EP_POST_SPEECH; +} + +void EnergyEndpointer::SetEnvironmentEstimationMode() { + Restart(true); + estimating_environment_ = true; +} + +void EnergyEndpointer::SetUserInputMode() { + estimating_environment_ = false; + user_input_start_time_us_ = endpointer_time_us_; +} + +void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, + const int16_t* samples, + int num_samples, + float* rms_out) { + endpointer_time_us_ = time_us; + float rms = RMS(samples, num_samples); + + // Check that this is user input audio vs. pre-input adaptation audio. 
+ // Input audio starts when the user indicates start of input, by e.g. + // pressing push-to-talk. Audio received prior to that is used to update + // noise and speech level estimates. + if (!estimating_environment_) { + bool decision = false; + if ((endpointer_time_us_ - user_input_start_time_us_) < + Secs2Usecs(params_.contamination_rejection_period())) { + decision = false; + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); + } else { + decision = (rms > decision_threshold_); + } + + history_->Insert(endpointer_time_us_, decision); + + switch (status_) { + case EP_PRE_SPEECH: + if (history_->RingSum(params_.onset_window()) > + params_.onset_detect_dur()) { + status_ = EP_POSSIBLE_ONSET; + } + break; + + case EP_POSSIBLE_ONSET: { + float tsum = history_->RingSum(params_.onset_window()); + if (tsum > params_.onset_confirm_dur()) { + status_ = EP_SPEECH_PRESENT; + } else { // If signal is not maintained, drop back to pre-speech. + if (tsum <= params_.onset_detect_dur()) + status_ = EP_PRE_SPEECH; + } + break; + } + + case EP_SPEECH_PRESENT: { + // To induce hysteresis in the state residency, we allow a + // smaller residency time in the on_ring, than was required to + // enter the SPEECH_PRESENT state. + float on_time = history_->RingSum(params_.speech_on_window()); + if (on_time < params_.on_maintain_dur()) + status_ = EP_POSSIBLE_OFFSET; + break; + } + + case EP_POSSIBLE_OFFSET: + if (history_->RingSum(params_.offset_window()) <= + offset_confirm_dur_sec_) { + // Note that this offset time may be beyond the end + // of the input buffer in a real-time system. It will be up + // to the RecognizerSession to decide what to do. + status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. + } else { // If speech picks up again we allow return to SPEECH_PRESENT. 
+ if (history_->RingSum(params_.speech_on_window()) >= + params_.on_maintain_dur()) + status_ = EP_SPEECH_PRESENT; + } + break; + + default: + break; + } + + // If this is a quiet, non-speech region, slowly adapt the detection + // threshold to be about 6dB above the average RMS. + if ((!decision) && (status_ == EP_PRE_SPEECH)) { + decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); + rms_adapt_ = decision_threshold_; + } else { + // If this is in a speech region, adapt the decision threshold to + // be about 10dB below the average RMS. If the noise level is high, + // the threshold is pushed up. + // Adaptation up to a higher level is 5 times faster than decay to + // a lower level. + if ((status_ == EP_SPEECH_PRESENT) && decision) { + if (rms_adapt_ > rms) { + rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); + } else { + rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); + } + float target_threshold = 0.3f * rms_adapt_ + noise_level_; + decision_threshold_ = (.90f * decision_threshold_) + + (0.10f * target_threshold); + } + } + + // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } + + // Update speech and noise levels. + UpdateLevels(rms); + ++frame_counter_; + + if (rms_out) + *rms_out = GetDecibel(rms); +} + +float EnergyEndpointer::GetNoiseLevelDb() const { + return GetDecibel(noise_level_); +} + +void EnergyEndpointer::UpdateLevels(float rms) { + // Update quickly initially. We assume this is noise and that + // speech is 6dB above the noise. + if (frame_counter_ < fast_update_frames_) { + // Alpha increases from 0 to (k-1)/k where k is the number of time + // steps in the initial adaptation period. 
+ float alpha = static_cast<float>(frame_counter_) / + static_cast<float>(fast_update_frames_); + noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); + } else { + // Update Noise level. The noise level adapts quickly downward, but + // slowly upward. The noise_level_ parameter is not currently used + // for threshold adaptation. It is used for UI feedback. + if (noise_level_ < rms) + noise_level_ = (0.999f * noise_level_) + (0.001f * rms); + else + noise_level_ = (0.95f * noise_level_) + (0.05f * rms); + } + if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { + decision_threshold_ = noise_level_ * 2; // 6dB above noise level. + // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } +} + +EpStatus EnergyEndpointer::Status(int64_t* status_time) const { + *status_time = history_->EndTime(); + return status_; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/energy_endpointer.h b/dom/media/webspeech/recognition/energy_endpointer.h new file mode 100644 index 0000000000..12d3c736e3 --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The EnergyEndpointer class finds likely speech onset and offset points. +// +// The implementation described here is about the simplest possible. +// It is based on timings of threshold crossings for overall signal +// RMS. It is suitable for light weight applications. +// +// As written, the basic idea is that one specifies intervals that +// must be occupied by super- and sub-threshold energy levels, and +// defers decisions re onset and offset times until these +// specifications have been met. Three basic intervals are tested: an +// onset window, a speech-on window, and an offset window. We require +// super-threshold to exceed some minimum total durations in the onset +// and speech-on windows before declaring the speech onset time, and +// we specify a required sub-threshold residency in the offset window +// before declaring speech offset. 
As the various residency requirements are +// met, the EnergyEndpointer instance assumes various states, and can return the +// ID of these states to the client (see EpStatus below). +// +// The levels of the speech and background noise are continuously updated. It is +// important that the background noise level be estimated initially for +// robustness in noisy conditions. The first frames are assumed to be background +// noise and a fast update rate is used for the noise level. The duration for +// fast update is controlled by the fast_update_dur_ parameter. +// +// If used in noisy conditions, the endpointer should be started and run in the +// EnvironmentEstimation mode, for at least 200ms, before switching to +// UserInputMode. +// Audio feedback contamination can appear in the input audio, if not cut +// out or handled by echo cancellation. Audio feedback can trigger a false +// accept. The false accepts can be ignored by setting +// ep_contamination_rejection_period. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ + +#include <vector> + +#include "mozilla/UniquePtr.h" + +#include "energy_endpointer_params.h" + +namespace mozilla { + +// Endpointer status codes +enum EpStatus { + EP_PRE_SPEECH = 10, + EP_POSSIBLE_ONSET, + EP_SPEECH_PRESENT, + EP_POSSIBLE_OFFSET, + EP_POST_SPEECH, +}; + +class EnergyEndpointer { + public: + // The default construction MUST be followed by Init(), before any + // other use can be made of the instance. + EnergyEndpointer(); + virtual ~EnergyEndpointer(); + + void Init(const EnergyEndpointerParams& params); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. 
This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Computes the next input frame and modifies EnergyEndpointer status as + // appropriate based on the computation. + void ProcessAudioFrame(int64_t time_us, + const int16_t* samples, int num_samples, + float* rms_out); + + // Returns the current state of the EnergyEndpointer and the time + // corresponding to the most recently computed frame. + EpStatus Status(int64_t* status_time_us) const; + + bool estimating_environment() const { + return estimating_environment_; + } + + // Returns estimated noise level in dB. + float GetNoiseLevelDb() const; + + private: + class HistoryRing; + + // Resets the endpointer internal state. If reset_threshold is true, the + // state will be reset completely, including adaptive thresholds and the + // removal of all history information. + void Restart(bool reset_threshold); + + // Update internal speech and noise levels. + void UpdateLevels(float rms); + + // Returns the number of frames (or frame number) corresponding to + // the 'time' (in seconds). + int TimeToFrame(float time) const; + + EpStatus status_; // The current state of this instance. + float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH + int64_t endpointer_time_us_; // Time of the most recently received audio frame. + int64_t fast_update_frames_; // Number of frames for initial level adaptation. + int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. + float max_window_dur_; // Largest search window size (seconds) + float sample_rate_; // Sampling rate. + + // Ring buffers to hold the speech activity history. + UniquePtr<HistoryRing> history_; + + // Configuration parameters. + EnergyEndpointerParams params_; + + // RMS which must be exceeded to conclude frame is speech. 
+ float decision_threshold_; + + // Flag to indicate that audio should be used to estimate environment, prior + // to receiving user input. + bool estimating_environment_; + + // Estimate of the background noise level. Used externally for UI feedback. + float noise_level_; + + // An adaptive threshold used to update decision_threshold_ when appropriate. + float rms_adapt_; + + // Start lag corresponds to the highest fundamental frequency. + int start_lag_; + + // End lag corresponds to the lowest fundamental frequency. + int end_lag_; + + // Time when mode switched from environment estimation to user input. This + // is used to time forced rejection of audio feedback contamination. + int64_t user_input_start_time_us_; + + // prevent copy constructor and assignment + EnergyEndpointer(const EnergyEndpointer&); + void operator=(const EnergyEndpointer&); +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.cc b/dom/media/webspeech/recognition/energy_endpointer_params.cc new file mode 100644 index 0000000000..cac4f1b238 --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer_params.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "energy_endpointer_params.h" + +namespace mozilla { + +EnergyEndpointerParams::EnergyEndpointerParams() { + SetDefaults(); +} + +void EnergyEndpointerParams::SetDefaults() { + frame_period_ = 0.01f; + frame_duration_ = 0.01f; + endpoint_margin_ = 0.2f; + onset_window_ = 0.15f; + speech_on_window_ = 0.4f; + offset_window_ = 0.15f; + onset_detect_dur_ = 0.09f; + onset_confirm_dur_ = 0.075f; + on_maintain_dur_ = 0.10f; + offset_confirm_dur_ = 0.12f; + decision_threshold_ = 150.0f; + min_decision_threshold_ = 50.0f; + fast_update_dur_ = 0.2f; + sample_rate_ = 8000.0f; + min_fundamental_frequency_ = 57.143f; + max_fundamental_frequency_ = 400.0f; + contamination_rejection_period_ = 0.25f; +} + +void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { + frame_period_ = source.frame_period(); + frame_duration_ = source.frame_duration(); + endpoint_margin_ = source.endpoint_margin(); + onset_window_ = source.onset_window(); + speech_on_window_ = source.speech_on_window(); + offset_window_ = 
source.offset_window(); + onset_detect_dur_ = source.onset_detect_dur(); + onset_confirm_dur_ = source.onset_confirm_dur(); + on_maintain_dur_ = source.on_maintain_dur(); + offset_confirm_dur_ = source.offset_confirm_dur(); + decision_threshold_ = source.decision_threshold(); + min_decision_threshold_ = source.min_decision_threshold(); + fast_update_dur_ = source.fast_update_dur(); + sample_rate_ = source.sample_rate(); + min_fundamental_frequency_ = source.min_fundamental_frequency(); + max_fundamental_frequency_ = source.max_fundamental_frequency(); + contamination_rejection_period_ = source.contamination_rejection_period(); +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.h b/dom/media/webspeech/recognition/energy_endpointer_params.h new file mode 100644 index 0000000000..6437c6dc0f --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer_params.h @@ -0,0 +1,159 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ + +namespace mozilla { + +// Input parameters for the EnergyEndpointer class. +class EnergyEndpointerParams { + public: + EnergyEndpointerParams(); + + void SetDefaults(); + + void operator=(const EnergyEndpointerParams& source); + + // Accessors and mutators + float frame_period() const { return frame_period_; } + void set_frame_period(float frame_period) { + frame_period_ = frame_period; + } + + float frame_duration() const { return frame_duration_; } + void set_frame_duration(float frame_duration) { + frame_duration_ = frame_duration; + } + + float endpoint_margin() const { return endpoint_margin_; } + void set_endpoint_margin(float endpoint_margin) { + endpoint_margin_ = endpoint_margin; + } + + float onset_window() const { return onset_window_; } + void set_onset_window(float onset_window) { onset_window_ = onset_window; } + + float speech_on_window() const { return speech_on_window_; } + void set_speech_on_window(float speech_on_window) { + speech_on_window_ = speech_on_window; + } + + float offset_window() const { return offset_window_; } + void set_offset_window(float offset_window) { + offset_window_ = offset_window; + } + + float onset_detect_dur() const { return onset_detect_dur_; } + void set_onset_detect_dur(float onset_detect_dur) { + onset_detect_dur_ = onset_detect_dur; + } + 
+ float onset_confirm_dur() const { return onset_confirm_dur_; } + void set_onset_confirm_dur(float onset_confirm_dur) { + onset_confirm_dur_ = onset_confirm_dur; + } + + float on_maintain_dur() const { return on_maintain_dur_; } + void set_on_maintain_dur(float on_maintain_dur) { + on_maintain_dur_ = on_maintain_dur; + } + + float offset_confirm_dur() const { return offset_confirm_dur_; } + void set_offset_confirm_dur(float offset_confirm_dur) { + offset_confirm_dur_ = offset_confirm_dur; + } + + float decision_threshold() const { return decision_threshold_; } + void set_decision_threshold(float decision_threshold) { + decision_threshold_ = decision_threshold; + } + + float min_decision_threshold() const { return min_decision_threshold_; } + void set_min_decision_threshold(float min_decision_threshold) { + min_decision_threshold_ = min_decision_threshold; + } + + float fast_update_dur() const { return fast_update_dur_; } + void set_fast_update_dur(float fast_update_dur) { + fast_update_dur_ = fast_update_dur; + } + + float sample_rate() const { return sample_rate_; } + void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } + + float min_fundamental_frequency() const { return min_fundamental_frequency_; } + void set_min_fundamental_frequency(float min_fundamental_frequency) { + min_fundamental_frequency_ = min_fundamental_frequency; + } + + float max_fundamental_frequency() const { return max_fundamental_frequency_; } + void set_max_fundamental_frequency(float max_fundamental_frequency) { + max_fundamental_frequency_ = max_fundamental_frequency; + } + + float contamination_rejection_period() const { + return contamination_rejection_period_; + } + void set_contamination_rejection_period( + float contamination_rejection_period) { + contamination_rejection_period_ = contamination_rejection_period; + } + + private: + float frame_period_; // Frame period + float frame_duration_; // Window size + float onset_window_; // Interval scanned for onset 
activity + float speech_on_window_; // Interval scanned for ongoing speech + float offset_window_; // Interval scanned for offset evidence + float offset_confirm_dur_; // Silence duration required to confirm offset + float decision_threshold_; // Initial rms detection threshold + float min_decision_threshold_; // Minimum rms detection threshold + float fast_update_dur_; // Period for initial estimation of levels. + float sample_rate_; // Expected sample rate. + + // Time to add on either side of endpoint threshold crossings + float endpoint_margin_; + // Total dur within onset_window required to enter ONSET state + float onset_detect_dur_; + // Total on time within onset_window required to enter SPEECH_ON state + float onset_confirm_dur_; + // Minimum dur in SPEECH_ON state required to maintain ON state + float on_maintain_dur_; + // Minimum fundamental frequency for autocorrelation. + float min_fundamental_frequency_; + // Maximum fundamental frequency for autocorrelation. + float max_fundamental_frequency_; + // Period after start of user input that above threshold values are ignored. + // This is to reject audio feedback contamination. + float contamination_rejection_period_; +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/dom/media/webspeech/recognition/moz.build b/dom/media/webspeech/recognition/moz.build new file mode 100644 index 0000000000..89de4fcbbe --- /dev/null +++ b/dom/media/webspeech/recognition/moz.build @@ -0,0 +1,64 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +MOCHITEST_MANIFESTS += ["test/mochitest.toml"] + +XPIDL_MODULE = "dom_webspeechrecognition" + +XPIDL_SOURCES = ["nsISpeechRecognitionService.idl"] + +EXPORTS.mozilla.dom += [ + "OnlineSpeechRecognitionService.h", + "SpeechGrammar.h", + "SpeechGrammarList.h", + "SpeechRecognition.h", + "SpeechRecognitionAlternative.h", + "SpeechRecognitionResult.h", + "SpeechRecognitionResultList.h", + "SpeechTrackListener.h", +] + +EXPORTS += [ + "endpointer.h", + "energy_endpointer.h", + "energy_endpointer_params.h", +] + +if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]: + EXPORTS.mozilla.dom += [ + "test/FakeSpeechRecognitionService.h", + ] + +UNIFIED_SOURCES += [ + "endpointer.cc", + "energy_endpointer.cc", + "energy_endpointer_params.cc", + "OnlineSpeechRecognitionService.cpp", + "SpeechGrammar.cpp", + "SpeechGrammarList.cpp", + "SpeechRecognition.cpp", + "SpeechRecognitionAlternative.cpp", + "SpeechRecognitionResult.cpp", + "SpeechRecognitionResultList.cpp", + "SpeechTrackListener.cpp", +] + +if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]: + UNIFIED_SOURCES += [ + "test/FakeSpeechRecognitionService.cpp", + ] + +USE_LIBS += [ + "jsoncpp", +] + +LOCAL_INCLUDES += [ + "/dom/base", + "/toolkit/components/jsoncpp/include", +] + +include("/ipc/chromium/chromium-config.mozbuild") + +FINAL_LIBRARY = "xul" diff --git a/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl new file mode 100644 index 0000000000..a43d277da0 --- /dev/null +++ b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsISupports.idl" + +%{C++ +#include "mozilla/WeakPtr.h" + +namespace mozilla { +class AudioSegment; +namespace dom { +class SpeechRecognition; +class SpeechRecognitionResultList; +class SpeechGrammarList; +class SpeechGrammar; +} +} +%} + +native SpeechRecognitionWeakPtr(mozilla::WeakPtr<mozilla::dom::SpeechRecognition>); +[ptr] native AudioSegmentPtr(mozilla::AudioSegment); +[ptr] native SpeechGrammarPtr(mozilla::dom::SpeechGrammar); +[ptr] native SpeechGrammarListPtr(mozilla::dom::SpeechGrammarList); + +[uuid(6fcb6ee8-a6db-49ba-9f06-355d7ee18ea7)] +interface nsISpeechGrammarCompilationCallback : nsISupports { + void grammarCompilationEnd(in SpeechGrammarPtr grammarObject, in boolean success); +}; + +[uuid(8e97f287-f322-44e8-8888-8344fa408ef8)] +interface nsISpeechRecognitionService : nsISupports { + void initialize(in SpeechRecognitionWeakPtr aSpeechRecognition); + void processAudioSegment(in AudioSegmentPtr aAudioSegment, in long aSampleRate); + void validateAndSetGrammarList(in SpeechGrammarPtr aSpeechGrammar, in nsISpeechGrammarCompilationCallback aCallback); + void soundEnd(); + void abort(); +}; + +%{C++ +#define NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "@mozilla.org/webspeech/service;1?name=" +%} diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp new file mode 100644 index 0000000000..cf14cb3750 --- /dev/null +++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp @@ -0,0 +1,126 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsThreadUtils.h" + +#include "FakeSpeechRecognitionService.h" + +#include "SpeechRecognition.h" +#include "SpeechRecognitionAlternative.h" +#include "SpeechRecognitionResult.h" +#include "SpeechRecognitionResultList.h" +#include "nsIObserverService.h" +#include "mozilla/Services.h" +#include "mozilla/StaticPrefs_media.h" + +namespace mozilla { + +using namespace dom; + +NS_IMPL_ISUPPORTS(FakeSpeechRecognitionService, nsISpeechRecognitionService, + nsIObserver) + +FakeSpeechRecognitionService::FakeSpeechRecognitionService() = default; + +FakeSpeechRecognitionService::~FakeSpeechRecognitionService() = default; + +NS_IMETHODIMP +FakeSpeechRecognitionService::Initialize( + WeakPtr<SpeechRecognition> aSpeechRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + mRecognition = aSpeechRecognition; + nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, + int32_t aSampleRate) { + MOZ_ASSERT(!NS_IsMainThread()); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::SoundEnd() { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::ValidateAndSetGrammarList( + mozilla::dom::SpeechGrammar*, nsISpeechGrammarCompilationCallback*) { + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::Abort() { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +// Test hook: SPEECH_RECOGNITION_TEST_END_TOPIC unregisters our observers; any +// other notification names (in aData) the recognition-service event to fake. +NS_IMETHODIMP +FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(StaticPrefs::media_webspeech_test_fake_recognition_service(), + "Got request to fake recognition service event, but " + "media.webspeech.test.fake_recognition_service is not set"); + + if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { + 
nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); + + return NS_OK; + } + + if (!mRecognition) { + // mRecognition is a WeakPtr: if the SpeechRecognition is already gone there + // is nobody to deliver the fake event to, so bail out instead of crashing. + return NS_OK; + } + + const nsDependentString eventName = nsDependentString(aData); + + if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_ERROR")) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, // TODO different codes? + "RECOGNITIONSERVICE_ERROR test event"); + + } else if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_FINAL_RESULT")) { + RefPtr<SpeechEvent> event = new SpeechEvent( + mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); + + event->mRecognitionResultList = BuildMockResultList(); + NS_DispatchToMainThread(event); + } + return NS_OK; +} + +SpeechRecognitionResultList* +FakeSpeechRecognitionService::BuildMockResultList() { + SpeechRecognitionResultList* resultList = + new SpeechRecognitionResultList(mRecognition); + SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); + if (0 < mRecognition->MaxAlternatives()) { + SpeechRecognitionAlternative* alternative = + new SpeechRecognitionAlternative(mRecognition); + + alternative->mTranscript = u"Mock final result"_ns; + alternative->mConfidence = 0.0f; + + result->mItems.AppendElement(alternative); + } + resultList->mItems.AppendElement(result); + + return resultList; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h new file mode 100644 index 0000000000..69e2786b76 --- /dev/null +++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h @@ -0,0 +1,40 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the 
Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_FakeSpeechRecognitionService_h +#define mozilla_dom_FakeSpeechRecognitionService_h + +#include "nsCOMPtr.h" +#include "nsIObserver.h" +#include "nsISpeechRecognitionService.h" + +#define NS_FAKE_SPEECH_RECOGNITION_SERVICE_CID \ + {0x48c345e7, \ + 0x9929, \ + 0x4f9a, \ + {0xa5, 0x63, 0xf4, 0x78, 0x22, 0x2d, 0xab, 0xcd}}; + +namespace mozilla { + +class FakeSpeechRecognitionService : public nsISpeechRecognitionService, + public nsIObserver { + public: + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSISPEECHRECOGNITIONSERVICE + NS_DECL_NSIOBSERVER + + FakeSpeechRecognitionService(); + + private: + virtual ~FakeSpeechRecognitionService(); + + WeakPtr<dom::SpeechRecognition> mRecognition; + dom::SpeechRecognitionResultList* BuildMockResultList(); +}; + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/test/head.js b/dom/media/webspeech/recognition/test/head.js new file mode 100644 index 0000000000..c77a7ee926 --- /dev/null +++ b/dom/media/webspeech/recognition/test/head.js @@ -0,0 +1,200 @@ +"use strict"; + +const DEFAULT_AUDIO_SAMPLE_FILE = "hello.ogg"; +const SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC = + "SpeechRecognitionTest:RequestEvent"; +const SPEECH_RECOGNITION_TEST_END_TOPIC = "SpeechRecognitionTest:End"; + +var errorCodes = { + NO_SPEECH: "no-speech", + ABORTED: "aborted", + AUDIO_CAPTURE: "audio-capture", + NETWORK: "network", + NOT_ALLOWED: "not-allowed", + SERVICE_NOT_ALLOWED: "service-not-allowed", + BAD_GRAMMAR: "bad-grammar", + LANGUAGE_NOT_SUPPORTED: "language-not-supported", +}; + +var Services = SpecialPowers.Services; + +function EventManager(sr) { + var self = this; + var nEventsExpected = 0; + self.eventsReceived = []; + + var allEvents = [ + "audiostart", + "soundstart", + "speechstart", + "speechend", + "soundend", + "audioend", + "result", + 
"nomatch", + "error", + "start", + "end", + ]; + + var eventDependencies = { + speechend: "speechstart", + soundend: "soundstart", + audioend: "audiostart", + }; + + var isDone = false; + + // set up grammar + var sgl = new SpeechGrammarList(); + sgl.addFromString("#JSGF V1.0; grammar test; public <simple> = hello ;", 1); + sr.grammars = sgl; + + // AUDIO_DATA events are asynchronous, + // so we queue events requested while they are being + // issued to make them seem synchronous + var isSendingAudioData = false; + var queuedEventRequests = []; + + // register default handlers + for (var i = 0; i < allEvents.length; i++) { + (function (eventName) { + sr["on" + eventName] = function (evt) { + var message = "unexpected event: " + eventName; + if (eventName == "error") { + message += " -- " + evt.message; + } + + ok(false, message); + if (self.doneFunc && !isDone) { + isDone = true; + self.doneFunc(); + } + }; + })(allEvents[i]); + } + + self.expect = function EventManager_expect(eventName, cb) { + nEventsExpected++; + + sr["on" + eventName] = function (evt) { + self.eventsReceived.push(eventName); + ok(true, "received event " + eventName); + + var dep = eventDependencies[eventName]; + if (dep) { + ok( + self.eventsReceived.includes(dep), + eventName + " must come after " + dep + ); + } + + cb && cb(evt, sr); + if ( + self.doneFunc && + !isDone && + nEventsExpected === self.eventsReceived.length + ) { + isDone = true; + self.doneFunc(); + } + }; + }; + + self.start = function EventManager_start() { + isSendingAudioData = true; + var audioTag = document.createElement("audio"); + audioTag.src = self.audioSampleFile; + + var stream = audioTag.mozCaptureStreamUntilEnded(); + audioTag.addEventListener("ended", function () { + info("Sample stream ended, requesting queued events"); + isSendingAudioData = false; + while (queuedEventRequests.length) { + self.requestFSMEvent(queuedEventRequests.shift()); + } + }); + + audioTag.play(); + sr.start(stream); + }; + + 
self.requestFSMEvent = function EventManager_requestFSMEvent(eventName) { + if (isSendingAudioData) { + info( + "Queuing event " + eventName + " until we're done sending audio data" + ); + queuedEventRequests.push(eventName); + return; + } + + info("requesting " + eventName); + Services.obs.notifyObservers( + null, + SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC, + eventName + ); + }; + + self.requestTestEnd = function EventManager_requestTestEnd() { + Services.obs.notifyObservers(null, SPEECH_RECOGNITION_TEST_END_TOPIC); + }; +} + +function buildResultCallback(transcript) { + return function (evt) { + is(evt.results[0][0].transcript, transcript, "expect correct transcript"); + }; +} + +function buildErrorCallback(errcode) { + return function (err) { + is(err.error, errcode, "expect correct error code"); + }; +} + +function performTest(options) { + var prefs = options.prefs; + + prefs.unshift( + ["media.webspeech.recognition.enable", true], + ["media.webspeech.test.enable", true] + ); + + SpecialPowers.pushPrefEnv({ set: prefs }, function () { + var sr; + if (!options.webkit) { + sr = new SpeechRecognition(); + } else { + sr = new webkitSpeechRecognition(); + var grammar = new webkitSpeechGrammar(); + var speechrecognitionlist = new webkitSpeechGrammarList(); + speechrecognitionlist.addFromString("", 1); + sr.grammars = speechrecognitionlist; + } + var em = new EventManager(sr); + + for (var eventName in options.expectedEvents) { + var cb = options.expectedEvents[eventName]; + em.expect(eventName, cb); + } + + em.doneFunc = function () { + em.requestTestEnd(); + if (options.doneFunc) { + options.doneFunc(); + } + }; + + em.audioSampleFile = DEFAULT_AUDIO_SAMPLE_FILE; + if (options.audioSampleFile) { + em.audioSampleFile = options.audioSampleFile; + } + + em.start(); + + for (var i = 0; i < options.eventsToRequest.length; i++) { + em.requestFSMEvent(options.eventsToRequest[i]); + } + }); +} diff --git a/dom/media/webspeech/recognition/test/hello.ogg 
b/dom/media/webspeech/recognition/test/hello.ogg Binary files differnew file mode 100644 index 0000000000..7a80926065 --- /dev/null +++ b/dom/media/webspeech/recognition/test/hello.ogg diff --git a/dom/media/webspeech/recognition/test/hello.ogg^headers^ b/dom/media/webspeech/recognition/test/hello.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/hello.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: no-store diff --git a/dom/media/webspeech/recognition/test/http_requesthandler.sjs b/dom/media/webspeech/recognition/test/http_requesthandler.sjs new file mode 100644 index 0000000000..e79662b850 --- /dev/null +++ b/dom/media/webspeech/recognition/test/http_requesthandler.sjs @@ -0,0 +1,87 @@ +const CC = Components.Constructor; + +// Context structure - we need to set this up properly to pass to setObjectState +const ctx = { + QueryInterface(iid) { + if (iid.equals(Ci.nsISupports)) { + return this; + } + throw Components.Exception("", Cr.NS_ERROR_NO_INTERFACE); + }, +}; + +function setRequest(request) { + setObjectState(key, request); +} +function getRequest() { + let request; + getObjectState(v => { + request = v; + }); + return request; +} + +function handleRequest(request, response) { + response.processAsync(); + if (request.queryString == "save") { + // Get the context structure and finish the old request + getObjectState("context", function (obj) { + savedCtx = obj.wrappedJSObject; + request = savedCtx.request; + + response.setHeader("Content-Type", "application/octet-stream", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + + const input = request.bodyInputStream; + const output = response.bodyOutputStream; + let bodyAvail; + while ((bodyAvail = input.available()) > 0) { + output.writeFrom(input, bodyAvail); + } + response.finish(); + }); + return; + } + + if ( 
+ request.queryString == "malformedresult=1" || + request.queryString == "emptyresult=1" + ) { + jsonOK = + request.queryString == "malformedresult=1" + ? '{"status":"ok","dat' + : '{"status":"ok","data":[]}'; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } else if (request.queryString == "hangup=1") { + response.finish(); + } else if (request.queryString == "return400=1") { + jsonOK = "{'message':'Bad header:accept-language-stt'}"; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 400, "Bad Request"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } else { + ctx.wrappedJSObject = ctx; + ctx.request = request; + setObjectState("context", ctx); + jsonOK = '{"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}'; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } +} diff --git a/dom/media/webspeech/recognition/test/mochitest.toml b/dom/media/webspeech/recognition/test/mochitest.toml new file mode 100644 index 0000000000..5b127db7f3 --- /dev/null +++ b/dom/media/webspeech/recognition/test/mochitest.toml @@ -0,0 +1,44 @@ +[DEFAULT] +tags = 
"mtg" +subsuite = "media" +support-files = [ + "head.js", + "hello.ogg", + "hello.ogg^headers^", + "http_requesthandler.sjs", + "sinoid+hello.ogg", + "sinoid+hello.ogg^headers^", + "silence.ogg", + "silence.ogg^headers^", +] + +["test_abort.html"] + +["test_audio_capture_error.html"] + +["test_call_start_from_end_handler.html"] +tags = "capturestream" + +["test_nested_eventloop.html"] +skip-if = ["os == 'android'"] + +["test_online_400_response.html"] + +["test_online_empty_result_handling.html"] + +["test_online_hangup.html"] + +["test_online_http.html"] + +["test_online_http_webkit.html"] + +["test_online_malformed_result_handling.html"] + +["test_preference_enable.html"] + +["test_recognition_service_error.html"] + +["test_success_without_recognition_service.html"] + +["test_timeout.html"] +skip-if = ["os == 'linux'"] # Bug 1307991 - low frequency on try pushes diff --git a/dom/media/webspeech/recognition/test/silence.ogg b/dom/media/webspeech/recognition/test/silence.ogg Binary files differnew file mode 100644 index 0000000000..e6da3a5022 --- /dev/null +++ b/dom/media/webspeech/recognition/test/silence.ogg diff --git a/dom/media/webspeech/recognition/test/silence.ogg^headers^ b/dom/media/webspeech/recognition/test/silence.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/silence.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: no-store diff --git a/dom/media/webspeech/recognition/test/sinoid+hello.ogg b/dom/media/webspeech/recognition/test/sinoid+hello.ogg Binary files differnew file mode 100644 index 0000000000..7092e82f30 --- /dev/null +++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg diff --git a/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: 
no-store diff --git a/dom/media/webspeech/recognition/test/test_abort.html b/dom/media/webspeech/recognition/test/test_abort.html new file mode 100644 index 0000000000..0f22770cc7 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_abort.html @@ -0,0 +1,73 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Call abort from inside handlers</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + // Abort inside event handlers, shouldn't get a + // result after that + + var nextEventIdx = 0; + var eventsToAbortOn = [ + "start", + "audiostart", + "speechstart", + "speechend", + "audioend" + ]; + + function doNextTest() { + var nextEvent = eventsToAbortOn[nextEventIdx]; + var expectedEvents = { + "start": null, + "audiostart": null, + "audioend": null, + "end": null + }; + + if (nextEventIdx >= eventsToAbortOn.indexOf("speechstart")) { + expectedEvents.speechstart = null; + } + + if (nextEventIdx >= eventsToAbortOn.indexOf("speechend")) { + expectedEvents.speechend = null; + } + + info("Aborting on " + nextEvent); + expectedEvents[nextEvent] = function(evt, sr) { + sr.abort(); + }; + + nextEventIdx++; + + performTest({ + eventsToRequest: [], + expectedEvents, + doneFunc: (nextEventIdx < eventsToAbortOn.length) ? 
doNextTest : SimpleTest.finish, + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); + } + + doNextTest(); +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_audio_capture_error.html b/dom/media/webspeech/recognition/test/test_audio_capture_error.html new file mode 100644 index 0000000000..0c054dbf0b --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_audio_capture_error.html @@ -0,0 +1,42 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Behavior on audio error</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: ['EVENT_AUDIO_ERROR'], + expectedEvents: { + 'start': null, + 'audiostart': null, + 'speechstart': null, + 'speechend': null, + 'audioend': null, + 'error': buildErrorCallback(errorCodes.AUDIO_CAPTURE), + 'end': null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html new file mode 100644 index 0000000000..895648ad9e --- /dev/null +++ 
b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html @@ -0,0 +1,102 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Restart recognition from end handler</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + function createAudioStream() { + var audioTag = document.createElement("audio"); + audioTag.src = DEFAULT_AUDIO_SAMPLE_FILE; + + var stream = audioTag.mozCaptureStreamUntilEnded(); + audioTag.play(); + + return stream; + } + + var done = false; + function endHandler(evt, sr) { + if (done) { + SimpleTest.finish(); + return; + } + + try { + var stream = createAudioStream(); + sr.start(stream); // shouldn't fail + } catch (err) { + ok(false, "Failed to start() from end() callback"); + } + + // calling start() may cause some callbacks to fire, but we're + // no longer interested in them, except for onend, which is where + // we'll conclude the test. + sr.onstart = null; + sr.onaudiostart = null; + sr.onspeechstart = null; + sr.onspeechend = null; + sr.onaudioend = null; + sr.onresult = null; + + // FIXME(ggp) the state transition caused by start() is async, + // but abort() is sync (see bug 1055093). until we normalize + // state transitions, we need to setTimeout here to make sure + // abort() finds the speech recognition object in the correct + // state (namely, STATE_STARTING). 
+ setTimeout(function() { + sr.abort(); + done = true; + }); + + info("Successfully start() from end() callback"); + } + + function expectExceptionHandler(evt, sr) { + try { + sr.start(createAudioStream()); + } catch (err) { + is(err.name, "InvalidStateError"); + return; + } + + ok(false, "Calling start() didn't raise InvalidStateError"); + } + + performTest({ + eventsToRequest: [ + 'EVENT_RECOGNITIONSERVICE_FINAL_RESULT' + ], + expectedEvents: { + 'start': expectExceptionHandler, + 'audiostart': expectExceptionHandler, + 'speechstart': expectExceptionHandler, + 'speechend': expectExceptionHandler, + 'audioend': expectExceptionHandler, + 'result': buildResultCallback("Mock final result"), + 'end': endHandler, + }, + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_nested_eventloop.html b/dom/media/webspeech/recognition/test/test_nested_eventloop.html new file mode 100644 index 0000000000..4924766b44 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_nested_eventloop.html @@ -0,0 +1,82 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Spin the event loop from inside a callback</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + /* + * SpecialPowers.spinEventLoop can be used to spin the event loop, causing 
+ * queued SpeechEvents (such as those created by calls to start(), stop() + * or abort()) to be processed immediately. + * When this is done from inside DOM event handlers, it is possible to + * cause reentrancy in our C++ code, which we should be able to withstand. + */ + function abortAndSpinEventLoop(evt, sr) { + sr.abort(); + SpecialPowers.spinEventLoop(window); + } + function doneFunc() { + // Trigger gc now and wait some time to make sure this test gets the blame + // for any assertions caused by spinning the event loop. + // + // NB - The assertions should be gone, but this looks too scary to touch + // during batch cleanup. + var count = 0, GC_COUNT = 4; + + function triggerGCOrFinish() { + SpecialPowers.gc(); + count++; + + if (count == GC_COUNT) { + SimpleTest.finish(); + } + } + + for (var i = 0; i < GC_COUNT; i++) { + setTimeout(triggerGCOrFinish, 0); + } + } + + /* + * We start by performing a normal start, then abort from the audiostart + * callback and force the EVENT_ABORT to be processed while still inside + * the event handler. This causes the recording to stop, which raises + * the audioend and (later on) end events. + * Then, we abort (once again spinning the event loop) from the audioend + * handler, attempting to cause a re-entry into the abort code. This second + * call should be ignored, and we get the end callback and finish. 
+ */ + + performTest({ + eventsToRequest: [], + expectedEvents: { + "audiostart": abortAndSpinEventLoop, + "audioend": abortAndSpinEventLoop, + "end": null + }, + doneFunc, + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_400_response.html b/dom/media/webspeech/recognition/test/test_online_400_response.html new file mode 100644 index 0000000000..1a7d0ed452 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_400_response.html @@ -0,0 +1,47 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test the speech recognition service behavior +whenever the server returns a 400 error +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [], + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + 'error': buildErrorCallback(errorCodes.NETWORK), + "speechstart": null, + "speechend": null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + 
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?return400=1"], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html new file mode 100644 index 0000000000..46f1e7e0cb --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html @@ -0,0 +1,48 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test the speech recognition service behavior +whenever the server returns a valid json object, but without any transcription +results on it, for example: `{"status":"ok","data":[]}` +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [], + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + 'error': buildErrorCallback(errorCodes.NETWORK), + "speechstart": null, + "speechend": null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?emptyresult=1"], + 
["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_hangup.html b/dom/media/webspeech/recognition/test/test_online_hangup.html new file mode 100644 index 0000000000..4a46f80f8f --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_hangup.html @@ -0,0 +1,47 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test the speech recognition service behavior +whenever the server hangs up the connection without sending any response +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [], + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + 'error': buildErrorCallback(errorCodes.NETWORK), + "speechstart": null, + "speechend": null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?hangup=1"], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_http.html b/dom/media/webspeech/recognition/test/test_online_http.html new file mode 100644 
index 0000000000..644a9a098b --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_http.html @@ -0,0 +1,87 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test a successful speech recognition request and +that audio is being properly encoded +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + async function validateRawAudio(buffer) { + const ac = new AudioContext(); + const decodedData = await ac.decodeAudioData(buffer); + const source = ac.createBufferSource(); + source.buffer = decodedData; + source.loop = true; + const analyser = ac.createAnalyser(); + analyser.smoothingTimeConstant = 0.2; + analyser.fftSize = 1024; + source.connect(analyser); + const binIndexForFrequency = frequency => + 1 + Math.round(frequency * analyser.fftSize / ac.sampleRate); + source.start(); + const data = new Uint8Array(analyser.frequencyBinCount); + const start = performance.now(); + while (true) { + if (performance.now() - start > 10000) { + return false; + } + analyser.getByteFrequencyData(data); + if (data[binIndexForFrequency(200)] < 50 && + data[binIndexForFrequency(440)] > 180 && + data[binIndexForFrequency(1000)] < 50) { + return true; + } + await new Promise(r => requestAnimationFrame(r)); + } + } + + async function verifyEncodedAudio(requestUrl) { + try { + const response = await fetch(requestUrl); + const buffer = await 
response.arrayBuffer(); + ok(await validateRawAudio(buffer), "Audio encoding is valid"); + } catch(e) { + ok(false, e); + } finally { + SimpleTest.finish(); + } + } + + performTest({ + eventsToRequest: {}, + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + "result": () => verifyEncodedAudio("http_requesthandler.sjs?save"), + "speechstart": null, + "speechend": null + }, + audioSampleFile: "sinoid+hello.ogg", + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"], + ["media.webspeech.recognition.timeout", 100000]] + }); +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_http_webkit.html b/dom/media/webspeech/recognition/test/test_online_http_webkit.html new file mode 100644 index 0000000000..5801697a45 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_http_webkit.html @@ -0,0 +1,88 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test a successful speech recognition request and +that audio is being properly encoded +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + async function validateRawAudio(buffer) { + const ac = new AudioContext(); + const 
decodedData = await ac.decodeAudioData(buffer); + const source = ac.createBufferSource(); + source.buffer = decodedData; + source.loop = true; + const analyser = ac.createAnalyser(); + analyser.smoothingTimeConstant = 0.2; + analyser.fftSize = 1024; + source.connect(analyser); + const binIndexForFrequency = frequency => + 1 + Math.round(frequency * analyser.fftSize / ac.sampleRate); + source.start(); + const data = new Uint8Array(analyser.frequencyBinCount); + const start = performance.now(); + while (true) { + if (performance.now() - start > 10000) { + return false; + } + analyser.getByteFrequencyData(data); + if (data[binIndexForFrequency(200)] < 50 && + data[binIndexForFrequency(440)] > 180 && + data[binIndexForFrequency(1000)] < 50) { + return true; + } + await new Promise(r => requestAnimationFrame(r)); + } + } + + async function verifyEncodedAudio(requestUrl) { + try { + const response = await fetch(requestUrl); + const buffer = await response.arrayBuffer(); + ok(await validateRawAudio(buffer), "Audio encoding is valid"); + } catch(e) { + ok(false, e); + } finally { + SimpleTest.finish(); + } + } + + performTest({ + eventsToRequest: {}, + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + "result": () => verifyEncodedAudio("http_requesthandler.sjs?save"), + "speechstart": null, + "speechend": null + }, + audioSampleFile: "sinoid+hello.ogg", + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"], + ["media.webspeech.recognition.timeout", 100000]], + webkit: true + }); +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html new file mode 100644 index 0000000000..b071a46ea3 --- 
/dev/null +++ b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html @@ -0,0 +1,48 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=1248897 +The intent of this file is to test the speech recognition service behavior +whenever the server returns an invalid/corrupted json object, for example: +`{"status":"ok","dat` +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 1248897 -- Online speech service</title> + <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [], + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "end": null, + 'error': buildErrorCallback(errorCodes.NETWORK), + "speechstart": null, + "speechend": null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.recognition.enable", true], + ["media.webspeech.recognition.force_enable", true], + ["media.webspeech.service.endpoint", + "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?malformedresult=1"], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_preference_enable.html b/dom/media/webspeech/recognition/test/test_preference_enable.html new file mode 100644 index 0000000000..2b56f82e2c --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_preference_enable.html @@ -0,0 +1,43 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> 
+<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- No objects should be visible with preference disabled</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + SpecialPowers.pushPrefEnv({ + set: [["media.webspeech.recognition.enable", false]] + }, function() { + var objects = [ + "SpeechRecognition", + "SpeechGrammar", + "SpeechRecognitionResult", + "SpeechRecognitionResultList", + "SpeechRecognitionAlternative" + ]; + + for (var i = 0; i < objects.length; i++) { + is(window[objects[i]], undefined, + objects[i] + " should be undefined with pref off"); + } + + SimpleTest.finish(); + }); +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_recognition_service_error.html b/dom/media/webspeech/recognition/test/test_recognition_service_error.html new file mode 100644 index 0000000000..e8e59e2afc --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_recognition_service_error.html @@ -0,0 +1,45 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Behavior on recognition service error</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + 
SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [ + 'EVENT_RECOGNITIONSERVICE_ERROR' + ], + expectedEvents: { + 'start': null, + 'audiostart': null, + 'speechstart': null, + 'speechend': null, + 'audioend': null, + 'error': buildErrorCallback(errorCodes.NETWORK), + 'end': null + }, + doneFunc: SimpleTest.finish, + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html new file mode 100644 index 0000000000..38748ed5cb --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html @@ -0,0 +1,45 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Success with fake recognition service</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [ + 'EVENT_RECOGNITIONSERVICE_FINAL_RESULT' + ], + expectedEvents: { + 'start': null, + 'audiostart': null, + 'speechstart': null, + 'speechend': null, + 'audioend': null, + 'result': buildResultCallback("Mock final result"), + 'end': null + }, + doneFunc:SimpleTest.finish, + prefs: [["media.webspeech.test.fake_fsm_events", true], + 
["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 100000]] + }); + +</script> +</pre> +</body> +</html> diff --git a/dom/media/webspeech/recognition/test/test_timeout.html b/dom/media/webspeech/recognition/test/test_timeout.html new file mode 100644 index 0000000000..8334c9e779 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_timeout.html @@ -0,0 +1,42 @@ +<!DOCTYPE HTML> +<html> +<!-- +https://bugzilla.mozilla.org/show_bug.cgi?id=650295 +--> +<head> + <meta charset="utf-8"> + <title>Test for Bug 650295 -- Timeout for user speech</title> + <script src="/tests/SimpleTest/SimpleTest.js"></script> + <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/> + <script type="application/javascript" src="head.js"></script> +</head> +<body> +<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a> +<p id="display"></p> +<div id="content" style="display: none"> + +</div> +<pre id="test"> +<script type="text/javascript"> + SimpleTest.waitForExplicitFinish(); + + performTest({ + eventsToRequest: [], + expectedEvents: { + "start": null, + "audiostart": null, + "audioend": null, + "error": buildErrorCallback(errorCodes.NO_SPEECH), + "end": null + }, + doneFunc: SimpleTest.finish, + audioSampleFile: "silence.ogg", + prefs: [["media.webspeech.test.fake_fsm_events", true], + ["media.webspeech.test.fake_recognition_service", true], + ["media.webspeech.recognition.timeout", 1000]] + }); + +</script> +</pre> +</body> +</html> |