summaryrefslogtreecommitdiffstats
path: root/dom/media/webspeech/recognition
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
commit6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
treea68f146d7fa01f0134297619fbe7e33db084e0aa /dom/media/webspeech/recognition
parentInitial commit. (diff)
downloadthunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'dom/media/webspeech/recognition')
-rw-r--r--dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp462
-rw-r--r--dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h132
-rw-r--r--dom/media/webspeech/recognition/SpeechGrammar.cpp57
-rw-r--r--dom/media/webspeech/recognition/SpeechGrammar.h64
-rw-r--r--dom/media/webspeech/recognition/SpeechGrammarList.cpp76
-rw-r--r--dom/media/webspeech/recognition/SpeechGrammarList.h73
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognition.cpp1170
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognition.h314
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp44
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionAlternative.h49
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionResult.cpp59
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionResult.h54
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp58
-rw-r--r--dom/media/webspeech/recognition/SpeechRecognitionResultList.h53
-rw-r--r--dom/media/webspeech/recognition/SpeechTrackListener.cpp92
-rw-r--r--dom/media/webspeech/recognition/SpeechTrackListener.h50
-rw-r--r--dom/media/webspeech/recognition/endpointer.cc193
-rw-r--r--dom/media/webspeech/recognition/endpointer.h180
-rw-r--r--dom/media/webspeech/recognition/energy_endpointer.cc393
-rw-r--r--dom/media/webspeech/recognition/energy_endpointer.h180
-rw-r--r--dom/media/webspeech/recognition/energy_endpointer_params.cc77
-rw-r--r--dom/media/webspeech/recognition/energy_endpointer_params.h159
-rw-r--r--dom/media/webspeech/recognition/moz.build64
-rw-r--r--dom/media/webspeech/recognition/nsISpeechRecognitionService.idl43
-rw-r--r--dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp118
-rw-r--r--dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h40
-rw-r--r--dom/media/webspeech/recognition/test/head.js200
-rw-r--r--dom/media/webspeech/recognition/test/hello.oggbin0 -> 11328 bytes
-rw-r--r--dom/media/webspeech/recognition/test/hello.ogg^headers^1
-rw-r--r--dom/media/webspeech/recognition/test/http_requesthandler.sjs85
-rw-r--r--dom/media/webspeech/recognition/test/mochitest.ini35
-rw-r--r--dom/media/webspeech/recognition/test/silence.oggbin0 -> 106941 bytes
-rw-r--r--dom/media/webspeech/recognition/test/silence.ogg^headers^1
-rw-r--r--dom/media/webspeech/recognition/test/sinoid+hello.oggbin0 -> 29514 bytes
-rw-r--r--dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^1
-rw-r--r--dom/media/webspeech/recognition/test/test_abort.html73
-rw-r--r--dom/media/webspeech/recognition/test/test_audio_capture_error.html42
-rw-r--r--dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html102
-rw-r--r--dom/media/webspeech/recognition/test/test_nested_eventloop.html82
-rw-r--r--dom/media/webspeech/recognition/test/test_online_400_response.html47
-rw-r--r--dom/media/webspeech/recognition/test/test_online_empty_result_handling.html48
-rw-r--r--dom/media/webspeech/recognition/test/test_online_hangup.html47
-rw-r--r--dom/media/webspeech/recognition/test/test_online_http.html89
-rw-r--r--dom/media/webspeech/recognition/test/test_online_http_webkit.html90
-rw-r--r--dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html48
-rw-r--r--dom/media/webspeech/recognition/test/test_preference_enable.html43
-rw-r--r--dom/media/webspeech/recognition/test/test_recognition_service_error.html45
-rw-r--r--dom/media/webspeech/recognition/test/test_success_without_recognition_service.html45
-rw-r--r--dom/media/webspeech/recognition/test/test_timeout.html42
49 files changed, 5420 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
new file mode 100644
index 0000000000..e68ccc417e
--- /dev/null
+++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
@@ -0,0 +1,462 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsThreadUtils.h"
+#include "nsXPCOMCIDInternal.h"
+#include "OnlineSpeechRecognitionService.h"
+#include "nsIFile.h"
+#include "SpeechGrammar.h"
+#include "SpeechRecognition.h"
+#include "SpeechRecognitionAlternative.h"
+#include "SpeechRecognitionResult.h"
+#include "SpeechRecognitionResultList.h"
+#include "nsIObserverService.h"
+#include "mozilla/dom/Document.h"
+#include "mozilla/Preferences.h"
+#include "mozilla/ScopeExit.h"
+#include "mozilla/StaticPrefs_media.h"
+#include "mozilla/Services.h"
+#include "nsDirectoryServiceDefs.h"
+#include "nsDirectoryServiceUtils.h"
+#include "nsNetUtil.h"
+#include "nsContentUtils.h"
+#include "nsIChannel.h"
+#include "nsIHttpChannel.h"
+#include "nsIPrincipal.h"
+#include "nsIStreamListener.h"
+#include "nsIUploadChannel2.h"
+#include "mozilla/dom/ClientIPCTypes.h"
+#include "nsStringStream.h"
+#include "nsIOutputStream.h"
+#include "nsStreamUtils.h"
+#include "OpusTrackEncoder.h"
+#include "OggWriter.h"
+#include "nsIClassOfService.h"
+#include <json/json.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace mozilla {
+
+using namespace dom;
+
+#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \
+ "media.webspeech.service.endpoint"
+#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/"
+#define MAX_LISTENING_TIME_MS 10000
+
+NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService,
+ nsIStreamListener)
+
+// Request-start notification for the upload channel. Nothing needs to be set
+// up here; the method only asserts that it runs on the main thread.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) {
+ MOZ_ASSERT(NS_IsMainThread());
+ return NS_OK;
+}
+
+// Segment callback handed to nsIInputStream::ReadSegments. |aClosure| is the
+// nsCString that collects the response; the |aCount| bytes starting at
+// |aFromRawSegment| are appended to it and reported as fully consumed.
+static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure,
+                                       const char* aFromRawSegment,
+                                       uint32_t aToOffset, uint32_t aCount,
+                                       uint32_t* aWriteCount) {
+  auto& response = *static_cast<nsCString*>(aClosure);
+  response.Append(aFromRawSegment, aCount);
+  *aWriteCount = aCount;
+  return NS_OK;
+}
+
+// Appends the newly available response bytes to mBuf. The stream is drained
+// through AssignResponseToBuffer; a failing read is propagated to the caller.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest,
+                                                nsIInputStream* aInputStream,
+                                                uint64_t aOffset,
+                                                uint32_t aCount) {
+  MOZ_ASSERT(NS_IsMainThread());
+  uint32_t bytesRead;
+  nsresult rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf,
+                                           aCount, &bytesRead);
+  NS_ENSURE_SUCCESS(rv, rv);
+  return NS_OK;
+}
+
+// Called when the request to the recognition endpoint has finished.
+//
+// The body accumulated in mBuf is expected to be a JSON object of the form
+//   {"status": "ok", "data": [{"text": ..., "confidence": ...}, ...]}
+// or {"status": <anything else>, "message": ...}.
+//
+// On success a final-result SpeechEvent carrying the first entry of "data" is
+// dispatched; otherwise an EVENT_RECOGNITIONSERVICE_ERROR is dispatched on
+// mRecognition. Nothing is reported once the service has been aborted.
+//
+// The response is untrusted network input, so every JSON node is type-checked
+// before it is read: jsoncpp treats a mismatching access (e.g. asFloat() on a
+// string, or get() on a non-object) as a fatal error instead of returning a
+// default.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest,
+                                              nsresult aStatusCode) {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  // Whatever path is taken below, drop the buffered response on exit.
+  auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); });
+
+  if (mAborted) {
+    return NS_OK;
+  }
+
+  bool success = false;
+  float confidence = 0;
+  nsAutoCString hypoValue;
+  nsAutoCString errorMsg;
+  SpeechRecognitionErrorCode errorCode = SpeechRecognitionErrorCode::Network;
+
+  SR_LOG("STT Result: %s", mBuf.get());
+
+  if (NS_FAILED(aStatusCode)) {
+    errorMsg.AssignLiteral("Error connecting to the service.");
+  } else {
+    Json::Value root;
+    Json::CharReaderBuilder builder;
+    UniquePtr<Json::CharReader> const reader(builder.newCharReader());
+    const bool parsingSuccessful =
+        reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr);
+    if (!parsingSuccessful || !root.isObject()) {
+      // Not JSON at all, or JSON whose top level is not an object: treat both
+      // as a broken server response.
+      errorMsg.AssignLiteral("Internal server error");
+    } else {
+      // Const view: operator[] on a const Value never inserts members and
+      // yields a null Value for missing keys / out-of-range indices.
+      const Json::Value& response = root;
+      const Json::Value& status = response["status"];
+      if (status.isString() && status.asString() == "ok") {
+        const Json::Value& data = response["data"];
+        if (data.isArray() && !data.empty() && data[0].isObject()) {
+          const Json::Value& text = data[0]["text"];
+          if (text.isConvertibleTo(Json::stringValue)) {
+            hypoValue.Assign(text.asString().c_str());
+          }
+          // A missing or non-numeric confidence is reported as 0.
+          const Json::Value& conf = data[0]["confidence"];
+          if (conf.isConvertibleTo(Json::realValue)) {
+            confidence = conf.asFloat();
+          }
+          success = true;
+        } else {
+          errorMsg.AssignLiteral("Error reading result data.");
+        }
+      } else {
+        const Json::Value& message = response["message"];
+        if (message.isConvertibleTo(Json::stringValue)) {
+          errorMsg.Assign(message.asString().c_str());
+        }
+        errorCode = SpeechRecognitionErrorCode::No_speech;
+      }
+    }
+  }
+
+  if (!success) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg);
+    return NS_OK;
+  }
+
+  // Build the result list delivered to content with the final-result event.
+  RefPtr<SpeechEvent> event = new SpeechEvent(
+      mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
+  SpeechRecognitionResultList* resultList =
+      new SpeechRecognitionResultList(mRecognition);
+  SpeechRecognitionResult* recognitionResult =
+      new SpeechRecognitionResult(mRecognition);
+
+  // With maxAlternatives == 0 the result is delivered without alternatives.
+  if (mRecognition->MaxAlternatives() > 0) {
+    SpeechRecognitionAlternative* alternative =
+        new SpeechRecognitionAlternative(mRecognition);
+
+    alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue);
+    alternative->mConfidence = confidence;
+
+    recognitionResult->mItems.AppendElement(alternative);
+  }
+  resultList->mItems.AppendElement(recognitionResult);
+
+  event->mRecognitionResultList = resultList;
+  NS_DispatchToMainThread(event);
+
+  return NS_OK;
+}
+
+OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default;
+OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default;
+
+// Prepares the service for a recognition session (main thread only): creates
+// the Ogg container writer, wraps |aSpeechRecognition| in a main-thread
+// pointer holder and fetches from it the task queue used for encoding.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Initialize(
+ WeakPtr<SpeechRecognition> aSpeechRecognition) {
+ MOZ_ASSERT(NS_IsMainThread());
+ mWriter = MakeUnique<OggWriter>();
+ mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>(
+ "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition);
+ mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding();
+ MOZ_ASSERT(mEncodeTaskQueue);
+ return NS_OK;
+}
+
+// Runs off the main thread once the encoded-audio queue has been finished.
+// Drains the queue into the container writer, collects the resulting
+// container data into mEncodedData and then schedules DoSTT() on the main
+// thread to upload it.
+void OnlineSpeechRecognitionService::EncoderFinished() {
+ MOZ_ASSERT(!NS_IsMainThread());
+ MOZ_ASSERT(mEncodedAudioQueue.IsFinished());
+
+ // Frames are written one at a time; END_OF_STREAM is passed only when the
+ // queue reports end of stream after the pop.
+ while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) {
+ AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame});
+ DebugOnly<nsresult> rv =
+ mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream()
+ ? ContainerWriter::END_OF_STREAM
+ : 0);
+ MOZ_ASSERT(NS_SUCCEEDED(rv));
+ }
+
+ mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED);
+ MOZ_ASSERT(mWriter->IsWritingComplete());
+
+ NS_DispatchToMainThread(
+ NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this,
+ &OnlineSpeechRecognitionService::DoSTT));
+}
+
+// Runs off the main thread when the audio encoder reports it is initialized.
+// Hands the encoder's metadata to the container writer and stores the
+// container header it produces in mEncodedData.
+void OnlineSpeechRecognitionService::EncoderInitialized() {
+  MOZ_ASSERT(!NS_IsMainThread());
+
+  AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata;
+  metadata.AppendElement(mAudioEncoder->GetMetadata());
+
+  // Only Opus metadata is expected here.
+  const bool isOpus =
+      metadata[0]->GetKind() == TrackMetadataBase::METADATA_OPUS;
+  if (!isOpus) {
+    SR_LOG("wrong meta data type!");
+    MOZ_ASSERT_UNREACHABLE();
+  }
+
+  nsresult rv = mWriter->SetMetadata(metadata);
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+
+  rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER);
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+
+  // rv is otherwise unused when the diagnostic asserts compile away.
+  Unused << rv;
+}
+
+void OnlineSpeechRecognitionService::EncoderError() {
+ MOZ_ASSERT(!NS_IsMainThread());
+ SR_LOG("Error encoding frames.");
+ mEncodedData.Clear();
+ NS_DispatchToMainThread(NS_NewRunnableFunction(
+ "SpeechRecognition::DispatchError",
+ [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+ if (!mRecognition) {
+ return;
+ }
+ mRecognition->DispatchError(
+ SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+ SpeechRecognitionErrorCode::Audio_capture, "Encoder error");
+ }));
+}
+
+// Feeds one captured audio segment to the Opus encoder (off main thread).
+//
+// The encoder and its listener are created on the first non-empty segment.
+// Once MAX_LISTENING_TIME_MS have elapsed since that first segment, a
+// SpeechRecognition::Stop() call is dispatched to the main thread.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
+                                                    int32_t aSampleRate) {
+  MOZ_ASSERT(!NS_IsMainThread());
+
+  // Empty segments carry nothing to encode.
+  if (aAudioSegment->GetDuration() <= 0) {
+    return NS_OK;
+  }
+
+  if (!mAudioEncoder) {
+    mSpeechEncoderListener = new SpeechEncoderListener(this);
+    mAudioEncoder =
+        MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue);
+    RefPtr<AbstractThread> encoderThread = AbstractThread::GetCurrent();
+    mAudioEncoder->SetWorkerThread(encoderThread);
+    mAudioEncoder->RegisterListener(mSpeechEncoderListener);
+  }
+
+  mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment));
+
+  const TimeStamp now = TimeStamp::Now();
+  if (mFirstIteration.IsNull()) {
+    mFirstIteration = now;
+  }
+
+  const bool listenedLongEnough =
+      (now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS;
+  if (listenedLongEnough) {
+    NS_DispatchToMainThread(NS_NewRunnableFunction(
+        "SpeechRecognition::Stop",
+        [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+          if (mRecognition) {
+            mRecognition->Stop();
+          }
+        }));
+  }
+
+  return NS_OK;
+}
+
+// Uploads the encoded audio held in mEncodedData to the recognition endpoint
+// (main thread only).
+//
+// The endpoint is taken from the PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT pref,
+// falling back to DEFAULT_RECOGNITION_ENDPOINT when the pref is empty. The
+// request is a POST with an "audio/ogg" body; the response is delivered to
+// this object through its nsIStreamListener methods. Every failure on the way
+// is reported through mRecognition->DispatchError() and ends the upload.
+void OnlineSpeechRecognitionService::DoSTT() {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  if (mAborted) {
+    return;
+  }
+
+  nsresult rv;
+  nsCOMPtr<nsIChannel> chan;
+  nsCOMPtr<nsIURI> uri;
+  nsAutoCString speechRecognitionEndpoint;
+  nsAutoCString prefEndpoint;
+  nsAutoString language;
+
+  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT,
+                          prefEndpoint);
+
+  if (!prefEndpoint.IsEmpty()) {
+    speechRecognitionEndpoint = prefEndpoint;
+  } else {
+    speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT;
+  }
+
+  rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr,
+                 nullptr);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network, "Unknown URI");
+    return;
+  }
+
+  nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT;
+  nsLoadFlags loadFlags =
+      nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER;
+  nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER;
+
+  nsPIDOMWindowInner* window = mRecognition->GetOwner();
+  if (NS_WARN_IF(!window)) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Aborted, "No window");
+    return;
+  }
+
+  Document* doc = window->GetExtantDoc();
+  if (NS_WARN_IF(!doc)) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Aborted, "No document");
+    // Without a document there is no principal to load with; continuing
+    // would dereference the null |doc| below.
+    return;
+  }
+
+  rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags,
+                     contentPolicy, nullptr, nullptr, nullptr, nullptr,
+                     loadFlags);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network, "Failed to open channel");
+    return;
+  }
+
+  nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan);
+  if (httpChan) {
+    rv = httpChan->SetRequestMethod("POST"_ns);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+
+    mRecognition->GetLang(language);
+    // Accept-Language-STT is a custom header of our backend server used to set
+    // the language of the speech sample being submitted by the client
+    rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns,
+                                    NS_ConvertUTF16toUTF8(language), false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Tell the server to not store the transcription by default
+    rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Tell the server to not store the sample by default
+    rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Set the product tag as the web speech api
+    rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+  }
+
+  nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan));
+  if (cos) {
+    cos->AddClassFlags(nsIClassOfService::UrgentStart);
+  }
+
+  nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan);
+  if (uploadChan) {
+    nsCOMPtr<nsIInputStream> bodyStream;
+
+    // Flatten the encoded chunks into one contiguous buffer for the body.
+    uint32_t length = 0;
+    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+      length += chunk.Length();
+    }
+
+    nsTArray<uint8_t> audio;
+    if (!audio.SetCapacity(length, fallible)) {
+      mRecognition->DispatchError(
+          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+          SpeechRecognitionErrorCode::Audio_capture, "Allocation error");
+      return;
+    }
+
+    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+      audio.AppendElements(chunk);
+    }
+
+    // The chunks have been copied; release them before starting the upload.
+    mEncodedData.Clear();
+
+    rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio));
+    if (NS_WARN_IF(NS_FAILED(rv))) {
+      mRecognition->DispatchError(
+          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+          SpeechRecognitionErrorCode::Network, "Failed to open stream");
+      return;
+    }
+    if (bodyStream) {
+      rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns,
+                                               length, "POST"_ns, false);
+      MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    }
+  }
+
+  rv = chan->AsyncOpen(this);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network, "Internal server error");
+  }
+}
+
+// Signals that no more audio will be delivered (main thread only).
+//
+// If an encode task queue is still held, a task is dispatched to it which,
+// when an encoder exists, notifies the encoder of end of stream, unregisters
+// and drops the listener, drops the encoder and runs EncoderFinished(). The
+// queue reference is then cleared, so later calls return early.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::SoundEnd() {
+ MOZ_ASSERT(NS_IsMainThread());
+
+ if (!mEncodeTaskQueue) {
+ // Not initialized
+ return NS_OK;
+ }
+
+ // |self| keeps the service alive until the dispatched task has run.
+ nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction(
+ "OnlineSpeechRecognitionService::SoundEnd",
+ [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+ if (mAudioEncoder) {
+ mAudioEncoder->NotifyEndOfStream();
+ mAudioEncoder->UnregisterListener(mSpeechEncoderListener);
+ mSpeechEncoderListener = nullptr;
+ mAudioEncoder = nullptr;
+ EncoderFinished();
+ }
+ }));
+ MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+ Unused << rv;
+
+ mEncodeTaskQueue = nullptr;
+
+ return NS_OK;
+}
+
+// Grammars are not used by this service: both arguments are ignored and the
+// call always succeeds. |aCallback| is never invoked here.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ValidateAndSetGrammarList(
+ SpeechGrammar* aSpeechGrammar,
+ nsISpeechGrammarCompilationCallback* aCallback) {
+ // This is an online LVCSR (STT) service,
+ // so we don't need to set a grammar
+ return NS_OK;
+}
+
+// Aborts the current session (main thread only). The first call marks the
+// service as aborted and then ends the sound via SoundEnd(); repeated calls
+// do nothing.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Abort() {
+  MOZ_ASSERT(NS_IsMainThread());
+  if (!mAborted) {
+    mAborted = true;
+    return SoundEnd();
+  }
+  return NS_OK;
+}
+} // namespace mozilla
diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h
new file mode 100644
index 0000000000..c049e5046a
--- /dev/null
+++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h
@@ -0,0 +1,132 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_OnlineRecognitionService_h
+#define mozilla_dom_OnlineRecognitionService_h
+
+#include "nsCOMPtr.h"
+#include "nsTArray.h"
+#include "nsISpeechRecognitionService.h"
+#include "speex/speex_resampler.h"
+#include "nsIStreamListener.h"
+#include "OpusTrackEncoder.h"
+#include "ContainerWriter.h"
+
+// CID value {0ff5ce56-5b09-4db8-adc6-8266af95f864} as a brace initializer.
+// The macro deliberately carries no trailing semicolon so that it can be used
+// wherever an initializer expression is expected.
+#define NS_ONLINE_SPEECH_RECOGNITION_SERVICE_CID \
+  {0x0ff5ce56,                                   \
+   0x5b09,                                       \
+   0x4db8,                                       \
+   {0xad, 0xc6, 0x82, 0x66, 0xaf, 0x95, 0xf8, 0x64}}
+
+namespace mozilla {
+
+namespace ipc {
+class PrincipalInfo;
+} // namespace ipc
+
+/**
+ * Online implementation of the nsISpeechRecognitionService interface.
+ *
+ * Audio segments are encoded to Opus, wrapped in an Ogg container and
+ * uploaded to a remote endpoint; the object is also the nsIStreamListener
+ * that receives the endpoint's response.
+ */
+class OnlineSpeechRecognitionService : public nsISpeechRecognitionService,
+ public nsIStreamListener {
+ public:
+ // Add XPCOM glue code
+ NS_DECL_THREADSAFE_ISUPPORTS
+ NS_DECL_NSISPEECHRECOGNITIONSERVICE
+ NS_DECL_NSIREQUESTOBSERVER
+ NS_DECL_NSISTREAMLISTENER
+
+ /**
+ * Listener responsible for handling the events raised by the TrackEncoder.
+ * It forwards Initialized and Error to the owning service and remembers the
+ * thread it was created on so the callbacks can assert they run there.
+ */
+ class SpeechEncoderListener : public TrackEncoderListener {
+ public:
+ explicit SpeechEncoderListener(OnlineSpeechRecognitionService* aService)
+ : mService(aService), mOwningThread(AbstractThread::GetCurrent()) {}
+
+ // Intentionally a no-op.
+ void Started(TrackEncoder* aEncoder) override {}
+
+ void Initialized(TrackEncoder* aEncoder) override {
+ MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
+ mService->EncoderInitialized();
+ }
+
+ void Error(TrackEncoder* aEncoder) override {
+ MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
+ mService->EncoderError();
+ }
+
+ private:
+ // Strong reference to the service the events are forwarded to.
+ const RefPtr<OnlineSpeechRecognitionService> mService;
+ // Thread that was current when the listener was constructed.
+ const RefPtr<AbstractThread> mOwningThread;
+ };
+
+ /**
+ * Default constructs a OnlineSpeechRecognitionService
+ */
+ OnlineSpeechRecognitionService();
+
+ /**
+ * Called by SpeechEncoderListener when the AudioTrackEncoder has been
+ * initialized.
+ */
+ void EncoderInitialized();
+
+ /**
+ * Called after the AudioTrackEncoder has encoded all data for us to wrap in a
+ * container and pass along.
+ */
+ void EncoderFinished();
+
+ /**
+ * Called by SpeechEncoderListener when the AudioTrackEncoder has
+ * encountered an error.
+ */
+ void EncoderError();
+
+ private:
+ /**
+ * Private destructor to prevent bypassing of reference counting
+ */
+ virtual ~OnlineSpeechRecognitionService();
+
+ /** The associated SpeechRecognition */
+ nsMainThreadPtrHandle<dom::SpeechRecognition> mRecognition;
+
+ /**
+ * Builds a mock SpeechRecognitionResultList
+ *
+ * NOTE(review): no definition of this method appears in
+ * OnlineSpeechRecognitionService.cpp; confirm whether the declaration is
+ * still needed.
+ */
+ dom::SpeechRecognitionResultList* BuildMockResultList();
+
+ /**
+ * Method responsible for uploading the audio to the remote endpoint
+ */
+ void DoSTT();
+
+ // Encoded and packaged ogg audio data
+ nsTArray<nsTArray<uint8_t>> mEncodedData;
+ // Member responsible for holding a reference to the TrackEncoderListener
+ RefPtr<SpeechEncoderListener> mSpeechEncoderListener;
+ // MediaQueue fed encoded data by mAudioEncoder
+ MediaQueue<EncodedFrame> mEncodedAudioQueue;
+ // Encoder responsible for encoding the frames from pcm to opus which is the
+ // format supported by our backend
+ UniquePtr<AudioTrackEncoder> mAudioEncoder;
+ // Object responsible for wrapping the opus frames into an ogg container
+ UniquePtr<ContainerWriter> mWriter;
+ // Member responsible for storing the json string returned by the endpoint
+ nsCString mBuf;
+ // Used to calculate a ceiling on the time spent listening.
+ TimeStamp mFirstIteration;
+ // Set once Abort() has been called; a pending upload or response is then
+ // ignored.
+ bool mAborted = false;
+ // Task queue used for encoding, obtained from the SpeechRecognition in
+ // Initialize() and released in SoundEnd().
+ RefPtr<TaskQueue> mEncodeTaskQueue;
+};
+
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechGrammar.cpp b/dom/media/webspeech/recognition/SpeechGrammar.cpp
new file mode 100644
index 0000000000..de6e9fa30f
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechGrammar.cpp
@@ -0,0 +1,57 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechGrammar.h"
+
+#include "mozilla/ErrorResult.h"
+#include "mozilla/dom/SpeechGrammarBinding.h"
+
+namespace mozilla::dom {
+
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammar, mParent)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammar)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammar)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammar)
+ NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+ NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+// Stores |aParent|, which is later returned by GetParentObject().
+SpeechGrammar::SpeechGrammar(nsISupports* aParent) : mParent(aParent) {}
+
+SpeechGrammar::~SpeechGrammar() = default;
+
+// Creates a new SpeechGrammar whose parent is the global the constructor was
+// invoked on.
+already_AddRefed<SpeechGrammar> SpeechGrammar::Constructor(
+    const GlobalObject& aGlobal) {
+  return MakeAndAddRef<SpeechGrammar>(aGlobal.GetAsSupports());
+}
+
+// Returns the parent object supplied at construction time.
+nsISupports* SpeechGrammar::GetParentObject() const { return mParent; }
+
+// Wraps this object for JS by delegating to SpeechGrammar_Binding::Wrap.
+JSObject* SpeechGrammar::WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) {
+ return SpeechGrammar_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Copies the stored source string into |aRetVal|. |aRv| is never set.
+void SpeechGrammar::GetSrc(nsString& aRetVal, ErrorResult& aRv) const {
+ aRetVal = mSrc;
+}
+
+// Replaces the stored source string with |aArg|. |aRv| is never set.
+void SpeechGrammar::SetSrc(const nsAString& aArg, ErrorResult& aRv) {
+ mSrc = aArg;
+}
+
+// Not implemented: always throws NS_ERROR_NOT_IMPLEMENTED on |aRv|; the
+// returned 0 is only a placeholder.
+float SpeechGrammar::GetWeight(ErrorResult& aRv) const {
+ aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+ return 0;
+}
+
+// Not implemented: |aArg| is ignored and NS_ERROR_NOT_IMPLEMENTED is thrown.
+void SpeechGrammar::SetWeight(float aArg, ErrorResult& aRv) {
+ aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechGrammar.h b/dom/media/webspeech/recognition/SpeechGrammar.h
new file mode 100644
index 0000000000..0dee1e9792
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechGrammar.h
@@ -0,0 +1,64 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechGrammar_h
+#define mozilla_dom_SpeechGrammar_h
+
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "js/TypeDecls.h"
+
+#include "mozilla/Attributes.h"
+
+namespace mozilla {
+class ErrorResult;
+
+namespace dom {
+
+class GlobalObject;
+
+// A single speech grammar: a cycle-collected, wrapper-cached object holding a
+// source string. Weight accessors are declared but throw
+// NS_ERROR_NOT_IMPLEMENTED in the implementation.
+class SpeechGrammar final : public nsISupports, public nsWrapperCache {
+ public:
+ explicit SpeechGrammar(nsISupports* aParent);
+
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammar)
+
+ // Returns the parent passed to the constructor.
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ // Creates a grammar parented to |aGlobal|.
+ static already_AddRefed<SpeechGrammar> Constructor(
+ const GlobalObject& aGlobal);
+
+ // Alias that forwards to Constructor(); |aRv| is not used.
+ static already_AddRefed<SpeechGrammar> WebkitSpeechGrammar(
+ const GlobalObject& aGlobal, ErrorResult& aRv) {
+ return Constructor(aGlobal);
+ }
+
+ void GetSrc(nsString& aRetVal, ErrorResult& aRv) const;
+
+ void SetSrc(const nsAString& aArg, ErrorResult& aRv);
+
+ float GetWeight(ErrorResult& aRv) const;
+
+ void SetWeight(float aArg, ErrorResult& aRv);
+
+ private:
+ ~SpeechGrammar();
+
+ // Parent object reported by GetParentObject().
+ nsCOMPtr<nsISupports> mParent;
+
+ // Source string read and written through GetSrc()/SetSrc().
+ nsString mSrc;
+};
+
+} // namespace dom
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.cpp b/dom/media/webspeech/recognition/SpeechGrammarList.cpp
new file mode 100644
index 0000000000..4317452057
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechGrammarList.cpp
@@ -0,0 +1,76 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechGrammarList.h"
+
+#include "mozilla/dom/SpeechGrammar.h"
+#include "mozilla/dom/SpeechGrammarListBinding.h"
+#include "mozilla/ErrorResult.h"
+#include "nsCOMPtr.h"
+#include "SpeechRecognition.h"
+
+namespace mozilla::dom {
+
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammarList, mParent, mItems)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammarList)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammarList)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammarList)
+ NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+ NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+// Creates an empty list parented to |aParent|.
+SpeechGrammarList::SpeechGrammarList(nsISupports* aParent) : mParent(aParent) {}
+
+SpeechGrammarList::~SpeechGrammarList() = default;
+
+// Creates a new, empty SpeechGrammarList whose parent is the global the
+// constructor was invoked on.
+already_AddRefed<SpeechGrammarList> SpeechGrammarList::Constructor(
+    const GlobalObject& aGlobal) {
+  return MakeAndAddRef<SpeechGrammarList>(aGlobal.GetAsSupports());
+}
+
+// Wraps this object for JS by delegating to SpeechGrammarList_Binding::Wrap.
+JSObject* SpeechGrammarList::WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) {
+ return SpeechGrammarList_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Returns the parent object supplied at construction time.
+nsISupports* SpeechGrammarList::GetParentObject() const { return mParent; }
+
+// Number of grammars currently stored in the list.
+uint32_t SpeechGrammarList::Length() const { return mItems.Length(); }
+
+// Returns the grammar stored at |aIndex|.
+//
+// |aIndex| comes straight from the caller, so it is validated first:
+// nsTArray::ElementAt treats an out-of-range index as a fatal error. For an
+// invalid index NS_ERROR_DOM_INDEX_SIZE_ERR is thrown on |aRv| and nullptr is
+// returned.
+already_AddRefed<SpeechGrammar> SpeechGrammarList::Item(uint32_t aIndex,
+                                                        ErrorResult& aRv) {
+  if (aIndex >= mItems.Length()) {
+    aRv.Throw(NS_ERROR_DOM_INDEX_SIZE_ERR);
+    return nullptr;
+  }
+  RefPtr<SpeechGrammar> result = mItems.ElementAt(aIndex);
+  return result.forget();
+}
+
+// Not implemented: both arguments are ignored and NS_ERROR_NOT_IMPLEMENTED is
+// thrown on |aRv|.
+void SpeechGrammarList::AddFromURI(const nsAString& aSrc,
+ const Optional<float>& aWeight,
+ ErrorResult& aRv) {
+ aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+}
+
+// Appends a new grammar whose source is |aString| and whose parent is this
+// list's parent. |aWeight| is currently not used.
+void SpeechGrammarList::AddFromString(const nsAString& aString,
+                                      const Optional<float>& aWeight,
+                                      ErrorResult& aRv) {
+  RefPtr<SpeechGrammar> grammar = new SpeechGrammar(mParent);
+  grammar->SetSrc(aString, aRv);
+  mItems.AppendElement(grammar);
+}
+
+// Indexed access: sets |aPresent| to whether |aIndex| addresses an existing
+// entry and, if so, returns that entry via Item(); otherwise returns nullptr.
+// Any error raised by Item() is reported through the caller's |aRv| instead
+// of being dropped in a temporary ErrorResult.
+already_AddRefed<SpeechGrammar> SpeechGrammarList::IndexedGetter(
+    uint32_t aIndex, bool& aPresent, ErrorResult& aRv) {
+  aPresent = aIndex < Length();
+  if (!aPresent) {
+    return nullptr;
+  }
+  return Item(aIndex, aRv);
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.h b/dom/media/webspeech/recognition/SpeechGrammarList.h
new file mode 100644
index 0000000000..7f1e09cd9e
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechGrammarList.h
@@ -0,0 +1,73 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechGrammarList_h
+#define mozilla_dom_SpeechGrammarList_h
+
+#include "mozilla/Attributes.h"
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+#include "nsTArray.h"
+#include "nsWrapperCache.h"
+
+struct JSContext;
+
+namespace mozilla {
+
+class ErrorResult;
+
+namespace dom {
+
+class GlobalObject;
+class SpeechGrammar;
+template <typename>
+class Optional;
+
+// An ordered collection of SpeechGrammar objects: cycle-collected and
+// wrapper-cached, with indexed access and methods to add grammars.
+class SpeechGrammarList final : public nsISupports, public nsWrapperCache {
+ public:
+ explicit SpeechGrammarList(nsISupports* aParent);
+
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammarList)
+
+ // Creates an empty list parented to |aGlobal|.
+ static already_AddRefed<SpeechGrammarList> Constructor(
+ const GlobalObject& aGlobal);
+
+ // Alias that forwards to Constructor(); |aRv| is not used.
+ static already_AddRefed<SpeechGrammarList> WebkitSpeechGrammarList(
+ const GlobalObject& aGlobal, ErrorResult& aRv) {
+ return Constructor(aGlobal);
+ }
+
+ // Returns the parent passed to the constructor.
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ // Number of grammars in the list.
+ uint32_t Length() const;
+
+ // Returns the grammar at |aIndex|.
+ already_AddRefed<SpeechGrammar> Item(uint32_t aIndex, ErrorResult& aRv);
+
+ // Declared for the API surface; the implementation throws
+ // NS_ERROR_NOT_IMPLEMENTED.
+ void AddFromURI(const nsAString& aSrc, const Optional<float>& aWeight,
+ ErrorResult& aRv);
+
+ // Appends a grammar whose source is |aString|.
+ void AddFromString(const nsAString& aString, const Optional<float>& aWeight,
+ ErrorResult& aRv);
+
+ // Indexed access; |aPresent| reports whether |aIndex| is within the list.
+ already_AddRefed<SpeechGrammar> IndexedGetter(uint32_t aIndex, bool& aPresent,
+ ErrorResult& aRv);
+
+ private:
+ ~SpeechGrammarList();
+
+ // Parent object reported by GetParentObject().
+ nsCOMPtr<nsISupports> mParent;
+
+ // The grammars, in insertion order.
+ nsTArray<RefPtr<SpeechGrammar>> mItems;
+};
+
+} // namespace dom
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp
new file mode 100644
index 0000000000..e3bf531218
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp
@@ -0,0 +1,1170 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechRecognition.h"
+
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+
+#include "mozilla/dom/AudioStreamTrack.h"
+#include "mozilla/dom/BindingUtils.h"
+#include "mozilla/dom/Element.h"
+#include "mozilla/dom/SpeechRecognitionBinding.h"
+#include "mozilla/dom/MediaStreamTrackBinding.h"
+#include "mozilla/dom/MediaStreamError.h"
+#include "mozilla/dom/RootedDictionary.h"
+#include "mozilla/dom/SpeechGrammar.h"
+#include "mozilla/MediaManager.h"
+#include "mozilla/Preferences.h"
+#include "mozilla/ResultVariant.h"
+#include "mozilla/Services.h"
+#include "mozilla/StaticPrefs_media.h"
+#include "mozilla/AbstractThread.h"
+#include "VideoUtils.h"
+#include "AudioSegment.h"
+#include "MediaEnginePrefs.h"
+#include "endpointer.h"
+
+#include "mozilla/dom/SpeechRecognitionEvent.h"
+#include "nsComponentManagerUtils.h"
+#include "nsContentUtils.h"
+#include "mozilla/dom/Document.h"
+#include "nsIObserverService.h"
+#include "nsIPermissionManager.h"
+#include "nsIPrincipal.h"
+#include "nsPIDOMWindow.h"
+#include "nsServiceManagerUtils.h"
+#include "nsQueryObject.h"
+#include "SpeechTrackListener.h"
+
+#include <algorithm>
+
+// Undo the windows.h damage
+#if defined(XP_WIN) && defined(GetMessage)
+# undef GetMessage
+#endif
+
+namespace mozilla::dom {
+
+#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
+#define DEFAULT_RECOGNITION_SERVICE "online"
+
+#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
+#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \
+ "media.webspeech.long_silence_length"
+#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \
+ "media.webspeech.long_speech_length"
+#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \
+ "media.webspeech.recognition.timeout"
+
+static const uint32_t kSAMPLE_RATE = 16000;
+
+// Number of samples, corresponding to 300ms of audio, to send to the
+// endpointer while it is in environment estimation mode.
+// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
+static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
+
+// Returns the "SpeechRecognition" log module. The module object is a
+// function-local static, so it is created on first use; SR_LOG logs to it.
+LogModule* GetSpeechRecognitionLog() {
+  static LazyLogModule sSpeechRecognitionLog("SpeechRecognition");
+  return sSpeechRecognitionLog;
+}
+#define SR_LOG(...) \
+ MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
+
+namespace {
+// Shutdown blocker that aborts the SpeechRecognition it was created for when
+// shutdown begins. It holds a strong reference to that object for its whole
+// lifetime.
+class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
+ public:
+ SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition,
+ const nsString& aName)
+ : media::ShutdownBlocker(aName), mRecognition(aRecognition) {}
+
+ // Called on the main thread when shutdown starts. Only requests the abort;
+ // the blocker itself is removed later, in SpeechRecognition::StopRecording().
+ NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
+ MOZ_ASSERT(NS_IsMainThread());
+ // AbortSilently will eventually clear the blocker.
+ mRecognition->Abort();
+ return NS_OK;
+ }
+
+ private:
+ const RefPtr<SpeechRecognition> mRecognition;
+};
+
+enum class ServiceCreationError {
+ ServiceNotFound,
+};
+
+// Instantiates the speech recognition service selected by preferences.
+//
+// The service name comes from PREFERENCE_DEFAULT_RECOGNITION_SERVICE, falling
+// back to DEFAULT_RECOGNITION_SERVICE when that pref is empty. When the
+// fake-recognition-service test pref is set, the "fake" service is used
+// instead. aWindow, aRecognition and aLang are currently not consulted.
+//
+// Returns the service, or ServiceCreationError::ServiceNotFound when no
+// instance could be created for the resulting contract ID.
+Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError>
+CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow,
+                               SpeechRecognition* aRecognition,
+                               const nsAString& aLang) {
+  nsAutoCString configuredService;
+  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE,
+                          configuredService);
+
+  nsAutoCString serviceName;
+  if (configuredService.IsEmpty()) {
+    serviceName = DEFAULT_RECOGNITION_SERVICE;
+  } else {
+    serviceName = configuredService;
+  }
+
+  // Build the contract ID; the test pref overrides the configured name.
+  nsAutoCString contractId;
+  if (StaticPrefs::media_webspeech_test_fake_recognition_service()) {
+    contractId = NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
+  } else {
+    contractId =
+        nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
+        serviceName;
+  }
+
+  nsresult createRv;
+  nsCOMPtr<nsISpeechRecognitionService> service =
+      do_CreateInstance(contractId.get(), &createRv);
+  if (!service) {
+    return Err(ServiceCreationError::ServiceNotFound);
+  }
+
+  return service;
+}
+} // namespace
+
+NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition,
+ DOMEventTargetHelper, mStream,
+ mTrack, mRecognitionService,
+ mSpeechGrammarList)
+
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition)
+ NS_INTERFACE_MAP_ENTRY(nsIObserver)
+NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)
+
+NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+
+// Creates a SpeechRecognition object owned by aOwnerWindow.
+//
+// Sets up the endpointer for kSAMPLE_RATE audio, registers the test observers
+// when the webspeech test pref is enabled, loads the endpointer and
+// speech-detection-timeout tunables from preferences, and finally puts the
+// state machine into its idle state via Reset().
+SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
+    : DOMEventTargetHelper(aOwnerWindow),
+      mEndpointer(kSAMPLE_RATE),
+      mAudioSamplesPerChunk(mEndpointer.FrameSize()),
+      mSpeechDetectionTimer(NS_NewTimer()),
+      mSpeechGrammarList(new SpeechGrammarList(GetOwner())),
+      mContinuous(false),
+      mInterimResults(false),
+      mMaxAlternatives(1) {
+  SR_LOG("created SpeechRecognition");
+
+  if (StaticPrefs::media_webspeech_test_enable()) {
+    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
+    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
+    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
+  }
+
+  mEndpointer.set_speech_input_complete_silence_length(
+      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
+  mEndpointer.set_long_speech_input_complete_silence_length(
+      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
+  // The long speech length has its own pref. Reading the silence-length pref
+  // here (as the code previously did) made "media.webspeech.long_speech_length"
+  // ineffective and let the silence-length pref override this value.
+  mEndpointer.set_long_speech_length(Preferences::GetInt(
+      PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
+
+  mSpeechDetectionTimeoutMs =
+      Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000);
+
+  Reset();
+}
+
+SpeechRecognition::~SpeechRecognition() = default;
+
+// Returns true when the current FSM state lies in the inclusive range
+// [begin, end], using the numeric order of the FSMState values.
+bool SpeechRecognition::StateBetween(FSMState begin, FSMState end) {
+  if (mCurrentState < begin) {
+    return false;
+  }
+  return mCurrentState <= end;
+}
+
+// Moves the state machine to |state| and logs the new state's name.
+void SpeechRecognition::SetState(FSMState state) {
+ mCurrentState = state;
+ SR_LOG("Transitioned to state %s", GetName(mCurrentState));
+}
+
+// Creates the JS reflector for this object by delegating to the generated
+// SpeechRecognition binding.
+JSObject* SpeechRecognition::WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) {
+ return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Decides whether the API is available to the global aGlobal.
+//
+// Returns false if the permission manager is unavailable or the permission
+// lookup fails. Otherwise the recognition-enable pref must be on, and at least
+// one of the following must hold: the global's principal has the exact
+// "speech-recognition" permission, the force-enable pref is set, or the
+// webspeech test pref is set.
+bool SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal) {
+  nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);
+
+  nsresult rv;
+  nsCOMPtr<nsIPermissionManager> permissionManager =
+      do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    return false;
+  }
+
+  uint32_t permission = nsIPermissionManager::UNKNOWN_ACTION;
+  rv = permissionManager->TestExactPermissionFromPrincipal(
+      principal, "speech-recognition"_ns, &permission);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    return false;
+  }
+
+  const bool allowed =
+      permission == nsIPermissionManager::ALLOW_ACTION ||
+      StaticPrefs::media_webspeech_recognition_force_enable() ||
+      StaticPrefs::media_webspeech_test_enable();
+  if (!allowed) {
+    return false;
+  }
+
+  return StaticPrefs::media_webspeech_recognition_enable();
+}
+
+// Binding entry point for the SpeechRecognition constructor.
+// Throws NS_ERROR_FAILURE on aRv and returns null when the global is not an
+// inner window; otherwise returns a new object owned by that window.
+already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor(
+    const GlobalObject& aGlobal, ErrorResult& aRv) {
+  nsCOMPtr<nsPIDOMWindowInner> window =
+      do_QueryInterface(aGlobal.GetAsSupports());
+  if (window) {
+    RefPtr<SpeechRecognition> recognition = new SpeechRecognition(window);
+    return recognition.forget();
+  }
+
+  aRv.Throw(NS_ERROR_FAILURE);
+  return nullptr;
+}
+
+// Entry point for SpeechEvents: logs the event and feeds it to the state
+// machine. Once an abort has been requested (mAborted), every event other
+// than EVENT_ABORT itself is dropped.
+void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) {
+  SR_LOG("Processing %s, current state is %s", GetName(aEvent),
+         GetName(mCurrentState));
+
+  const bool isAbortEvent = aEvent->mType == EVENT_ABORT;
+  if (!mAborted || isAbortEvent) {
+    Transition(aEvent);
+  }
+}
+
+// The state machine's transition table: dispatches aEvent to the handler for
+// the (current state, event type) pair.
+//
+// Pairs that must never occur (for example EVENT_START outside STATE_IDLE)
+// are logged and then crash via MOZ_CRASH, as do unknown states and event
+// types. Each "Unhandled event" log line names the state whose branch it
+// lives in and the event by name.
+void SpeechRecognition::Transition(SpeechEvent* aEvent) {
+  switch (mCurrentState) {
+    case STATE_IDLE:
+      switch (aEvent->mType) {
+        case EVENT_START:
+          // TODO: may want to time out if we wait too long
+          // for user to approve
+          WaitForAudioData(aEvent);
+          break;
+        case EVENT_STOP:
+        case EVENT_ABORT:
+        case EVENT_AUDIO_DATA:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+          DoNothing(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_STARTING:
+      switch (aEvent->mType) {
+        case EVENT_AUDIO_DATA:
+          StartedAudioCapture(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_STOP:
+          ResetAndEnd();
+          break;
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_ESTIMATING:
+      switch (aEvent->mType) {
+        case EVENT_AUDIO_DATA:
+          WaitForEstimation(aEvent);
+          break;
+        case EVENT_STOP:
+          StopRecordingAndRecognize(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          DoNothing(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_START:
+          // Log the event by name, like every other branch, rather than as a
+          // raw enum number.
+          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_WAITING_FOR_SPEECH:
+      switch (aEvent->mType) {
+        case EVENT_AUDIO_DATA:
+          DetectSpeech(aEvent);
+          break;
+        case EVENT_STOP:
+          StopRecordingAndRecognize(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          // This message used to claim STATE_STARTING (copy-paste), which
+          // pointed crash investigations at the wrong state.
+          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s",
+                 GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_RECOGNIZING:
+      switch (aEvent->mType) {
+        case EVENT_AUDIO_DATA:
+          WaitForSpeechEnd(aEvent);
+          break;
+        case EVENT_STOP:
+          StopRecordingAndRecognize(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_WAITING_FOR_RESULT:
+      switch (aEvent->mType) {
+        case EVENT_STOP:
+          DoNothing(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+          NotifyFinalResult(aEvent);
+          break;
+        case EVENT_AUDIO_DATA:
+          DoNothing(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_START:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s",
+                 GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    case STATE_ABORTING:
+      switch (aEvent->mType) {
+        case EVENT_STOP:
+        case EVENT_ABORT:
+        case EVENT_AUDIO_DATA:
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    default:
+      MOZ_CRASH("Invalid state");
+  }
+}
+
+/*
+ * Handle a segment of recorded audio data: run every chunk through the
+ * endpointer, then hand the whole segment to the recognition service on the
+ * encode task queue. The segment's contents are moved out of aSegment.
+ * Returns the number of samples that were processed.
+ */
+uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
+                                                TrackRate aTrackRate) {
+  uint32_t processedSamples = 0;
+  for (AudioSegment::ChunkIterator chunk(*aSegment); !chunk.IsEnded();
+       chunk.Next()) {
+    // The endpointer's output value is not used here.
+    float unusedOut;
+    mEndpointer.ProcessAudio(*chunk, &unusedOut);
+    processedSamples += chunk->GetDuration();
+  }
+
+  // we need to call the nsISpeechRecognitionService::ProcessAudioSegment
+  // in a separate thread so that any eventual encoding or pre-processing
+  // of the audio does not block the main thread
+  nsresult dispatchRv = mEncodeTaskQueue->Dispatch(
+      NewRunnableMethod<StoreCopyPassByPtr<AudioSegment>, TrackRate>(
+          "nsISpeechRecognitionService::ProcessAudioSegment",
+          mRecognitionService,
+          &nsISpeechRecognitionService::ProcessAudioSegment,
+          std::move(*aSegment), aTrackRate));
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(dispatchRv));
+  Unused << dispatchRv;
+
+  return processedSamples;
+}
+
+/****************************************************************************
+ * FSM Transition functions
+ *
+ * If a transition function may cause a DOM event to be fired,
+ * it may also be re-entered, since the event handler may cause the
+ * event loop to spin and new SpeechEvents to be processed.
+ *
+ * Rules:
+ * 1) These methods should call SetState as soon as possible.
+ * 2) If these methods dispatch DOM events, or call methods that dispatch
+ * DOM events, that should be done as late as possible.
+ * 3) If anything must happen after dispatching a DOM event, make sure
+ * the state is still what the method expected it to be.
+ ****************************************************************************/
+
+// Returns the object to its idle state: sets STATE_IDLE, drops the
+// recognition service, stream, track, stop-recording promise and encode task
+// queue, zeroes the sample counters, cancels the speech detection timer and
+// clears the aborted flag. Does not dispatch any DOM event.
+void SpeechRecognition::Reset() {
+ SetState(STATE_IDLE);
+
+ // This breaks potential ref-cycles.
+ mRecognitionService = nullptr;
+
+ // Bumping the generation lets the getUserMedia callbacks captured in Start()
+ // detect that they belong to a session that has since been reset.
+ ++mStreamGeneration;
+ if (mStream) {
+ mStream->UnregisterTrackListener(this);
+ mStream = nullptr;
+ }
+ mTrack = nullptr;
+ mTrackIsOwned = false;
+ mStopRecordingPromise = nullptr;
+ mEncodeTaskQueue = nullptr;
+ mEstimationSamples = 0;
+ mBufferedSamples = 0;
+ mSpeechDetectionTimer->Cancel();
+ mAborted = false;
+}
+
+// Resets to idle first, then fires the trusted "end" event, so handlers of
+// "end" already observe the idle state.
+void SpeechRecognition::ResetAndEnd() {
+ Reset();
+ DispatchTrustedEvent(u"end"_ns);
+}
+
+// EVENT_START while idle: just move to STATE_STARTING. aEvent is unused.
+void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) {
+ SetState(STATE_STARTING);
+}
+
+// First audio data after starting: enter STATE_ESTIMATING, put the endpointer
+// into environment estimation mode, process the segment, then fire
+// "audiostart" followed by "start".
+void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) {
+ SetState(STATE_ESTIMATING);
+
+ mEndpointer.SetEnvironmentEstimationMode();
+ mEstimationSamples +=
+ ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+
+ DispatchTrustedEvent(u"audiostart"_ns);
+ // An "audiostart" handler may have changed the state; only announce "start"
+ // if we are still estimating.
+ if (mCurrentState == STATE_ESTIMATING) {
+ DispatchTrustedEvent(u"start"_ns);
+ }
+}
+
+// Stops capturing and waits for the service's result: enters
+// STATE_WAITING_FOR_RESULT, schedules SoundEnd() on the service for when the
+// track listener has been removed, and then stops recording. aEvent is unused.
+void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) {
+ SetState(STATE_WAITING_FOR_RESULT);
+
+ MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
+
+ // This will run SoundEnd on the service just before StopRecording begins
+ // shutting the encode thread down.
+ mSpeechListener->mRemovedPromise->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [service = mRecognitionService] { service->SoundEnd(); });
+
+ StopRecording();
+}
+
+// Audio data while estimating the environment: accumulate processed samples
+// and, once more than kESTIMATION_SAMPLES have been seen, switch the
+// endpointer to user input mode and start waiting for speech.
+void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) {
+ SetState(STATE_ESTIMATING);
+
+ mEstimationSamples +=
+ ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+ if (mEstimationSamples > kESTIMATION_SAMPLES) {
+ mEndpointer.SetUserInputMode();
+ SetState(STATE_WAITING_FOR_SPEECH);
+ }
+}
+
+// Audio data while waiting for speech: process it and, when the endpointer
+// reports that speech has started, cancel the detection timeout, enter
+// STATE_RECOGNIZING and fire "speechstart".
+void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) {
+ SetState(STATE_WAITING_FOR_SPEECH);
+
+ ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+ if (mEndpointer.DidStartReceivingSpeech()) {
+ mSpeechDetectionTimer->Cancel();
+ // State is set before the event is dispatched, as handlers may re-enter.
+ SetState(STATE_RECOGNIZING);
+ DispatchTrustedEvent(u"speechstart"_ns);
+ }
+}
+
+// Audio data while recognizing: process it and, when the endpointer reports
+// that the speech input is complete, fire "speechend" and stop recording.
+void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) {
+ SetState(STATE_RECOGNIZING);
+
+ ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+ if (mEndpointer.speech_input_complete()) {
+ DispatchTrustedEvent(u"speechend"_ns);
+
+ // A "speechend" handler may have changed the state; only continue if we
+ // are still recognizing.
+ if (mCurrentState == STATE_RECOGNIZING) {
+ // FIXME: StopRecordingAndRecognize should only be called for single
+ // shot services for continuous we should just inform the service
+ StopRecordingAndRecognize(aEvent);
+ }
+ }
+}
+
+// Final result from the service: reset to idle (which fires "end") and then
+// dispatch a trusted "result" event carrying the event's result list.
+// NOTE(review): as written, "end" is dispatched before "result" -- confirm
+// this ordering is intended.
+void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) {
+ ResetAndEnd();
+
+ RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
+ init.mBubbles = true;
+ init.mCancelable = false;
+ // init.mResultIndex = 0;
+ init.mResults = aEvent->mRecognitionResultList;
+ init.mInterpretation = JS::NullValue();
+ // init.mEmma = nullptr;
+
+ RefPtr<SpeechRecognitionEvent> event =
+ SpeechRecognitionEvent::Constructor(this, u"result"_ns, init);
+ event->SetTrusted(true);
+
+ DispatchEvent(*event);
+}
+
+// Transition handler for events that are deliberately ignored.
+void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {}
+
+// Aborts the session without reporting an error: aborts the service (if any),
+// stops recording, and enters STATE_ABORTING. ResetAndEnd() runs once the
+// promise returned by StopRecording() resolves. aEvent is unused.
+void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) {
+ if (mRecognitionService) {
+ if (mTrack) {
+ // This will run Abort on the service just before StopRecording begins
+ // shutting the encode thread down.
+ mSpeechListener->mRemovedPromise->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [service = mRecognitionService] { service->Abort(); });
+ } else {
+ // Recording hasn't started yet. We can just call Abort().
+ mRecognitionService->Abort();
+ }
+ }
+
+ // The lambda holds a strong reference to keep this object alive until the
+ // promise settles.
+ StopRecording()->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); });
+
+ SetState(STATE_ABORTING);
+}
+
+// Aborts the session and then dispatches the error carried by aEvent.
+void SpeechRecognition::AbortError(SpeechEvent* aEvent) {
+ AbortSilently(aEvent);
+ NotifyError(aEvent);
+}
+
+// Marks the error event attached to aEvent as trusted and dispatches it on
+// this object. aEvent->mError is dereferenced unconditionally.
+void SpeechRecognition::NotifyError(SpeechEvent* aEvent) {
+ aEvent->mError->SetTrusted(true);
+
+ DispatchEvent(*aEvent->mError);
+}
+
+/**************************************
+ * Event triggers and other functions *
+ **************************************/
+// Starts capturing from aTrack: attaches a new SpeechTrackListener to it,
+// registers a shutdown blocker for this object, starts an endpointer session
+// and arms the one-shot speech detection timer.
+// Returns the result of initializing that timer.
+NS_IMETHODIMP
+SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) {
+ // hold a reference so that the underlying track doesn't get collected.
+ mTrack = aTrack;
+ MOZ_ASSERT(!mTrack->Ended());
+
+ mSpeechListener = new SpeechTrackListener(this);
+ mTrack->AddListener(mSpeechListener);
+
+ // The blocker is removed again in StopRecording().
+ nsString blockerName;
+ blockerName.AppendPrintf("SpeechRecognition %p shutdown", this);
+ mShutdownBlocker =
+ MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName);
+ media::MustGetShutdownBarrier()->AddBlocker(
+ mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__,
+ u"SpeechRecognition shutdown"_ns);
+
+ mEndpointer.StartSession();
+
+ // This object is the timer's observer; see Observe().
+ return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs,
+ nsITimer::TYPE_ONE_SHOT);
+}
+
+// Stops capturing audio and tears down the recording machinery.
+//
+// Returns a promise that resolves once the track listener has been removed,
+// the encode task queue has shut down and the shutdown blocker has been
+// removed. If no track is being recorded, an already-resolved promise is
+// returned. Repeated calls while a stop is in flight return the same promise.
+RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() {
+ if (!mTrack) {
+ // Recording wasn't started, or has already been stopped.
+ if (mStream) {
+ // Ensure we don't start recording because a track became available
+ // before we get reset.
+ mStream->UnregisterTrackListener(this);
+ }
+ return GenericNonExclusivePromise::CreateAndResolve(true, __func__);
+ }
+
+ if (mStopRecordingPromise) {
+ return mStopRecordingPromise;
+ }
+
+ mTrack->RemoveListener(mSpeechListener);
+ // Only stop tracks we obtained ourselves (see mTrackIsOwned in Start()).
+ if (mTrackIsOwned) {
+ mTrack->Stop();
+ }
+
+ mEndpointer.EndSession();
+ DispatchTrustedEvent(u"audioend"_ns);
+
+ // Block shutdown until the speech track listener has been removed from the
+ // MSG, as it holds a reference to us, and we reference the world, which we
+ // don't want to leak.
+ mStopRecordingPromise =
+ mSpeechListener->mRemovedPromise
+ ->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [self = RefPtr<SpeechRecognition>(this), this] {
+ SR_LOG("Shutting down encoding thread");
+ return mEncodeTaskQueue->BeginShutdown();
+ },
+ [] {
+ MOZ_CRASH("Unexpected rejection");
+ return ShutdownPromise::CreateAndResolve(false, __func__);
+ })
+ ->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [self = RefPtr<SpeechRecognition>(this), this] {
+ media::MustGetShutdownBarrier()->RemoveBlocker(
+ mShutdownBlocker);
+ mShutdownBlocker = nullptr;
+
+ // Reset() must not have run before this point.
+ MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE);
+ return GenericNonExclusivePromise::CreateAndResolve(true,
+ __func__);
+ },
+ [] {
+ MOZ_CRASH("Unexpected rejection");
+ return GenericNonExclusivePromise::CreateAndResolve(false,
+ __func__);
+ });
+ return mStopRecordingPromise;
+}
+
+// nsIObserver implementation. Handles, in this order of precedence:
+//  - the speech detection timer firing while the state is between STATE_IDLE
+//    and STATE_WAITING_FOR_SPEECH: reports a "no speech" audio error;
+//  - the test-end topic: unregisters the test observers;
+//  - the test event-request topic (only when the fake FSM events pref is
+//    set): forwards the request to ProcessTestEventRequest().
+// Always returns NS_OK.
+NS_IMETHODIMP
+SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
+                           const char16_t* aData) {
+  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");
+
+  if (strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) == 0 &&
+      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {
+    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
+                  SpeechRecognitionErrorCode::No_speech,
+                  "No speech detected (timeout)");
+    return NS_OK;
+  }
+
+  if (strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC) == 0) {
+    nsCOMPtr<nsIObserverService> observerService =
+        services::GetObserverService();
+    observerService->RemoveObserver(
+        this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
+    observerService->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
+    return NS_OK;
+  }
+
+  if (StaticPrefs::media_webspeech_test_fake_fsm_events() &&
+      strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC) == 0) {
+    ProcessTestEventRequest(aSubject, nsDependentString(aData));
+  }
+
+  return NS_OK;
+}
+
+// Handles a fake FSM event requested by tests. "EVENT_ABORT" aborts the
+// session and "EVENT_AUDIO_ERROR" reports an audio capture error. Any other
+// name is left for the fake recognition service to handle; this method only
+// asserts that the fake service pref is set. aSubject is unused.
+void SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject,
+                                                const nsAString& aEventName) {
+  if (aEventName.EqualsLiteral("EVENT_ABORT")) {
+    Abort();
+    return;
+  }
+
+  if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
+    DispatchError(
+        SpeechRecognition::EVENT_AUDIO_ERROR,
+        SpeechRecognitionErrorCode::Audio_capture,  // TODO different codes?
+        "AUDIO_ERROR test event");
+    return;
+  }
+
+  NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(),
+               "Got request for fake recognition service event, but "
+               "media.webspeech.test.fake_recognition_service is unset");
+
+  // let the fake recognition service handle the request
+}
+
+// ---- Attribute accessors ----
+
+// Returns a new strong reference to the current grammar list.
+already_AddRefed<SpeechGrammarList> SpeechRecognition::Grammars() const {
+ RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList;
+ return speechGrammarList.forget();
+}
+
+// Replaces the grammar list with aArg.
+void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) {
+ mSpeechGrammarList = &aArg;
+}
+
+// Copies the stored language into aRetVal.
+void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; }
+
+// Stores the language; it is read by SetRecognitionService().
+void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; }
+
+// Returns the stored flag; aRv is never set.
+bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const {
+ return mContinuous;
+}
+
+// Stores the flag; aRv is never set.
+void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) {
+ mContinuous = aArg;
+}
+
+bool SpeechRecognition::InterimResults() const { return mInterimResults; }
+
+void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; }
+
+uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; }
+
+void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) {
+ mMaxAlternatives = aArg;
+}
+
+// Not implemented: always throws NS_ERROR_NOT_IMPLEMENTED; aRetVal is left
+// untouched.
+void SpeechRecognition::GetServiceURI(nsString& aRetVal,
+ ErrorResult& aRv) const {
+ aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+}
+
+// Not implemented: always throws NS_ERROR_NOT_IMPLEMENTED.
+void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) {
+ aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+}
+
+// Starts a recognition session.
+//
+// Throws NS_ERROR_DOM_INVALID_STATE_ERR unless the state machine is idle.
+// Creates the recognition service, validates the grammar list, creates the
+// encode task queue and initializes the service. Audio then comes either from
+// the first non-ended audio track of aStream (when passed) or from a
+// getUserMedia audio request. Finally EVENT_START is queued to the main
+// thread.
+void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
+ CallerType aCallerType, ErrorResult& aRv) {
+ if (mCurrentState != STATE_IDLE) {
+ aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+ return;
+ }
+
+ if (!SetRecognitionService(aRv)) {
+ return;
+ }
+
+ if (!ValidateAndSetGrammarList(aRv)) {
+ return;
+ }
+
+ mEncodeTaskQueue =
+ TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER),
+ "WebSpeechEncoderThread");
+
+ nsresult rv;
+ rv = mRecognitionService->Initialize(this);
+ // NOTE(review): this early return does not set aRv, and it leaves
+ // mRecognitionService and mEncodeTaskQueue assigned -- confirm intended.
+ if (NS_WARN_IF(NS_FAILED(rv))) {
+ return;
+ }
+
+ MediaStreamConstraints constraints;
+ constraints.mAudio.SetAsBoolean() = true;
+
+ if (aStream.WasPassed()) {
+ // Caller-provided stream: we do not own its tracks, so StopRecording()
+ // will not stop them.
+ mStream = &aStream.Value();
+ mTrackIsOwned = false;
+ mStream->RegisterTrackListener(this);
+ nsTArray<RefPtr<AudioStreamTrack>> tracks;
+ mStream->GetAudioTracks(tracks);
+ for (const RefPtr<AudioStreamTrack>& track : tracks) {
+ if (!track->Ended()) {
+ NotifyTrackAdded(track);
+ break;
+ }
+ }
+ } else {
+ mTrackIsOwned = true;
+ nsPIDOMWindowInner* win = GetOwner();
+ // NOTE(review): this throw happens after the service and task queue were
+ // already created above -- confirm nothing needs to be released here.
+ if (!win || !win->IsFullyActive()) {
+ aRv.ThrowInvalidStateError("The document is not fully active.");
+ return;
+ }
+ AutoNoJSAPI nojsapi;
+ RefPtr<SpeechRecognition> self(this);
+ MediaManager::Get()
+ ->GetUserMedia(win, constraints, aCallerType)
+ ->Then(
+ GetCurrentSerialEventTarget(), __func__,
+ [this, self,
+ generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) {
+ nsTArray<RefPtr<AudioStreamTrack>> tracks;
+ aStream->GetAudioTracks(tracks);
+ // The captured generation differs from the current one if Reset()
+ // ran since this request was made.
+ if (mAborted || mCurrentState != STATE_STARTING ||
+ mStreamGeneration != generation) {
+ // We were probably aborted. Exit early.
+ for (const RefPtr<AudioStreamTrack>& track : tracks) {
+ track->Stop();
+ }
+ return;
+ }
+ mStream = std::move(aStream);
+ mStream->RegisterTrackListener(this);
+ // NotifyTrackAdded() ignores further tracks once mTrack is set.
+ for (const RefPtr<AudioStreamTrack>& track : tracks) {
+ if (!track->Ended()) {
+ NotifyTrackAdded(track);
+ }
+ }
+ },
+ [this, self,
+ generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) {
+ if (mAborted || mCurrentState != STATE_STARTING ||
+ mStreamGeneration != generation) {
+ // We were probably aborted. Exit early.
+ return;
+ }
+ SpeechRecognitionErrorCode errorCode;
+
+ // NotAllowedError maps to Not_allowed; anything else is reported
+ // as an audio capture failure.
+ if (error->mName == MediaMgrError::Name::NotAllowedError) {
+ errorCode = SpeechRecognitionErrorCode::Not_allowed;
+ } else {
+ errorCode = SpeechRecognitionErrorCode::Audio_capture;
+ }
+ DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
+ error->mMessage);
+ });
+ }
+
+ RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
+ NS_DispatchToMainThread(event);
+}
+
+// Resolves the recognition language and instantiates the recognition
+// service, storing it in mRecognitionService.
+//
+// The language is mLang when set; otherwise it is the language of the owner
+// document's root element.
+//
+// Returns true on success. On failure (no owner window, no document, no root
+// element, or no service could be created) aRv is set to
+// NS_ERROR_DOM_INVALID_STATE_ERR and false is returned.
+bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {
+  if (!GetOwner()) {
+    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+    return false;
+  }
+
+  // See:
+  // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
+  nsAutoString lang;
+  if (!mLang.IsEmpty()) {
+    lang = mLang;
+  } else {
+    nsCOMPtr<Document> document = GetOwner()->GetExtantDoc();
+    if (!document) {
+      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+      return false;
+    }
+    nsCOMPtr<Element> element = document->GetRootElement();
+    if (!element) {
+      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+      return false;
+    }
+
+    // Fill in the outer |lang|. A second local with the same name used to be
+    // declared here, shadowing it, so the document language was computed and
+    // then discarded and the service always received an empty language.
+    element->GetLang(lang);
+  }
+
+  auto result = CreateSpeechRecognitionService(GetOwner(), this, lang);
+
+  if (result.isErr()) {
+    switch (result.unwrapErr()) {
+      case ServiceCreationError::ServiceNotFound:
+        aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+        break;
+      default:
+        MOZ_CRASH("Unknown error");
+    }
+    return false;
+  }
+
+  mRecognitionService = result.unwrap();
+  MOZ_DIAGNOSTIC_ASSERT(mRecognitionService);
+  return true;
+}
+
+// Passes every grammar of mSpeechGrammarList to the recognition service for
+// validation.
+//
+// Returns true when all grammars were accepted. Returns false with aRv set
+// when there is no grammar list, when fetching an item fails, or when the
+// service rejects a grammar (NS_ERROR_DOM_INVALID_STATE_ERR in the first and
+// last case).
+bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) {
+  if (!mSpeechGrammarList) {
+    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+    return false;
+  }
+
+  const uint32_t length = mSpeechGrammarList->Length();
+  for (uint32_t index = 0; index < length; ++index) {
+    RefPtr<SpeechGrammar> grammar = mSpeechGrammarList->Item(index, aRv);
+    if (aRv.Failed()) {
+      return false;
+    }
+
+    const nsresult validationRv =
+        mRecognitionService->ValidateAndSetGrammarList(grammar.get(), nullptr);
+    if (NS_FAILED(validationRv)) {
+      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Requests a stop by queueing EVENT_STOP to the main thread; the state
+// machine acts on it when the event runs.
+void SpeechRecognition::Stop() {
+ RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
+ NS_DispatchToMainThread(event);
+}
+
+// Requests an abort by queueing EVENT_ABORT to the main thread. Does nothing
+// when an abort is already pending.
+void SpeechRecognition::Abort() {
+ if (mAborted) {
+ return;
+ }
+
+ // From now on ProcessEvent() drops every event except EVENT_ABORT; the flag
+ // is cleared again in Reset().
+ mAborted = true;
+
+ RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
+ NS_DispatchToMainThread(event);
+}
+
+// Track listener callback: starts recording from aTrack if nothing is being
+// recorded yet and aTrack is an audio track that has not ended. Anything else
+// is ignored.
+void SpeechRecognition::NotifyTrackAdded(
+    const RefPtr<MediaStreamTrack>& aTrack) {
+  if (mTrack) {
+    // Already recording from a track.
+    return;
+  }
+
+  RefPtr<AudioStreamTrack> audioTrack = aTrack->AsAudioStreamTrack();
+  const bool usable = audioTrack && !audioTrack->Ended();
+  if (!usable) {
+    return;
+  }
+
+  StartRecording(audioTrack);
+}
+
+void SpeechRecognition::DispatchError(EventType aErrorType,
+ SpeechRecognitionErrorCode aErrorCode,
+ const nsACString& aMessage) {
+ MOZ_ASSERT(NS_IsMainThread());
+ MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
+ aErrorType == EVENT_AUDIO_ERROR,
+ "Invalid error type!");
+
+ RefPtr<SpeechRecognitionError> srError =
+ new SpeechRecognitionError(nullptr, nullptr, nullptr);
+
+ srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode,
+ aMessage);
+
+ RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
+ event->mError = srError;
+ NS_DispatchToMainThread(event);
+}
+
+/*
+ * Copy samples from aSamples into mAudioSamplesBuffer, after the samples
+ * already buffered there, until either aSampleCount samples have been copied
+ * or the buffer holds mAudioSamplesPerChunk samples.
+ * Updates mBufferedSamples and returns the number of samples that were
+ * buffered.
+ */
+uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
+ uint32_t aSampleCount) {
+ MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
+ MOZ_ASSERT(mAudioSamplesBuffer);
+
+ int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
+ // Never copy more than the room left in the current chunk.
+ size_t samplesToCopy =
+ std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples);
+
+ PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy);
+
+ mBufferedSamples += samplesToCopy;
+ return samplesToCopy;
+}
+
+/*
+ * Split a samples buffer of a given size into chunks of
+ * mAudioSamplesPerChunk samples each. The chunks are appended to the array
+ * received as argument; trailing samples that do not fill a whole chunk are
+ * left untouched.
+ * Returns the offset of the end of the last chunk that was
+ * created.
+ */
+uint32_t SpeechRecognition::SplitSamplesBuffer(
+    const int16_t* aSamplesBuffer, uint32_t aSampleCount,
+    nsTArray<RefPtr<SharedBuffer>>& aResult) {
+  uint32_t offset = 0;
+
+  for (; offset + mAudioSamplesPerChunk <= aSampleCount;
+       offset += mAudioSamplesPerChunk) {
+    CheckedInt<size_t> chunkBytes(sizeof(int16_t));
+    chunkBytes *= mAudioSamplesPerChunk;
+    RefPtr<SharedBuffer> chunk = SharedBuffer::Create(chunkBytes);
+
+    const int16_t* source = aSamplesBuffer + offset;
+    PodCopy(static_cast<short*>(chunk->Data()), source, mAudioSamplesPerChunk);
+
+    aResult.AppendElement(chunk.forget());
+  }
+
+  return offset;
+}
+
+// Builds a new heap-allocated AudioSegment holding one single-channel run of
+// mAudioSamplesPerChunk frames per buffer in aChunks, in order. The caller
+// takes ownership of the returned segment.
+AudioSegment* SpeechRecognition::CreateAudioSegment(
+    nsTArray<RefPtr<SharedBuffer>>& aChunks) {
+  AudioSegment* result = new AudioSegment();
+
+  for (const RefPtr<SharedBuffer>& chunk : aChunks) {
+    // Take our own reference; it is handed over to the segment below.
+    RefPtr<SharedBuffer> buffer = chunk;
+    const int16_t* samples = static_cast<const int16_t*>(buffer->Data());
+
+    AutoTArray<const int16_t*, 1> channels;
+    channels.AppendElement(samples);
+    result->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
+                         PRINCIPAL_HANDLE_NONE);
+  }
+
+  return result;
+}
+
+// Receives aDuration captured samples (aSamples), re-chunks them into pieces
+// of mAudioSamplesPerChunk samples and posts them to the main thread as an
+// EVENT_AUDIO_DATA SpeechEvent. Samples that do not fill a whole chunk are
+// kept in mAudioSamplesBuffer for the next call. Must not be called on the
+// main thread.
+void SpeechRecognition::FeedAudioData(
+ nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
+ already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration,
+ MediaTrackListener* aProvider, TrackRate aTrackRate) {
+ NS_ASSERTION(!NS_IsMainThread(),
+ "FeedAudioData should not be called in the main thread");
+
+ // Endpointer expects to receive samples in chunks whose size is a
+ // multiple of its frame size.
+ // Since we can't assume we will receive the frames in appropriate-sized
+ // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
+ // (a multiple of Endpointer's frame size) before feeding to Endpointer.
+
+ // ensure aSamples is deleted
+ RefPtr<SharedBuffer> refSamples = aSamples;
+
+ // Number of input samples consumed so far.
+ uint32_t samplesIndex = 0;
+ const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
+ AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;
+
+ // fill up our buffer and make a chunk out of it, if possible
+ if (mBufferedSamples > 0) {
+ samplesIndex += FillSamplesBuffer(samples, aDuration);
+
+ if (mBufferedSamples == mAudioSamplesPerChunk) {
+ chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
+ mBufferedSamples = 0;
+ }
+ }
+
+ // create sample chunks of correct size
+ if (samplesIndex < aDuration) {
+ samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
+ aDuration - samplesIndex, chunksToSend);
+ }
+
+ // buffer remaining samples
+ if (samplesIndex < aDuration) {
+ mBufferedSamples = 0;
+ CheckedInt<size_t> bufferSize(sizeof(int16_t));
+ bufferSize *= mAudioSamplesPerChunk;
+ mAudioSamplesBuffer = SharedBuffer::Create(bufferSize);
+
+ FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
+ }
+
+ // The event takes ownership of the segment (deleted in ~SpeechEvent).
+ AudioSegment* segment = CreateAudioSegment(chunksToSend);
+ RefPtr<SpeechEvent> event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA);
+ event->mAudioSegment = segment;
+ event->mProvider = aProvider;
+ event->mTrackRate = aTrackRate;
+ NS_DispatchToMainThread(event);
+}
+
+// Returns the name of an FSM state, for logging. The table is indexed by the
+// state's numeric value, so its order must match the FSMState enum.
+const char* SpeechRecognition::GetName(FSMState aId) {
+ static const char* names[] = {
+ "STATE_IDLE", "STATE_STARTING",
+ "STATE_ESTIMATING", "STATE_WAITING_FOR_SPEECH",
+ "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT",
+ "STATE_ABORTING",
+ };
+
+ MOZ_ASSERT(aId < STATE_COUNT);
+ MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
+ return names[aId];
+}
+
+// Returns the name of aEvent's type, for logging. The table is indexed by the
+// type's numeric value, so its order must match the EventType enum.
+const char* SpeechRecognition::GetName(SpeechEvent* aEvent) {
+ static const char* names[] = {"EVENT_START",
+ "EVENT_STOP",
+ "EVENT_ABORT",
+ "EVENT_AUDIO_DATA",
+ "EVENT_AUDIO_ERROR",
+ "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
+ "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
+ "EVENT_RECOGNITIONSERVICE_ERROR"};
+
+ MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
+ MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
+ return names[aEvent->mType];
+}
+
+// Returns the encode task queue created in Start(); it is null after Reset().
+// Main thread only.
+TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const {
+ MOZ_ASSERT(NS_IsMainThread());
+ return mEncodeTaskQueue;
+}
+
+// Creates an event of type aType targeting aRecognition. The raw pointer is
+// wrapped in a new nsMainThreadPtrHolder; payload fields start out empty.
+SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition,
+ SpeechRecognition::EventType aType)
+ : Runnable("dom::SpeechEvent"),
+ mAudioSegment(nullptr),
+ mRecognitionResultList(nullptr),
+ mError(nullptr),
+ mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>(
+ "SpeechEvent::SpeechEvent", aRecognition)),
+ mType(aType),
+ mTrackRate(0) {}
+
+// Creates an event of type aType from an existing main-thread pointer handle
+// (used by FeedAudioData(), which runs off the main thread). Payload fields
+// start out empty.
+SpeechEvent::SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
+ SpeechRecognition::EventType aType)
+ : Runnable("dom::SpeechEvent"),
+ mAudioSegment(nullptr),
+ mRecognitionResultList(nullptr),
+ mError(nullptr),
+ mRecognition(aRecognition),
+ mType(aType),
+ mTrackRate(0) {}
+
+// The event owns its audio segment (if any) and frees it here.
+SpeechEvent::~SpeechEvent() { delete mAudioSegment; }
+
+// Runnable body: hands this event to the target SpeechRecognition's state
+// machine. Always returns NS_OK.
+NS_IMETHODIMP
+SpeechEvent::Run() {
+ mRecognition->ProcessEvent(this);
+ return NS_OK;
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.h b/dom/media/webspeech/recognition/SpeechRecognition.h
new file mode 100644
index 0000000000..687f38041e
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.h
@@ -0,0 +1,314 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognition_h
+#define mozilla_dom_SpeechRecognition_h
+
+#include "mozilla/Attributes.h"
+#include "mozilla/DOMEventTargetHelper.h"
+#include "nsCOMPtr.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "nsTArray.h"
+#include "js/TypeDecls.h"
+#include "nsProxyRelease.h"
+#include "DOMMediaStream.h"
+#include "nsITimer.h"
+#include "MediaTrackGraph.h"
+#include "AudioSegment.h"
+#include "mozilla/WeakPtr.h"
+
+#include "SpeechGrammarList.h"
+#include "SpeechRecognitionResultList.h"
+#include "nsISpeechRecognitionService.h"
+#include "endpointer.h"
+
+#include "mozilla/dom/BindingDeclarations.h"
+#include "mozilla/dom/SpeechRecognitionError.h"
+
+namespace mozilla {
+
+namespace media {
+class ShutdownBlocker;
+}
+
+namespace dom {
+
+// Topic strings used by the speech recognition test hooks.
+// NOTE(review): presumably nsIObserver topics consumed by
+// SpeechRecognition::Observe / ProcessTestEventRequest -- confirm in the .cpp.
+#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
+ "SpeechRecognitionTest:RequestEvent"
+#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
+
+class GlobalObject;
+class AudioStreamTrack;
+class SpeechEvent;
+class SpeechTrackListener;
+
+// Accessor for the log module that SR_LOG writes to (defined elsewhere).
+LogModule* GetSpeechRecognitionLog();
+// Logging helper: forwards its arguments to MOZ_LOG at Debug level on the
+// speech recognition log module.
+#define SR_LOG(...) \
+ MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
+
+// DOM object behind the SpeechRecognition interface. A session is modelled as
+// a finite state machine: SpeechEvents (see EventType) are processed against
+// the current FSMState via ProcessEvent()/Transition(), and the private
+// handler methods below implement the individual transitions.
+class SpeechRecognition final : public DOMEventTargetHelper,
+ public nsIObserver,
+ public DOMMediaStream::TrackListener,
+ public SupportsWeakPtr {
+ public:
+ explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);
+
+ NS_DECL_ISUPPORTS_INHERITED
+ NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition,
+ DOMEventTargetHelper)
+
+ NS_DECL_NSIOBSERVER
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);
+
+ static already_AddRefed<SpeechRecognition> Constructor(
+ const GlobalObject& aGlobal, ErrorResult& aRv);
+
+ // Alias constructor: simply forwards to Constructor().
+ static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition(
+ const GlobalObject& aGlobal, ErrorResult& aRv) {
+ return Constructor(aGlobal, aRv);
+ }
+
+ already_AddRefed<SpeechGrammarList> Grammars() const;
+
+ void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
+
+ void GetLang(nsString& aRetVal) const;
+
+ void SetLang(const nsAString& aArg);
+
+ bool GetContinuous(ErrorResult& aRv) const;
+
+ void SetContinuous(bool aArg, ErrorResult& aRv);
+
+ bool InterimResults() const;
+
+ void SetInterimResults(bool aArg);
+
+ uint32_t MaxAlternatives() const;
+
+ // Returns mEncodeTaskQueue; main thread only (asserted in the definition).
+ TaskQueue* GetTaskQueueForEncoding() const;
+
+ void SetMaxAlternatives(uint32_t aArg);
+
+ void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
+
+ void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
+
+ void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
+ CallerType aCallerType, ErrorResult& aRv);
+
+ void Stop();
+
+ void Abort();
+
+ IMPL_EVENT_HANDLER(audiostart)
+ IMPL_EVENT_HANDLER(soundstart)
+ IMPL_EVENT_HANDLER(speechstart)
+ IMPL_EVENT_HANDLER(speechend)
+ IMPL_EVENT_HANDLER(soundend)
+ IMPL_EVENT_HANDLER(audioend)
+ IMPL_EVENT_HANDLER(result)
+ IMPL_EVENT_HANDLER(nomatch)
+ IMPL_EVENT_HANDLER(error)
+ IMPL_EVENT_HANDLER(start)
+ IMPL_EVENT_HANDLER(end)
+
+ // Types of SpeechEvent fed to the state machine. EVENT_COUNT is not a real
+ // event; it is the number of event types and must stay last (GetName() uses
+ // it to size-check its name table).
+ enum EventType {
+ EVENT_START,
+ EVENT_STOP,
+ EVENT_ABORT,
+ EVENT_AUDIO_DATA,
+ EVENT_AUDIO_ERROR,
+ EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
+ EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
+ EVENT_RECOGNITIONSERVICE_ERROR,
+ EVENT_COUNT
+ };
+
+ void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack) override;
+ // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are
+ // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16.
+ void DispatchError(EventType aErrorType,
+ SpeechRecognitionErrorCode aErrorCode,
+ const nsACString& aMessage);
+ // Convenience overload for string literals: wraps the literal in an
+ // nsLiteralCString and forwards to the overload above.
+ template <int N>
+ void DispatchError(EventType aErrorType,
+ SpeechRecognitionErrorCode aErrorCode,
+ const char (&aMessage)[N]) {
+ DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage));
+ }
+ uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
+ uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer,
+ uint32_t aSampleCount,
+ nsTArray<RefPtr<SharedBuffer>>& aResult);
+ AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
+ void FeedAudioData(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
+ already_AddRefed<SharedBuffer> aSamples,
+ uint32_t aDuration, MediaTrackListener* aProvider,
+ TrackRate aTrackRate);
+
+ friend class SpeechEvent;
+
+ private:
+ virtual ~SpeechRecognition();
+
+ // States of the recognition state machine. STATE_COUNT is the number of
+ // states, not a state, and must stay last (GetName() uses it to size-check
+ // its name table).
+ enum FSMState {
+ STATE_IDLE,
+ STATE_STARTING,
+ STATE_ESTIMATING,
+ STATE_WAITING_FOR_SPEECH,
+ STATE_RECOGNIZING,
+ STATE_WAITING_FOR_RESULT,
+ STATE_ABORTING,
+ STATE_COUNT
+ };
+
+ void SetState(FSMState state);
+ bool StateBetween(FSMState begin, FSMState end);
+
+ bool SetRecognitionService(ErrorResult& aRv);
+ bool ValidateAndSetGrammarList(ErrorResult& aRv);
+
+ NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream);
+ RefPtr<GenericNonExclusivePromise> StopRecording();
+
+ uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
+ void NotifyError(SpeechEvent* aEvent);
+
+ // Entry point called by SpeechEvent::Run() for every dispatched event.
+ void ProcessEvent(SpeechEvent* aEvent);
+ void Transition(SpeechEvent* aEvent);
+
+ void Reset();
+ void ResetAndEnd();
+ void WaitForAudioData(SpeechEvent* aEvent);
+ void StartedAudioCapture(SpeechEvent* aEvent);
+ void StopRecordingAndRecognize(SpeechEvent* aEvent);
+ void WaitForEstimation(SpeechEvent* aEvent);
+ void DetectSpeech(SpeechEvent* aEvent);
+ void WaitForSpeechEnd(SpeechEvent* aEvent);
+ void NotifyFinalResult(SpeechEvent* aEvent);
+ void DoNothing(SpeechEvent* aEvent);
+ void AbortSilently(SpeechEvent* aEvent);
+ void AbortError(SpeechEvent* aEvent);
+
+ RefPtr<DOMMediaStream> mStream;
+ RefPtr<AudioStreamTrack> mTrack;
+ bool mTrackIsOwned = false;
+ RefPtr<GenericNonExclusivePromise> mStopRecordingPromise;
+ RefPtr<SpeechTrackListener> mSpeechListener;
+ nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
+ RefPtr<media::ShutdownBlocker> mShutdownBlocker;
+ // TaskQueue the recognition service uses to pre-process the samples; it
+ // runs on a thread other than the main thread.
+ RefPtr<TaskQueue> mEncodeTaskQueue;
+
+ // A generation ID of the MediaStream a started session is for, so that
+ // a gUM request that resolves after the session has stopped, and a new
+ // one has started, can exit early. Main thread only. Can wrap.
+ uint8_t mStreamGeneration = 0;
+
+ FSMState mCurrentState;
+
+ Endpointer mEndpointer;
+ uint32_t mEstimationSamples;
+
+ uint32_t mAudioSamplesPerChunk;
+
+ // maximum amount of time (in milliseconds, going by the Ms suffix) the
+ // engine will wait for voice until returning a 'no speech detected' error
+ uint32_t mSpeechDetectionTimeoutMs;
+
+ // buffer holds one chunk of mAudioSamplesPerChunk
+ // samples before feeding it to mEndpointer
+ RefPtr<SharedBuffer> mAudioSamplesBuffer;
+ uint32_t mBufferedSamples;
+
+ nsCOMPtr<nsITimer> mSpeechDetectionTimer;
+ bool mAborted;
+
+ nsString mLang;
+
+ RefPtr<SpeechGrammarList> mSpeechGrammarList;
+
+ // private flag used to hold if the user called the setContinuous() method
+ // of the API
+ bool mContinuous;
+
+ // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
+ //
+ // 1. Default value MUST be false
+ // 2. If true, interim results SHOULD be returned
+ // 3. If false, interim results MUST NOT be returned
+ //
+ // Pocketsphinx does not return interim results; so, defaulting
+ // mInterimResults to false, then ignoring its subsequent value
+ // is a conforming implementation.
+ bool mInterimResults;
+
+ // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
+ //
+ // 1. Default value is 1
+ // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives
+ // per result"
+ //
+ // Pocketsphinx can only return at maximum a single
+ // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting
+ // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives
+ // while for a 0 value returning no SpeechRecognitionAlternative per result is
+ // a conforming implementation.
+ uint32_t mMaxAlternatives;
+
+ void ProcessTestEventRequest(nsISupports* aSubject,
+ const nsAString& aEventName);
+
+ // Printable names of states and event types, for logging.
+ const char* GetName(FSMState aId);
+ const char* GetName(SpeechEvent* aEvent);
+};
+
+// Runnable carrying one state-machine event to a SpeechRecognition object.
+// Run() passes the event to SpeechRecognition::ProcessEvent().
+class SpeechEvent : public Runnable {
+ public:
+ SpeechEvent(SpeechRecognition* aRecognition,
+ SpeechRecognition::EventType aType);
+ SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
+ SpeechRecognition::EventType aType);
+
+ ~SpeechEvent();
+
+ NS_IMETHOD Run() override;
+ // Owned raw pointer; deleted by the destructor. Null when unused.
+ AudioSegment* mAudioSegment;
+ RefPtr<SpeechRecognitionResultList>
+ mRecognitionResultList; // TODO: make this a session being passed which
+ // also has index and stuff
+ RefPtr<SpeechRecognitionError> mError;
+
+ friend class SpeechRecognition;
+
+ private:
+ // The recognition object this event is delivered to.
+ nsMainThreadPtrHandle<SpeechRecognition> mRecognition;
+
+ // for AUDIO_DATA events, keep a reference to the provider
+ // of the data (i.e., the SpeechTrackListener) to ensure it
+ // is kept alive (and keeps SpeechRecognition alive) until this
+ // event gets processed.
+ RefPtr<MediaTrackListener> mProvider;
+ SpeechRecognition::EventType mType;
+ TrackRate mTrackRate;
+};
+
+} // namespace dom
+
+// SpeechRecognition has several base classes, so name the
+// DOMEventTargetHelper base explicitly when converting to nsISupports.
+inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) {
+ return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
+}
+
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp
new file mode 100644
index 0000000000..4dee9090a7
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp
@@ -0,0 +1,44 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechRecognitionAlternative.h"
+
+#include "mozilla/dom/SpeechRecognitionAlternativeBinding.h"
+
+#include "SpeechRecognition.h"
+
+namespace mozilla::dom {
+
+// Cycle collection / XPCOM boilerplate. mParent is the only member reported
+// to the cycle collector; the interface map exposes the wrapper cache entry
+// and nsISupports.
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionAlternative, mParent)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionAlternative)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionAlternative)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionAlternative)
+ NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+ NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+// Creates an alternative owned by |aParent| with confidence 0; mTranscript is
+// default-constructed.
+SpeechRecognitionAlternative::SpeechRecognitionAlternative(
+ SpeechRecognition* aParent)
+ : mConfidence(0), mParent(aParent) {}
+
+SpeechRecognitionAlternative::~SpeechRecognitionAlternative() = default;
+
+// Creates the JS reflector through the generated binding.
+JSObject* SpeechRecognitionAlternative::WrapObject(
+ JSContext* aCx, JS::Handle<JSObject*> aGivenProto) {
+ return SpeechRecognitionAlternative_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Returns the owning SpeechRecognition, viewed through its EventTarget base.
+nsISupports* SpeechRecognitionAlternative::GetParentObject() const {
+ return static_cast<EventTarget*>(mParent.get());
+}
+
+// Copies the stored transcript into |aRetVal|.
+void SpeechRecognitionAlternative::GetTranscript(nsString& aRetVal) const {
+ aRetVal = mTranscript;
+}
+
+// Returns the stored confidence value.
+float SpeechRecognitionAlternative::Confidence() const { return mConfidence; }
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h
new file mode 100644
index 0000000000..017d869943
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h
@@ -0,0 +1,49 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognitionAlternative_h
+#define mozilla_dom_SpeechRecognitionAlternative_h
+
+#include "nsCycleCollectionParticipant.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "js/TypeDecls.h"
+
+#include "mozilla/Attributes.h"
+
+namespace mozilla::dom {
+
+class SpeechRecognition;
+
+// One recognition alternative: a transcript string plus a confidence value,
+// exposed to script through a wrapper-cached DOM object.
+class SpeechRecognitionAlternative final : public nsISupports,
+ public nsWrapperCache {
+ public:
+ explicit SpeechRecognitionAlternative(SpeechRecognition* aParent);
+
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionAlternative)
+
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ void GetTranscript(nsString& aRetVal) const;
+
+ float Confidence() const;
+
+ // Public data members, read by GetTranscript() / Confidence().
+ nsString mTranscript;
+ float mConfidence;
+
+ private:
+ ~SpeechRecognitionAlternative();
+
+ // Strong reference to the SpeechRecognition passed to the constructor.
+ RefPtr<SpeechRecognition> mParent;
+};
+
+} // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp
new file mode 100644
index 0000000000..009281b234
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp
@@ -0,0 +1,59 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechRecognitionResult.h"
+#include "mozilla/dom/SpeechRecognitionResultBinding.h"
+
+#include "SpeechRecognition.h"
+
+namespace mozilla::dom {
+
+// Cycle collection / XPCOM boilerplate. Every strong edge owned by this class
+// is reported to the cycle collector: mParent and the alternatives held in
+// mItems (the same way SpeechRecognitionResultList reports its own mItems).
+// Leaving mItems out would hide the result -> alternative edges from the
+// collector and could keep cycles through them alive forever.
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResult, mParent, mItems)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResult)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResult)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResult)
+  NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+  NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+// Creates an empty result owned by |aParent|.
+SpeechRecognitionResult::SpeechRecognitionResult(SpeechRecognition* aParent)
+ : mParent(aParent) {}
+
+SpeechRecognitionResult::~SpeechRecognitionResult() = default;
+
+// Creates the JS reflector through the generated binding.
+JSObject* SpeechRecognitionResult::WrapObject(
+ JSContext* aCx, JS::Handle<JSObject*> aGivenProto) {
+ return SpeechRecognitionResult_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Returns the owning SpeechRecognition, viewed through its EventTarget base.
+nsISupports* SpeechRecognitionResult::GetParentObject() const {
+ return static_cast<EventTarget*>(mParent.get());
+}
+
+// Indexed getter: sets |aPresent| to whether |aIndex| is a valid position and
+// returns the alternative stored there, or null when the index is out of
+// range.
+already_AddRefed<SpeechRecognitionAlternative>
+SpeechRecognitionResult::IndexedGetter(uint32_t aIndex, bool& aPresent) {
+  aPresent = aIndex < Length();
+  if (aPresent) {
+    return Item(aIndex);
+  }
+  return nullptr;
+}
+
+// Number of alternatives stored in this result.
+uint32_t SpeechRecognitionResult::Length() const { return mItems.Length(); }
+
+// Returns a new strong reference to the alternative at |aIndex|.
+// NOTE(review): |aIndex| is not range-checked here; IndexedGetter() checks it
+// against Length() before calling, other callers must do the same.
+already_AddRefed<SpeechRecognitionAlternative> SpeechRecognitionResult::Item(
+ uint32_t aIndex) {
+ RefPtr<SpeechRecognitionAlternative> alternative = mItems.ElementAt(aIndex);
+ return alternative.forget();
+}
+
+// Currently hard-coded: every result is reported as final.
+bool SpeechRecognitionResult::IsFinal() const {
+ return true; // TODO
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.h b/dom/media/webspeech/recognition/SpeechRecognitionResult.h
new file mode 100644
index 0000000000..fc9e8fd660
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionResult.h
@@ -0,0 +1,54 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognitionResult_h
+#define mozilla_dom_SpeechRecognitionResult_h
+
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+#include "nsWrapperCache.h"
+#include "nsTArray.h"
+#include "js/TypeDecls.h"
+
+#include "mozilla/Attributes.h"
+
+#include "SpeechRecognitionAlternative.h"
+
+namespace mozilla::dom {
+
+// One recognition result: an ordered list of SpeechRecognitionAlternative
+// objects, exposed to script through a wrapper-cached DOM object.
+class SpeechRecognitionResult final : public nsISupports,
+ public nsWrapperCache {
+ public:
+ explicit SpeechRecognitionResult(SpeechRecognition* aParent);
+
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResult)
+
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ uint32_t Length() const;
+
+ // Does not range-check aIndex; see IndexedGetter() for the checked variant.
+ already_AddRefed<SpeechRecognitionAlternative> Item(uint32_t aIndex);
+
+ bool IsFinal() const;
+
+ already_AddRefed<SpeechRecognitionAlternative> IndexedGetter(uint32_t aIndex,
+ bool& aPresent);
+
+ // The alternatives of this result; public so producers can append directly.
+ nsTArray<RefPtr<SpeechRecognitionAlternative>> mItems;
+
+ private:
+ ~SpeechRecognitionResult();
+
+ // Strong reference to the SpeechRecognition passed to the constructor.
+ RefPtr<SpeechRecognition> mParent;
+};
+
+} // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp
new file mode 100644
index 0000000000..2aa81a5982
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp
@@ -0,0 +1,58 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechRecognitionResultList.h"
+
+#include "mozilla/dom/SpeechRecognitionResultListBinding.h"
+
+#include "SpeechRecognition.h"
+
+namespace mozilla::dom {
+
+// Cycle collection / XPCOM boilerplate. Both strong members, mParent and the
+// mItems array, are reported to the cycle collector.
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResultList, mParent,
+ mItems)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResultList)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResultList)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResultList)
+ NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+ NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+// Creates an empty result list owned by |aParent|.
+SpeechRecognitionResultList::SpeechRecognitionResultList(
+ SpeechRecognition* aParent)
+ : mParent(aParent) {}
+
+SpeechRecognitionResultList::~SpeechRecognitionResultList() = default;
+
+// Returns the owning SpeechRecognition, viewed through its EventTarget base.
+nsISupports* SpeechRecognitionResultList::GetParentObject() const {
+ return static_cast<EventTarget*>(mParent.get());
+}
+
+// Creates the JS reflector through the generated binding.
+JSObject* SpeechRecognitionResultList::WrapObject(
+ JSContext* aCx, JS::Handle<JSObject*> aGivenProto) {
+ return SpeechRecognitionResultList_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+// Indexed getter: sets |aPresent| to whether |aIndex| is a valid position and
+// returns the result stored there, or null when the index is out of range.
+already_AddRefed<SpeechRecognitionResult>
+SpeechRecognitionResultList::IndexedGetter(uint32_t aIndex, bool& aPresent) {
+  aPresent = aIndex < Length();
+  if (aPresent) {
+    return Item(aIndex);
+  }
+  return nullptr;
+}
+
+// Number of results stored in this list.
+uint32_t SpeechRecognitionResultList::Length() const { return mItems.Length(); }
+
+// Returns a new strong reference to the result at |aIndex|.
+// NOTE(review): |aIndex| is not range-checked here; IndexedGetter() checks it
+// against Length() before calling, other callers must do the same.
+already_AddRefed<SpeechRecognitionResult> SpeechRecognitionResultList::Item(
+ uint32_t aIndex) {
+ RefPtr<SpeechRecognitionResult> result = mItems.ElementAt(aIndex);
+ return result.forget();
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.h b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h
new file mode 100644
index 0000000000..b45659564b
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h
@@ -0,0 +1,53 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognitionResultList_h
+#define mozilla_dom_SpeechRecognitionResultList_h
+
+#include "nsCycleCollectionParticipant.h"
+#include "nsWrapperCache.h"
+#include "nsTArray.h"
+#include "js/TypeDecls.h"
+
+#include "mozilla/Attributes.h"
+
+#include "SpeechRecognitionResult.h"
+
+namespace mozilla::dom {
+
+class SpeechRecognition;
+
+// Ordered list of SpeechRecognitionResult objects, exposed to script through
+// a wrapper-cached DOM object.
+class SpeechRecognitionResultList final : public nsISupports,
+ public nsWrapperCache {
+ public:
+ explicit SpeechRecognitionResultList(SpeechRecognition* aParent);
+
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResultList)
+
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx,
+ JS::Handle<JSObject*> aGivenProto) override;
+
+ uint32_t Length() const;
+
+ // Does not range-check aIndex; see IndexedGetter() for the checked variant.
+ already_AddRefed<SpeechRecognitionResult> Item(uint32_t aIndex);
+
+ already_AddRefed<SpeechRecognitionResult> IndexedGetter(uint32_t aIndex,
+ bool& aPresent);
+
+ // The results in this list; public so producers can append directly.
+ nsTArray<RefPtr<SpeechRecognitionResult>> mItems;
+
+ private:
+ ~SpeechRecognitionResultList();
+
+ // Strong reference to the SpeechRecognition passed to the constructor.
+ RefPtr<SpeechRecognition> mParent;
+};
+
+} // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.cpp b/dom/media/webspeech/recognition/SpeechTrackListener.cpp
new file mode 100644
index 0000000000..036ff753ba
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechTrackListener.cpp
@@ -0,0 +1,92 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechTrackListener.h"
+
+#include "SpeechRecognition.h"
+#include "nsProxyRelease.h"
+
+namespace mozilla::dom {
+
+// Must be constructed on the main thread (asserted). Wraps |aRecognition| in
+// a main-thread pointer holder and creates mRemovedPromise from
+// mRemovedHolder. Once that promise resolves (see NotifyRemoved()), the
+// reference to the recognition object is dropped.
+// NOTE(review): the trailing `false` passed to nsMainThreadPtrHolder is
+// presumably its "strict" flag -- confirm against nsProxyRelease.h.
+SpeechTrackListener::SpeechTrackListener(SpeechRecognition* aRecognition)
+ : mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>(
+ "SpeechTrackListener::SpeechTrackListener", aRecognition, false)),
+ mRemovedPromise(
+ mRemovedHolder.Ensure("SpeechTrackListener::mRemovedPromise")) {
+ MOZ_ASSERT(NS_IsMainThread());
+ // The lambda captures a strong reference to this listener (|self|) so the
+ // object stays alive until the callback has run.
+ mRemovedPromise->Then(GetCurrentSerialEventTarget(), __func__,
+ [self = RefPtr<SpeechTrackListener>(this), this] {
+ mRecognition = nullptr;
+ });
+}
+
+// Walks every chunk of the queued audio, converts it to 16-bit samples and
+// forwards it to the recognition object via ConvertAndDispatchAudioChunk().
+// Null (silent) chunks are replaced by a zero-filled buffer of the same
+// duration. Only the first channel of each chunk is used. |aTrackOffset| is
+// unused.
+void SpeechTrackListener::NotifyQueuedChanges(
+    MediaTrackGraph* aGraph, TrackTime aTrackOffset,
+    const MediaSegment& aQueuedMedia) {
+  AudioSegment* audio = const_cast<AudioSegment*>(
+      static_cast<const AudioSegment*>(&aQueuedMedia));
+
+  // A for loop guarantees the iterator advances on every path. The previous
+  // while loop `continue`d past iterator.Next() for over-large chunks and
+  // therefore spun forever on the same chunk.
+  for (AudioSegment::ChunkIterator iterator(*audio); !iterator.IsEnded();
+       iterator.Next()) {
+    // Skip over-large chunks so we don't crash!
+    if (iterator->GetDuration() > INT_MAX) {
+      continue;
+    }
+    int duration = int(iterator->GetDuration());
+
+    if (iterator->IsNull()) {
+      // Silence: synthesize |duration| zero samples.
+      nsTArray<int16_t> nullData;
+      PodZero(nullData.AppendElements(duration), duration);
+      ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
+                                   nullData.Elements(), aGraph->GraphRate());
+    } else {
+      AudioSampleFormat format = iterator->mBufferFormat;
+
+      MOZ_ASSERT(format == AUDIO_FORMAT_S16 || format == AUDIO_FORMAT_FLOAT32);
+
+      if (format == AUDIO_FORMAT_S16) {
+        ConvertAndDispatchAudioChunk(
+            duration, iterator->mVolume,
+            static_cast<const int16_t*>(iterator->mChannelData[0]),
+            aGraph->GraphRate());
+      } else if (format == AUDIO_FORMAT_FLOAT32) {
+        ConvertAndDispatchAudioChunk(
+            duration, iterator->mVolume,
+            static_cast<const float*>(iterator->mChannelData[0]),
+            aGraph->GraphRate());
+      }
+    }
+  }
+}
+
+// Converts |aDuration| samples starting at |aData| to 16-bit samples, applying
+// |aVolume| as the scale, into a freshly created SharedBuffer, then hands that
+// buffer to the recognition object together with |aTrackRate|.
+template <typename SampleFormatType>
+void SpeechTrackListener::ConvertAndDispatchAudioChunk(int aDuration,
+ float aVolume,
+ SampleFormatType* aData,
+ TrackRate aTrackRate) {
+ // Byte size of the mono int16_t output, computed with overflow checking.
+ CheckedInt<size_t> bufferSize(sizeof(int16_t));
+ bufferSize *= aDuration;
+ bufferSize *= 1; // channel
+ RefPtr<SharedBuffer> samples(SharedBuffer::Create(bufferSize));
+
+ int16_t* to = static_cast<int16_t*>(samples->Data());
+ ConvertAudioSamplesWithScale(aData, to, aDuration, aVolume);
+
+ // |this| is passed as the provider of the data.
+ mRecognition->FeedAudioData(mRecognition, samples.forget(), aDuration, this,
+ aTrackRate);
+}
+
+// Intentionally empty for now; see the TODO.
+void SpeechTrackListener::NotifyEnded(MediaTrackGraph* aGraph) {
+ // TODO dispatch SpeechEnd event so services can be informed
+}
+
+// Resolves mRemovedPromise (if it has not been resolved already), which in
+// turn runs the callback installed by the constructor.
+void SpeechTrackListener::NotifyRemoved(MediaTrackGraph* aGraph) {
+ mRemovedHolder.ResolveIfExists(true, __func__);
+}
+
+} // namespace mozilla::dom
diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.h b/dom/media/webspeech/recognition/SpeechTrackListener.h
new file mode 100644
index 0000000000..423a5b0317
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechTrackListener.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechStreamListener_h
+#define mozilla_dom_SpeechStreamListener_h
+
+#include "MediaTrackGraph.h"
+#include "MediaTrackListener.h"
+#include "AudioSegment.h"
+#include "mozilla/MozPromise.h"
+
+namespace mozilla {
+
+class AudioSegment;
+
+namespace dom {
+
+class SpeechRecognition;
+
+// MediaTrackListener that forwards queued audio, converted to 16-bit samples,
+// to a SpeechRecognition object.
+class SpeechTrackListener : public MediaTrackListener {
+ public:
+ explicit SpeechTrackListener(SpeechRecognition* aRecognition);
+ ~SpeechTrackListener() = default;
+
+ void NotifyQueuedChanges(MediaTrackGraph* aGraph, TrackTime aTrackOffset,
+ const MediaSegment& aQueuedMedia) override;
+
+ void NotifyEnded(MediaTrackGraph* aGraph) override;
+
+ void NotifyRemoved(MediaTrackGraph* aGraph) override;
+
+ private:
+ template <typename SampleFormatType>
+ void ConvertAndDispatchAudioChunk(int aDuration, float aVolume,
+ SampleFormatType* aData,
+ TrackRate aTrackRate);
+ // Target of the audio data; cleared once mRemovedPromise resolves.
+ nsMainThreadPtrHandle<SpeechRecognition> mRecognition;
+ // Producer side of mRemovedPromise; resolved in NotifyRemoved().
+ MozPromiseHolder<GenericNonExclusivePromise> mRemovedHolder;
+
+ public:
+ // Resolved when NotifyRemoved() is called on this listener.
+ const RefPtr<GenericNonExclusivePromise> mRemovedPromise;
+};
+
+} // namespace dom
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/endpointer.cc b/dom/media/webspeech/recognition/endpointer.cc
new file mode 100644
index 0000000000..2347043d4b
--- /dev/null
+++ b/dom/media/webspeech/recognition/endpointer.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "endpointer.h"
+
+#include "AudioSegment.h"
+
+namespace {
+// Number of endpointer frames per second of audio; used to derive the frame
+// size in samples and the frame period/duration passed to the energy
+// endpointer.
+const int kFrameRate = 200; // 1 frame = 5ms of audio.
+}
+
+namespace mozilla {
+
+// Creates an endpointer for audio sampled at |sample_rate| Hz, sets the
+// default speech/silence timeouts and initializes the underlying energy
+// endpointer with the default parameter set below.
+Endpointer::Endpointer(int sample_rate)
+ : speech_input_possibly_complete_silence_length_us_(-1),
+ speech_input_complete_silence_length_us_(-1),
+ audio_frame_time_us_(0),
+ sample_rate_(sample_rate),
+ frame_size_(0) {
+ Reset();
+
+ // Number of samples in one frame (kFrameRate frames per second).
+ frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
+
+ // Timeouts in microseconds; -1 disables the "long speech" stepped silence
+ // (ProcessAudio only uses those two values when both are > 0).
+ speech_input_minimum_length_us_ =
+ static_cast<int64_t>(1.7 * 1000000);
+ speech_input_complete_silence_length_us_ =
+ static_cast<int64_t>(0.5 * 1000000);
+ long_speech_input_complete_silence_length_us_ = -1;
+ long_speech_length_us_ = -1;
+ speech_input_possibly_complete_silence_length_us_ =
+ 1 * 1000000;
+
+ // Set the default configuration for Push To Talk mode.
+ EnergyEndpointerParams ep_config;
+ ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
+ ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
+ ep_config.set_endpoint_margin(0.2f);
+ ep_config.set_onset_window(0.15f);
+ ep_config.set_speech_on_window(0.4f);
+ ep_config.set_offset_window(0.15f);
+ ep_config.set_onset_detect_dur(0.09f);
+ ep_config.set_onset_confirm_dur(0.075f);
+ ep_config.set_on_maintain_dur(0.10f);
+ ep_config.set_offset_confirm_dur(0.12f);
+ ep_config.set_decision_threshold(1000.0f);
+ ep_config.set_min_decision_threshold(50.0f);
+ ep_config.set_fast_update_dur(0.2f);
+ ep_config.set_sample_rate(static_cast<float>(sample_rate));
+ ep_config.set_min_fundamental_frequency(57.143f);
+ ep_config.set_max_fundamental_frequency(400.0f);
+ ep_config.set_contamination_rejection_period(0.25f);
+ energy_endpointer_.Init(ep_config);
+}
+
+// Puts the per-session bookkeeping back to its initial values: status is
+// EP_PRE_SPEECH, no timeouts pending, no speech seen, and the frame clock and
+// the speech start/end times are cleared.
+void Endpointer::Reset() {
+ old_ep_status_ = EP_PRE_SPEECH;
+ waiting_for_speech_possibly_complete_timeout_ = false;
+ waiting_for_speech_complete_timeout_ = false;
+ speech_previously_detected_ = false;
+ speech_input_complete_ = false;
+ audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
+ speech_end_time_us_ = -1;
+ speech_start_time_us_ = -1;
+}
+
+// Resets the bookkeeping and starts a session on the energy endpointer.
+void Endpointer::StartSession() {
+ Reset();
+ energy_endpointer_.StartSession();
+}
+
+// Ends the session on the energy endpointer; local bookkeeping is untouched.
+void Endpointer::EndSession() {
+ energy_endpointer_.EndSession();
+}
+
+// Resets the bookkeeping and switches the energy endpointer to environment
+// estimation mode.
+void Endpointer::SetEnvironmentEstimationMode() {
+ Reset();
+ energy_endpointer_.SetEnvironmentEstimationMode();
+}
+
+// Switches the energy endpointer to user input mode (no reset).
+void Endpointer::SetUserInputMode() {
+ energy_endpointer_.SetUserInputMode();
+}
+
+// Forwards to the energy endpointer's Status(), passing |time| through.
+EpStatus Endpointer::Status(int64_t *time) {
+ return energy_endpointer_.Status(time);
+}
+
+// Feeds |raw_audio| (16-bit samples, first channel only) to the energy
+// endpointer one frame at a time and updates the speech start/end bookkeeping
+// from the resulting status changes.
+//
+// |rms_out| is passed through to the energy endpointer for every frame.
+// Returns the endpointer status after the last complete frame, or
+// EP_PRE_SPEECH if no complete frame was processed.
+EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
+  MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format");
+  const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]);
+  const int num_samples = raw_audio.mDuration;
+  EpStatus ep_status = EP_PRE_SPEECH;
+
+  // A non-positive frame size (sample rate below kFrameRate) would make the
+  // loop below never advance, or walk backwards through the buffer. There is
+  // nothing sensible to process in that case.
+  if (frame_size_ <= 0) {
+    return ep_status;
+  }
+
+  // Process the input data in blocks of frame_size_, dropping any incomplete
+  // frames at the end (which is ok since typically the caller will be recording
+  // audio in multiples of our frame size).
+  int sample_index = 0;
+  while (sample_index + frame_size_ <= num_samples) {
+    // Have the endpointer process the frame.
+    energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
+                                         audio_data + sample_index,
+                                         frame_size_,
+                                         rms_out);
+    sample_index += frame_size_;
+    // Do the multiplication in 64 bits so it cannot overflow int for large
+    // frame sizes.
+    audio_frame_time_us_ += (static_cast<int64_t>(frame_size_) * 1000000) /
+                            sample_rate_;
+
+    // Get the status of the endpointer.
+    int64_t ep_time;
+    ep_status = energy_endpointer_.Status(&ep_time);
+    if (old_ep_status_ != ep_status)
+      fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status);
+
+    // Handle state changes.
+    if ((EP_SPEECH_PRESENT == ep_status) &&
+        (EP_POSSIBLE_ONSET == old_ep_status_)) {
+      speech_end_time_us_ = -1;
+      waiting_for_speech_possibly_complete_timeout_ = false;
+      waiting_for_speech_complete_timeout_ = false;
+      // Trigger SpeechInputDidStart event on first detection.
+      if (false == speech_previously_detected_) {
+        speech_previously_detected_ = true;
+        speech_start_time_us_ = ep_time;
+      }
+    }
+    if ((EP_PRE_SPEECH == ep_status) &&
+        (EP_POSSIBLE_OFFSET == old_ep_status_)) {
+      speech_end_time_us_ = ep_time;
+      waiting_for_speech_possibly_complete_timeout_ = true;
+      waiting_for_speech_complete_timeout_ = true;
+    }
+    if (ep_time > speech_input_minimum_length_us_) {
+      // Speech possibly complete timeout.
+      if ((waiting_for_speech_possibly_complete_timeout_) &&
+          (ep_time - speech_end_time_us_ >
+           speech_input_possibly_complete_silence_length_us_)) {
+        waiting_for_speech_possibly_complete_timeout_ = false;
+      }
+      if (waiting_for_speech_complete_timeout_) {
+        // The length of the silence timeout period can be held constant, or it
+        // can be changed after a fixed amount of time from the beginning of
+        // speech.
+        bool has_stepped_silence =
+            (long_speech_length_us_ > 0) &&
+            (long_speech_input_complete_silence_length_us_ > 0);
+        int64_t requested_silence_length;
+        if (has_stepped_silence &&
+            (ep_time - speech_start_time_us_) > long_speech_length_us_) {
+          requested_silence_length =
+              long_speech_input_complete_silence_length_us_;
+        } else {
+          requested_silence_length =
+              speech_input_complete_silence_length_us_;
+        }
+
+        // Speech complete timeout.
+        if ((ep_time - speech_end_time_us_) > requested_silence_length) {
+          waiting_for_speech_complete_timeout_ = false;
+          speech_input_complete_ = true;
+        }
+      }
+    }
+    old_ep_status_ = ep_status;
+  }
+  return ep_status;
+}
+
+} // namespace mozilla
diff --git a/dom/media/webspeech/recognition/endpointer.h b/dom/media/webspeech/recognition/endpointer.h
new file mode 100644
index 0000000000..7879d6b9f3
--- /dev/null
+++ b/dom/media/webspeech/recognition/endpointer.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
+#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
+
+#include "energy_endpointer.h"
+
+namespace mozilla {
+
+struct AudioChunk;
+
+// A simple interface to the underlying energy-endpointer implementation. This
+// class lets callers provide audio as it is being recorded and poll to find
+// out when the user has stopped speaking.
+//
+// There are two events that may trigger the end of speech:
+//
+// speechInputPossiblyComplete event:
+//
+// Signals that silence/noise has been detected for a *short* amount of
+// time after some speech has been detected. It can be used for low latency
+// UI feedback. To disable it, set it to a large amount.
+//
+// speechInputComplete event:
+//
+// This event is intended to signal end of input and to stop recording.
+// The amount of time to wait after speech is set by
+// speech_input_complete_silence_length_ and optionally two other
+// parameters (see below).
+// This time can be held constant, or can change as more speech is detected.
+// In the latter case, the time changes after a set amount of time from the
+// *beginning* of speech. This is motivated by the expectation that there
+// will be two distinct types of inputs: short search queries and longer
+// dictation style input.
+//
+// Three parameters are used to define the piecewise constant timeout function.
+// The timeout length is speech_input_complete_silence_length until
+// long_speech_length, when it changes to
+// long_speech_input_complete_silence_length.
+class Endpointer {
+ public:
+ explicit Endpointer(int sample_rate);
+
+ // Start the endpointer. This should be called at the beginning of a session.
+ void StartSession();
+
+ // Stop the endpointer.
+ void EndSession();
+
+ // Start environment estimation. Audio will be used for environment estimation
+ // i.e. noise level estimation.
+ void SetEnvironmentEstimationMode();
+
+ // Start user input. This should be called when the user indicates start of
+ // input, e.g. by pressing a button.
+ void SetUserInputMode();
+
+ // Process a segment of audio, which may be more than one frame.
+ // The status of the last frame will be returned.
+ EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
+
+ // Get the status of the endpointer.
+ EpStatus Status(int64_t *time_us);
+
+ // Get the expected frame size for audio chunks. Audio chunks are expected
+ // to contain a number of samples that is a multiple of this number, and extra
+ // samples will be dropped.
+ int32_t FrameSize() const {
+ return frame_size_;
+ }
+
+ // Returns true if the endpointer detected reasonable audio levels above
+ // background noise which could be user speech, false if not.
+ bool DidStartReceivingSpeech() const {
+ return speech_previously_detected_;
+ }
+
+ bool IsEstimatingEnvironment() const { // Forwards to the energy endpointer.
+ return energy_endpointer_.estimating_environment();
+ }
+
+ void set_speech_input_complete_silence_length(int64_t time_us) {
+ speech_input_complete_silence_length_us_ = time_us;
+ }
+
+ void set_long_speech_input_complete_silence_length(int64_t time_us) {
+ long_speech_input_complete_silence_length_us_ = time_us;
+ }
+
+ void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
+ speech_input_possibly_complete_silence_length_us_ = time_us;
+ }
+
+ void set_long_speech_length(int64_t time_us) {
+ long_speech_length_us_ = time_us;
+ }
+
+ bool speech_input_complete() const {
+ return speech_input_complete_;
+ }
+
+ // RMS background noise level in dB.
+ float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
+
+ private:
+ // Reset internal states. Helper method common to initial input utterance
+ // and following input utterances.
+ void Reset();
+
+ // Minimum allowable length of speech input.
+ int64_t speech_input_minimum_length_us_;
+
+ // The speechInputPossiblyComplete event signals that silence/noise has been
+ // detected for a *short* amount of time after some speech has been detected.
+ // This property specifies the time period.
+ int64_t speech_input_possibly_complete_silence_length_us_;
+
+ // The speechInputComplete event signals that silence/noise has been
+ // detected for a *long* amount of time after some speech has been detected.
+ // This property specifies the time period.
+ int64_t speech_input_complete_silence_length_us_;
+
+ // Same as above, this specifies the required silence period after speech
+ // detection. This period is used instead of
+ // speech_input_complete_silence_length_us_ when the utterance is longer than
+ // long_speech_length_us_. This parameter is optional (a value <= 0 disables it).
+ int64_t long_speech_input_complete_silence_length_us_;
+
+ // The period of time after which the endpointer should consider
+ // long_speech_input_complete_silence_length_us_ as a valid silence period
+ // instead of speech_input_complete_silence_length_us_. This parameter is
+ // optional (a value <= 0 disables the stepped timeout).
+ int64_t long_speech_length_us_;
+
+ // First speech onset time, used in determination of speech complete timeout.
+ int64_t speech_start_time_us_;
+
+ // Most recent end time, used in determination of speech complete timeout.
+ int64_t speech_end_time_us_;
+
+ int64_t audio_frame_time_us_; // NOTE(review): undocumented; presumably the running audio timestamp - confirm in endpointer.cc.
+ EpStatus old_ep_status_; // Status of the previous frame, used to spot state transitions.
+ bool waiting_for_speech_possibly_complete_timeout_; // Armed when speech ends; cleared on timeout or new speech.
+ bool waiting_for_speech_complete_timeout_; // Armed when speech ends; cleared on timeout or new speech.
+ bool speech_previously_detected_; // Latched on the first confirmed speech onset.
+ bool speech_input_complete_; // Set once the speech-complete silence timeout elapses.
+ EnergyEndpointer energy_endpointer_;
+ int sample_rate_;
+ int32_t frame_size_;
+};
+
+} // namespace mozilla
+
+#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
diff --git a/dom/media/webspeech/recognition/energy_endpointer.cc b/dom/media/webspeech/recognition/energy_endpointer.cc
new file mode 100644
index 0000000000..b1c1ee0bcf
--- /dev/null
+++ b/dom/media/webspeech/recognition/energy_endpointer.cc
@@ -0,0 +1,393 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "energy_endpointer.h"
+
+#include <math.h>
+
+namespace {
+
+// Returns the mean-removed RMS of the input signal, i.e. its standard
+// deviation: sqrt(mean(x^2) - mean(x)^2).
+//
+// samples:     pointer to at least |num_samples| signed 16-bit samples.
+// num_samples: number of samples to read. When it is <= 0 the function
+//              returns 0 instead of dividing by zero (which yields NaN and
+//              would contaminate every level estimate derived from it).
+float RMS(const int16_t* samples, int num_samples) {
+  if (num_samples <= 0)
+    return 0.0f;
+
+  int64_t ssq_int64_t = 0;
+  int64_t sum_int64_t = 0;
+  for (int i = 0; i < num_samples; ++i) {
+    const int64_t sample = samples[i];
+    sum_int64_t += sample;
+    ssq_int64_t += sample * sample;
+  }
+  // now convert to floats.
+  const double mean = static_cast<double>(sum_int64_t) / num_samples;
+  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;
+  // Floating-point rounding may leave a tiny negative difference; clamp it so
+  // that sqrt() never receives a negative argument.
+  double variance = mean_square - (mean * mean);
+  if (variance < 0.0)
+    variance = 0.0;
+  return static_cast<float>(sqrt(variance));
+}
+
+// Converts a duration in seconds to microseconds. The value is scaled by 1e6
+// in double precision, 0.5 is added and the result is truncated, which rounds
+// non-negative inputs to the nearest microsecond.
+int64_t Secs2Usecs(float seconds) {
+  const double scaled_us = 1.0e6 * seconds;
+  return static_cast<int64_t>(0.5 + scaled_us);
+}
+
+// Converts a linear amplitude to decibels (20 * log10(value)). Any input that
+// is not greater than the tiny positive cutoff - zero, negative values and
+// NaN - maps to the floor value -2000.
+float GetDecibel(float value) {
+  const bool above_cutoff = value > 1.0e-100;
+  if (!above_cutoff)
+    return -2000.0;
+  return 20 * log10(value);
+}
+
+} // namespace
+
+namespace mozilla {
+
+// Stores threshold-crossing histories for making decisions about the speech
+// state. The history is a fixed-size ring of (time, decision) pairs; once the
+// ring is full each insertion overwrites the oldest entry.
+class EnergyEndpointer::HistoryRing {
+ public:
+  HistoryRing() : insertion_index_(0) {}
+
+  // Copying is not supported. The operations are explicitly deleted so that
+  // any attempted copy is rejected at compile time rather than at link time.
+  HistoryRing(const HistoryRing&) = delete;
+  void operator=(const HistoryRing&) = delete;
+
+  // Resets the ring to |size| elements each with state |initial_state|
+  void SetRing(int size, bool initial_state);
+
+  // Inserts a new entry into the ring and drops the oldest entry.
+  void Insert(int64_t time_us, bool decision);
+
+  // Returns the time in microseconds of the most recently added entry.
+  int64_t EndTime() const;
+
+  // Returns the sum of all intervals during which 'decision' is true within
+  // the time in seconds specified by 'duration'. The returned interval is
+  // in seconds.
+  float RingSum(float duration_sec);
+
+ private:
+  // One history sample: the frame time and whether that frame was judged to
+  // be above the decision threshold.
+  struct DecisionPoint {
+    int64_t time_us;
+    bool decision;
+  };
+
+  std::vector<DecisionPoint> decision_points_;
+  int insertion_index_;  // Index at which the next item gets added/inserted.
+};
+
+// Re-initialises the ring. Afterwards it holds |size| entries, each with the
+// placeholder time -1 and decision |initial_state|, and the next insertion
+// goes to slot 0.
+void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
+  const DecisionPoint blank = { -1, initial_state };
+  decision_points_.assign(size, blank);
+  insertion_index_ = 0;
+}
+
+// Stores (|time_us|, |decision|) in the slot at the insertion index, replacing
+// the oldest entry, and advances the index with wrap-around.
+//
+// Does nothing when the ring has no slots (SetRing() has not been called yet,
+// or was called with size 0). RingSum() already tolerates an empty ring; here
+// the unguarded code would index out of bounds and compute a modulo by zero.
+void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
+  if (decision_points_.empty())
+    return;
+
+  DecisionPoint& slot = decision_points_[insertion_index_];
+  slot.time_us = time_us;
+  slot.decision = decision;
+  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
+}
+
+// Returns the time in microseconds stored in the slot just before the
+// insertion index, i.e. the most recently inserted entry.
+//
+// Returns -1 when the ring has no slots. That is the same placeholder time
+// SetRing() writes into never-used entries, so callers see "no entry yet"
+// instead of an out-of-bounds read.
+int64_t EnergyEndpointer::HistoryRing::EndTime() const {
+  if (decision_points_.empty())
+    return -1;
+
+  int ind = insertion_index_ - 1;
+  if (ind < 0)
+    ind = decision_points_.size() - 1;  // Wrap to the last slot.
+  return decision_points_[ind].time_us;
+}
+
+float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { // Seconds of "on" time found walking back from the newest entry.
+ if (decision_points_.empty())
+ return 0.0;
+
+ int64_t sum_us = 0;
+ int ind = insertion_index_ - 1; // Newest entry sits just before the insertion index.
+ if (ind < 0)
+ ind = decision_points_.size() - 1;
+ int64_t end_us = decision_points_[ind].time_us;
+ bool is_on = decision_points_[ind].decision;
+ int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); // Same seconds-to-usecs rounding as Secs2Usecs().
+ if (start_us < 0)
+ start_us = 0;
+ size_t n_summed = 1; // n points ==> (n-1) intervals
+ while ((decision_points_[ind].time_us > start_us) && // Stop once the window start is reached...
+ (n_summed < decision_points_.size())) { // ...or every slot has been visited once.
+ --ind; // Step to the next older entry, wrapping around the ring.
+ if (ind < 0)
+ ind = decision_points_.size() - 1;
+ if (is_on) // The interval is credited according to the decision of its newer end.
+ sum_us += end_us - decision_points_[ind].time_us; // Whole interval is added; it is not clipped at start_us.
+ is_on = decision_points_[ind].decision;
+ end_us = decision_points_[ind].time_us;
+ n_summed++;
+ }
+
+ return 1.0e-6f * sum_us; // Returns total time that was super threshold.
+}
+
+EnergyEndpointer::EnergyEndpointer() // Zero-initialises state; Init() must still be called before use.
+ : status_(EP_PRE_SPEECH),
+ offset_confirm_dur_sec_(0),
+ endpointer_time_us_(0),
+ fast_update_frames_(0),
+ frame_counter_(0),
+ max_window_dur_(4.0), // Placeholder; Init() recomputes it from the params.
+ sample_rate_(0),
+ history_(new HistoryRing()), // Ring has no slots until Restart() calls SetRing().
+ decision_threshold_(0),
+ estimating_environment_(false),
+ noise_level_(0),
+ rms_adapt_(0),
+ start_lag_(0),
+ end_lag_(0),
+ user_input_start_time_us_(0) {
+}
+
+EnergyEndpointer::~EnergyEndpointer() { // Defined here, where HistoryRing is a complete type, so history_ can be destroyed.
+}
+
+// Converts a duration in seconds into a frame count: the duration is divided
+// by the configured frame period, 0.5 is added and the result is truncated.
+int EnergyEndpointer::TimeToFrame(float time) const {
+  const auto frames = time / params_.frame_period();
+  return static_cast<int32_t>(0.5 + frames);
+}
+
+// Puts the endpointer back into its pre-speech state and clears the decision
+// history. When |reset_threshold| is true the adaptive levels (decision
+// threshold, RMS adaptation value, noise level) and the frame counter are
+// re-seeded from the configured decision threshold as well.
+void EnergyEndpointer::Restart(bool reset_threshold) {
+  // Current input is not used for environment estimation unless the caller
+  // asks for it afterwards (the user has not yet started input by e.g.
+  // pressing the push-to-talk button). False by default for backward
+  // compatibility.
+  estimating_environment_ = false;
+  user_input_start_time_us_ = 0;
+  status_ = EP_PRE_SPEECH;
+
+  if (reset_threshold) {
+    frame_counter_ = 0;  // Used for rapid initial update of levels.
+    noise_level_ = params_.decision_threshold() / 2.0f;
+    decision_threshold_ = params_.decision_threshold();
+    rms_adapt_ = decision_threshold_;
+  }
+
+  // Size the history ring for the longest window and mark every slot "off".
+  history_->SetRing(TimeToFrame(max_window_dur_), false);
+}
+
+void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { // Copies |params| and derives all dependent state from them.
+ params_ = params;
+
+ // Find the longest history interval to be used, and make the ring
+ // large enough to accommodate that number of frames. NOTE: This
+ // depends upon ep_frame_period being set correctly in the factory
+ // that did this instantiation.
+ max_window_dur_ = params_.onset_window();
+ if (params_.speech_on_window() > max_window_dur_)
+ max_window_dur_ = params_.speech_on_window();
+ if (params_.offset_window() > max_window_dur_)
+ max_window_dur_ = params_.offset_window();
+ Restart(true); // Must follow the max_window_dur_ computation: it sizes the ring from it.
+
+ offset_confirm_dur_sec_ = params_.offset_window() -
+ params_.offset_confirm_dur();
+ if (offset_confirm_dur_sec_ < 0.0)
+ offset_confirm_dur_sec_ = 0.0;
+
+ user_input_start_time_us_ = 0; // Already zeroed by Restart(true) above.
+
+ // Flag that indicates that current input should be used for
+ // estimating the environment. The user has not yet started input
+ // by e.g. pressed the push-to-talk button. By default, this is
+ // false for backward compatibility.
+ estimating_environment_ = false; // Already cleared by Restart(true) above.
+ // The initial value of the noise and speech levels is inconsequential.
+ // The level of the first frame will overwrite these values.
+ noise_level_ = params_.decision_threshold() / 2.0f;
+ fast_update_frames_ =
+ static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); // Truncated, unlike TimeToFrame() which rounds.
+
+ frame_counter_ = 0; // Used for rapid initial update of levels.
+
+ sample_rate_ = params_.sample_rate();
+ start_lag_ = static_cast<int>(sample_rate_ / // Samples per period at the highest fundamental.
+ params_.max_fundamental_frequency());
+ end_lag_ = static_cast<int>(sample_rate_ / // Samples per period at the lowest fundamental.
+ params_.min_fundamental_frequency());
+}
+
+void EnergyEndpointer::StartSession() { // Begins a session with a full reset.
+ Restart(true); // true: also re-seed thresholds, noise level and frame counter.
+}
+
+void EnergyEndpointer::EndSession() { // Ends the session; no other state is touched.
+ status_ = EP_POST_SPEECH; // ProcessAudioFrame() has no transition out of this state.
+}
+
+void EnergyEndpointer::SetEnvironmentEstimationMode() { // Fully resets, then treats incoming audio as background noise.
+ Restart(true);
+ estimating_environment_ = true; // Must come after Restart(), which clears this flag.
+}
+
+void EnergyEndpointer::SetUserInputMode() { // Switches from environment estimation to speech detection.
+ estimating_environment_ = false;
+ user_input_start_time_us_ = endpointer_time_us_; // Starts the contamination-rejection period at the latest frame time.
+}
+
+void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, // Timestamp of this frame in microseconds.
+ const int16_t* samples,
+ int num_samples,
+ float* rms_out) { // Optional; when non-null receives this frame's level in dB.
+ endpointer_time_us_ = time_us;
+ float rms = RMS(samples, num_samples);
+
+ // Check that this is user input audio vs. pre-input adaptation audio.
+ // Input audio starts when the user indicates start of input, by e.g.
+ // pressing push-to-talk. Audio received prior to that is used to update
+ // noise and speech level estimates.
+ if (!estimating_environment_) {
+ bool decision = false;
+ if ((endpointer_time_us_ - user_input_start_time_us_) <
+ Secs2Usecs(params_.contamination_rejection_period())) {
+ decision = false; // Still inside the rejection period: force "no speech".
+ //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
+ } else {
+ decision = (rms > decision_threshold_);
+ }
+
+ history_->Insert(endpointer_time_us_, decision);
+
+ switch (status_) {
+ case EP_PRE_SPEECH:
+ if (history_->RingSum(params_.onset_window()) >
+ params_.onset_detect_dur()) {
+ status_ = EP_POSSIBLE_ONSET;
+ }
+ break;
+
+ case EP_POSSIBLE_ONSET: {
+ float tsum = history_->RingSum(params_.onset_window());
+ if (tsum > params_.onset_confirm_dur()) {
+ status_ = EP_SPEECH_PRESENT;
+ } else { // If signal is not maintained, drop back to pre-speech.
+ if (tsum <= params_.onset_detect_dur())
+ status_ = EP_PRE_SPEECH;
+ }
+ break;
+ }
+
+ case EP_SPEECH_PRESENT: {
+ // To induce hysteresis in the state residency, we allow a
+ // smaller residency time in the on_ring, than was required to
+ // enter the SPEECH_PRESENT state.
+ float on_time = history_->RingSum(params_.speech_on_window());
+ if (on_time < params_.on_maintain_dur())
+ status_ = EP_POSSIBLE_OFFSET;
+ break;
+ }
+
+ case EP_POSSIBLE_OFFSET:
+ if (history_->RingSum(params_.offset_window()) <=
+ offset_confirm_dur_sec_) {
+ // Note that this offset time may be beyond the end
+ // of the input buffer in a real-time system. It will be up
+ // to the RecognizerSession to decide what to do.
+ status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
+ } else { // If speech picks up again we allow return to SPEECH_PRESENT.
+ if (history_->RingSum(params_.speech_on_window()) >=
+ params_.on_maintain_dur())
+ status_ = EP_SPEECH_PRESENT;
+ }
+ break;
+
+ default: // EP_POST_SPEECH: no transitions.
+ break;
+ }
+
+ // If this is a quiet, non-speech region, slowly adapt the detection
+ // threshold to be about 6dB above the average RMS.
+ if ((!decision) && (status_ == EP_PRE_SPEECH)) {
+ decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
+ rms_adapt_ = decision_threshold_;
+ } else {
+ // If this is in a speech region, adapt the decision threshold to
+ // be about 10dB below the average RMS. If the noise level is high,
+ // the threshold is pushed up.
+ // Adaptation up to a higher level is 5 times faster than decay to
+ // a lower level.
+ if ((status_ == EP_SPEECH_PRESENT) && decision) {
+ if (rms_adapt_ > rms) {
+ rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
+ } else {
+ rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
+ }
+ float target_threshold = 0.3f * rms_adapt_ + noise_level_;
+ decision_threshold_ = (.90f * decision_threshold_) +
+ (0.10f * target_threshold);
+ }
+ }
+
+ // Set a floor
+ if (decision_threshold_ < params_.min_decision_threshold())
+ decision_threshold_ = params_.min_decision_threshold();
+ }
+
+ // Update speech and noise levels.
+ UpdateLevels(rms); // Runs in both modes, including environment estimation.
+ ++frame_counter_;
+
+ if (rms_out)
+ *rms_out = GetDecibel(rms);
+}
+
+float EnergyEndpointer::GetNoiseLevelDb() const { // Current noise estimate converted to dB.
+ return GetDecibel(noise_level_); // GetDecibel() returns -2000 for a non-positive level.
+}
+
+void EnergyEndpointer::UpdateLevels(float rms) { // Folds this frame's RMS into the noise estimate (and threshold, when applicable).
+ // Update quickly initially. We assume this is noise and that
+ // speech is 6dB above the noise.
+ if (frame_counter_ < fast_update_frames_) {
+ // Alpha increases from 0 to (k-1)/k where k is the number of time
+ // steps in the initial adaptation period.
+ float alpha = static_cast<float>(frame_counter_) /
+ static_cast<float>(fast_update_frames_);
+ noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
+ //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
+ } else {
+ // Update Noise level. The noise level adapts quickly downward, but
+ // slowly upward. Besides UI feedback (GetNoiseLevelDb), noise_level_
+ // also feeds decision_threshold_ below and in ProcessAudioFrame.
+ if (noise_level_ < rms)
+ noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
+ else
+ noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
+ }
+ if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
+ decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
+ // Set a floor
+ if (decision_threshold_ < params_.min_decision_threshold())
+ decision_threshold_ = params_.min_decision_threshold();
+ }
+}
+
+EpStatus EnergyEndpointer::Status(int64_t* status_time) const { // |status_time| is written unconditionally, so it must not be null.
+ *status_time = history_->EndTime(); // Time of the newest history entry, in microseconds.
+ return status_;
+}
+
+} // namespace mozilla
diff --git a/dom/media/webspeech/recognition/energy_endpointer.h b/dom/media/webspeech/recognition/energy_endpointer.h
new file mode 100644
index 0000000000..12d3c736e3
--- /dev/null
+++ b/dom/media/webspeech/recognition/energy_endpointer.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The EnergyEndpointer class finds likely speech onset and offset points.
+//
+// The implementation described here is about the simplest possible.
+// It is based on timings of threshold crossings for overall signal
+// RMS. It is suitable for light weight applications.
+//
+// As written, the basic idea is that one specifies intervals that
+// must be occupied by super- and sub-threshold energy levels, and
+// defers decisions re onset and offset times until these
+// specifications have been met. Three basic intervals are tested: an
+// onset window, a speech-on window, and an offset window. We require
+// super-threshold to exceed some minimum total durations in the onset
+// and speech-on windows before declaring the speech onset time, and
+// we specify a required sub-threshold residency in the offset window
+// before declaring speech offset. As the various residency requirements are
+// met, the EnergyEndpointer instance assumes various states, and can return the
+// ID of these states to the client (see EpStatus below).
+//
+// The levels of the speech and background noise are continuously updated. It is
+// important that the background noise level be estimated initially for
+// robustness in noisy conditions. The first frames are assumed to be background
+// noise and a fast update rate is used for the noise level. The duration for
+// fast update is controlled by the fast_update_dur_ parameter.
+//
+// If used in noisy conditions, the endpointer should be started and run in the
+// EnvironmentEstimation mode, for at least 200ms, before switching to
+// UserInputMode.
+// Audio feedback contamination can appear in the input audio, if not cut
+// out or handled by echo cancellation. Audio feedback can trigger a false
+// accept. The false accepts can be ignored by setting
+// ep_contamination_rejection_period.
+
+#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
+#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
+
+#include <vector>
+
+#include "mozilla/UniquePtr.h"
+
+#include "energy_endpointer_params.h"
+
+namespace mozilla {
+
+// Endpointer status codes
+enum EpStatus {
+ EP_PRE_SPEECH = 10, // No speech detected; also re-entered after a confirmed offset.
+ EP_POSSIBLE_ONSET, // Onset detected, waiting for confirmation.
+ EP_SPEECH_PRESENT, // Onset confirmed; speech is in progress.
+ EP_POSSIBLE_OFFSET, // Speech-on time dropped below the maintain duration.
+ EP_POST_SPEECH, // Set by EndSession().
+};
+
+// Energy-based speech endpointer. Frames are fed through ProcessAudioFrame();
+// the current state (see EpStatus) is read back with Status().
+class EnergyEndpointer {
+ public:
+  // The default construction MUST be followed by Init(), before any
+  // other use can be made of the instance.
+  EnergyEndpointer();
+  virtual ~EnergyEndpointer();
+
+  // Copies |params| and derives all dependent internal state from them.
+  void Init(const EnergyEndpointerParams& params);
+
+  // Start the endpointer. This should be called at the beginning of a session.
+  void StartSession();
+
+  // Stop the endpointer.
+  void EndSession();
+
+  // Start environment estimation. Audio will be used for environment estimation
+  // i.e. noise level estimation.
+  void SetEnvironmentEstimationMode();
+
+  // Start user input. This should be called when the user indicates start of
+  // input, e.g. by pressing a button.
+  void SetUserInputMode();
+
+  // Computes the next input frame and modifies EnergyEndpointer status as
+  // appropriate based on the computation. |rms_out| may be null; when it is
+  // not, it receives the level of this frame in dB.
+  void ProcessAudioFrame(int64_t time_us,
+                         const int16_t* samples, int num_samples,
+                         float* rms_out);
+
+  // Returns the current state of the EnergyEndpointer and the time
+  // corresponding to the most recently computed frame. |status_time_us| must
+  // not be null.
+  EpStatus Status(int64_t* status_time_us) const;
+
+  bool estimating_environment() const {
+    return estimating_environment_;
+  }
+
+  // Returns estimated noise level in dB.
+  float GetNoiseLevelDb() const;
+
+ private:
+  class HistoryRing;
+
+  // Resets the endpointer internal state: status, user-input start time, the
+  // decision history and the environment-estimation flag. If reset_threshold
+  // is true, the adaptive thresholds, the noise level and the frame counter
+  // are reset as well.
+  void Restart(bool reset_threshold);
+
+  // Update internal speech and noise levels.
+  void UpdateLevels(float rms);
+
+  // Returns the number of frames (or frame number) corresponding to
+  // the 'time' (in seconds).
+  int TimeToFrame(float time) const;
+
+  EpStatus status_;  // The current state of this instance.
+  // Max on time within the offset window that still confirms the offset.
+  float offset_confirm_dur_sec_;
+  int64_t endpointer_time_us_;  // Time of the most recently received audio frame.
+  int64_t fast_update_frames_;  // Number of frames for initial level adaptation.
+  int64_t frame_counter_;  // Number of frames seen. Used for initial adaptation.
+  float max_window_dur_;  // Largest search window size (seconds)
+  float sample_rate_;  // Sampling rate.
+
+  // Ring buffers to hold the speech activity history.
+  UniquePtr<HistoryRing> history_;
+
+  // Configuration parameters.
+  EnergyEndpointerParams params_;
+
+  // RMS which must be exceeded to conclude frame is speech.
+  float decision_threshold_;
+
+  // Flag to indicate that audio should be used to estimate environment, prior
+  // to receiving user input.
+  bool estimating_environment_;
+
+  // Estimate of the background noise level. Used externally for UI feedback.
+  float noise_level_;
+
+  // An adaptive threshold used to update decision_threshold_ when appropriate.
+  float rms_adapt_;
+
+  // Start lag corresponds to the highest fundamental frequency.
+  int start_lag_;
+
+  // End lag corresponds to the lowest fundamental frequency.
+  int end_lag_;
+
+  // Time when mode switched from environment estimation to user input. This
+  // is used to time forced rejection of audio feedback contamination.
+  int64_t user_input_start_time_us_;
+
+  // Copying is not supported. The operations are explicitly deleted so that
+  // any attempted copy, including one made from inside the class, is rejected
+  // at compile time rather than at link time.
+  EnergyEndpointer(const EnergyEndpointer&) = delete;
+  void operator=(const EnergyEndpointer&) = delete;
+};
+
+} // namespace mozilla
+
+#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.cc b/dom/media/webspeech/recognition/energy_endpointer_params.cc
new file mode 100644
index 0000000000..cac4f1b238
--- /dev/null
+++ b/dom/media/webspeech/recognition/energy_endpointer_params.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "energy_endpointer_params.h"
+
+namespace mozilla {
+
+EnergyEndpointerParams::EnergyEndpointerParams() { // A new instance starts with the default parameter set.
+ SetDefaults();
+}
+
+void EnergyEndpointerParams::SetDefaults() { // Overwrites every parameter with its built-in default.
+ frame_period_ = 0.01f; // Seconds per frame.
+ frame_duration_ = 0.01f;
+ endpoint_margin_ = 0.2f;
+ onset_window_ = 0.15f; // Seconds.
+ speech_on_window_ = 0.4f; // Seconds.
+ offset_window_ = 0.15f; // Seconds.
+ onset_detect_dur_ = 0.09f; // Seconds of "on" time within the onset window.
+ onset_confirm_dur_ = 0.075f; // Seconds of "on" time within the onset window.
+ on_maintain_dur_ = 0.10f; // Seconds of "on" time within the speech-on window.
+ offset_confirm_dur_ = 0.12f; // Seconds; subtracted from offset_window_ by the endpointer.
+ decision_threshold_ = 150.0f; // Initial RMS level a frame must exceed.
+ min_decision_threshold_ = 50.0f; // Floor for the adaptive threshold.
+ fast_update_dur_ = 0.2f; // Seconds of initial fast level adaptation.
+ sample_rate_ = 8000.0f; // NOTE(review): presumably Hz - confirm.
+ min_fundamental_frequency_ = 57.143f;
+ max_fundamental_frequency_ = 400.0f;
+ contamination_rejection_period_ = 0.25f; // Seconds.
+}
+
+void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { // Copies every parameter; returns void, so assignments cannot be chained.
+ frame_period_ = source.frame_period();
+ frame_duration_ = source.frame_duration();
+ endpoint_margin_ = source.endpoint_margin();
+ onset_window_ = source.onset_window();
+ speech_on_window_ = source.speech_on_window();
+ offset_window_ = source.offset_window();
+ onset_detect_dur_ = source.onset_detect_dur();
+ onset_confirm_dur_ = source.onset_confirm_dur();
+ on_maintain_dur_ = source.on_maintain_dur();
+ offset_confirm_dur_ = source.offset_confirm_dur();
+ decision_threshold_ = source.decision_threshold();
+ min_decision_threshold_ = source.min_decision_threshold();
+ fast_update_dur_ = source.fast_update_dur();
+ sample_rate_ = source.sample_rate();
+ min_fundamental_frequency_ = source.min_fundamental_frequency();
+ max_fundamental_frequency_ = source.max_fundamental_frequency();
+ contamination_rejection_period_ = source.contamination_rejection_period();
+}
+
+} // namespace mozilla
diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.h b/dom/media/webspeech/recognition/energy_endpointer_params.h
new file mode 100644
index 0000000000..6437c6dc0f
--- /dev/null
+++ b/dom/media/webspeech/recognition/energy_endpointer_params.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_
+#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_
+
+namespace mozilla {
+
+// Input parameters for the EnergyEndpointer class.
+//
+// Plain value holder: each field has a trivial inline getter and setter.
+// NOTE(review): judging by the default values, windows/durations look like
+// seconds and rates/frequencies like Hz -- confirm before relying on units.
+class EnergyEndpointerParams {
+ public:
+ EnergyEndpointerParams();
+
+ // Resets every field to its built-in default value.
+ void SetDefaults();
+
+ // Copies every field from |source|; returns nothing, so assignments cannot
+ // be chained.
+ void operator=(const EnergyEndpointerParams& source);
+
+ // Accessors and mutators
+ float frame_period() const { return frame_period_; }
+ void set_frame_period(float frame_period) {
+ frame_period_ = frame_period;
+ }
+
+ float frame_duration() const { return frame_duration_; }
+ void set_frame_duration(float frame_duration) {
+ frame_duration_ = frame_duration;
+ }
+
+ float endpoint_margin() const { return endpoint_margin_; }
+ void set_endpoint_margin(float endpoint_margin) {
+ endpoint_margin_ = endpoint_margin;
+ }
+
+ float onset_window() const { return onset_window_; }
+ void set_onset_window(float onset_window) { onset_window_ = onset_window; }
+
+ float speech_on_window() const { return speech_on_window_; }
+ void set_speech_on_window(float speech_on_window) {
+ speech_on_window_ = speech_on_window;
+ }
+
+ float offset_window() const { return offset_window_; }
+ void set_offset_window(float offset_window) {
+ offset_window_ = offset_window;
+ }
+
+ float onset_detect_dur() const { return onset_detect_dur_; }
+ void set_onset_detect_dur(float onset_detect_dur) {
+ onset_detect_dur_ = onset_detect_dur;
+ }
+
+ float onset_confirm_dur() const { return onset_confirm_dur_; }
+ void set_onset_confirm_dur(float onset_confirm_dur) {
+ onset_confirm_dur_ = onset_confirm_dur;
+ }
+
+ float on_maintain_dur() const { return on_maintain_dur_; }
+ void set_on_maintain_dur(float on_maintain_dur) {
+ on_maintain_dur_ = on_maintain_dur;
+ }
+
+ float offset_confirm_dur() const { return offset_confirm_dur_; }
+ void set_offset_confirm_dur(float offset_confirm_dur) {
+ offset_confirm_dur_ = offset_confirm_dur;
+ }
+
+ float decision_threshold() const { return decision_threshold_; }
+ void set_decision_threshold(float decision_threshold) {
+ decision_threshold_ = decision_threshold;
+ }
+
+ float min_decision_threshold() const { return min_decision_threshold_; }
+ void set_min_decision_threshold(float min_decision_threshold) {
+ min_decision_threshold_ = min_decision_threshold;
+ }
+
+ float fast_update_dur() const { return fast_update_dur_; }
+ void set_fast_update_dur(float fast_update_dur) {
+ fast_update_dur_ = fast_update_dur;
+ }
+
+ float sample_rate() const { return sample_rate_; }
+ void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; }
+
+ float min_fundamental_frequency() const { return min_fundamental_frequency_; }
+ void set_min_fundamental_frequency(float min_fundamental_frequency) {
+ min_fundamental_frequency_ = min_fundamental_frequency;
+ }
+
+ float max_fundamental_frequency() const { return max_fundamental_frequency_; }
+ void set_max_fundamental_frequency(float max_fundamental_frequency) {
+ max_fundamental_frequency_ = max_fundamental_frequency;
+ }
+
+ float contamination_rejection_period() const {
+ return contamination_rejection_period_;
+ }
+ void set_contamination_rejection_period(
+ float contamination_rejection_period) {
+ contamination_rejection_period_ = contamination_rejection_period;
+ }
+
+ private:
+ float frame_period_; // Frame period
+ float frame_duration_; // Window size
+ float onset_window_; // Interval scanned for onset activity
+ float speech_on_window_; // Interval scanned for ongoing speech
+ float offset_window_; // Interval scanned for offset evidence
+ float offset_confirm_dur_; // Silence duration required to confirm offset
+ float decision_threshold_; // Initial rms detection threshold
+ float min_decision_threshold_; // Minimum rms detection threshold
+ float fast_update_dur_; // Period for initial estimation of levels.
+ float sample_rate_; // Expected sample rate.
+
+ // Time to add on either side of endpoint threshold crossings
+ float endpoint_margin_;
+ // Total dur within onset_window required to enter ONSET state
+ float onset_detect_dur_;
+ // Total on time within onset_window required to enter SPEECH_ON state
+ float onset_confirm_dur_;
+ // Minimum dur in SPEECH_ON state required to maintain ON state
+ float on_maintain_dur_;
+ // Minimum fundamental frequency for autocorrelation.
+ float min_fundamental_frequency_;
+ // Maximum fundamental frequency for autocorrelation.
+ float max_fundamental_frequency_;
+ // Period after start of user input that above threshold values are ignored.
+ // This is to reject audio feedback contamination.
+ float contamination_rejection_period_;
+};
+
+} // namespace mozilla
+
+#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_
diff --git a/dom/media/webspeech/recognition/moz.build b/dom/media/webspeech/recognition/moz.build
new file mode 100644
index 0000000000..5fdf8fdd47
--- /dev/null
+++ b/dom/media/webspeech/recognition/moz.build
@@ -0,0 +1,64 @@
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Mochitests for the speech recognition implementation.
+MOCHITEST_MANIFESTS += ["test/mochitest.ini"]
+
+# XPCOM interface that recognition service backends implement.
+XPIDL_MODULE = "dom_webspeechrecognition"
+
+XPIDL_SOURCES = ["nsISpeechRecognitionService.idl"]
+
+# DOM-facing headers, exported under mozilla/dom/.
+EXPORTS.mozilla.dom += [
+ "OnlineSpeechRecognitionService.h",
+ "SpeechGrammar.h",
+ "SpeechGrammarList.h",
+ "SpeechRecognition.h",
+ "SpeechRecognitionAlternative.h",
+ "SpeechRecognitionResult.h",
+ "SpeechRecognitionResultList.h",
+ "SpeechTrackListener.h",
+]
+
+# Endpointer headers, exported without a namespace directory.
+EXPORTS += [
+ "endpointer.h",
+ "energy_endpointer.h",
+ "energy_endpointer_params.h",
+]
+
+# The fake recognition service header is only exported when the test backend
+# is enabled.
+if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]:
+ EXPORTS.mozilla.dom += [
+ "test/FakeSpeechRecognitionService.h",
+ ]
+
+UNIFIED_SOURCES += [
+ "endpointer.cc",
+ "energy_endpointer.cc",
+ "energy_endpointer_params.cc",
+ "OnlineSpeechRecognitionService.cpp",
+ "SpeechGrammar.cpp",
+ "SpeechGrammarList.cpp",
+ "SpeechRecognition.cpp",
+ "SpeechRecognitionAlternative.cpp",
+ "SpeechRecognitionResult.cpp",
+ "SpeechRecognitionResultList.cpp",
+ "SpeechTrackListener.cpp",
+]
+
+# Likewise, the fake recognition service is only compiled for the test
+# backend.
+if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]:
+ UNIFIED_SOURCES += [
+ "test/FakeSpeechRecognitionService.cpp",
+ ]
+
+# jsoncpp is linked in and its headers are made reachable below.
+USE_LIBS += [
+ "jsoncpp",
+]
+
+LOCAL_INCLUDES += [
+ "/dom/base",
+ "/toolkit/components/jsoncpp/include",
+]
+
+include("/ipc/chromium/chromium-config.mozbuild")
+
+FINAL_LIBRARY = "xul"
diff --git a/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl
new file mode 100644
index 0000000000..a43d277da0
--- /dev/null
+++ b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl
@@ -0,0 +1,43 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsISupports.idl"
+
+%{C++
+#include "mozilla/WeakPtr.h"
+
+namespace mozilla {
+class AudioSegment;
+namespace dom {
+class SpeechRecognition;
+class SpeechRecognitionResultList;
+class SpeechGrammarList;
+class SpeechGrammar;
+}
+}
+%}
+
+native SpeechRecognitionWeakPtr(mozilla::WeakPtr<mozilla::dom::SpeechRecognition>);
+[ptr] native AudioSegmentPtr(mozilla::AudioSegment);
+[ptr] native SpeechGrammarPtr(mozilla::dom::SpeechGrammar);
+[ptr] native SpeechGrammarListPtr(mozilla::dom::SpeechGrammarList);
+
+// Callback reporting the outcome of compiling |grammarObject|.
+// NOTE(review): presumably invoked by a recognition service in response to
+// validateAndSetGrammarList() -- confirm against the implementations.
+[uuid(6fcb6ee8-a6db-49ba-9f06-355d7ee18ea7)]
+interface nsISpeechGrammarCompilationCallback : nsISupports {
+ void grammarCompilationEnd(in SpeechGrammarPtr grammarObject, in boolean success);
+};
+
+// Interface implemented by speech recognition backends.
+//
+// initialize() hands the service a weak pointer to the SpeechRecognition
+// object it serves; processAudioSegment() receives audio together with its
+// sample rate; soundEnd() and abort() take no arguments.
+// NOTE(review): the fake test service asserts that processAudioSegment() runs
+// off the main thread and initialize()/soundEnd()/abort() on it -- confirm
+// whether that is a requirement for every implementation.
+[uuid(8e97f287-f322-44e8-8888-8344fa408ef8)]
+interface nsISpeechRecognitionService : nsISupports {
+ void initialize(in SpeechRecognitionWeakPtr aSpeechRecognition);
+ void processAudioSegment(in AudioSegmentPtr aAudioSegment, in long aSampleRate);
+ void validateAndSetGrammarList(in SpeechGrammarPtr aSpeechGrammar, in nsISpeechGrammarCompilationCallback aCallback);
+ void soundEnd();
+ void abort();
+};
+
+%{C++
+#define NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "@mozilla.org/webspeech/service;1?name="
+%}
diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
new file mode 100644
index 0000000000..cf14cb3750
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
@@ -0,0 +1,118 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsThreadUtils.h"
+
+#include "FakeSpeechRecognitionService.h"
+
+#include "SpeechRecognition.h"
+#include "SpeechRecognitionAlternative.h"
+#include "SpeechRecognitionResult.h"
+#include "SpeechRecognitionResultList.h"
+#include "nsIObserverService.h"
+#include "mozilla/Services.h"
+#include "mozilla/StaticPrefs_media.h"
+
+namespace mozilla {
+
+using namespace dom;
+
+NS_IMPL_ISUPPORTS(FakeSpeechRecognitionService, nsISpeechRecognitionService,
+ nsIObserver)
+
+FakeSpeechRecognitionService::FakeSpeechRecognitionService() = default;
+
+FakeSpeechRecognitionService::~FakeSpeechRecognitionService() = default;
+
+// Remembers the recognition object this fake service works for and starts
+// observing the two test notification topics.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::Initialize(
+    WeakPtr<SpeechRecognition> aSpeechRecognition) {
+  MOZ_ASSERT(NS_IsMainThread());
+  mRecognition = aSpeechRecognition;
+
+  // Topics handled by Observe().
+  static const char* const kObservedTopics[] = {
+      SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC,
+      SPEECH_RECOGNITION_TEST_END_TOPIC,
+  };
+
+  nsCOMPtr<nsIObserverService> observerService =
+      services::GetObserverService();
+  for (const char* topic : kObservedTopics) {
+    observerService->AddObserver(this, topic, false);
+  }
+  return NS_OK;
+}
+
+// Ignores the audio entirely; only asserts that it is not called on the main
+// thread.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
+ int32_t aSampleRate) {
+ MOZ_ASSERT(!NS_IsMainThread());
+ return NS_OK;
+}
+
+// No-op apart from asserting main-thread use.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::SoundEnd() {
+ MOZ_ASSERT(NS_IsMainThread());
+ return NS_OK;
+}
+
+// Accepts any grammar: returns NS_OK without inspecting the grammar and
+// without invoking the compilation callback.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::ValidateAndSetGrammarList(
+ mozilla::dom::SpeechGrammar*, nsISpeechGrammarCompilationCallback*) {
+ return NS_OK;
+}
+
+// No-op apart from asserting main-thread use.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::Abort() {
+ MOZ_ASSERT(NS_IsMainThread());
+ return NS_OK;
+}
+
+// Handles the two test notification topics.
+//
+// SPEECH_RECOGNITION_TEST_END_TOPIC unregisters this observer from both
+// topics. Any other notification is treated as an event request whose name is
+// carried in |aData|: EVENT_RECOGNITIONSERVICE_ERROR dispatches a network
+// error to the recognition object, EVENT_RECOGNITIONSERVICE_FINAL_RESULT
+// dispatches a final-result event carrying a mock result list. Unknown event
+// names are ignored. Always returns NS_OK.
+NS_IMETHODIMP
+FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic,
+                                      const char16_t* aData) {
+  MOZ_ASSERT(StaticPrefs::media_webspeech_test_fake_recognition_service(),
+             "Got request to fake recognition service event, but "
+             "media.webspeech.test.fake_recognition_service is not set");
+
+  if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
+    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
+    // The observer service can be unavailable (e.g. late in shutdown); there
+    // is nothing to unregister from in that case.
+    if (obs) {
+      obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
+      obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
+    }
+
+    return NS_OK;
+  }
+
+  // mRecognition is a weak pointer and this observer stays registered until
+  // the end topic arrives, so the recognition object may already be gone.
+  // Likewise a request without an event name cannot be acted upon.
+  if (!mRecognition || !aData) {
+    return NS_OK;
+  }
+
+  const nsDependentString eventName = nsDependentString(aData);
+
+  if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_ERROR")) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network,  // TODO different codes?
+        "RECOGNITIONSERVICE_ERROR test event");
+
+  } else if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_FINAL_RESULT")) {
+    RefPtr<SpeechEvent> event = new SpeechEvent(
+        mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
+
+    event->mRecognitionResultList = BuildMockResultList();
+    NS_DispatchToMainThread(event);
+  }
+  return NS_OK;
+}
+
+// Builds a result list holding a single result. That result contains one
+// alternative ("Mock final result", confidence 0) when the recognition
+// object allows at least one alternative, and is empty otherwise.
+SpeechRecognitionResultList*
+FakeSpeechRecognitionService::BuildMockResultList() {
+  auto* mockList = new SpeechRecognitionResultList(mRecognition);
+  auto* mockResult = new SpeechRecognitionResult(mRecognition);
+
+  const bool wantsAlternative = mRecognition->MaxAlternatives() > 0;
+  if (wantsAlternative) {
+    auto* mockAlternative = new SpeechRecognitionAlternative(mRecognition);
+
+    mockAlternative->mTranscript = u"Mock final result"_ns;
+    mockAlternative->mConfidence = 0.0f;
+
+    mockResult->mItems.AppendElement(mockAlternative);
+  }
+
+  mockList->mItems.AppendElement(mockResult);
+  return mockList;
+}
+
+} // namespace mozilla
diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h
new file mode 100644
index 0000000000..69e2786b76
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h
@@ -0,0 +1,40 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_FakeSpeechRecognitionService_h
+#define mozilla_dom_FakeSpeechRecognitionService_h
+
+#include "nsCOMPtr.h"
+#include "nsIObserver.h"
+#include "nsISpeechRecognitionService.h"
+
+// Class ID of the fake recognition service, as a brace initializer. No
+// trailing semicolon: the macro must remain usable inside expressions and
+// initializers, where the user supplies the terminator.
+#define NS_FAKE_SPEECH_RECOGNITION_SERVICE_CID \
+ {0x48c345e7, \
+ 0x9929, \
+ 0x4f9a, \
+ {0xa5, 0x63, 0xf4, 0x78, 0x22, 0x2d, 0xab, 0xcd}}
+
+namespace mozilla {
+
+// Test-only recognition service. It performs no recognition; instead it
+// observes test notifications and, on request, dispatches an error or a mock
+// final result to the SpeechRecognition object it was initialised with.
+class FakeSpeechRecognitionService : public nsISpeechRecognitionService,
+ public nsIObserver {
+ public:
+ NS_DECL_THREADSAFE_ISUPPORTS
+ NS_DECL_NSISPEECHRECOGNITIONSERVICE
+ NS_DECL_NSIOBSERVER
+
+ FakeSpeechRecognitionService();
+
+ private:
+ // Private destructor: instances are reference counted.
+ virtual ~FakeSpeechRecognitionService();
+
+ // Recognition object set by Initialize(); held weakly.
+ WeakPtr<dom::SpeechRecognition> mRecognition;
+ // Creates the result list sent with a fake final-result event. Returns a
+ // raw pointer to a newly allocated list.
+ dom::SpeechRecognitionResultList* BuildMockResultList();
+};
+
+} // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/recognition/test/head.js b/dom/media/webspeech/recognition/test/head.js
new file mode 100644
index 0000000000..c77a7ee926
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/head.js
@@ -0,0 +1,200 @@
+"use strict";
+
+// Audio sample used when a test does not name its own.
+const DEFAULT_AUDIO_SAMPLE_FILE = "hello.ogg";
+// Observer topics used to talk to the recognition code under test: one to
+// request an event by name, one to signal that the test is over.
+const SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC =
+ "SpeechRecognitionTest:RequestEvent";
+const SPEECH_RECOGNITION_TEST_END_TOPIC = "SpeechRecognitionTest:End";
+
+// Error code strings that tests compare against the `error` attribute of
+// error events.
+var errorCodes = {
+ NO_SPEECH: "no-speech",
+ ABORTED: "aborted",
+ AUDIO_CAPTURE: "audio-capture",
+ NETWORK: "network",
+ NOT_ALLOWED: "not-allowed",
+ SERVICE_NOT_ALLOWED: "service-not-allowed",
+ BAD_GRAMMAR: "bad-grammar",
+ LANGUAGE_NOT_SUPPORTED: "language-not-supported",
+};
+
+var Services = SpecialPowers.Services;
+
+function EventManager(sr) {
+ var self = this;
+ var nEventsExpected = 0;
+ self.eventsReceived = [];
+
+ var allEvents = [
+ "audiostart",
+ "soundstart",
+ "speechstart",
+ "speechend",
+ "soundend",
+ "audioend",
+ "result",
+ "nomatch",
+ "error",
+ "start",
+ "end",
+ ];
+
+ var eventDependencies = {
+ speechend: "speechstart",
+ soundend: "soundstart",
+ audioend: "audiostart",
+ };
+
+ var isDone = false;
+
+ // set up grammar
+ var sgl = new SpeechGrammarList();
+ sgl.addFromString("#JSGF V1.0; grammar test; public <simple> = hello ;", 1);
+ sr.grammars = sgl;
+
+ // AUDIO_DATA events are asynchronous,
+ // so we queue events requested while they are being
+ // issued to make them seem synchronous
+ var isSendingAudioData = false;
+ var queuedEventRequests = [];
+
+ // register default handlers
+ for (var i = 0; i < allEvents.length; i++) {
+ (function (eventName) {
+ sr["on" + eventName] = function (evt) {
+ var message = "unexpected event: " + eventName;
+ if (eventName == "error") {
+ message += " -- " + evt.message;
+ }
+
+ ok(false, message);
+ if (self.doneFunc && !isDone) {
+ isDone = true;
+ self.doneFunc();
+ }
+ };
+ })(allEvents[i]);
+ }
+
+ self.expect = function EventManager_expect(eventName, cb) {
+ nEventsExpected++;
+
+ sr["on" + eventName] = function (evt) {
+ self.eventsReceived.push(eventName);
+ ok(true, "received event " + eventName);
+
+ var dep = eventDependencies[eventName];
+ if (dep) {
+ ok(
+ self.eventsReceived.includes(dep),
+ eventName + " must come after " + dep
+ );
+ }
+
+ cb && cb(evt, sr);
+ if (
+ self.doneFunc &&
+ !isDone &&
+ nEventsExpected === self.eventsReceived.length
+ ) {
+ isDone = true;
+ self.doneFunc();
+ }
+ };
+ };
+
+ self.start = function EventManager_start() {
+ isSendingAudioData = true;
+ var audioTag = document.createElement("audio");
+ audioTag.src = self.audioSampleFile;
+
+ var stream = audioTag.mozCaptureStreamUntilEnded();
+ audioTag.addEventListener("ended", function () {
+ info("Sample stream ended, requesting queued events");
+ isSendingAudioData = false;
+ while (queuedEventRequests.length) {
+ self.requestFSMEvent(queuedEventRequests.shift());
+ }
+ });
+
+ audioTag.play();
+ sr.start(stream);
+ };
+
+ self.requestFSMEvent = function EventManager_requestFSMEvent(eventName) {
+ if (isSendingAudioData) {
+ info(
+ "Queuing event " + eventName + " until we're done sending audio data"
+ );
+ queuedEventRequests.push(eventName);
+ return;
+ }
+
+ info("requesting " + eventName);
+ Services.obs.notifyObservers(
+ null,
+ SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC,
+ eventName
+ );
+ };
+
+ self.requestTestEnd = function EventManager_requestTestEnd() {
+ Services.obs.notifyObservers(null, SPEECH_RECOGNITION_TEST_END_TOPIC);
+ };
+}
+
+function buildResultCallback(transcript) {
+ return function (evt) {
+ is(evt.results[0][0].transcript, transcript, "expect correct transcript");
+ };
+}
+
+function buildErrorCallback(errcode) {
+ return function (err) {
+ is(err.error, errcode, "expect correct error code");
+ };
+}
+
+function performTest(options) {
+ var prefs = options.prefs;
+
+ prefs.unshift(
+ ["media.webspeech.recognition.enable", true],
+ ["media.webspeech.test.enable", true]
+ );
+
+ SpecialPowers.pushPrefEnv({ set: prefs }, function () {
+ var sr;
+ if (!options.webkit) {
+ sr = new SpeechRecognition();
+ } else {
+ sr = new webkitSpeechRecognition();
+ var grammar = new webkitSpeechGrammar();
+ var speechrecognitionlist = new webkitSpeechGrammarList();
+ speechrecognitionlist.addFromString("", 1);
+ sr.grammars = speechrecognitionlist;
+ }
+ var em = new EventManager(sr);
+
+ for (var eventName in options.expectedEvents) {
+ var cb = options.expectedEvents[eventName];
+ em.expect(eventName, cb);
+ }
+
+ em.doneFunc = function () {
+ em.requestTestEnd();
+ if (options.doneFunc) {
+ options.doneFunc();
+ }
+ };
+
+ em.audioSampleFile = DEFAULT_AUDIO_SAMPLE_FILE;
+ if (options.audioSampleFile) {
+ em.audioSampleFile = options.audioSampleFile;
+ }
+
+ em.start();
+
+ for (var i = 0; i < options.eventsToRequest.length; i++) {
+ em.requestFSMEvent(options.eventsToRequest[i]);
+ }
+ });
+}
diff --git a/dom/media/webspeech/recognition/test/hello.ogg b/dom/media/webspeech/recognition/test/hello.ogg
new file mode 100644
index 0000000000..7a80926065
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/hello.ogg
Binary files differ
diff --git a/dom/media/webspeech/recognition/test/hello.ogg^headers^ b/dom/media/webspeech/recognition/test/hello.ogg^headers^
new file mode 100644
index 0000000000..4030ea1d3d
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/hello.ogg^headers^
@@ -0,0 +1 @@
+Cache-Control: no-store
diff --git a/dom/media/webspeech/recognition/test/http_requesthandler.sjs b/dom/media/webspeech/recognition/test/http_requesthandler.sjs
new file mode 100644
index 0000000000..3400df50ec
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/http_requesthandler.sjs
@@ -0,0 +1,85 @@
+const CC = Components.Constructor;
+
+// Context structure - we need to set this up properly to pass to setObjectState
+const ctx = {
+ QueryInterface(iid) {
+ if (iid.equals(Components.interfaces.nsISupports)) {
+ return this;
+ }
+ throw Components.Exception("", Components.results.NS_ERROR_NO_INTERFACE);
+ },
+};
+
+// Stores |request| as server object state.
+// NOTE(review): `key` is not declared anywhere in this file, so calling this
+// would throw a ReferenceError unless the environment supplies it. Nothing in
+// this file calls setRequest() -- confirm whether it can be removed.
+function setRequest(request) {
+ setObjectState(key, request);
+}
+// Returns whatever value getObjectState passes to the callback.
+// NOTE(review): getObjectState is called here with a callback only, whereas
+// handleRequest() calls it as (key, callback) -- the key argument looks to be
+// missing. Nothing in this file calls getRequest() -- confirm whether it can
+// be removed.
+function getRequest() {
+ let request;
+ getObjectState(v => {
+ request = v;
+ });
+ return request;
+}
+
+function handleRequest(request, response) {
+ response.processAsync();
+ if (request.queryString == "save") {
+ // Get the context structure and finish the old request
+ getObjectState("context", function (obj) {
+ savedCtx = obj.wrappedJSObject;
+ request = savedCtx.request;
+
+ response.setHeader("Content-Type", "application/octet-stream", false);
+ response.setHeader("Access-Control-Allow-Origin", "*", false);
+ response.setHeader("Cache-Control", "no-cache", false);
+ response.setStatusLine(request.httpVersion, 200, "OK");
+
+ const input = request.bodyInputStream;
+ const output = response.bodyOutputStream;
+ let bodyAvail;
+ while ((bodyAvail = input.available()) > 0) {
+ output.writeFrom(input, bodyAvail);
+ }
+ response.finish();
+ });
+ return;
+ } else if (
+ request.queryString == "malformedresult=1" ||
+ request.queryString == "emptyresult=1"
+ ) {
+ jsonOK =
+ request.queryString == "malformedresult=1"
+ ? '{"status":"ok","dat'
+ : '{"status":"ok","data":[]}';
+ response.setHeader("Content-Length", String(jsonOK.length), false);
+ response.setHeader("Content-Type", "application/json", false);
+ response.setHeader("Access-Control-Allow-Origin", "*", false);
+ response.setHeader("Cache-Control", "no-cache", false);
+ response.setStatusLine(request.httpVersion, 200, "OK");
+ response.write(jsonOK, jsonOK.length);
+ response.finish();
+ } else if (request.queryString == "hangup=1") {
+ response.finish();
+ } else if (request.queryString == "return400=1") {
+ jsonOK = "{'message':'Bad header:accept-language-stt'}";
+ response.setHeader("Content-Length", String(jsonOK.length), false);
+ response.setHeader("Content-Type", "application/json", false);
+ response.setHeader("Access-Control-Allow-Origin", "*", false);
+ response.setHeader("Cache-Control", "no-cache", false);
+ response.setStatusLine(request.httpVersion, 400, "Bad Request");
+ response.write(jsonOK, jsonOK.length);
+ response.finish();
+ } else {
+ ctx.wrappedJSObject = ctx;
+ ctx.request = request;
+ setObjectState("context", ctx);
+ jsonOK = '{"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}';
+ response.setHeader("Content-Length", String(jsonOK.length), false);
+ response.setHeader("Content-Type", "application/json", false);
+ response.setHeader("Access-Control-Allow-Origin", "*", false);
+ response.setHeader("Cache-Control", "no-cache", false);
+ response.setStatusLine(request.httpVersion, 200, "OK");
+ response.write(jsonOK, jsonOK.length);
+ response.finish();
+ }
+}
diff --git a/dom/media/webspeech/recognition/test/mochitest.ini b/dom/media/webspeech/recognition/test/mochitest.ini
new file mode 100644
index 0000000000..6af13b906c
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/mochitest.ini
@@ -0,0 +1,35 @@
+[DEFAULT]
+tags=mtg
+subsuite = media
+support-files =
+ head.js
+ hello.ogg
+ hello.ogg^headers^
+ http_requesthandler.sjs
+ sinoid+hello.ogg
+ sinoid+hello.ogg^headers^
+ silence.ogg
+ silence.ogg^headers^
+[test_abort.html]
+skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363
+[test_audio_capture_error.html]
+skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360
+[test_call_start_from_end_handler.html]
+tags=capturestream
+skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363
+[test_nested_eventloop.html]
+skip-if = toolkit == 'android'
+[test_online_400_response.html]
+[test_online_hangup.html]
+[test_online_http.html]
+[test_online_http_webkit.html]
+[test_online_malformed_result_handling.html]
+[test_online_empty_result_handling.html]
+[test_preference_enable.html]
+[test_recognition_service_error.html]
+skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360
+[test_success_without_recognition_service.html]
+skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360
+[test_timeout.html]
+skip-if =
+ os == "linux" # Bug 1307991 - low frequency on try pushes
diff --git a/dom/media/webspeech/recognition/test/silence.ogg b/dom/media/webspeech/recognition/test/silence.ogg
new file mode 100644
index 0000000000..e6da3a5022
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/silence.ogg
Binary files differ
diff --git a/dom/media/webspeech/recognition/test/silence.ogg^headers^ b/dom/media/webspeech/recognition/test/silence.ogg^headers^
new file mode 100644
index 0000000000..4030ea1d3d
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/silence.ogg^headers^
@@ -0,0 +1 @@
+Cache-Control: no-store
diff --git a/dom/media/webspeech/recognition/test/sinoid+hello.ogg b/dom/media/webspeech/recognition/test/sinoid+hello.ogg
new file mode 100644
index 0000000000..7092e82f30
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg
Binary files differ
diff --git a/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^
new file mode 100644
index 0000000000..4030ea1d3d
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^
@@ -0,0 +1 @@
+Cache-Control: no-store
diff --git a/dom/media/webspeech/recognition/test/test_abort.html b/dom/media/webspeech/recognition/test/test_abort.html
new file mode 100644
index 0000000000..0f22770cc7
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_abort.html
@@ -0,0 +1,73 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Call abort from inside handlers</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // Abort inside event handlers; we shouldn't get a
+ // result after that
+
+ var nextEventIdx = 0;
+ var eventsToAbortOn = [
+ "start",
+ "audiostart",
+ "speechstart",
+ "speechend",
+ "audioend"
+ ];
+
+ function doNextTest() {
+ var nextEvent = eventsToAbortOn[nextEventIdx];
+ var expectedEvents = {
+ "start": null,
+ "audiostart": null,
+ "audioend": null,
+ "end": null
+ };
+
+ if (nextEventIdx >= eventsToAbortOn.indexOf("speechstart")) {
+ expectedEvents.speechstart = null;
+ }
+
+ if (nextEventIdx >= eventsToAbortOn.indexOf("speechend")) {
+ expectedEvents.speechend = null;
+ }
+
+ info("Aborting on " + nextEvent);
+ expectedEvents[nextEvent] = function(evt, sr) {
+ sr.abort();
+ };
+
+ nextEventIdx++;
+
+ performTest({
+ eventsToRequest: [],
+ expectedEvents,
+ doneFunc: (nextEventIdx < eventsToAbortOn.length) ? doNextTest : SimpleTest.finish,
+ prefs: [["media.webspeech.test.fake_fsm_events", true],
+ ["media.webspeech.test.fake_recognition_service", true],
+ ["media.webspeech.recognition.timeout", 100000]]
+ });
+ }
+
+ doNextTest();
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_audio_capture_error.html b/dom/media/webspeech/recognition/test/test_audio_capture_error.html
new file mode 100644
index 0000000000..0c054dbf0b
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_audio_capture_error.html
@@ -0,0 +1,42 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Behavior on audio error</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // Request a fake audio error and expect it to surface as an
+ // "audio-capture" error event among the listed events.
+ performTest({
+ eventsToRequest: ['EVENT_AUDIO_ERROR'],
+ expectedEvents: {
+ 'start': null,
+ 'audiostart': null,
+ 'speechstart': null,
+ 'speechend': null,
+ 'audioend': null,
+ 'error': buildErrorCallback(errorCodes.AUDIO_CAPTURE),
+ 'end': null
+ },
+ doneFunc: SimpleTest.finish,
+ prefs: [["media.webspeech.test.fake_fsm_events", true],
+ ["media.webspeech.test.fake_recognition_service", true],
+ ["media.webspeech.recognition.timeout", 100000]]
+ });
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html
new file mode 100644
index 0000000000..895648ad9e
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html
@@ -0,0 +1,102 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Restart recognition from end handler</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // Builds an <audio> element for DEFAULT_AUDIO_SAMPLE_FILE and returns its captured stream.
+ function createAudioStream() {
+   const player = document.createElement("audio");
+   player.src = DEFAULT_AUDIO_SAMPLE_FILE;
+   // The stream is captured first; playback is started afterwards.
+   const captured = player.mozCaptureStreamUntilEnded();
+   player.play();
+   return captured;
+ }
+
+ var done = false; // set after the restarted recognition has been aborted
+ // "end" handler: restarts recognition, aborts it, and finishes the test once `done` is set.
+ function endHandler(evt, sr) {
+   if (done) {
+     SimpleTest.finish();
+     return;
+   }
+
+   try {
+     sr.start(createAudioStream()); // shouldn't fail
+     // Logged only when start() returned normally, so a failure is never reported as success.
+     info("Successfully start() from end() callback");
+   } catch (err) {
+     ok(false, "Failed to start() from end() callback");
+   }
+
+   // calling start() may cause some callbacks to fire, but we're
+   // no longer interested in them, except for onend, which is where
+   // we'll conclude the test.
+   sr.onstart = null;
+   sr.onaudiostart = null;
+   sr.onspeechstart = null;
+   sr.onspeechend = null;
+   sr.onaudioend = null;
+   sr.onresult = null;
+
+   // FIXME(ggp) the state transition caused by start() is async,
+   // but abort() is sync (see bug 1055093). until we normalize
+   // state transitions, we need to setTimeout here to make sure
+   // abort() finds the speech recognition object in the correct
+   // state (namely, STATE_STARTING).
+   setTimeout(function() {
+     sr.abort();
+     done = true;
+   });
+ }
+
+ // Event handler that calls start() again and passes only if that throws InvalidStateError.
+ function expectExceptionHandler(evt, sr) {
+   try {
+     sr.start(createAudioStream());
+   } catch (err) {
+     is(err.name, "InvalidStateError", "start() from an event handler should throw InvalidStateError");
+     return;
+   }
+   ok(false, "Calling start() didn't raise InvalidStateError");
+ }
+
+ // "start" through "audioend" each retry start() and expect it to throw; "end" is handled
+ // by endHandler. No doneFunc is passed: endHandler calls SimpleTest.finish() itself.
+ performTest({
+   eventsToRequest: ["EVENT_RECOGNITIONSERVICE_FINAL_RESULT"],
+   expectedEvents: {
+     start: expectExceptionHandler,
+     audiostart: expectExceptionHandler,
+     speechstart: expectExceptionHandler,
+     speechend: expectExceptionHandler,
+     audioend: expectExceptionHandler,
+     result: buildResultCallback("Mock final result"),
+     end: endHandler,
+   },
+   prefs: [["media.webspeech.test.fake_fsm_events", true],
+           ["media.webspeech.test.fake_recognition_service", true],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_nested_eventloop.html b/dom/media/webspeech/recognition/test/test_nested_eventloop.html
new file mode 100644
index 0000000000..4924766b44
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_nested_eventloop.html
@@ -0,0 +1,82 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Spin the event loop from inside a callback</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ /*
+ * abortAndSpinEventLoop aborts recognition and then spins the event loop via
+ * SpecialPowers.spinEventLoop, causing queued SpeechEvents (such as those
+ * created by calls to start(), stop() or abort()) to be processed immediately.
+ * Doing this from inside a DOM event handler can cause reentrancy in our
+ * C++ code, which we should be able to withstand.
+ */
+ function abortAndSpinEventLoop(evt, sr) {
+ sr.abort();
+ SpecialPowers.spinEventLoop(window);
+ }
+ function doneFunc() {
+   // Run SpecialPowers.gc() GC_ROUNDS times from zero-delay timeouts and finish
+   // after the last one, so that this test gets the blame for any assertions
+   // caused by spinning the event loop.
+   // NB - The assertions should be gone, but this looks too scary to touch
+   // during batch cleanup.
+   const GC_ROUNDS = 4;
+   let completed = 0;
+
+   const gcThenMaybeFinish = () => {
+     SpecialPowers.gc();
+     completed += 1;
+     if (completed == GC_ROUNDS) {
+       SimpleTest.finish();
+     }
+   };
+
+   for (let round = 0; round < GC_ROUNDS; round++) {
+     setTimeout(gcThenMaybeFinish, 0);
+   }
+ }
+
+ /*
+ * We start by performing a normal start, then abort from the audiostart
+ * callback and force the EVENT_ABORT to be processed while still inside
+ * the event handler. This causes the recording to stop, which raises
+ * the audioend and (later on) end events.
+ * Then, we abort (once again spinning the event loop) from the audioend
+ * handler, attempting to cause a re-entry into the abort code. This second
+ * call should be ignored, and we get the end callback and finish.
+ */
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     audiostart: abortAndSpinEventLoop, // first abort, from the audiostart handler
+     audioend: abortAndSpinEventLoop, // second abort, from the audioend handler
+     end: null,
+   },
+   doneFunc: doneFunc,
+   prefs: [["media.webspeech.test.fake_fsm_events", true],
+           ["media.webspeech.test.fake_recognition_service", true],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_400_response.html b/dom/media/webspeech/recognition/test/test_online_400_response.html
new file mode 100644
index 0000000000..1a7d0ed452
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_400_response.html
@@ -0,0 +1,47 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns a 400 error
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // The endpoint pref points at http_requesthandler.sjs?return400=1; "error" uses errorCodes.NETWORK.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     error: buildErrorCallback(errorCodes.NETWORK),
+     speechstart: null, speechend: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?return400=1"],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html
new file mode 100644
index 0000000000..46f1e7e0cb
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html
@@ -0,0 +1,48 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns a valid json object, but without any transcription
+results on it, for example: `{"status":"ok","data":[]}`
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // The endpoint pref points at http_requesthandler.sjs?emptyresult=1; "error" uses errorCodes.NETWORK.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     error: buildErrorCallback(errorCodes.NETWORK),
+     speechstart: null, speechend: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?emptyresult=1"],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_hangup.html b/dom/media/webspeech/recognition/test/test_online_hangup.html
new file mode 100644
index 0000000000..4a46f80f8f
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_hangup.html
@@ -0,0 +1,47 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server hangs up the connection without sending any response
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // The endpoint pref points at http_requesthandler.sjs?hangup=1; "error" uses errorCodes.NETWORK.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     error: buildErrorCallback(errorCodes.NETWORK),
+     speechstart: null, speechend: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?hangup=1"],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_http.html b/dom/media/webspeech/recognition/test/test_online_http.html
new file mode 100644
index 0000000000..43be7a656a
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_http.html
@@ -0,0 +1,89 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test a successful speech recognition request and
+that audio is being properly encoded
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // Decodes `buffer`, loops it through an AnalyserNode and resolves to true once the
+ // spectrum shows a strong 440 Hz bin with weak 200 Hz and 1000 Hz bins, or to false
+ // if that has not happened after 10 seconds.
+ async function validateRawAudio(buffer) {
+   const ac = new AudioContext();
+   const decodedData = await ac.decodeAudioData(buffer);
+   const source = ac.createBufferSource();
+   source.buffer = decodedData;
+   source.loop = true;
+   const analyser = ac.createAnalyser();
+   analyser.smoothingTimeConstant = 0.2;
+   analyser.fftSize = 1024;
+   source.connect(analyser);
+   const binIndexForFrequency = frequency =>
+     1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
+   source.start();
+   const data = new Uint8Array(analyser.frequencyBinCount);
+   const start = performance.now();
+   // Poll once per animation frame until the 10000 ms deadline has passed.
+   while (performance.now() - start <= 10000) {
+     analyser.getByteFrequencyData(data);
+     if (data[binIndexForFrequency(200)] < 50 &&
+         data[binIndexForFrequency(440)] > 180 &&
+         data[binIndexForFrequency(1000)] < 50) {
+       return true;
+     }
+     await new Promise(r => requestAnimationFrame(r));
+   }
+   return false;
+ }
+
+ // Fetches requestUrl, checks the body with validateRawAudio and always finishes the test.
+ async function verifyEncodedAudio(requestUrl) {
+   try {
+     const audioBytes = await (await fetch(requestUrl)).arrayBuffer();
+     ok(await validateRawAudio(audioBytes), "Audio encoding is valid");
+   } catch (failure) {
+     ok(false, failure);
+   } finally {
+     SimpleTest.finish();
+   }
+ }
+
+ // No doneFunc is passed here: the "result" handler ends the test through verifyEncodedAudio.
+ performTest({
+   eventsToRequest: [], // an empty list, matching the tests that pass an array here
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     result: () => verifyEncodedAudio("http_requesthandler.sjs?save"),
+     speechstart: null, speechend: null,
+   },
+   audioSampleFile: "sinoid+hello.ogg",
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_http_webkit.html b/dom/media/webspeech/recognition/test/test_online_http_webkit.html
new file mode 100644
index 0000000000..7f6c7e6d7d
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_http_webkit.html
@@ -0,0 +1,90 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test a successful speech recognition request and
+that audio is being properly encoded
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // Decodes `buffer`, loops it through an AnalyserNode and resolves to true once the
+ // spectrum shows a strong 440 Hz bin with weak 200 Hz and 1000 Hz bins, or to false
+ // if that has not happened after 10 seconds.
+ async function validateRawAudio(buffer) {
+   const ac = new AudioContext();
+   const decodedData = await ac.decodeAudioData(buffer);
+   const source = ac.createBufferSource();
+   source.buffer = decodedData;
+   source.loop = true;
+   const analyser = ac.createAnalyser();
+   analyser.smoothingTimeConstant = 0.2;
+   analyser.fftSize = 1024;
+   source.connect(analyser);
+   const binIndexForFrequency = frequency =>
+     1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
+   source.start();
+   const data = new Uint8Array(analyser.frequencyBinCount);
+   const start = performance.now();
+   // Poll once per animation frame until the 10000 ms deadline has passed.
+   while (performance.now() - start <= 10000) {
+     analyser.getByteFrequencyData(data);
+     if (data[binIndexForFrequency(200)] < 50 &&
+         data[binIndexForFrequency(440)] > 180 &&
+         data[binIndexForFrequency(1000)] < 50) {
+       return true;
+     }
+     await new Promise(r => requestAnimationFrame(r));
+   }
+   return false;
+ }
+
+ // Fetches requestUrl, checks the body with validateRawAudio and always finishes the test.
+ async function verifyEncodedAudio(requestUrl) {
+   try {
+     const audioBytes = await (await fetch(requestUrl)).arrayBuffer();
+     ok(await validateRawAudio(audioBytes), "Audio encoding is valid");
+   } catch (failure) {
+     ok(false, failure);
+   } finally {
+     SimpleTest.finish();
+   }
+ }
+
+ // No doneFunc is passed here: the "result" handler ends the test through verifyEncodedAudio.
+ performTest({
+   eventsToRequest: [], // an empty list, matching the tests that pass an array here
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     result: () => verifyEncodedAudio("http_requesthandler.sjs?save"),
+     speechstart: null, speechend: null,
+   },
+   audioSampleFile: "sinoid+hello.ogg",
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
+           ["media.webspeech.recognition.timeout", 100000]],
+   webkit: true,
+ });
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html
new file mode 100644
index 0000000000..b071a46ea3
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html
@@ -0,0 +1,48 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns an invalid/corrupted json object, for example:
+`{"status":"ok","dat`
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 1248897 -- Online speech service</title>
+ <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // The endpoint pref points at http_requesthandler.sjs?malformedresult=1; "error" uses errorCodes.NETWORK.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     audioend: null,
+     end: null,
+     error: buildErrorCallback(errorCodes.NETWORK),
+     speechstart: null, speechend: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.recognition.enable", true],
+           ["media.webspeech.recognition.force_enable", true],
+           ["media.webspeech.service.endpoint",
+            "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?malformedresult=1"],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_preference_enable.html b/dom/media/webspeech/recognition/test/test_preference_enable.html
new file mode 100644
index 0000000000..2b56f82e2c
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_preference_enable.html
@@ -0,0 +1,43 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- No objects should be visible with preference disabled</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ SimpleTest.waitForExplicitFinish();
+
+ // With the recognition pref set to false, each listed name must be undefined on window.
+ SpecialPowers.pushPrefEnv({
+   set: [["media.webspeech.recognition.enable", false]],
+ }, () => {
+   const hiddenInterfaces = [
+     "SpeechRecognition",
+     "SpeechGrammar",
+     "SpeechRecognitionResult",
+     "SpeechRecognitionResultList",
+     "SpeechRecognitionAlternative",
+   ];
+
+   for (const name of hiddenInterfaces) {
+     is(window[name], undefined, name + " should be undefined with pref off");
+   }
+
+   SimpleTest.finish();
+ });
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_recognition_service_error.html b/dom/media/webspeech/recognition/test/test_recognition_service_error.html
new file mode 100644
index 0000000000..e8e59e2afc
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_recognition_service_error.html
@@ -0,0 +1,45 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Behavior on recognition service error</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // Requests EVENT_RECOGNITIONSERVICE_ERROR; the "error" handler is built from
+ // errorCodes.NETWORK.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: ["EVENT_RECOGNITIONSERVICE_ERROR"],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     speechstart: null,
+     speechend: null,
+     audioend: null,
+     error: buildErrorCallback(errorCodes.NETWORK),
+     end: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.test.fake_fsm_events", true],
+           ["media.webspeech.test.fake_recognition_service", true],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html
new file mode 100644
index 0000000000..38748ed5cb
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html
@@ -0,0 +1,45 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Success with fake recognition service</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // Requests EVENT_RECOGNITIONSERVICE_FINAL_RESULT; the "result" handler is built by
+ // buildResultCallback("Mock final result").
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: ["EVENT_RECOGNITIONSERVICE_FINAL_RESULT"],
+   expectedEvents: {
+     start: null,
+     audiostart: null,
+     speechstart: null,
+     speechend: null,
+     audioend: null,
+     result: buildResultCallback("Mock final result"),
+     end: null,
+   },
+   doneFunc: SimpleTest.finish,
+   prefs: [["media.webspeech.test.fake_fsm_events", true],
+           ["media.webspeech.test.fake_recognition_service", true],
+           ["media.webspeech.recognition.timeout", 100000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>
diff --git a/dom/media/webspeech/recognition/test/test_timeout.html b/dom/media/webspeech/recognition/test/test_timeout.html
new file mode 100644
index 0000000000..8334c9e779
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/test_timeout.html
@@ -0,0 +1,42 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=650295
+-->
+<head>
+ <meta charset="utf-8">
+ <title>Test for Bug 650295 -- Timeout for user speech</title>
+ <script src="/tests/SimpleTest/SimpleTest.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+ <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=650295">Mozilla Bug 650295</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+ // Uses silence.ogg with the timeout pref set to 1000; "error" uses errorCodes.NO_SPEECH.
+ SimpleTest.waitForExplicitFinish();
+
+ performTest({
+   eventsToRequest: [],
+   expectedEvents: {
+     start: null, audiostart: null,
+     audioend: null,
+     error: buildErrorCallback(errorCodes.NO_SPEECH),
+     end: null,
+   },
+   doneFunc: SimpleTest.finish,
+   audioSampleFile: "silence.ogg",
+   prefs: [["media.webspeech.test.fake_fsm_events", true],
+           ["media.webspeech.test.fake_recognition_service", true],
+           ["media.webspeech.recognition.timeout", 1000]],
+ });
+
+</script>
+</pre>
+</body>
+</html>