From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- dom/media/webspeech/moz.build | 12 + .../recognition/OnlineSpeechRecognitionService.cpp | 462 ++++++++ .../recognition/OnlineSpeechRecognitionService.h | 132 +++ dom/media/webspeech/recognition/SpeechGrammar.cpp | 57 + dom/media/webspeech/recognition/SpeechGrammar.h | 64 ++ .../webspeech/recognition/SpeechGrammarList.cpp | 76 ++ .../webspeech/recognition/SpeechGrammarList.h | 73 ++ .../webspeech/recognition/SpeechRecognition.cpp | 1170 ++++++++++++++++++++ .../webspeech/recognition/SpeechRecognition.h | 314 ++++++ .../recognition/SpeechRecognitionAlternative.cpp | 44 + .../recognition/SpeechRecognitionAlternative.h | 49 + .../recognition/SpeechRecognitionResult.cpp | 59 + .../recognition/SpeechRecognitionResult.h | 54 + .../recognition/SpeechRecognitionResultList.cpp | 58 + .../recognition/SpeechRecognitionResultList.h | 53 + .../webspeech/recognition/SpeechTrackListener.cpp | 92 ++ .../webspeech/recognition/SpeechTrackListener.h | 50 + dom/media/webspeech/recognition/endpointer.cc | 193 ++++ dom/media/webspeech/recognition/endpointer.h | 180 +++ .../webspeech/recognition/energy_endpointer.cc | 393 +++++++ .../webspeech/recognition/energy_endpointer.h | 180 +++ .../recognition/energy_endpointer_params.cc | 77 ++ .../recognition/energy_endpointer_params.h | 159 +++ dom/media/webspeech/recognition/moz.build | 64 ++ .../recognition/nsISpeechRecognitionService.idl | 43 + .../test/FakeSpeechRecognitionService.cpp | 118 ++ .../test/FakeSpeechRecognitionService.h | 40 + dom/media/webspeech/recognition/test/head.js | 200 ++++ dom/media/webspeech/recognition/test/hello.ogg | Bin 0 -> 11328 bytes .../webspeech/recognition/test/hello.ogg^headers^ | 1 + .../recognition/test/http_requesthandler.sjs | 85 ++ dom/media/webspeech/recognition/test/mochitest.ini | 35 + 
dom/media/webspeech/recognition/test/silence.ogg | Bin 0 -> 106941 bytes .../recognition/test/silence.ogg^headers^ | 1 + .../webspeech/recognition/test/sinoid+hello.ogg | Bin 0 -> 29514 bytes .../recognition/test/sinoid+hello.ogg^headers^ | 1 + .../webspeech/recognition/test/test_abort.html | 73 ++ .../recognition/test/test_audio_capture_error.html | 42 + .../test/test_call_start_from_end_handler.html | 102 ++ .../recognition/test/test_nested_eventloop.html | 82 ++ .../recognition/test/test_online_400_response.html | 47 + .../test/test_online_empty_result_handling.html | 48 + .../recognition/test/test_online_hangup.html | 47 + .../recognition/test/test_online_http.html | 89 ++ .../recognition/test/test_online_http_webkit.html | 90 ++ .../test_online_malformed_result_handling.html | 48 + .../recognition/test/test_preference_enable.html | 43 + .../test/test_recognition_service_error.html | 45 + .../test_success_without_recognition_service.html | 45 + .../webspeech/recognition/test/test_timeout.html | 42 + dom/media/webspeech/synth/SpeechSynthesis.cpp | 315 ++++++ dom/media/webspeech/synth/SpeechSynthesis.h | 88 ++ .../webspeech/synth/SpeechSynthesisUtterance.cpp | 137 +++ .../webspeech/synth/SpeechSynthesisUtterance.h | 115 ++ dom/media/webspeech/synth/SpeechSynthesisVoice.cpp | 72 ++ dom/media/webspeech/synth/SpeechSynthesisVoice.h | 55 + .../synth/android/SpeechSynthesisService.cpp | 215 ++++ .../synth/android/SpeechSynthesisService.h | 68 ++ dom/media/webspeech/synth/android/components.conf | 17 + dom/media/webspeech/synth/android/moz.build | 19 + .../synth/cocoa/OSXSpeechSynthesizerService.h | 42 + .../synth/cocoa/OSXSpeechSynthesizerService.mm | 431 +++++++ dom/media/webspeech/synth/cocoa/components.conf | 17 + dom/media/webspeech/synth/cocoa/moz.build | 15 + dom/media/webspeech/synth/crashtests/1230428.html | 32 + .../webspeech/synth/crashtests/crashtests.list | 1 + .../webspeech/synth/ipc/PSpeechSynthesis.ipdl | 50 + .../synth/ipc/PSpeechSynthesisRequest.ipdl 
| 48 + .../webspeech/synth/ipc/SpeechSynthesisChild.cpp | 169 +++ .../webspeech/synth/ipc/SpeechSynthesisChild.h | 107 ++ .../webspeech/synth/ipc/SpeechSynthesisParent.cpp | 221 ++++ .../webspeech/synth/ipc/SpeechSynthesisParent.h | 102 ++ dom/media/webspeech/synth/moz.build | 65 ++ dom/media/webspeech/synth/nsISpeechService.idl | 143 +++ .../webspeech/synth/nsISynthVoiceRegistry.idl | 77 ++ dom/media/webspeech/synth/nsSpeechTask.cpp | 389 +++++++ dom/media/webspeech/synth/nsSpeechTask.h | 128 +++ dom/media/webspeech/synth/nsSynthVoiceRegistry.cpp | 762 +++++++++++++ dom/media/webspeech/synth/nsSynthVoiceRegistry.h | 99 ++ .../synth/speechd/SpeechDispatcherService.cpp | 538 +++++++++ .../synth/speechd/SpeechDispatcherService.h | 65 ++ dom/media/webspeech/synth/speechd/components.conf | 17 + dom/media/webspeech/synth/speechd/moz.build | 15 + dom/media/webspeech/synth/test/common.js | 104 ++ dom/media/webspeech/synth/test/components.conf | 17 + .../webspeech/synth/test/file_bfcache_page1.html | 18 + .../webspeech/synth/test/file_bfcache_page2.html | 14 + .../webspeech/synth/test/file_global_queue.html | 69 ++ .../synth/test/file_global_queue_cancel.html | 88 ++ .../synth/test/file_global_queue_pause.html | 130 +++ .../synth/test/file_indirect_service_events.html | 102 ++ dom/media/webspeech/synth/test/file_setup.html | 96 ++ .../webspeech/synth/test/file_speech_cancel.html | 100 ++ .../webspeech/synth/test/file_speech_error.html | 46 + .../webspeech/synth/test/file_speech_queue.html | 86 ++ .../test/file_speech_repeating_utterance.html | 26 + .../webspeech/synth/test/file_speech_simple.html | 53 + dom/media/webspeech/synth/test/mochitest.ini | 29 + .../webspeech/synth/test/nsFakeSynthServices.cpp | 288 +++++ .../webspeech/synth/test/nsFakeSynthServices.h | 42 + .../synth/test/startup/file_voiceschanged.html | 32 + .../webspeech/synth/test/startup/mochitest.ini | 8 + .../synth/test/startup/test_voiceschanged.html | 32 + dom/media/webspeech/synth/test/test_bfcache.html 
| 46 + .../webspeech/synth/test/test_global_queue.html | 35 + .../synth/test/test_global_queue_cancel.html | 35 + .../synth/test/test_global_queue_pause.html | 35 + .../synth/test/test_indirect_service_events.html | 36 + dom/media/webspeech/synth/test/test_setup.html | 32 + .../webspeech/synth/test/test_speech_cancel.html | 35 + .../webspeech/synth/test/test_speech_error.html | 35 + .../webspeech/synth/test/test_speech_queue.html | 37 + .../test/test_speech_repeating_utterance.html | 18 + .../webspeech/synth/test/test_speech_simple.html | 34 + dom/media/webspeech/synth/windows/SapiService.cpp | 445 ++++++++ dom/media/webspeech/synth/windows/SapiService.h | 57 + dom/media/webspeech/synth/windows/components.conf | 17 + dom/media/webspeech/synth/windows/moz.build | 17 + 118 files changed, 12360 insertions(+) create mode 100644 dom/media/webspeech/moz.build create mode 100644 dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp create mode 100644 dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h create mode 100644 dom/media/webspeech/recognition/SpeechGrammar.cpp create mode 100644 dom/media/webspeech/recognition/SpeechGrammar.h create mode 100644 dom/media/webspeech/recognition/SpeechGrammarList.cpp create mode 100644 dom/media/webspeech/recognition/SpeechGrammarList.h create mode 100644 dom/media/webspeech/recognition/SpeechRecognition.cpp create mode 100644 dom/media/webspeech/recognition/SpeechRecognition.h create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionAlternative.h create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionResult.cpp create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionResult.h create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp create mode 100644 dom/media/webspeech/recognition/SpeechRecognitionResultList.h create mode 100644 
dom/media/webspeech/recognition/SpeechTrackListener.cpp create mode 100644 dom/media/webspeech/recognition/SpeechTrackListener.h create mode 100644 dom/media/webspeech/recognition/endpointer.cc create mode 100644 dom/media/webspeech/recognition/endpointer.h create mode 100644 dom/media/webspeech/recognition/energy_endpointer.cc create mode 100644 dom/media/webspeech/recognition/energy_endpointer.h create mode 100644 dom/media/webspeech/recognition/energy_endpointer_params.cc create mode 100644 dom/media/webspeech/recognition/energy_endpointer_params.h create mode 100644 dom/media/webspeech/recognition/moz.build create mode 100644 dom/media/webspeech/recognition/nsISpeechRecognitionService.idl create mode 100644 dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp create mode 100644 dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h create mode 100644 dom/media/webspeech/recognition/test/head.js create mode 100644 dom/media/webspeech/recognition/test/hello.ogg create mode 100644 dom/media/webspeech/recognition/test/hello.ogg^headers^ create mode 100644 dom/media/webspeech/recognition/test/http_requesthandler.sjs create mode 100644 dom/media/webspeech/recognition/test/mochitest.ini create mode 100644 dom/media/webspeech/recognition/test/silence.ogg create mode 100644 dom/media/webspeech/recognition/test/silence.ogg^headers^ create mode 100644 dom/media/webspeech/recognition/test/sinoid+hello.ogg create mode 100644 dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ create mode 100644 dom/media/webspeech/recognition/test/test_abort.html create mode 100644 dom/media/webspeech/recognition/test/test_audio_capture_error.html create mode 100644 dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html create mode 100644 dom/media/webspeech/recognition/test/test_nested_eventloop.html create mode 100644 dom/media/webspeech/recognition/test/test_online_400_response.html create mode 100644 
dom/media/webspeech/recognition/test/test_online_empty_result_handling.html create mode 100644 dom/media/webspeech/recognition/test/test_online_hangup.html create mode 100644 dom/media/webspeech/recognition/test/test_online_http.html create mode 100644 dom/media/webspeech/recognition/test/test_online_http_webkit.html create mode 100644 dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html create mode 100644 dom/media/webspeech/recognition/test/test_preference_enable.html create mode 100644 dom/media/webspeech/recognition/test/test_recognition_service_error.html create mode 100644 dom/media/webspeech/recognition/test/test_success_without_recognition_service.html create mode 100644 dom/media/webspeech/recognition/test/test_timeout.html create mode 100644 dom/media/webspeech/synth/SpeechSynthesis.cpp create mode 100644 dom/media/webspeech/synth/SpeechSynthesis.h create mode 100644 dom/media/webspeech/synth/SpeechSynthesisUtterance.cpp create mode 100644 dom/media/webspeech/synth/SpeechSynthesisUtterance.h create mode 100644 dom/media/webspeech/synth/SpeechSynthesisVoice.cpp create mode 100644 dom/media/webspeech/synth/SpeechSynthesisVoice.h create mode 100644 dom/media/webspeech/synth/android/SpeechSynthesisService.cpp create mode 100644 dom/media/webspeech/synth/android/SpeechSynthesisService.h create mode 100644 dom/media/webspeech/synth/android/components.conf create mode 100644 dom/media/webspeech/synth/android/moz.build create mode 100644 dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.h create mode 100644 dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.mm create mode 100644 dom/media/webspeech/synth/cocoa/components.conf create mode 100644 dom/media/webspeech/synth/cocoa/moz.build create mode 100644 dom/media/webspeech/synth/crashtests/1230428.html create mode 100644 dom/media/webspeech/synth/crashtests/crashtests.list create mode 100644 dom/media/webspeech/synth/ipc/PSpeechSynthesis.ipdl create mode 100644 
dom/media/webspeech/synth/ipc/PSpeechSynthesisRequest.ipdl create mode 100644 dom/media/webspeech/synth/ipc/SpeechSynthesisChild.cpp create mode 100644 dom/media/webspeech/synth/ipc/SpeechSynthesisChild.h create mode 100644 dom/media/webspeech/synth/ipc/SpeechSynthesisParent.cpp create mode 100644 dom/media/webspeech/synth/ipc/SpeechSynthesisParent.h create mode 100644 dom/media/webspeech/synth/moz.build create mode 100644 dom/media/webspeech/synth/nsISpeechService.idl create mode 100644 dom/media/webspeech/synth/nsISynthVoiceRegistry.idl create mode 100644 dom/media/webspeech/synth/nsSpeechTask.cpp create mode 100644 dom/media/webspeech/synth/nsSpeechTask.h create mode 100644 dom/media/webspeech/synth/nsSynthVoiceRegistry.cpp create mode 100644 dom/media/webspeech/synth/nsSynthVoiceRegistry.h create mode 100644 dom/media/webspeech/synth/speechd/SpeechDispatcherService.cpp create mode 100644 dom/media/webspeech/synth/speechd/SpeechDispatcherService.h create mode 100644 dom/media/webspeech/synth/speechd/components.conf create mode 100644 dom/media/webspeech/synth/speechd/moz.build create mode 100644 dom/media/webspeech/synth/test/common.js create mode 100644 dom/media/webspeech/synth/test/components.conf create mode 100644 dom/media/webspeech/synth/test/file_bfcache_page1.html create mode 100644 dom/media/webspeech/synth/test/file_bfcache_page2.html create mode 100644 dom/media/webspeech/synth/test/file_global_queue.html create mode 100644 dom/media/webspeech/synth/test/file_global_queue_cancel.html create mode 100644 dom/media/webspeech/synth/test/file_global_queue_pause.html create mode 100644 dom/media/webspeech/synth/test/file_indirect_service_events.html create mode 100644 dom/media/webspeech/synth/test/file_setup.html create mode 100644 dom/media/webspeech/synth/test/file_speech_cancel.html create mode 100644 dom/media/webspeech/synth/test/file_speech_error.html create mode 100644 dom/media/webspeech/synth/test/file_speech_queue.html create mode 100644 
dom/media/webspeech/synth/test/file_speech_repeating_utterance.html create mode 100644 dom/media/webspeech/synth/test/file_speech_simple.html create mode 100644 dom/media/webspeech/synth/test/mochitest.ini create mode 100644 dom/media/webspeech/synth/test/nsFakeSynthServices.cpp create mode 100644 dom/media/webspeech/synth/test/nsFakeSynthServices.h create mode 100644 dom/media/webspeech/synth/test/startup/file_voiceschanged.html create mode 100644 dom/media/webspeech/synth/test/startup/mochitest.ini create mode 100644 dom/media/webspeech/synth/test/startup/test_voiceschanged.html create mode 100644 dom/media/webspeech/synth/test/test_bfcache.html create mode 100644 dom/media/webspeech/synth/test/test_global_queue.html create mode 100644 dom/media/webspeech/synth/test/test_global_queue_cancel.html create mode 100644 dom/media/webspeech/synth/test/test_global_queue_pause.html create mode 100644 dom/media/webspeech/synth/test/test_indirect_service_events.html create mode 100644 dom/media/webspeech/synth/test/test_setup.html create mode 100644 dom/media/webspeech/synth/test/test_speech_cancel.html create mode 100644 dom/media/webspeech/synth/test/test_speech_error.html create mode 100644 dom/media/webspeech/synth/test/test_speech_queue.html create mode 100644 dom/media/webspeech/synth/test/test_speech_repeating_utterance.html create mode 100644 dom/media/webspeech/synth/test/test_speech_simple.html create mode 100644 dom/media/webspeech/synth/windows/SapiService.cpp create mode 100644 dom/media/webspeech/synth/windows/SapiService.h create mode 100644 dom/media/webspeech/synth/windows/components.conf create mode 100644 dom/media/webspeech/synth/windows/moz.build (limited to 'dom/media/webspeech') diff --git a/dom/media/webspeech/moz.build b/dom/media/webspeech/moz.build new file mode 100644 index 0000000000..26856a0598 --- /dev/null +++ b/dom/media/webspeech/moz.build @@ -0,0 +1,12 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the 
Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +DIRS = ["synth"] + +if CONFIG["MOZ_WEBSPEECH"]: + DIRS += ["recognition"] + +with Files("**"): + BUG_COMPONENT = ("Core", "Web Speech") diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp new file mode 100644 index 0000000000..e68ccc417e --- /dev/null +++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp @@ -0,0 +1,462 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsThreadUtils.h" +#include "nsXPCOMCIDInternal.h" +#include "OnlineSpeechRecognitionService.h" +#include "nsIFile.h" +#include "SpeechGrammar.h" +#include "SpeechRecognition.h" +#include "SpeechRecognitionAlternative.h" +#include "SpeechRecognitionResult.h" +#include "SpeechRecognitionResultList.h" +#include "nsIObserverService.h" +#include "mozilla/dom/Document.h" +#include "mozilla/Preferences.h" +#include "mozilla/ScopeExit.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/Services.h" +#include "nsDirectoryServiceDefs.h" +#include "nsDirectoryServiceUtils.h" +#include "nsNetUtil.h" +#include "nsContentUtils.h" +#include "nsIChannel.h" +#include "nsIHttpChannel.h" +#include "nsIPrincipal.h" +#include "nsIStreamListener.h" +#include "nsIUploadChannel2.h" +#include "mozilla/dom/ClientIPCTypes.h" +#include "nsStringStream.h" +#include "nsIOutputStream.h" +#include "nsStreamUtils.h" +#include "OpusTrackEncoder.h" +#include "OggWriter.h" +#include "nsIClassOfService.h" +#include +#include +#include +#include + +namespace mozilla { + +using 
namespace dom; + +#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \ + "media.webspeech.service.endpoint" +#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/" +#define MAX_LISTENING_TIME_MS 10000 + +NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService, + nsIStreamListener) + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure, + const char* aFromRawSegment, + uint32_t aToOffset, uint32_t aCount, + uint32_t* aWriteCount) { + nsCString* buf = static_cast(aClosure); + buf->Append(aFromRawSegment, aCount); + *aWriteCount = aCount; + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest, + nsIInputStream* aInputStream, + uint64_t aOffset, + uint32_t aCount) { + MOZ_ASSERT(NS_IsMainThread()); + nsresult rv; + uint32_t readCount; + rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount, + &readCount); + NS_ENSURE_SUCCESS(rv, rv); + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest, + nsresult aStatusCode) { + MOZ_ASSERT(NS_IsMainThread()); + + auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); }); + + if (mAborted) { + return NS_OK; + } + + bool success; + float confidence = 0; + Json::Value root; + Json::CharReaderBuilder builder; + bool parsingSuccessful; + nsAutoCString result; + nsAutoCString hypoValue; + nsAutoCString errorMsg; + SpeechRecognitionErrorCode errorCode; + + SR_LOG("STT Result: %s", mBuf.get()); + + if (NS_FAILED(aStatusCode)) { + success = false; + errorMsg.AssignLiteral("Error connecting to the service."); + errorCode = SpeechRecognitionErrorCode::Network; + } else { + success = true; + UniquePtr const reader(builder.newCharReader()); + parsingSuccessful = + reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, 
nullptr); + if (!parsingSuccessful) { + // there's an internal server error + success = false; + errorMsg.AssignLiteral("Internal server error"); + errorCode = SpeechRecognitionErrorCode::Network; + } else { + result.Assign(root.get("status", "error").asString().c_str()); + if (result.EqualsLiteral("ok")) { + // ok, we have a result + if (!root["data"].empty()) { + hypoValue.Assign(root["data"][0].get("text", "").asString().c_str()); + confidence = root["data"][0].get("confidence", "0").asFloat(); + } else { + success = false; + errorMsg.AssignLiteral("Error reading result data."); + errorCode = SpeechRecognitionErrorCode::Network; + } + } else { + success = false; + errorMsg.Assign(root.get("message", "").asString().c_str()); + errorCode = SpeechRecognitionErrorCode::No_speech; + } + } + } + + if (!success) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg); + } else { + // Declare javascript result events + RefPtr event = new SpeechEvent( + mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); + SpeechRecognitionResultList* resultList = + new SpeechRecognitionResultList(mRecognition); + SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); + + if (mRecognition->MaxAlternatives() > 0) { + SpeechRecognitionAlternative* alternative = + new SpeechRecognitionAlternative(mRecognition); + + alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue); + alternative->mConfidence = confidence; + + result->mItems.AppendElement(alternative); + } + resultList->mItems.AppendElement(result); + + event->mRecognitionResultList = resultList; + NS_DispatchToMainThread(event); + } + + return NS_OK; +} + +OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default; +OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default; + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Initialize( + WeakPtr aSpeechRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + 
mWriter = MakeUnique(); + mRecognition = new nsMainThreadPtrHolder( + "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition); + mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding(); + MOZ_ASSERT(mEncodeTaskQueue); + return NS_OK; +} + +void OnlineSpeechRecognitionService::EncoderFinished() { + MOZ_ASSERT(!NS_IsMainThread()); + MOZ_ASSERT(mEncodedAudioQueue.IsFinished()); + + while (RefPtr frame = mEncodedAudioQueue.PopFront()) { + AutoTArray, 1> frames({frame}); + DebugOnly rv = + mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream() + ? ContainerWriter::END_OF_STREAM + : 0); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + } + + mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED); + MOZ_ASSERT(mWriter->IsWritingComplete()); + + NS_DispatchToMainThread( + NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this, + &OnlineSpeechRecognitionService::DoSTT)); +} + +void OnlineSpeechRecognitionService::EncoderInitialized() { + MOZ_ASSERT(!NS_IsMainThread()); + AutoTArray, 1> metadata; + metadata.AppendElement(mAudioEncoder->GetMetadata()); + if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { + SR_LOG("wrong meta data type!"); + MOZ_ASSERT_UNREACHABLE(); + } + + nsresult rv = mWriter->SetMetadata(metadata); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + + Unused << rv; +} + +void OnlineSpeechRecognitionService::EncoderError() { + MOZ_ASSERT(!NS_IsMainThread()); + SR_LOG("Error encoding frames."); + mEncodedData.Clear(); + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::DispatchError", + [this, self = RefPtr(this)]() { + if (!mRecognition) { + return; + } + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Encoder error"); + })); +} + +NS_IMETHODIMP 
+OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, + int32_t aSampleRate) { + MOZ_ASSERT(!NS_IsMainThread()); + int64_t duration = aAudioSegment->GetDuration(); + if (duration <= 0) { + return NS_OK; + } + + if (!mAudioEncoder) { + mSpeechEncoderListener = new SpeechEncoderListener(this); + mAudioEncoder = + MakeUnique(aSampleRate, mEncodedAudioQueue); + RefPtr mEncoderThread = AbstractThread::GetCurrent(); + mAudioEncoder->SetWorkerThread(mEncoderThread); + mAudioEncoder->RegisterListener(mSpeechEncoderListener); + } + + mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment)); + + TimeStamp now = TimeStamp::Now(); + if (mFirstIteration.IsNull()) { + mFirstIteration = now; + } + + if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) { + NS_DispatchToMainThread(NS_NewRunnableFunction( + "SpeechRecognition::Stop", + [this, self = RefPtr(this)]() { + if (!mRecognition) { + return; + } + mRecognition->Stop(); + })); + + return NS_OK; + } + + return NS_OK; +} + +void OnlineSpeechRecognitionService::DoSTT() { + MOZ_ASSERT(NS_IsMainThread()); + + if (mAborted) { + return; + } + + nsresult rv; + nsCOMPtr chan; + nsCOMPtr uri; + nsAutoCString speechRecognitionEndpoint; + nsAutoCString prefEndpoint; + nsAutoString language; + + Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT, + prefEndpoint); + + if (!prefEndpoint.IsEmpty()) { + speechRecognitionEndpoint = prefEndpoint; + } else { + speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT; + } + + rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr, + nullptr); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Unknown URI"); + return; + } + + nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT; + nsLoadFlags loadFlags = + nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER; + 
nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER; + + nsPIDOMWindowInner* window = mRecognition->GetOwner(); + if (NS_WARN_IF(!window)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No window"); + return; + } + + Document* doc = window->GetExtantDoc(); + if (NS_WARN_IF(!doc)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Aborted, "No document"); + // Stop here: the channel below is created with doc->NodePrincipal(), + // which must not be reached with a null doc + return; + } + rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags, + contentPolicy, nullptr, nullptr, nullptr, nullptr, + loadFlags); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open channel"); + return; + } + + nsCOMPtr httpChan = do_QueryInterface(chan); + if (httpChan) { + rv = httpChan->SetRequestMethod("POST"_ns); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + if (httpChan) { + mRecognition->GetLang(language); + // Accept-Language-STT is a custom header of our backend server used to set + // the language of the speech sample being submitted by the client + rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns, + NS_ConvertUTF16toUTF8(language), false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the transcription by default + rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Tell the server to not store the sample by default + rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + // Set the product tag as the web speech api + rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + + nsCOMPtr cos(do_QueryInterface(chan)); + if (cos) { + cos->AddClassFlags(nsIClassOfService::UrgentStart); + } + + nsCOMPtr 
uploadChan = do_QueryInterface(chan); + if (uploadChan) { + nsCOMPtr bodyStream; + uint32_t length = 0; + for (const nsTArray& chunk : mEncodedData) { + length += chunk.Length(); + } + + nsTArray audio; + if (!audio.SetCapacity(length, fallible)) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Audio_capture, "Allocation error"); + return; + } + + for (const nsTArray& chunk : mEncodedData) { + audio.AppendElements(chunk); + } + + mEncodedData.Clear(); + + rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio)); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Failed to open stream"); + return; + } + if (bodyStream) { + rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns, + length, "POST"_ns, false); + MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); + } + } + + rv = chan->AsyncOpen(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, "Internal server error"); + } +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::SoundEnd() { + MOZ_ASSERT(NS_IsMainThread()); + + if (!mEncodeTaskQueue) { + // Not initialized + return NS_OK; + } + + nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( + "OnlineSpeechRecognitionService::SoundEnd", + [this, self = RefPtr(this)]() { + if (mAudioEncoder) { + mAudioEncoder->NotifyEndOfStream(); + mAudioEncoder->UnregisterListener(mSpeechEncoderListener); + mSpeechEncoderListener = nullptr; + mAudioEncoder = nullptr; + EncoderFinished(); + } + })); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + Unused << rv; + + mEncodeTaskQueue = nullptr; + + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::ValidateAndSetGrammarList( + SpeechGrammar* aSpeechGrammar, + nsISpeechGrammarCompilationCallback* aCallback) { + 
// This is an online LVCSR (STT) service, + // so we don't need to set a grammar + return NS_OK; +} + +NS_IMETHODIMP +OnlineSpeechRecognitionService::Abort() { + MOZ_ASSERT(NS_IsMainThread()); + if (mAborted) { + return NS_OK; + } + mAborted = true; + return SoundEnd(); +} +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h new file mode 100644 index 0000000000..c049e5046a --- /dev/null +++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h @@ -0,0 +1,132 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_OnlineRecognitionService_h +#define mozilla_dom_OnlineRecognitionService_h + +#include "nsCOMPtr.h" +#include "nsTArray.h" +#include "nsISpeechRecognitionService.h" +#include "speex/speex_resampler.h" +#include "nsIStreamListener.h" +#include "OpusTrackEncoder.h" +#include "ContainerWriter.h" + +#define NS_ONLINE_SPEECH_RECOGNITION_SERVICE_CID \ + {0x0ff5ce56, \ + 0x5b09, \ + 0x4db8, \ + {0xad, 0xc6, 0x82, 0x66, 0xaf, 0x95, 0xf8, 0x64}}; + +namespace mozilla { + +namespace ipc { +class PrincipalInfo; +} // namespace ipc + +/** + * Online implementation of the nsISpeechRecognitionService interface + */ +class OnlineSpeechRecognitionService : public nsISpeechRecognitionService, + public nsIStreamListener { + public: + // Add XPCOM glue code + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSISPEECHRECOGNITIONSERVICE + NS_DECL_NSIREQUESTOBSERVER + NS_DECL_NSISTREAMLISTENER + + /** + * Listener responsible for handling the events raised by the TrackEncoder + */ + class SpeechEncoderListener : public TrackEncoderListener { + public: + explicit 
SpeechEncoderListener(OnlineSpeechRecognitionService* aService) + : mService(aService), mOwningThread(AbstractThread::GetCurrent()) {} + + void Started(TrackEncoder* aEncoder) override {} + + void Initialized(TrackEncoder* aEncoder) override { + MOZ_ASSERT(mOwningThread->IsCurrentThreadIn()); + mService->EncoderInitialized(); + } + + void Error(TrackEncoder* aEncoder) override { + MOZ_ASSERT(mOwningThread->IsCurrentThreadIn()); + mService->EncoderError(); + } + + private: + const RefPtr mService; + const RefPtr mOwningThread; + }; + + /** + * Default constructs a OnlineSpeechRecognitionService + */ + OnlineSpeechRecognitionService(); + + /** + * Called by SpeechEncoderListener when the AudioTrackEncoder has been + * initialized. + */ + void EncoderInitialized(); + + /** + * Called after the AudioTrackEncoder has encoded all data for us to wrap in a + * container and pass along. + */ + void EncoderFinished(); + + /** + * Called by SpeechEncoderListener when the AudioTrackEncoder has + * encountered an error. 
+ */ + void EncoderError(); + + private: + /** + * Private destructor to prevent bypassing of reference counting + */ + virtual ~OnlineSpeechRecognitionService(); + + /** The associated SpeechRecognition */ + nsMainThreadPtrHandle mRecognition; + + /** + * Builds a mock SpeechRecognitionResultList + */ + dom::SpeechRecognitionResultList* BuildMockResultList(); + + /** + * Method responsible for uploading the audio to the remote endpoint + */ + void DoSTT(); + + // Encoded and packaged ogg audio data + nsTArray> mEncodedData; + // Member responsible for holding a reference to the TrackEncoderListener + RefPtr mSpeechEncoderListener; + // MediaQueue fed encoded data by mAudioEncoder + MediaQueue mEncodedAudioQueue; + // Encoder responsible for encoding the frames from pcm to opus which is the + // format supported by our backend + UniquePtr mAudioEncoder; + // Object responsible for wrapping the opus frames into an ogg container + UniquePtr mWriter; + // Member responsible for storing the json string returned by the endpoint + nsCString mBuf; + // Used to calculate a ceiling on the time spent listening. + TimeStamp mFirstIteration; + // flag responsible to control if the user choose to abort + bool mAborted = false; + // reference to the audio encoder queue + RefPtr mEncodeTaskQueue; +}; + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechGrammar.cpp b/dom/media/webspeech/recognition/SpeechGrammar.cpp new file mode 100644 index 0000000000..de6e9fa30f --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammar.cpp @@ -0,0 +1,57 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "SpeechGrammar.h" + +#include "mozilla/ErrorResult.h" +#include "mozilla/dom/SpeechGrammarBinding.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammar, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammar) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammar) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammar) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechGrammar::SpeechGrammar(nsISupports* aParent) : mParent(aParent) {} + +SpeechGrammar::~SpeechGrammar() = default; + +already_AddRefed SpeechGrammar::Constructor( + const GlobalObject& aGlobal) { + RefPtr speechGrammar = + new SpeechGrammar(aGlobal.GetAsSupports()); + return speechGrammar.forget(); +} + +nsISupports* SpeechGrammar::GetParentObject() const { return mParent; } + +JSObject* SpeechGrammar::WrapObject(JSContext* aCx, + JS::Handle aGivenProto) { + return SpeechGrammar_Binding::Wrap(aCx, this, aGivenProto); +} + +void SpeechGrammar::GetSrc(nsString& aRetVal, ErrorResult& aRv) const { + aRetVal = mSrc; +} + +void SpeechGrammar::SetSrc(const nsAString& aArg, ErrorResult& aRv) { + mSrc = aArg; +} + +float SpeechGrammar::GetWeight(ErrorResult& aRv) const { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return 0; +} + +void SpeechGrammar::SetWeight(float aArg, ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechGrammar.h b/dom/media/webspeech/recognition/SpeechGrammar.h new file mode 100644 index 0000000000..0dee1e9792 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammar.h @@ -0,0 +1,64 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechGrammar_h +#define mozilla_dom_SpeechGrammar_h + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +namespace mozilla { +class ErrorResult; + +namespace dom { + +class GlobalObject; + +class SpeechGrammar final : public nsISupports, public nsWrapperCache { + public: + explicit SpeechGrammar(nsISupports* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammar) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + static already_AddRefed Constructor( + const GlobalObject& aGlobal); + + static already_AddRefed WebkitSpeechGrammar( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal); + } + + void GetSrc(nsString& aRetVal, ErrorResult& aRv) const; + + void SetSrc(const nsAString& aArg, ErrorResult& aRv); + + float GetWeight(ErrorResult& aRv) const; + + void SetWeight(float aArg, ErrorResult& aRv); + + private: + ~SpeechGrammar(); + + nsCOMPtr mParent; + + nsString mSrc; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.cpp b/dom/media/webspeech/recognition/SpeechGrammarList.cpp new file mode 100644 index 0000000000..4317452057 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammarList.cpp @@ -0,0 +1,76 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "SpeechGrammarList.h" + +#include "mozilla/dom/SpeechGrammar.h" +#include "mozilla/dom/SpeechGrammarListBinding.h" +#include "mozilla/ErrorResult.h" +#include "nsCOMPtr.h" +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechGrammarList, mParent, mItems) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechGrammarList) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechGrammarList) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechGrammarList) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechGrammarList::SpeechGrammarList(nsISupports* aParent) : mParent(aParent) {} + +SpeechGrammarList::~SpeechGrammarList() = default; + +already_AddRefed SpeechGrammarList::Constructor( + const GlobalObject& aGlobal) { + RefPtr speechGrammarList = + new SpeechGrammarList(aGlobal.GetAsSupports()); + return speechGrammarList.forget(); +} + +JSObject* SpeechGrammarList::WrapObject(JSContext* aCx, + JS::Handle aGivenProto) { + return SpeechGrammarList_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechGrammarList::GetParentObject() const { return mParent; } + +uint32_t SpeechGrammarList::Length() const { return mItems.Length(); } + +already_AddRefed SpeechGrammarList::Item(uint32_t aIndex, + ErrorResult& aRv) { + RefPtr result = mItems.ElementAt(aIndex); + return result.forget(); +} + +void SpeechGrammarList::AddFromURI(const nsAString& aSrc, + const Optional& aWeight, + ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechGrammarList::AddFromString(const nsAString& aString, + const Optional& aWeight, + ErrorResult& aRv) { + SpeechGrammar* speechGrammar = new SpeechGrammar(mParent); + speechGrammar->SetSrc(aString, aRv); + mItems.AppendElement(speechGrammar); +} + +already_AddRefed SpeechGrammarList::IndexedGetter( + uint32_t aIndex, bool& aPresent, ErrorResult& aRv) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + ErrorResult 
rv; + aPresent = true; + return Item(aIndex, rv); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechGrammarList.h b/dom/media/webspeech/recognition/SpeechGrammarList.h new file mode 100644 index 0000000000..7f1e09cd9e --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechGrammarList.h @@ -0,0 +1,73 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechGrammarList_h +#define mozilla_dom_SpeechGrammarList_h + +#include "mozilla/Attributes.h" +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsTArray.h" +#include "nsWrapperCache.h" + +struct JSContext; + +namespace mozilla { + +class ErrorResult; + +namespace dom { + +class GlobalObject; +class SpeechGrammar; +template +class Optional; + +class SpeechGrammarList final : public nsISupports, public nsWrapperCache { + public: + explicit SpeechGrammarList(nsISupports* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechGrammarList) + + static already_AddRefed Constructor( + const GlobalObject& aGlobal); + + static already_AddRefed WebkitSpeechGrammarList( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal); + } + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed Item(uint32_t aIndex, ErrorResult& aRv); + + void AddFromURI(const nsAString& aSrc, const Optional& aWeight, + ErrorResult& aRv); + + void AddFromString(const nsAString& aString, const Optional& aWeight, + ErrorResult& aRv); + + already_AddRefed IndexedGetter(uint32_t aIndex, bool& aPresent, + 
ErrorResult& aRv); + + private: + ~SpeechGrammarList(); + + nsCOMPtr mParent; + + nsTArray> mItems; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp new file mode 100644 index 0000000000..e3bf531218 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp @@ -0,0 +1,1170 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognition.h" + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" + +#include "mozilla/dom/AudioStreamTrack.h" +#include "mozilla/dom/BindingUtils.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/SpeechRecognitionBinding.h" +#include "mozilla/dom/MediaStreamTrackBinding.h" +#include "mozilla/dom/MediaStreamError.h" +#include "mozilla/dom/RootedDictionary.h" +#include "mozilla/dom/SpeechGrammar.h" +#include "mozilla/MediaManager.h" +#include "mozilla/Preferences.h" +#include "mozilla/ResultVariant.h" +#include "mozilla/Services.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/AbstractThread.h" +#include "VideoUtils.h" +#include "AudioSegment.h" +#include "MediaEnginePrefs.h" +#include "endpointer.h" + +#include "mozilla/dom/SpeechRecognitionEvent.h" +#include "nsComponentManagerUtils.h" +#include "nsContentUtils.h" +#include "mozilla/dom/Document.h" +#include "nsIObserverService.h" +#include "nsIPermissionManager.h" +#include "nsIPrincipal.h" +#include "nsPIDOMWindow.h" +#include "nsServiceManagerUtils.h" +#include "nsQueryObject.h" +#include "SpeechTrackListener.h" + +#include + +// Undo the windows.h damage +#if defined(XP_WIN) && defined(GetMessage) +# undef 
GetMessage +#endif + +namespace mozilla::dom { + +#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default" +#define DEFAULT_RECOGNITION_SERVICE "online" + +#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length" +#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \ + "media.webspeech.long_silence_length" +#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \ + "media.webspeech.long_speech_length" +#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \ + "media.webspeech.recognition.timeout" + +static const uint32_t kSAMPLE_RATE = 16000; + +// number of frames corresponding to 300ms of audio to send to endpointer while +// it's in environment estimation mode +// kSAMPLE_RATE frames = 1s, kESTIMATION_FRAMES frames = 300ms +static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000; + +LogModule* GetSpeechRecognitionLog() { + static LazyLogModule sLog("SpeechRecognition"); + return sLog; +} +#define SR_LOG(...) \ + MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) + +namespace { +class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker { + public: + SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition, + const nsString& aName) + : media::ShutdownBlocker(aName), mRecognition(aRecognition) {} + + NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override { + MOZ_ASSERT(NS_IsMainThread()); + // AbortSilently will eventually clear the blocker. 
+ mRecognition->Abort(); + return NS_OK; + } + + private: + const RefPtr mRecognition; +}; + +enum class ServiceCreationError { + ServiceNotFound, +}; + +Result, ServiceCreationError> +CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow, + SpeechRecognition* aRecognition, + const nsAString& aLang) { + nsAutoCString speechRecognitionServiceCID; + + nsAutoCString prefValue; + Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue); + nsAutoCString speechRecognitionService; + + if (!prefValue.IsEmpty()) { + speechRecognitionService = prefValue; + } else { + speechRecognitionService = DEFAULT_RECOGNITION_SERVICE; + } + + if (StaticPrefs::media_webspeech_test_fake_recognition_service()) { + speechRecognitionServiceCID = + NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake"; + } else { + speechRecognitionServiceCID = + nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) + + speechRecognitionService; + } + + nsresult rv; + nsCOMPtr recognitionService; + recognitionService = + do_CreateInstance(speechRecognitionServiceCID.get(), &rv); + if (!recognitionService) { + return Err(ServiceCreationError::ServiceNotFound); + } + + return recognitionService; +} +} // namespace + +NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition, + DOMEventTargetHelper, mStream, + mTrack, mRecognitionService, + mSpeechGrammarList) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition) + NS_INTERFACE_MAP_ENTRY(nsIObserver) +NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper) + +NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper) +NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper) + +SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow) + : DOMEventTargetHelper(aOwnerWindow), + mEndpointer(kSAMPLE_RATE), + mAudioSamplesPerChunk(mEndpointer.FrameSize()), + mSpeechDetectionTimer(NS_NewTimer()), + mSpeechGrammarList(new SpeechGrammarList(GetOwner())), + mContinuous(false), + 
mInterimResults(false), + mMaxAlternatives(1) { + SR_LOG("created SpeechRecognition"); + + if (StaticPrefs::media_webspeech_test_enable()) { + nsCOMPtr obs = services::GetObserverService(); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); + } + + mEndpointer.set_speech_input_complete_silence_length( + Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000)); + mEndpointer.set_long_speech_input_complete_silence_length( + Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000)); + // The long-speech threshold has its own pref; it must not be read from the + // silence-length pref. + mEndpointer.set_long_speech_length(Preferences::GetInt( + PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000)); + + mSpeechDetectionTimeoutMs = + Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000); + + Reset(); +} + +SpeechRecognition::~SpeechRecognition() = default; + +bool SpeechRecognition::StateBetween(FSMState begin, FSMState end) { + return mCurrentState >= begin && mCurrentState <= end; +} + +void SpeechRecognition::SetState(FSMState state) { + mCurrentState = state; + SR_LOG("Transitioned to state %s", GetName(mCurrentState)); +} + +JSObject* SpeechRecognition::WrapObject(JSContext* aCx, + JS::Handle aGivenProto) { + return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto); +} + +bool SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal) { + nsCOMPtr principal = nsContentUtils::ObjectPrincipal(aGlobal); + + nsresult rv; + nsCOMPtr mgr = + do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + + uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION; + rv = mgr->TestExactPermissionFromPrincipal(principal, "speech-recognition"_ns, + &speechRecognition); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + + bool hasPermission = + (speechRecognition == nsIPermissionManager::ALLOW_ACTION); + + return (hasPermission || +
StaticPrefs::media_webspeech_recognition_force_enable() || + StaticPrefs::media_webspeech_test_enable()) && + StaticPrefs::media_webspeech_recognition_enable(); +} + +already_AddRefed SpeechRecognition::Constructor( + const GlobalObject& aGlobal, ErrorResult& aRv) { + nsCOMPtr win = do_QueryInterface(aGlobal.GetAsSupports()); + if (!win) { + aRv.Throw(NS_ERROR_FAILURE); + return nullptr; + } + + RefPtr object = new SpeechRecognition(win); + return object.forget(); +} + +void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) { + SR_LOG("Processing %s, current state is %s", GetName(aEvent), + GetName(mCurrentState)); + + if (mAborted && aEvent->mType != EVENT_ABORT) { + // ignore all events while aborting + return; + } + + Transition(aEvent); +} + +void SpeechRecognition::Transition(SpeechEvent* aEvent) { + switch (mCurrentState) { + case STATE_IDLE: + switch (aEvent->mType) { + case EVENT_START: + // TODO: may want to time out if we wait too long + // for user to approve + WaitForAudioData(aEvent); + break; + case EVENT_STOP: + case EVENT_ABORT: + case EVENT_AUDIO_DATA: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_STARTING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + StartedAudioCapture(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_STOP: + ResetAndEnd(); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_ESTIMATING: + switch 
(aEvent->mType) { + case EVENT_AUDIO_DATA: + WaitForEstimation(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_WAITING_FOR_SPEECH: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + DetectSpeech(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", + GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_RECOGNIZING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + WaitForSpeechEnd(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_WAITING_FOR_RESULT: + switch (aEvent->mType) { + case EVENT_STOP: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: +
AbortError(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + NotifyFinalResult(aEvent); + break; + case EVENT_AUDIO_DATA: + DoNothing(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_START: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", + GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + case STATE_ABORTING: + switch (aEvent->mType) { + case EVENT_STOP: + case EVENT_ABORT: + case EVENT_AUDIO_DATA: + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent)); + MOZ_CRASH(); + default: + MOZ_CRASH("Invalid event"); + } + break; + default: + MOZ_CRASH("Invalid state"); + } +} + +/* + * Handle a segment of recorded audio data. + * Returns the number of samples that were processed. 
+ */ +uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, + TrackRate aTrackRate) { + AudioSegment::ChunkIterator iterator(*aSegment); + uint32_t samples = 0; + while (!iterator.IsEnded()) { + float out; + mEndpointer.ProcessAudio(*iterator, &out); + samples += iterator->GetDuration(); + iterator.Next(); + } + + // we need to call the nsISpeechRecognitionService::ProcessAudioSegment + // in a separate thread so that any eventual encoding or pre-processing + // of the audio does not block the main thread + nsresult rv = mEncodeTaskQueue->Dispatch( + NewRunnableMethod, TrackRate>( + "nsISpeechRecognitionService::ProcessAudioSegment", + mRecognitionService, + &nsISpeechRecognitionService::ProcessAudioSegment, + std::move(*aSegment), aTrackRate)); + MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); + Unused << rv; + return samples; +} + +/**************************************************************************** + * FSM Transition functions + * + * If a transition function may cause a DOM event to be fired, + * it may also be re-entered, since the event handler may cause the + * event loop to spin and new SpeechEvents to be processed. + * + * Rules: + * 1) These methods should call SetState as soon as possible. + * 2) If these methods dispatch DOM events, or call methods that dispatch + * DOM events, that should be done as late as possible. + * 3) If anything must happen after dispatching a DOM event, make sure + * the state is still what the method expected it to be. + ****************************************************************************/ + +void SpeechRecognition::Reset() { + SetState(STATE_IDLE); + + // This breaks potential ref-cycles. 
+ mRecognitionService = nullptr; + + ++mStreamGeneration; + if (mStream) { + mStream->UnregisterTrackListener(this); + mStream = nullptr; + } + mTrack = nullptr; + mTrackIsOwned = false; + mStopRecordingPromise = nullptr; + mEncodeTaskQueue = nullptr; + mEstimationSamples = 0; + mBufferedSamples = 0; + mSpeechDetectionTimer->Cancel(); + mAborted = false; +} + +void SpeechRecognition::ResetAndEnd() { + Reset(); + DispatchTrustedEvent(u"end"_ns); +} + +void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) { + SetState(STATE_STARTING); +} + +void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) { + SetState(STATE_ESTIMATING); + + mEndpointer.SetEnvironmentEstimationMode(); + mEstimationSamples += + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + + DispatchTrustedEvent(u"audiostart"_ns); + if (mCurrentState == STATE_ESTIMATING) { + DispatchTrustedEvent(u"start"_ns); + } +} + +void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) { + SetState(STATE_WAITING_FOR_RESULT); + + MOZ_ASSERT(mRecognitionService, "Service deleted before recording done"); + + // This will run SoundEnd on the service just before StopRecording begins + // shutting the encode thread down. 
+ mSpeechListener->mRemovedPromise->Then( + GetCurrentSerialEventTarget(), __func__, + [service = mRecognitionService] { service->SoundEnd(); }); + + StopRecording(); +} + +void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) { + SetState(STATE_ESTIMATING); + + mEstimationSamples += + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEstimationSamples > kESTIMATION_SAMPLES) { + mEndpointer.SetUserInputMode(); + SetState(STATE_WAITING_FOR_SPEECH); + } +} + +void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) { + SetState(STATE_WAITING_FOR_SPEECH); + + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEndpointer.DidStartReceivingSpeech()) { + mSpeechDetectionTimer->Cancel(); + SetState(STATE_RECOGNIZING); + DispatchTrustedEvent(u"speechstart"_ns); + } +} + +void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) { + SetState(STATE_RECOGNIZING); + + ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); + if (mEndpointer.speech_input_complete()) { + DispatchTrustedEvent(u"speechend"_ns); + + if (mCurrentState == STATE_RECOGNIZING) { + // FIXME: StopRecordingAndRecognize should only be called for single + // shot services for continuous we should just inform the service + StopRecordingAndRecognize(aEvent); + } + } +} + +void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) { + ResetAndEnd(); + + RootedDictionary init(RootingCx()); + init.mBubbles = true; + init.mCancelable = false; + // init.mResultIndex = 0; + init.mResults = aEvent->mRecognitionResultList; + init.mInterpretation = JS::NullValue(); + // init.mEmma = nullptr; + + RefPtr event = + SpeechRecognitionEvent::Constructor(this, u"result"_ns, init); + event->SetTrusted(true); + + DispatchEvent(*event); +} + +void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {} + +void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) { + if (mRecognitionService) { + if (mTrack) { + // This will run Abort on the service just before 
StopRecording begins + // shutting the encode thread down. + mSpeechListener->mRemovedPromise->Then( + GetCurrentSerialEventTarget(), __func__, + [service = mRecognitionService] { service->Abort(); }); + } else { + // Recording hasn't started yet. We can just call Abort(). + mRecognitionService->Abort(); + } + } + + StopRecording()->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr(this), this] { ResetAndEnd(); }); + + SetState(STATE_ABORTING); +} + +void SpeechRecognition::AbortError(SpeechEvent* aEvent) { + AbortSilently(aEvent); + NotifyError(aEvent); +} + +void SpeechRecognition::NotifyError(SpeechEvent* aEvent) { + aEvent->mError->SetTrusted(true); + + DispatchEvent(*aEvent->mError); +} + +/************************************** + * Event triggers and other functions * + **************************************/ +NS_IMETHODIMP +SpeechRecognition::StartRecording(RefPtr& aTrack) { + // hold a reference so that the underlying track doesn't get collected. + mTrack = aTrack; + MOZ_ASSERT(!mTrack->Ended()); + + mSpeechListener = new SpeechTrackListener(this); + mTrack->AddListener(mSpeechListener); + + nsString blockerName; + blockerName.AppendPrintf("SpeechRecognition %p shutdown", this); + mShutdownBlocker = + MakeAndAddRef(this, blockerName); + media::MustGetShutdownBarrier()->AddBlocker( + mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__, + u"SpeechRecognition shutdown"_ns); + + mEndpointer.StartSession(); + + return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs, + nsITimer::TYPE_ONE_SHOT); +} + +RefPtr SpeechRecognition::StopRecording() { + if (!mTrack) { + // Recording wasn't started, or has already been stopped. + if (mStream) { + // Ensure we don't start recording because a track became available + // before we get reset. 
+ mStream->UnregisterTrackListener(this); + } + return GenericNonExclusivePromise::CreateAndResolve(true, __func__); + } + + if (mStopRecordingPromise) { + return mStopRecordingPromise; + } + + mTrack->RemoveListener(mSpeechListener); + if (mTrackIsOwned) { + mTrack->Stop(); + } + + mEndpointer.EndSession(); + DispatchTrustedEvent(u"audioend"_ns); + + // Block shutdown until the speech track listener has been removed from the + // MSG, as it holds a reference to us, and we reference the world, which we + // don't want to leak. + mStopRecordingPromise = + mSpeechListener->mRemovedPromise + ->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr(this), this] { + SR_LOG("Shutting down encoding thread"); + return mEncodeTaskQueue->BeginShutdown(); + }, + [] { + MOZ_CRASH("Unexpected rejection"); + return ShutdownPromise::CreateAndResolve(false, __func__); + }) + ->Then( + GetCurrentSerialEventTarget(), __func__, + [self = RefPtr(this), this] { + media::MustGetShutdownBarrier()->RemoveBlocker( + mShutdownBlocker); + mShutdownBlocker = nullptr; + + MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE); + return GenericNonExclusivePromise::CreateAndResolve(true, + __func__); + }, + [] { + MOZ_CRASH("Unexpected rejection"); + return GenericNonExclusivePromise::CreateAndResolve(false, + __func__); + }); + return mStopRecordingPromise; +} + +NS_IMETHODIMP +SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread"); + + if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) && + StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) { + DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, + SpeechRecognitionErrorCode::No_speech, + "No speech detected (timeout)"); + } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { + nsCOMPtr obs = services::GetObserverService(); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); + 
obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); + } else if (StaticPrefs::media_webspeech_test_fake_fsm_events() && + !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) { + ProcessTestEventRequest(aSubject, nsDependentString(aData)); + } + + return NS_OK; +} + +void SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, + const nsAString& aEventName) { + if (aEventName.EqualsLiteral("EVENT_ABORT")) { + Abort(); + } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) { + DispatchError( + SpeechRecognition::EVENT_AUDIO_ERROR, + SpeechRecognitionErrorCode::Audio_capture, // TODO different codes? + "AUDIO_ERROR test event"); + } else { + NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(), + "Got request for fake recognition service event, but " + "media.webspeech.test.fake_recognition_service is unset"); + + // let the fake recognition service handle the request + } +} + +already_AddRefed SpeechRecognition::Grammars() const { + RefPtr speechGrammarList = mSpeechGrammarList; + return speechGrammarList.forget(); +} + +void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) { + mSpeechGrammarList = &aArg; +} + +void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; } + +void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; } + +bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const { + return mContinuous; +} + +void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) { + mContinuous = aArg; +} + +bool SpeechRecognition::InterimResults() const { return mInterimResults; } + +void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; } + +uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; } + +void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) { + mMaxAlternatives = aArg; +} + +void SpeechRecognition::GetServiceURI(nsString& aRetVal, + ErrorResult& aRv) const { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); 
+} + +void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) { + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); +} + +void SpeechRecognition::Start(const Optional>& aStream, + CallerType aCallerType, ErrorResult& aRv) { + if (mCurrentState != STATE_IDLE) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return; + } + + if (!SetRecognitionService(aRv)) { + return; + } + + if (!ValidateAndSetGrammarList(aRv)) { + return; + } + + mEncodeTaskQueue = + TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER), + "WebSpeechEncoderThread"); + + nsresult rv; + rv = mRecognitionService->Initialize(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + MediaStreamConstraints constraints; + constraints.mAudio.SetAsBoolean() = true; + + if (aStream.WasPassed()) { + mStream = &aStream.Value(); + mTrackIsOwned = false; + mStream->RegisterTrackListener(this); + nsTArray> tracks; + mStream->GetAudioTracks(tracks); + for (const RefPtr& track : tracks) { + if (!track->Ended()) { + NotifyTrackAdded(track); + break; + } + } + } else { + mTrackIsOwned = true; + nsPIDOMWindowInner* win = GetOwner(); + if (!win || !win->IsFullyActive()) { + aRv.ThrowInvalidStateError("The document is not fully active."); + return; + } + AutoNoJSAPI nojsapi; + RefPtr self(this); + MediaManager::Get() + ->GetUserMedia(win, constraints, aCallerType) + ->Then( + GetCurrentSerialEventTarget(), __func__, + [this, self, + generation = mStreamGeneration](RefPtr&& aStream) { + nsTArray> tracks; + aStream->GetAudioTracks(tracks); + if (mAborted || mCurrentState != STATE_STARTING || + mStreamGeneration != generation) { + // We were probably aborted. Exit early. 
+ for (const RefPtr& track : tracks) { + track->Stop(); + } + return; + } + mStream = std::move(aStream); + mStream->RegisterTrackListener(this); + for (const RefPtr& track : tracks) { + if (!track->Ended()) { + NotifyTrackAdded(track); + } + } + }, + [this, self, + generation = mStreamGeneration](RefPtr&& error) { + if (mAborted || mCurrentState != STATE_STARTING || + mStreamGeneration != generation) { + // We were probably aborted. Exit early. + return; + } + SpeechRecognitionErrorCode errorCode; + + if (error->mName == MediaMgrError::Name::NotAllowedError) { + errorCode = SpeechRecognitionErrorCode::Not_allowed; + } else { + errorCode = SpeechRecognitionErrorCode::Audio_capture; + } + DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode, + error->mMessage); + }); + } + + RefPtr event = new SpeechEvent(this, EVENT_START); + NS_DispatchToMainThread(event); +} + +bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) { + if (!GetOwner()) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + // See: + // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang + nsAutoString lang; + if (!mLang.IsEmpty()) { + lang = mLang; + } else { + nsCOMPtr document = GetOwner()->GetExtantDoc(); + if (!document) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + nsCOMPtr element = document->GetRootElement(); + if (!element) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + nsAutoString lang; + element->GetLang(lang); + } + + auto result = CreateSpeechRecognitionService(GetOwner(), this, lang); + + if (result.isErr()) { + switch (result.unwrapErr()) { + case ServiceCreationError::ServiceNotFound: + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + break; + default: + MOZ_CRASH("Unknown error"); + } + return false; + } + + mRecognitionService = result.unwrap(); + MOZ_DIAGNOSTIC_ASSERT(mRecognitionService); + return true; +} + +bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) { + 
if (!mSpeechGrammarList) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + uint32_t grammarListLength = mSpeechGrammarList->Length(); + for (uint32_t count = 0; count < grammarListLength; ++count) { + RefPtr speechGrammar = mSpeechGrammarList->Item(count, aRv); + if (aRv.Failed()) { + return false; + } + if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList( + speechGrammar.get(), nullptr))) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + } + + return true; +} + +void SpeechRecognition::Stop() { + RefPtr event = new SpeechEvent(this, EVENT_STOP); + NS_DispatchToMainThread(event); +} + +void SpeechRecognition::Abort() { + if (mAborted) { + return; + } + + mAborted = true; + + RefPtr event = new SpeechEvent(this, EVENT_ABORT); + NS_DispatchToMainThread(event); +} + +void SpeechRecognition::NotifyTrackAdded( + const RefPtr& aTrack) { + if (mTrack) { + return; + } + + RefPtr audioTrack = aTrack->AsAudioStreamTrack(); + if (!audioTrack) { + return; + } + + if (audioTrack->Ended()) { + return; + } + + StartRecording(audioTrack); +} + +void SpeechRecognition::DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const nsACString& aMessage) { + MOZ_ASSERT(NS_IsMainThread()); + MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR || + aErrorType == EVENT_AUDIO_ERROR, + "Invalid error type!"); + + RefPtr srError = + new SpeechRecognitionError(nullptr, nullptr, nullptr); + + srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode, + aMessage); + + RefPtr event = new SpeechEvent(this, aErrorType); + event->mError = srError; + NS_DispatchToMainThread(event); +} + +/* + * Buffer audio samples into mAudioSamplesBuffer until it holds + * mAudioSamplesPerChunk samples. Updates mBufferedSamples and returns the + * number of samples that were buffered.
+ */ +uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples, + uint32_t aSampleCount) { + MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk); + MOZ_ASSERT(mAudioSamplesBuffer); + + int16_t* samplesBuffer = static_cast(mAudioSamplesBuffer->Data()); + size_t samplesToCopy = + std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples); + + PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy); + + mBufferedSamples += samplesToCopy; + return samplesToCopy; +} + +/* + * Split a samples buffer of a given size into chunks of + * mAudioSamplesPerChunk samples each. The chunks are appended to the + * array received as argument; trailing samples that do not fill a + * whole chunk are not copied. + * Returns the offset of the end of the last chunk that was + * created. + */ +uint32_t SpeechRecognition::SplitSamplesBuffer( + const int16_t* aSamplesBuffer, uint32_t aSampleCount, + nsTArray>& aResult) { + uint32_t chunkStart = 0; + + while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) { + CheckedInt bufferSize(sizeof(int16_t)); + bufferSize *= mAudioSamplesPerChunk; + RefPtr chunk = SharedBuffer::Create(bufferSize); + + PodCopy(static_cast(chunk->Data()), aSamplesBuffer + chunkStart, + mAudioSamplesPerChunk); + + aResult.AppendElement(chunk.forget()); + chunkStart += mAudioSamplesPerChunk; + } + + return chunkStart; +} + +/* + * Append every chunk in aChunks to a newly allocated AudioSegment as a + * single-channel run of mAudioSamplesPerChunk frames. + * Returns a raw pointer to the heap-allocated segment; the caller is + * responsible for deleting it. + */ +AudioSegment* SpeechRecognition::CreateAudioSegment( + nsTArray>& aChunks) { + AudioSegment* segment = new AudioSegment(); + for (uint32_t i = 0; i < aChunks.Length(); ++i) { + RefPtr buffer = aChunks[i]; + const int16_t* chunkData = static_cast(buffer->Data()); + + AutoTArray channels; + channels.AppendElement(chunkData); + segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk, + PRINCIPAL_HANDLE_NONE); + } + + return segment; +} + +void SpeechRecognition::FeedAudioData( + nsMainThreadPtrHandle& aRecognition, + already_AddRefed aSamples, uint32_t aDuration, + MediaTrackListener* aProvider, TrackRate aTrackRate) { + NS_ASSERTION(!NS_IsMainThread(), + "FeedAudioData should not be
called in the main thread"); + + // Endpointer expects to receive samples in chunks whose size is a + // multiple of its frame size. + // Since we can't assume we will receive the frames in appropriate-sized + // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk + // (a multiple of Endpointer's frame size) before feeding to Endpointer. + + // ensure aSamples is deleted + RefPtr refSamples = aSamples; + + uint32_t samplesIndex = 0; + const int16_t* samples = static_cast(refSamples->Data()); + AutoTArray, 5> chunksToSend; + + // fill up our buffer and make a chunk out of it, if possible + if (mBufferedSamples > 0) { + samplesIndex += FillSamplesBuffer(samples, aDuration); + + if (mBufferedSamples == mAudioSamplesPerChunk) { + chunksToSend.AppendElement(mAudioSamplesBuffer.forget()); + mBufferedSamples = 0; + } + } + + // create sample chunks of correct size + if (samplesIndex < aDuration) { + samplesIndex += SplitSamplesBuffer(samples + samplesIndex, + aDuration - samplesIndex, chunksToSend); + } + + // buffer remaining samples + if (samplesIndex < aDuration) { + mBufferedSamples = 0; + CheckedInt bufferSize(sizeof(int16_t)); + bufferSize *= mAudioSamplesPerChunk; + mAudioSamplesBuffer = SharedBuffer::Create(bufferSize); + + FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex); + } + + AudioSegment* segment = CreateAudioSegment(chunksToSend); + RefPtr event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA); + event->mAudioSegment = segment; + event->mProvider = aProvider; + event->mTrackRate = aTrackRate; + NS_DispatchToMainThread(event); +} + +const char* SpeechRecognition::GetName(FSMState aId) { + static const char* names[] = { + "STATE_IDLE", "STATE_STARTING", + "STATE_ESTIMATING", "STATE_WAITING_FOR_SPEECH", + "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT", + "STATE_ABORTING", + }; + + MOZ_ASSERT(aId < STATE_COUNT); + MOZ_ASSERT(ArrayLength(names) == STATE_COUNT); + return names[aId]; +} + +const char* 
SpeechRecognition::GetName(SpeechEvent* aEvent) { + static const char* names[] = {"EVENT_START", + "EVENT_STOP", + "EVENT_ABORT", + "EVENT_AUDIO_DATA", + "EVENT_AUDIO_ERROR", + "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT", + "EVENT_RECOGNITIONSERVICE_FINAL_RESULT", + "EVENT_RECOGNITIONSERVICE_ERROR"}; + + MOZ_ASSERT(aEvent->mType < EVENT_COUNT); + MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT); + return names[aEvent->mType]; +} + +TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const { + MOZ_ASSERT(NS_IsMainThread()); + return mEncodeTaskQueue; +} + +SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition, + SpeechRecognition::EventType aType) + : Runnable("dom::SpeechEvent"), + mAudioSegment(nullptr), + mRecognitionResultList(nullptr), + mError(nullptr), + mRecognition(new nsMainThreadPtrHolder( + "SpeechEvent::SpeechEvent", aRecognition)), + mType(aType), + mTrackRate(0) {} + +SpeechEvent::SpeechEvent(nsMainThreadPtrHandle& aRecognition, + SpeechRecognition::EventType aType) + : Runnable("dom::SpeechEvent"), + mAudioSegment(nullptr), + mRecognitionResultList(nullptr), + mError(nullptr), + mRecognition(aRecognition), + mType(aType), + mTrackRate(0) {} + +SpeechEvent::~SpeechEvent() { delete mAudioSegment; } + +NS_IMETHODIMP +SpeechEvent::Run() { + mRecognition->ProcessEvent(this); + return NS_OK; +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognition.h b/dom/media/webspeech/recognition/SpeechRecognition.h new file mode 100644 index 0000000000..687f38041e --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognition.h @@ -0,0 +1,314 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechRecognition_h +#define mozilla_dom_SpeechRecognition_h + +#include "mozilla/Attributes.h" +#include "mozilla/DOMEventTargetHelper.h" +#include "nsCOMPtr.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" +#include "nsProxyRelease.h" +#include "DOMMediaStream.h" +#include "nsITimer.h" +#include "MediaTrackGraph.h" +#include "AudioSegment.h" +#include "mozilla/WeakPtr.h" + +#include "SpeechGrammarList.h" +#include "SpeechRecognitionResultList.h" +#include "nsISpeechRecognitionService.h" +#include "endpointer.h" + +#include "mozilla/dom/BindingDeclarations.h" +#include "mozilla/dom/SpeechRecognitionError.h" + +namespace mozilla { + +namespace media { +class ShutdownBlocker; +} + +namespace dom { + +#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \ + "SpeechRecognitionTest:RequestEvent" +#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End" + +class GlobalObject; +class AudioStreamTrack; +class SpeechEvent; +class SpeechTrackListener; + +LogModule* GetSpeechRecognitionLog(); +#define SR_LOG(...) 
\ + MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) + +class SpeechRecognition final : public DOMEventTargetHelper, + public nsIObserver, + public DOMMediaStream::TrackListener, + public SupportsWeakPtr { + public: + explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow); + + NS_DECL_ISUPPORTS_INHERITED + NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, + DOMEventTargetHelper) + + NS_DECL_NSIOBSERVER + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal); + + static already_AddRefed Constructor( + const GlobalObject& aGlobal, ErrorResult& aRv); + + static already_AddRefed WebkitSpeechRecognition( + const GlobalObject& aGlobal, ErrorResult& aRv) { + return Constructor(aGlobal, aRv); + } + + already_AddRefed Grammars() const; + + void SetGrammars(mozilla::dom::SpeechGrammarList& aArg); + + void GetLang(nsString& aRetVal) const; + + void SetLang(const nsAString& aArg); + + bool GetContinuous(ErrorResult& aRv) const; + + void SetContinuous(bool aArg, ErrorResult& aRv); + + bool InterimResults() const; + + void SetInterimResults(bool aArg); + + uint32_t MaxAlternatives() const; + + TaskQueue* GetTaskQueueForEncoding() const; + + void SetMaxAlternatives(uint32_t aArg); + + void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const; + + void SetServiceURI(const nsAString& aArg, ErrorResult& aRv); + + void Start(const Optional>& aStream, + CallerType aCallerType, ErrorResult& aRv); + + void Stop(); + + void Abort(); + + IMPL_EVENT_HANDLER(audiostart) + IMPL_EVENT_HANDLER(soundstart) + IMPL_EVENT_HANDLER(speechstart) + IMPL_EVENT_HANDLER(speechend) + IMPL_EVENT_HANDLER(soundend) + IMPL_EVENT_HANDLER(audioend) + IMPL_EVENT_HANDLER(result) + IMPL_EVENT_HANDLER(nomatch) + IMPL_EVENT_HANDLER(error) + IMPL_EVENT_HANDLER(start) + IMPL_EVENT_HANDLER(end) + + enum EventType { + EVENT_START, + EVENT_STOP, + EVENT_ABORT, + EVENT_AUDIO_DATA, + 
EVENT_AUDIO_ERROR, + EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT, + EVENT_RECOGNITIONSERVICE_FINAL_RESULT, + EVENT_RECOGNITIONSERVICE_ERROR, + EVENT_COUNT + }; + + void NotifyTrackAdded(const RefPtr& aTrack) override; + // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are + // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16. + void DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const nsACString& aMessage); + template + void DispatchError(EventType aErrorType, + SpeechRecognitionErrorCode aErrorCode, + const char (&aMessage)[N]) { + DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage)); + } + uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount); + uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, + uint32_t aSampleCount, + nsTArray>& aResult); + AudioSegment* CreateAudioSegment(nsTArray>& aChunks); + void FeedAudioData(nsMainThreadPtrHandle& aRecognition, + already_AddRefed aSamples, + uint32_t aDuration, MediaTrackListener* aProvider, + TrackRate aTrackRate); + + friend class SpeechEvent; + + private: + virtual ~SpeechRecognition(); + + enum FSMState { + STATE_IDLE, + STATE_STARTING, + STATE_ESTIMATING, + STATE_WAITING_FOR_SPEECH, + STATE_RECOGNIZING, + STATE_WAITING_FOR_RESULT, + STATE_ABORTING, + STATE_COUNT + }; + + void SetState(FSMState state); + bool StateBetween(FSMState begin, FSMState end); + + bool SetRecognitionService(ErrorResult& aRv); + bool ValidateAndSetGrammarList(ErrorResult& aRv); + + NS_IMETHOD StartRecording(RefPtr& aDOMStream); + RefPtr StopRecording(); + + uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate); + void NotifyError(SpeechEvent* aEvent); + + void ProcessEvent(SpeechEvent* aEvent); + void Transition(SpeechEvent* aEvent); + + void Reset(); + void ResetAndEnd(); + void WaitForAudioData(SpeechEvent* aEvent); + void StartedAudioCapture(SpeechEvent* aEvent); + void StopRecordingAndRecognize(SpeechEvent* 
aEvent); + void WaitForEstimation(SpeechEvent* aEvent); + void DetectSpeech(SpeechEvent* aEvent); + void WaitForSpeechEnd(SpeechEvent* aEvent); + void NotifyFinalResult(SpeechEvent* aEvent); + void DoNothing(SpeechEvent* aEvent); + void AbortSilently(SpeechEvent* aEvent); + void AbortError(SpeechEvent* aEvent); + + RefPtr mStream; + RefPtr mTrack; + bool mTrackIsOwned = false; + RefPtr mStopRecordingPromise; + RefPtr mSpeechListener; + nsCOMPtr mRecognitionService; + RefPtr mShutdownBlocker; + // TaskQueue responsible for pre-processing the samples by the service + // it runs in a separate thread from the main thread + RefPtr mEncodeTaskQueue; + + // A generation ID of the MediaStream a started session is for, so that + // a gUM request that resolves after the session has stopped, and a new + // one has started, can exit early. Main thread only. Can wrap. + uint8_t mStreamGeneration = 0; + + FSMState mCurrentState; + + Endpointer mEndpointer; + uint32_t mEstimationSamples; + + uint32_t mAudioSamplesPerChunk; + + // maximum time, in milliseconds, the engine will wait for voice + // until returning a 'no speech detected' error + uint32_t mSpeechDetectionTimeoutMs; + + // buffer holds one chunk of mAudioSamplesPerChunk + // samples before feeding it to mEndpointer + RefPtr mAudioSamplesBuffer; + uint32_t mBufferedSamples; + + nsCOMPtr mSpeechDetectionTimer; + bool mAborted; + + nsString mLang; + + RefPtr mSpeechGrammarList; + + // private flag holding the value most recently passed to SetContinuous() + // (the API's "continuous" attribute) + bool mContinuous; + + // WebSpeechAPI (http://bit.ly/1gIl7DC) states: + // + // 1. Default value MUST be false + // 2. If true, interim results SHOULD be returned + // 3. If false, interim results MUST NOT be returned + // + // Pocketsphinx does not return interim results; so, defaulting + // mInterimResults to false, then ignoring its subsequent value + // is a conforming implementation.
+ bool mInterimResults; + + // WebSpeechAPI (http://bit.ly/1JAiqeo) states: + // + // 1. Default value is 1 + // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives + // per result" + // + // Pocketsphinx can only return at maximum a single + // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting + // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives + // while for a 0 value returning no SpeechRecognitionAlternative per result is + // a conforming implementation. + uint32_t mMaxAlternatives; + + void ProcessTestEventRequest(nsISupports* aSubject, + const nsAString& aEventName); + + const char* GetName(FSMState aId); + const char* GetName(SpeechEvent* aEvent); +}; + +class SpeechEvent : public Runnable { + public: + SpeechEvent(SpeechRecognition* aRecognition, + SpeechRecognition::EventType aType); + SpeechEvent(nsMainThreadPtrHandle& aRecognition, + SpeechRecognition::EventType aType); + + ~SpeechEvent(); + + NS_IMETHOD Run() override; + AudioSegment* mAudioSegment; + RefPtr + mRecognitionResultList; // TODO: make this a session being passed which + // also has index and stuff + RefPtr mError; + + friend class SpeechRecognition; + + private: + nsMainThreadPtrHandle mRecognition; + + // for AUDIO_DATA events, keep a reference to the provider + // of the data (i.e., the SpeechTrackListener) to ensure it + // is kept alive (and keeps SpeechRecognition alive) until this + // event gets processed. 
+ RefPtr mProvider; + SpeechRecognition::EventType mType; + TrackRate mTrackRate; +}; + +} // namespace dom + +inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) { + return ToSupports(static_cast(aRec)); +} + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp new file mode 100644 index 0000000000..4dee9090a7 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.cpp @@ -0,0 +1,44 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognitionAlternative.h" + +#include "mozilla/dom/SpeechRecognitionAlternativeBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionAlternative, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionAlternative) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionAlternative) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionAlternative) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionAlternative::SpeechRecognitionAlternative( + SpeechRecognition* aParent) + : mConfidence(0), mParent(aParent) {} + +SpeechRecognitionAlternative::~SpeechRecognitionAlternative() = default; + +JSObject* SpeechRecognitionAlternative::WrapObject( + JSContext* aCx, JS::Handle aGivenProto) { + return SpeechRecognitionAlternative_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechRecognitionAlternative::GetParentObject() const { + return static_cast(mParent.get()); +} + +void SpeechRecognitionAlternative::GetTranscript(nsString& aRetVal) 
const { + aRetVal = mTranscript; +} + +float SpeechRecognitionAlternative::Confidence() const { return mConfidence; } + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h new file mode 100644 index 0000000000..017d869943 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionAlternative.h @@ -0,0 +1,49 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognitionAlternative_h +#define mozilla_dom_SpeechRecognitionAlternative_h + +#include "nsCycleCollectionParticipant.h" +#include "nsString.h" +#include "nsWrapperCache.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +namespace mozilla::dom { + +class SpeechRecognition; + +class SpeechRecognitionAlternative final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionAlternative(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionAlternative) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + void GetTranscript(nsString& aRetVal) const; + + float Confidence() const; + + nsString mTranscript; + float mConfidence; + + private: + ~SpeechRecognitionAlternative(); + + RefPtr mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp new file mode 100644 index 0000000000..009281b234 --- /dev/null +++ 
b/dom/media/webspeech/recognition/SpeechRecognitionResult.cpp @@ -0,0 +1,59 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognitionResult.h" +#include "mozilla/dom/SpeechRecognitionResultBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResult, mParent) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResult) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResult) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResult) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionResult::SpeechRecognitionResult(SpeechRecognition* aParent) + : mParent(aParent) {} + +SpeechRecognitionResult::~SpeechRecognitionResult() = default; + +JSObject* SpeechRecognitionResult::WrapObject( + JSContext* aCx, JS::Handle aGivenProto) { + return SpeechRecognitionResult_Binding::Wrap(aCx, this, aGivenProto); +} + +nsISupports* SpeechRecognitionResult::GetParentObject() const { + return static_cast(mParent.get()); +} + +already_AddRefed +SpeechRecognitionResult::IndexedGetter(uint32_t aIndex, bool& aPresent) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + + aPresent = true; + return Item(aIndex); +} + +uint32_t SpeechRecognitionResult::Length() const { return mItems.Length(); } + +already_AddRefed SpeechRecognitionResult::Item( + uint32_t aIndex) { + RefPtr alternative = mItems.ElementAt(aIndex); + return alternative.forget(); +} + +bool SpeechRecognitionResult::IsFinal() const { + return true; // TODO +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResult.h 
b/dom/media/webspeech/recognition/SpeechRecognitionResult.h new file mode 100644 index 0000000000..fc9e8fd660 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResult.h @@ -0,0 +1,54 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognitionResult_h +#define mozilla_dom_SpeechRecognitionResult_h + +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +#include "SpeechRecognitionAlternative.h" + +namespace mozilla::dom { + +class SpeechRecognitionResult final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionResult(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResult) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed Item(uint32_t aIndex); + + bool IsFinal() const; + + already_AddRefed IndexedGetter(uint32_t aIndex, + bool& aPresent); + + nsTArray> mItems; + + private: + ~SpeechRecognitionResult(); + + RefPtr mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp new file mode 100644 index 0000000000..2aa81a5982 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.cpp @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* 
This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechRecognitionResultList.h" + +#include "mozilla/dom/SpeechRecognitionResultListBinding.h" + +#include "SpeechRecognition.h" + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechRecognitionResultList, mParent, + mItems) +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechRecognitionResultList) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechRecognitionResultList) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognitionResultList) + NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +SpeechRecognitionResultList::SpeechRecognitionResultList( + SpeechRecognition* aParent) + : mParent(aParent) {} + +SpeechRecognitionResultList::~SpeechRecognitionResultList() = default; + +nsISupports* SpeechRecognitionResultList::GetParentObject() const { + return static_cast(mParent.get()); +} + +JSObject* SpeechRecognitionResultList::WrapObject( + JSContext* aCx, JS::Handle aGivenProto) { + return SpeechRecognitionResultList_Binding::Wrap(aCx, this, aGivenProto); +} + +already_AddRefed +SpeechRecognitionResultList::IndexedGetter(uint32_t aIndex, bool& aPresent) { + if (aIndex >= Length()) { + aPresent = false; + return nullptr; + } + + aPresent = true; + return Item(aIndex); +} + +uint32_t SpeechRecognitionResultList::Length() const { return mItems.Length(); } + +already_AddRefed SpeechRecognitionResultList::Item( + uint32_t aIndex) { + RefPtr result = mItems.ElementAt(aIndex); + return result.forget(); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechRecognitionResultList.h b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h new file mode 100644 index 0000000000..b45659564b --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechRecognitionResultList.h @@ -0,0 +1,53 @@ 
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechRecognitionResultList_h +#define mozilla_dom_SpeechRecognitionResultList_h + +#include "nsCycleCollectionParticipant.h" +#include "nsWrapperCache.h" +#include "nsTArray.h" +#include "js/TypeDecls.h" + +#include "mozilla/Attributes.h" + +#include "SpeechRecognitionResult.h" + +namespace mozilla::dom { + +class SpeechRecognition; + +class SpeechRecognitionResultList final : public nsISupports, + public nsWrapperCache { + public: + explicit SpeechRecognitionResultList(SpeechRecognition* aParent); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechRecognitionResultList) + + nsISupports* GetParentObject() const; + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + uint32_t Length() const; + + already_AddRefed Item(uint32_t aIndex); + + already_AddRefed IndexedGetter(uint32_t aIndex, + bool& aPresent); + + nsTArray> mItems; + + private: + ~SpeechRecognitionResultList(); + + RefPtr mParent; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.cpp b/dom/media/webspeech/recognition/SpeechTrackListener.cpp new file mode 100644 index 0000000000..036ff753ba --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechTrackListener.cpp @@ -0,0 +1,92 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "SpeechTrackListener.h" + +#include "SpeechRecognition.h" +#include "nsProxyRelease.h" + +namespace mozilla::dom { + +SpeechTrackListener::SpeechTrackListener(SpeechRecognition* aRecognition) + : mRecognition(new nsMainThreadPtrHolder( + "SpeechTrackListener::SpeechTrackListener", aRecognition, false)), + mRemovedPromise( + mRemovedHolder.Ensure("SpeechTrackListener::mRemovedPromise")) { + MOZ_ASSERT(NS_IsMainThread()); + mRemovedPromise->Then(GetCurrentSerialEventTarget(), __func__, + [self = RefPtr(this), this] { + mRecognition = nullptr; + }); +} + +// Forwards each chunk of aQueuedMedia to ConvertAndDispatchAudioChunk: null +// chunks are sent as zero-filled buffers, S16/FLOAT32 chunks are sent from +// mChannelData[0]. Chunks whose duration exceeds INT_MAX are skipped. +void SpeechTrackListener::NotifyQueuedChanges( + MediaTrackGraph* aGraph, TrackTime aTrackOffset, + const MediaSegment& aQueuedMedia) { + AudioSegment* audio = const_cast( + static_cast(&aQueuedMedia)); + + AudioSegment::ChunkIterator iterator(*audio); + while (!iterator.IsEnded()) { + // Skip over-large chunks so we don't crash! + if (iterator->GetDuration() > INT_MAX) { + // Advance before continuing; otherwise the same chunk is re-tested + // forever and this loop never terminates. + iterator.Next(); + continue; + } + int duration = int(iterator->GetDuration()); + + if (iterator->IsNull()) { + nsTArray nullData; + PodZero(nullData.AppendElements(duration), duration); + ConvertAndDispatchAudioChunk(duration, iterator->mVolume, + nullData.Elements(), aGraph->GraphRate()); + } else { + AudioSampleFormat format = iterator->mBufferFormat; + + MOZ_ASSERT(format == AUDIO_FORMAT_S16 || format == AUDIO_FORMAT_FLOAT32); + + if (format == AUDIO_FORMAT_S16) { + ConvertAndDispatchAudioChunk( + duration, iterator->mVolume, + static_cast(iterator->mChannelData[0]), + aGraph->GraphRate()); + } else if (format == AUDIO_FORMAT_FLOAT32) { + ConvertAndDispatchAudioChunk( + duration, iterator->mVolume, + static_cast(iterator->mChannelData[0]), + aGraph->GraphRate()); + } + } + + iterator.Next(); + } +} + +template +void SpeechTrackListener::ConvertAndDispatchAudioChunk(int aDuration, + float aVolume, + SampleFormatType* aData, + TrackRate aTrackRate) { + CheckedInt bufferSize(sizeof(int16_t)); + bufferSize *= aDuration; + bufferSize *= 1; // channel +
RefPtr samples(SharedBuffer::Create(bufferSize)); + + int16_t* to = static_cast(samples->Data()); + ConvertAudioSamplesWithScale(aData, to, aDuration, aVolume); + + mRecognition->FeedAudioData(mRecognition, samples.forget(), aDuration, this, + aTrackRate); +} + +void SpeechTrackListener::NotifyEnded(MediaTrackGraph* aGraph) { + // TODO dispatch SpeechEnd event so services can be informed +} + +void SpeechTrackListener::NotifyRemoved(MediaTrackGraph* aGraph) { + mRemovedHolder.ResolveIfExists(true, __func__); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/recognition/SpeechTrackListener.h b/dom/media/webspeech/recognition/SpeechTrackListener.h new file mode 100644 index 0000000000..423a5b0317 --- /dev/null +++ b/dom/media/webspeech/recognition/SpeechTrackListener.h @@ -0,0 +1,50 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechStreamListener_h +#define mozilla_dom_SpeechStreamListener_h + +#include "MediaTrackGraph.h" +#include "MediaTrackListener.h" +#include "AudioSegment.h" +#include "mozilla/MozPromise.h" + +namespace mozilla { + +class AudioSegment; + +namespace dom { + +class SpeechRecognition; + +class SpeechTrackListener : public MediaTrackListener { + public: + explicit SpeechTrackListener(SpeechRecognition* aRecognition); + ~SpeechTrackListener() = default; + + void NotifyQueuedChanges(MediaTrackGraph* aGraph, TrackTime aTrackOffset, + const MediaSegment& aQueuedMedia) override; + + void NotifyEnded(MediaTrackGraph* aGraph) override; + + void NotifyRemoved(MediaTrackGraph* aGraph) override; + + private: + template + void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, + SampleFormatType* aData, + TrackRate aTrackRate); + nsMainThreadPtrHandle mRecognition; + MozPromiseHolder mRemovedHolder; + + public: + const RefPtr mRemovedPromise; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/endpointer.cc b/dom/media/webspeech/recognition/endpointer.cc new file mode 100644 index 0000000000..2347043d4b --- /dev/null +++ b/dom/media/webspeech/recognition/endpointer.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "endpointer.h" + +#include "AudioSegment.h" + +namespace { +const int kFrameRate = 200; // 1 frame = 5ms of audio. +} + +namespace mozilla { + +Endpointer::Endpointer(int sample_rate) + : speech_input_possibly_complete_silence_length_us_(-1), + speech_input_complete_silence_length_us_(-1), + audio_frame_time_us_(0), + sample_rate_(sample_rate), + frame_size_(0) { + Reset(); + + frame_size_ = static_cast(sample_rate / static_cast(kFrameRate)); + + speech_input_minimum_length_us_ = + static_cast(1.7 * 1000000); + speech_input_complete_silence_length_us_ = + static_cast(0.5 * 1000000); + long_speech_input_complete_silence_length_us_ = -1; + long_speech_length_us_ = -1; + speech_input_possibly_complete_silence_length_us_ = + 1 * 1000000; + + // Set the default configuration for Push To Talk mode. 
+ EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(1000.0f); + ep_config.set_min_decision_threshold(50.0f); + ep_config.set_fast_update_dur(0.2f); + ep_config.set_sample_rate(static_cast(sample_rate)); + ep_config.set_min_fundamental_frequency(57.143f); + ep_config.set_max_fundamental_frequency(400.0f); + ep_config.set_contamination_rejection_period(0.25f); + energy_endpointer_.Init(ep_config); +} + +void Endpointer::Reset() { + old_ep_status_ = EP_PRE_SPEECH; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + speech_previously_detected_ = false; + speech_input_complete_ = false; + audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. 
+ speech_end_time_us_ = -1; + speech_start_time_us_ = -1; +} + +void Endpointer::StartSession() { + Reset(); + energy_endpointer_.StartSession(); +} + +void Endpointer::EndSession() { + energy_endpointer_.EndSession(); +} + +void Endpointer::SetEnvironmentEstimationMode() { + Reset(); + energy_endpointer_.SetEnvironmentEstimationMode(); +} + +void Endpointer::SetUserInputMode() { + energy_endpointer_.SetUserInputMode(); +} + +EpStatus Endpointer::Status(int64_t *time) { + return energy_endpointer_.Status(time); +} + +EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { + MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format"); + const int16_t* audio_data = static_cast(raw_audio.mChannelData[0]); + const int num_samples = raw_audio.mDuration; + EpStatus ep_status = EP_PRE_SPEECH; + + // Process the input data in blocks of frame_size_, dropping any incomplete + // frames at the end (which is ok since typically the caller will be recording + // audio in multiples of our frame size). + int sample_index = 0; + while (sample_index + frame_size_ <= num_samples) { + // Have the endpointer process the frame. + energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, + audio_data + sample_index, + frame_size_, + rms_out); + sample_index += frame_size_; + audio_frame_time_us_ += (frame_size_ * 1000000) / + sample_rate_; + + // Get the status of the endpointer. + int64_t ep_time; + ep_status = energy_endpointer_.Status(&ep_time); + if (old_ep_status_ != ep_status) + fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status); + + // Handle state changes. + if ((EP_SPEECH_PRESENT == ep_status) && + (EP_POSSIBLE_ONSET == old_ep_status_)) { + speech_end_time_us_ = -1; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + // Trigger SpeechInputDidStart event on first detection. 
+ if (false == speech_previously_detected_) { + speech_previously_detected_ = true; + speech_start_time_us_ = ep_time; + } + } + if ((EP_PRE_SPEECH == ep_status) && + (EP_POSSIBLE_OFFSET == old_ep_status_)) { + speech_end_time_us_ = ep_time; + waiting_for_speech_possibly_complete_timeout_ = true; + waiting_for_speech_complete_timeout_ = true; + } + if (ep_time > speech_input_minimum_length_us_) { + // Speech possibly complete timeout. + if ((waiting_for_speech_possibly_complete_timeout_) && + (ep_time - speech_end_time_us_ > + speech_input_possibly_complete_silence_length_us_)) { + waiting_for_speech_possibly_complete_timeout_ = false; + } + if (waiting_for_speech_complete_timeout_) { + // The length of the silence timeout period can be held constant, or it + // can be changed after a fixed amount of time from the beginning of + // speech. + bool has_stepped_silence = + (long_speech_length_us_ > 0) && + (long_speech_input_complete_silence_length_us_ > 0); + int64_t requested_silence_length; + if (has_stepped_silence && + (ep_time - speech_start_time_us_) > long_speech_length_us_) { + requested_silence_length = + long_speech_input_complete_silence_length_us_; + } else { + requested_silence_length = + speech_input_complete_silence_length_us_; + } + + // Speech complete timeout. + if ((ep_time - speech_end_time_us_) > requested_silence_length) { + waiting_for_speech_complete_timeout_ = false; + speech_input_complete_ = true; + } + } + } + old_ep_status_ = ep_status; + } + return ep_status; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/endpointer.h b/dom/media/webspeech/recognition/endpointer.h new file mode 100644 index 0000000000..7879d6b9f3 --- /dev/null +++ b/dom/media/webspeech/recognition/endpointer.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "energy_endpointer.h" + +namespace mozilla { + +struct AudioChunk; + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. 
+// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. This is motivated by the expectation that there +// will be two distinct types of inputs: short search queries and longer +// dictation style input. +// +// Three parameters are used to define the piecewise constant timeout function. +// The timeout length is speech_input_complete_silence_length until +// long_speech_length, when it changes to +// long_speech_input_complete_silence_length. +class Endpointer { + public: + explicit Endpointer(int sample_rate); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Process a segment of audio, which may be more than one frame. + // The status of the last frame will be returned. + EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); + + // Get the status of the endpointer. 
+ EpStatus Status(int64_t *time_us); + + // Get the expected frame size for audio chunks. Audio chunks are expected + // to contain a number of samples that is a multiple of this number, and extra + // samples will be dropped. + int32_t FrameSize() const { + return frame_size_; + } + + // Returns true if the endpointer detected reasonable audio levels above + // background noise which could be user speech, false if not. + bool DidStartReceivingSpeech() const { + return speech_previously_detected_; + } + + bool IsEstimatingEnvironment() const { + return energy_endpointer_.estimating_environment(); + } + + void set_speech_input_complete_silence_length(int64_t time_us) { + speech_input_complete_silence_length_us_ = time_us; + } + + void set_long_speech_input_complete_silence_length(int64_t time_us) { + long_speech_input_complete_silence_length_us_ = time_us; + } + + void set_speech_input_possibly_complete_silence_length(int64_t time_us) { + speech_input_possibly_complete_silence_length_us_ = time_us; + } + + void set_long_speech_length(int64_t time_us) { + long_speech_length_us_ = time_us; + } + + bool speech_input_complete() const { + return speech_input_complete_; + } + + // RMS background noise level in dB. + float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } + + private: + // Reset internal states. Helper method common to initial input utterance + // and following input utterances. + void Reset(); + + // Minimum allowable length of speech input. + int64_t speech_input_minimum_length_us_; + + // The speechInputPossiblyComplete event signals that silence/noise has been + // detected for a *short* amount of time after some speech has been detected. + // This property specifies the time period. + int64_t speech_input_possibly_complete_silence_length_us_; + + // The speechInputComplete event signals that silence/noise has been + // detected for a *long* amount of time after some speech has been detected.
+ // This property specifies the time period. + int64_t speech_input_complete_silence_length_us_; + + // Same as above, this specifies the required silence period after speech + // detection. This period is used instead of + // speech_input_complete_silence_length_ when the utterance is longer than + // long_speech_length_. This parameter is optional. + int64_t long_speech_input_complete_silence_length_us_; + + // The period of time after which the endpointer should consider + // long_speech_input_complete_silence_length_ as a valid silence period + // instead of speech_input_complete_silence_length_. This parameter is + // optional. + int64_t long_speech_length_us_; + + // First speech onset time, used in determination of speech complete timeout. + int64_t speech_start_time_us_; + + // Most recent end time, used in determination of speech complete timeout. + int64_t speech_end_time_us_; + + int64_t audio_frame_time_us_; + EpStatus old_ep_status_; + bool waiting_for_speech_possibly_complete_timeout_; + bool waiting_for_speech_complete_timeout_; + bool speech_previously_detected_; + bool speech_input_complete_; + EnergyEndpointer energy_endpointer_; + int sample_rate_; + int32_t frame_size_; +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ diff --git a/dom/media/webspeech/recognition/energy_endpointer.cc b/dom/media/webspeech/recognition/energy_endpointer.cc new file mode 100644 index 0000000000..b1c1ee0bcf --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "energy_endpointer.h" + +#include + +namespace { + +// Returns the RMS (quadratic mean) of the input signal. +float RMS(const int16_t* samples, int num_samples) { + int64_t ssq_int64_t = 0; + int64_t sum_int64_t = 0; + for (int i = 0; i < num_samples; ++i) { + sum_int64_t += samples[i]; + ssq_int64_t += samples[i] * samples[i]; + } + // now convert to floats. 
+ double sum = static_cast(sum_int64_t); + sum /= num_samples; + double ssq = static_cast(ssq_int64_t); + return static_cast(sqrt((ssq / num_samples) - (sum * sum))); +} + +int64_t Secs2Usecs(float seconds) { + return static_cast(0.5 + (1.0e6 * seconds)); +} + +float GetDecibel(float value) { + if (value > 1.0e-100) + return 20 * log10(value); + return -2000.0; +} + +} // namespace + +namespace mozilla { + +// Stores threshold-crossing histories for making decisions about the speech +// state. +class EnergyEndpointer::HistoryRing { + public: + HistoryRing() : insertion_index_(0) {} + + // Resets the ring to |size| elements each with state |initial_state| + void SetRing(int size, bool initial_state); + + // Inserts a new entry into the ring and drops the oldest entry. + void Insert(int64_t time_us, bool decision); + + // Returns the time in microseconds of the most recently added entry. + int64_t EndTime() const; + + // Returns the sum of all intervals during which 'decision' is true within + // the time in seconds specified by 'duration'. The returned interval is + // in seconds. + float RingSum(float duration_sec); + + private: + struct DecisionPoint { + int64_t time_us; + bool decision; + }; + + std::vector decision_points_; + int insertion_index_; // Index at which the next item gets added/inserted. 
+ + HistoryRing(const HistoryRing&); + void operator=(const HistoryRing&); +}; + +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { + insertion_index_ = 0; + decision_points_.clear(); + DecisionPoint init = { -1, initial_state }; + decision_points_.resize(size, init); +} + +void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { + decision_points_[insertion_index_].time_us = time_us; + decision_points_[insertion_index_].decision = decision; + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); +} + +int64_t EnergyEndpointer::HistoryRing::EndTime() const { + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + return decision_points_[ind].time_us; +} + +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { + if (decision_points_.empty()) + return 0.0; + + int64_t sum_us = 0; + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + int64_t end_us = decision_points_[ind].time_us; + bool is_on = decision_points_[ind].decision; + int64_t start_us = end_us - static_cast(0.5 + (1.0e6 * duration_sec)); + if (start_us < 0) + start_us = 0; + size_t n_summed = 1; // n points ==> (n-1) intervals + while ((decision_points_[ind].time_us > start_us) && + (n_summed < decision_points_.size())) { + --ind; + if (ind < 0) + ind = decision_points_.size() - 1; + if (is_on) + sum_us += end_us - decision_points_[ind].time_us; + is_on = decision_points_[ind].decision; + end_us = decision_points_[ind].time_us; + n_summed++; + } + + return 1.0e-6f * sum_us; // Returns total time that was super threshold. 
+} + +EnergyEndpointer::EnergyEndpointer() + : status_(EP_PRE_SPEECH), + offset_confirm_dur_sec_(0), + endpointer_time_us_(0), + fast_update_frames_(0), + frame_counter_(0), + max_window_dur_(4.0), + sample_rate_(0), + history_(new HistoryRing()), + decision_threshold_(0), + estimating_environment_(false), + noise_level_(0), + rms_adapt_(0), + start_lag_(0), + end_lag_(0), + user_input_start_time_us_(0) { +} + +EnergyEndpointer::~EnergyEndpointer() { +} + +int EnergyEndpointer::TimeToFrame(float time) const { + return static_cast(0.5 + (time / params_.frame_period())); +} + +void EnergyEndpointer::Restart(bool reset_threshold) { + status_ = EP_PRE_SPEECH; + user_input_start_time_us_ = 0; + + if (reset_threshold) { + decision_threshold_ = params_.decision_threshold(); + rms_adapt_ = decision_threshold_; + noise_level_ = params_.decision_threshold() / 2.0f; + frame_counter_ = 0; // Used for rapid initial update of levels. + } + + // Set up the memories to hold the history windows. + history_->SetRing(TimeToFrame(max_window_dur_), false); + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; +} + +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { + params_ = params; + + // Find the longest history interval to be used, and make the ring + // large enough to accommodate that number of frames. NOTE: This + // depends upon ep_frame_period being set correctly in the factory + // that did this instantiation. 
+ max_window_dur_ = params_.onset_window(); + if (params_.speech_on_window() > max_window_dur_) + max_window_dur_ = params_.speech_on_window(); + if (params_.offset_window() > max_window_dur_) + max_window_dur_ = params_.offset_window(); + Restart(true); + + offset_confirm_dur_sec_ = params_.offset_window() - + params_.offset_confirm_dur(); + if (offset_confirm_dur_sec_ < 0.0) + offset_confirm_dur_sec_ = 0.0; + + user_input_start_time_us_ = 0; + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; + // The initial value of the noise and speech levels is inconsequential. + // The level of the first frame will overwrite these values. + noise_level_ = params_.decision_threshold() / 2.0f; + fast_update_frames_ = + static_cast(params_.fast_update_dur() / params_.frame_period()); + + frame_counter_ = 0; // Used for rapid initial update of levels. + + sample_rate_ = params_.sample_rate(); + start_lag_ = static_cast(sample_rate_ / + params_.max_fundamental_frequency()); + end_lag_ = static_cast(sample_rate_ / + params_.min_fundamental_frequency()); +} + +void EnergyEndpointer::StartSession() { + Restart(true); +} + +void EnergyEndpointer::EndSession() { + status_ = EP_POST_SPEECH; +} + +void EnergyEndpointer::SetEnvironmentEstimationMode() { + Restart(true); + estimating_environment_ = true; +} + +void EnergyEndpointer::SetUserInputMode() { + estimating_environment_ = false; + user_input_start_time_us_ = endpointer_time_us_; +} + +void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, + const int16_t* samples, + int num_samples, + float* rms_out) { + endpointer_time_us_ = time_us; + float rms = RMS(samples, num_samples); + + // Check that this is user input audio vs. pre-input adaptation audio. 
+ // Input audio starts when the user indicates start of input, by e.g. + // pressing push-to-talk. Audio recieved prior to that is used to update + // noise and speech level estimates. + if (!estimating_environment_) { + bool decision = false; + if ((endpointer_time_us_ - user_input_start_time_us_) < + Secs2Usecs(params_.contamination_rejection_period())) { + decision = false; + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); + } else { + decision = (rms > decision_threshold_); + } + + history_->Insert(endpointer_time_us_, decision); + + switch (status_) { + case EP_PRE_SPEECH: + if (history_->RingSum(params_.onset_window()) > + params_.onset_detect_dur()) { + status_ = EP_POSSIBLE_ONSET; + } + break; + + case EP_POSSIBLE_ONSET: { + float tsum = history_->RingSum(params_.onset_window()); + if (tsum > params_.onset_confirm_dur()) { + status_ = EP_SPEECH_PRESENT; + } else { // If signal is not maintained, drop back to pre-speech. + if (tsum <= params_.onset_detect_dur()) + status_ = EP_PRE_SPEECH; + } + break; + } + + case EP_SPEECH_PRESENT: { + // To induce hysteresis in the state residency, we allow a + // smaller residency time in the on_ring, than was required to + // enter the SPEECH_PERSENT state. + float on_time = history_->RingSum(params_.speech_on_window()); + if (on_time < params_.on_maintain_dur()) + status_ = EP_POSSIBLE_OFFSET; + break; + } + + case EP_POSSIBLE_OFFSET: + if (history_->RingSum(params_.offset_window()) <= + offset_confirm_dur_sec_) { + // Note that this offset time may be beyond the end + // of the input buffer in a real-time system. It will be up + // to the RecognizerSession to decide what to do. + status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. + } else { // If speech picks up again we allow return to SPEECH_PRESENT. 
+ if (history_->RingSum(params_.speech_on_window()) >= + params_.on_maintain_dur()) + status_ = EP_SPEECH_PRESENT; + } + break; + + default: + break; + } + + // If this is a quiet, non-speech region, slowly adapt the detection + // threshold to be about 6dB above the average RMS. + if ((!decision) && (status_ == EP_PRE_SPEECH)) { + decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); + rms_adapt_ = decision_threshold_; + } else { + // If this is in a speech region, adapt the decision threshold to + // be about 10dB below the average RMS. If the noise level is high, + // the threshold is pushed up. + // Adaptation up to a higher level is 5 times faster than decay to + // a lower level. + if ((status_ == EP_SPEECH_PRESENT) && decision) { + if (rms_adapt_ > rms) { + rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); + } else { + rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); + } + float target_threshold = 0.3f * rms_adapt_ + noise_level_; + decision_threshold_ = (.90f * decision_threshold_) + + (0.10f * target_threshold); + } + } + + // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } + + // Update speech and noise levels. + UpdateLevels(rms); + ++frame_counter_; + + if (rms_out) + *rms_out = GetDecibel(rms); +} + +float EnergyEndpointer::GetNoiseLevelDb() const { + return GetDecibel(noise_level_); +} + +void EnergyEndpointer::UpdateLevels(float rms) { + // Update quickly initially. We assume this is noise and that + // speech is 6dB above the noise. + if (frame_counter_ < fast_update_frames_) { + // Alpha increases from 0 to (k-1)/k where k is the number of time + // steps in the initial adaptation period. 
+ float alpha = static_cast(frame_counter_) / + static_cast(fast_update_frames_); + noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); + } else { + // Update Noise level. The noise level adapts quickly downward, but + // slowly upward. The noise_level_ parameter is not currently used + // for threshold adaptation. It is used for UI feedback. + if (noise_level_ < rms) + noise_level_ = (0.999f * noise_level_) + (0.001f * rms); + else + noise_level_ = (0.95f * noise_level_) + (0.05f * rms); + } + if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { + decision_threshold_ = noise_level_ * 2; // 6dB above noise level. + // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } +} + +EpStatus EnergyEndpointer::Status(int64_t* status_time) const { + *status_time = history_->EndTime(); + return status_; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/energy_endpointer.h b/dom/media/webspeech/recognition/energy_endpointer.h new file mode 100644 index 0000000000..12d3c736e3 --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The EnergyEndpointer class finds likely speech onset and offset points. +// +// The implementation described here is about the simplest possible. +// It is based on timings of threshold crossings for overall signal +// RMS. It is suitable for light weight applications. +// +// As written, the basic idea is that one specifies intervals that +// must be occupied by super- and sub-threshold energy levels, and +// defers decisions re onset and offset times until these +// specifications have been met. Three basic intervals are tested: an +// onset window, a speech-on window, and an offset window. We require +// super-threshold to exceed some mimimum total durations in the onset +// and speech-on windows before declaring the speech onset time, and +// we specify a required sub-threshold residency in the offset window +// before declaring speech offset. 
As the various residency requirements are +// met, the EnergyEndpointer instance assumes various states, and can return the +// ID of these states to the client (see EpStatus below). +// +// The levels of the speech and background noise are continuously updated. It is +// important that the background noise level be estimated initially for +// robustness in noisy conditions. The first frames are assumed to be background +// noise and a fast update rate is used for the noise level. The duration for +// fast update is controlled by the fast_update_dur_ paramter. +// +// If used in noisy conditions, the endpointer should be started and run in the +// EnvironmentEstimation mode, for at least 200ms, before switching to +// UserInputMode. +// Audio feedback contamination can appear in the input audio, if not cut +// out or handled by echo cancellation. Audio feedback can trigger a false +// accept. The false accepts can be ignored by setting +// ep_contamination_rejection_period. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ + +#include + +#include "mozilla/UniquePtr.h" + +#include "energy_endpointer_params.h" + +namespace mozilla { + +// Endpointer status codes +enum EpStatus { + EP_PRE_SPEECH = 10, + EP_POSSIBLE_ONSET, + EP_SPEECH_PRESENT, + EP_POSSIBLE_OFFSET, + EP_POST_SPEECH, +}; + +class EnergyEndpointer { + public: + // The default construction MUST be followed by Init(), before any + // other use can be made of the instance. + EnergyEndpointer(); + virtual ~EnergyEndpointer(); + + void Init(const EnergyEndpointerParams& params); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. 
This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Computes the next input frame and modifies EnergyEndpointer status as + // appropriate based on the computation. + void ProcessAudioFrame(int64_t time_us, + const int16_t* samples, int num_samples, + float* rms_out); + + // Returns the current state of the EnergyEndpointer and the time + // corresponding to the most recently computed frame. + EpStatus Status(int64_t* status_time_us) const; + + bool estimating_environment() const { + return estimating_environment_; + } + + // Returns estimated noise level in dB. + float GetNoiseLevelDb() const; + + private: + class HistoryRing; + + // Resets the endpointer internal state. If reset_threshold is true, the + // state will be reset completely, including adaptive thresholds and the + // removal of all history information. + void Restart(bool reset_threshold); + + // Update internal speech and noise levels. + void UpdateLevels(float rms); + + // Returns the number of frames (or frame number) corresponding to + // the 'time' (in seconds). + int TimeToFrame(float time) const; + + EpStatus status_; // The current state of this instance. + float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH + int64_t endpointer_time_us_; // Time of the most recently received audio frame. + int64_t fast_update_frames_; // Number of frames for initial level adaptation. + int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. + float max_window_dur_; // Largest search window size (seconds) + float sample_rate_; // Sampling rate. + + // Ring buffers to hold the speech activity history. + UniquePtr history_; + + // Configuration parameters. + EnergyEndpointerParams params_; + + // RMS which must be exceeded to conclude frame is speech. + float decision_threshold_; + + // Flag to indicate that audio should be used to estimate environment, prior + // to receiving user input. 
+ bool estimating_environment_; + + // Estimate of the background noise level. Used externally for UI feedback. + float noise_level_; + + // An adaptive threshold used to update decision_threshold_ when appropriate. + float rms_adapt_; + + // Start lag corresponds to the highest fundamental frequency. + int start_lag_; + + // End lag corresponds to the lowest fundamental frequency. + int end_lag_; + + // Time when mode switched from environment estimation to user input. This + // is used to time forced rejection of audio feedback contamination. + int64_t user_input_start_time_us_; + + // prevent copy constructor and assignment + EnergyEndpointer(const EnergyEndpointer&); + void operator=(const EnergyEndpointer&); +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.cc b/dom/media/webspeech/recognition/energy_endpointer_params.cc new file mode 100644 index 0000000000..cac4f1b238 --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer_params.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "energy_endpointer_params.h" + +namespace mozilla { + +EnergyEndpointerParams::EnergyEndpointerParams() { + SetDefaults(); +} + +void EnergyEndpointerParams::SetDefaults() { + frame_period_ = 0.01f; + frame_duration_ = 0.01f; + endpoint_margin_ = 0.2f; + onset_window_ = 0.15f; + speech_on_window_ = 0.4f; + offset_window_ = 0.15f; + onset_detect_dur_ = 0.09f; + onset_confirm_dur_ = 0.075f; + on_maintain_dur_ = 0.10f; + offset_confirm_dur_ = 0.12f; + decision_threshold_ = 150.0f; + min_decision_threshold_ = 50.0f; + fast_update_dur_ = 0.2f; + sample_rate_ = 8000.0f; + min_fundamental_frequency_ = 57.143f; + max_fundamental_frequency_ = 400.0f; + contamination_rejection_period_ = 0.25f; +} + +void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { + frame_period_ = source.frame_period(); + frame_duration_ = source.frame_duration(); + endpoint_margin_ = source.endpoint_margin(); + onset_window_ = source.onset_window(); + speech_on_window_ = source.speech_on_window(); + offset_window_ = source.offset_window(); + onset_detect_dur_ = source.onset_detect_dur(); + onset_confirm_dur_ = source.onset_confirm_dur(); + on_maintain_dur_ = 
source.on_maintain_dur(); + offset_confirm_dur_ = source.offset_confirm_dur(); + decision_threshold_ = source.decision_threshold(); + min_decision_threshold_ = source.min_decision_threshold(); + fast_update_dur_ = source.fast_update_dur(); + sample_rate_ = source.sample_rate(); + min_fundamental_frequency_ = source.min_fundamental_frequency(); + max_fundamental_frequency_ = source.max_fundamental_frequency(); + contamination_rejection_period_ = source.contamination_rejection_period(); +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/energy_endpointer_params.h b/dom/media/webspeech/recognition/energy_endpointer_params.h new file mode 100644 index 0000000000..6437c6dc0f --- /dev/null +++ b/dom/media/webspeech/recognition/energy_endpointer_params.h @@ -0,0 +1,159 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ + +namespace mozilla { + +// Input parameters for the EnergyEndpointer class. +class EnergyEndpointerParams { + public: + EnergyEndpointerParams(); + + void SetDefaults(); + + void operator=(const EnergyEndpointerParams& source); + + // Accessors and mutators + float frame_period() const { return frame_period_; } + void set_frame_period(float frame_period) { + frame_period_ = frame_period; + } + + float frame_duration() const { return frame_duration_; } + void set_frame_duration(float frame_duration) { + frame_duration_ = frame_duration; + } + + float endpoint_margin() const { return endpoint_margin_; } + void set_endpoint_margin(float endpoint_margin) { + endpoint_margin_ = endpoint_margin; + } + + float onset_window() const { return onset_window_; } + void set_onset_window(float onset_window) { onset_window_ = onset_window; } + + float speech_on_window() const { return speech_on_window_; } + void set_speech_on_window(float speech_on_window) { + speech_on_window_ = speech_on_window; + } + + float offset_window() const { return offset_window_; } + void set_offset_window(float offset_window) { + offset_window_ = offset_window; + } + + float onset_detect_dur() const { return onset_detect_dur_; } + void set_onset_detect_dur(float onset_detect_dur) { + onset_detect_dur_ = onset_detect_dur; + } + 
+ float onset_confirm_dur() const { return onset_confirm_dur_; } + void set_onset_confirm_dur(float onset_confirm_dur) { + onset_confirm_dur_ = onset_confirm_dur; + } + + float on_maintain_dur() const { return on_maintain_dur_; } + void set_on_maintain_dur(float on_maintain_dur) { + on_maintain_dur_ = on_maintain_dur; + } + + float offset_confirm_dur() const { return offset_confirm_dur_; } + void set_offset_confirm_dur(float offset_confirm_dur) { + offset_confirm_dur_ = offset_confirm_dur; + } + + float decision_threshold() const { return decision_threshold_; } + void set_decision_threshold(float decision_threshold) { + decision_threshold_ = decision_threshold; + } + + float min_decision_threshold() const { return min_decision_threshold_; } + void set_min_decision_threshold(float min_decision_threshold) { + min_decision_threshold_ = min_decision_threshold; + } + + float fast_update_dur() const { return fast_update_dur_; } + void set_fast_update_dur(float fast_update_dur) { + fast_update_dur_ = fast_update_dur; + } + + float sample_rate() const { return sample_rate_; } + void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } + + float min_fundamental_frequency() const { return min_fundamental_frequency_; } + void set_min_fundamental_frequency(float min_fundamental_frequency) { + min_fundamental_frequency_ = min_fundamental_frequency; + } + + float max_fundamental_frequency() const { return max_fundamental_frequency_; } + void set_max_fundamental_frequency(float max_fundamental_frequency) { + max_fundamental_frequency_ = max_fundamental_frequency; + } + + float contamination_rejection_period() const { + return contamination_rejection_period_; + } + void set_contamination_rejection_period( + float contamination_rejection_period) { + contamination_rejection_period_ = contamination_rejection_period; + } + + private: + float frame_period_; // Frame period + float frame_duration_; // Window size + float onset_window_; // Interval scanned for onset 
activity + float speech_on_window_; // Inverval scanned for ongoing speech + float offset_window_; // Interval scanned for offset evidence + float offset_confirm_dur_; // Silence duration required to confirm offset + float decision_threshold_; // Initial rms detection threshold + float min_decision_threshold_; // Minimum rms detection threshold + float fast_update_dur_; // Period for initial estimation of levels. + float sample_rate_; // Expected sample rate. + + // Time to add on either side of endpoint threshold crossings + float endpoint_margin_; + // Total dur within onset_window required to enter ONSET state + float onset_detect_dur_; + // Total on time within onset_window required to enter SPEECH_ON state + float onset_confirm_dur_; + // Minimum dur in SPEECH_ON state required to maintain ON state + float on_maintain_dur_; + // Minimum fundamental frequency for autocorrelation. + float min_fundamental_frequency_; + // Maximum fundamental frequency for autocorrelation. + float max_fundamental_frequency_; + // Period after start of user input that above threshold values are ignored. + // This is to reject audio feedback contamination. + float contamination_rejection_period_; +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/dom/media/webspeech/recognition/moz.build b/dom/media/webspeech/recognition/moz.build new file mode 100644 index 0000000000..5fdf8fdd47 --- /dev/null +++ b/dom/media/webspeech/recognition/moz.build @@ -0,0 +1,64 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
MOCHITEST_MANIFESTS += ["test/mochitest.ini"]

XPIDL_MODULE = "dom_webspeechrecognition"

XPIDL_SOURCES = ["nsISpeechRecognitionService.idl"]

# Headers exported under the mozilla/dom prefix.
EXPORTS.mozilla.dom += [
    "OnlineSpeechRecognitionService.h",
    "SpeechGrammar.h",
    "SpeechGrammarList.h",
    "SpeechRecognition.h",
    "SpeechRecognitionAlternative.h",
    "SpeechRecognitionResult.h",
    "SpeechRecognitionResultList.h",
    "SpeechTrackListener.h",
]

# The endpointer headers are exported without a namespace prefix.
EXPORTS += [
    "endpointer.h",
    "energy_endpointer.h",
    "energy_endpointer_params.h",
]

# The fake recognition service header is only exported when the test backend
# is configured.
if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]:
    EXPORTS.mozilla.dom += [
        "test/FakeSpeechRecognitionService.h",
    ]

UNIFIED_SOURCES += [
    "endpointer.cc",
    "energy_endpointer.cc",
    "energy_endpointer_params.cc",
    "OnlineSpeechRecognitionService.cpp",
    "SpeechGrammar.cpp",
    "SpeechGrammarList.cpp",
    "SpeechRecognition.cpp",
    "SpeechRecognitionAlternative.cpp",
    "SpeechRecognitionResult.cpp",
    "SpeechRecognitionResultList.cpp",
    "SpeechTrackListener.cpp",
]

# Likewise, the fake service implementation is only compiled for the test
# backend.
if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]:
    UNIFIED_SOURCES += [
        "test/FakeSpeechRecognitionService.cpp",
    ]

USE_LIBS += [
    "jsoncpp",
]

LOCAL_INCLUDES += [
    "/dom/base",
    "/toolkit/components/jsoncpp/include",
]

include("/ipc/chromium/chromium-config.mozbuild")

FINAL_LIBRARY = "xul"
diff --git a/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl
new file mode 100644
index 0000000000..a43d277da0
--- /dev/null
+++ b/dom/media/webspeech/recognition/nsISpeechRecognitionService.idl
@@ -0,0 +1,43 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include "nsISupports.idl"

%{C++
#include "mozilla/WeakPtr.h"

// Forward declarations for the C++ types wrapped by the native typedefs below.
namespace mozilla {
class AudioSegment;
namespace dom {
class SpeechRecognition;
class SpeechRecognitionResultList;
class SpeechGrammarList;
class SpeechGrammar;
}
}
%}

native SpeechRecognitionWeakPtr(mozilla::WeakPtr<mozilla::dom::SpeechRecognition>);
[ptr] native AudioSegmentPtr(mozilla::AudioSegment);
[ptr] native SpeechGrammarPtr(mozilla::dom::SpeechGrammar);
[ptr] native SpeechGrammarListPtr(mozilla::dom::SpeechGrammarList);

// Callback invoked when compilation of a grammar has finished.
[uuid(6fcb6ee8-a6db-49ba-9f06-355d7ee18ea7)]
interface nsISpeechGrammarCompilationCallback : nsISupports {
    void grammarCompilationEnd(in SpeechGrammarPtr grammarObject, in boolean success);
};

// Interface implemented by speech recognition back ends.
[uuid(8e97f287-f322-44e8-8888-8344fa408ef8)]
interface nsISpeechRecognitionService : nsISupports {
    void initialize(in SpeechRecognitionWeakPtr aSpeechRecognition);
    void processAudioSegment(in AudioSegmentPtr aAudioSegment, in long aSampleRate);
    void validateAndSetGrammarList(in SpeechGrammarPtr aSpeechGrammar, in nsISpeechGrammarCompilationCallback aCallback);
    void soundEnd();
    void abort();
};

%{C++
#define NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "@mozilla.org/webspeech/service;1?name="
%}
diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
new file mode 100644
index 0000000000..cf14cb3750
--- /dev/null
+++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
@@ -0,0 +1,118 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "nsThreadUtils.h" + +#include "FakeSpeechRecognitionService.h" + +#include "SpeechRecognition.h" +#include "SpeechRecognitionAlternative.h" +#include "SpeechRecognitionResult.h" +#include "SpeechRecognitionResultList.h" +#include "nsIObserverService.h" +#include "mozilla/Services.h" +#include "mozilla/StaticPrefs_media.h" + +namespace mozilla { + +using namespace dom; + +NS_IMPL_ISUPPORTS(FakeSpeechRecognitionService, nsISpeechRecognitionService, + nsIObserver) + +FakeSpeechRecognitionService::FakeSpeechRecognitionService() = default; + +FakeSpeechRecognitionService::~FakeSpeechRecognitionService() = default; + +NS_IMETHODIMP +FakeSpeechRecognitionService::Initialize( + WeakPtr aSpeechRecognition) { + MOZ_ASSERT(NS_IsMainThread()); + mRecognition = aSpeechRecognition; + nsCOMPtr obs = services::GetObserverService(); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); + obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, + int32_t aSampleRate) { + MOZ_ASSERT(!NS_IsMainThread()); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::SoundEnd() { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::ValidateAndSetGrammarList( + mozilla::dom::SpeechGrammar*, nsISpeechGrammarCompilationCallback*) { + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::Abort() { + MOZ_ASSERT(NS_IsMainThread()); + return NS_OK; +} + +NS_IMETHODIMP +FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(StaticPrefs::media_webspeech_test_fake_recognition_service(), + "Got request to fake recognition service event, but " + "media.webspeech.test.fake_recognition_service is not set"); + + if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { + nsCOMPtr obs = 
services::GetObserverService(); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); + obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); + + return NS_OK; + } + + const nsDependentString eventName = nsDependentString(aData); + + if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_ERROR")) { + mRecognition->DispatchError( + SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, + SpeechRecognitionErrorCode::Network, // TODO different codes? + "RECOGNITIONSERVICE_ERROR test event"); + + } else if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_FINAL_RESULT")) { + RefPtr event = new SpeechEvent( + mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); + + event->mRecognitionResultList = BuildMockResultList(); + NS_DispatchToMainThread(event); + } + return NS_OK; +} + +SpeechRecognitionResultList* +FakeSpeechRecognitionService::BuildMockResultList() { + SpeechRecognitionResultList* resultList = + new SpeechRecognitionResultList(mRecognition); + SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); + if (0 < mRecognition->MaxAlternatives()) { + SpeechRecognitionAlternative* alternative = + new SpeechRecognitionAlternative(mRecognition); + + alternative->mTranscript = u"Mock final result"_ns; + alternative->mConfidence = 0.0f; + + result->mItems.AppendElement(alternative); + } + resultList->mItems.AppendElement(result); + + return resultList; +} + +} // namespace mozilla diff --git a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h new file mode 100644 index 0000000000..69e2786b76 --- /dev/null +++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h @@ -0,0 +1,40 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_FakeSpeechRecognitionService_h +#define mozilla_dom_FakeSpeechRecognitionService_h + +#include "nsCOMPtr.h" +#include "nsIObserver.h" +#include "nsISpeechRecognitionService.h" + +#define NS_FAKE_SPEECH_RECOGNITION_SERVICE_CID \ + {0x48c345e7, \ + 0x9929, \ + 0x4f9a, \ + {0xa5, 0x63, 0xf4, 0x78, 0x22, 0x2d, 0xab, 0xcd}}; + +namespace mozilla { + +class FakeSpeechRecognitionService : public nsISpeechRecognitionService, + public nsIObserver { + public: + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSISPEECHRECOGNITIONSERVICE + NS_DECL_NSIOBSERVER + + FakeSpeechRecognitionService(); + + private: + virtual ~FakeSpeechRecognitionService(); + + WeakPtr mRecognition; + dom::SpeechRecognitionResultList* BuildMockResultList(); +}; + +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/recognition/test/head.js b/dom/media/webspeech/recognition/test/head.js new file mode 100644 index 0000000000..c77a7ee926 --- /dev/null +++ b/dom/media/webspeech/recognition/test/head.js @@ -0,0 +1,200 @@ +"use strict"; + +const DEFAULT_AUDIO_SAMPLE_FILE = "hello.ogg"; +const SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC = + "SpeechRecognitionTest:RequestEvent"; +const SPEECH_RECOGNITION_TEST_END_TOPIC = "SpeechRecognitionTest:End"; + +var errorCodes = { + NO_SPEECH: "no-speech", + ABORTED: "aborted", + AUDIO_CAPTURE: "audio-capture", + NETWORK: "network", + NOT_ALLOWED: "not-allowed", + SERVICE_NOT_ALLOWED: "service-not-allowed", + BAD_GRAMMAR: "bad-grammar", + LANGUAGE_NOT_SUPPORTED: "language-not-supported", +}; + +var Services = SpecialPowers.Services; + +function EventManager(sr) { + var self = this; + var nEventsExpected = 0; + self.eventsReceived = []; + + var allEvents = [ + "audiostart", + "soundstart", + "speechstart", + "speechend", + "soundend", + "audioend", + "result", + "nomatch", + "error", + "start", + "end", + ]; + + var 
eventDependencies = { + speechend: "speechstart", + soundend: "soundstart", + audioend: "audiostart", + }; + + var isDone = false; + + // set up grammar + var sgl = new SpeechGrammarList(); + sgl.addFromString("#JSGF V1.0; grammar test; public = hello ;", 1); + sr.grammars = sgl; + + // AUDIO_DATA events are asynchronous, + // so we queue events requested while they are being + // issued to make them seem synchronous + var isSendingAudioData = false; + var queuedEventRequests = []; + + // register default handlers + for (var i = 0; i < allEvents.length; i++) { + (function (eventName) { + sr["on" + eventName] = function (evt) { + var message = "unexpected event: " + eventName; + if (eventName == "error") { + message += " -- " + evt.message; + } + + ok(false, message); + if (self.doneFunc && !isDone) { + isDone = true; + self.doneFunc(); + } + }; + })(allEvents[i]); + } + + self.expect = function EventManager_expect(eventName, cb) { + nEventsExpected++; + + sr["on" + eventName] = function (evt) { + self.eventsReceived.push(eventName); + ok(true, "received event " + eventName); + + var dep = eventDependencies[eventName]; + if (dep) { + ok( + self.eventsReceived.includes(dep), + eventName + " must come after " + dep + ); + } + + cb && cb(evt, sr); + if ( + self.doneFunc && + !isDone && + nEventsExpected === self.eventsReceived.length + ) { + isDone = true; + self.doneFunc(); + } + }; + }; + + self.start = function EventManager_start() { + isSendingAudioData = true; + var audioTag = document.createElement("audio"); + audioTag.src = self.audioSampleFile; + + var stream = audioTag.mozCaptureStreamUntilEnded(); + audioTag.addEventListener("ended", function () { + info("Sample stream ended, requesting queued events"); + isSendingAudioData = false; + while (queuedEventRequests.length) { + self.requestFSMEvent(queuedEventRequests.shift()); + } + }); + + audioTag.play(); + sr.start(stream); + }; + + self.requestFSMEvent = function EventManager_requestFSMEvent(eventName) { + if 
(isSendingAudioData) { + info( + "Queuing event " + eventName + " until we're done sending audio data" + ); + queuedEventRequests.push(eventName); + return; + } + + info("requesting " + eventName); + Services.obs.notifyObservers( + null, + SPEECH_RECOGNITION_TEST_REQUEST_EVENT_TOPIC, + eventName + ); + }; + + self.requestTestEnd = function EventManager_requestTestEnd() { + Services.obs.notifyObservers(null, SPEECH_RECOGNITION_TEST_END_TOPIC); + }; +} + +function buildResultCallback(transcript) { + return function (evt) { + is(evt.results[0][0].transcript, transcript, "expect correct transcript"); + }; +} + +function buildErrorCallback(errcode) { + return function (err) { + is(err.error, errcode, "expect correct error code"); + }; +} + +function performTest(options) { + var prefs = options.prefs; + + prefs.unshift( + ["media.webspeech.recognition.enable", true], + ["media.webspeech.test.enable", true] + ); + + SpecialPowers.pushPrefEnv({ set: prefs }, function () { + var sr; + if (!options.webkit) { + sr = new SpeechRecognition(); + } else { + sr = new webkitSpeechRecognition(); + var grammar = new webkitSpeechGrammar(); + var speechrecognitionlist = new webkitSpeechGrammarList(); + speechrecognitionlist.addFromString("", 1); + sr.grammars = speechrecognitionlist; + } + var em = new EventManager(sr); + + for (var eventName in options.expectedEvents) { + var cb = options.expectedEvents[eventName]; + em.expect(eventName, cb); + } + + em.doneFunc = function () { + em.requestTestEnd(); + if (options.doneFunc) { + options.doneFunc(); + } + }; + + em.audioSampleFile = DEFAULT_AUDIO_SAMPLE_FILE; + if (options.audioSampleFile) { + em.audioSampleFile = options.audioSampleFile; + } + + em.start(); + + for (var i = 0; i < options.eventsToRequest.length; i++) { + em.requestFSMEvent(options.eventsToRequest[i]); + } + }); +} diff --git a/dom/media/webspeech/recognition/test/hello.ogg b/dom/media/webspeech/recognition/test/hello.ogg new file mode 100644 index 
0000000000..7a80926065 Binary files /dev/null and b/dom/media/webspeech/recognition/test/hello.ogg differ diff --git a/dom/media/webspeech/recognition/test/hello.ogg^headers^ b/dom/media/webspeech/recognition/test/hello.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/hello.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: no-store diff --git a/dom/media/webspeech/recognition/test/http_requesthandler.sjs b/dom/media/webspeech/recognition/test/http_requesthandler.sjs new file mode 100644 index 0000000000..3400df50ec --- /dev/null +++ b/dom/media/webspeech/recognition/test/http_requesthandler.sjs @@ -0,0 +1,85 @@ +const CC = Components.Constructor; + +// Context structure - we need to set this up properly to pass to setObjectState +const ctx = { + QueryInterface(iid) { + if (iid.equals(Components.interfaces.nsISupports)) { + return this; + } + throw Components.Exception("", Components.results.NS_ERROR_NO_INTERFACE); + }, +}; + +function setRequest(request) { + setObjectState(key, request); +} +function getRequest() { + let request; + getObjectState(v => { + request = v; + }); + return request; +} + +function handleRequest(request, response) { + response.processAsync(); + if (request.queryString == "save") { + // Get the context structure and finish the old request + getObjectState("context", function (obj) { + savedCtx = obj.wrappedJSObject; + request = savedCtx.request; + + response.setHeader("Content-Type", "application/octet-stream", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + + const input = request.bodyInputStream; + const output = response.bodyOutputStream; + let bodyAvail; + while ((bodyAvail = input.available()) > 0) { + output.writeFrom(input, bodyAvail); + } + response.finish(); + }); + return; + } else if ( + request.queryString == 
"malformedresult=1" || + request.queryString == "emptyresult=1" + ) { + jsonOK = + request.queryString == "malformedresult=1" + ? '{"status":"ok","dat' + : '{"status":"ok","data":[]}'; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } else if (request.queryString == "hangup=1") { + response.finish(); + } else if (request.queryString == "return400=1") { + jsonOK = "{'message':'Bad header:accept-language-stt'}"; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 400, "Bad Request"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } else { + ctx.wrappedJSObject = ctx; + ctx.request = request; + setObjectState("context", ctx); + jsonOK = '{"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}'; + response.setHeader("Content-Length", String(jsonOK.length), false); + response.setHeader("Content-Type", "application/json", false); + response.setHeader("Access-Control-Allow-Origin", "*", false); + response.setHeader("Cache-Control", "no-cache", false); + response.setStatusLine(request.httpVersion, 200, "OK"); + response.write(jsonOK, jsonOK.length); + response.finish(); + } +} diff --git a/dom/media/webspeech/recognition/test/mochitest.ini b/dom/media/webspeech/recognition/test/mochitest.ini new file mode 100644 index 0000000000..6af13b906c --- /dev/null +++ b/dom/media/webspeech/recognition/test/mochitest.ini @@ -0,0 +1,35 @@ +[DEFAULT] +tags=mtg +subsuite = media 
+support-files = + head.js + hello.ogg + hello.ogg^headers^ + http_requesthandler.sjs + sinoid+hello.ogg + sinoid+hello.ogg^headers^ + silence.ogg + silence.ogg^headers^ +[test_abort.html] +skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363 +[test_audio_capture_error.html] +skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360 +[test_call_start_from_end_handler.html] +tags=capturestream +skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363 +[test_nested_eventloop.html] +skip-if = toolkit == 'android' +[test_online_400_response.html] +[test_online_hangup.html] +[test_online_http.html] +[test_online_http_webkit.html] +[test_online_malformed_result_handling.html] +[test_online_empty_result_handling.html] +[test_preference_enable.html] +[test_recognition_service_error.html] +skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360 +[test_success_without_recognition_service.html] +skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360 +[test_timeout.html] +skip-if = + os == "linux" # Bug 1307991 - low frequency on try pushes diff --git a/dom/media/webspeech/recognition/test/silence.ogg b/dom/media/webspeech/recognition/test/silence.ogg new file mode 100644 index 0000000000..e6da3a5022 Binary files /dev/null and b/dom/media/webspeech/recognition/test/silence.ogg differ diff --git a/dom/media/webspeech/recognition/test/silence.ogg^headers^ b/dom/media/webspeech/recognition/test/silence.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/silence.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: no-store diff --git a/dom/media/webspeech/recognition/test/sinoid+hello.ogg b/dom/media/webspeech/recognition/test/sinoid+hello.ogg new file mode 100644 index 0000000000..7092e82f30 Binary files /dev/null and b/dom/media/webspeech/recognition/test/sinoid+hello.ogg differ diff --git 
a/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ new file mode 100644 index 0000000000..4030ea1d3d --- /dev/null +++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^ @@ -0,0 +1 @@ +Cache-Control: no-store diff --git a/dom/media/webspeech/recognition/test/test_abort.html b/dom/media/webspeech/recognition/test/test_abort.html new file mode 100644 index 0000000000..0f22770cc7 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_abort.html @@ -0,0 +1,73 @@ + + + + + + Test for Bug 650295 -- Call abort from inside handlers + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_audio_capture_error.html b/dom/media/webspeech/recognition/test/test_audio_capture_error.html new file mode 100644 index 0000000000..0c054dbf0b --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_audio_capture_error.html @@ -0,0 +1,42 @@ + + + + + + Test for Bug 650295 -- Behavior on audio error + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html new file mode 100644 index 0000000000..895648ad9e --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html @@ -0,0 +1,102 @@ + + + + + + Test for Bug 650295 -- Restart recognition from end handler + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_nested_eventloop.html b/dom/media/webspeech/recognition/test/test_nested_eventloop.html new file mode 100644 index 0000000000..4924766b44 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_nested_eventloop.html @@ -0,0 +1,82 @@ + + + + + + Test for Bug 650295 -- Spin the event loop from inside a callback + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_400_response.html b/dom/media/webspeech/recognition/test/test_online_400_response.html new file mode 100644 index 0000000000..1a7d0ed452 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_400_response.html @@ -0,0 +1,47 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html new file mode 100644 index 0000000000..46f1e7e0cb --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html @@ -0,0 +1,48 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_hangup.html b/dom/media/webspeech/recognition/test/test_online_hangup.html new file mode 100644 index 0000000000..4a46f80f8f --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_hangup.html @@ -0,0 +1,47 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_http.html b/dom/media/webspeech/recognition/test/test_online_http.html new file mode 100644 index 0000000000..43be7a656a --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_http.html @@ -0,0 +1,89 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_http_webkit.html b/dom/media/webspeech/recognition/test/test_online_http_webkit.html new file mode 100644 index 0000000000..7f6c7e6d7d --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_http_webkit.html @@ -0,0 +1,90 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html new file mode 100644 index 0000000000..b071a46ea3 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html @@ -0,0 +1,48 @@ + + + + + + Test for Bug 1248897 -- Online speech service + + + + + +Mozilla Bug 1248897 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_preference_enable.html b/dom/media/webspeech/recognition/test/test_preference_enable.html new file mode 100644 index 0000000000..2b56f82e2c --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_preference_enable.html @@ -0,0 +1,43 @@ + + + + + + Test for Bug 650295 -- No objects should be visible with preference disabled + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_recognition_service_error.html b/dom/media/webspeech/recognition/test/test_recognition_service_error.html new file mode 100644 index 0000000000..e8e59e2afc --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_recognition_service_error.html @@ -0,0 +1,45 @@ + + + + + + Test for Bug 650295 -- Behavior on recognition service error + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html new file mode 100644 index 0000000000..38748ed5cb --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html @@ -0,0 +1,45 @@ + + + + + + Test for Bug 650295 -- Success with fake recognition service + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/recognition/test/test_timeout.html b/dom/media/webspeech/recognition/test/test_timeout.html new file mode 100644 index 0000000000..8334c9e779 --- /dev/null +++ b/dom/media/webspeech/recognition/test/test_timeout.html @@ -0,0 +1,42 @@ + + + + + + Test for Bug 650295 -- Timeout for user speech + + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/SpeechSynthesis.cpp b/dom/media/webspeech/synth/SpeechSynthesis.cpp new file mode 100644 index 0000000000..20e3ef754b --- /dev/null +++ b/dom/media/webspeech/synth/SpeechSynthesis.cpp @@ -0,0 +1,315 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupportsPrimitives.h" +#include "nsSpeechTask.h" +#include "mozilla/Logging.h" + +#include "mozilla/dom/Element.h" + +#include "mozilla/dom/SpeechSynthesisBinding.h" +#include "mozilla/dom/WindowGlobalChild.h" +#include "SpeechSynthesis.h" +#include "nsContentUtils.h" +#include "nsSynthVoiceRegistry.h" +#include "mozilla/dom/Document.h" +#include "nsIDocShell.h" + +#undef LOG +mozilla::LogModule* GetSpeechSynthLog() { + static mozilla::LazyLogModule sLog("SpeechSynthesis"); + + return sLog; +} +#define LOG(type, msg) MOZ_LOG(GetSpeechSynthLog(), type, msg) + +namespace mozilla::dom { + +NS_IMPL_CYCLE_COLLECTION_CLASS(SpeechSynthesis) + +NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN_INHERITED(SpeechSynthesis, + DOMEventTargetHelper) + NS_IMPL_CYCLE_COLLECTION_UNLINK(mCurrentTask) + NS_IMPL_CYCLE_COLLECTION_UNLINK(mSpeechQueue) + tmp->mVoiceCache.Clear(); + NS_IMPL_CYCLE_COLLECTION_UNLINK_WEAK_REFERENCE +NS_IMPL_CYCLE_COLLECTION_UNLINK_END + +NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN_INHERITED(SpeechSynthesis, + DOMEventTargetHelper) + NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mCurrentTask) + NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mSpeechQueue) + for (SpeechSynthesisVoice* voice : tmp->mVoiceCache.Values()) { + cb.NoteXPCOMChild(voice); + } +NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechSynthesis) + NS_INTERFACE_MAP_ENTRY(nsIObserver) + 
NS_INTERFACE_MAP_ENTRY(nsISupportsWeakReference) +NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper) + +NS_IMPL_ADDREF_INHERITED(SpeechSynthesis, DOMEventTargetHelper) +NS_IMPL_RELEASE_INHERITED(SpeechSynthesis, DOMEventTargetHelper) + +SpeechSynthesis::SpeechSynthesis(nsPIDOMWindowInner* aParent) + : DOMEventTargetHelper(aParent), + mHoldQueue(false), + mInnerID(aParent->WindowID()) { + MOZ_ASSERT(NS_IsMainThread()); + + nsCOMPtr obs = mozilla::services::GetObserverService(); + if (obs) { + obs->AddObserver(this, "inner-window-destroyed", true); + obs->AddObserver(this, "synth-voices-changed", true); + } +} + +SpeechSynthesis::~SpeechSynthesis() = default; + +JSObject* SpeechSynthesis::WrapObject(JSContext* aCx, + JS::Handle aGivenProto) { + return SpeechSynthesis_Binding::Wrap(aCx, this, aGivenProto); +} + +bool SpeechSynthesis::Pending() const { + // If we don't have any task, nothing is pending. If we have only one task, + // check if that task is currently pending. If we have more than one task, + // then the tasks after the first one are definitely pending. + return mSpeechQueue.Length() > 1 || + (mSpeechQueue.Length() == 1 && + (!mCurrentTask || mCurrentTask->IsPending())); +} + +bool SpeechSynthesis::Speaking() const { + // Check global speaking state if there is no active speaking task. 
+ return (!mSpeechQueue.IsEmpty() && HasSpeakingTask()) || + nsSynthVoiceRegistry::GetInstance()->IsSpeaking(); +} + +bool SpeechSynthesis::Paused() const { + return mHoldQueue || (mCurrentTask && mCurrentTask->IsPrePaused()) || + (!mSpeechQueue.IsEmpty() && mSpeechQueue.ElementAt(0)->IsPaused()); +} + +bool SpeechSynthesis::HasEmptyQueue() const { + return mSpeechQueue.Length() == 0; +} + +bool SpeechSynthesis::HasVoices() const { + uint32_t voiceCount = mVoiceCache.Count(); + if (voiceCount == 0) { + nsresult rv = + nsSynthVoiceRegistry::GetInstance()->GetVoiceCount(&voiceCount); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + } + + return voiceCount != 0; +} + +void SpeechSynthesis::Speak(SpeechSynthesisUtterance& aUtterance) { + if (!mInnerID) { + return; + } + + mSpeechQueue.AppendElement(&aUtterance); + + if (mSpeechQueue.Length() == 1) { + RefPtr wgc = + WindowGlobalChild::GetByInnerWindowId(mInnerID); + if (wgc) { + wgc->BlockBFCacheFor(BFCacheStatus::HAS_ACTIVE_SPEECH_SYNTHESIS); + } + + // If we only have one item in the queue, we aren't pre-paused, and + // we have voices available, speak it. + if (!mCurrentTask && !mHoldQueue && HasVoices()) { + AdvanceQueue(); + } + } +} + +void SpeechSynthesis::AdvanceQueue() { + LOG(LogLevel::Debug, + ("SpeechSynthesis::AdvanceQueue length=%zu", mSpeechQueue.Length())); + + if (mSpeechQueue.IsEmpty()) { + return; + } + + RefPtr utterance = mSpeechQueue.ElementAt(0); + + nsAutoString docLang; + nsCOMPtr window = GetOwner(); + if (Document* doc = window ? 
window->GetExtantDoc() : nullptr) { + if (Element* elm = doc->GetHtmlElement()) { + elm->GetLang(docLang); + } + } + + mCurrentTask = + nsSynthVoiceRegistry::GetInstance()->SpeakUtterance(*utterance, docLang); + + if (mCurrentTask) { + mCurrentTask->SetSpeechSynthesis(this); + } +} + +void SpeechSynthesis::Cancel() { + if (!mSpeechQueue.IsEmpty() && HasSpeakingTask()) { + // Remove all queued utterances except for current one, we will remove it + // in OnEnd + mSpeechQueue.RemoveLastElements(mSpeechQueue.Length() - 1); + } else { + mSpeechQueue.Clear(); + } + + if (mCurrentTask) { + mCurrentTask->Cancel(); + } +} + +void SpeechSynthesis::Pause() { + if (Paused()) { + return; + } + + if (!mSpeechQueue.IsEmpty() && HasSpeakingTask()) { + mCurrentTask->Pause(); + } else { + mHoldQueue = true; + } +} + +void SpeechSynthesis::Resume() { + if (!Paused()) { + return; + } + + mHoldQueue = false; + + if (mCurrentTask) { + mCurrentTask->Resume(); + } else { + AdvanceQueue(); + } +} + +void SpeechSynthesis::OnEnd(const nsSpeechTask* aTask) { + MOZ_ASSERT(mCurrentTask == aTask); + + if (!mSpeechQueue.IsEmpty()) { + mSpeechQueue.RemoveElementAt(0); + if (mSpeechQueue.IsEmpty()) { + RefPtr wgc = + WindowGlobalChild::GetByInnerWindowId(mInnerID); + if (wgc) { + wgc->UnblockBFCacheFor(BFCacheStatus::HAS_ACTIVE_SPEECH_SYNTHESIS); + } + } + } + + mCurrentTask = nullptr; + AdvanceQueue(); +} + +void SpeechSynthesis::GetVoices( + nsTArray >& aResult) { + aResult.Clear(); + uint32_t voiceCount = 0; + nsCOMPtr window = GetOwner(); + nsCOMPtr docShell = window ? 
window->GetDocShell() : nullptr; + + if (nsContentUtils::ShouldResistFingerprinting(docShell, + RFPTarget::SpeechSynthesis)) { + return; + } + + nsresult rv = nsSynthVoiceRegistry::GetInstance()->GetVoiceCount(&voiceCount); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + nsISupports* voiceParent = NS_ISUPPORTS_CAST(nsIObserver*, this); + + for (uint32_t i = 0; i < voiceCount; i++) { + nsAutoString uri; + rv = nsSynthVoiceRegistry::GetInstance()->GetVoice(i, uri); + + if (NS_FAILED(rv)) { + NS_WARNING("Failed to retrieve voice from registry"); + continue; + } + + SpeechSynthesisVoice* voice = mVoiceCache.GetWeak(uri); + + if (!voice) { + voice = new SpeechSynthesisVoice(voiceParent, uri); + } + + aResult.AppendElement(voice); + } + + mVoiceCache.Clear(); + + for (uint32_t i = 0; i < aResult.Length(); i++) { + SpeechSynthesisVoice* voice = aResult[i]; + mVoiceCache.InsertOrUpdate(voice->mUri, RefPtr{voice}); + } +} + +// For testing purposes, allows us to cancel the current task that is +// misbehaving, and flush the queue. +void SpeechSynthesis::ForceEnd() { + if (mCurrentTask) { + mCurrentTask->ForceEnd(); + } +} + +NS_IMETHODIMP +SpeechSynthesis::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(NS_IsMainThread()); + + if (strcmp(aTopic, "inner-window-destroyed") == 0) { + nsCOMPtr wrapper = do_QueryInterface(aSubject); + NS_ENSURE_TRUE(wrapper, NS_ERROR_FAILURE); + + uint64_t innerID; + nsresult rv = wrapper->GetData(&innerID); + NS_ENSURE_SUCCESS(rv, rv); + + if (innerID == mInnerID) { + mInnerID = 0; + Cancel(); + + nsCOMPtr obs = + mozilla::services::GetObserverService(); + if (obs) { + obs->RemoveObserver(this, "inner-window-destroyed"); + } + } + } else if (strcmp(aTopic, "synth-voices-changed") == 0) { + LOG(LogLevel::Debug, ("SpeechSynthesis::onvoiceschanged")); + nsCOMPtr window = GetOwner(); + nsCOMPtr docShell = window ? 
window->GetDocShell() : nullptr; + + if (!nsContentUtils::ShouldResistFingerprinting( + docShell, RFPTarget::SpeechSynthesis)) { + DispatchTrustedEvent(u"voiceschanged"_ns); + // If we have a pending item, and voices become available, speak it. + if (!mCurrentTask && !mHoldQueue && HasVoices()) { + AdvanceQueue(); + } + } + } + + return NS_OK; +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/SpeechSynthesis.h b/dom/media/webspeech/synth/SpeechSynthesis.h new file mode 100644 index 0000000000..1227261b59 --- /dev/null +++ b/dom/media/webspeech/synth/SpeechSynthesis.h @@ -0,0 +1,88 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechSynthesis_h +#define mozilla_dom_SpeechSynthesis_h + +#include "nsCOMPtr.h" +#include "nsIObserver.h" +#include "nsRefPtrHashtable.h" +#include "nsString.h" +#include "nsWeakReference.h" +#include "nsWrapperCache.h" +#include "js/TypeDecls.h" + +#include "SpeechSynthesisUtterance.h" +#include "SpeechSynthesisVoice.h" + +class nsIDOMWindow; + +namespace mozilla::dom { + +class nsSpeechTask; + +class SpeechSynthesis final : public DOMEventTargetHelper, + public nsIObserver, + public nsSupportsWeakReference { + public: + explicit SpeechSynthesis(nsPIDOMWindowInner* aParent); + + NS_DECL_ISUPPORTS_INHERITED + NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechSynthesis, + DOMEventTargetHelper) + NS_DECL_NSIOBSERVER + + JSObject* WrapObject(JSContext* aCx, + JS::Handle aGivenProto) override; + + bool Pending() const; + + bool Speaking() const; + + bool Paused() const; + + bool HasEmptyQueue() const; + + void Speak(SpeechSynthesisUtterance& aUtterance); + + void Cancel(); + + void Pause(); + + void Resume(); + + void 
OnEnd(const nsSpeechTask* aTask); + + void GetVoices(nsTArray >& aResult); + + void ForceEnd(); + + IMPL_EVENT_HANDLER(voiceschanged) + + private: + virtual ~SpeechSynthesis(); + + void AdvanceQueue(); + + bool HasVoices() const; + + bool HasSpeakingTask() const { + return mCurrentTask && mCurrentTask->IsSpeaking(); + } + + nsTArray > mSpeechQueue; + + RefPtr mCurrentTask; + + nsRefPtrHashtable mVoiceCache; + + bool mHoldQueue; + + uint64_t mInnerID; +}; + +} // namespace mozilla::dom +#endif diff --git a/dom/media/webspeech/synth/SpeechSynthesisUtterance.cpp b/dom/media/webspeech/synth/SpeechSynthesisUtterance.cpp new file mode 100644 index 0000000000..4d8dcd5c12 --- /dev/null +++ b/dom/media/webspeech/synth/SpeechSynthesisUtterance.cpp @@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/
+
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+#include "nsGkAtoms.h"
+
+#include "mozilla/dom/SpeechSynthesisEvent.h"
+#include "mozilla/dom/SpeechSynthesisUtteranceBinding.h"
+#include "SpeechSynthesisUtterance.h"
+#include "SpeechSynthesisVoice.h"
+
+#include <algorithm>
+
+namespace mozilla::dom {
+
+NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechSynthesisUtterance,
+                                   DOMEventTargetHelper, mVoice);
+
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechSynthesisUtterance)
+NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)
+
+NS_IMPL_ADDREF_INHERITED(SpeechSynthesisUtterance, DOMEventTargetHelper)
+NS_IMPL_RELEASE_INHERITED(SpeechSynthesisUtterance, DOMEventTargetHelper)
+
+// Creates an utterance owned by aOwnerWindow with volume, rate and pitch all
+// set to 1. The fingerprinting-resistance flag is sampled once, here.
+SpeechSynthesisUtterance::SpeechSynthesisUtterance(
+    nsPIDOMWindowInner* aOwnerWindow, const nsAString& text)
+    : DOMEventTargetHelper(aOwnerWindow),
+      mText(text),
+      mVolume(1),
+      mRate(1),
+      mPitch(1),
+      mPaused(false),
+      mShouldResistFingerprinting(
+          aOwnerWindow->AsGlobal()->ShouldResistFingerprinting(
+              RFPTarget::SpeechSynthesis)) {}
+
+SpeechSynthesisUtterance::~SpeechSynthesisUtterance() = default;
+
+JSObject* SpeechSynthesisUtterance::WrapObject(
+    JSContext* aCx, JS::Handle<JSObject*> aGivenProto) {
+  return SpeechSynthesisUtterance_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+nsISupports* SpeechSynthesisUtterance::GetParentObject() const {
+  return GetOwner();
+}
+
+// Constructor without text: forwards to the text overload with an empty
+// string.
+already_AddRefed<SpeechSynthesisUtterance>
+SpeechSynthesisUtterance::Constructor(GlobalObject& aGlobal, ErrorResult& aRv) {
+  return Constructor(aGlobal, u""_ns, aRv);
+}
+
+// Constructor with text. Throws NS_ERROR_FAILURE on aRv and returns null when
+// the global is not an inner window.
+already_AddRefed<SpeechSynthesisUtterance>
+SpeechSynthesisUtterance::Constructor(GlobalObject& aGlobal,
+                                      const nsAString& aText,
+                                      ErrorResult& aRv) {
+  nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());
+
+  if (!win) {
+    aRv.Throw(NS_ERROR_FAILURE);
+    return nullptr;
+  }
+
+  RefPtr<SpeechSynthesisUtterance> object =
+      new SpeechSynthesisUtterance(win, aText);
+  return object.forget();
+}
+
+void SpeechSynthesisUtterance::GetText(nsString& aResult) const {
+  aResult = mText;
+}
+
+void SpeechSynthesisUtterance::SetText(const nsAString& aText) {
+  mText = aText;
+}
+
+void SpeechSynthesisUtterance::GetLang(nsString& aResult) const {
+  aResult = mLang;
+}
+
+void SpeechSynthesisUtterance::SetLang(const nsAString& aLang) {
+  mLang = aLang;
+}
+
+SpeechSynthesisVoice* SpeechSynthesisUtterance::GetVoice() const {
+  return mVoice;
+}
+
+void SpeechSynthesisUtterance::SetVoice(SpeechSynthesisVoice* aVoice) {
+  mVoice = aVoice;
+}
+
+float SpeechSynthesisUtterance::Volume() const { return mVolume; }
+
+// Clamps the volume to [0, 1]. The explicit <float> is required: the bounds
+// are int literals, so template argument deduction would otherwise fail.
+void SpeechSynthesisUtterance::SetVolume(float aVolume) {
+  mVolume = std::max<float>(std::min<float>(aVolume, 1), 0);
+}
+
+float SpeechSynthesisUtterance::Rate() const { return mRate; }
+
+// Clamps the rate to [0.1, 10].
+void SpeechSynthesisUtterance::SetRate(float aRate) {
+  mRate = std::max<float>(std::min<float>(aRate, 10), 0.1f);
+}
+
+float SpeechSynthesisUtterance::Pitch() const { return mPitch; }
+
+// Clamps the pitch to [0, 2].
+void SpeechSynthesisUtterance::SetPitch(float aPitch) {
+  mPitch = std::max<float>(std::min<float>(aPitch, 2), 0);
+}
+
+void SpeechSynthesisUtterance::GetChosenVoiceURI(nsString& aResult) const {
+  aResult = mChosenVoiceURI;
+}
+
+// Builds a non-bubbling, non-cancelable SpeechSynthesisEvent of type
+// aEventType that targets this utterance and dispatches it as trusted.
+void SpeechSynthesisUtterance::DispatchSpeechSynthesisEvent(
+    const nsAString& aEventType, uint32_t aCharIndex,
+    const Nullable<uint32_t>& aCharLength, float aElapsedTime,
+    const nsAString& aName) {
+  SpeechSynthesisEventInit init;
+  init.mBubbles = false;
+  init.mCancelable = false;
+  init.mUtterance = this;
+  init.mCharIndex = aCharIndex;
+  init.mCharLength = aCharLength;
+  init.mElapsedTime = aElapsedTime;
+  init.mName = aName;
+
+  RefPtr<SpeechSynthesisEvent> event =
+      SpeechSynthesisEvent::Constructor(this, aEventType, init);
+  DispatchTrustedEvent(event);
+}
+
+}  // namespace mozilla::dom
diff --git a/dom/media/webspeech/synth/SpeechSynthesisUtterance.h b/dom/media/webspeech/synth/SpeechSynthesisUtterance.h
new file mode 100644
index 0000000000..17958a3b32
--- /dev/null
+++ b/dom/media/webspeech/synth/SpeechSynthesisUtterance.h
@@ -0,0 +1,115 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2
-*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechSynthesisUtterance_h
+#define mozilla_dom_SpeechSynthesisUtterance_h
+
+#include "mozilla/DOMEventTargetHelper.h"
+#include "nsCOMPtr.h"
+#include "nsString.h"
+#include "js/TypeDecls.h"
+
+#include "nsSpeechTask.h"
+
+namespace mozilla::dom {
+
+class SpeechSynthesisVoice;
+class SpeechSynthesis;
+class nsSynthVoiceRegistry;
+
+// DOM object holding the text and the voice, volume, rate and pitch settings
+// for one speech request. It is also the event target for that request's
+// start/end/error/pause/resume/mark/boundary events.
+class SpeechSynthesisUtterance final : public DOMEventTargetHelper {
+  friend class SpeechSynthesis;
+  friend class nsSpeechTask;
+  friend class nsSynthVoiceRegistry;
+
+ public:
+  SpeechSynthesisUtterance(nsPIDOMWindowInner* aOwnerWindow,
+                           const nsAString& aText);
+
+  NS_DECL_ISUPPORTS_INHERITED
+  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechSynthesisUtterance,
+                                           DOMEventTargetHelper)
+
+  nsISupports* GetParentObject() const;
+
+  JSObject* WrapObject(JSContext* aCx,
+                       JS::Handle<JSObject*> aGivenProto) override;
+
+  static already_AddRefed<SpeechSynthesisUtterance> Constructor(
+      GlobalObject& aGlobal, ErrorResult& aRv);
+  static already_AddRefed<SpeechSynthesisUtterance> Constructor(
+      GlobalObject& aGlobal, const nsAString& aText, ErrorResult& aRv);
+
+  void GetText(nsString& aResult) const;
+
+  void SetText(const nsAString& aText);
+
+  void GetLang(nsString& aResult) const;
+
+  void SetLang(const nsAString& aLang);
+
+  SpeechSynthesisVoice* GetVoice() const;
+
+  void SetVoice(SpeechSynthesisVoice* aVoice);
+
+  float Volume() const;
+
+  void SetVolume(float aVolume);
+
+  float Rate() const;
+
+  void SetRate(float aRate);
+
+  float Pitch() const;
+
+  void SetPitch(float aPitch);
+
+  void GetChosenVoiceURI(nsString& aResult) const;
+
+  // Pure getter, so it is const like the other accessors.
+  bool IsPaused() const { return mPaused; }
+
+  bool ShouldResistFingerprinting() const {
+    return mShouldResistFingerprinting;
+  }
+
+  IMPL_EVENT_HANDLER(start)
+  IMPL_EVENT_HANDLER(end)
+  IMPL_EVENT_HANDLER(error)
+  IMPL_EVENT_HANDLER(pause)
+  IMPL_EVENT_HANDLER(resume)
+  IMPL_EVENT_HANDLER(mark)
+  IMPL_EVENT_HANDLER(boundary)
+
+ private:
+  virtual ~SpeechSynthesisUtterance();
+
+  void DispatchSpeechSynthesisEvent(const nsAString& aEventType,
+                                    uint32_t aCharIndex,
+                                    const Nullable<uint32_t>& aCharLength,
+                                    float aElapsedTime, const nsAString& aName);
+
+  nsString mText;
+
+  nsString mLang;
+
+  float mVolume;
+
+  float mRate;
+
+  float mPitch;
+
+  nsString mChosenVoiceURI;
+
+  bool mPaused;
+
+  RefPtr<SpeechSynthesisVoice> mVoice;
+
+  bool mShouldResistFingerprinting;
+};
+
+}  // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/synth/SpeechSynthesisVoice.cpp b/dom/media/webspeech/synth/SpeechSynthesisVoice.cpp
new file mode 100644
index 0000000000..a309daca26
--- /dev/null
+++ b/dom/media/webspeech/synth/SpeechSynthesisVoice.cpp
@@ -0,0 +1,72 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
+
+#include "SpeechSynthesis.h"
+#include "nsSynthVoiceRegistry.h"
+#include "mozilla/dom/SpeechSynthesisVoiceBinding.h"
+
+namespace mozilla::dom {
+
+NS_IMPL_CYCLE_COLLECTION_WRAPPERCACHE(SpeechSynthesisVoice, mParent)
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechSynthesisVoice)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechSynthesisVoice)
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechSynthesisVoice)
+  NS_WRAPPERCACHE_INTERFACE_MAP_ENTRY
+  NS_INTERFACE_MAP_ENTRY(nsISupports)
+NS_INTERFACE_MAP_END
+
+SpeechSynthesisVoice::SpeechSynthesisVoice(nsISupports* aParent,
+                                           const nsAString& aUri)
+    : mParent(aParent), mUri(aUri) {}
+
+SpeechSynthesisVoice::~SpeechSynthesisVoice() = default;
+
+JSObject* SpeechSynthesisVoice::WrapObject(JSContext* aCx,
+                                           JS::Handle<JSObject*> aGivenProto) {
+  return SpeechSynthesisVoice_Binding::Wrap(aCx, this, aGivenProto);
+}
+
+nsISupports* SpeechSynthesisVoice::GetParentObject() const { return mParent; }
+
+void SpeechSynthesisVoice::GetVoiceURI(nsString& aRetval) const {
+  aRetval = mUri;
+}
+
+// Looks the voice name up in the registry by URI; a failed lookup only warns.
+void SpeechSynthesisVoice::GetName(nsString& aRetval) const {
+  DebugOnly<nsresult> rv =
+      nsSynthVoiceRegistry::GetInstance()->GetVoiceName(mUri, aRetval);
+  NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                       "Failed to get SpeechSynthesisVoice.name");
+}
+
+// Looks the voice language up in the registry by URI; a failed lookup only
+// warns.
+void SpeechSynthesisVoice::GetLang(nsString& aRetval) const {
+  DebugOnly<nsresult> rv =
+      nsSynthVoiceRegistry::GetInstance()->GetVoiceLang(mUri, aRetval);
+  NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                       "Failed to get SpeechSynthesisVoice.lang");
+}
+
+// Returns whether the registry reports this voice as local.
+bool SpeechSynthesisVoice::LocalService() const {
+  // Initialized so that a failed lookup (which only warns) returns false
+  // rather than an indeterminate value.
+  bool isLocal = false;
+  DebugOnly<nsresult> rv =
+      nsSynthVoiceRegistry::GetInstance()->IsLocalVoice(mUri, &isLocal);
+  NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                       "Failed to get SpeechSynthesisVoice.localService");
+
+  return isLocal;
+}
+
+// Returns whether the registry reports this voice as a default voice.
+bool SpeechSynthesisVoice::Default() const {
+  // Initialized for the same reason as in LocalService().
+  bool isDefault = false;
+  DebugOnly<nsresult> rv =
+      nsSynthVoiceRegistry::GetInstance()->IsDefaultVoice(mUri, &isDefault);
+  NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                       "Failed to get SpeechSynthesisVoice.default");
+
+  return isDefault;
+}
+
+}  // namespace mozilla::dom
diff --git a/dom/media/webspeech/synth/SpeechSynthesisVoice.h b/dom/media/webspeech/synth/SpeechSynthesisVoice.h
new file mode 100644
index 0000000000..079e5f49ea
--- /dev/null
+++ b/dom/media/webspeech/synth/SpeechSynthesisVoice.h
@@ -0,0 +1,55 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechSynthesisVoice_h
+#define mozilla_dom_SpeechSynthesisVoice_h
+
+#include "nsCOMPtr.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "js/TypeDecls.h"
+
+namespace mozilla::dom {
+
+class nsSynthVoiceRegistry;
+class SpeechSynthesis;
+
+// DOM object for one synthesis voice. It stores only the voice URI; name,
+// language and the local/default flags are read from nsSynthVoiceRegistry on
+// each call.
+class SpeechSynthesisVoice final : public nsISupports, public nsWrapperCache {
+  friend class nsSynthVoiceRegistry;
+  friend class SpeechSynthesis;
+
+ public:
+  SpeechSynthesisVoice(nsISupports* aParent, const nsAString& aUri);
+
+  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+  NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(SpeechSynthesisVoice)
+
+  nsISupports* GetParentObject() const;
+
+  JSObject* WrapObject(JSContext* aCx,
+                       JS::Handle<JSObject*> aGivenProto) override;
+
+  void GetVoiceURI(nsString& aRetval) const;
+
+  void GetName(nsString& aRetval) const;
+
+  void GetLang(nsString& aRetval) const;
+
+  bool LocalService() const;
+
+  bool Default() const;
+
+ private:
+  virtual ~SpeechSynthesisVoice();
+
+  nsCOMPtr<nsISupports> mParent;
+
+  nsString mUri;
+};
+
+}  // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/synth/android/SpeechSynthesisService.cpp b/dom/media/webspeech/synth/android/SpeechSynthesisService.cpp
new file mode 100644
index 0000000000..1b6e4b6125
--- /dev/null
+++ b/dom/media/webspeech/synth/android/SpeechSynthesisService.cpp
@@
-0,0 +1,215 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechSynthesisService.h"
+
+#include <android/log.h>
+
+#include "nsXULAppAPI.h"
+#include "mozilla/ClearOnShutdown.h"
+#include "mozilla/dom/nsSynthVoiceRegistry.h"
+#include "mozilla/jni/Utils.h"
+#include "mozilla/Preferences.h"
+#include "mozilla/StaticPrefs_media.h"
+
+#define ALOG(args...) \
+  __android_log_print(ANDROID_LOG_INFO, "GeckoSpeechSynthesis", ##args)
+
+namespace mozilla {
+namespace dom {
+
+StaticRefPtr<SpeechSynthesisService> SpeechSynthesisService::sSingleton;
+
+// Task callback for the Android backend. Only cancel is forwarded to the Java
+// service; resume, pause and volume changes are accepted and ignored.
+class AndroidSpeechCallback final : public nsISpeechTaskCallback {
+ public:
+  AndroidSpeechCallback() {}
+
+  NS_DECL_ISUPPORTS
+
+  NS_IMETHOD OnResume() override { return NS_OK; }
+
+  NS_IMETHOD OnPause() override { return NS_OK; }
+
+  NS_IMETHOD OnCancel() override {
+    java::SpeechSynthesisService::Stop();
+    return NS_OK;
+  }
+
+  NS_IMETHOD OnVolumeChanged(float aVolume) override { return NS_OK; }
+
+ private:
+  ~AndroidSpeechCallback() {}
+};
+
+NS_IMPL_ISUPPORTS(AndroidSpeechCallback, nsISpeechTaskCallback)
+
+NS_IMPL_ISUPPORTS(SpeechSynthesisService, nsISpeechService)
+
+// Initializes the native bindings and the Java synthesizer. Does nothing when
+// synthesis is disabled by pref, when the test pref is set, or when JNI is
+// unavailable.
+void SpeechSynthesisService::Setup() {
+  ALOG("SpeechSynthesisService::Setup");
+
+  if (!StaticPrefs::media_webspeech_synth_enabled() ||
+      Preferences::GetBool("media.webspeech.synth.test")) {
+    return;
+  }
+
+  if (!jni::IsAvailable()) {
+    NS_WARNING("Failed to initialize speech synthesis");
+    return;
+  }
+
+  Init();
+  java::SpeechSynthesisService::InitSynth();
+}
+
+// nsISpeechService
+
+// Starts speaking aText with the voice aUri. Returns NS_ERROR_NOT_AVAILABLE
+// when a task is already in flight or the Java side returns no utterance ID.
+NS_IMETHODIMP
+SpeechSynthesisService::Speak(const nsAString& aText, const nsAString& aUri,
+                              float aVolume, float aRate, float aPitch,
+                              nsISpeechTask* aTask) {
+  if (mTask) {
+    NS_WARNING("Service only supports one speech task at a time.");
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  RefPtr<AndroidSpeechCallback> callback = new AndroidSpeechCallback();
+  nsresult rv = aTask->Setup(callback);
+
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  jni::String::LocalRef utteranceId =
+      java::SpeechSynthesisService::Speak(aUri, aText, aRate, aPitch, aVolume);
+  if (!utteranceId) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  mTaskUtteranceId = utteranceId->ToCString();
+  mTask = aTask;
+  mTaskTextLength = aText.Length();
+  mTaskTextOffset = 0;
+
+  return NS_OK;
+}
+
+// Returns the singleton, creating and setting it up on first use when aCreate
+// is true. Returns null (and asserts in debug builds) outside the main gecko
+// process.
+SpeechSynthesisService* SpeechSynthesisService::GetInstance(bool aCreate) {
+  if (XRE_GetProcessType() != GeckoProcessType_Default) {
+    MOZ_ASSERT(
+        false,
+        "SpeechSynthesisService can only be started on main gecko process");
+    return nullptr;
+  }
+
+  if (!sSingleton && aCreate) {
+    sSingleton = new SpeechSynthesisService();
+    sSingleton->Setup();
+    ClearOnShutdown(&sSingleton);
+  }
+
+  return sSingleton;
+}
+
+already_AddRefed<SpeechSynthesisService>
+SpeechSynthesisService::GetInstanceForService() {
+  MOZ_ASSERT(NS_IsMainThread());
+  RefPtr<SpeechSynthesisService> service = GetInstance();
+  return service.forget();
+}
+
+// JNI
+
+// Adds one voice reported by the Java side to the voice registry and, when
+// aIsDefault is set, marks it as a default voice.
+void SpeechSynthesisService::RegisterVoice(jni::String::Param aUri,
+                                           jni::String::Param aName,
+                                           jni::String::Param aLocale,
+                                           bool aIsNetwork, bool aIsDefault) {
+  nsSynthVoiceRegistry* registry = nsSynthVoiceRegistry::GetInstance();
+  SpeechSynthesisService* service = SpeechSynthesisService::GetInstance(false);
+  // This service can only speak one utterance at a time, so we set
+  // aQueuesUtterances to true in order to track global state and schedule
+  // access to this service.
+  DebugOnly<nsresult> rv =
+      registry->AddVoice(service, aUri->ToString(), aName->ToString(),
+                         aLocale->ToString(), !aIsNetwork, true);
+
+  NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Failed to add voice");
+
+  if (aIsDefault) {
+    // Named separately so it does not shadow the AddVoice result above.
+    DebugOnly<nsresult> defaultRv =
+        registry->SetDefaultVoice(aUri->ToString(), true);
+
+    NS_WARNING_ASSERTION(NS_SUCCEEDED(defaultRv),
+                         "Failed to set voice as default");
+  }
+}
+
+void SpeechSynthesisService::DoneRegisteringVoices() {
+  nsSynthVoiceRegistry* registry = nsSynthVoiceRegistry::GetInstance();
+  registry->NotifyVoicesChanged();
+}
+
+// Records the start time and forwards "start" to the current task, if any.
+void SpeechSynthesisService::DispatchStart(jni::String::Param aUtteranceId) {
+  if (sSingleton) {
+    MOZ_ASSERT(sSingleton->mTaskUtteranceId.Equals(aUtteranceId->ToCString()));
+    nsCOMPtr<nsISpeechTask> task = sSingleton->mTask;
+    if (task) {
+      sSingleton->mTaskStartTime = TimeStamp::Now();
+      DebugOnly<nsresult> rv = task->DispatchStart();
+      NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Unable to dispatch start");
+    }
+  }
+}
+
+// Clears the current task and forwards "end" to it with the elapsed time and
+// the full text length.
+void SpeechSynthesisService::DispatchEnd(jni::String::Param aUtteranceId) {
+  if (sSingleton) {
+    // In API older than 23, we will sometimes call this function
+    // without providing an utterance ID.
+    MOZ_ASSERT(!aUtteranceId ||
+               sSingleton->mTaskUtteranceId.Equals(aUtteranceId->ToCString()));
+    nsCOMPtr<nsISpeechTask> task = sSingleton->mTask;
+    sSingleton->mTask = nullptr;
+    if (task) {
+      TimeStamp startTime = sSingleton->mTaskStartTime;
+      DebugOnly<nsresult> rv =
+          task->DispatchEnd((TimeStamp::Now() - startTime).ToSeconds(),
+                            sSingleton->mTaskTextLength);
+      NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Unable to dispatch end");
+    }
+  }
+}
+
+// Clears the current task and forwards "error" to it with the elapsed time
+// and the last boundary offset.
+void SpeechSynthesisService::DispatchError(jni::String::Param aUtteranceId) {
+  if (sSingleton) {
+    MOZ_ASSERT(sSingleton->mTaskUtteranceId.Equals(aUtteranceId->ToCString()));
+    nsCOMPtr<nsISpeechTask> task = sSingleton->mTask;
+    sSingleton->mTask = nullptr;
+    if (task) {
+      TimeStamp startTime = sSingleton->mTaskStartTime;
+      DebugOnly<nsresult> rv =
+          task->DispatchError((TimeStamp::Now() - startTime).ToSeconds(),
+                              sSingleton->mTaskTextOffset);
+      NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Unable to dispatch error");
+    }
+  }
+}
+
+// Remembers aStart as the current offset and forwards a "word" boundary
+// covering [aStart, aEnd) to the current task, if any.
+void SpeechSynthesisService::DispatchBoundary(jni::String::Param aUtteranceId,
+                                              int32_t aStart, int32_t aEnd) {
+  if (sSingleton) {
+    MOZ_ASSERT(sSingleton->mTaskUtteranceId.Equals(aUtteranceId->ToCString()));
+    nsCOMPtr<nsISpeechTask> task = sSingleton->mTask;
+    if (task) {
+      TimeStamp startTime = sSingleton->mTaskStartTime;
+      sSingleton->mTaskTextOffset = aStart;
+      DebugOnly<nsresult> rv = task->DispatchBoundary(
+          u"word"_ns, (TimeStamp::Now() - startTime).ToSeconds(), aStart,
+          aEnd - aStart, 1);
+      NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Unable to dispatch boundary");
+    }
+  }
+}
+
+}  // namespace dom
+}  // namespace mozilla
diff --git a/dom/media/webspeech/synth/android/SpeechSynthesisService.h b/dom/media/webspeech/synth/android/SpeechSynthesisService.h
new file mode 100644
index 0000000000..98c5143cf6
--- /dev/null
+++ b/dom/media/webspeech/synth/android/SpeechSynthesisService.h
@@ -0,0 +1,68 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the
terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechSynthesisService_h
+#define mozilla_dom_SpeechSynthesisService_h
+
+#include "nsISpeechService.h"
+#include "mozilla/java/SpeechSynthesisServiceNatives.h"
+#include "mozilla/StaticPtr.h"
+
+namespace mozilla {
+namespace dom {
+
+// Android speech backend. Implements nsISpeechService and receives the static
+// Register*/Dispatch* callbacks from the Java SpeechSynthesisService. It
+// tracks a single in-flight task at a time.
+class SpeechSynthesisService final
+    : public nsISpeechService,
+      public java::SpeechSynthesisService::Natives<SpeechSynthesisService> {
+ public:
+  NS_DECL_ISUPPORTS
+  NS_DECL_NSISPEECHSERVICE
+
+  SpeechSynthesisService() {}
+
+  void Setup();
+
+  static void DoneRegisteringVoices();
+
+  static void RegisterVoice(jni::String::Param aUri, jni::String::Param aName,
+                            jni::String::Param aLocale, bool aIsNetwork,
+                            bool aIsDefault);
+
+  static void DispatchStart(jni::String::Param aUtteranceId);
+
+  static void DispatchEnd(jni::String::Param aUtteranceId);
+
+  static void DispatchError(jni::String::Param aUtteranceId);
+
+  static void DispatchBoundary(jni::String::Param aUtteranceId, int32_t aStart,
+                               int32_t aEnd);
+
+  static SpeechSynthesisService* GetInstance(bool aCreate = true);
+  static already_AddRefed<SpeechSynthesisService> GetInstanceForService();
+
+  static StaticRefPtr<SpeechSynthesisService> sSingleton;
+
+ private:
+  virtual ~SpeechSynthesisService() {}
+
+  nsCOMPtr<nsISpeechTask> mTask;
+
+  // Unique ID assigned to utterance when it is sent to system service.
+  nsCString mTaskUtteranceId;
+
+  // Time stamp from the moment the utterance is started.
+  TimeStamp mTaskStartTime;
+
+  // Length of text of the utterance. Zero-initialized so it is never read
+  // indeterminate before the first Speak() call.
+  uint32_t mTaskTextLength = 0;
+
+  // Current offset in characters of what has been spoken. Zero-initialized
+  // for the same reason.
+  uint32_t mTaskTextOffset = 0;
+};
+
+}  // namespace dom
+}  // namespace mozilla
+#endif
diff --git a/dom/media/webspeech/synth/android/components.conf b/dom/media/webspeech/synth/android/components.conf
new file mode 100644
index 0000000000..4c35954fcc
--- /dev/null
+++ b/dom/media/webspeech/synth/android/components.conf
@@ -0,0 +1,17 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+Classes = [
+    {
+        'cid': '{311b2dab-f4d3-4be4-8123-6732313d95c2}',
+        'contract_ids': ['@mozilla.org/androidspeechsynth;1'],
+        'singleton': True,
+        'type': 'mozilla::dom::SpeechSynthesisService',
+        'headers': ['/dom/media/webspeech/synth/android/SpeechSynthesisService.h'],
+        'constructor': 'mozilla::dom::SpeechSynthesisService::GetInstanceForService',
+        'categories': {"speech-synth-started": 'Android Speech Synth'},
+    },
+]
diff --git a/dom/media/webspeech/synth/android/moz.build b/dom/media/webspeech/synth/android/moz.build
new file mode 100644
index 0000000000..348c157f3c
--- /dev/null
+++ b/dom/media/webspeech/synth/android/moz.build
@@ -0,0 +1,19 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS.mozilla.dom += ["SpeechSynthesisService.h"]
+
+UNIFIED_SOURCES += [
+    "SpeechSynthesisService.cpp",
+]
+
+XPCOM_MANIFESTS += [
+    "components.conf",
+]
+
+include("/ipc/chromium/chromium-config.mozbuild")
+
+FINAL_LIBRARY = "xul"
diff --git a/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.h b/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.h
new file mode 100644
index 0000000000..6148d59c92
--- /dev/null
+++ b/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.h
@@ -0,0 +1,42 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_OsxSpeechSynthesizerService_h
+#define mozilla_dom_OsxSpeechSynthesizerService_h
+
+#include "nsISpeechService.h"
+#include "nsIObserver.h"
+#include "mozilla/StaticPtr.h"
+
+namespace mozilla {
+namespace dom {
+
+// macOS speech backend: an nsISpeechService (and nsIObserver) exposed as a
+// process-wide singleton through GetInstance().
+class OSXSpeechSynthesizerService final : public nsISpeechService,
+                                          public nsIObserver {
+ public:
+  NS_DECL_THREADSAFE_ISUPPORTS
+  NS_DECL_NSISPEECHSERVICE
+  NS_DECL_NSIOBSERVER
+
+  bool Init();
+
+  static OSXSpeechSynthesizerService* GetInstance();
+  static already_AddRefed<OSXSpeechSynthesizerService> GetInstanceForService();
+
+ private:
+  OSXSpeechSynthesizerService();
+  virtual ~OSXSpeechSynthesizerService() = default;
+
+  bool RegisterVoices();
+
+  bool mInitialized;
+  static mozilla::StaticRefPtr<OSXSpeechSynthesizerService> sSingleton;
+};
+
+}  // namespace dom
+}  // namespace mozilla
+
+#endif
diff --git a/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.mm b/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.mm
new file mode 100644
index 0000000000..a815c68644
--- /dev/null
+++ b/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.mm
@@ -0,0 +1,431 @@
+/* -*- Mode: Objective-C++; tab-width: 2;
indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=2 sw=2 et tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsISupports.h"
+#include "nsServiceManagerUtils.h"
+#include "nsObjCExceptions.h"
+#include "nsCocoaUtils.h"
+#include "nsIThread.h"
+#include "nsThreadUtils.h"
+#include "nsXULAppAPI.h"
+#include "mozilla/ClearOnShutdown.h"
+#include "mozilla/dom/nsSynthVoiceRegistry.h"
+#include "mozilla/dom/nsSpeechTask.h"
+#include "mozilla/Preferences.h"
+#include "mozilla/StaticPrefs_media.h"
+#include "mozilla/Assertions.h"
+#include "OSXSpeechSynthesizerService.h"
+
+#import <Cocoa/Cocoa.h>
+
+@class SpeechDelegate;
+
+// We can escape the default delimiters ("[[" and "]]") by temporarily
+// changing the delimiters just before they appear, and changing them back
+// just after.
+#define DLIM_ESCAPE_START "[[dlim (( ))]]"
+#define DLIM_ESCAPE_END "((dlim [[ ]]))"
+
+using namespace mozilla;
+
+// Bridges one NSSpeechSynthesizer to one nsISpeechTask: forwards task
+// commands (pause/resume/cancel/volume) to the synthesizer and synthesizer
+// delegate callbacks (word/error/finish) back to the task.
+class SpeechTaskCallback final : public nsISpeechTaskCallback {
+ public:
+  SpeechTaskCallback(nsISpeechTask* aTask, NSSpeechSynthesizer* aSynth,
+                     const nsTArray<size_t>& aOffsets);
+
+  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+  NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SpeechTaskCallback, nsISpeechTaskCallback)
+
+  NS_DECL_NSISPEECHTASKCALLBACK
+
+  void OnWillSpeakWord(uint32_t aIndex, uint32_t aLength);
+  void OnError(uint32_t aIndex);
+  void OnDidFinishSpeaking();
+
+ private:
+  virtual ~SpeechTaskCallback();
+
+  float GetTimeDurationFromStart();
+
+  nsCOMPtr<nsISpeechTask> mTask;
+  NSSpeechSynthesizer* mSpeechSynthesizer;
+  SpeechDelegate* mDelegate;
+  TimeStamp mStartingTime;
+  uint32_t mCurrentIndex;
+  nsTArray<size_t> mOffsets;
+};
+
+@interface SpeechDelegate : NSObject <NSSpeechSynthesizerDelegate> {
+ @private
+  SpeechTaskCallback* mCallback;
+}
+
+- (id)initWithCallback:(SpeechTaskCallback*)aCallback;
+@end
+
+@implementation SpeechDelegate
+- (id)initWithCallback:(SpeechTaskCallback*)aCallback {
+  // Assign the result of [super init]: it may return nil or a different
+  // object than the receiver.
+  self = [super init];
+  if (self) {
+    mCallback = aCallback;
+  }
+  return self;
+}
+
+- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender
+            willSpeakWord:(NSRange)aRange
+                 ofString:(NSString*)aString {
+  mCallback->OnWillSpeakWord(aRange.location, aRange.length);
+}
+
+- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender didFinishSpeaking:(BOOL)aFinishedSpeaking {
+  mCallback->OnDidFinishSpeaking();
+}
+
+- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender
+    didEncounterErrorAtIndex:(NSUInteger)aCharacterIndex
+                    ofString:(NSString*)aString
+                     message:(NSString*)aMessage {
+  mCallback->OnError(aCharacterIndex);
+}
+@end
+
+NS_IMPL_CYCLE_COLLECTION(SpeechTaskCallback, mTask);
+
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechTaskCallback)
+  NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback)
+  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback)
+NS_INTERFACE_MAP_END
+
+NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechTaskCallback)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechTaskCallback)
+
+// Installs a new delegate on aSynth and records the start time. The
+// destructor releases both the delegate and aSynth.
+SpeechTaskCallback::SpeechTaskCallback(nsISpeechTask* aTask, NSSpeechSynthesizer* aSynth,
+                                       const nsTArray<size_t>& aOffsets)
+    : mTask(aTask), mSpeechSynthesizer(aSynth), mCurrentIndex(0), mOffsets(aOffsets.Clone()) {
+  mDelegate = [[SpeechDelegate alloc] initWithCallback:this];
+  [mSpeechSynthesizer setDelegate:mDelegate];
+  mStartingTime = TimeStamp::Now();
+}
+
+SpeechTaskCallback::~SpeechTaskCallback() {
+  [mSpeechSynthesizer setDelegate:nil];
+  [mDelegate release];
+  [mSpeechSynthesizer release];
+}
+
+NS_IMETHODIMP
+SpeechTaskCallback::OnCancel() {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  [mSpeechSynthesizer stopSpeaking];
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+NS_IMETHODIMP
+SpeechTaskCallback::OnPause() {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  [mSpeechSynthesizer pauseSpeakingAtBoundary:NSSpeechImmediateBoundary];
+  if (!mTask) {
+    // When calling pause() on child process, it may not receive end event
+    // from chrome process yet.
+    return NS_ERROR_FAILURE;
+  }
+  mTask->DispatchPause(GetTimeDurationFromStart(), mCurrentIndex);
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+NS_IMETHODIMP
+SpeechTaskCallback::OnResume() {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  [mSpeechSynthesizer continueSpeaking];
+  if (!mTask) {
+    // When calling resume() on child process, it may not receive end event
+    // from chrome process yet.
+    return NS_ERROR_FAILURE;
+  }
+  mTask->DispatchResume(GetTimeDurationFromStart(), mCurrentIndex);
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+NS_IMETHODIMP
+SpeechTaskCallback::OnVolumeChanged(float aVolume) {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  [mSpeechSynthesizer setObject:[NSNumber numberWithFloat:aVolume]
+                    forProperty:NSSpeechVolumeProperty
+                          error:nil];
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+// Seconds elapsed since this callback was constructed.
+float SpeechTaskCallback::GetTimeDurationFromStart() {
+  TimeDuration duration = TimeStamp::Now() - mStartingTime;
+  return duration.ToSeconds();
+}
+
+// aIndex is a position in the escaped text; map it back to the user-supplied
+// text through mOffsets, keeping the previous index when it is out of range.
+void SpeechTaskCallback::OnWillSpeakWord(uint32_t aIndex, uint32_t aLength) {
+  mCurrentIndex = aIndex < mOffsets.Length() ? mOffsets[aIndex] : mCurrentIndex;
+  if (!mTask) {
+    return;
+  }
+  mTask->DispatchBoundary(u"word"_ns, GetTimeDurationFromStart(), mCurrentIndex, aLength, 1);
+}
+
+void SpeechTaskCallback::OnError(uint32_t aIndex) {
+  if (!mTask) {
+    return;
+  }
+  mTask->DispatchError(GetTimeDurationFromStart(), aIndex);
+}
+
+void SpeechTaskCallback::OnDidFinishSpeaking() {
+  // mTask is cleared below, so guard against a second notification, as
+  // OnWillSpeakWord and OnError already do.
+  if (mTask) {
+    mTask->DispatchEnd(GetTimeDurationFromStart(), mCurrentIndex);
+  }
+  // no longer needed
+  [mSpeechSynthesizer setDelegate:nil];
+  mTask = nullptr;
+}
+
+namespace mozilla {
+namespace dom {
+
+struct OSXVoice {
+  OSXVoice() : mIsDefault(false) {}
+
+  nsString mUri;
+  nsString mName;
+  nsString mLocale;
+  bool mIsDefault;
+};
+
+// Registers an enumerated list of voices with the voice registry.
+class RegisterVoicesRunnable final : public Runnable {
+ public:
+  RegisterVoicesRunnable(OSXSpeechSynthesizerService* aSpeechService, nsTArray<OSXVoice>& aList)
+      : Runnable("RegisterVoicesRunnable"), mSpeechService(aSpeechService), mVoices(aList) {}
+
+  NS_IMETHOD Run() override;
+
+ private:
+  ~RegisterVoicesRunnable() override = default;
+
+  // This runnable always use sync mode. It is unnecessary to reference object
+  OSXSpeechSynthesizerService* mSpeechService;
+  nsTArray<OSXVoice>& mVoices;
+};
+
+NS_IMETHODIMP
+RegisterVoicesRunnable::Run() {
+  nsresult rv;
+  nsCOMPtr<nsISynthVoiceRegistry> registry = do_GetService(NS_SYNTHVOICEREGISTRY_CONTRACTID, &rv);
+  if (!registry) {
+    return rv;
+  }
+
+  // Iterate by const reference: each OSXVoice holds three strings.
+  for (const OSXVoice& voice : mVoices) {
+    rv = registry->AddVoice(mSpeechService, voice.mUri, voice.mName, voice.mLocale, true, false);
+    if (NS_WARN_IF(NS_FAILED(rv))) {
+      continue;
+    }
+
+    if (voice.mIsDefault) {
+      registry->SetDefaultVoice(voice.mUri, true);
+    }
+  }
+
+  registry->NotifyVoicesChanged();
+
+  return NS_OK;
+}
+
+// Enumerates the system voices, then synchronously hands the list to a
+// RegisterVoicesRunnable on the main thread.
+class EnumVoicesRunnable final : public Runnable {
+ public:
+  explicit EnumVoicesRunnable(OSXSpeechSynthesizerService* aSpeechService)
+      : Runnable("EnumVoicesRunnable"), mSpeechService(aSpeechService) {}
+
+  NS_IMETHOD Run() override;
+
+ private:
+  ~EnumVoicesRunnable() override = default;
+
+  RefPtr<OSXSpeechSynthesizerService> mSpeechService;
+};
+
+NS_IMETHODIMP
+EnumVoicesRunnable::Run() {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  AutoTArray<OSXVoice, 64> list;
+
+  NSArray* voices = [NSSpeechSynthesizer availableVoices];
+  NSString* defaultVoice = [NSSpeechSynthesizer defaultVoice];
+
+  for (NSString* voice in voices) {
+    OSXVoice item;
+
+    NSDictionary* attr = [NSSpeechSynthesizer attributesForVoice:voice];
+
+    nsAutoString identifier;
+    nsCocoaUtils::GetStringForNSString([attr objectForKey:NSVoiceIdentifier], identifier);
+
+    nsCocoaUtils::GetStringForNSString([attr objectForKey:NSVoiceName], item.mName);
+
+    nsCocoaUtils::GetStringForNSString([attr objectForKey:NSVoiceLocaleIdentifier], item.mLocale);
+    item.mLocale.ReplaceChar('_', '-');
+
+    item.mUri.AssignLiteral("urn:moz-tts:osx:");
+    item.mUri.Append(identifier);
+
+    if ([voice isEqualToString:defaultVoice]) {
+      item.mIsDefault = true;
+    }
+
+    list.AppendElement(item);
+  }
+
+  RefPtr<RegisterVoicesRunnable> runnable = new RegisterVoicesRunnable(mSpeechService, list);
+  NS_DispatchAndSpinEventLoopUntilComplete("EnumVoicesRunnable"_ns,
+                                           GetMainThreadSerialEventTarget(), runnable.forget());
+
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+StaticRefPtr<OSXSpeechSynthesizerService> OSXSpeechSynthesizerService::sSingleton;
+
+NS_INTERFACE_MAP_BEGIN(OSXSpeechSynthesizerService)
+  NS_INTERFACE_MAP_ENTRY(nsISpeechService)
+  NS_INTERFACE_MAP_ENTRY(nsIObserver)
+  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechService)
+NS_INTERFACE_MAP_END
+
+NS_IMPL_ADDREF(OSXSpeechSynthesizerService)
+NS_IMPL_RELEASE(OSXSpeechSynthesizerService)
+
+OSXSpeechSynthesizerService::OSXSpeechSynthesizerService() : mInitialized(false) {}
+
+// Starts voice enumeration on a dedicated "SpeechWorker" thread. Returns
+// false when the backend is disabled by pref or the thread cannot be created.
+bool OSXSpeechSynthesizerService::Init() {
+  if (Preferences::GetBool("media.webspeech.synth.test") ||
+      !StaticPrefs::media_webspeech_synth_enabled()) {
+    // When test is enabled, we shouldn't add OS backend (Bug 1160844)
+    return false;
+  }
+
+  nsCOMPtr<nsIThread> thread;
+  if (NS_FAILED(NS_NewNamedThread("SpeechWorker", getter_AddRefs(thread)))) {
+    return false;
+  }
+
+  // Get all the voices and register in the SynthVoiceRegistry
+  nsCOMPtr<nsIRunnable> runnable = new EnumVoicesRunnable(this);
+  thread->Dispatch(runnable, NS_DISPATCH_NORMAL);
+
+  mInitialized = true;
+  return true;
+}
+
+// Configures a fresh NSSpeechSynthesizer from the voice URI, rate, volume and
+// pitch, escapes delimiter sequences in aText, and starts speaking.
+NS_IMETHODIMP
+OSXSpeechSynthesizerService::Speak(const nsAString& aText, const nsAString& aUri, float aVolume,
+                                   float aRate, float aPitch, nsISpeechTask* aTask) {
+  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;
+
+  MOZ_ASSERT(StringBeginsWith(aUri, u"urn:moz-tts:osx:"_ns),
+             "OSXSpeechSynthesizerService doesn't allow this voice URI");
+
+  NSSpeechSynthesizer* synth = [[NSSpeechSynthesizer alloc] init];
+  // strlen("urn:moz-tts:osx:") == 16
+  NSString* identifier = nsCocoaUtils::ToNSString(Substring(aUri, 16));
+  [synth setVoice:identifier];
+
+  // default rate is 180-220
+  [synth setObject:[NSNumber numberWithInt:aRate * 200] forProperty:NSSpeechRateProperty error:nil];
+  // volume allows 0.0-1.0
+  [synth setObject:[NSNumber numberWithFloat:aVolume] forProperty:NSSpeechVolumeProperty error:nil];
+  // Use default pitch value to calculate this
+  NSNumber* defaultPitch = [synth objectForProperty:NSSpeechPitchBaseProperty error:nil];
+  if (defaultPitch) {
+    int newPitch = [defaultPitch intValue] * (aPitch / 2 + 0.5);
+    [synth setObject:[NSNumber numberWithInt:newPitch]
+         forProperty:NSSpeechPitchBaseProperty
+               error:nil];
+  }
+
+  nsAutoString escapedText;
+  // We need to map the offsets from the given text to the escaped text.
+  // The index of the offsets array is the position in the escaped text,
+  // the element value is the position in the user-supplied text.
+  nsTArray<size_t> offsets;
+  offsets.SetCapacity(aText.Length());
+
+  // This loop looks for occurrences of "[[" or "]]", escapes them, and
+  // populates the offsets array to supply a map to the original offsets.
+  for (size_t i = 0; i < aText.Length(); i++) {
+    if (aText.Length() > i + 1 &&
+        ((aText[i] == ']' && aText[i + 1] == ']') || (aText[i] == '[' && aText[i + 1] == '['))) {
+      escapedText.AppendLiteral(DLIM_ESCAPE_START);
+      offsets.AppendElements(strlen(DLIM_ESCAPE_START));
+      escapedText.Append(aText[i]);
+      offsets.AppendElement(i);
+      escapedText.Append(aText[++i]);
+      offsets.AppendElement(i);
+      escapedText.AppendLiteral(DLIM_ESCAPE_END);
+      offsets.AppendElements(strlen(DLIM_ESCAPE_END));
+    } else {
+      escapedText.Append(aText[i]);
+      offsets.AppendElement(i);
+    }
+  }
+
+  // The callback's destructor releases synth, so the early returns below do
+  // not leak it.
+  RefPtr<SpeechTaskCallback> callback = new SpeechTaskCallback(aTask, synth, offsets);
+  nsresult rv = aTask->Setup(callback);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  NSString* text = nsCocoaUtils::ToNSString(escapedText);
+  BOOL success = [synth startSpeakingString:text];
+  NS_ENSURE_TRUE(success, NS_ERROR_FAILURE);
+
+  aTask->DispatchStart();
+  return NS_OK;
+
+  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
+}
+
+NS_IMETHODIMP
+OSXSpeechSynthesizerService::Observe(nsISupports* aSubject, const char* aTopic,
+                                     const char16_t* aData) {
+  return NS_OK;
+}
+
+// Main-thread only. Returns null outside the main gecko process or when
+// Init() fails; otherwise creates the singleton on first use.
+OSXSpeechSynthesizerService* OSXSpeechSynthesizerService::GetInstance() {
+  MOZ_ASSERT(NS_IsMainThread());
+  if (XRE_GetProcessType() != GeckoProcessType_Default) {
+    return nullptr;
+  }
+
+  if (!sSingleton) {
+    RefPtr<OSXSpeechSynthesizerService> speechService = new OSXSpeechSynthesizerService();
+    if (speechService->Init()) {
+      sSingleton = speechService;
+      ClearOnShutdown(&sSingleton);
+    }
+  }
+  return sSingleton;
+}
+
+already_AddRefed<OSXSpeechSynthesizerService> OSXSpeechSynthesizerService::GetInstanceForService() {
+  RefPtr<OSXSpeechSynthesizerService> speechService = GetInstance();
+  return speechService.forget();
+}
+
+}  // namespace dom
+}  // namespace mozilla
diff --git a/dom/media/webspeech/synth/cocoa/components.conf b/dom/media/webspeech/synth/cocoa/components.conf
new file mode 100644
index 0000000000..c9b0fa5ef0
--- /dev/null
+++ b/dom/media/webspeech/synth/cocoa/components.conf
@@ -0,0 +1,17 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+Classes = [
+    {
+        'cid': '{914e73b4-6337-4bef-97f3-4d069e053a12}',
+        'contract_ids': ['@mozilla.org/synthsystem;1'],
+        'singleton': True,
+        'type': 'mozilla::dom::OSXSpeechSynthesizerService',
+        'headers': ['/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.h'],
+        'constructor': 'mozilla::dom::OSXSpeechSynthesizerService::GetInstanceForService',
+        'categories': {"speech-synth-started": 'OSX Speech Synth'},
+    },
+]
diff --git a/dom/media/webspeech/synth/cocoa/moz.build b/dom/media/webspeech/synth/cocoa/moz.build
new file mode 100644
index 0000000000..4d59f7a389
--- /dev/null
+++ b/dom/media/webspeech/synth/cocoa/moz.build
@@ -0,0 +1,15 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +SOURCES += [ + "OSXSpeechSynthesizerService.mm", +] + +XPCOM_MANIFESTS += [ + "components.conf", +] + +FINAL_LIBRARY = "xul" diff --git a/dom/media/webspeech/synth/crashtests/1230428.html b/dom/media/webspeech/synth/crashtests/1230428.html new file mode 100644 index 0000000000..40fa000710 --- /dev/null +++ b/dom/media/webspeech/synth/crashtests/1230428.html @@ -0,0 +1,32 @@ + + + + + + + + + diff --git a/dom/media/webspeech/synth/crashtests/crashtests.list b/dom/media/webspeech/synth/crashtests/crashtests.list new file mode 100644 index 0000000000..07e931c929 --- /dev/null +++ b/dom/media/webspeech/synth/crashtests/crashtests.list @@ -0,0 +1 @@ +skip-if(!cocoaWidget) pref(media.webspeech.synth.enabled,true) load 1230428.html # bug 1230428 diff --git a/dom/media/webspeech/synth/ipc/PSpeechSynthesis.ipdl b/dom/media/webspeech/synth/ipc/PSpeechSynthesis.ipdl new file mode 100644 index 0000000000..38e360bf4c --- /dev/null +++ b/dom/media/webspeech/synth/ipc/PSpeechSynthesis.ipdl @@ -0,0 +1,50 @@ +/* -*- Mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; tab-width: 40 -*- */ +/* vim: set ts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
include protocol PContent;
include protocol PSpeechSynthesisRequest;

namespace mozilla {
namespace dom {

// Description of one voice as it is sent to the child side.
struct RemoteVoice {
  nsString voiceURI;
  nsString name;
  nsString lang;
  bool localService;
  bool queued;
};

// Speech-synthesis protocol managed by PContent. It carries voice-registry
// updates to the child and manages one PSpeechSynthesisRequest actor per
// request constructed by the child.
// NOTE(review): declared `sync`, yet every message listed here is async.
[ManualDealloc]
sync protocol PSpeechSynthesis
{
  manager PContent;
  manages PSpeechSynthesisRequest;

// Messages handled by the child (SpeechSynthesisChild::Recv*).
child:

  async VoiceAdded(RemoteVoice aVoice);

  async VoiceRemoved(nsString aUri);

  async SetDefaultVoice(nsString aUri, bool aIsDefault);

  async IsSpeakingChanged(bool aIsSpeaking);

  async NotifyVoicesChanged();

  async InitialVoicesAndState(RemoteVoice[] aVoices, nsString[] aDefaults,
                              bool aIsSpeaking);

// Messages handled by the parent.
parent:
  async __delete__();

  // Constructor for a request actor.
  // NOTE(review): the second and third strings are named aUri, aLang here,
  // but the C++ handlers in SpeechSynthesisParent name them aLang, aUri and
  // forward them in that order -- confirm which naming matches the sender.
  async PSpeechSynthesisRequest(nsString aText, nsString aUri, nsString aLang,
                                float aVolume, float aRate, float aPitch, bool aShouldResistFingerprinting);
};

} // namespace dom
} // namespace mozilla
include protocol PSpeechSynthesis;

namespace mozilla {
namespace dom {

// Per-request protocol managed by PSpeechSynthesis: the child sends playback
// controls to the parent, and the parent reports progress events back.
[ManualDealloc, ChildImpl=virtual, ParentImpl=virtual]
async protocol PSpeechSynthesisRequest
{
  manager PSpeechSynthesis;

 // Control messages handled by the parent (SpeechSynthesisRequestParent).
 parent:

  async __delete__();

  async Pause();

  async Resume();

  async Cancel();

  async ForceEnd();

  async SetAudioOutputVolume(float aVolume);

 // Progress events handled by the child (SpeechSynthesisRequestChild).
 child:

  // aIsError distinguishes an error end from a normal end; the child replies
  // with __delete__ after handling it.
  async OnEnd(bool aIsError, float aElapsedTime, uint32_t aCharIndex);

  async OnStart(nsString aUri);

  async OnPause(float aElapsedTime, uint32_t aCharIndex);

  async OnResume(float aElapsedTime, uint32_t aCharIndex);

  // argc is forwarded unchanged to the task's DispatchBoundaryImpl;
  // presumably the count of optional arguments supplied -- TODO confirm.
  async OnBoundary(nsString aName, float aElapsedTime, uint32_t aCharIndex,
                   uint32_t aCharLength, uint8_t argc);

  async OnMark(nsString aName, float aElapsedTime, uint32_t aCharIndex);
};

} // namespace dom
} // namespace mozilla
*/ + +#include "SpeechSynthesisChild.h" +#include "nsSynthVoiceRegistry.h" + +namespace mozilla::dom { + +SpeechSynthesisChild::SpeechSynthesisChild() { + MOZ_COUNT_CTOR(SpeechSynthesisChild); +} + +SpeechSynthesisChild::~SpeechSynthesisChild() { + MOZ_COUNT_DTOR(SpeechSynthesisChild); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvInitialVoicesAndState( + nsTArray&& aVoices, nsTArray&& aDefaults, + const bool& aIsSpeaking) { + nsSynthVoiceRegistry::RecvInitialVoicesAndState(aVoices, aDefaults, + aIsSpeaking); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvVoiceAdded( + const RemoteVoice& aVoice) { + nsSynthVoiceRegistry::RecvAddVoice(aVoice); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvVoiceRemoved( + const nsAString& aUri) { + nsSynthVoiceRegistry::RecvRemoveVoice(aUri); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvSetDefaultVoice( + const nsAString& aUri, const bool& aIsDefault) { + nsSynthVoiceRegistry::RecvSetDefaultVoice(aUri, aIsDefault); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvIsSpeakingChanged( + const bool& aIsSpeaking) { + nsSynthVoiceRegistry::RecvIsSpeakingChanged(aIsSpeaking); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisChild::RecvNotifyVoicesChanged() { + nsSynthVoiceRegistry::RecvNotifyVoicesChanged(); + return IPC_OK(); +} + +PSpeechSynthesisRequestChild* +SpeechSynthesisChild::AllocPSpeechSynthesisRequestChild( + const nsAString& aText, const nsAString& aLang, const nsAString& aUri, + const float& aVolume, const float& aRate, const float& aPitch, + const bool& aShouldResistFingerprinting) { + MOZ_CRASH("Caller is supposed to manually construct a request!"); +} + +bool SpeechSynthesisChild::DeallocPSpeechSynthesisRequestChild( + PSpeechSynthesisRequestChild* aActor) { + delete aActor; + return true; +} + +// SpeechSynthesisRequestChild + 
+SpeechSynthesisRequestChild::SpeechSynthesisRequestChild(SpeechTaskChild* aTask) + : mTask(aTask) { + mTask->mActor = this; + MOZ_COUNT_CTOR(SpeechSynthesisRequestChild); +} + +SpeechSynthesisRequestChild::~SpeechSynthesisRequestChild() { + MOZ_COUNT_DTOR(SpeechSynthesisRequestChild); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnStart( + const nsAString& aUri) { + mTask->DispatchStartImpl(aUri); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnEnd( + const bool& aIsError, const float& aElapsedTime, + const uint32_t& aCharIndex) { + SpeechSynthesisRequestChild* actor = mTask->mActor; + mTask->mActor = nullptr; + + if (aIsError) { + mTask->DispatchErrorImpl(aElapsedTime, aCharIndex); + } else { + mTask->DispatchEndImpl(aElapsedTime, aCharIndex); + } + + SpeechSynthesisRequestChild::Send__delete__(actor); + + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnPause( + const float& aElapsedTime, const uint32_t& aCharIndex) { + mTask->DispatchPauseImpl(aElapsedTime, aCharIndex); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnResume( + const float& aElapsedTime, const uint32_t& aCharIndex) { + mTask->DispatchResumeImpl(aElapsedTime, aCharIndex); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnBoundary( + const nsAString& aName, const float& aElapsedTime, + const uint32_t& aCharIndex, const uint32_t& aCharLength, + const uint8_t& argc) { + mTask->DispatchBoundaryImpl(aName, aElapsedTime, aCharIndex, aCharLength, + argc); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestChild::RecvOnMark( + const nsAString& aName, const float& aElapsedTime, + const uint32_t& aCharIndex) { + mTask->DispatchMarkImpl(aName, aElapsedTime, aCharIndex); + return IPC_OK(); +} + +// SpeechTaskChild + +SpeechTaskChild::SpeechTaskChild(SpeechSynthesisUtterance* aUtterance, + bool aShouldResistFingerprinting) + : 
nsSpeechTask(aUtterance, aShouldResistFingerprinting), mActor(nullptr) {} + +NS_IMETHODIMP +SpeechTaskChild::Setup(nsISpeechTaskCallback* aCallback) { + MOZ_CRASH("Should never be called from child"); +} + +void SpeechTaskChild::Pause() { + MOZ_ASSERT(mActor); + mActor->SendPause(); +} + +void SpeechTaskChild::Resume() { + MOZ_ASSERT(mActor); + mActor->SendResume(); +} + +void SpeechTaskChild::Cancel() { + MOZ_ASSERT(mActor); + mActor->SendCancel(); +} + +void SpeechTaskChild::ForceEnd() { + MOZ_ASSERT(mActor); + mActor->SendForceEnd(); +} + +void SpeechTaskChild::SetAudioOutputVolume(float aVolume) { + if (mActor) { + mActor->SendSetAudioOutputVolume(aVolume); + } +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/ipc/SpeechSynthesisChild.h b/dom/media/webspeech/synth/ipc/SpeechSynthesisChild.h new file mode 100644 index 0000000000..f57582932a --- /dev/null +++ b/dom/media/webspeech/synth/ipc/SpeechSynthesisChild.h @@ -0,0 +1,107 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechSynthesisChild_h +#define mozilla_dom_SpeechSynthesisChild_h + +#include "mozilla/Attributes.h" +#include "mozilla/dom/PSpeechSynthesisChild.h" +#include "mozilla/dom/PSpeechSynthesisRequestChild.h" +#include "nsSpeechTask.h" + +namespace mozilla::dom { + +class nsSynthVoiceRegistry; +class SpeechSynthesisRequestChild; +class SpeechTaskChild; + +class SpeechSynthesisChild : public PSpeechSynthesisChild { + friend class nsSynthVoiceRegistry; + friend class PSpeechSynthesisChild; + + public: + mozilla::ipc::IPCResult RecvInitialVoicesAndState( + nsTArray&& aVoices, nsTArray&& aDefaults, + const bool& aIsSpeaking); + + mozilla::ipc::IPCResult RecvVoiceAdded(const RemoteVoice& aVoice); + + mozilla::ipc::IPCResult RecvVoiceRemoved(const nsAString& aUri); + + mozilla::ipc::IPCResult RecvSetDefaultVoice(const nsAString& aUri, + const bool& aIsDefault); + + mozilla::ipc::IPCResult RecvIsSpeakingChanged(const bool& aIsSpeaking); + + mozilla::ipc::IPCResult RecvNotifyVoicesChanged(); + + protected: + SpeechSynthesisChild(); + virtual ~SpeechSynthesisChild(); + + PSpeechSynthesisRequestChild* AllocPSpeechSynthesisRequestChild( + const nsAString& aLang, const nsAString& aUri, const nsAString& aText, + const float& aVolume, const float& aPitch, const float& aRate, + const bool& aShouldResistFingerprinting); + bool DeallocPSpeechSynthesisRequestChild( + PSpeechSynthesisRequestChild* aActor); +}; + +class SpeechSynthesisRequestChild : public PSpeechSynthesisRequestChild { + public: + explicit SpeechSynthesisRequestChild(SpeechTaskChild* aTask); + virtual ~SpeechSynthesisRequestChild(); + + protected: + mozilla::ipc::IPCResult RecvOnStart(const nsAString& aUri) override; + + mozilla::ipc::IPCResult RecvOnEnd(const bool& aIsError, + const float& aElapsedTime, + const uint32_t& aCharIndex) override; + + mozilla::ipc::IPCResult RecvOnPause(const float& aElapsedTime, + const uint32_t& aCharIndex) override; + + mozilla::ipc::IPCResult RecvOnResume(const 
float& aElapsedTime, + const uint32_t& aCharIndex) override; + + mozilla::ipc::IPCResult RecvOnBoundary(const nsAString& aName, + const float& aElapsedTime, + const uint32_t& aCharIndex, + const uint32_t& aCharLength, + const uint8_t& argc) override; + + mozilla::ipc::IPCResult RecvOnMark(const nsAString& aName, + const float& aElapsedTime, + const uint32_t& aCharIndex) override; + + RefPtr mTask; +}; + +class SpeechTaskChild : public nsSpeechTask { + friend class SpeechSynthesisRequestChild; + + public: + explicit SpeechTaskChild(SpeechSynthesisUtterance* aUtterance, + bool aShouldResistFingerprinting); + + NS_IMETHOD Setup(nsISpeechTaskCallback* aCallback) override; + + void Pause() override; + + void Resume() override; + + void Cancel() override; + + void ForceEnd() override; + + void SetAudioOutputVolume(float aVolume) override; + + private: + SpeechSynthesisRequestChild* mActor; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.cpp b/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.cpp new file mode 100644 index 0000000000..a9eb53c5b7 --- /dev/null +++ b/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.cpp @@ -0,0 +1,221 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechSynthesisParent.h" +#include "nsSynthVoiceRegistry.h" + +namespace mozilla::dom { + +SpeechSynthesisParent::SpeechSynthesisParent() { + MOZ_COUNT_CTOR(SpeechSynthesisParent); +} + +SpeechSynthesisParent::~SpeechSynthesisParent() { + MOZ_COUNT_DTOR(SpeechSynthesisParent); +} + +void SpeechSynthesisParent::ActorDestroy(ActorDestroyReason aWhy) { + // Implement me! 
Bug 1005141 +} + +bool SpeechSynthesisParent::SendInit() { + return nsSynthVoiceRegistry::GetInstance()->SendInitialVoicesAndState(this); +} + +PSpeechSynthesisRequestParent* +SpeechSynthesisParent::AllocPSpeechSynthesisRequestParent( + const nsAString& aText, const nsAString& aLang, const nsAString& aUri, + const float& aVolume, const float& aRate, const float& aPitch, + const bool& aShouldResistFingerprinting) { + RefPtr task = + new SpeechTaskParent(aVolume, aText, aShouldResistFingerprinting); + SpeechSynthesisRequestParent* actor = new SpeechSynthesisRequestParent(task); + return actor; +} + +bool SpeechSynthesisParent::DeallocPSpeechSynthesisRequestParent( + PSpeechSynthesisRequestParent* aActor) { + delete aActor; + return true; +} + +mozilla::ipc::IPCResult +SpeechSynthesisParent::RecvPSpeechSynthesisRequestConstructor( + PSpeechSynthesisRequestParent* aActor, const nsAString& aText, + const nsAString& aLang, const nsAString& aUri, const float& aVolume, + const float& aRate, const float& aPitch, + const bool& aShouldResistFingerprinting) { + MOZ_ASSERT(aActor); + SpeechSynthesisRequestParent* actor = + static_cast(aActor); + nsSynthVoiceRegistry::GetInstance()->Speak(aText, aLang, aUri, aVolume, aRate, + aPitch, actor->mTask); + return IPC_OK(); +} + +// SpeechSynthesisRequestParent + +SpeechSynthesisRequestParent::SpeechSynthesisRequestParent( + SpeechTaskParent* aTask) + : mTask(aTask) { + mTask->mActor = this; + MOZ_COUNT_CTOR(SpeechSynthesisRequestParent); +} + +SpeechSynthesisRequestParent::~SpeechSynthesisRequestParent() { + if (mTask) { + mTask->mActor = nullptr; + // If we still have a task, cancel it. + mTask->Cancel(); + } + MOZ_COUNT_DTOR(SpeechSynthesisRequestParent); +} + +void SpeechSynthesisRequestParent::ActorDestroy(ActorDestroyReason aWhy) { + // Implement me! 
Bug 1005141 +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::RecvPause() { + MOZ_ASSERT(mTask); + mTask->Pause(); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::Recv__delete__() { + MOZ_ASSERT(mTask); + mTask->mActor = nullptr; + mTask = nullptr; + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::RecvResume() { + MOZ_ASSERT(mTask); + mTask->Resume(); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::RecvCancel() { + MOZ_ASSERT(mTask); + mTask->Cancel(); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::RecvForceEnd() { + MOZ_ASSERT(mTask); + mTask->ForceEnd(); + return IPC_OK(); +} + +mozilla::ipc::IPCResult SpeechSynthesisRequestParent::RecvSetAudioOutputVolume( + const float& aVolume) { + MOZ_ASSERT(mTask); + mTask->SetAudioOutputVolume(aVolume); + return IPC_OK(); +} + +// SpeechTaskParent + +nsresult SpeechTaskParent::DispatchStartImpl(const nsAString& aUri) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnStart(aUri)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchEndImpl(float aElapsedTime, + uint32_t aCharIndex) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnEnd(false, aElapsedTime, aCharIndex)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchPauseImpl(float aElapsedTime, + uint32_t aCharIndex) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnPause(aElapsedTime, aCharIndex)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchResumeImpl(float aElapsedTime, + uint32_t aCharIndex) { + if (!mActor) { + // Child is already gone. 
+ return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnResume(aElapsedTime, aCharIndex)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchErrorImpl(float aElapsedTime, + uint32_t aCharIndex) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnEnd(true, aElapsedTime, aCharIndex)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchBoundaryImpl(const nsAString& aName, + float aElapsedTime, + uint32_t aCharIndex, + uint32_t aCharLength, + uint8_t argc) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnBoundary(aName, aElapsedTime, aCharIndex, + aCharLength, argc)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +nsresult SpeechTaskParent::DispatchMarkImpl(const nsAString& aName, + float aElapsedTime, + uint32_t aCharIndex) { + if (!mActor) { + // Child is already gone. + return NS_OK; + } + + if (NS_WARN_IF(!(mActor->SendOnMark(aName, aElapsedTime, aCharIndex)))) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.h b/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.h new file mode 100644 index 0000000000..6ae4d38bbc --- /dev/null +++ b/dom/media/webspeech/synth/ipc/SpeechSynthesisParent.h @@ -0,0 +1,102 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_SpeechSynthesisParent_h +#define mozilla_dom_SpeechSynthesisParent_h + +#include "mozilla/dom/PSpeechSynthesisParent.h" +#include "mozilla/dom/PSpeechSynthesisRequestParent.h" +#include "nsSpeechTask.h" + +namespace mozilla::dom { + +class ContentParent; +class SpeechTaskParent; +class SpeechSynthesisRequestParent; + +class SpeechSynthesisParent : public PSpeechSynthesisParent { + friend class ContentParent; + friend class SpeechSynthesisRequestParent; + friend class PSpeechSynthesisParent; + + public: + void ActorDestroy(ActorDestroyReason aWhy) override; + + bool SendInit(); + + protected: + SpeechSynthesisParent(); + virtual ~SpeechSynthesisParent(); + PSpeechSynthesisRequestParent* AllocPSpeechSynthesisRequestParent( + const nsAString& aText, const nsAString& aLang, const nsAString& aUri, + const float& aVolume, const float& aRate, const float& aPitch, + const bool& aShouldResistFingerprinting); + + bool DeallocPSpeechSynthesisRequestParent( + PSpeechSynthesisRequestParent* aActor); + + mozilla::ipc::IPCResult RecvPSpeechSynthesisRequestConstructor( + PSpeechSynthesisRequestParent* aActor, const nsAString& aText, + const nsAString& aLang, const nsAString& aUri, const float& aVolume, + const float& aRate, const float& aPitch, + const bool& aShouldResistFingerprinting) override; +}; + +class SpeechSynthesisRequestParent : public PSpeechSynthesisRequestParent { + public: + explicit SpeechSynthesisRequestParent(SpeechTaskParent* aTask); + virtual ~SpeechSynthesisRequestParent(); + + RefPtr mTask; + + protected: + void ActorDestroy(ActorDestroyReason aWhy) override; + + mozilla::ipc::IPCResult RecvPause() override; + + mozilla::ipc::IPCResult RecvResume() override; + + mozilla::ipc::IPCResult RecvCancel() override; + + mozilla::ipc::IPCResult RecvForceEnd() override; + + mozilla::ipc::IPCResult RecvSetAudioOutputVolume( + const float& aVolume) override; + + mozilla::ipc::IPCResult Recv__delete__() override; +}; + +class SpeechTaskParent : 
public nsSpeechTask { + friend class SpeechSynthesisRequestParent; + + public: + SpeechTaskParent(float aVolume, const nsAString& aUtterance, + bool aShouldResistFingerprinting) + : nsSpeechTask(aVolume, aUtterance, aShouldResistFingerprinting), + mActor(nullptr) {} + + nsresult DispatchStartImpl(const nsAString& aUri) override; + + nsresult DispatchEndImpl(float aElapsedTime, uint32_t aCharIndex) override; + + nsresult DispatchPauseImpl(float aElapsedTime, uint32_t aCharIndex) override; + + nsresult DispatchResumeImpl(float aElapsedTime, uint32_t aCharIndex) override; + + nsresult DispatchErrorImpl(float aElapsedTime, uint32_t aCharIndex) override; + + nsresult DispatchBoundaryImpl(const nsAString& aName, float aElapsedTime, + uint32_t aCharIndex, uint32_t aCharLength, + uint8_t argc) override; + + nsresult DispatchMarkImpl(const nsAString& aName, float aElapsedTime, + uint32_t aCharIndex) override; + + private: + SpeechSynthesisRequestParent* mActor; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/synth/moz.build b/dom/media/webspeech/synth/moz.build new file mode 100644 index 0000000000..dde668668a --- /dev/null +++ b/dom/media/webspeech/synth/moz.build @@ -0,0 +1,65 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +if CONFIG["MOZ_WEBSPEECH"]: + MOCHITEST_MANIFESTS += [ + "test/mochitest.ini", + "test/startup/mochitest.ini", + ] + + XPIDL_MODULE = "dom_webspeechsynth" + + XPIDL_SOURCES += ["nsISpeechService.idl", "nsISynthVoiceRegistry.idl"] + + EXPORTS.mozilla.dom += [ + "ipc/SpeechSynthesisChild.h", + "ipc/SpeechSynthesisParent.h", + "nsSpeechTask.h", + "nsSynthVoiceRegistry.h", + "SpeechSynthesis.h", + "SpeechSynthesisUtterance.h", + "SpeechSynthesisVoice.h", + ] + + UNIFIED_SOURCES += [ + "ipc/SpeechSynthesisChild.cpp", + "ipc/SpeechSynthesisParent.cpp", + "nsSpeechTask.cpp", + "nsSynthVoiceRegistry.cpp", + "SpeechSynthesis.cpp", + "SpeechSynthesisUtterance.cpp", + "SpeechSynthesisVoice.cpp", + ] + + if CONFIG["MOZ_WEBSPEECH_TEST_BACKEND"]: + UNIFIED_SOURCES += ["test/nsFakeSynthServices.cpp"] + + XPCOM_MANIFESTS += [ + "test/components.conf", + ] + + if CONFIG["MOZ_WIDGET_TOOLKIT"] == "windows": + DIRS += ["windows"] + + if CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa": + DIRS += ["cocoa"] + + if CONFIG["MOZ_WIDGET_TOOLKIT"] == "android": + DIRS += ["android"] + + if CONFIG["MOZ_SYNTH_SPEECHD"]: + DIRS += ["speechd"] + + IPDL_SOURCES += [ + "ipc/PSpeechSynthesis.ipdl", + "ipc/PSpeechSynthesisRequest.ipdl", + ] + +include("/ipc/chromium/chromium-config.mozbuild") + +FINAL_LIBRARY = "xul" +LOCAL_INCLUDES += [ + "ipc", +] diff --git a/dom/media/webspeech/synth/nsISpeechService.idl b/dom/media/webspeech/synth/nsISpeechService.idl new file mode 100644 index 0000000000..b69973b6d2 --- /dev/null +++ b/dom/media/webspeech/synth/nsISpeechService.idl @@ -0,0 +1,143 @@ +/* -*- Mode: IDL; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + +/** + * A callback is implemented by the service. 
+ */ +[scriptable, uuid(c576de0c-8a3d-4570-be7e-9876d3e5bed2)] +interface nsISpeechTaskCallback : nsISupports +{ + /** + * The user or application has paused the speech. + */ + void onPause(); + + /** + * The user or application has resumed the speech. + */ + void onResume(); + + /** + * The user or application has canceled the speech. + */ + void onCancel(); + + /** + * The user or application has changed the volume of this speech. + */ + void onVolumeChanged(in float aVolume); +}; + + +/** + * A task is associated with a single utterance. It is provided by the browser + * to the service in the speak() method. + */ +[scriptable, builtinclass, uuid(ad59949c-2437-4b35-8eeb-d760caab75c5)] +interface nsISpeechTask : nsISupports +{ + /** + * Prepare browser for speech. + * + * @param aCallback callback object for mid-speech operations. + */ + void setup(in nsISpeechTaskCallback aCallback); + + /** + * Dispatch start event. + */ + void dispatchStart(); + + /** + * Dispatch end event. + * + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + */ + void dispatchEnd(in float aElapsedTime, in unsigned long aCharIndex); + + /** + * Dispatch pause event. + * + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + */ + void dispatchPause(in float aElapsedTime, in unsigned long aCharIndex); + + /** + * Dispatch resume event. + * + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + */ + void dispatchResume(in float aElapsedTime, in unsigned long aCharIndex); + + /** + * Dispatch error event. + * + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + */ + void dispatchError(in float aElapsedTime, in unsigned long aCharIndex); + + /** + * Dispatch boundary event. 
+ * + * @param aName name of boundary, 'word' or 'sentence' + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + * @param aCharLength length of text in boundary event to be spoken. + */ + [optional_argc] void dispatchBoundary(in AString aName, in float aElapsedTime, + in unsigned long aCharIndex, + [optional] in unsigned long aCharLength); + + /** + * Dispatch mark event. + * + * @param aName mark identifier. + * @param aElapsedTime time in seconds since speech has started. + * @param aCharIndex offset of spoken characters. + */ + void dispatchMark(in AString aName, in float aElapsedTime, in unsigned long aCharIndex); +}; + +/** + * The main interface of a speech synthesis service. + * + * A service is responsible for outputting audio. + * The service dispatches events, starting with dispatchStart() and ending with + * dispatchEnd() or dispatchError(). + * A service must also respond with the correct actions and events in response + * to implemented callback methods. + */ +[scriptable, uuid(9b7d59db-88ff-43d0-b6ee-9f63d042d08f)] +interface nsISpeechService : nsISupports +{ + /** + * Speak the given text using the voice identified by the given uri. See + * W3C Speech API spec for information about pitch and rate. + * https://dvcs.w3.org/hg/speech-api/raw-file/tip/speechapi.html#utterance-attributes + * + * @param aText text to utter. + * @param aUri unique voice identifier. + * @param aVolume volume to speak voice in. Only relevant for indirect audio. + * @param aRate rate to speak voice in. + * @param aPitch pitch to speak voice in. + * @param aTask task instance for utterance, used for sending events or audio + * data back to browser. + */ + void speak(in AString aText, in AString aUri, + in float aVolume, in float aRate, in float aPitch, + in nsISpeechTask aTask); +}; + +%{C++ +// This is the service category speech services could use to start up as +// a component.
+#define NS_SPEECH_SYNTH_STARTED "speech-synth-started" +%} diff --git a/dom/media/webspeech/synth/nsISynthVoiceRegistry.idl b/dom/media/webspeech/synth/nsISynthVoiceRegistry.idl new file mode 100644 index 0000000000..8dd3a0426c --- /dev/null +++ b/dom/media/webspeech/synth/nsISynthVoiceRegistry.idl @@ -0,0 +1,77 @@ +/* -*- Mode: IDL; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + +interface nsISpeechService; + +[scriptable, builtinclass, uuid(5d7a0b38-77e5-4ee5-897c-ce5db9b85d44)] +interface nsISynthVoiceRegistry : nsISupports +{ + /** + * Register a speech synthesis voice. + * + * @param aService the service that provides this voice. + * @param aUri a unique identifier for this voice. + * @param aName human-readable name for this voice. + * @param aLang a BCP 47 language tag. + * @param aLocalService true if service does not require network. + * @param aQueuesUtterances true if voice only speaks one utterance at a time + */ + void addVoice(in nsISpeechService aService, in AString aUri, + in AString aName, in AString aLang, + in boolean aLocalService, in boolean aQueuesUtterances); + + /** + * Remove a speech synthesis voice. + * + * @param aService the service that was used to add the voice. + * @param aUri a unique identifier of an existing voice. + */ + void removeVoice(in nsISpeechService aService, in AString aUri); + + /** + * Notify content of voice availability changes. This allows content + * to be notified of voice catalog changes in real time. + */ + void notifyVoicesChanged(); + + /** + * Set a voice as default. + * + * @param aUri a unique identifier of an existing voice. + * @param aIsDefault true if this voice should be toggled as default. 
+ */ + void setDefaultVoice(in AString aUri, in boolean aIsDefault); + + readonly attribute uint32_t voiceCount; + + AString getVoice(in uint32_t aIndex); + + bool isDefaultVoice(in AString aUri); + + bool isLocalVoice(in AString aUri); + + AString getVoiceLang(in AString aUri); + + AString getVoiceName(in AString aUri); +}; + +%{C++ +#define NS_SYNTHVOICEREGISTRY_CID \ + { /* {7090524d-5574-4492-a77f-d8d558ced59d} */ \ + 0x7090524d, \ + 0x5574, \ + 0x4492, \ + { 0xa7, 0x7f, 0xd8, 0xd5, 0x58, 0xce, 0xd5, 0x9d } \ + } + +#define NS_SYNTHVOICEREGISTRY_CONTRACTID \ + "@mozilla.org/synth-voice-registry;1" + +#define NS_SYNTHVOICEREGISTRY_CLASSNAME \ + "Speech Synthesis Voice Registry" + +%} diff --git a/dom/media/webspeech/synth/nsSpeechTask.cpp b/dom/media/webspeech/synth/nsSpeechTask.cpp new file mode 100644 index 0000000000..b102172466 --- /dev/null +++ b/dom/media/webspeech/synth/nsSpeechTask.cpp @@ -0,0 +1,389 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "AudioChannelService.h" +#include "AudioSegment.h" +#include "nsSpeechTask.h" +#include "nsSynthVoiceRegistry.h" +#include "nsXULAppAPI.h" +#include "SharedBuffer.h" +#include "SpeechSynthesis.h" + +#undef LOG +extern mozilla::LogModule* GetSpeechSynthLog(); +#define LOG(type, msg) MOZ_LOG(GetSpeechSynthLog(), type, msg) + +#define AUDIO_TRACK 1 + +namespace mozilla::dom { + +// nsSpeechTask + +NS_IMPL_CYCLE_COLLECTION_WEAK(nsSpeechTask, mSpeechSynthesis, mUtterance, + mCallback) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsSpeechTask) + NS_INTERFACE_MAP_ENTRY(nsISpeechTask) + NS_INTERFACE_MAP_ENTRY(nsIAudioChannelAgentCallback) + NS_INTERFACE_MAP_ENTRY(nsISupportsWeakReference) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTask) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsSpeechTask) +NS_IMPL_CYCLE_COLLECTING_RELEASE(nsSpeechTask) + +nsSpeechTask::nsSpeechTask(SpeechSynthesisUtterance* aUtterance, + bool aShouldResistFingerprinting) + : mUtterance(aUtterance), + mInited(false), + mPrePaused(false), + mPreCanceled(false), + mCallback(nullptr), + mShouldResistFingerprinting(aShouldResistFingerprinting), + mState(STATE_PENDING) { + mText = aUtterance->mText; + mVolume = aUtterance->Volume(); +} + +nsSpeechTask::nsSpeechTask(float aVolume, const nsAString& aText, + bool aShouldResistFingerprinting) + : mUtterance(nullptr), + mVolume(aVolume), + mText(aText), + mInited(false), + mPrePaused(false), + mPreCanceled(false), + mCallback(nullptr), + mShouldResistFingerprinting(aShouldResistFingerprinting), + mState(STATE_PENDING) {} + +nsSpeechTask::~nsSpeechTask() { LOG(LogLevel::Debug, ("~nsSpeechTask")); } + +void nsSpeechTask::Init() { mInited = true; } + +void nsSpeechTask::SetChosenVoiceURI(const nsAString& aUri) { + mChosenVoiceURI = aUri; +} + +NS_IMETHODIMP +nsSpeechTask::Setup(nsISpeechTaskCallback* aCallback) { + MOZ_ASSERT(XRE_IsParentProcess()); + + LOG(LogLevel::Debug, ("nsSpeechTask::Setup")); + + mCallback = aCallback; 
+ + return NS_OK; +} + +NS_IMETHODIMP +nsSpeechTask::DispatchStart() { + nsSynthVoiceRegistry::GetInstance()->SetIsSpeaking(true); + return DispatchStartImpl(); +} + +nsresult nsSpeechTask::DispatchStartImpl() { + return DispatchStartImpl(mChosenVoiceURI); +} + +nsresult nsSpeechTask::DispatchStartImpl(const nsAString& aUri) { + LOG(LogLevel::Debug, ("nsSpeechTask::DispatchStartImpl")); + + MOZ_ASSERT(mUtterance); + if (NS_WARN_IF(mState != STATE_PENDING)) { + return NS_ERROR_NOT_AVAILABLE; + } + + CreateAudioChannelAgent(); + + mState = STATE_SPEAKING; + mUtterance->mChosenVoiceURI = aUri; + mUtterance->DispatchSpeechSynthesisEvent(u"start"_ns, 0, nullptr, 0, u""_ns); + + return NS_OK; +} + +NS_IMETHODIMP +nsSpeechTask::DispatchEnd(float aElapsedTime, uint32_t aCharIndex) { + // After we end, no callback functions should go through. + mCallback = nullptr; + + if (!mPreCanceled) { + nsSynthVoiceRegistry::GetInstance()->SpeakNext(); + } + + return DispatchEndImpl(aElapsedTime, aCharIndex); +} + +nsresult nsSpeechTask::DispatchEndImpl(float aElapsedTime, + uint32_t aCharIndex) { + LOG(LogLevel::Debug, ("nsSpeechTask::DispatchEndImpl")); + + DestroyAudioChannelAgent(); + + MOZ_ASSERT(mUtterance); + if (NS_WARN_IF(mState == STATE_ENDED)) { + return NS_ERROR_NOT_AVAILABLE; + } + + RefPtr utterance = mUtterance; + + if (mSpeechSynthesis) { + mSpeechSynthesis->OnEnd(this); + } + + mState = STATE_ENDED; + utterance->DispatchSpeechSynthesisEvent(u"end"_ns, aCharIndex, nullptr, + aElapsedTime, u""_ns); + + return NS_OK; +} + +NS_IMETHODIMP +nsSpeechTask::DispatchPause(float aElapsedTime, uint32_t aCharIndex) { + return DispatchPauseImpl(aElapsedTime, aCharIndex); +} + +nsresult nsSpeechTask::DispatchPauseImpl(float aElapsedTime, + uint32_t aCharIndex) { + LOG(LogLevel::Debug, ("nsSpeechTask::DispatchPauseImpl")); + MOZ_ASSERT(mUtterance); + if (NS_WARN_IF(mUtterance->mPaused)) { + return NS_ERROR_NOT_AVAILABLE; + } + if (NS_WARN_IF(mState == STATE_ENDED)) { + return 
NS_ERROR_NOT_AVAILABLE;
+  }
+
+  mUtterance->mPaused = true;
+  if (mState == STATE_SPEAKING) {
+    mUtterance->DispatchSpeechSynthesisEvent(u"pause"_ns, aCharIndex, nullptr,
+                                             aElapsedTime, u""_ns);
+  }
+
+  return NS_OK;
+}
+
+// nsISpeechTask entry point; forwards to the overridable implementation.
+NS_IMETHODIMP
+nsSpeechTask::DispatchResume(float aElapsedTime, uint32_t aCharIndex) {
+  return DispatchResumeImpl(aElapsedTime, aCharIndex);
+}
+
+// Clears the utterance's paused flag and, if the task is currently speaking,
+// fires "resume" on it. Returns NS_ERROR_NOT_AVAILABLE when the utterance is
+// not paused or the task has already ended.
+nsresult nsSpeechTask::DispatchResumeImpl(float aElapsedTime,
+                                          uint32_t aCharIndex) {
+  LOG(LogLevel::Debug, ("nsSpeechTask::DispatchResumeImpl"));
+  MOZ_ASSERT(mUtterance);
+  if (NS_WARN_IF(!(mUtterance->mPaused))) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+  if (NS_WARN_IF(mState == STATE_ENDED)) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  mUtterance->mPaused = false;
+  if (mState == STATE_SPEAKING) {
+    mUtterance->DispatchSpeechSynthesisEvent(u"resume"_ns, aCharIndex, nullptr,
+                                             aElapsedTime, u""_ns);
+  }
+
+  return NS_OK;
+}
+
+// Non-XPCOM convenience wrapper around DispatchError(); the result is
+// intentionally not propagated because this function returns void.
+void nsSpeechTask::ForceError(float aElapsedTime, uint32_t aCharIndex) {
+  DispatchError(aElapsedTime, aCharIndex);
+}
+
+// nsISpeechTask entry point for errors. Unless the task was canceled before
+// it was initialized, the registry is asked to speak the next queued item
+// first; the error itself is then dispatched.
+NS_IMETHODIMP
+nsSpeechTask::DispatchError(float aElapsedTime, uint32_t aCharIndex) {
+  if (!mPreCanceled) {
+    nsSynthVoiceRegistry::GetInstance()->SpeakNext();
+  }
+
+  return DispatchErrorImpl(aElapsedTime, aCharIndex);
+}
+
+// Tears down the audio channel agent, tells the owning SpeechSynthesis that
+// this task is finished, marks the task ended and fires "error" on the
+// utterance. Returns NS_ERROR_NOT_AVAILABLE if the task had already ended.
+nsresult nsSpeechTask::DispatchErrorImpl(float aElapsedTime,
+                                         uint32_t aCharIndex) {
+  LOG(LogLevel::Debug, ("nsSpeechTask::DispatchErrorImpl"));
+
+  DestroyAudioChannelAgent();
+
+  MOZ_ASSERT(mUtterance);
+  if (NS_WARN_IF(mState == STATE_ENDED)) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  // Pin the utterance before notifying SpeechSynthesis, exactly as
+  // DispatchEndImpl() does, so the "error" event is dispatched on an object
+  // this frame keeps alive rather than on whatever mUtterance refers to once
+  // OnEnd() has run.
+  RefPtr<SpeechSynthesisUtterance> utterance = mUtterance;
+
+  if (mSpeechSynthesis) {
+    mSpeechSynthesis->OnEnd(this);
+  }
+
+  mState = STATE_ENDED;
+  utterance->DispatchSpeechSynthesisEvent(u"error"_ns, aCharIndex, nullptr,
+                                          aElapsedTime, u""_ns);
+  return NS_OK;
+}
+
+// nsISpeechTask entry point; forwards to the overridable implementation.
+NS_IMETHODIMP
+nsSpeechTask::DispatchBoundary(const nsAString& aName, float aElapsedTime,
+                               uint32_t aCharIndex, uint32_t aCharLength,
+                               uint8_t argc) {
+  return DispatchBoundaryImpl(aName, aElapsedTime, aCharIndex, aCharLength,
+                              argc);
+}
+
+// Fires a "boundary" event on the utterance. Only valid while the task is
+// speaking. The character length is forwarded only when the caller supplied
+// it (argc != 0); otherwise a null length is passed.
+nsresult nsSpeechTask::DispatchBoundaryImpl(const nsAString& aName,
+                                            float aElapsedTime,
+                                            uint32_t aCharIndex,
+                                            uint32_t aCharLength,
+                                            uint8_t argc) {
+  MOZ_ASSERT(mUtterance);
+  if (NS_WARN_IF(mState != STATE_SPEAKING)) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+  mUtterance->DispatchSpeechSynthesisEvent(
+      u"boundary"_ns, aCharIndex,
+      argc ? static_cast<Nullable<uint32_t>>(aCharLength) : nullptr,
+      aElapsedTime, aName);
+
+  return NS_OK;
+}
+
+// nsISpeechTask entry point; forwards to the overridable implementation.
+NS_IMETHODIMP
+nsSpeechTask::DispatchMark(const nsAString& aName, float aElapsedTime,
+                           uint32_t aCharIndex) {
+  return DispatchMarkImpl(aName, aElapsedTime, aCharIndex);
+}
+
+// Fires a "mark" event on the utterance. Only valid while the task is
+// speaking.
+nsresult nsSpeechTask::DispatchMarkImpl(const nsAString& aName,
+                                        float aElapsedTime,
+                                        uint32_t aCharIndex) {
+  MOZ_ASSERT(mUtterance);
+  if (NS_WARN_IF(mState != STATE_SPEAKING)) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+  mUtterance->DispatchSpeechSynthesisEvent(u"mark"_ns, aCharIndex, nullptr,
+                                           aElapsedTime, aName);
+  return NS_OK;
+}
+
+// Asks the service callback (if one was registered via Setup()) to pause. If
+// Init() has not run yet, the request is remembered in mPrePaused.
+void nsSpeechTask::Pause() {
+  MOZ_ASSERT(XRE_IsParentProcess());
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnPause();
+    NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Unable to call onPause() callback");
+  }
+
+  if (!mInited) {
+    mPrePaused = true;
+  }
+}
+
+// Asks the service callback to resume. If the task had been paused before it
+// was initialized, the flag is cleared and the registry queue is resumed.
+void nsSpeechTask::Resume() {
+  MOZ_ASSERT(XRE_IsParentProcess());
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnResume();
+    NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                         "Unable to call onResume() callback");
+  }
+
+  if (mPrePaused) {
+    mPrePaused = false;
+    nsSynthVoiceRegistry::GetInstance()->ResumeQueue();
+  }
+}
+
+// Asks the service callback to cancel. If Init() has not run yet, the request
+// is remembered in mPreCanceled.
+void nsSpeechTask::Cancel() {
+  MOZ_ASSERT(XRE_IsParentProcess());
+
+  LOG(LogLevel::Debug, ("nsSpeechTask::Cancel"));
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnCancel();
+    NS_WARNING_ASSERTION(NS_SUCCEEDED(rv),
+                         "Unable to call onCancel() callback");
+  }
+
+  if (!mInited) {
+    mPreCanceled = true;
+  }
+}
+
+// Ends the task immediately with zero elapsed time and character index,
+// without consulting the service callback.
+void nsSpeechTask::ForceEnd() {
+  if (!mInited) {
+    mPreCanceled = true;
+  }
+
+  DispatchEnd(0, 0);
+}
+
+void
nsSpeechTask::SetSpeechSynthesis(SpeechSynthesis* aSpeechSynthesis) { + mSpeechSynthesis = aSpeechSynthesis; +} + +void nsSpeechTask::CreateAudioChannelAgent() { + if (!mUtterance) { + return; + } + + if (mAudioChannelAgent) { + mAudioChannelAgent->NotifyStoppedPlaying(); + } + + mAudioChannelAgent = new AudioChannelAgent(); + mAudioChannelAgent->InitWithWeakCallback(mUtterance->GetOwner(), this); + + nsresult rv = mAudioChannelAgent->NotifyStartedPlaying( + AudioChannelService::AudibleState::eAudible); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + mAudioChannelAgent->PullInitialUpdate(); +} + +void nsSpeechTask::DestroyAudioChannelAgent() { + if (mAudioChannelAgent) { + mAudioChannelAgent->NotifyStoppedPlaying(); + mAudioChannelAgent = nullptr; + } +} + +NS_IMETHODIMP +nsSpeechTask::WindowVolumeChanged(float aVolume, bool aMuted) { + SetAudioOutputVolume(aMuted ? 0.0 : mVolume * aVolume); + return NS_OK; +} + +NS_IMETHODIMP +nsSpeechTask::WindowSuspendChanged(nsSuspendedTypes aSuspend) { + if (!mUtterance) { + return NS_OK; + } + + if (aSuspend == nsISuspendedTypes::NONE_SUSPENDED && mUtterance->mPaused) { + Resume(); + } else if (aSuspend != nsISuspendedTypes::NONE_SUSPENDED && + !mUtterance->mPaused) { + Pause(); + } + return NS_OK; +} + +NS_IMETHODIMP +nsSpeechTask::WindowAudioCaptureChanged(bool aCapture) { + // This is not supported yet. + return NS_OK; +} + +void nsSpeechTask::SetAudioOutputVolume(float aVolume) { + if (mCallback) { + mCallback->OnVolumeChanged(aVolume); + } +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/nsSpeechTask.h b/dom/media/webspeech/synth/nsSpeechTask.h new file mode 100644 index 0000000000..fc121cf8f1 --- /dev/null +++ b/dom/media/webspeech/synth/nsSpeechTask.h @@ -0,0 +1,128 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_nsSpeechTask_h +#define mozilla_dom_nsSpeechTask_h + +#include "SpeechSynthesisUtterance.h" +#include "AudioChannelAgent.h" +#include "nsISpeechService.h" +#include "nsWeakReference.h" + +namespace mozilla { + +class SharedBuffer; + +namespace dom { + +class SpeechSynthesisUtterance; +class SpeechSynthesis; + +class nsSpeechTask : public nsISpeechTask, + public nsIAudioChannelAgentCallback, + public nsSupportsWeakReference { + public: + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsSpeechTask, nsISpeechTask) + + NS_DECL_NSISPEECHTASK + NS_DECL_NSIAUDIOCHANNELAGENTCALLBACK + + explicit nsSpeechTask(SpeechSynthesisUtterance* aUtterance, + bool aShouldResistFingerprinting); + nsSpeechTask(float aVolume, const nsAString& aText, + bool aShouldResistFingerprinting); + + virtual void Pause(); + + virtual void Resume(); + + virtual void Cancel(); + + virtual void ForceEnd(); + + void SetSpeechSynthesis(SpeechSynthesis* aSpeechSynthesis); + + void Init(); + + void SetChosenVoiceURI(const nsAString& aUri); + + virtual void SetAudioOutputVolume(float aVolume); + + void ForceError(float aElapsedTime, uint32_t aCharIndex); + + bool IsPreCanceled() { return mPreCanceled; }; + + bool IsPrePaused() { return mPrePaused; } + + bool ShouldResistFingerprinting() { return mShouldResistFingerprinting; } + + enum { STATE_PENDING, STATE_SPEAKING, STATE_ENDED }; + + uint32_t GetState() const { return mState; } + + bool IsSpeaking() const { return mState == STATE_SPEAKING; } + + bool IsPending() const { return mState == STATE_PENDING; } + + protected: + virtual ~nsSpeechTask(); + + nsresult DispatchStartImpl(); + + virtual nsresult DispatchStartImpl(const nsAString& aUri); + + virtual nsresult DispatchEndImpl(float aElapsedTime, uint32_t aCharIndex); + + virtual nsresult DispatchPauseImpl(float aElapsedTime, uint32_t 
aCharIndex); + + virtual nsresult DispatchResumeImpl(float aElapsedTime, uint32_t aCharIndex); + + virtual nsresult DispatchErrorImpl(float aElapsedTime, uint32_t aCharIndex); + + virtual nsresult DispatchBoundaryImpl(const nsAString& aName, + float aElapsedTime, uint32_t aCharIndex, + uint32_t aCharLength, uint8_t argc); + + virtual nsresult DispatchMarkImpl(const nsAString& aName, float aElapsedTime, + uint32_t aCharIndex); + + RefPtr mUtterance; + + float mVolume; + + nsString mText; + + bool mInited; + + bool mPrePaused; + + bool mPreCanceled; + + private: + void End(); + + void CreateAudioChannelAgent(); + + void DestroyAudioChannelAgent(); + + nsCOMPtr mCallback; + + RefPtr mAudioChannelAgent; + + RefPtr mSpeechSynthesis; + + nsString mChosenVoiceURI; + + bool mShouldResistFingerprinting; + + uint32_t mState; +}; + +} // namespace dom +} // namespace mozilla + +#endif diff --git a/dom/media/webspeech/synth/nsSynthVoiceRegistry.cpp b/dom/media/webspeech/synth/nsSynthVoiceRegistry.cpp new file mode 100644 index 0000000000..d289c81655 --- /dev/null +++ b/dom/media/webspeech/synth/nsSynthVoiceRegistry.cpp @@ -0,0 +1,762 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsISpeechService.h" +#include "nsServiceManagerUtils.h" +#include "nsCategoryManagerUtils.h" + +#include "SpeechSynthesisUtterance.h" +#include "SpeechSynthesisVoice.h" +#include "nsContentUtils.h" +#include "nsSynthVoiceRegistry.h" +#include "nsSpeechTask.h" +#include "AudioChannelService.h" + +#include "nsString.h" +#include "mozilla/ClearOnShutdown.h" +#include "mozilla/dom/ContentChild.h" +#include "mozilla/dom/ContentParent.h" +#include "mozilla/dom/Document.h" +#include "mozilla/intl/LocaleService.h" +#include "mozilla/StaticPrefs_media.h" +#include "mozilla/StaticPtr.h" +#include "mozilla/Unused.h" + +#include "SpeechSynthesisChild.h" +#include "SpeechSynthesisParent.h" + +using mozilla::intl::LocaleService; + +#undef LOG +extern mozilla::LogModule* GetSpeechSynthLog(); +#define LOG(type, msg) MOZ_LOG(GetSpeechSynthLog(), type, msg) + +namespace { + +void GetAllSpeechSynthActors( + nsTArray& aActors) { + MOZ_ASSERT(NS_IsMainThread()); + MOZ_ASSERT(aActors.IsEmpty()); + + AutoTArray contentActors; + mozilla::dom::ContentParent::GetAll(contentActors); + + for (uint32_t contentIndex = 0; contentIndex < contentActors.Length(); + ++contentIndex) { + MOZ_ASSERT(contentActors[contentIndex]); + + AutoTArray speechsynthActors; + contentActors[contentIndex]->ManagedPSpeechSynthesisParent( + speechsynthActors); + + for (uint32_t speechsynthIndex = 0; + speechsynthIndex < speechsynthActors.Length(); ++speechsynthIndex) { + MOZ_ASSERT(speechsynthActors[speechsynthIndex]); + + mozilla::dom::SpeechSynthesisParent* actor = + static_cast( + speechsynthActors[speechsynthIndex]); + aActors.AppendElement(actor); + } + } +} + +} // namespace + +namespace mozilla::dom { + +// VoiceData + +class VoiceData final { + private: + // Private destructor, to discourage deletion outside of Release(): + ~VoiceData() = default; + + public: + VoiceData(nsISpeechService* aService, const nsAString& aUri, + const nsAString& aName, const nsAString& aLang, bool aIsLocal, + bool 
aQueuesUtterances) + : mService(aService), + mUri(aUri), + mName(aName), + mLang(aLang), + mIsLocal(aIsLocal), + mIsQueued(aQueuesUtterances) {} + + NS_INLINE_DECL_REFCOUNTING(VoiceData) + + nsCOMPtr mService; + + nsString mUri; + + nsString mName; + + nsString mLang; + + bool mIsLocal; + + bool mIsQueued; +}; + +// GlobalQueueItem + +class GlobalQueueItem final { + private: + // Private destructor, to discourage deletion outside of Release(): + ~GlobalQueueItem() = default; + + public: + GlobalQueueItem(VoiceData* aVoice, nsSpeechTask* aTask, + const nsAString& aText, const float& aVolume, + const float& aRate, const float& aPitch) + : mVoice(aVoice), + mTask(aTask), + mText(aText), + mVolume(aVolume), + mRate(aRate), + mPitch(aPitch), + mIsLocal(false) {} + + NS_INLINE_DECL_REFCOUNTING(GlobalQueueItem) + + RefPtr mVoice; + + RefPtr mTask; + + nsString mText; + + float mVolume; + + float mRate; + + float mPitch; + + bool mIsLocal; +}; + +// nsSynthVoiceRegistry + +static StaticRefPtr gSynthVoiceRegistry; + +NS_IMPL_ISUPPORTS(nsSynthVoiceRegistry, nsISynthVoiceRegistry) + +nsSynthVoiceRegistry::nsSynthVoiceRegistry() + : mSpeechSynthChild(nullptr), mUseGlobalQueue(false), mIsSpeaking(false) { + if (XRE_IsContentProcess()) { + mSpeechSynthChild = new SpeechSynthesisChild(); + ContentChild::GetSingleton()->SendPSpeechSynthesisConstructor( + mSpeechSynthChild); + } +} + +nsSynthVoiceRegistry::~nsSynthVoiceRegistry() { + LOG(LogLevel::Debug, ("~nsSynthVoiceRegistry")); + + // mSpeechSynthChild's lifecycle is managed by the Content protocol. + mSpeechSynthChild = nullptr; + + mUriVoiceMap.Clear(); +} + +nsSynthVoiceRegistry* nsSynthVoiceRegistry::GetInstance() { + MOZ_ASSERT(NS_IsMainThread()); + + if (!gSynthVoiceRegistry) { + gSynthVoiceRegistry = new nsSynthVoiceRegistry(); + ClearOnShutdown(&gSynthVoiceRegistry); + if (XRE_IsParentProcess()) { + // Start up all speech synth services. 
+      NS_CreateServicesFromCategory(NS_SPEECH_SYNTH_STARTED, nullptr,
+                                    NS_SPEECH_SYNTH_STARTED);
+    }
+  }
+
+  return gSynthVoiceRegistry;
+}
+
+// Hands out an owning reference to the singleton returned by GetInstance().
+already_AddRefed<nsSynthVoiceRegistry>
+nsSynthVoiceRegistry::GetInstanceForService() {
+  RefPtr<nsSynthVoiceRegistry> registry = GetInstance();
+
+  return registry.forget();
+}
+
+// Parent process only: snapshots every registered voice, the URIs of the
+// default voices and the current speaking flag, and sends them to aParent in
+// a single message. Returns the result of that send.
+bool nsSynthVoiceRegistry::SendInitialVoicesAndState(
+    SpeechSynthesisParent* aParent) {
+  MOZ_ASSERT(XRE_IsParentProcess());
+
+  nsTArray<RemoteVoice> voices;
+  nsTArray<nsString> defaults;
+
+  // Iterate by reference: nothing here needs its own strong reference to the
+  // voice, the arrays are not modified while we walk them.
+  for (const auto& voice : mVoices) {
+    voices.AppendElement(RemoteVoice(voice->mUri, voice->mName, voice->mLang,
+                                     voice->mIsLocal, voice->mIsQueued));
+  }
+
+  for (const auto& voice : mDefaultVoices) {
+    defaults.AppendElement(voice->mUri);
+  }
+
+  return aParent->SendInitialVoicesAndState(voices, defaults, IsSpeaking());
+}
+
+// Receiving side of the message above: registers each remote voice locally
+// (with no service), replays the default-voice list, copies the speaking
+// flag, and notifies observers if at least one voice arrived.
+void nsSynthVoiceRegistry::RecvInitialVoicesAndState(
+    const nsTArray<RemoteVoice>& aVoices, const nsTArray<nsString>& aDefaults,
+    const bool& aIsSpeaking) {
+  // We really should have a local instance since this is a directed response to
+  // an Init() call.
+  MOZ_ASSERT(gSynthVoiceRegistry);
+
+  for (const auto& voice : aVoices) {
+    gSynthVoiceRegistry->AddVoiceImpl(nullptr, voice.voiceURI(), voice.name(),
+                                      voice.lang(), voice.localService(),
+                                      voice.queued());
+  }
+
+  for (const auto& uri : aDefaults) {
+    gSynthVoiceRegistry->SetDefaultVoice(uri, true);
+  }
+
+  gSynthVoiceRegistry->mIsSpeaking = aIsSpeaking;
+
+  if (aVoices.Length()) {
+    gSynthVoiceRegistry->NotifyVoicesChanged();
+  }
+}
+
+void nsSynthVoiceRegistry::RecvRemoveVoice(const nsAString& aUri) {
+  // If we don't have a local instance of the registry yet, we will receive
+  // current voices at construction time.
+  if (!gSynthVoiceRegistry) {
+    return;
+  }
+
+  gSynthVoiceRegistry->RemoveVoice(nullptr, aUri);
+}
+
+void nsSynthVoiceRegistry::RecvAddVoice(const RemoteVoice& aVoice) {
+  // If we don't have a local instance of the registry yet, we will receive
+  // current voices at construction time.
+  if (!gSynthVoiceRegistry) {
+    return;
+  }
+
+  gSynthVoiceRegistry->AddVoiceImpl(nullptr, aVoice.voiceURI(), aVoice.name(),
+                                    aVoice.lang(), aVoice.localService(),
+                                    aVoice.queued());
+}
+
+void nsSynthVoiceRegistry::RecvSetDefaultVoice(const nsAString& aUri,
+                                               bool aIsDefault) {
+  // If we don't have a local instance of the registry yet, we will receive
+  // current voices at construction time.
+  if (!gSynthVoiceRegistry) {
+    return;
+  }
+
+  gSynthVoiceRegistry->SetDefaultVoice(aUri, aIsDefault);
+}
+
+void nsSynthVoiceRegistry::RecvIsSpeakingChanged(bool aIsSpeaking) {
+  // If we don't have a local instance of the registry yet, we will get the
+  // speaking state on construction.
+  if (!gSynthVoiceRegistry) {
+    return;
+  }
+
+  gSynthVoiceRegistry->mIsSpeaking = aIsSpeaking;
+}
+
+void nsSynthVoiceRegistry::RecvNotifyVoicesChanged() {
+  // If we don't have a local instance of the registry yet, we don't care.
+  if (!gSynthVoiceRegistry) {
+    return;
+  }
+
+  gSynthVoiceRegistry->NotifyVoicesChanged();
+}
+
+// nsISynthVoiceRegistry entry point for registering a voice. Refused with
+// NS_ERROR_NOT_AVAILABLE in a content process; otherwise forwards to
+// AddVoiceImpl().
+NS_IMETHODIMP
+nsSynthVoiceRegistry::AddVoice(nsISpeechService* aService,
+                               const nsAString& aUri, const nsAString& aName,
+                               const nsAString& aLang, bool aLocalService,
+                               bool aQueuesUtterances) {
+  LOG(LogLevel::Debug,
+      ("nsSynthVoiceRegistry::AddVoice uri='%s' name='%s' lang='%s' local=%s "
+       "queued=%s",
+       NS_ConvertUTF16toUTF8(aUri).get(), NS_ConvertUTF16toUTF8(aName).get(),
+       NS_ConvertUTF16toUTF8(aLang).get(), aLocalService ? "true" : "false",
+       aQueuesUtterances ? "true" : "false"));
+
+  if (NS_WARN_IF(XRE_IsContentProcess())) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  return AddVoiceImpl(aService, aUri, aName, aLang, aLocalService,
+                      aQueuesUtterances);
+}
+
+// Unregisters the voice identified by aUri and tells every speech-synth actor
+// about it. Returns NS_ERROR_NOT_AVAILABLE if the URI is unknown and
+// NS_ERROR_INVALID_ARG if aService is not the service that registered it.
+NS_IMETHODIMP
+nsSynthVoiceRegistry::RemoveVoice(nsISpeechService* aService,
+                                  const nsAString& aUri) {
+  LOG(LogLevel::Debug, ("nsSynthVoiceRegistry::RemoveVoice uri='%s' (%s)",
+                        NS_ConvertUTF16toUTF8(aUri).get(),
+                        (XRE_IsContentProcess()) ? "child" : "parent"));
+
+  bool found = false;
+  VoiceData* retval = mUriVoiceMap.GetWeak(aUri, &found);
+
+  if (NS_WARN_IF(!(found))) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+  if (NS_WARN_IF(!(aService == retval->mService))) {
+    return NS_ERROR_INVALID_ARG;
+  }
+
+  // retval is the non-owning pointer handed back by GetWeak(). The removals
+  // below may drop the references that keep the VoiceData alive, so read
+  // everything we still need from it first and do not touch it afterwards.
+  const bool wasQueued = retval->mIsQueued;
+
+  mVoices.RemoveElement(retval);
+  mDefaultVoices.RemoveElement(retval);
+  mUriVoiceMap.Remove(aUri);
+
+  if (wasQueued && !StaticPrefs::media_webspeech_synth_force_global_queue()) {
+    // Check if this is the last queued voice, and disable the global queue if
+    // it is.
+    bool queued = false;
+    for (const auto& voice : mVoices) {
+      if (voice->mIsQueued) {
+        queued = true;
+        break;
+      }
+    }
+    if (!queued) {
+      mUseGlobalQueue = false;
+    }
+  }
+
+  nsTArray<SpeechSynthesisParent*> ssplist;
+  GetAllSpeechSynthActors(ssplist);
+
+  for (SpeechSynthesisParent* actor : ssplist) {
+    Unused << actor->SendVoiceRemoved(aUri);
+  }
+
+  return NS_OK;
+}
+
+// In the parent process, forwards the notification to every speech-synth
+// actor; in every process, fires the "synth-voices-changed" observer topic.
+NS_IMETHODIMP
+nsSynthVoiceRegistry::NotifyVoicesChanged() {
+  if (XRE_IsParentProcess()) {
+    nsTArray<SpeechSynthesisParent*> ssplist;
+    GetAllSpeechSynthActors(ssplist);
+
+    for (SpeechSynthesisParent* actor : ssplist) {
+      Unused << actor->SendNotifyVoicesChanged();
+    }
+  }
+
+  nsCOMPtr<nsIObserverService> obs = mozilla::services::GetObserverService();
+  if (NS_WARN_IF(!(obs))) {
+    return NS_ERROR_NOT_AVAILABLE;
+  }
+
+  obs->NotifyObservers(nullptr, "synth-voices-changed", nullptr);
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSynthVoiceRegistry::SetDefaultVoice(const nsAString& aUri, bool aIsDefault) {
+  bool found = false;
+  VoiceData*
retval = mUriVoiceMap.GetWeak(aUri, &found); + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + mDefaultVoices.RemoveElement(retval); + + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::SetDefaultVoice %s %s", + NS_ConvertUTF16toUTF8(aUri).get(), aIsDefault ? "true" : "false")); + + if (aIsDefault) { + mDefaultVoices.AppendElement(retval); + } + + if (XRE_IsParentProcess()) { + nsTArray ssplist; + GetAllSpeechSynthActors(ssplist); + + for (uint32_t i = 0; i < ssplist.Length(); ++i) { + Unused << ssplist[i]->SendSetDefaultVoice(aUri, aIsDefault); + } + } + + return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::GetVoiceCount(uint32_t* aRetval) { + *aRetval = mVoices.Length(); + + return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::GetVoice(uint32_t aIndex, nsAString& aRetval) { + if (NS_WARN_IF(!(aIndex < mVoices.Length()))) { + return NS_ERROR_INVALID_ARG; + } + + aRetval = mVoices[aIndex]->mUri; + + return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::IsDefaultVoice(const nsAString& aUri, bool* aRetval) { + bool found; + VoiceData* voice = mUriVoiceMap.GetWeak(aUri, &found); + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + for (int32_t i = mDefaultVoices.Length(); i > 0;) { + VoiceData* defaultVoice = mDefaultVoices[--i]; + + if (voice->mLang.Equals(defaultVoice->mLang)) { + *aRetval = voice == defaultVoice; + return NS_OK; + } + } + + *aRetval = false; + return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::IsLocalVoice(const nsAString& aUri, bool* aRetval) { + bool found; + VoiceData* voice = mUriVoiceMap.GetWeak(aUri, &found); + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + *aRetval = voice->mIsLocal; + return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::GetVoiceLang(const nsAString& aUri, nsAString& aRetval) { + bool found; + VoiceData* voice = mUriVoiceMap.GetWeak(aUri, &found); + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + aRetval = voice->mLang; + 
return NS_OK; +} + +NS_IMETHODIMP +nsSynthVoiceRegistry::GetVoiceName(const nsAString& aUri, nsAString& aRetval) { + bool found; + VoiceData* voice = mUriVoiceMap.GetWeak(aUri, &found); + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + aRetval = voice->mName; + return NS_OK; +} + +nsresult nsSynthVoiceRegistry::AddVoiceImpl( + nsISpeechService* aService, const nsAString& aUri, const nsAString& aName, + const nsAString& aLang, bool aLocalService, bool aQueuesUtterances) { + const bool found = mUriVoiceMap.Contains(aUri); + if (NS_WARN_IF(found)) { + return NS_ERROR_INVALID_ARG; + } + + RefPtr voice = new VoiceData(aService, aUri, aName, aLang, + aLocalService, aQueuesUtterances); + + mVoices.AppendElement(voice); + mUriVoiceMap.InsertOrUpdate(aUri, std::move(voice)); + mUseGlobalQueue |= aQueuesUtterances; + + nsTArray ssplist; + GetAllSpeechSynthActors(ssplist); + + if (!ssplist.IsEmpty()) { + mozilla::dom::RemoteVoice ssvoice(nsString(aUri), nsString(aName), + nsString(aLang), aLocalService, + aQueuesUtterances); + + for (uint32_t i = 0; i < ssplist.Length(); ++i) { + Unused << ssplist[i]->SendVoiceAdded(ssvoice); + } + } + + return NS_OK; +} + +bool nsSynthVoiceRegistry::FindVoiceByLang(const nsAString& aLang, + VoiceData** aRetval) { + nsAString::const_iterator dashPos, start, end; + aLang.BeginReading(start); + aLang.EndReading(end); + + while (true) { + nsAutoString langPrefix(Substring(start, end)); + + for (int32_t i = mDefaultVoices.Length(); i > 0;) { + VoiceData* voice = mDefaultVoices[--i]; + + if (StringBeginsWith(voice->mLang, langPrefix)) { + *aRetval = voice; + return true; + } + } + + for (int32_t i = mVoices.Length(); i > 0;) { + VoiceData* voice = mVoices[--i]; + + if (StringBeginsWith(voice->mLang, langPrefix)) { + *aRetval = voice; + return true; + } + } + + dashPos = end; + end = start; + + if (!RFindInReadable(u"-"_ns, end, dashPos)) { + break; + } + } + + return false; +} + +VoiceData* 
nsSynthVoiceRegistry::FindBestMatch(const nsAString& aUri, + const nsAString& aLang) { + if (mVoices.IsEmpty()) { + return nullptr; + } + + bool found = false; + VoiceData* retval = mUriVoiceMap.GetWeak(aUri, &found); + + if (found) { + LOG(LogLevel::Debug, ("nsSynthVoiceRegistry::FindBestMatch - Matched URI")); + return retval; + } + + // No voice matched the URI; try finding a match for the given language. + if (!aLang.IsVoid() && !aLang.IsEmpty()) { + if (FindVoiceByLang(aLang, &retval)) { + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::FindBestMatch - Matched language (%s ~= %s)", + NS_ConvertUTF16toUTF8(aLang).get(), + NS_ConvertUTF16toUTF8(retval->mLang).get())); + + return retval; + } + } + + // Try UI language. + nsAutoCString uiLang; + LocaleService::GetInstance()->GetAppLocaleAsBCP47(uiLang); + + if (FindVoiceByLang(NS_ConvertASCIItoUTF16(uiLang), &retval)) { + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::FindBestMatch - Matched UI language (%s ~= %s)", + uiLang.get(), NS_ConvertUTF16toUTF8(retval->mLang).get())); + + return retval; + } + + // Try en-US, the language of locale "C" + if (FindVoiceByLang(u"en-US"_ns, &retval)) { + LOG(LogLevel::Debug, ("nsSynthVoiceRegistry::FindBestMatch - Matched C " + "locale language (en-US ~= %s)", + NS_ConvertUTF16toUTF8(retval->mLang).get())); + + return retval; + } + + // The top default voice is better than nothing... + if (!mDefaultVoices.IsEmpty()) { + return mDefaultVoices.LastElement(); + } + + return nullptr; +} + +already_AddRefed nsSynthVoiceRegistry::SpeakUtterance( + SpeechSynthesisUtterance& aUtterance, const nsAString& aDocLang) { + nsString lang = + nsString(aUtterance.mLang.IsEmpty() ? 
aDocLang : aUtterance.mLang); + nsAutoString uri; + + if (aUtterance.mVoice) { + aUtterance.mVoice->GetVoiceURI(uri); + } + + // Get current audio volume to apply speech call + float volume = aUtterance.Volume(); + RefPtr service = AudioChannelService::GetOrCreate(); + if (service) { + if (nsCOMPtr topWindow = aUtterance.GetOwner()) { + // TODO : use audio channel agent, open new bug to fix it. + AudioPlaybackConfig config = + service->GetMediaConfig(topWindow->GetOuterWindow()); + volume = config.mMuted ? 0.0f : config.mVolume * volume; + } + } + + RefPtr task; + if (XRE_IsContentProcess()) { + task = new SpeechTaskChild(&aUtterance, + aUtterance.ShouldResistFingerprinting()); + SpeechSynthesisRequestChild* actor = new SpeechSynthesisRequestChild( + static_cast(task.get())); + mSpeechSynthChild->SendPSpeechSynthesisRequestConstructor( + actor, aUtterance.mText, lang, uri, volume, aUtterance.Rate(), + aUtterance.Pitch(), aUtterance.ShouldResistFingerprinting()); + } else { + task = + new nsSpeechTask(&aUtterance, aUtterance.ShouldResistFingerprinting()); + Speak(aUtterance.mText, lang, uri, volume, aUtterance.Rate(), + aUtterance.Pitch(), task); + } + + return task.forget(); +} + +void nsSynthVoiceRegistry::Speak(const nsAString& aText, const nsAString& aLang, + const nsAString& aUri, const float& aVolume, + const float& aRate, const float& aPitch, + nsSpeechTask* aTask) { + MOZ_ASSERT(XRE_IsParentProcess()); + + if (aTask->ShouldResistFingerprinting()) { + aTask->ForceError(0, 0); + return; + } + + VoiceData* voice = FindBestMatch(aUri, aLang); + + if (!voice) { + NS_WARNING("No voices found."); + aTask->ForceError(0, 0); + return; + } + + aTask->SetChosenVoiceURI(voice->mUri); + + if (mUseGlobalQueue || + StaticPrefs::media_webspeech_synth_force_global_queue()) { + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::Speak queueing text='%s' lang='%s' uri='%s' " + "rate=%f pitch=%f", + NS_ConvertUTF16toUTF8(aText).get(), NS_ConvertUTF16toUTF8(aLang).get(), + 
NS_ConvertUTF16toUTF8(aUri).get(), aRate, aPitch)); + RefPtr item = + new GlobalQueueItem(voice, aTask, aText, aVolume, aRate, aPitch); + mGlobalQueue.AppendElement(item); + + if (mGlobalQueue.Length() == 1) { + SpeakImpl(item->mVoice, item->mTask, item->mText, item->mVolume, + item->mRate, item->mPitch); + } + } else { + SpeakImpl(voice, aTask, aText, aVolume, aRate, aPitch); + } +} + +void nsSynthVoiceRegistry::SpeakNext() { + MOZ_ASSERT(XRE_IsParentProcess()); + + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::SpeakNext %d", mGlobalQueue.IsEmpty())); + + SetIsSpeaking(false); + + if (mGlobalQueue.IsEmpty()) { + return; + } + + mGlobalQueue.RemoveElementAt(0); + + while (!mGlobalQueue.IsEmpty()) { + RefPtr item = mGlobalQueue.ElementAt(0); + if (item->mTask->IsPreCanceled()) { + mGlobalQueue.RemoveElementAt(0); + continue; + } + if (!item->mTask->IsPrePaused()) { + SpeakImpl(item->mVoice, item->mTask, item->mText, item->mVolume, + item->mRate, item->mPitch); + } + break; + } +} + +void nsSynthVoiceRegistry::ResumeQueue() { + MOZ_ASSERT(XRE_IsParentProcess()); + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::ResumeQueue %d", mGlobalQueue.IsEmpty())); + + if (mGlobalQueue.IsEmpty()) { + return; + } + + RefPtr item = mGlobalQueue.ElementAt(0); + if (!item->mTask->IsPrePaused()) { + SpeakImpl(item->mVoice, item->mTask, item->mText, item->mVolume, + item->mRate, item->mPitch); + } +} + +bool nsSynthVoiceRegistry::IsSpeaking() { return mIsSpeaking; } + +void nsSynthVoiceRegistry::SetIsSpeaking(bool aIsSpeaking) { + MOZ_ASSERT(XRE_IsParentProcess()); + + // Only set to 'true' if global queue is enabled. 
+ mIsSpeaking = + aIsSpeaking && (mUseGlobalQueue || + StaticPrefs::media_webspeech_synth_force_global_queue()); + + nsTArray ssplist; + GetAllSpeechSynthActors(ssplist); + for (uint32_t i = 0; i < ssplist.Length(); ++i) { + Unused << ssplist[i]->SendIsSpeakingChanged(aIsSpeaking); + } +} + +void nsSynthVoiceRegistry::SpeakImpl(VoiceData* aVoice, nsSpeechTask* aTask, + const nsAString& aText, + const float& aVolume, const float& aRate, + const float& aPitch) { + LOG(LogLevel::Debug, + ("nsSynthVoiceRegistry::SpeakImpl queueing text='%s' uri='%s' rate=%f " + "pitch=%f", + NS_ConvertUTF16toUTF8(aText).get(), + NS_ConvertUTF16toUTF8(aVoice->mUri).get(), aRate, aPitch)); + + aTask->Init(); + + if (NS_FAILED(aVoice->mService->Speak(aText, aVoice->mUri, aVolume, aRate, + aPitch, aTask))) { + aTask->DispatchError(0, 0); + } +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/nsSynthVoiceRegistry.h b/dom/media/webspeech/synth/nsSynthVoiceRegistry.h new file mode 100644 index 0000000000..85c67c087f --- /dev/null +++ b/dom/media/webspeech/synth/nsSynthVoiceRegistry.h @@ -0,0 +1,99 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef mozilla_dom_nsSynthVoiceRegistry_h +#define mozilla_dom_nsSynthVoiceRegistry_h + +#include "nsISynthVoiceRegistry.h" +#include "nsRefPtrHashtable.h" +#include "nsTArray.h" + +class nsISpeechService; + +namespace mozilla::dom { + +class RemoteVoice; +class SpeechSynthesisUtterance; +class SpeechSynthesisChild; +class SpeechSynthesisParent; +class nsSpeechTask; +class VoiceData; +class GlobalQueueItem; + +class nsSynthVoiceRegistry final : public nsISynthVoiceRegistry { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSISYNTHVOICEREGISTRY + + nsSynthVoiceRegistry(); + + already_AddRefed SpeakUtterance( + SpeechSynthesisUtterance& aUtterance, const nsAString& aDocLang); + + void Speak(const nsAString& aText, const nsAString& aLang, + const nsAString& aUri, const float& aVolume, const float& aRate, + const float& aPitch, nsSpeechTask* aTask); + + bool SendInitialVoicesAndState(SpeechSynthesisParent* aParent); + + void SpeakNext(); + + void ResumeQueue(); + + bool IsSpeaking(); + + void SetIsSpeaking(bool aIsSpeaking); + + static nsSynthVoiceRegistry* GetInstance(); + + static already_AddRefed GetInstanceForService(); + + static void RecvInitialVoicesAndState(const nsTArray& aVoices, + const nsTArray& aDefaults, + const bool& aIsSpeaking); + + static void RecvRemoveVoice(const nsAString& aUri); + + static void RecvAddVoice(const RemoteVoice& aVoice); + + static void RecvSetDefaultVoice(const nsAString& aUri, bool aIsDefault); + + static void RecvIsSpeakingChanged(bool aIsSpeaking); + + static void RecvNotifyVoicesChanged(); + + private: + virtual ~nsSynthVoiceRegistry(); + + VoiceData* FindBestMatch(const nsAString& aUri, const nsAString& lang); + + bool FindVoiceByLang(const nsAString& aLang, VoiceData** aRetval); + + nsresult AddVoiceImpl(nsISpeechService* aService, const nsAString& aUri, + const nsAString& aName, const nsAString& aLang, + bool aLocalService, bool aQueuesUtterances); + + void SpeakImpl(VoiceData* aVoice, nsSpeechTask* aTask, const nsAString& 
aText, + const float& aVolume, const float& aRate, const float& aPitch); + + nsTArray> mVoices; + + nsTArray> mDefaultVoices; + + nsRefPtrHashtable mUriVoiceMap; + + SpeechSynthesisChild* mSpeechSynthChild; + + bool mUseGlobalQueue; + + nsTArray> mGlobalQueue; + + bool mIsSpeaking; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/synth/speechd/SpeechDispatcherService.cpp b/dom/media/webspeech/synth/speechd/SpeechDispatcherService.cpp new file mode 100644 index 0000000000..e0d5488748 --- /dev/null +++ b/dom/media/webspeech/synth/speechd/SpeechDispatcherService.cpp @@ -0,0 +1,538 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "SpeechDispatcherService.h" + +#include "mozilla/dom/nsSpeechTask.h" +#include "mozilla/dom/nsSynthVoiceRegistry.h" +#include "mozilla/ClearOnShutdown.h" +#include "mozilla/Preferences.h" +#include "mozilla/StaticPrefs_media.h" +#include "nsEscape.h" +#include "nsISupports.h" +#include "nsPrintfCString.h" +#include "nsReadableUtils.h" +#include "nsServiceManagerUtils.h" +#include "nsThreadUtils.h" +#include "nsXULAppAPI.h" +#include "prlink.h" + +#include +#include + +#define URI_PREFIX "urn:moz-tts:speechd:" + +#define MAX_RATE static_cast(2.5) +#define MIN_RATE static_cast(0.5) + +// Some structures for libspeechd +typedef enum { + SPD_EVENT_BEGIN, + SPD_EVENT_END, + SPD_EVENT_INDEX_MARK, + SPD_EVENT_CANCEL, + SPD_EVENT_PAUSE, + SPD_EVENT_RESUME +} SPDNotificationType; + +typedef enum { + SPD_BEGIN = 1, + SPD_END = 2, + SPD_INDEX_MARKS = 4, + SPD_CANCEL = 8, + SPD_PAUSE = 16, + SPD_RESUME = 32, + + SPD_ALL = 0x3f +} SPDNotification; + +typedef enum { SPD_MODE_SINGLE = 0, SPD_MODE_THREADED = 1 } SPDConnectionMode; + 
+typedef void (*SPDCallback)(size_t msg_id, size_t client_id, + SPDNotificationType state); + +typedef void (*SPDCallbackIM)(size_t msg_id, size_t client_id, + SPDNotificationType state, char* index_mark); + +struct SPDConnection { + SPDCallback callback_begin; + SPDCallback callback_end; + SPDCallback callback_cancel; + SPDCallback callback_pause; + SPDCallback callback_resume; + SPDCallbackIM callback_im; + + /* partial, more private fields in structure */ +}; + +struct SPDVoice { + char* name; + char* language; + char* variant; +}; + +typedef enum { + SPD_IMPORTANT = 1, + SPD_MESSAGE = 2, + SPD_TEXT = 3, + SPD_NOTIFICATION = 4, + SPD_PROGRESS = 5 +} SPDPriority; + +#define SPEECHD_FUNCTIONS \ + FUNC(spd_open, SPDConnection*, \ + (const char*, const char*, const char*, SPDConnectionMode)) \ + FUNC(spd_close, void, (SPDConnection*)) \ + FUNC(spd_list_synthesis_voices, SPDVoice**, (SPDConnection*)) \ + FUNC(spd_say, int, (SPDConnection*, SPDPriority, const char*)) \ + FUNC(spd_cancel, int, (SPDConnection*)) \ + FUNC(spd_set_volume, int, (SPDConnection*, int)) \ + FUNC(spd_set_voice_rate, int, (SPDConnection*, int)) \ + FUNC(spd_set_voice_pitch, int, (SPDConnection*, int)) \ + FUNC(spd_set_synthesis_voice, int, (SPDConnection*, const char*)) \ + FUNC(spd_set_notification_on, int, (SPDConnection*, SPDNotification)) + +#define FUNC(name, type, params) \ + typedef type(*_##name##_fn) params; \ + static _##name##_fn _##name; + +SPEECHD_FUNCTIONS + +#undef FUNC + +#define spd_open _spd_open +#define spd_close _spd_close +#define spd_list_synthesis_voices _spd_list_synthesis_voices +#define spd_say _spd_say +#define spd_cancel _spd_cancel +#define spd_set_volume _spd_set_volume +#define spd_set_voice_rate _spd_set_voice_rate +#define spd_set_voice_pitch _spd_set_voice_pitch +#define spd_set_synthesis_voice _spd_set_synthesis_voice +#define spd_set_notification_on _spd_set_notification_on + +static PRLibrary* speechdLib = nullptr; + +typedef void 
(*nsSpeechDispatcherFunc)(); +struct nsSpeechDispatcherDynamicFunction { + const char* functionName; + nsSpeechDispatcherFunc* function; +}; + +namespace mozilla::dom { + +StaticRefPtr SpeechDispatcherService::sSingleton; + +class SpeechDispatcherVoice { + public: + SpeechDispatcherVoice(const nsAString& aName, const nsAString& aLanguage) + : mName(aName), mLanguage(aLanguage) {} + + NS_INLINE_DECL_THREADSAFE_REFCOUNTING(SpeechDispatcherVoice) + + // Voice name + nsString mName; + + // Voice language, in BCP-47 syntax + nsString mLanguage; + + private: + ~SpeechDispatcherVoice() = default; +}; + +class SpeechDispatcherCallback final : public nsISpeechTaskCallback { + public: + SpeechDispatcherCallback(nsISpeechTask* aTask, + SpeechDispatcherService* aService) + : mTask(aTask), mService(aService) {} + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SpeechDispatcherCallback, + nsISpeechTaskCallback) + + NS_DECL_NSISPEECHTASKCALLBACK + + bool OnSpeechEvent(SPDNotificationType state); + + private: + ~SpeechDispatcherCallback() = default; + + // This pointer is used to dispatch events + nsCOMPtr mTask; + + // By holding a strong reference to the service we guarantee that it won't be + // destroyed before this runnable. + RefPtr mService; + + TimeStamp mStartTime; +}; + +NS_IMPL_CYCLE_COLLECTION(SpeechDispatcherCallback, mTask); + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechDispatcherCallback) + NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechDispatcherCallback) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechDispatcherCallback) + +NS_IMETHODIMP +SpeechDispatcherCallback::OnPause() { + // XXX: Speech dispatcher does not pause immediately, but waits for the speech + // to reach an index mark so that it could resume from that offset. 
+ // There is no support for word or sentence boundaries, so index marks would + // only occur in explicit SSML marks, and we don't support that yet. + // What in actuality happens, is that if you call spd_pause(), it will speak + // the utterance in its entirety, dispatch an end event, and then put speechd + // in a 'paused' state. Since it is after the utterance ended, we don't get + // that state change, and our speech api is in an unrecoverable state. + // So, since it is useless anyway, I am not implementing pause. + return NS_OK; +} + +NS_IMETHODIMP +SpeechDispatcherCallback::OnResume() { + // XXX: Unsupported, see OnPause(). + return NS_OK; +} + +NS_IMETHODIMP +SpeechDispatcherCallback::OnCancel() { + if (spd_cancel(mService->mSpeechdClient) < 0) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +NS_IMETHODIMP +SpeechDispatcherCallback::OnVolumeChanged(float aVolume) { + // XXX: This currently does not change the volume mid-utterance, but it + // doesn't do anything bad either. So we could put this here with the hopes + // that speechd supports this in the future. 
+ if (spd_set_volume(mService->mSpeechdClient, + static_cast(aVolume * 100)) < 0) { + return NS_ERROR_FAILURE; + } + + return NS_OK; +} + +bool SpeechDispatcherCallback::OnSpeechEvent(SPDNotificationType state) { + bool remove = false; + + switch (state) { + case SPD_EVENT_BEGIN: + mStartTime = TimeStamp::Now(); + mTask->DispatchStart(); + break; + + case SPD_EVENT_PAUSE: + mTask->DispatchPause((TimeStamp::Now() - mStartTime).ToSeconds(), 0); + break; + + case SPD_EVENT_RESUME: + mTask->DispatchResume((TimeStamp::Now() - mStartTime).ToSeconds(), 0); + break; + + case SPD_EVENT_CANCEL: + case SPD_EVENT_END: + mTask->DispatchEnd((TimeStamp::Now() - mStartTime).ToSeconds(), 0); + remove = true; + break; + + case SPD_EVENT_INDEX_MARK: + // Not yet supported + break; + + default: + break; + } + + return remove; +} + +static void speechd_cb(size_t msg_id, size_t client_id, + SPDNotificationType state) { + SpeechDispatcherService* service = + SpeechDispatcherService::GetInstance(false); + + if (service) { + NS_DispatchToMainThread(NewRunnableMethod( + "dom::SpeechDispatcherService::EventNotify", service, + &SpeechDispatcherService::EventNotify, static_cast(msg_id), + state)); + } +} + +NS_INTERFACE_MAP_BEGIN(SpeechDispatcherService) + NS_INTERFACE_MAP_ENTRY(nsISpeechService) + NS_INTERFACE_MAP_ENTRY(nsIObserver) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIObserver) +NS_INTERFACE_MAP_END + +NS_IMPL_ADDREF(SpeechDispatcherService) +NS_IMPL_RELEASE(SpeechDispatcherService) + +SpeechDispatcherService::SpeechDispatcherService() + : mInitialized(false), mSpeechdClient(nullptr) {} + +void SpeechDispatcherService::Init() { + if (!StaticPrefs::media_webspeech_synth_enabled() || + Preferences::GetBool("media.webspeech.synth.test")) { + return; + } + + // While speech dispatcher has a "threaded" mode, only spd_say() is async. + // Since synchronous socket i/o could impact startup time, we do + // initialization in a separate thread. 
+ DebugOnly rv = + NS_NewNamedThread("speechd init", getter_AddRefs(mInitThread)); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + rv = mInitThread->Dispatch( + NewRunnableMethod("dom::SpeechDispatcherService::Setup", this, + &SpeechDispatcherService::Setup), + NS_DISPATCH_NORMAL); + MOZ_ASSERT(NS_SUCCEEDED(rv)); +} + +SpeechDispatcherService::~SpeechDispatcherService() { + if (mInitThread) { + mInitThread->Shutdown(); + } + + if (mSpeechdClient) { + spd_close(mSpeechdClient); + } +} + +void SpeechDispatcherService::Setup() { +#define FUNC(name, type, params) {#name, (nsSpeechDispatcherFunc*)&_##name}, + static const nsSpeechDispatcherDynamicFunction kSpeechDispatcherSymbols[] = { + SPEECHD_FUNCTIONS}; +#undef FUNC + + MOZ_ASSERT(!mInitialized); + + speechdLib = PR_LoadLibrary("libspeechd.so.2"); + + if (!speechdLib) { + NS_WARNING("Failed to load speechd library"); + return; + } + + if (!PR_FindFunctionSymbol(speechdLib, "spd_get_volume")) { + // There is no version getter function, so we rely on a symbol that was + // introduced in release 0.8.2 in order to check for ABI compatibility. 
+ NS_WARNING("Unsupported version of speechd detected"); + return; + } + + for (uint32_t i = 0; i < ArrayLength(kSpeechDispatcherSymbols); i++) { + *kSpeechDispatcherSymbols[i].function = PR_FindFunctionSymbol( + speechdLib, kSpeechDispatcherSymbols[i].functionName); + + if (!*kSpeechDispatcherSymbols[i].function) { + NS_WARNING(nsPrintfCString("Failed to find speechd symbol for'%s'", + kSpeechDispatcherSymbols[i].functionName) + .get()); + return; + } + } + + mSpeechdClient = + spd_open("firefox", "web speech api", "who", SPD_MODE_THREADED); + if (!mSpeechdClient) { + NS_WARNING("Failed to call spd_open"); + return; + } + + // Get all the voices from speech-dispatcher; RegisterVoices() later adds + // them to the SynthVoiceRegistry. + SPDVoice** list = spd_list_synthesis_voices(mSpeechdClient); + + mSpeechdClient->callback_begin = speechd_cb; + mSpeechdClient->callback_end = speechd_cb; + mSpeechdClient->callback_cancel = speechd_cb; + mSpeechdClient->callback_pause = speechd_cb; + mSpeechdClient->callback_resume = speechd_cb; + + spd_set_notification_on(mSpeechdClient, SPD_BEGIN); + spd_set_notification_on(mSpeechdClient, SPD_END); + spd_set_notification_on(mSpeechdClient, SPD_CANCEL); + + if (list != NULL) { + for (int i = 0; list[i]; i++) { + nsAutoString uri; + + uri.AssignLiteral(URI_PREFIX); + nsAutoCString name; + NS_EscapeURL(list[i]->name, -1, + esc_OnlyNonASCII | esc_Spaces | esc_AlwaysCopy, name); + uri.Append(NS_ConvertUTF8toUTF16(name)); + + uri.AppendLiteral("?"); + + nsAutoCString lang(list[i]->language); + + uri.Append(NS_ConvertUTF8toUTF16(lang)); + + mVoices.InsertOrUpdate(uri, MakeRefPtr( + NS_ConvertUTF8toUTF16(list[i]->name), + NS_ConvertUTF8toUTF16(lang))); + } + } + + NS_DispatchToMainThread( + NewRunnableMethod("dom::SpeechDispatcherService::RegisterVoices", this, + &SpeechDispatcherService::RegisterVoices)); + + // mInitialized is not set here; RegisterVoices() sets it once it has run. +} + +// private methods + +void SpeechDispatcherService::RegisterVoices() { + RefPtr registry = nsSynthVoiceRegistry::GetInstance(); + for 
(const auto& entry : mVoices) { + const RefPtr& voice = entry.GetData(); + + // This service can only speak one utterance at a time, so we set + // aQueuesUtterances to true in order to track global state and schedule + // access to this service. + DebugOnly rv = + registry->AddVoice(this, entry.GetKey(), voice->mName, voice->mLanguage, + voice->mName.EqualsLiteral("default"), true); + + NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Failed to add voice"); + } + + mInitThread->Shutdown(); + mInitThread = nullptr; + + mInitialized = true; + + registry->NotifyVoicesChanged(); +} + +// nsIObserver + +NS_IMETHODIMP +SpeechDispatcherService::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + return NS_OK; +} + +// nsISpeechService + +// TODO: Support SSML +NS_IMETHODIMP +SpeechDispatcherService::Speak(const nsAString& aText, const nsAString& aUri, + float aVolume, float aRate, float aPitch, + nsISpeechTask* aTask) { + if (NS_WARN_IF(!mInitialized)) { + return NS_ERROR_NOT_AVAILABLE; + } + + RefPtr callback = + new SpeechDispatcherCallback(aTask, this); + + bool found = false; + SpeechDispatcherVoice* voice = mVoices.GetWeak(aUri, &found); + + if (NS_WARN_IF(!(found))) { + return NS_ERROR_NOT_AVAILABLE; + } + + spd_set_synthesis_voice(mSpeechdClient, + NS_ConvertUTF16toUTF8(voice->mName).get()); + + // We provide a volume of 0.0 to 1.0, speech-dispatcher expects 0 - 100. + spd_set_volume(mSpeechdClient, static_cast(aVolume * 100)); + + // aRate is a value of 0.1 (0.1x) to 10 (10x) with 1 (1x) being normal rate. + // speechd expects -100 to 100 with 0 being normal rate. + float rate = 0; + if (aRate > 1) { + // Each step to 100 is logarithmically distributed up to 2.5x. + rate = log10(std::min(aRate, MAX_RATE)) / log10(MAX_RATE) * 100; + } else if (aRate < 1) { + // Each step to -100 is logarithmically distributed down to 0.5x. 
+ rate = log10(std::max(aRate, MIN_RATE)) / log10(MIN_RATE) * -100; + } + + spd_set_voice_rate(mSpeechdClient, static_cast(rate)); + + // We provide a pitch of 0 to 2 with 1 being the default. + // speech-dispatcher expects -100 to 100 with 0 being default. + spd_set_voice_pitch(mSpeechdClient, static_cast((aPitch - 1) * 100)); + + nsresult rv = aTask->Setup(callback); + + if (NS_FAILED(rv)) { + return rv; + } + + if (aText.Length()) { + int msg_id = spd_say(mSpeechdClient, SPD_MESSAGE, + NS_ConvertUTF16toUTF8(aText).get()); + + if (msg_id < 0) { + return NS_ERROR_FAILURE; + } + + mCallbacks.InsertOrUpdate(msg_id, std::move(callback)); + } else { + // Speech dispatcher does not work well with empty strings. + // In that case, don't send empty string to speechd, + // and just emulate a speechd start and end event. + NS_DispatchToMainThread(NewRunnableMethod( + "dom::SpeechDispatcherCallback::OnSpeechEvent", callback, + &SpeechDispatcherCallback::OnSpeechEvent, SPD_EVENT_BEGIN)); + + NS_DispatchToMainThread(NewRunnableMethod( + "dom::SpeechDispatcherCallback::OnSpeechEvent", callback, + &SpeechDispatcherCallback::OnSpeechEvent, SPD_EVENT_END)); + } + + return NS_OK; +} + +SpeechDispatcherService* SpeechDispatcherService::GetInstance(bool create) { + if (XRE_GetProcessType() != GeckoProcessType_Default) { + MOZ_ASSERT( + false, + "SpeechDispatcherService can only be started on main gecko process"); + return nullptr; + } + + if (!sSingleton && create) { + sSingleton = new SpeechDispatcherService(); + sSingleton->Init(); + ClearOnShutdown(&sSingleton); + } + + return sSingleton; +} + +already_AddRefed +SpeechDispatcherService::GetInstanceForService() { + MOZ_ASSERT(NS_IsMainThread()); + RefPtr sapiService = GetInstance(); + return sapiService.forget(); +} + +void SpeechDispatcherService::EventNotify(uint32_t aMsgId, uint32_t aState) { + SpeechDispatcherCallback* callback = mCallbacks.GetWeak(aMsgId); + + if (callback) { + if 
(callback->OnSpeechEvent((SPDNotificationType)aState)) { + mCallbacks.Remove(aMsgId); + } + } +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/speechd/SpeechDispatcherService.h b/dom/media/webspeech/synth/speechd/SpeechDispatcherService.h new file mode 100644 index 0000000000..2922053c80 --- /dev/null +++ b/dom/media/webspeech/synth/speechd/SpeechDispatcherService.h @@ -0,0 +1,65 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_dom_SpeechDispatcherService_h +#define mozilla_dom_SpeechDispatcherService_h + +#include "mozilla/StaticPtr.h" +#include "nsIObserver.h" +#include "nsISpeechService.h" +#include "nsIThread.h" +#include "nsRefPtrHashtable.h" +#include "nsTArray.h" + +struct SPDConnection; + +namespace mozilla { +namespace dom { + +class SpeechDispatcherCallback; +class SpeechDispatcherVoice; + +class SpeechDispatcherService final : public nsIObserver, + public nsISpeechService { + friend class SpeechDispatcherCallback; + + public: + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSIOBSERVER + NS_DECL_NSISPEECHSERVICE + + SpeechDispatcherService(); + + void Init(); + + void Setup(); + + void EventNotify(uint32_t aMsgId, uint32_t aState); + + static SpeechDispatcherService* GetInstance(bool create = true); + static already_AddRefed GetInstanceForService(); + + static StaticRefPtr sSingleton; + + private: + virtual ~SpeechDispatcherService(); + + void RegisterVoices(); + + bool mInitialized; + + SPDConnection* mSpeechdClient; + + nsRefPtrHashtable mCallbacks; + + nsCOMPtr mInitThread; + + nsRefPtrHashtable mVoices; +}; + +} // namespace dom +} // namespace mozilla +#endif diff --git a/dom/media/webspeech/synth/speechd/components.conf 
b/dom/media/webspeech/synth/speechd/components.conf new file mode 100644 index 0000000000..56b01ba5cb --- /dev/null +++ b/dom/media/webspeech/synth/speechd/components.conf @@ -0,0 +1,17 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +Classes = [ + { + 'cid': '{8817b1cf-5ada-43bf-bd73-607657703d0d}', + 'contract_ids': ['@mozilla.org/synthspeechdispatcher;1'], + 'singleton': True, + 'type': 'mozilla::dom::SpeechDispatcherService', + 'headers': ['/dom/media/webspeech/synth/speechd/SpeechDispatcherService.h'], + 'constructor': 'mozilla::dom::SpeechDispatcherService::GetInstanceForService', + 'categories': {"speech-synth-started": 'SpeechDispatcher Speech Synth'}, + }, +] diff --git a/dom/media/webspeech/synth/speechd/moz.build b/dom/media/webspeech/synth/speechd/moz.build new file mode 100644 index 0000000000..0d9632a488 --- /dev/null +++ b/dom/media/webspeech/synth/speechd/moz.build @@ -0,0 +1,15 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +UNIFIED_SOURCES += ["SpeechDispatcherService.cpp"] + +XPCOM_MANIFESTS += [ + "components.conf", +] + +include("/ipc/chromium/chromium-config.mozbuild") + +FINAL_LIBRARY = "xul" diff --git a/dom/media/webspeech/synth/test/common.js b/dom/media/webspeech/synth/test/common.js new file mode 100644 index 0000000000..c22b0b488c --- /dev/null +++ b/dom/media/webspeech/synth/test/common.js @@ -0,0 +1,104 @@ +function synthTestQueue(aTestArgs, aEndFunc) { + var utterances = []; + for (var i in aTestArgs) { + var uargs = aTestArgs[i][0]; + var win = uargs.win || window; + var u = new win.SpeechSynthesisUtterance(uargs.text); + + if (uargs.args) { + for (var attr in uargs.args) { + u[attr] = uargs.args[attr]; + } + } + + function onend_handler(e) { + is(e.target, utterances.shift(), "Target matches utterances"); + ok(!speechSynthesis.speaking, "speechSynthesis is not speaking."); + + if (utterances.length) { + ok(speechSynthesis.pending, "other utterances queued"); + } else { + ok(!speechSynthesis.pending, "queue is empty, nothing pending."); + if (aEndFunc) { + aEndFunc(); + } + } + } + + u.addEventListener( + "start", + (function (expectedUri) { + return function (e) { + if (expectedUri) { + var chosenVoice = SpecialPowers.wrap(e).target.chosenVoiceURI; + is(chosenVoice, expectedUri, "Incorrect URI is used"); + } + }; + })(aTestArgs[i][1] ? aTestArgs[i][1].uri : null) + ); + + u.addEventListener("end", onend_handler); + u.addEventListener("error", onend_handler); + + u.addEventListener( + "error", + (function (expectedError) { + return function onerror_handler(e) { + ok( + expectedError, + "Error in speech utterance '" + e.target.text + "'" + ); + }; + })(aTestArgs[i][1] ? 
aTestArgs[i][1].err : false) + ); + + utterances.push(u); + win.speechSynthesis.speak(u); + } + + ok(!speechSynthesis.speaking, "speechSynthesis is not speaking yet."); + ok(speechSynthesis.pending, "speechSynthesis has an utterance queued."); +} + +function loadFrame(frameId) { + return new Promise(function (resolve, reject) { + var frame = document.getElementById(frameId); + frame.addEventListener("load", function (e) { + frame.contentWindow.document.title = frameId; + resolve(frame); + }); + frame.src = "about:blank"; + }); +} + +function waitForVoices(win) { + return new Promise(resolve => { + function resolver() { + if (win.speechSynthesis.getVoices().length) { + win.speechSynthesis.removeEventListener("voiceschanged", resolver); + resolve(); + } + } + + win.speechSynthesis.addEventListener("voiceschanged", resolver); + resolver(); + }); +} + +function loadSpeechTest(fileName, prefs, frameId = "testFrame") { + loadFrame(frameId).then(frame => { + waitForVoices(frame.contentWindow).then( + () => (document.getElementById("testFrame").src = fileName) + ); + }); +} + +function testSynthState(win, expectedState) { + for (var attr in expectedState) { + is( + win.speechSynthesis[attr], + expectedState[attr], + win.document.title + ": '" + attr + '" does not match' + ); + } +} diff --git a/dom/media/webspeech/synth/test/components.conf b/dom/media/webspeech/synth/test/components.conf new file mode 100644 index 0000000000..f37e4eafae --- /dev/null +++ b/dom/media/webspeech/synth/test/components.conf @@ -0,0 +1,17 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +Classes = [ + { + 'cid': '{e7d52d9e-c148-47d8-ab2a-95d7f40ea53d}', + 'contract_ids': ['@mozilla.org/fakesynth;1'], + 'singleton': True, + 'type': 'mozilla::dom::nsFakeSynthServices', + 'headers': ['/dom/media/webspeech/synth/test/nsFakeSynthServices.h'], + 'constructor': 'mozilla::dom::nsFakeSynthServices::GetInstanceForService', + 'categories': {'speech-synth-started': 'Fake Speech Synth'}, + }, +] diff --git a/dom/media/webspeech/synth/test/file_bfcache_page1.html b/dom/media/webspeech/synth/test/file_bfcache_page1.html new file mode 100644 index 0000000000..d6229eeeda --- /dev/null +++ b/dom/media/webspeech/synth/test/file_bfcache_page1.html @@ -0,0 +1,18 @@ + + + + + + + + + diff --git a/dom/media/webspeech/synth/test/file_bfcache_page2.html b/dom/media/webspeech/synth/test/file_bfcache_page2.html new file mode 100644 index 0000000000..30b9aa9117 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_bfcache_page2.html @@ -0,0 +1,14 @@ + + + + + diff --git a/dom/media/webspeech/synth/test/file_global_queue.html b/dom/media/webspeech/synth/test/file_global_queue.html new file mode 100644 index 0000000000..5d762c0d51 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_global_queue.html @@ -0,0 +1,69 @@ + + + + + + Test for Bug 1188099: Global queue should correctly schedule utterances + + + + +Mozilla Bug 1188099 + + + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_global_queue_cancel.html b/dom/media/webspeech/synth/test/file_global_queue_cancel.html new file mode 100644 index 0000000000..03b77ba2fc --- /dev/null +++ b/dom/media/webspeech/synth/test/file_global_queue_cancel.html @@ -0,0 +1,88 @@ + + + + + + Test for Bug 1188099: Calling cancel() should work correctly with global queue + + + + +Mozilla Bug 1188099 + + + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_global_queue_pause.html b/dom/media/webspeech/synth/test/file_global_queue_pause.html new file mode 100644 index 0000000000..e345eb4c98 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_global_queue_pause.html @@ -0,0 +1,130 @@ + + + + + + Test for Bug 1188099: Calling pause() should work correctly with global queue + + + + +Mozilla Bug 1188099 + + + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_indirect_service_events.html b/dom/media/webspeech/synth/test/file_indirect_service_events.html new file mode 100644 index 0000000000..5ed7812757 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_indirect_service_events.html @@ -0,0 +1,102 @@ + + + + + + Test for Bug 1155034: Check that indirect audio services dispatch their own events + + + + +Mozilla Bug 1155034 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_setup.html b/dom/media/webspeech/synth/test/file_setup.html new file mode 100644 index 0000000000..da8c2c6824 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_setup.html @@ -0,0 +1,96 @@ + + + + + + Test for Bug 525444: Web Speech API check all classes are present + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_speech_cancel.html b/dom/media/webspeech/synth/test/file_speech_cancel.html new file mode 100644 index 0000000000..2ab0e1d0a8 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_speech_cancel.html @@ -0,0 +1,100 @@ + + + + + + Test for Bug 1150315: Check that successive cancel/speak calls work + + + + +Mozilla Bug 1150315 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_speech_error.html b/dom/media/webspeech/synth/test/file_speech_error.html new file mode 100644 index 0000000000..b98ec2fac0 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_speech_error.html @@ -0,0 +1,46 @@ + + + + + + Test for Bug 1226015 + + + + +Mozilla Bug 1226015 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_speech_queue.html b/dom/media/webspeech/synth/test/file_speech_queue.html new file mode 100644 index 0000000000..a471034dcf --- /dev/null +++ b/dom/media/webspeech/synth/test/file_speech_queue.html @@ -0,0 +1,86 @@ + + + + + + Test for Bug 525444: Web Speech API, check speech synth queue + + + + +Mozilla Bug 525444 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/file_speech_repeating_utterance.html b/dom/media/webspeech/synth/test/file_speech_repeating_utterance.html new file mode 100644 index 0000000000..6e37653057 --- /dev/null +++ b/dom/media/webspeech/synth/test/file_speech_repeating_utterance.html @@ -0,0 +1,26 @@ + + + + + Test for Bug 1305344: Utterance not repeating in Firefox + + + + + + + diff --git a/dom/media/webspeech/synth/test/file_speech_simple.html b/dom/media/webspeech/synth/test/file_speech_simple.html new file mode 100644 index 0000000000..c3f240ccdc --- /dev/null +++ b/dom/media/webspeech/synth/test/file_speech_simple.html @@ -0,0 +1,53 @@ + + + + + + Test for Bug 650295: Web Speech API check all classes are present + + + + +Mozilla Bug 650295 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/mochitest.ini b/dom/media/webspeech/synth/test/mochitest.ini new file mode 100644 index 0000000000..2f188dac67 --- /dev/null +++ b/dom/media/webspeech/synth/test/mochitest.ini @@ -0,0 +1,29 @@ +[DEFAULT] +tags=mtg +subsuite = media +support-files = + common.js + file_bfcache_page1.html + file_bfcache_page2.html + file_setup.html + file_speech_queue.html + file_speech_simple.html + file_speech_cancel.html + file_speech_error.html + file_indirect_service_events.html + file_global_queue.html + file_global_queue_cancel.html + file_global_queue_pause.html + file_speech_repeating_utterance.html + +[test_setup.html] +[test_speech_queue.html] +[test_speech_simple.html] +[test_speech_cancel.html] +[test_speech_error.html] +[test_indirect_service_events.html] +[test_global_queue.html] +[test_global_queue_cancel.html] +[test_global_queue_pause.html] +[test_bfcache.html] +[test_speech_repeating_utterance.html] diff --git a/dom/media/webspeech/synth/test/nsFakeSynthServices.cpp b/dom/media/webspeech/synth/test/nsFakeSynthServices.cpp new file mode 100644 index 0000000000..075e8aa878 --- /dev/null +++ b/dom/media/webspeech/synth/test/nsFakeSynthServices.cpp @@ -0,0 +1,288 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsISupports.h" +#include "nsFakeSynthServices.h" +#include "nsPrintfCString.h" +#include "SharedBuffer.h" + +#include "mozilla/ClearOnShutdown.h" +#include "mozilla/dom/nsSynthVoiceRegistry.h" +#include "mozilla/dom/nsSpeechTask.h" + +#include "nsThreadUtils.h" +#include "nsXULAppAPI.h" +#include "prenv.h" +#include "mozilla/Preferences.h" +#include "mozilla/DebugOnly.h" + +#define CHANNELS 1 +#define SAMPLERATE 1600 + +namespace mozilla::dom { + +StaticRefPtr<nsFakeSynthServices> nsFakeSynthServices::sSingleton; + +enum VoiceFlags { + eSuppressEvents = 1, + eSuppressEnd = 2, + eFailAtStart = 4, + eFail = 8 +}; + +struct VoiceDetails { + const char* uri; + const char* name; + const char* lang; + bool defaultVoice; + uint32_t flags; +}; + +static const VoiceDetails sVoices[] = { + {"urn:moz-tts:fake:bob", "Bob Marley", "en-JM", true, 0}, + {"urn:moz-tts:fake:amy", "Amy Winehouse", "en-GB", false, 0}, + {"urn:moz-tts:fake:lenny", "Leonard Cohen", "en-CA", false, 0}, + {"urn:moz-tts:fake:celine", "Celine Dion", "fr-CA", false, 0}, + { + "urn:moz-tts:fake:julie", + "Julieta Venegas", + "es-MX", + false, + }, + {"urn:moz-tts:fake:zanetta", "Zanetta Farussi", "it-IT", false, 0}, + {"urn:moz-tts:fake:margherita", "Margherita Durastanti", + "it-IT-noevents-noend", false, eSuppressEvents | eSuppressEnd}, + {"urn:moz-tts:fake:teresa", "Teresa Cornelys", "it-IT-noend", false, + eSuppressEnd}, + {"urn:moz-tts:fake:cecilia", "Cecilia Bartoli", "it-IT-failatstart", false, + eFailAtStart}, + {"urn:moz-tts:fake:gottardo", "Gottardo Aldighieri", "it-IT-fail", false, + eFail}, +}; + +// FakeSynthCallback +class FakeSynthCallback : public nsISpeechTaskCallback { + public: + explicit FakeSynthCallback(nsISpeechTask* aTask) : mTask(aTask) {} + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(FakeSynthCallback, + nsISpeechTaskCallback) + + NS_IMETHOD OnPause() override { + if (mTask) { + mTask->DispatchPause(1.5, 1); + } + + return NS_OK; + } + + NS_IMETHOD
OnResume() override { + if (mTask) { + mTask->DispatchResume(1.5, 1); + } + + return NS_OK; + } + + NS_IMETHOD OnCancel() override { + if (mTask) { + mTask->DispatchEnd(1.5, 1); + } + + return NS_OK; + } + + NS_IMETHOD OnVolumeChanged(float aVolume) override { return NS_OK; } + + private: + virtual ~FakeSynthCallback() = default; + + nsCOMPtr<nsISpeechTask> mTask; +}; + +NS_IMPL_CYCLE_COLLECTION(FakeSynthCallback, mTask); + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(FakeSynthCallback) + NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(FakeSynthCallback) +NS_IMPL_CYCLE_COLLECTING_RELEASE(FakeSynthCallback) + +// FakeSpeechSynth + +class FakeSpeechSynth : public nsISpeechService { + public: + FakeSpeechSynth() = default; + + NS_DECL_ISUPPORTS + NS_DECL_NSISPEECHSERVICE + + private: + virtual ~FakeSpeechSynth() = default; +}; + +NS_IMPL_ISUPPORTS(FakeSpeechSynth, nsISpeechService) + +NS_IMETHODIMP +FakeSpeechSynth::Speak(const nsAString& aText, const nsAString& aUri, + float aVolume, float aRate, float aPitch, + nsISpeechTask* aTask) { + class DispatchStart final : public Runnable { + public: + explicit DispatchStart(nsISpeechTask* aTask) + : mozilla::Runnable("DispatchStart"), mTask(aTask) {} + + NS_IMETHOD Run() override { + mTask->DispatchStart(); + + return NS_OK; + } + + private: + nsCOMPtr<nsISpeechTask> mTask; + }; + + class DispatchEnd final : public Runnable { + public: + DispatchEnd(nsISpeechTask* aTask, const nsAString& aText) + : mozilla::Runnable("DispatchEnd"), mTask(aTask), mText(aText) {} + + NS_IMETHOD Run() override { + mTask->DispatchEnd(mText.Length() / 2, mText.Length()); + + return NS_OK; + } + + private: + nsCOMPtr<nsISpeechTask> mTask; + nsString mText; + }; + + class DispatchError final : public Runnable { + public: + DispatchError(nsISpeechTask* aTask, const nsAString& aText) + : mozilla::Runnable("DispatchError"), mTask(aTask), mText(aText) {} + + NS_IMETHOD Run()
override { + mTask->DispatchError(mText.Length() / 2, mText.Length()); + + return NS_OK; + } + + private: + nsCOMPtr<nsISpeechTask> mTask; + nsString mText; + }; + + uint32_t flags = 0; + for (VoiceDetails voice : sVoices) { + if (aUri.EqualsASCII(voice.uri)) { + flags = voice.flags; + break; + } + } + + if (flags & eFailAtStart) { + return NS_ERROR_FAILURE; + } + + RefPtr<FakeSynthCallback> cb = + new FakeSynthCallback((flags & eSuppressEvents) ? nullptr : aTask); + + aTask->Setup(cb); + + nsCOMPtr<nsIRunnable> runnable = new DispatchStart(aTask); + NS_DispatchToMainThread(runnable); + + if (flags & eFail) { + runnable = new DispatchError(aTask, aText); + NS_DispatchToMainThread(runnable); + } else if ((flags & eSuppressEnd) == 0) { + runnable = new DispatchEnd(aTask, aText); + NS_DispatchToMainThread(runnable); + } + + return NS_OK; +} + +// nsFakeSynthService + +NS_INTERFACE_MAP_BEGIN(nsFakeSynthServices) + NS_INTERFACE_MAP_ENTRY(nsIObserver) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIObserver) +NS_INTERFACE_MAP_END + +NS_IMPL_ADDREF(nsFakeSynthServices) +NS_IMPL_RELEASE(nsFakeSynthServices) + +static void AddVoices(nsISpeechService* aService, const VoiceDetails* aVoices, + uint32_t aLength) { + RefPtr<nsSynthVoiceRegistry> registry = nsSynthVoiceRegistry::GetInstance(); + for (uint32_t i = 0; i < aLength; i++) { + NS_ConvertUTF8toUTF16 name(aVoices[i].name); + NS_ConvertUTF8toUTF16 uri(aVoices[i].uri); + NS_ConvertUTF8toUTF16 lang(aVoices[i].lang); + // These services can handle more than one utterance at a time and have + // several speaking simultaneously.
So, aQueuesUtterances == false + registry->AddVoice(aService, uri, name, lang, true, false); + if (aVoices[i].defaultVoice) { + registry->SetDefaultVoice(uri, true); + } + } + + registry->NotifyVoicesChanged(); +} + +void nsFakeSynthServices::Init() { + mSynthService = new FakeSpeechSynth(); + AddVoices(mSynthService, sVoices, ArrayLength(sVoices)); +} + +// nsIObserver + +NS_IMETHODIMP +nsFakeSynthServices::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(NS_IsMainThread()); + if (NS_WARN_IF(!(!strcmp(aTopic, "speech-synth-started")))) { + return NS_ERROR_UNEXPECTED; + } + + if (Preferences::GetBool("media.webspeech.synth.test")) { + NS_DispatchToMainThread(NewRunnableMethod( + "dom::nsFakeSynthServices::Init", this, &nsFakeSynthServices::Init)); + } + + return NS_OK; +} + +// static methods + +nsFakeSynthServices* nsFakeSynthServices::GetInstance() { + MOZ_ASSERT(NS_IsMainThread()); + if (!XRE_IsParentProcess()) { + MOZ_ASSERT(false, + "nsFakeSynthServices can only be started on main gecko process"); + return nullptr; + } + + if (!sSingleton) { + sSingleton = new nsFakeSynthServices(); + ClearOnShutdown(&sSingleton); + } + + return sSingleton; +} + +already_AddRefed<nsFakeSynthServices> +nsFakeSynthServices::GetInstanceForService() { + RefPtr<nsFakeSynthServices> picoService = GetInstance(); + return picoService.forget(); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/test/nsFakeSynthServices.h b/dom/media/webspeech/synth/test/nsFakeSynthServices.h new file mode 100644 index 0000000000..f7e1ca7da6 --- /dev/null +++ b/dom/media/webspeech/synth/test/nsFakeSynthServices.h @@ -0,0 +1,42 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#ifndef nsFakeSynthServices_h +#define nsFakeSynthServices_h + +#include "nsTArray.h" +#include "nsIObserver.h" +#include "nsISpeechService.h" +#include "nsRefPtrHashtable.h" +#include "mozilla/StaticPtr.h" +#include "mozilla/Monitor.h" + +namespace mozilla::dom { + +class nsFakeSynthServices : public nsIObserver { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIOBSERVER + + nsFakeSynthServices() = default; + + static nsFakeSynthServices* GetInstance(); + + static already_AddRefed<nsFakeSynthServices> GetInstanceForService(); + + private: + virtual ~nsFakeSynthServices() = default; + + void Init(); + + nsCOMPtr<nsISpeechService> mSynthService; + + static StaticRefPtr<nsFakeSynthServices> sSingleton; +}; + +} // namespace mozilla::dom + +#endif diff --git a/dom/media/webspeech/synth/test/startup/file_voiceschanged.html b/dom/media/webspeech/synth/test/startup/file_voiceschanged.html new file mode 100644 index 0000000000..6bb25462e4 --- /dev/null +++ b/dom/media/webspeech/synth/test/startup/file_voiceschanged.html @@ -0,0 +1,32 @@ + + + + + + Test for Bug 1254378: Web Speech API check all classes are present + + + + + + diff --git a/dom/media/webspeech/synth/test/startup/mochitest.ini b/dom/media/webspeech/synth/test/startup/mochitest.ini new file mode 100644 index 0000000000..ec4285b772 --- /dev/null +++ b/dom/media/webspeech/synth/test/startup/mochitest.ini @@ -0,0 +1,8 @@ +[DEFAULT] +tags=mtg +subsuite = media +support-files = + file_voiceschanged.html + +[test_voiceschanged.html] +skip-if = verify diff --git a/dom/media/webspeech/synth/test/startup/test_voiceschanged.html b/dom/media/webspeech/synth/test/startup/test_voiceschanged.html new file mode 100644 index 0000000000..a60252ea7e --- /dev/null +++ b/dom/media/webspeech/synth/test/startup/test_voiceschanged.html @@ -0,0 +1,32 @@ + + + + + + Test for Bug 1254378: Emit onvoiceschanged when voices first added + + + + +Mozilla Bug 1254378 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_bfcache.html b/dom/media/webspeech/synth/test/test_bfcache.html new file mode 100644 index 0000000000..ba5981a42b --- /dev/null +++ b/dom/media/webspeech/synth/test/test_bfcache.html @@ -0,0 +1,46 @@ + + + + + + Test for Bug 1230533: Test speech is stopped from a window when unloaded + + + + + +Mozilla Bug 1230533 +

+ +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_global_queue.html b/dom/media/webspeech/synth/test/test_global_queue.html new file mode 100644 index 0000000000..177f79b399 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_global_queue.html @@ -0,0 +1,35 @@ + + + + + + Test for Bug 1188099: Global queue should correctly schedule utterances + + + + + +Mozilla Bug 1188099 +

+ + +
+
+
+ + \ No newline at end of file diff --git a/dom/media/webspeech/synth/test/test_global_queue_cancel.html b/dom/media/webspeech/synth/test/test_global_queue_cancel.html new file mode 100644 index 0000000000..748d1367b5 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_global_queue_cancel.html @@ -0,0 +1,35 @@ + + + + + + Test for Bug 1188099: Calling cancel() should work correctly with global queue + + + + + +Mozilla Bug 1188099 +

+ + +
+
+
+ + \ No newline at end of file diff --git a/dom/media/webspeech/synth/test/test_global_queue_pause.html b/dom/media/webspeech/synth/test/test_global_queue_pause.html new file mode 100644 index 0000000000..9632d85127 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_global_queue_pause.html @@ -0,0 +1,35 @@ + + + + + + Test for Bug 1188099: Calling pause() should work correctly with global queue + + + + + +Mozilla Bug 1188099 +

+ + +
+
+
+ + \ No newline at end of file diff --git a/dom/media/webspeech/synth/test/test_indirect_service_events.html b/dom/media/webspeech/synth/test/test_indirect_service_events.html new file mode 100644 index 0000000000..e5b32e70f0 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_indirect_service_events.html @@ -0,0 +1,36 @@ + + + + + + Test for Bug 1155034: Check that indirect audio services dispatch their own events + + + + + +Mozilla Bug 1155034 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_setup.html b/dom/media/webspeech/synth/test/test_setup.html new file mode 100644 index 0000000000..da07687750 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_setup.html @@ -0,0 +1,32 @@ + + + + + + Test for Bug 525444: Web Speech API check all classes are present + + + + +Mozilla Bug 650295 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_speech_cancel.html b/dom/media/webspeech/synth/test/test_speech_cancel.html new file mode 100644 index 0000000000..ced952c736 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_speech_cancel.html @@ -0,0 +1,35 @@ + + + + + + Test for Bug 1150315: Web Speech API check all classes are present + + + + + +Mozilla Bug 1150315 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_speech_error.html b/dom/media/webspeech/synth/test/test_speech_error.html new file mode 100644 index 0000000000..e2ce156dc6 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_speech_error.html @@ -0,0 +1,35 @@ + + + + + + Test for Bug 1150315: Web Speech API check all classes are present + + + + + +Mozilla Bug 1226015 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_speech_queue.html b/dom/media/webspeech/synth/test/test_speech_queue.html new file mode 100644 index 0000000000..3bca9e0ce2 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_speech_queue.html @@ -0,0 +1,37 @@ + + + + + + Test for Bug 525444: Web Speech API, check speech synth queue + + + + + +Mozilla Bug 525444 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/test/test_speech_repeating_utterance.html b/dom/media/webspeech/synth/test/test_speech_repeating_utterance.html new file mode 100644 index 0000000000..6313a275c1 --- /dev/null +++ b/dom/media/webspeech/synth/test/test_speech_repeating_utterance.html @@ -0,0 +1,18 @@ + + + + + Test for Bug 1305344: Utterance not repeating in Firefox + + + + + + Mozilla Bug 1305344 + + + + diff --git a/dom/media/webspeech/synth/test/test_speech_simple.html b/dom/media/webspeech/synth/test/test_speech_simple.html new file mode 100644 index 0000000000..c6c0e3a5be --- /dev/null +++ b/dom/media/webspeech/synth/test/test_speech_simple.html @@ -0,0 +1,34 @@ + + + + + + Test for Bug 650295: Web Speech API check all classes are present + + + + + +Mozilla Bug 650295 +

+ + +
+
+
+ + diff --git a/dom/media/webspeech/synth/windows/SapiService.cpp b/dom/media/webspeech/synth/windows/SapiService.cpp new file mode 100644 index 0000000000..f1e44213d1 --- /dev/null +++ b/dom/media/webspeech/synth/windows/SapiService.cpp @@ -0,0 +1,445 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.h" +#include "SapiService.h" +#include "nsServiceManagerUtils.h" +#include "nsEscape.h" +#include "nsXULAppAPI.h" + +#include "mozilla/ClearOnShutdown.h" +#include "mozilla/dom/nsSynthVoiceRegistry.h" +#include "mozilla/dom/nsSpeechTask.h" +#include "mozilla/Preferences.h" +#include "mozilla/ProfilerLabels.h" +#include "mozilla/StaticPrefs_media.h" + +namespace mozilla::dom { + +constexpr static WCHAR kSpCategoryOneCoreVoices[] = + L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech_OneCore\\Voices"; + +StaticRefPtr<SapiService> SapiService::sSingleton; + +class SapiCallback final : public nsISpeechTaskCallback { + public: + SapiCallback(nsISpeechTask* aTask, ISpVoice* aSapiClient, + uint32_t aTextOffset, uint32_t aSpeakTextLen) + : mTask(aTask), + mSapiClient(aSapiClient), + mTextOffset(aTextOffset), + mSpeakTextLen(aSpeakTextLen), + mCurrentIndex(0), + mStreamNum(0) { + mStartingTime = TimeStamp::Now(); + } + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SapiCallback, nsISpeechTaskCallback) + + NS_DECL_NSISPEECHTASKCALLBACK + + ULONG GetStreamNum() const { return mStreamNum; } + void SetStreamNum(ULONG aValue) { mStreamNum = aValue; } + + void OnSpeechEvent(const SPEVENT& speechEvent); + + private: + ~SapiCallback() {} + + float GetTimeDurationFromStart() const { + TimeDuration duration = TimeStamp::Now() - mStartingTime; + return
duration.ToSeconds(); + } + + // This pointer is used to dispatch events + nsCOMPtr<nsISpeechTask> mTask; + RefPtr<ISpVoice> mSapiClient; + + uint32_t mTextOffset; + uint32_t mSpeakTextLen; + + // Used for calculating the time taken to speak the utterance + TimeStamp mStartingTime; + uint32_t mCurrentIndex; + + ULONG mStreamNum; +}; + +NS_IMPL_CYCLE_COLLECTION(SapiCallback, mTask); + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SapiCallback) + NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(SapiCallback) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SapiCallback) + +NS_IMETHODIMP +SapiCallback::OnPause() { + if (FAILED(mSapiClient->Pause())) { + return NS_ERROR_FAILURE; + } + if (!mTask) { + // When calling pause() on child process, it may not receive end event + // from chrome process yet. + return NS_ERROR_FAILURE; + } + mTask->DispatchPause(GetTimeDurationFromStart(), mCurrentIndex); + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnResume() { + if (FAILED(mSapiClient->Resume())) { + return NS_ERROR_FAILURE; + } + if (!mTask) { + // When calling resume() on child process, it may not receive end event + // from chrome process yet. + return NS_ERROR_FAILURE; + } + mTask->DispatchResume(GetTimeDurationFromStart(), mCurrentIndex); + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnCancel() { + // After cancel, mCurrentIndex may be updated. + // At cancel case, use mCurrentIndex for DispatchEnd.
+ mSpeakTextLen = 0; + // Purge all the previous utterances and speak an empty string + if (FAILED(mSapiClient->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr))) { + return NS_ERROR_FAILURE; + } + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnVolumeChanged(float aVolume) { + mSapiClient->SetVolume(static_cast<USHORT>(aVolume * 100)); + return NS_OK; +} + +void SapiCallback::OnSpeechEvent(const SPEVENT& speechEvent) { + switch (speechEvent.eEventId) { + case SPEI_START_INPUT_STREAM: + mTask->DispatchStart(); + break; + case SPEI_END_INPUT_STREAM: + if (mSpeakTextLen) { + mCurrentIndex = mSpeakTextLen; + } + mTask->DispatchEnd(GetTimeDurationFromStart(), mCurrentIndex); + mTask = nullptr; + break; + case SPEI_TTS_BOOKMARK: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"mark"_ns, GetTimeDurationFromStart(), + mCurrentIndex, 0, 0); + break; + case SPEI_WORD_BOUNDARY: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"word"_ns, GetTimeDurationFromStart(), + mCurrentIndex, + static_cast<ULONG>(speechEvent.wParam), 1); + break; + case SPEI_SENTENCE_BOUNDARY: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"sentence"_ns, GetTimeDurationFromStart(), + mCurrentIndex, + static_cast<ULONG>(speechEvent.wParam), 1); + break; + default: + break; + } +} + +// static +void __stdcall SapiService::SpeechEventCallback(WPARAM aWParam, + LPARAM aLParam) { + RefPtr<ISpVoice> spVoice = (ISpVoice*)aWParam; + RefPtr<SapiService> service = (SapiService*)aLParam; + + SPEVENT speechEvent; + while (spVoice->GetEvents(1, &speechEvent, nullptr) == S_OK) { + for (size_t i = 0; i < service->mCallbacks.Length(); i++) { + RefPtr<SapiCallback> callback = service->mCallbacks[i]; + if (callback->GetStreamNum() == speechEvent.ulStreamNum) { + callback->OnSpeechEvent(speechEvent); + if (speechEvent.eEventId == SPEI_END_INPUT_STREAM) { + service->mCallbacks.RemoveElementAt(i); + } + break; + } + } + } +} +
+NS_INTERFACE_MAP_BEGIN(SapiService) + NS_INTERFACE_MAP_ENTRY(nsISpeechService) + NS_INTERFACE_MAP_ENTRY(nsIObserver) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechService) +NS_INTERFACE_MAP_END + +NS_IMPL_ADDREF(SapiService) +NS_IMPL_RELEASE(SapiService) + +SapiService::SapiService() : mInitialized(false) {} + +SapiService::~SapiService() {} + +bool SapiService::Init() { + AUTO_PROFILER_LABEL("SapiService::Init", OTHER); + + MOZ_ASSERT(!mInitialized); + + if (Preferences::GetBool("media.webspeech.synth.test") || + !StaticPrefs::media_webspeech_synth_enabled()) { + // When enabled, we shouldn't add OS backend (Bug 1160844) + return false; + } + + // Get all the voices from sapi and register in the SynthVoiceRegistry + if (!RegisterVoices()) { + return false; + } + + mInitialized = true; + return true; +} + +already_AddRefed<ISpVoice> SapiService::InitSapiInstance() { + RefPtr<ISpVoice> spVoice; + if (FAILED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, + getter_AddRefs(spVoice)))) { + return nullptr; + } + + // Set interest for all the events we are interested in + ULONGLONG eventMask = SPFEI(SPEI_START_INPUT_STREAM) | + SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | + SPFEI(SPEI_SENTENCE_BOUNDARY) | + SPFEI(SPEI_END_INPUT_STREAM); + + if (FAILED(spVoice->SetInterest(eventMask, eventMask))) { + return nullptr; + } + + // Set the callback function for receiving the events + spVoice->SetNotifyCallbackFunction( + (SPNOTIFYCALLBACK*)SapiService::SpeechEventCallback, + (WPARAM)spVoice.get(), (LPARAM)this); + + return spVoice.forget(); +} + +bool SapiService::RegisterVoices() { + nsCOMPtr<nsISynthVoiceRegistry> registry = + do_GetService(NS_SYNTHVOICEREGISTRY_CONTRACTID); + if (!registry) { + return false; + } + bool result = RegisterVoices(registry, kSpCategoryOneCoreVoices); + result |= RegisterVoices(registry, SPCAT_VOICES); + if (result) { + registry->NotifyVoicesChanged(); + } + return result; +} + +bool SapiService::RegisterVoices(nsCOMPtr<nsISynthVoiceRegistry>& registry, + const WCHAR*
categoryId) { + nsresult rv; + + RefPtr<ISpObjectTokenCategory> category; + if (FAILED(CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, + IID_ISpObjectTokenCategory, + getter_AddRefs(category)))) { + return false; + } + if (FAILED(category->SetId(categoryId, FALSE))) { + return false; + } + + RefPtr<IEnumSpObjectTokens> voiceTokens; + if (FAILED(category->EnumTokens(nullptr, nullptr, + getter_AddRefs(voiceTokens)))) { + return false; + } + + WCHAR locale[LOCALE_NAME_MAX_LENGTH]; + while (true) { + RefPtr<ISpObjectToken> voiceToken; + if (voiceTokens->Next(1, getter_AddRefs(voiceToken), nullptr) != S_OK) { + break; + } + + RefPtr<ISpDataKey> attributes; + if (FAILED( + voiceToken->OpenKey(L"Attributes", getter_AddRefs(attributes)))) { + continue; + } + + WCHAR* language = nullptr; + if (FAILED(attributes->GetStringValue(L"Language", &language))) { + continue; + } + + // Language attribute is LCID by hex. So we need convert to locale + // name. + nsAutoString hexLcid; + LCID lcid = wcstol(language, nullptr, 16); + CoTaskMemFree(language); + if (NS_WARN_IF( + !LCIDToLocaleName(lcid, locale, LOCALE_NAME_MAX_LENGTH, 0))) { + continue; + } + + WCHAR* description = nullptr; + if (FAILED(voiceToken->GetStringValue(nullptr, &description))) { + continue; + } + + nsAutoString uri; + uri.AssignLiteral("urn:moz-tts:sapi:"); + uri.Append(description); + uri.AppendLiteral("?"); + uri.Append(locale); + + // This service can only speak one utterance at a time, so we set + // aQueuesUtterances to true in order to track global state and schedule + // access to this service.
+ rv = registry->AddVoice(this, uri, nsDependentString(description), + nsDependentString(locale), true, true); + CoTaskMemFree(description); + if (NS_FAILED(rv)) { + continue; + } + + mVoices.InsertOrUpdate(uri, std::move(voiceToken)); + } + + return true; +} + +NS_IMETHODIMP +SapiService::Speak(const nsAString& aText, const nsAString& aUri, float aVolume, + float aRate, float aPitch, nsISpeechTask* aTask) { + NS_ENSURE_TRUE(mInitialized, NS_ERROR_NOT_AVAILABLE); + + RefPtr<ISpObjectToken> voiceToken; + if (!mVoices.Get(aUri, getter_AddRefs(voiceToken))) { + return NS_ERROR_NOT_AVAILABLE; + } + + RefPtr<ISpVoice> spVoice = InitSapiInstance(); + if (!spVoice) { + return NS_ERROR_FAILURE; + } + + if (FAILED(spVoice->SetVoice(voiceToken))) { + return NS_ERROR_FAILURE; + } + + if (FAILED(spVoice->SetVolume(static_cast<USHORT>(aVolume * 100)))) { + return NS_ERROR_FAILURE; + } + + // The max supported rate in SAPI engines is 3x, and the min is 1/3x. It is + // expressed by an integer. 0 being normal rate, -10 is 1/3 and 10 is 3x. + // Values below and above that are allowed, but the engine may clip the rate + // to its maximum capable value. + // "Each increment between -10 and +10 is logarithmically distributed such + // that incrementing or decrementing by 1 is multiplying or dividing the + // rate by the 10th root of 3" + // https://msdn.microsoft.com/en-us/library/ee431826(v=vs.85).aspx + long rate = aRate != 0 ?
static_cast(10 * log10(aRate) / log10(3)) : 0; + if (FAILED(spVoice->SetRate(rate))) { + return NS_ERROR_FAILURE; + } + + // Set the pitch using xml + nsAutoString xml; + xml.AssignLiteral("(aPitch * 10.0f - 10.0f)); + xml.AppendLiteral("\">"); + uint32_t textOffset = xml.Length(); + + for (size_t i = 0; i < aText.Length(); i++) { + switch (aText[i]) { + case '&': + xml.AppendLiteral("&"); + break; + case '<': + xml.AppendLiteral("<"); + break; + case '>': + xml.AppendLiteral(">"); + break; + default: + xml.Append(aText[i]); + break; + } + } + + xml.AppendLiteral(""); + + RefPtr callback = + new SapiCallback(aTask, spVoice, textOffset, aText.Length()); + + // The last three parameters doesn't matter for an indirect service + nsresult rv = aTask->Setup(callback); + if (NS_FAILED(rv)) { + return rv; + } + + ULONG streamNum; + if (FAILED(spVoice->Speak(xml.get(), SPF_ASYNC, &streamNum))) { + aTask->Setup(nullptr); + return NS_ERROR_FAILURE; + } + + callback->SetStreamNum(streamNum); + // streamNum reassigns same value when last stream is finished even if + // callback for stream end isn't called + // So we cannot use data hashtable and has to add it to vector at last. 
+ mCallbacks.AppendElement(callback); + + return NS_OK; +} + +NS_IMETHODIMP +SapiService::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + return NS_OK; +} + +SapiService* SapiService::GetInstance() { + MOZ_ASSERT(NS_IsMainThread()); + if (XRE_GetProcessType() != GeckoProcessType_Default) { + MOZ_ASSERT(false, "SapiService can only be started on main gecko process"); + return nullptr; + } + + if (!sSingleton) { + RefPtr service = new SapiService(); + if (service->Init()) { + sSingleton = service; + ClearOnShutdown(&sSingleton); + } + } + return sSingleton; +} + +already_AddRefed SapiService::GetInstanceForService() { + RefPtr sapiService = GetInstance(); + return sapiService.forget(); +} + +} // namespace mozilla::dom diff --git a/dom/media/webspeech/synth/windows/SapiService.h b/dom/media/webspeech/synth/windows/SapiService.h new file mode 100644 index 0000000000..79cc20917b --- /dev/null +++ b/dom/media/webspeech/synth/windows/SapiService.h @@ -0,0 +1,57 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/
+
+#ifndef mozilla_dom_SapiService_h
+#define mozilla_dom_SapiService_h
+
+#include "nsISpeechService.h"
+#include "nsIObserver.h"
+#include "nsRefPtrHashtable.h"
+#include "nsTArray.h"
+#include "mozilla/StaticPtr.h"
+
+#include <windows.h>
+#include <sapi.h>
+
+class nsISynthVoiceRegistry;
+
+namespace mozilla::dom {
+
+class SapiCallback;
+
+// Speech synthesis service backed by the Windows Speech API (SAPI).
+// Implements nsISpeechService and nsIObserver and is used as a singleton
+// obtained through GetInstance() / GetInstanceForService().
+class SapiService final : public nsISpeechService, public nsIObserver {
+ public:
+  NS_DECL_ISUPPORTS
+  NS_DECL_NSISPEECHSERVICE
+  NS_DECL_NSIOBSERVER
+
+  SapiService();
+  // Returns false when the service could not be set up; GetInstance() only
+  // keeps the instance when this returns true.
+  bool Init();
+
+  // Returns the singleton, creating it on first use (nullptr on failure).
+  static SapiService* GetInstance();
+  // Same as GetInstance(), but hands out an owning reference.
+  static already_AddRefed<SapiService> GetInstanceForService();
+
+  // NOTE(review): presumably the notification callback registered with SAPI
+  // voices; the definition is not part of this header - confirm in the .cpp.
+  static void __stdcall SpeechEventCallback(WPARAM aWParam, LPARAM aLParam);
+
+ private:
+  virtual ~SapiService();
+
+  // Supplies the ISpVoice that Speak() configures and speaks through.
+  already_AddRefed<ISpVoice> InitSapiInstance();
+  bool RegisterVoices();
+  // Registers every voice token found under the SAPI token category
+  // categoryId with the registry and records it in mVoices.
+  bool RegisterVoices(nsCOMPtr<nsISynthVoiceRegistry>& registry,
+                      const WCHAR* categoryId);
+
+  // SAPI voice tokens keyed by the voice URI that was registered for them.
+  nsRefPtrHashtable<nsStringHashKey, ISpObjectToken> mVoices;
+  // Callbacks of the utterances started by Speak().
+  nsTArray<RefPtr<SapiCallback>> mCallbacks;
+
+  // Speak() refuses to run (NS_ERROR_NOT_AVAILABLE) while this is false.
+  bool mInitialized;
+
+  static StaticRefPtr<SapiService> sSingleton;
+};
+
+}  // namespace mozilla::dom
+
+#endif
diff --git a/dom/media/webspeech/synth/windows/components.conf b/dom/media/webspeech/synth/windows/components.conf
new file mode 100644
index 0000000000..bc9b83a43a
--- /dev/null
+++ b/dom/media/webspeech/synth/windows/components.conf
@@ -0,0 +1,17 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Component registration for mozilla::dom::SapiService under the
+# @mozilla.org/synthsapi;1 contract ID.
+Classes = [
+    {
+        'cid': '{21b4a45b-9806-4021-a706-d768ab0548f9}',
+        'contract_ids': ['@mozilla.org/synthsapi;1'],
+        'singleton': True,
+        'type': 'mozilla::dom::SapiService',
+        'headers': ['/dom/media/webspeech/synth/windows/SapiService.h'],
+        # Instances come from this static factory rather than from a direct
+        # constructor call.
+        'constructor': 'mozilla::dom::SapiService::GetInstanceForService',
+        # NOTE(review): presumably this entry is what gets the service created
+        # when the "speech-synth-started" category is processed - confirm
+        # against the synth voice registry.
+        'categories': {"speech-synth-started": 'Sapi Speech Synth'},
+    },
+]
diff --git a/dom/media/webspeech/synth/windows/moz.build b/dom/media/webspeech/synth/windows/moz.build
new file mode 100644
index 0000000000..90bafe9ca7
--- /dev/null
+++ b/dom/media/webspeech/synth/windows/moz.build
@@ -0,0 +1,17 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# C++ sources of this directory.
+UNIFIED_SOURCES += [
+    "SapiService.cpp",
+]
+
+# Component manifest that registers SapiService.
+XPCOM_MANIFESTS += [
+    "components.conf",
+]
+
+# Shared build settings taken from /ipc/chromium.
+include("/ipc/chromium/chromium-config.mozbuild")
+
+# Everything in this directory is linked into the "xul" library.
+FINAL_LIBRARY = "xul"
-- 
cgit v1.2.3