diff options
Diffstat (limited to 'dom/media/webspeech/synth/windows/SapiService.cpp')
-rw-r--r-- | dom/media/webspeech/synth/windows/SapiService.cpp | 445 |
1 files changed, 445 insertions, 0 deletions
diff --git a/dom/media/webspeech/synth/windows/SapiService.cpp b/dom/media/webspeech/synth/windows/SapiService.cpp new file mode 100644 index 0000000000..f1e44213d1 --- /dev/null +++ b/dom/media/webspeech/synth/windows/SapiService.cpp @@ -0,0 +1,445 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.h" +#include "SapiService.h" +#include "nsServiceManagerUtils.h" +#include "nsEscape.h" +#include "nsXULAppAPI.h" + +#include "mozilla/ClearOnShutdown.h" +#include "mozilla/dom/nsSynthVoiceRegistry.h" +#include "mozilla/dom/nsSpeechTask.h" +#include "mozilla/Preferences.h" +#include "mozilla/ProfilerLabels.h" +#include "mozilla/StaticPrefs_media.h" + +namespace mozilla::dom { + +constexpr static WCHAR kSpCategoryOneCoreVoices[] = + L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech_OneCore\\Voices"; + +StaticRefPtr<SapiService> SapiService::sSingleton; + +class SapiCallback final : public nsISpeechTaskCallback { + public: + SapiCallback(nsISpeechTask* aTask, ISpVoice* aSapiClient, + uint32_t aTextOffset, uint32_t aSpeakTextLen) + : mTask(aTask), + mSapiClient(aSapiClient), + mTextOffset(aTextOffset), + mSpeakTextLen(aSpeakTextLen), + mCurrentIndex(0), + mStreamNum(0) { + mStartingTime = TimeStamp::Now(); + } + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SapiCallback, nsISpeechTaskCallback) + + NS_DECL_NSISPEECHTASKCALLBACK + + ULONG GetStreamNum() const { return mStreamNum; } + void SetStreamNum(ULONG aValue) { mStreamNum = aValue; } + + void OnSpeechEvent(const SPEVENT& speechEvent); + + private: + ~SapiCallback() {} + + float GetTimeDurationFromStart() const { + TimeDuration duration = TimeStamp::Now() - mStartingTime; + return duration.ToSeconds(); + } + + // This pointer is used to dispatch events + nsCOMPtr<nsISpeechTask> mTask; + RefPtr<ISpVoice> mSapiClient; + + uint32_t mTextOffset; + uint32_t mSpeakTextLen; + + // Used for calculating the time taken to speak the utterance + TimeStamp mStartingTime; + uint32_t mCurrentIndex; + + ULONG mStreamNum; +}; + +NS_IMPL_CYCLE_COLLECTION(SapiCallback, mTask); + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SapiCallback) + NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(SapiCallback) +NS_IMPL_CYCLE_COLLECTING_RELEASE(SapiCallback) + +NS_IMETHODIMP +SapiCallback::OnPause() { + if (FAILED(mSapiClient->Pause())) { + return NS_ERROR_FAILURE; + } + if (!mTask) { + // When calling pause() on child porcess, it may not receive end event + // from chrome process yet. + return NS_ERROR_FAILURE; + } + mTask->DispatchPause(GetTimeDurationFromStart(), mCurrentIndex); + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnResume() { + if (FAILED(mSapiClient->Resume())) { + return NS_ERROR_FAILURE; + } + if (!mTask) { + // When calling resume() on child porcess, it may not receive end event + // from chrome process yet. + return NS_ERROR_FAILURE; + } + mTask->DispatchResume(GetTimeDurationFromStart(), mCurrentIndex); + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnCancel() { + // After cancel, mCurrentIndex may be updated. + // At cancel case, use mCurrentIndex for DispatchEnd. + mSpeakTextLen = 0; + // Purge all the previous utterances and speak an empty string + if (FAILED(mSapiClient->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr))) { + return NS_ERROR_FAILURE; + } + return NS_OK; +} + +NS_IMETHODIMP +SapiCallback::OnVolumeChanged(float aVolume) { + mSapiClient->SetVolume(static_cast<USHORT>(aVolume * 100)); + return NS_OK; +} + +void SapiCallback::OnSpeechEvent(const SPEVENT& speechEvent) { + switch (speechEvent.eEventId) { + case SPEI_START_INPUT_STREAM: + mTask->DispatchStart(); + break; + case SPEI_END_INPUT_STREAM: + if (mSpeakTextLen) { + mCurrentIndex = mSpeakTextLen; + } + mTask->DispatchEnd(GetTimeDurationFromStart(), mCurrentIndex); + mTask = nullptr; + break; + case SPEI_TTS_BOOKMARK: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"mark"_ns, GetTimeDurationFromStart(), + mCurrentIndex, 0, 0); + break; + case SPEI_WORD_BOUNDARY: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"word"_ns, GetTimeDurationFromStart(), + mCurrentIndex, + static_cast<ULONG>(speechEvent.wParam), 1); + break; + case SPEI_SENTENCE_BOUNDARY: + mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; + mTask->DispatchBoundary(u"sentence"_ns, GetTimeDurationFromStart(), + mCurrentIndex, + static_cast<ULONG>(speechEvent.wParam), 1); + break; + default: + break; + } +} + +// static +void __stdcall SapiService::SpeechEventCallback(WPARAM aWParam, + LPARAM aLParam) { + RefPtr<ISpVoice> spVoice = (ISpVoice*)aWParam; + RefPtr<SapiService> service = (SapiService*)aLParam; + + SPEVENT speechEvent; + while (spVoice->GetEvents(1, &speechEvent, nullptr) == S_OK) { + for (size_t i = 0; i < service->mCallbacks.Length(); i++) { + RefPtr<SapiCallback> callback = service->mCallbacks[i]; + if (callback->GetStreamNum() == speechEvent.ulStreamNum) { + callback->OnSpeechEvent(speechEvent); + if (speechEvent.eEventId == SPEI_END_INPUT_STREAM) { + service->mCallbacks.RemoveElementAt(i); + } + break; + } + } + } +} + +NS_INTERFACE_MAP_BEGIN(SapiService) + NS_INTERFACE_MAP_ENTRY(nsISpeechService) + NS_INTERFACE_MAP_ENTRY(nsIObserver) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechService) +NS_INTERFACE_MAP_END + +NS_IMPL_ADDREF(SapiService) +NS_IMPL_RELEASE(SapiService) + +SapiService::SapiService() : mInitialized(false) {} + +SapiService::~SapiService() {} + +bool SapiService::Init() { + AUTO_PROFILER_LABEL("SapiService::Init", OTHER); + + MOZ_ASSERT(!mInitialized); + + if (Preferences::GetBool("media.webspeech.synth.test") || + !StaticPrefs::media_webspeech_synth_enabled()) { + // When enabled, we shouldn't add OS backend (Bug 1160844) + return false; + } + + // Get all the voices from sapi and register in the SynthVoiceRegistry + if (!RegisterVoices()) { + return false; + } + + mInitialized = true; + return true; +} + +already_AddRefed<ISpVoice> SapiService::InitSapiInstance() { + RefPtr<ISpVoice> spVoice; + if (FAILED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, + getter_AddRefs(spVoice)))) { + return nullptr; + } + + // Set interest for all the events we are interested in + ULONGLONG eventMask = SPFEI(SPEI_START_INPUT_STREAM) | + SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | + SPFEI(SPEI_SENTENCE_BOUNDARY) | + SPFEI(SPEI_END_INPUT_STREAM); + + if (FAILED(spVoice->SetInterest(eventMask, eventMask))) { + return nullptr; + } + + // Set the callback function for receiving the events + spVoice->SetNotifyCallbackFunction( + (SPNOTIFYCALLBACK*)SapiService::SpeechEventCallback, + (WPARAM)spVoice.get(), (LPARAM)this); + + return spVoice.forget(); +} + +bool SapiService::RegisterVoices() { + nsCOMPtr<nsISynthVoiceRegistry> registry = + do_GetService(NS_SYNTHVOICEREGISTRY_CONTRACTID); + if (!registry) { + return false; + } + bool result = RegisterVoices(registry, kSpCategoryOneCoreVoices); + result |= RegisterVoices(registry, SPCAT_VOICES); + if (result) { + registry->NotifyVoicesChanged(); + } + return result; +} + +bool SapiService::RegisterVoices(nsCOMPtr<nsISynthVoiceRegistry>& registry, + const WCHAR* categoryId) { + nsresult rv; + + RefPtr<ISpObjectTokenCategory> category; + if (FAILED(CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, + IID_ISpObjectTokenCategory, + getter_AddRefs(category)))) { + return false; + } + if (FAILED(category->SetId(categoryId, FALSE))) { + return false; + } + + RefPtr<IEnumSpObjectTokens> voiceTokens; + if (FAILED(category->EnumTokens(nullptr, nullptr, + getter_AddRefs(voiceTokens)))) { + return false; + } + + WCHAR locale[LOCALE_NAME_MAX_LENGTH]; + while (true) { + RefPtr<ISpObjectToken> voiceToken; + if (voiceTokens->Next(1, getter_AddRefs(voiceToken), nullptr) != S_OK) { + break; + } + + RefPtr<ISpDataKey> attributes; + if (FAILED( + voiceToken->OpenKey(L"Attributes", getter_AddRefs(attributes)))) { + continue; + } + + WCHAR* language = nullptr; + if (FAILED(attributes->GetStringValue(L"Language", &language))) { + continue; + } + + // Language attribute is LCID by hex. So we need convert to locale + // name. + nsAutoString hexLcid; + LCID lcid = wcstol(language, nullptr, 16); + CoTaskMemFree(language); + if (NS_WARN_IF( + !LCIDToLocaleName(lcid, locale, LOCALE_NAME_MAX_LENGTH, 0))) { + continue; + } + + WCHAR* description = nullptr; + if (FAILED(voiceToken->GetStringValue(nullptr, &description))) { + continue; + } + + nsAutoString uri; + uri.AssignLiteral("urn:moz-tts:sapi:"); + uri.Append(description); + uri.AppendLiteral("?"); + uri.Append(locale); + + // This service can only speak one utterance at a time, se we set + // aQueuesUtterances to true in order to track global state and schedule + // access to this service. + rv = registry->AddVoice(this, uri, nsDependentString(description), + nsDependentString(locale), true, true); + CoTaskMemFree(description); + if (NS_FAILED(rv)) { + continue; + } + + mVoices.InsertOrUpdate(uri, std::move(voiceToken)); + } + + return true; +} + +NS_IMETHODIMP +SapiService::Speak(const nsAString& aText, const nsAString& aUri, float aVolume, + float aRate, float aPitch, nsISpeechTask* aTask) { + NS_ENSURE_TRUE(mInitialized, NS_ERROR_NOT_AVAILABLE); + + RefPtr<ISpObjectToken> voiceToken; + if (!mVoices.Get(aUri, getter_AddRefs(voiceToken))) { + return NS_ERROR_NOT_AVAILABLE; + } + + RefPtr<ISpVoice> spVoice = InitSapiInstance(); + if (!spVoice) { + return NS_ERROR_FAILURE; + } + + if (FAILED(spVoice->SetVoice(voiceToken))) { + return NS_ERROR_FAILURE; + } + + if (FAILED(spVoice->SetVolume(static_cast<USHORT>(aVolume * 100)))) { + return NS_ERROR_FAILURE; + } + + // The max supported rate in SAPI engines is 3x, and the min is 1/3x. It is + // expressed by an integer. 0 being normal rate, -10 is 1/3 and 10 is 3x. + // Values below and above that are allowed, but the engine may clip the rate + // to its maximum capable value. + // "Each increment between -10 and +10 is logarithmically distributed such + // that incrementing or decrementing by 1 is multiplying or dividing the + // rate by the 10th root of 3" + // https://msdn.microsoft.com/en-us/library/ee431826(v=vs.85).aspx + long rate = aRate != 0 ? static_cast<long>(10 * log10(aRate) / log10(3)) : 0; + if (FAILED(spVoice->SetRate(rate))) { + return NS_ERROR_FAILURE; + } + + // Set the pitch using xml + nsAutoString xml; + xml.AssignLiteral("<pitch absmiddle=\""); + // absmiddle doesn't allow float type + xml.AppendInt(static_cast<int32_t>(aPitch * 10.0f - 10.0f)); + xml.AppendLiteral("\">"); + uint32_t textOffset = xml.Length(); + + for (size_t i = 0; i < aText.Length(); i++) { + switch (aText[i]) { + case '&': + xml.AppendLiteral("&"); + break; + case '<': + xml.AppendLiteral("<"); + break; + case '>': + xml.AppendLiteral(">"); + break; + default: + xml.Append(aText[i]); + break; + } + } + + xml.AppendLiteral("</pitch>"); + + RefPtr<SapiCallback> callback = + new SapiCallback(aTask, spVoice, textOffset, aText.Length()); + + // The last three parameters doesn't matter for an indirect service + nsresult rv = aTask->Setup(callback); + if (NS_FAILED(rv)) { + return rv; + } + + ULONG streamNum; + if (FAILED(spVoice->Speak(xml.get(), SPF_ASYNC, &streamNum))) { + aTask->Setup(nullptr); + return NS_ERROR_FAILURE; + } + + callback->SetStreamNum(streamNum); + // streamNum reassigns same value when last stream is finished even if + // callback for stream end isn't called + // So we cannot use data hashtable and has to add it to vector at last. + mCallbacks.AppendElement(callback); + + return NS_OK; +} + +NS_IMETHODIMP +SapiService::Observe(nsISupports* aSubject, const char* aTopic, + const char16_t* aData) { + return NS_OK; +} + +SapiService* SapiService::GetInstance() { + MOZ_ASSERT(NS_IsMainThread()); + if (XRE_GetProcessType() != GeckoProcessType_Default) { + MOZ_ASSERT(false, "SapiService can only be started on main gecko process"); + return nullptr; + } + + if (!sSingleton) { + RefPtr<SapiService> service = new SapiService(); + if (service->Init()) { + sSingleton = service; + ClearOnShutdown(&sSingleton); + } + } + return sSingleton; +} + +already_AddRefed<SapiService> SapiService::GetInstanceForService() { + RefPtr<SapiService> sapiService = GetInstance(); + return sapiService.forget(); +} + +} // namespace mozilla::dom |