diff options
Diffstat (limited to 'dom/media/webspeech/recognition/endpointer.h')
-rw-r--r-- | dom/media/webspeech/recognition/endpointer.h | 180 |
1 files changed, 180 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/endpointer.h b/dom/media/webspeech/recognition/endpointer.h new file mode 100644 index 0000000000..7879d6b9f3 --- /dev/null +++ b/dom/media/webspeech/recognition/endpointer.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "energy_endpointer.h" + +namespace mozilla { + +struct AudioChunk; + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. +// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. This is motivated by the expectation that there +// will be two distinct types of inputs: short search queries and longer +// dictation style input. +// +// Three parameters are used to define the piecewise constant timeout function. +// The timeout length is speech_input_complete_silence_length until +// long_speech_length, when it changes to +// long_speech_input_complete_silence_length. +class Endpointer { + public: + explicit Endpointer(int sample_rate); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Process a segment of audio, which may be more than one frame. + // The status of the last frame will be returned. + EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); + + // Get the status of the endpointer. + EpStatus Status(int64_t *time_us); + + // Get the expected frame size for audio chunks. Audio chunks are expected + // to contain a number of samples that is a multiple of this number, and extra + // samples will be dropped. + int32_t FrameSize() const { + return frame_size_; + } + + // Returns true if the endpointer detected reasonable audio levels above + // background noise which could be user speech, false if not. + bool DidStartReceivingSpeech() const { + return speech_previously_detected_; + } + + bool IsEstimatingEnvironment() const { + return energy_endpointer_.estimating_environment(); + } + + void set_speech_input_complete_silence_length(int64_t time_us) { + speech_input_complete_silence_length_us_ = time_us; + } + + void set_long_speech_input_complete_silence_length(int64_t time_us) { + long_speech_input_complete_silence_length_us_ = time_us; + } + + void set_speech_input_possibly_complete_silence_length(int64_t time_us) { + speech_input_possibly_complete_silence_length_us_ = time_us; + } + + void set_long_speech_length(int64_t time_us) { + long_speech_length_us_ = time_us; + } + + bool speech_input_complete() const { + return speech_input_complete_; + } + + // RMS background noise level in dB. + float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } + + private: + // Reset internal states. Helper method common to initial input utterance + // and following input utternaces. + void Reset(); + + // Minimum allowable length of speech input. + int64_t speech_input_minimum_length_us_; + + // The speechInputPossiblyComplete event signals that silence/noise has been + // detected for a *short* amount of time after some speech has been detected. + // This proporty specifies the time period. + int64_t speech_input_possibly_complete_silence_length_us_; + + // The speechInputComplete event signals that silence/noise has been + // detected for a *long* amount of time after some speech has been detected. + // This property specifies the time period. + int64_t speech_input_complete_silence_length_us_; + + // Same as above, this specifies the required silence period after speech + // detection. This period is used instead of + // speech_input_complete_silence_length_ when the utterance is longer than + // long_speech_length_. This parameter is optional. + int64_t long_speech_input_complete_silence_length_us_; + + // The period of time after which the endpointer should consider + // long_speech_input_complete_silence_length_ as a valid silence period + // instead of speech_input_complete_silence_length_. This parameter is + // optional. + int64_t long_speech_length_us_; + + // First speech onset time, used in determination of speech complete timeout. + int64_t speech_start_time_us_; + + // Most recent end time, used in determination of speech complete timeout. + int64_t speech_end_time_us_; + + int64_t audio_frame_time_us_; + EpStatus old_ep_status_; + bool waiting_for_speech_possibly_complete_timeout_; + bool waiting_for_speech_complete_timeout_; + bool speech_previously_detected_; + bool speech_input_complete_; + EnergyEndpointer energy_endpointer_; + int sample_rate_; + int32_t frame_size_; +}; + +} // namespace mozilla + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ |