// Copyright (c) 2013 The Chromium Authors. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ #include "energy_endpointer.h" namespace mozilla { struct AudioChunk; // A simple interface to the underlying energy-endpointer implementation, this // class lets callers provide audio as being recorded and let them poll to find // when the user has stopped speaking. // // There are two events that may trigger the end of speech: // // speechInputPossiblyComplete event: // // Signals that silence/noise has been detected for a *short* amount of // time after some speech has been detected. It can be used for low latency // UI feedback. To disable it, set it to a large amount. // // speechInputComplete event: // // This event is intended to signal end of input and to stop recording. // The amount of time to wait after speech is set by // speech_input_complete_silence_length_ and optionally two other // parameters (see below). // This time can be held constant, or can change as more speech is detected. // In the latter case, the time changes after a set amount of time from the // *beginning* of speech. This is motivated by the expectation that there // will be two distinct types of inputs: short search queries and longer // dictation style input. // // Three parameters are used to define the piecewise constant timeout function. // The timeout length is speech_input_complete_silence_length until // long_speech_length, when it changes to // long_speech_input_complete_silence_length. class Endpointer { public: explicit Endpointer(int sample_rate); // Start the endpointer. This should be called at the beginning of a session. void StartSession(); // Stop the endpointer. void EndSession(); // Start environment estimation. Audio will be used for environment estimation // i.e. noise level estimation. void SetEnvironmentEstimationMode(); // Start user input. This should be called when the user indicates start of // input, e.g. by pressing a button. void SetUserInputMode(); // Process a segment of audio, which may be more than one frame. // The status of the last frame will be returned. EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); // Get the status of the endpointer. EpStatus Status(int64_t *time_us); // Get the expected frame size for audio chunks. Audio chunks are expected // to contain a number of samples that is a multiple of this number, and extra // samples will be dropped. int32_t FrameSize() const { return frame_size_; } // Returns true if the endpointer detected reasonable audio levels above // background noise which could be user speech, false if not. bool DidStartReceivingSpeech() const { return speech_previously_detected_; } bool IsEstimatingEnvironment() const { return energy_endpointer_.estimating_environment(); } void set_speech_input_complete_silence_length(int64_t time_us) { speech_input_complete_silence_length_us_ = time_us; } void set_long_speech_input_complete_silence_length(int64_t time_us) { long_speech_input_complete_silence_length_us_ = time_us; } void set_speech_input_possibly_complete_silence_length(int64_t time_us) { speech_input_possibly_complete_silence_length_us_ = time_us; } void set_long_speech_length(int64_t time_us) { long_speech_length_us_ = time_us; } bool speech_input_complete() const { return speech_input_complete_; } // RMS background noise level in dB. float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } private: // Reset internal states. Helper method common to initial input utterance // and following input utternaces. void Reset(); // Minimum allowable length of speech input. int64_t speech_input_minimum_length_us_; // The speechInputPossiblyComplete event signals that silence/noise has been // detected for a *short* amount of time after some speech has been detected. // This proporty specifies the time period. int64_t speech_input_possibly_complete_silence_length_us_; // The speechInputComplete event signals that silence/noise has been // detected for a *long* amount of time after some speech has been detected. // This property specifies the time period. int64_t speech_input_complete_silence_length_us_; // Same as above, this specifies the required silence period after speech // detection. This period is used instead of // speech_input_complete_silence_length_ when the utterance is longer than // long_speech_length_. This parameter is optional. int64_t long_speech_input_complete_silence_length_us_; // The period of time after which the endpointer should consider // long_speech_input_complete_silence_length_ as a valid silence period // instead of speech_input_complete_silence_length_. This parameter is // optional. int64_t long_speech_length_us_; // First speech onset time, used in determination of speech complete timeout. int64_t speech_start_time_us_; // Most recent end time, used in determination of speech complete timeout. int64_t speech_end_time_us_; int64_t audio_frame_time_us_; EpStatus old_ep_status_; bool waiting_for_speech_possibly_complete_timeout_; bool waiting_for_speech_complete_timeout_; bool speech_previously_detected_; bool speech_input_complete_; EnergyEndpointer energy_endpointer_; int sample_rate_; int32_t frame_size_; }; } // namespace mozilla #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_