// energy_endpointer.cc
// Path: mozilla-central/dom/media/webspeech/recognition/energy_endpointer.cc

// Copyright (c) 2013 The Chromium Authors. All rights reserved.

//

// Redistribution and use in source and binary forms, with or without

// modification, are permitted provided that the following conditions are

// met:

//

//    * Redistributions of source code must retain the above copyright

// notice, this list of conditions and the following disclaimer.

//    * Redistributions in binary form must reproduce the above

// copyright notice, this list of conditions and the following disclaimer

// in the documentation and/or other materials provided with the

// distribution.

//    * Neither the name of Google Inc. nor the names of its

// contributors may be used to endorse or promote products derived from

// this software without specific prior written permission.

//

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "energy_endpointer.h"

#include <math.h>

namespace {

// Returns the RMS (quadratic mean) of the input signal after its mean (DC
// offset) has been removed, i.e. the standard deviation of the samples.
//
// |samples| points at |num_samples| 16-bit PCM values. A null pointer or a
// non-positive count is treated as a silent frame and yields 0 instead of the
// NaN that a 0/0 division would produce.
float RMS(const int16_t* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so the sums stay exact.
  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // now convert to floats.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;
  const double variance = mean_square - (mean * mean);

  // Floating-point rounding may leave a tiny negative variance; sqrt() of
  // that would be NaN, so report silence instead.
  if (!(variance > 0.0))
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}

// Converts a duration in seconds to the nearest whole number of microseconds.
// Halfway cases round away from zero, so negative durations round correctly
// too (adding 0.5 and truncating would pull them towards zero).
int64_t Secs2Usecs(float seconds) {
  return static_cast<int64_t>(llround(1.0e6 * seconds));
}

// Converts a linear amplitude to decibels (20 * log10(value)).
// Zero, negative and NaN inputs have no logarithm and return -2000 instead.
// The original cut-off of 1.0e-100 is below the smallest positive float, so
// the test was always equivalent to "value is positive"; it is written that
// way here.
float GetDecibel(float value) {
  if (value > 0.0f)
    return 20 * log10(value);
  return -2000.0;
}

}  // namespace

namespace mozilla {

// Fixed-size ring of timestamped threshold decisions. The endpointer consults
// it to decide which speech state it is in.
class EnergyEndpointer::HistoryRing {
 public:
  HistoryRing() : insertion_index_(0) {}

  // Discards the current contents and refills the ring with |size| entries,
  // each holding the decision |initial_state|.
  void SetRing(int size, bool initial_state);

  // Stores |decision| with timestamp |time_us| in the next slot; once the ring
  // has wrapped this overwrites the oldest entry.
  void Insert(int64_t time_us, bool decision);

  // Timestamp, in microseconds, of the entry that was inserted last.
  int64_t EndTime() const;

  // Adds up the intervals during which the decision was true, looking back
  // |duration_sec| seconds from the most recent entry. The result is in
  // seconds.
  float RingSum(float duration_sec);

 private:
  // Copy construction and assignment are declared private; no definition
  // appears in this file.
  HistoryRing(const HistoryRing&);
  void operator=(const HistoryRing&);

  // A single decision and the time at which it was taken.
  struct DecisionPoint {
    int64_t time_us;
    bool decision;
  };

  std::vector<DecisionPoint> decision_points_;
  int insertion_index_;  // Slot that the next Insert() writes to.
};

void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {

  insertion_index_ = 0;

  decision_points_.clear();

  DecisionPoint init = { -1, initial_state };

  decision_points_.resize(size, init);

void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {

  decision_points_[insertion_index_].time_us = time_us;

  decision_points_[insertion_index_].decision = decision;

  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();

// Returns the timestamp, in microseconds, of the entry just before the
// insertion point, i.e. the one written most recently.
// Returns -1 for a ring with no slots; this is the same timestamp SetRing()
// gives to entries that have not been written yet.
int64_t EnergyEndpointer::HistoryRing::EndTime() const {
  if (decision_points_.empty())
    return -1;

  int ind = insertion_index_ - 1;
  if (ind < 0)
    ind = decision_points_.size() - 1;  // Wrap to the last slot.
  return decision_points_[ind].time_us;
}

float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {

  if (decision_points_.empty())

    return 0.0;

  int64_t sum_us = 0;

  int ind = insertion_index_ - 1;

  if (ind < 0)

    ind = decision_points_.size() - 1;

  int64_t end_us = decision_points_[ind].time_us;

  bool is_on = decision_points_[ind].decision;

  int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));

  if (start_us < 0)

    start_us = 0;

  size_t n_summed = 1;  // n points ==> (n-1) intervals

  while ((decision_points_[ind].time_us > start_us) &&

         (n_summed < decision_points_.size())) {

    --ind;

    if (ind < 0)

      ind = decision_points_.size() - 1;

    if (is_on)

      sum_us += end_us - decision_points_[ind].time_us;

    is_on = decision_points_[ind].decision;

    end_us = decision_points_[ind].time_us;

    n_summed++;

  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.

// Creates an endpointer in the pre-speech state. Levels, counters, lags and
// timestamps start at zero, the maximum window defaults to 4.0, and the
// history ring is allocated but holds no entries. Init() supplies the actual
// parameters.
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      max_window_dur_(4.0),
      sample_rate_(0),
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {}

// No explicit cleanup is performed here.
EnergyEndpointer::~EnergyEndpointer() {}

// Converts |time| to a frame count by dividing it by the configured frame
// period and rounding to the nearest integer.
int EnergyEndpointer::TimeToFrame(float time) const {
  const double frames = time / params_.frame_period();
  return static_cast<int32_t>(0.5 + frames);
}

// Puts the endpointer back into the pre-speech state with a cleared history.
// When |reset_threshold| is true the decision threshold, the adaptive RMS
// level, the noise level and the frame counter are re-seeded from the
// configured parameters as well.
void EnergyEndpointer::Restart(bool reset_threshold) {
  status_ = EP_PRE_SPEECH;
  user_input_start_time_us_ = 0;

  if (reset_threshold) {
    decision_threshold_ = params_.decision_threshold();
    rms_adapt_ = decision_threshold_;
    noise_level_ = params_.decision_threshold() / 2.0f;
    frame_counter_ = 0;  // Restarts the rapid initial update of levels.
  }

  // Size the history ring to cover the longest window, with every entry
  // starting out below threshold.
  const int ring_frames = TimeToFrame(max_window_dur_);
  history_->SetRing(ring_frames, false);

  // While this flag is set, incoming audio is used for estimating the
  // environment because the user has not yet started input (e.g. by pressing
  // push-to-talk). It defaults to false for backward compatibility.
  estimating_environment_ = false;
}

// Stores |params| and derives all dependent state from it: the history length,
// the offset confirmation time, the fast-update frame count, the sample rate
// and the two lag values. The endpointer is restarted with fresh thresholds as
// part of this.
void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
  params_ = params;

  // The history ring has to cover the longest of the three windows.
  // NOTE: This depends upon ep_frame_period being set correctly in the
  // factory that did this instantiation.
  max_window_dur_ = params_.onset_window();
  if (max_window_dur_ < params_.speech_on_window())
    max_window_dur_ = params_.speech_on_window();
  if (max_window_dur_ < params_.offset_window())
    max_window_dur_ = params_.offset_window();
  Restart(true);

  // Offset window minus the offset confirmation duration, floored at zero.
  offset_confirm_dur_sec_ =
      params_.offset_window() - params_.offset_confirm_dur();
  if (offset_confirm_dur_sec_ < 0.0)
    offset_confirm_dur_sec_ = 0.0;

  user_input_start_time_us_ = 0;

  // While this flag is set, incoming audio is used for estimating the
  // environment because the user has not yet started input (e.g. by pressing
  // push-to-talk). It defaults to false for backward compatibility.
  estimating_environment_ = false;

  // The initial value of the noise and speech levels is inconsequential.
  // The level of the first frame will overwrite these values.
  noise_level_ = params_.decision_threshold() / 2.0f;

  // Number of frames that fit into the fast-update duration.
  fast_update_frames_ =
      static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
  frame_counter_ = 0;  // Used for rapid initial update of levels.

  // Lags are the sample rate divided by the highest and the lowest
  // fundamental frequency respectively.
  sample_rate_ = params_.sample_rate();
  start_lag_ =
      static_cast<int>(sample_rate_ / params_.max_fundamental_frequency());
  end_lag_ =
      static_cast<int>(sample_rate_ / params_.min_fundamental_frequency());
}

// Begins a new session by restarting the endpointer with fresh thresholds.
void EnergyEndpointer::StartSession() {
  Restart(true);
}

// Ends the session by moving the endpointer to the post-speech state.
void EnergyEndpointer::EndSession() {
  status_ = EP_POST_SPEECH;
}

// Restarts the endpointer with fresh thresholds and switches it to
// environment estimation. The flag has to be set after Restart(), which
// clears it.
void EnergyEndpointer::SetEnvironmentEstimationMode() {
  Restart(true);
  estimating_environment_ = true;
}

// Leaves environment estimation and records the time of the most recently
// processed frame as the start of user input.
void EnergyEndpointer::SetUserInputMode() {
  estimating_environment_ = false;
  user_input_start_time_us_ = endpointer_time_us_;
}

// Processes one frame of audio stamped |time_us|.
//
// Outside environment estimation the frame is classified against the current
// decision threshold, the classification is appended to the history ring, the
// speech state machine is advanced and the threshold is adapted. In every mode
// the noise level is updated and the frame counter incremented.
//
// If |rms_out| is non-null it receives the level of the frame in decibels.
void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
                                         const int16_t* samples,
                                         int num_samples,
                                         float* rms_out) {
  endpointer_time_us_ = time_us;
  const float rms = RMS(samples, num_samples);

  // Check that this is user input audio vs. pre-input adaptation audio.
  // Input audio starts when the user indicates start of input, by e.g.
  // pressing push-to-talk. Audio received prior to that is used to update
  // noise and speech level estimates.
  if (!estimating_environment_) {
    // Frames that fall inside the contamination rejection period after the
    // start of user input are always classified as below threshold.
    const bool in_rejection_period =
        (endpointer_time_us_ - user_input_start_time_us_) <
        Secs2Usecs(params_.contamination_rejection_period());
    const bool decision = !in_rejection_period && (rms > decision_threshold_);

    history_->Insert(endpointer_time_us_, decision);

    switch (status_) {
      case EP_PRE_SPEECH:
        if (history_->RingSum(params_.onset_window()) >
            params_.onset_detect_dur()) {
          status_ = EP_POSSIBLE_ONSET;
        }
        break;

      case EP_POSSIBLE_ONSET: {
        const float onset_time = history_->RingSum(params_.onset_window());
        if (onset_time > params_.onset_confirm_dur()) {
          status_ = EP_SPEECH_PRESENT;
        } else if (onset_time <= params_.onset_detect_dur()) {
          // If signal is not maintained, drop back to pre-speech.
          status_ = EP_PRE_SPEECH;
        }
        break;
      }

      case EP_SPEECH_PRESENT: {
        // To induce hysteresis in the state residency, we allow a
        // smaller residency time in the on_ring, than was required to
        // enter the SPEECH_PRESENT state.
        const float speech_time =
            history_->RingSum(params_.speech_on_window());
        if (speech_time < params_.on_maintain_dur())
          status_ = EP_POSSIBLE_OFFSET;
        break;
      }

      case EP_POSSIBLE_OFFSET:
        if (history_->RingSum(params_.offset_window()) <=
            offset_confirm_dur_sec_) {
          // Note that this offset time may be beyond the end
          // of the input buffer in a real-time system.  It will be up
          // to the RecognizerSession to decide what to do.
          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
        } else if (history_->RingSum(params_.speech_on_window()) >=
                   params_.on_maintain_dur()) {
          // If speech picks up again we allow return to SPEECH_PRESENT.
          status_ = EP_SPEECH_PRESENT;
        }
        break;

      default:
        break;
    }

    if (!decision && (status_ == EP_PRE_SPEECH)) {
      // If this is a quiet, non-speech region, slowly adapt the detection
      // threshold to be about 6dB above the average RMS.
      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
      rms_adapt_ = decision_threshold_;
    } else if ((status_ == EP_SPEECH_PRESENT) && decision) {
      // If this is in a speech region, adapt the decision threshold to
      // be about 10dB below the average RMS. If the noise level is high,
      // the threshold is pushed up.
      // Adaptation up to a higher level is 5 times faster than decay to
      // a lower level.
      if (rms_adapt_ > rms) {
        rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
      } else {
        rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
      }
      const float target_threshold = 0.3f * rms_adapt_ + noise_level_;
      decision_threshold_ = (.90f * decision_threshold_) +
                            (0.10f * target_threshold);
    }

    // Set a floor
    if (decision_threshold_ < params_.min_decision_threshold())
      decision_threshold_ = params_.min_decision_threshold();
  }

  // Update speech and noise levels.
  UpdateLevels(rms);
  ++frame_counter_;

  if (rms_out)
    *rms_out = GetDecibel(rms);
}

// Returns the current noise level estimate converted to decibels.
float EnergyEndpointer::GetNoiseLevelDb() const {
  return GetDecibel(noise_level_);
}

// Folds the level |rms| of the current frame into the noise level estimate.
// During the fast-update period or while estimating the environment, the
// decision threshold is also re-derived from the noise level.
void EnergyEndpointer::UpdateLevels(float rms) {
  const bool in_fast_update = frame_counter_ < fast_update_frames_;

  if (in_fast_update) {
    // Update quickly initially. We assume this is noise and that
    // speech is 6dB above the noise.
    // Alpha increases from 0 to (k-1)/k where k is the number of time
    // steps in the initial adaptation period.
    const float alpha = static_cast<float>(frame_counter_) /
        static_cast<float>(fast_update_frames_);
    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
  } else if (noise_level_ < rms) {
    // The noise level adapts slowly upward ...
    noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
  } else {
    // ... but quickly downward.
    noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
  }

  if (!estimating_environment_ && !in_fast_update)
    return;

  decision_threshold_ = noise_level_ * 2;  // 6dB above noise level.

  // Set a floor
  if (decision_threshold_ < params_.min_decision_threshold())
    decision_threshold_ = params_.min_decision_threshold();
}

// Returns the current endpointer state. If |status_time| is non-null it
// receives the timestamp, in microseconds, of the most recent history entry.
// A null |status_time| is tolerated, in line with the optional |rms_out|
// parameter of ProcessAudioFrame().
EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
  if (status_time)
    *status_time = history_->EndTime();
  return status_;
}

}  // namespace mozilla