Sign In

  • Username:
  • Password:

Upload File


HMM in Speech Recognition

I choose Hidden Markov Model (HMM) for my BirdSong-Recognition Project.

0x01 What’s HMM?


HMMs are great at modeling time series data.
As an audio signal is a time series signal, HMMs perfectly suit our needs.
An HMM is a model that represents a probability distribution over sequences of observations.
We assume that the outputs are generated by hidden states.
So, our goal is to find these hidden states so that we can model the signal.

0x02 How to build?

I use the Python library hmmlearn.
Below is the classic Rainy/Sunny example implemented with hmmlearn.



from __future__ import division
import numpy as np, random
from hmmlearn import hmm

# Classic "Rainy/Sunny" HMM example: the weather is the hidden state and the
# activity Bob reports ("walk", "shop", "clean") is the visible observation.
states = ["Rainy", "Sunny"]
n_states = len(states)

observations = ["walk", "shop", "clean"]
n_observations = len(observations)

# One entry per hidden state, in the order of `states`.
start_probability = np.array([0.6, 0.4])

# One row and one column per hidden state (n_states x n_states).
transition_probability = np.array([
    [0.7, 0.3],
    [0.4, 0.6],
])

# One row per hidden state, one column per observation symbol.
emission_probability = np.array([
    [0.1, 0.4, 0.5],
    [0.6, 0.3, 0.1],
])

model = hmm.MultinomialHMM(n_components=n_states, n_iter=1000)
# NOTE(review): recent hmmlearn releases expose the fitted parameters as
# `startprob_`, `transmat_` and `emissionprob_`; confirm that the attribute
# names below are the ones honoured by the installed version.
model.startprob = start_probability
model.transmat = transition_probability
model.emissionprob = emission_probability

print(model.transmat)
print(model.emissionprob)
print(model.startprob)

# Build a training set of 100 random observation sequences, each 5 to 10
# symbols long.  All sequences are stored back to back in `seq`; `lengths`
# records where each one ends so that fit() can split them again.
seq = []
lengths = []

for _ in range(100):
    length = random.randint(5, 10)
    lengths.append(length)
    for _ in range(length):
        r = random.random()
        # NOTE(review): the symbol index chosen in each branch is assumed
        # (0 = walk, 1 = shop, 2 = clean) -- confirm the intended mix.
        if r < .2:
            seq.append(0)
        elif r < .6:
            seq.append(1)
        else:
            seq.append(2)

# fit() takes a column vector of shape (n_samples, 1).
seq = np.array([seq]).T
model = model.fit(seq, lengths)

# Predict the sequence of hidden states behind a sequence of visible ones.
bob_says = np.array([[0, 2, 1, 1, 2, 0]]).T
logprob, alice_hears = model.decode(bob_says, algorithm="viterbi")

# Single-argument print() produces the same output on Python 2 and 3.
print("Bob says: " + ", ".join(observations[x] for x in bob_says.T[0]))
print("Alice hears: " + ", ".join(states[x] for x in alice_hears))

0x03 A Speech-Recg Example

The dataset contains seven different words, where each word has 15 audio files associated with it.
It’s small, but it is sufficient to understand how to build a speech recognizer that can recognize seven different words.

import argparse
import os

import numpy as np
from features import mfcc
from hmmlearn import hmm
from scipy.io import wavfile

# Function to parse input arguments
def build_arg_parser():
    """Create the command-line parser for the HMM training script.

    The parser accepts one mandatory option, ``--input-folder``, whose value
    is stored on the parsed namespace as ``input_folder``.

    Returns:
        The configured ``argparse.ArgumentParser`` (arguments not yet parsed).
    """
    arg_parser = argparse.ArgumentParser(
        description='Trains the HMM classifier')
    input_folder_options = {
        'dest': 'input_folder',
        'required': True,
        'help': 'Input folder containing the audio files in subfolders',
    }
    arg_parser.add_argument('--input-folder', **input_folder_options)
    return arg_parser

# Class to handle all HMM related processing
class HMMTrainer(object):
    """Thin wrapper around a single hmmlearn model.

    Args:
        model_name: Kind of HMM to build; only ``'GaussianHMM'`` is supported.
        n_components: Passed to ``hmm.GaussianHMM`` as ``n_components``.
        cov_type: Passed to ``hmm.GaussianHMM`` as ``covariance_type``.
        n_iter: Passed to ``hmm.GaussianHMM`` as ``n_iter``.

    Raises:
        TypeError: If ``model_name`` is not a supported model type.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        self.models = []

        # Reject unknown model types only; a supported type must not raise.
        if self.model_name != 'GaussianHMM':
            raise TypeError('Invalid model type')

        self.model = hmm.GaussianHMM(n_components=self.n_components,
                covariance_type=self.cov_type, n_iter=self.n_iter)

    # X is a 2D numpy array where each row is 13D
    def train(self, X):
        """Fit the wrapped model on ``X`` and record the result in ``self.models``."""
        self.models.append(self.model.fit(X))

    # Run the model on input data
    def get_score(self, input_data):
        """Return ``self.model.score(input_data)``."""
        return self.model.score(input_data)

if __name__ == '__main__':
    # Train one HMM per class sub-folder of the input folder, then classify
    # the recording held out from each class and print true vs. predicted.
    args = build_arg_parser().parse_args()
    input_folder = args.input_folder

    hmm_models = []   # (trained HMMTrainer, label) pairs, one per class
    input_files = []  # held-out test recordings, one per class

    # Parse the input directory
    for dirname in os.listdir(input_folder):
        subfolder = os.path.join(input_folder, dirname)

        # Only sub-folders hold class data; ignore stray files.
        if not os.path.isdir(subfolder):
            continue

        # The sub-folder name is the class label.
        label = dirname

        # os.listdir() order is arbitrary; sorting makes the split between
        # training files and the held-out file reproducible.
        wav_files = sorted(x for x in os.listdir(subfolder) if x.endswith('.wav'))

        # A class needs at least one training file plus one test file.
        if len(wav_files) < 2:
            continue

        # Extract MFCC features from every file except the last one, which
        # is left out for testing.
        features_per_file = []
        for filename in wav_files[:-1]:
            filepath = os.path.join(subfolder, filename)
            sampling_freq, audio = wavfile.read(filepath)
            features_per_file.append(mfcc(audio, sampling_freq))

        # Stack the per-file feature rows into one training matrix.
        X = np.concatenate(features_per_file, axis=0)
        print('X.shape = {}'.format(X.shape))

        # Train and save HMM model
        hmm_trainer = HMMTrainer()
        hmm_trainer.train(X)
        hmm_models.append((hmm_trainer, label))

        input_files.append(os.path.join(subfolder, wav_files[-1]))

    # Classify input data
    for input_file in input_files:
        # Read input file
        sampling_freq, audio = wavfile.read(input_file)

        # Extract MFCC features
        mfcc_features = mfcc(audio, sampling_freq)

        # Pick the model with the highest score.
        max_score = None
        output_label = None
        for hmm_model, label in hmm_models:
            score = hmm_model.get_score(mfcc_features)
            # Explicit None check: comparing a number with None fails on Python 3.
            if max_score is None or score > max_score:
                max_score = score
                output_label = label

        # The true label is the name of the folder containing the file.
        true_label = os.path.basename(os.path.dirname(input_file))
        print("\nTrue: {}".format(true_label))
        print("Predicted: {}".format(output_label))