Reputation: 63
My goal is to create a program in Octave that loads an audio file (wav, flac), calculates its MFCC features, and returns them as output. The problem is that I do not have much experience with Octave and cannot get Octave to load the audio file, which is why I am not sure whether the extraction algorithm is correct. Is there a simple way of loading the file and getting its features?
Upvotes: 4
Views: 1804
Reputation: 21
Check out Octave functions for calculating MFCC at https://github.com/jagdish7908/mfcc-octave
For the detailed theory behind each step of the MFCC computation, refer to http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
function frame = create_frames(y, Fs, Fsize, Fstep)
  % CREATE_FRAMES  Split a signal into fixed-length, overlapping frames.
  %
  %   y     : signal samples (any vector; it is flattened to a column)
  %   Fs    : sampling rate in samples per second
  %   Fsize : frame length in seconds
  %   Fstep : distance between successive frame starts in seconds
  %
  %   frame : samplesPerFrame-by-numFrames matrix holding one frame per
  %           column; it has zero columns when the signal is shorter than
  %           a single frame.
  y = y(:);
  N = length(y);
  samplesPerFrame = floor(Fs*Fsize);
  samplesPerFramestep = floor(Fs*Fstep);
  % a zero-sample hop would never advance through the signal
  if samplesPerFrame < 1 || samplesPerFramestep < 1
    error('create_frames: frame size and frame step must each span at least one sample');
  endif
  % the last admissible start is N-samplesPerFrame+1, so a frame that ends
  % exactly on the final sample is kept
  starts = 1:samplesPerFramestep:(N-samplesPerFrame+1);
  numFrames = numel(starts);
  % allocate once instead of growing the matrix on every iteration
  frame = zeros(samplesPerFrame, numFrames);
  for c = 1:numFrames
    first = starts(c);
    frame(:,c) = y(first:(first+samplesPerFrame-1));
  endfor
endfunction
function mel = hz2mel(f)
  % HZ2MEL  Convert frequency in hertz to the mel scale.
  %   Operates element-wise on f: mel = 1125 * ln(1 + f/700).
  ratio = 1+f/700;
  mel = 1125*log(ratio);
endfunction
function hz = mel2hz(f)
  % MEL2HZ  Convert a mel-scale value back to frequency in hertz.
  %   Operates element-wise on f: hz = 700 * (exp(f/1125) - 1).
  growth = exp(f/1125);
  hz = 700*(growth - 1);
endfunction
function bank = melbank(n, min, max, sr)
  % MELBANK  Build a bank of triangular filters spaced evenly on the mel scale.
  %
  %   n   : number of filters
  %   min : lower edge of the bank in hertz
  %   max : upper edge of the bank in hertz
  %   sr  : sampling rate in hertz
  %
  %   bank : NFFT-by-(n+1) matrix with NFFT = 512 rows (spectrum bins).
  %          Filter j is stored in column j+1; column 1 is always all
  %          zeros. That layout is kept on purpose because existing
  %          callers skip the first column.
  %
  % NOTE: the parameters min and max shadow the built-in functions of the
  % same name inside this function, so the clamping below uses plain ifs.
  NFFT = 512;
  % bin positions of the lower and upper cutoff
  minBin = floor((NFFT)*min/(sr/2));
  maxBin = floor((NFFT)*max/(sr/2));
  % rows are 1-based and there are only NFFT of them: keep k inside 1..NFFT
  % (min = 0 Hz used to index row 0, max above sr/2 used to grow the matrix)
  if minBin < 1
    minBin = 1;
  endif
  if maxBin > NFFT
    maxBin = NFFT;
  endif
  % n filters need n+2 edge points, evenly spaced in mel
  min_mel = hz2mel(min);
  max_mel = hz2mel(max);
  % linspace always yields exactly n+2 points, unlike a float-step range
  m = linspace(min_mel, max_mel, n+2);
  h = mel2hz(m);
  % replace the edge frequencies in h with their respective bin values
  fbin = floor((NFFT)*h/(sr/2));
  % create the triangular filter vectors
  H = zeros(NFFT, n+1);
  for vect = 2:n+1
    lo = fbin(vect-1);
    mid = fbin(vect);
    hi = fbin(vect+1);
    for k = minBin:maxBin
      if k >= lo && k <= mid
        if mid > lo
          % rising edge: 0 at lo, 1 at mid
          H(k,vect) = (k-lo)/(mid-lo);
        else
          % lo and mid fell into the same bin; this bin is the peak
          % (the plain formula would give 0/0 = NaN here)
          H(k,vect) = 1;
        endif
      elseif k >= mid && k <= hi
        % falling edge: 1 at mid, 0 at hi (k > mid here, so hi > mid)
        H(k,vect) = (hi - k)/(hi-mid);
      endif
    endfor
  endfor
  bank = H;
endfunction
% Driver script: compute MFCC-style coefficients for every frame of a
% signal and plot them.
% NOTE(review): `clear all` also removes functions from the symbol table;
% if the helper functions are defined in this same file rather than in
% their own .m files, confirm they are still available afterwards.
clc;
clear all;
close all;
pkg load signal;

% record audio (3 seconds from the default input device)
Fs = 44100;
y = record(3,44100);
% OR %
% Load existing file (audioread reads wav and flac)
%[y, Fs] = audioread('../FILE_PATH/');
%y = y(44100:2*44100);

% create mel filterbanks
minFreq = 500; % minimum cutoff frequency in Hz
maxFreq = 10000; % maximum cutoff frequency in Hz
% melbank(number_of_banks, minFreq, maxFreq, sampling_rate)
foo = melbank(30,minFreq,maxFreq,Fs);

% create frames: 25 ms long, a new frame every 10 ms, one frame per column
frames = create_frames(y, Fs, 0.025, 0.010);
NF = size(frames, 2);
if NF == 0
  error('mfcc: the signal is shorter than one frame');
endif

% take the spectrum size from the filter bank instead of repeating 512/1024
numBins = size(foo, 1);   % spectrum bins covered by the filter bank
nfft = 2*numBins;         % FFT length requested from periodogram
L = size(foo, 2);         % filter-bank columns, including the empty first one

% one row of coefficients per frame, allocated up front
PXX = zeros(NF, L-1);
for k = 1:NF
  % periodogram (power spectrum estimate) of this frame
  P = periodogram(frames(:,k), [], nfft, Fs);
  % apply mel filters to the power spectra
  P = foo.*P(1:numBins);
  % sum the energy in each filter and take the logarithm
  P = log(sum(P));
  % the first filter-bank column is all zeros, so its log energy is -Inf:
  % drop it, then take the DCT of the remaining log filterbank energies
  PXX(k,:) = dct(P(2:L));
endfor

% stack the coefficients column wise (one column per frame)
PXX = PXX';
plot(PXX);
Upvotes: 2
Reputation: 25220
You can run the MFCC code from RASTAMAT in Octave; you only need to fix a few things. The fixed version is available for download here.
The changes are to set the window properly in powspec.m:
WINDOW = hanning(winpts);
and to fix a bug in the specgram function, which is not compatible with MATLAB.
Upvotes: 3