forked from astorfi/speechpy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeature.py
More file actions
executable file
·185 lines (144 loc) · 8.69 KB
/
feature.py
File metadata and controls
executable file
·185 lines (144 loc) · 8.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from __future__ import division
import numpy as np
from . import processing
from scipy.fftpack import dct
import math
from . import functions
def filterbanks(num_filter, fftpoints, sampling_freq, low_freq=None, high_freq=None):
    """Compute the Mel-filterbanks.

    Each filter is stored in one row; the columns correspond to FFT bins.

    Args:
        num_filter (int): the number of filters in the filterbank.
        fftpoints (int): the number of columns of the returned filterbank.
            It is also used, as ``fftpoints + 1``, to scale frequencies to
            bin indices.
        sampling_freq (float): the sample rate of the signal we are working
            with. Affects mel spacing.
        low_freq (float): lowest band edge of the mel filters in Hz. Falls
            back to 300 Hz only when omitted (``None``); an explicit ``0``
            is honoured.
        high_freq (float): highest band edge of the mel filters in Hz. Falls
            back to ``sampling_freq / 2`` when omitted (``None`` or ``0``).

    Returns:
        array: A numpy array of size num_filter x fftpoints holding the
        filterbank, one triangular filter per row.

    Raises:
        AssertionError: if ``high_freq`` is greater than
            ``sampling_freq / 2`` or ``low_freq`` is negative.
    """
    # A zero upper edge is not usable, so any falsy value selects the default.
    if not high_freq:
        high_freq = sampling_freq / 2
    # Test for None explicitly (not ``low_freq or 300``): 0 Hz is a valid
    # lower edge and must not be silently replaced by the 300 Hz fallback.
    if low_freq is None:
        low_freq = 300

    assert high_freq <= sampling_freq / 2, "High frequency cannot be greater than half of the sampling frequency!"
    assert low_freq >= 0, "low frequency cannot be less than zero!"

    # Band edges are equally spaced on the Mel scale. num_filter filters
    # need num_filter + 2 edge points (left, centre and right of each
    # triangle, with neighbouring triangles sharing points).
    mels = np.linspace(
        functions.frequency_to_mel(low_freq),
        functions.frequency_to_mel(high_freq),
        num_filter + 2,
    )

    # Convert the edge points back to Hertz so they can be mapped to bins.
    hertz = functions.mel_to_frequency(mels)

    # Snap every edge frequency down to an integer bin index.
    # NOTE(review): bins are scaled by (fftpoints + 1) while the bank is
    # allocated with `fftpoints` columns, and mfe() passes the power-spectrum
    # column count here. Confirm which meaning of `fftpoints` this scaling
    # was written for.
    freq_index = (np.floor((fftpoints + 1) * hertz / sampling_freq)).astype(int)

    filterbank = np.zeros([num_filter, fftpoints])

    # Fill each row with a triangle spanning three consecutive edge bins.
    for i in range(num_filter):
        left = int(freq_index[i])
        middle = int(freq_index[i + 1])
        right = int(freq_index[i + 2])
        # One sample position per bin in [left, right], inclusive.
        z = np.linspace(left, right, num=right - left + 1)
        filterbank[i, left:right + 1] = functions.triangle(z, left=left, middle=middle, right=right)

    return filterbank
def mfcc(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, num_cepstral=13,
         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None, dc_elimination=True):
    """Compute MFCC features from an audio signal.

    The Mel-filterbank energies returned by ``mfe`` are log-compressed and
    passed through an orthonormal type-2 DCT along the filter axis; the first
    ``num_cepstral`` coefficients are kept.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal we
            are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s.
        num_cepstral (int): number of cepstral coefficients. Default is 13.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters. In Hz,
            default is 0.
        high_frequency (float): highest band edge of mel filters. In Hz,
            default is None, which is forwarded unchanged to ``mfe``.
        dc_elimination (bool): if True, the first cepstral coefficient is
            replaced with the log of the frame energy.

    Returns:
        array: A numpy array of size (num_frames x num_cepstral) containing
        mfcc features. When no frame is produced, an empty array of shape
        (0, num_cepstral) is returned.
    """
    mel_energies, frame_energies = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency,
    )

    # Nothing to transform: hand back an empty matrix of the right width.
    if len(mel_energies) == 0:
        return np.empty((0, num_cepstral))

    log_mel = np.log(mel_energies)
    cepstra = dct(log_mel, type=2, axis=-1, norm='ortho')
    cepstra = cepstra[:, :num_cepstral]

    # Swap the first coefficient for the log frame energy when requested.
    if dc_elimination:
        cepstra[:, 0] = np.log(frame_energies)

    return cepstra
def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
        num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal we
            are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters. In Hz,
            default is 0.
        high_frequency (float): highest band edge of mel filters. In Hz;
            a falsy value (None or 0) selects sampling_frequency / 2.

    Returns:
        tuple: ``(features, frame_energies)`` where
        features is the energy of the filterbank, num_frames x num_filters,
        and frame_energies is the energy of each frame, num_frames x 1.
    """

    def rectangular_window(length):
        # All-ones window: frames are stacked without any tapering.
        return np.ones((length,))

    # Work on a float copy of the samples.
    samples = signal.astype(float)

    # Cut the signal into frames, without zero padding.
    stacked = processing.stack_frames(
        samples,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        filter=rectangular_window,
        zero_padding=False,
    )

    # Resolve the upper band edge.
    if not high_frequency:
        high_frequency = sampling_frequency / 2

    # Power spectrum of every frame; its column count sizes the filterbank.
    spectrum = processing.power_spectrum(stacked, fft_length)

    # Total energy per frame, then zero handling
    # (presumably so a later log is safe -- confirm in functions.zero_handling).
    frame_energies = functions.zero_handling(np.sum(spectrum, 1))

    # Build the Mel filterbank and project the power spectrum onto it.
    bank = filterbanks(num_filters, spectrum.shape[1], sampling_frequency, low_frequency, high_frequency)
    features = functions.zero_handling(np.dot(spectrum, bank.T))

    return features, frame_energies
def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute log Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal we
            are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters. In Hz,
            default is 0.
        high_frequency (float): highest band edge of mel filters. In Hz,
            default is None, which is forwarded unchanged to ``mfe``.

    Returns:
        array: the natural log of the filterbank energies,
        num_frames x num_filters. The per-frame energies that ``mfe`` also
        produces are discarded and not returned.
    """
    mel_energies, _ = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency,
    )
    return np.log(mel_energies)
def extract_derivative_feature(feature):
    """Extract temporal derivative features (first and second derivatives).

    Args:
        feature (array): The feature vector, of size N x M.

    Return:
        array: The feature cube which contains the static, first and second
        derivative features, of size N x M x 3.
    """
    # The second derivative is the derivative of the first derivative.
    delta = processing.derivative_extraction(feature, DeltaWindows=2)
    delta_delta = processing.derivative_extraction(delta, DeltaWindows=2)

    # Creating the feature cube: static, delta and delta-delta are laid out
    # along a new third axis.
    return np.stack((feature, delta, delta_delta), axis=2)