forked from ShreyaKhare/speechpy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeature.py
More file actions
executable file
·282 lines (232 loc) · 10.2 KB
/
feature.py
File metadata and controls
executable file
·282 lines (232 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""feature module.
This module provides functions for calculating the main speech
features that the package is aimed to extract as well as the required
elements.
Functions:
filterbanks: Compute the Mel-filterbanks
The filterbanks must be created for extracting
speech features such as MFCC.
mfcc: Extracting Mel Frequency Cepstral Coefficient feature.
mfe: Extracting Mel Energy feature.
lmfe: Extracting Log Mel Energy feature.
extract_derivative_feature: Extract the first and second derivative
features. This function directly uses the ``derivative_extraction``
function in the ``processing`` module.
"""
from __future__ import division
import numpy as np
from . import processing
from scipy.fftpack import dct
from . import functions
def filterbanks(
        num_filter,
        coefficients,
        sampling_freq,
        low_freq=None,
        high_freq=None):
    """Compute the Mel-filterbanks.

    Each filter is stored in one row; the columns correspond to FFT bins
    of a one-sided spectrum.

    Args:
        num_filter (int): the number of filters in the filterbank.
        coefficients (int): number of one-sided spectrum bins, i.e.
            (fftpoints//2 + 1), e.g. 257 for a 512-point FFT.
        sampling_freq (float): the samplerate of the signal we are working
            with. It affects mel spacing.
        low_freq (float): lowest band edge of mel filters in Hz. When
            ``None``, 300 Hz is used; an explicit value (including 0) is
            honoured as given.
        high_freq (float): highest band edge of mel filters in Hz.
            A falsy value (``None`` or 0) selects samplerate/2.

    Returns:
        array: A numpy array of size num_filter x coefficients holding
        the filterbank.

    Raises:
        AssertionError: if ``high_freq`` is greater than samplerate/2 or
            ``low_freq`` is negative.
    """
    high_freq = high_freq or sampling_freq / 2
    # Substitute the default only when nothing was passed: ``or`` would
    # also throw away an explicit lower edge of 0 Hz.
    if low_freq is None:
        low_freq = 300
    s = "High frequency cannot be greater than half of the sampling frequency!"
    assert high_freq <= sampling_freq / 2, s
    assert low_freq >= 0, "low frequency cannot be less than zero!"

    # Computing the Mel filterbank.
    # num_filter triangular filters need num_filter + 2 edge points,
    # equally spaced on the Mel scale.
    mels = np.linspace(
        functions.frequency_to_mel(low_freq),
        functions.frequency_to_mel(high_freq),
        num_filter + 2)

    # Convert Mels back to Hertz because the start and end-points
    # should be at the desired frequencies.
    hertz = functions.mel_to_frequency(mels)

    # Map each edge frequency onto an FFT bin. ``coefficients`` counts the
    # bins of a one-sided spectrum (0 .. Nyquist), so the matching FFT
    # length is 2 * (coefficients - 1); scaling by ``coefficients`` itself
    # would squeeze every filter into the lower half of the spectrum.
    fft_length = 2 * (coefficients - 1)
    freq_index = np.floor(
        (fft_length + 1) * hertz / sampling_freq).astype(int)

    # Initial definition
    filterbank = np.zeros([num_filter, coefficients])

    # The triangular function for each filter
    for i in range(num_filter):
        left, middle, right = (int(k) for k in freq_index[i:i + 3])
        z = np.linspace(left, right, num=right - left + 1)
        filterbank[i, left:right + 1] = functions.triangle(
            z, left=left, middle=middle, right=right)

    return filterbank
def mfcc(
        signal,
        sampling_frequency,
        frame_length=0.020,
        frame_stride=0.01,
        num_cepstral=13,
        num_filters=40,
        fft_length=512,
        low_frequency=0,
        high_frequency=None,
        dc_elimination=True):
    """Compute MFCC features from an audio signal.

    The Mel-filterbank energies returned by ``mfe`` are log-compressed and
    passed through an orthonormal type-2 DCT; the first ``num_cepstral``
    coefficients of every frame are kept.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s
        num_cepstral (int): Number of cepstral coefficients.
            Default is 13.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters.
            In Hz, default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is ``None``; it is forwarded to ``mfe`` unchanged.
        dc_elimination (bool): If True, the first cepstral coefficient
            of each frame is replaced by the log of the frame energy.

    Returns:
        array: A numpy array of size (num_frames x num_cepstral)
        containing mfcc features.
    """
    mel_energies, frame_energy = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency)

    # No frames at all: hand back an empty matrix of the right width.
    if not len(mel_energies):
        return np.empty((0, num_cepstral))

    log_mel = np.log(mel_energies)
    cepstra = dct(log_mel, type=2, axis=-1, norm='ortho')
    cepstra = cepstra[:, :num_cepstral]

    # Replace the first cepstral coefficient with the log of the frame
    # energy for DC elimination.
    if dc_elimination:
        cepstra[:, 0] = np.log(frame_energy)

    return cepstra
def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
        num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters.
            In Hz, default is 0. Forwarded to ``filterbanks``.
        high_frequency (float): highest band edge of mel filters.
            In Hz; a falsy value selects sampling_frequency / 2.

    Returns:
        tuple: ``(features, frame_energies)`` where ``features`` holds the
        filterbank energies (num_frames x num_filters) and
        ``frame_energies`` holds the summed power spectrum of each frame.
        Both are passed through ``functions.zero_handling`` before being
        returned.
    """
    def _all_ones(length):
        # Window of ones handed to ``stack_frames`` as its ``filter``.
        return np.ones((length,))

    # Convert to float
    samples = signal.astype(float)

    # Stack frames
    stacked = processing.stack_frames(
        samples,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        filter=_all_ones,
        zero_padding=False)

    # Fall back to half the sampling frequency when no upper edge is given.
    if not high_frequency:
        high_frequency = sampling_frequency / 2

    # Power spectrum of every frame; its column count is the number of
    # spectrum bins the filterbank must span.
    spectrum = processing.power_spectrum(stacked, fft_length)
    num_bins = spectrum.shape[1]

    # Total energy in each frame, with zero energies handled.
    frame_energies = functions.zero_handling(np.sum(spectrum, 1))

    # Build the filterbank and project the power spectrum onto it.
    bank = filterbanks(
        num_filters,
        num_bins,
        sampling_frequency,
        low_frequency,
        high_frequency)
    features = functions.zero_handling(np.dot(spectrum, bank.T))

    return features, frame_energies
def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute log Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters.
            In Hz, default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is ``None``; it is forwarded to ``mfe`` unchanged.

    Returns:
        array: The log energy of the filterbank, of size
        num_frames x num_filters. Only this array is returned; the
        per-frame energies computed by ``mfe`` are discarded.
    """
    # ``mfe`` also returns the per-frame energies, which are not needed here.
    feature, _ = mfe(signal,
                     sampling_frequency=sampling_frequency,
                     frame_length=frame_length,
                     frame_stride=frame_stride,
                     num_filters=num_filters,
                     fft_length=fft_length,
                     low_frequency=low_frequency,
                     high_frequency=high_frequency)
    return np.log(feature)
def extract_derivative_feature(feature):
    """Stack a feature matrix with its first and second derivatives.

    Both derivatives are obtained from
    ``processing.derivative_extraction`` with ``DeltaWindows=2``; the
    second derivative is computed from the first one.

    Args:
        feature (array): The feature vector which its size is: N x M

    Return:
        array: The feature cube vector which contains the static, first
        and second derivative features of size: N x M x 3
    """
    delta = processing.derivative_extraction(feature, DeltaWindows=2)
    delta_delta = processing.derivative_extraction(delta, DeltaWindows=2)

    # Creating the feature cube: a new third axis indexes
    # static / first derivative / second derivative.
    return np.stack((feature, delta, delta_delta), axis=2)