|
| 1 | +.. image:: _images/speechpy_logo.gif |
| 2 | + :target: https://github.com/astorfi/speech_feature_extraction/blob/master/images/speechpy_logo.gif |
| 3 | + |
| 4 | +========================== |
| 5 | +speech_feature_extraction |
| 6 | +========================== |
| 7 | + |
| 8 | +.. image:: https://circleci.com/gh/astorfi/speech_feature_extraction.svg?style=svg |
| 9 | + :target: https://circleci.com/gh/astorfi/speech_feature_extraction |
| 10 | +.. image:: https://travis-ci.org/astorfi/speech_feature_extraction.svg?branch=master |
| 11 | + :target: https://travis-ci.org/astorfi/speech_feature_extraction |
| 12 | +.. image:: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat |
| 13 | + :target: https://github.com/astorfi/speech_feature_extraction/issues |
| 14 | +.. image:: https://coveralls.io/repos/github/astorfi/speech_feature_extraction/badge.svg?branch=master |
| 15 | + :target: https://coveralls.io/github/astorfi/speech_feature_extraction?branch=master |
| 16 | +.. image:: https://codecov.io/gh/astorfi/speech_feature_extraction/branch/master/graph/badge.svg |
| 17 | + :target: https://codecov.io/gh/astorfi/speech_feature_extraction |
| 18 | +.. image:: https://badge.fury.io/py/speechpy.svg |
| 19 | + :target: https://badge.fury.io/py/speechpy |
| 20 | +.. image:: https://zenodo.org/badge/87262342.svg |
| 21 | + :target: https://zenodo.org/badge/latestdoi/87262342 |
| 22 | + |
| 23 | + |
| 24 | + |
| 25 | + |
| 26 | + |
| 27 | +This library provides the most frequently used speech features, including MFCCs and filterbank energies, alongside the log-energy of filterbanks. |
| 28 | +If you are interested in what MFCCs are and how they are generated, please refer to this |
| 29 | +`wiki <https://github.com/astorfi/speech_feature_extraction/wiki/>`_ page. |
| 30 | + |
| 31 | +.. image:: _images/speech.gif |
| 32 | + |
| 33 | +=============== |
| 34 | +How to Install? |
| 35 | +=============== |
| 36 | + |
| 37 | +There are two possible ways for installation of this package: local installation and PyPi. |
| 38 | + |
| 39 | +~~~~~~~~~~~~~~~~~~~ |
| 40 | +Local Installation |
| 41 | +~~~~~~~~~~~~~~~~~~~ |
| 42 | + |
| 43 | +For local installation at first the repository must be cloned:: |
| 44 | + |
| 45 | + git clone https://github.com/astorfi/speech_feature_extraction.git |
| 46 | + |
| 47 | +After cloning the repository, change to the repository directory and then execute:: |
| 48 | + |
| 49 | + python setup.py develop |
| 50 | + |
| 51 | +~~~~~ |
| 52 | +Pypi |
| 53 | +~~~~~ |
| 54 | + |
| 55 | +The package is available on PyPi. For direct installation simply execute the following: |
| 56 | + |
| 57 | +.. code-block:: shell |
| 58 | + |
| 59 | + pip install speechpy |
| 60 | + |
| 61 | +============================= |
| 62 | +What Features are supported? |
| 63 | +============================= |
| 64 | +- Mel Frequency Cepstral Coefficients(MFCCs) |
| 65 | +- Filterbank Energies |
| 66 | +- Log Filterbank Energies |
| 67 | + |
| 68 | +~~~~~~~~~~~~~~ |
| 69 | +MFCC Features |
| 70 | +~~~~~~~~~~~~~~ |
| 71 | + |
| 72 | +.. image:: _images/Speech_GIF.gif |
| 73 | + |
| 74 | +The supported attributes for generating MFCC features can be seen by investigating the related function: |
| 75 | + |
| 76 | +.. code-block:: python |
| 77 | + |
| 78 | + def mfcc(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,num_cepstral =13, |
| 79 | + num_filters=40, fft_length=512, low_frequency=0, high_frequency=None, dc_elimination=True): |
| 80 | + """Compute MFCC features from an audio signal. |
| 81 | + :param signal: the audio signal from which to compute features. Should be an N x 1 array |
| 82 | + :param sampling_frequency: the sampling frequency of the signal we are working with. |
| 83 | + :param frame_length: the length of each frame in seconds. Default is 0.020s |
| 84 | + :param frame_stride: the step between successive frames in seconds. Default is 0.01s (50% overlap with the default frame_length) |
| 85 | + :param num_filters: the number of filters in the filterbank, default 40. |
| 86 | + :param fft_length: number of FFT points. Default is 512. |
| 87 | + :param low_frequency: lowest band edge of mel filters. In Hz, default is 0. |
| 88 | + :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2 |
| 89 | + :param num_cepstral: Number of cepstral coefficients. |
| 90 | + :param dc_elimination: If the first dc component should be eliminated or not. |
| 91 | + :returns: A numpy array of size (num_frames x num_cepstral) containing mfcc features. |
| 92 | + """ |
| 93 | + |
| 94 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 95 | +Filterbank Energy Features |
| 96 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 97 | + |
| 98 | + |
| 99 | +.. code-block:: python |
| 100 | + |
| 101 | + def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, |
| 102 | + num_filters=40, fft_length=512, low_frequency=0, high_frequency=None): |
| 103 | + """Compute Mel-filterbank energy features from an audio signal. |
| 104 | + :param signal: the audio signal from which to compute features. Should be an N x 1 array |
| 105 | + :param sampling_frequency: the sampling frequency of the signal we are working with. |
| 106 | + :param frame_length: the length of each frame in seconds. Default is 0.020s |
| 107 | + :param frame_stride: the step between successive frames in seconds. Default is 0.01s (50% overlap with the default frame_length) |
| 108 | + :param num_filters: the number of filters in the filterbank, default 40. |
| 109 | + :param fft_length: number of FFT points. Default is 512. |
| 110 | + :param low_frequency: lowest band edge of mel filters. In Hz, default is 0. |
| 111 | + :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2 |
| 112 | + :returns: |
| 113 | + features: the energy of filterbank: num_frames x num_filters |
| 114 | + frame_energies: the energy of each frame: num_frames x 1 |
| 115 | + """ |
| 116 | + |
| 117 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 118 | +log - Filterbank Energy Features |
| 119 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 120 | + |
| 121 | +The attributes for ``log_filterbank energies`` are the same as those for ``filterbank energies``. |
| 122 | + |
| 123 | +.. code-block:: python |
| 124 | + |
| 125 | + def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, |
| 126 | + num_filters=40, fft_length=512, low_frequency=0, high_frequency=None): |
| 127 | + """Compute log Mel-filterbank energy features from an audio signal. |
| 128 | + :param signal: the audio signal from which to compute features. Should be an N x 1 array |
| 129 | + :param sampling_frequency: the sampling frequency of the signal we are working with. |
| 130 | + :param frame_length: the length of each frame in seconds. Default is 0.020s |
| 131 | + :param frame_stride: the step between successive frames in seconds. Default is 0.01s (50% overlap with the default frame_length) |
| 132 | + :param num_filters: the number of filters in the filterbank, default 40. |
| 133 | + :param fft_length: number of FFT points. Default is 512. |
| 134 | + :param low_frequency: lowest band edge of mel filters. In Hz, default is 0. |
| 135 | + :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2 |
| 136 | + :returns: |
| 137 | + features: the energy of filterbank: num_frames x num_filters |
| 138 | + frame_log_energies: the log energy of each frame: num_frames x 1 |
| 139 | + """ |
| 140 | + |
| 141 | +~~~~~~~~~~~~ |
| 142 | +Stack Frames |
| 143 | +~~~~~~~~~~~~ |
| 144 | + |
| 145 | +The ``stack_frames`` function generates the stack of frames from the signal. |
| 146 | + |
| 147 | +.. code-block:: python |
| 148 | + |
| 149 | + def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020, Filter=lambda x: numpy.ones((x,)), |
| 150 | + zero_padding=True): |
| 151 | + """Frame a signal into overlapping frames. |
| 152 | + :param sig: The audio signal to frame of size (N,). |
| 153 | + :param sampling_frequency: The sampling frequency of the signal. |
| 154 | + :param frame_length: The length of the frame in seconds. |
| 155 | + :param frame_stride: The stride between frames. |
| 156 | + :param Filter: The time-domain filter for applying to each frame. By default it is one so nothing will be changed. |
| 157 | + :param zero_padding: If the number of samples is not a multiple of frame_length (expressed in samples), zero padding will |
| 158 | + be applied to generate the last frame. |
| 159 | + :returns: Array of frames. size: number_of_frames x frame_len. |
| 160 | + """ |
| 161 | + |
| 162 | + |
| 163 | +~~~~~~~~~~~~ |
| 164 | +Test Example |
| 165 | +~~~~~~~~~~~~ |
| 166 | + |
| 167 | +The test example can be seen in ``test/test.py`` as below: |
| 168 | + |
| 169 | +.. code-block:: python |
| 170 | + |
| 171 | + import scipy.io.wavfile as wav |
| 172 | + import numpy as np |
| 173 | + import speechpy |
| 174 | + |
| 175 | + file_name = 'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav' |
| 176 | + fs, signal = wav.read(file_name) |
| 177 | + signal = signal[:,0] |
| 178 | + |
| 179 | + ############# Extract MFCC features ############# |
| 180 | + mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, |
| 181 | + num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) |
| 182 | + mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc) |
| 183 | + print('mfcc feature cube shape=', mfcc_feature_cube.shape) |
| 184 | + |
| 185 | + ############# Extract logenergy features ############# |
| 186 | + logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, |
| 187 | + num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) |
| 188 | + logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy) |
| 189 | + print('logenergy features=', logenergy.shape) |
| 190 | + |
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | + |
| 195 | +To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked frames collection. |
| 196 | + |
| 197 | +============= |
| 198 | +Dependencies |
| 199 | +============= |
| 200 | + |
| 201 | +The two packages ``Scipy`` and ``NumPy`` are the required dependencies; they will be installed automatically by running the ``setup.py`` file. |
| 202 | + |
| 203 | +~~~~~~~~~ |
| 204 | +Citation |
| 205 | +~~~~~~~~~ |
| 206 | + |
| 207 | +If you used this package, please cite it as follows: |
| 208 | + |
| 209 | +.. code:: bash |
| 210 | + |
| 211 | + @misc{amirsina_torfi_2017_810392, |
| 212 | + author = {Amirsina Torfi}, |
| 213 | + title = {astorfi/speech_feature_extraction: SpeechPy}, |
| 214 | + month = jun, |
| 215 | + year = 2017, |
| 216 | + doi = {10.5281/zenodo.810392}, |
| 217 | + url = {https://doi.org/10.5281/zenodo.810392}} |
0 commit comments