具体操作
具体操作#
利用librosa读取音频#
from matplotlib import pyplot as plt
import numpy as np
import librosa
# Load the audio with librosa (returns the samples and the sampling rate).
input_wav_path = r'test.wav'
y, sr = librosa.load(input_wav_path)
y_num = np.arange(len(y))

# Keep only the first 0.3 s of the signal for the zoomed-in view.
sample_signal = y[:int(sr * 0.3)]
sample_num = np.arange(len(sample_signal))

# Two stacked panels: the whole waveform on top, the 0.3 s excerpt below.
fig, (ax_full, ax_zoom) = plt.subplots(2, 1, figsize=(11, 7), dpi=500)

# Top panel: full signal in black with the excerpt drawn over it in blue.
ax_full.plot(y_num / sr, y, color='black')
ax_full.plot(sample_num / sr, sample_signal, color='blue')
ax_full.set_xlabel('Time (sec)')
ax_full.set_ylabel('Amplitude')
ax_full.set_title('Waveform')

# Bottom panel: the excerpt alone.
ax_zoom.plot(sample_num / sr, sample_signal, color='blue')
ax_zoom.set_xlabel('Time (sec)')
ax_zoom.set_ylabel('Amplitude')
ax_zoom.set_title('0~0.3s waveform')

plt.tight_layout()
plt.savefig('waveform.png', dpi=500)
plt.show()
音频有不同的编码类型,librosa默认采取浮点格式读取,即读取的样本点均是
\([-1,1]\)
之间的浮点值。更详细的文档参见SoX的Input & Output File Format Options
部分。
选项 描述 常见可选项
- b 每个编码样本所占的数据位数(位深) 8/16/32
- c 音频文件包含的通道数 1/2
- e 音频文件的编码类型 signed-integer/unsigned-integer/floating-point
- r 音频文件的采样率 16k/16000/22050
- t 音频文件的文件类型 raw/mp3
-
SoX音频格式
提取梅尔频谱#
# Analysis configuration for the mel spectrogram.
sample_rate = 16000  # Hz
preemphasis = 0.97  # coefficient of the first-order pre-emphasis filter
n_fft = 1024
# Both durations are in SECONDS: they are multiplied by sample_rate (Hz)
# below to obtain sample counts (0.05 s -> 800 samples, 0.01 s -> 160).
frame_length = 0.05  # s
frame_shift = 0.01  # s
fmin = 0
fmax = sample_rate / 2  # upper band edge: half the sampling rate
eps = 1e-10  # floor applied before the logarithm to avoid log10(0)
n_mel = 80
win_length = int(sample_rate * frame_length)
hop_length = int(sample_rate * frame_shift)
# Keyword arguments are used because recent librosa releases no longer
# accept sr / n_fft / n_mels positionally.
mel_basis = librosa.filters.mel(
    sr=sample_rate, n_fft=n_fft, n_mels=n_mel, fmin=fmin, fmax=fmax)
def get_spectrogram(input_wav_path):
    """Compute a log-mel magnitude spectrogram for an audio file.

    The signal is loaded, pre-emphasised, transformed with an STFT and
    projected onto the module-level ``mel_basis``; the result is floored at
    ``eps`` and converted with ``log10``.

    Args:
        input_wav_path: Path of the audio file to analyse.

    Returns:
        A ``float32`` array of shape ``(T, n_mels)`` (frames first).
    """
    # Load at the configured sample_rate: mel_basis, win_length and
    # hop_length are all derived from it, so letting librosa fall back to
    # its own default rate would misalign the filterbank and frame timing.
    y, _ = librosa.load(input_wav_path, sr=sample_rate)
    # Pre-emphasis: y[t] - preemphasis * y[t-1], first sample kept as is.
    y = np.append(y[0], y[1:] - preemphasis * y[:-1])
    linear = librosa.stft(
        y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(linear)
    mel = np.dot(mel_basis, mag)
    # Floor before the logarithm so silent bins do not produce -inf.
    mel = np.log10(np.maximum(eps, mel))
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    return mel
# plt.switch_backend('agg')
def plot_spectrogram(spectrogram, file_path):
    """Draw a spectrogram as an image, save it to ``file_path`` and show it.

    Args:
        spectrogram: 2-D array; it is transposed before drawing, so its
            first axis ends up on the horizontal ("frames") axis.
        file_path: Destination path of the saved figure.
    """
    image = spectrogram.T
    fig, ax = plt.subplots(figsize=(16, 9))
    # origin='lower' puts row 0 of the image at the bottom of the plot.
    drawn = ax.imshow(image, aspect='auto', origin='lower')
    fig.colorbar(drawn, ax=ax)
    ax.set_xlabel('frames')
    plt.tight_layout()
    plt.savefig(file_path, dpi=500)
    plt.show()
# Extract the log-mel spectrogram of the example file and plot it.
mel_spec = get_spectrogram(input_wav_path=input_wav_path)
plot_spectrogram(spectrogram=mel_spec, file_path='mel_spectrogram.png')
提取MFCC#
from scipy.fftpack import dct
# MFCC order, i.e. how many cepstral coefficients are kept (suggested 2-13).
num_ceps = 12
# Orthonormal type-II DCT along the mel axis of the (T, n_mels) features.
dct_coeffs = dct(mel_spec, type=2, axis=1, norm='ortho')
# Skip coefficient 0 and keep coefficients 1..num_ceps.
mfcc = dct_coeffs[:, 1:num_ceps + 1]
plot_spectrogram(mfcc, 'mfcc.png')
# Apply sinusoidal liftering to the MFCCs to de-emphasise the higher
# coefficients; this has been reported to improve speech recognition on
# noisy signals.
# reference: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
nframes, ncoeff = mfcc.shape
cep_lifter = 22
n = np.arange(ncoeff)
lift = 1 + 0.5 * cep_lifter * np.sin(np.pi * n / cep_lifter)
# Scale in place so mfcc keeps its dtype and underlying buffer.
np.multiply(mfcc, lift, out=mfcc)
plot_spectrogram(mfcc, 'mfcc_lift.png')
在语音合成中,类似于深度学习其它领域,输入数据要进行均值方差归一化,使得数据量纲一致并遵循一定分布,避免模型梯度爆炸,降低学习难度。
# Per-dimension mean and standard deviation over all frames of mel_spec,
# for mean/variance normalisation of the features.
frame_num = mel_spec.shape[0]
cep_sum = np.sum(mel_spec, axis=0)
cep_squ_sum = np.sum(np.square(mel_spec), axis=0)
cep_mean = cep_sum / frame_num
# E[x^2] - E[x]^2 is the VARIANCE; the standard deviation is its square
# root. Clamp at zero first because rounding can make it slightly negative.
cep_var = np.maximum(cep_squ_sum / frame_num - np.square(cep_mean), 0.0)
cep_std = np.sqrt(cep_var)
最后更新:
2022-04-25