# ----------------------------------------------------------------------
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2018, Numenta, Inc. Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# following terms and conditions apply:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero Public License for more details.
#
# You should have received a copy of the GNU Affero Public License
# along with this program. If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/
# ----------------------------------------------------------------------
"""
Audio transforms for the Google Speech Commands dataset, adapted from
https://github.com/tugstugi/pytorch-speech-commands.
"""
import random
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset
class LoadAudio(object):
    """Load an audio file into a numpy array.

    :param sample_rate: sample rate handed to ``librosa.load``; it is also the
        number of zero samples produced for the silence case.
    """

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def __call__(self, data):
        """Fill in ``data["samples"]`` and ``data["sample_rate"]``.

        :param data: dict with a ``"path"`` entry; a falsy path stands for
            silence.
        :return: the same ``data`` dict, updated in place.
        """
        path = data["path"]
        if path:
            # ``sr`` is passed by keyword: recent librosa releases only accept
            # the path positionally.
            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
        else:
            # silence: ``sample_rate`` zeros, i.e. one second of audio
            sample_rate = self.sample_rate
            samples = np.zeros(sample_rate, dtype=np.float32)
        data["samples"] = samples
        data["sample_rate"] = sample_rate
        return data
class FixAudioLength(object):
    """Pad with zeros or truncate the audio to a fixed duration.

    :param time: target duration in seconds.
    """

    def __init__(self, time=1):
        self.time = time

    def __call__(self, data):
        """Force ``data["samples"]`` to ``int(time * sample_rate)`` samples.

        :param data: dict holding ``"samples"`` and ``"sample_rate"``.
        :return: the same ``data`` dict; ``"samples"`` is only replaced when
            its length differs from the target.
        """
        audio = data["samples"]
        target = int(self.time * data["sample_rate"])
        current = len(audio)
        if current > target:
            # too long: keep the leading part
            data["samples"] = audio[:target]
        elif current < target:
            # too short: append zeros at the end
            data["samples"] = np.pad(audio, (0, target - current), "constant")
        return data
class ChangeAmplitude(object):
    """Randomly scale the amplitude of the audio.

    :param amplitude_range: ``(low, high)`` bounds of the random gain.
    """

    def __init__(self, amplitude_range=(0.7, 1.1)):
        self.amplitude_range = amplitude_range

    def __call__(self, data):
        """Multiply ``data["samples"]`` by a gain drawn uniformly from
        ``amplitude_range``.

        The transform is skipped whenever ``should_apply_transform()`` (a
        helper defined outside this class) returns a falsy value.

        :return: the same ``data`` dict.
        """
        if should_apply_transform():
            samples = data["samples"]
            gain = random.uniform(*self.amplitude_range)
            data["samples"] = samples * gain
        return data
class AddNoise(object):
    """Blend uniform random noise into the samples.

    ``A' = A * (1 - alpha) + alpha * noise`` where ``noise`` is drawn
    uniformly from ``[-max_val, max_val]``.

    :param alpha: weight of the noise in the blend; ``0.0`` leaves the signal
        untouched.
    :param max_val: bound of the noise amplitude.
    """

    def __init__(self, alpha=0.0, max_val=1.0):
        self.alpha = alpha
        self.max_val = max_val

    def __call__(self, data):
        """Replace ``data["samples"]`` with the noisy blend.

        Unlike the other augmentations in this module this one is applied on
        every call.

        :return: the same ``data`` dict.
        """
        signal = data["samples"]
        bound = self.max_val
        # one noise value per sample
        noise = np.random.uniform(-bound, bound, signal.size)
        weight = self.alpha
        data["samples"] = signal * (1 - weight) + noise * weight
        return data
class ChangeSpeedAndPitchAudio(object):
    """Randomly change the speed of the audio.

    The samples are resampled by linear interpolation, so the pitch changes
    together with the speed.

    :param max_scale: the random scale is drawn from
        ``[-max_scale, max_scale]``.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Resample ``data["samples"]`` at a random step of ``1 / (1 + scale)``.

        The number of samples changes; the result is cast to ``float32``.
        Skipped when ``should_apply_transform()`` (defined outside this class)
        returns a falsy value.

        :return: the same ``data`` dict.
        """
        if should_apply_transform():
            samples = data["samples"]
            scale = random.uniform(-self.max_scale, self.max_scale)
            step = 1.0 / (1 + scale)
            count = len(samples)
            # positions at which the original signal is re-read
            query_positions = np.arange(0, count, step)
            source_positions = np.arange(0, count)
            resampled = np.interp(query_positions, source_positions, samples)
            data["samples"] = resampled.astype(np.float32)
        return data
class StretchAudio(object):
    """Randomly time-stretch the audio with ``librosa.effects.time_stretch``.

    :param max_scale: the stretch rate is ``1 + scale`` with ``scale`` drawn
        uniformly from ``[-max_scale, max_scale]``.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Replace ``data["samples"]`` with a time-stretched version.

        Skipped when ``should_apply_transform()`` (defined outside this class)
        returns a falsy value.

        :return: the same ``data`` dict.
        """
        if not should_apply_transform():
            return data
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is passed by keyword: recent librosa releases only accept
        # the signal positionally.
        data["samples"] = librosa.effects.time_stretch(
            data["samples"], rate=1 + scale
        )
        return data
class TimeshiftAudio(object):
    """Randomly shift the audio in time, filling the gap with zeros.

    :param max_shift_seconds: largest shift, in seconds, in either direction.
    """

    def __init__(self, max_shift_seconds=0.2):
        self.max_shift_seconds = max_shift_seconds

    def __call__(self, data):
        """Shift ``data["samples"]`` by a random number of samples.

        The length of the signal is preserved. A positive shift drops the
        first ``shift`` samples and zero-pads the end; a negative shift
        zero-pads the start and drops the tail. Skipped when
        ``should_apply_transform()`` (defined outside this class) returns a
        falsy value.

        :return: the same ``data`` dict.
        """
        if not should_apply_transform():
            return data
        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # ``random.randint`` requires integers, while the product is a float
        # as soon as ``max_shift_seconds`` is one; truncate explicitly.
        max_shift = int(sample_rate * self.max_shift_seconds)
        shift = random.randint(-max_shift, max_shift)
        pad_front = -min(0, shift)
        pad_back = max(0, shift)
        padded = np.pad(samples, (pad_front, pad_back), "constant")
        if pad_front:
            data["samples"] = padded[: len(padded) - pad_front]
        else:
            data["samples"] = padded[pad_back:]
        return data
class AddBackgroundNoise(Dataset):
    """Mix a randomly chosen background noise into the audio.

    :param bg_dataset: collection of dicts with a ``"samples"`` entry; it is
        sampled with ``random.choice``.
    :param max_percentage: upper bound of the random noise weight.
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Blend ``data["samples"]`` with one background item.

        The result is ``samples * (1 - p) + noise * p`` with ``p`` drawn
        uniformly from ``[0, max_percentage]``. Skipped when
        ``should_apply_transform()`` (defined outside this class) returns a
        falsy value.

        :return: the same ``data`` dict.
        """
        if should_apply_transform():
            clean = data["samples"]
            background = random.choice(self.bg_dataset)["samples"]
            weight = random.uniform(0, self.max_percentage)
            data["samples"] = clean * (1 - weight) + background * weight
        return data
class ToMelSpectrogram(object):
    """Create the mel spectrogram (in dB) from the audio samples.

    The result has ``n_mels`` rows; the number of columns depends on the
    audio length.
    NOTE(review): the original comment promised a 32x32 matrix, presumably for
    one second of 16 kHz audio with librosa's default hop length - confirm.

    :param n_mels: number of mel bands.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram in ``data["mel_spectrogram"]``.

        :param data: dict holding ``"samples"`` and ``"sample_rate"``.
        :return: the same ``data`` dict.
        """
        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # ``y`` is passed by keyword: recent librosa releases made every
        # argument of ``melspectrogram`` keyword-only.
        s = librosa.feature.melspectrogram(
            y=samples, sr=sample_rate, n_mels=self.n_mels
        )
        # dB values are relative to the strongest bin of this spectrogram
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data
class ToTensor(object):
    """Convert one entry of the data dict into a float tensor.

    :param np_name: key of the value to convert.
    :param tensor_name: key under which the tensor is stored.
    :param normalize: optional ``(mean, std)`` pair; when given the tensor
        becomes ``(tensor - mean) / std``.
    """

    def __init__(self, np_name, tensor_name, normalize=None):
        self.np_name = np_name
        self.tensor_name = tensor_name
        self.normalize = normalize

    def __call__(self, data):
        """Store ``torch.FloatTensor(data[np_name])`` in ``data[tensor_name]``.

        :return: the same ``data`` dict.
        """
        converted = torch.FloatTensor(data[self.np_name])
        stats = self.normalize
        if stats is not None:
            mean, std = stats
            # in-place normalisation of the freshly built tensor
            converted -= mean
            converted /= std
        data[self.tensor_name] = converted
        return data
class ToSTFT(object):
    """Apply the short time Fourier transform to the audio.

    :param n_fft: FFT size forwarded to ``librosa.stft``.
    :param hop_length: hop length forwarded to ``librosa.stft``.
    """

    def __init__(self, n_fft=2048, hop_length=512):
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, data):
        """Compute the STFT of ``data["samples"]``.

        Besides ``"stft"`` the dict receives ``"n_fft"``, ``"hop_length"`` and
        ``"stft_shape"`` (the shape of the fresh STFT) so that later
        transforms can refer to them.

        :return: the same ``data`` dict.
        """
        n_fft, hop_length = self.n_fft, self.hop_length
        samples = data["samples"]
        data["n_fft"] = n_fft
        data["hop_length"] = hop_length
        spectrum = librosa.stft(samples, n_fft=n_fft, hop_length=hop_length)
        data["stft"] = spectrum
        data["stft_shape"] = spectrum.shape
        return data
class StretchAudioOnSTFT(object):
    """Randomly time-stretch the audio in the frequency domain.

    :param max_scale: the stretch rate is ``1 + scale`` with ``scale`` drawn
        uniformly from ``[-max_scale, max_scale]``.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Run librosa's phase vocoder on ``data["stft"]``.

        Uses ``data["hop_length"]`` as the hop length. Skipped when
        ``should_apply_transform()`` (defined outside this class) returns a
        falsy value.

        :return: the same ``data`` dict.
        """
        if not should_apply_transform():
            return data
        stft = data["stft"]
        hop_length = data["hop_length"]
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is passed by keyword: recent librosa releases only accept
        # the STFT matrix positionally.
        stft_stretch = librosa.core.phase_vocoder(
            stft, rate=1 + scale, hop_length=hop_length
        )
        data["stft"] = stft_stretch
        return data
class TimeshiftAudioOnSTFT(object):
    """
    A simple timeshift on the frequency domain without multiplying with exp.

    :param max_shift: largest shift, in STFT columns, in either direction.
    """

    def __init__(self, max_shift=8):
        self.max_shift = max_shift

    def __call__(self, data):
        """Shift ``data["stft"]`` along axis 1 by a random number of columns.

        The column count is preserved: the gap is zero-filled and the columns
        pushed out on the other side are dropped. Skipped when
        ``should_apply_transform()`` (defined outside this class) returns a
        falsy value.

        :return: the same ``data`` dict.
        """
        if should_apply_transform():
            spectrum = data["stft"]
            shift = random.randint(-self.max_shift, self.max_shift)
            left = -shift if shift < 0 else 0
            right = shift if shift > 0 else 0
            padded = np.pad(spectrum, ((0, 0), (left, right)), "constant")
            # drop as many columns as were padded, on the opposite side
            data["stft"] = padded[:, :-left] if left else padded[:, right:]
        return data
class AddBackgroundNoiseOnSTFT(Dataset):
    """Mix a randomly chosen background noise in the frequency domain.

    :param bg_dataset: collection of dicts with a ``"stft"`` entry; it is
        sampled with ``random.choice``.
    :param max_percentage: upper bound of the random noise weight.
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Blend ``data["stft"]`` with the STFT of one background item.

        The result is ``stft * (1 - p) + noise * p`` with ``p`` drawn
        uniformly from ``[0, max_percentage]``. Skipped when
        ``should_apply_transform()`` (defined outside this class) returns a
        falsy value.

        :return: the same ``data`` dict.
        """
        if should_apply_transform():
            background = random.choice(self.bg_dataset)["stft"]
            weight = random.uniform(0, self.max_percentage)
            data["stft"] = data["stft"] * (1 - weight) + background * weight
        return data
class FixSTFTDimension(object):
    """
    Restore the original number of STFT columns (time axis) by truncating or
    zero-padding, for use after stretching, time shifting etc.
    """

    def __call__(self, data):
        """Make ``data["stft"]`` as wide as ``data["stft_shape"][1]``.

        :return: the same ``data`` dict.
        """
        spectrum = data["stft"]
        frames = spectrum.shape[1]
        wanted = data["stft_shape"][1]
        missing = wanted - frames
        if missing < 0:
            # too many columns: keep the leading ones
            spectrum = spectrum[:, :wanted]
        elif missing > 0:
            # too few columns: append zero columns
            spectrum = np.pad(spectrum, ((0, 0), (0, missing)), "constant")
        data["stft"] = spectrum
        return data
class ToMelSpectrogramFromSTFT(object):
    """Create the mel spectrogram (in dB) from the short time Fourier
    transform of a file.

    The result has ``n_mels`` rows and one column per STFT column.
    NOTE(review): the original comment promised a 32x32 matrix, presumably for
    one second of 16 kHz audio with the default STFT settings - confirm.

    :param n_mels: number of mel bands.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram in ``data["mel_spectrogram"]``.

        :param data: dict holding ``"stft"``, ``"sample_rate"`` and ``"n_fft"``.
        :return: the same ``data`` dict.
        """
        stft = data["stft"]
        sample_rate = data["sample_rate"]
        n_fft = data["n_fft"]
        # Arguments are passed by keyword: recent librosa releases made every
        # parameter of ``filters.mel`` keyword-only.
        mel_basis = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels
        )
        # project the power spectrum onto the mel filter bank
        s = np.dot(mel_basis, np.abs(stft) ** 2.0)
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data
class DeleteSTFT(object):
    """
    Remove the STFT from the data dict.

    Pytorch doesn't like complex numbers, so use this transform once the mel
    spectrogram has been computed.
    """

    def __call__(self, data):
        """Drop the ``"stft"`` entry; raises ``KeyError`` when it is missing.

        :return: the same ``data`` dict.
        """
        data.pop("stft")
        return data
class AudioFromSTFT(object):
    """Inverse short time Fourier transform."""

    def __call__(self, data):
        """Store the reconstructed audio in ``data["istft_samples"]``.

        The output dtype follows ``data["samples"]``. When the dict carries a
        ``"hop_length"`` entry it is forwarded to ``istft`` so the inverse
        uses the same hop as the forward transform; otherwise librosa's
        default is used.

        :param data: dict holding ``"stft"`` and ``"samples"``.
        :return: the same ``data`` dict.
        """
        stft = data["stft"]
        data["istft_samples"] = librosa.core.istft(
            stft,
            # ``None`` (key absent) makes librosa fall back to its default
            hop_length=data.get("hop_length"),
            dtype=data["samples"].dtype,
        )
        return data
class Unsqueeze(object):
    """Extract one tensor from the data dict, adding a leading dimension for
    the model types that need it.

    :param tensor_name: key of the tensor to extract.
    :param model_type: a leading dimension is added when this is
        ``"resnet9"`` or ``"cnn"``.
    """

    def __init__(self, tensor_name, model_type):
        self.model_type = model_type
        self.tensor_name = tensor_name

    def __call__(self, data):
        """Return ``data[tensor_name]`` itself, not the dict.

        :return: the tensor, with a new dimension 0 for the model types
            listed above and unchanged otherwise.
        """
        tensor = data[self.tensor_name]
        if self.model_type in ("resnet9", "cnn"):
            return torch.unsqueeze(tensor, 0)
        return tensor