Source code for nupic.research.frameworks.pytorch.audio_transforms

# ----------------------------------------------------------------------
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2018, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# following terms and conditions apply:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero Public License for more details.
#
# You should have received a copy of the GNU Affero Public License
# along with this program.  If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/
# ----------------------------------------------------------------------

"""
Audio transforms for the Google speech commands dataset. Adapted from
https://github.com/tugstugi/pytorch-speech-commands
"""

import random

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset


def should_apply_transform(prob=0.5):
    """Decide at random whether a transform should be applied.

    :param prob: threshold compared against one draw of ``random.random()``
    :return: ``True`` when the draw is strictly below ``prob``
    """
    draw = random.random()
    return draw < prob
class LoadAudio(object):
    """Loads an audio file into a numpy array.

    Reads ``data["path"]`` and writes ``data["samples"]`` and
    ``data["sample_rate"]``. A falsy path is treated as silence: a zero
    array holding ``sample_rate`` float32 samples is produced instead of
    reading a file.

    :param sample_rate: sample rate requested from ``librosa.load`` and used
                        as the length of the silent clip
    """

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def __call__(self, data):
        """Populate ``data`` with the samples and sample rate; return ``data``."""
        path = data["path"]
        if path:
            # Pass ``sr`` by keyword: it is keyword-only in librosa >= 0.10
            # and the keyword form also works on older releases.
            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
        else:
            # silence
            sample_rate = self.sample_rate
            samples = np.zeros(sample_rate, dtype=np.float32)
        data["samples"] = samples
        data["sample_rate"] = sample_rate
        return data
class FixAudioLength(object):
    """Forces ``data["samples"]`` to a fixed number of samples.

    The target length is ``int(time * data["sample_rate"])``. Longer audio is
    cut at the end, shorter audio is zero-padded at the end, and audio that
    already has the target length is left untouched.

    :param time: multiplier applied to the sample rate to get the length
    """

    def __init__(self, time=1):
        self.time = time

    def __call__(self, data):
        """Pad or truncate ``data["samples"]`` and return ``data``."""
        audio = data["samples"]
        target = int(self.time * data["sample_rate"])
        current = len(audio)
        if current > target:
            data["samples"] = audio[:target]
        elif current < target:
            # Append zeros only on the right-hand side.
            data["samples"] = np.pad(audio, (0, target - current), "constant")
        return data
class ChangeAmplitude(object):
    """Randomly rescales the amplitude of ``data["samples"]``.

    :param amplitude_range: ``(low, high)`` bounds handed to
                            ``random.uniform`` to draw the gain factor
    """

    def __init__(self, amplitude_range=(0.7, 1.1)):
        self.amplitude_range = amplitude_range

    def __call__(self, data):
        """Multiply the samples by a random gain when the transform fires."""
        if should_apply_transform():
            samples = data["samples"]
            low, high = self.amplitude_range
            data["samples"] = samples * random.uniform(low, high)
        return data
class AddNoise(object):
    """Mixes uniform random noise into ``data["samples"]``.

    The output is ``samples * (1 - alpha) + noise * alpha`` where ``noise``
    is drawn uniformly from ``[-max_val, max_val]`` with one value per
    sample. Unlike the other random transforms in this file, this one is
    applied on every call.

    :param alpha: blend weight given to the noise
    :param max_val: magnitude bound of the uniform noise
    """

    def __init__(self, alpha=0.0, max_val=1.0):
        self.alpha = alpha
        self.max_val = max_val

    def __call__(self, data):
        """Blend noise into the samples and return ``data``."""
        clean = data["samples"]
        noise = np.random.uniform(
            low=-self.max_val, high=self.max_val, size=clean.size
        )
        keep = 1 - self.alpha
        data["samples"] = clean * keep + noise * self.alpha
        return data
class ChangeSpeedAndPitchAudio(object):
    """Randomly changes the speed of the audio by linear resampling.

    Because the samples are simply re-interpolated on a coarser or finer
    grid, the pitch changes together with the speed.

    :param max_scale: the scale is drawn uniformly from
                      ``[-max_scale, max_scale]``
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Resample ``data["samples"]`` when the transform fires."""
        if not should_apply_transform():
            return data

        samples = data["samples"]
        scale = random.uniform(-self.max_scale, self.max_scale)
        step = 1.0 / (1 + scale)

        count = len(samples)
        original_grid = np.arange(0, count)
        # Stepping through the original index range with a non-unit step
        # yields more or fewer output points than the input had.
        resampled_grid = np.arange(0, count, step)
        resampled = np.interp(resampled_grid, original_grid, samples)
        data["samples"] = resampled.astype(np.float32)
        return data
class StretchAudio(object):
    """Stretches an audio randomly.

    :param max_scale: the stretch scale is drawn uniformly from
                      ``[-max_scale, max_scale]``; the rate passed to
                      librosa is ``1 + scale``
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Time-stretch ``data["samples"]`` when the transform fires."""
        if not should_apply_transform():
            return data

        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is keyword-only in librosa >= 0.10; the keyword form is
        # also accepted by older releases.
        data["samples"] = librosa.effects.time_stretch(
            data["samples"], rate=1 + scale
        )
        return data
class TimeshiftAudio(object):
    """Shifts an audio randomly in time, keeping its length.

    :param max_shift_seconds: the shift, in samples, is drawn from
                              ``[-max_shift, max_shift]`` where
                              ``max_shift = int(sample_rate * max_shift_seconds)``
    """

    def __init__(self, max_shift_seconds=0.2):
        self.max_shift_seconds = max_shift_seconds

    def __call__(self, data):
        """Shift ``data["samples"]`` when the transform fires."""
        if not should_apply_transform():
            return data

        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # ``sample_rate * max_shift_seconds`` is a float; ``random.randint``
        # rejects floats on Python >= 3.12, so convert to an int first.
        max_shift = int(sample_rate * self.max_shift_seconds)
        shift = random.randint(-max_shift, max_shift)

        # Negative shift: pad ``a`` zeros on the left. Positive shift: pad
        # ``b`` zeros on the right. Only one of the two is ever non-zero.
        a = -min(0, shift)
        b = max(0, shift)
        samples = np.pad(samples, (a, b), "constant")

        # Trim the opposite end by the same amount to restore the length.
        data["samples"] = samples[: len(samples) - a] if a else samples[b:]
        return data
class AddBackgroundNoise(Dataset):
    """Blends a randomly chosen background noise clip into the audio.

    :param bg_dataset: sequence passed to ``random.choice``; each item must
                       expose its audio under the ``"samples"`` key
    :param max_percentage: upper bound of the noise weight, which is drawn
                           uniformly from ``[0, max_percentage]``
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Mix background noise into ``data["samples"]`` when the transform fires."""
        if should_apply_transform():
            foreground = data["samples"]
            background = random.choice(self.bg_dataset)["samples"]
            weight = random.uniform(0, self.max_percentage)
            data["samples"] = foreground * (1 - weight) + background * weight
        return data
class ToMelSpectrogram(object):
    """Creates the mel spectrogram from an audio.

    The result is stored in ``data["mel_spectrogram"]`` in decibels relative
    to the spectrogram maximum. The original authors describe the result as a
    32x32 matrix (presumably for one second of 16 kHz audio with the default
    ``n_mels`` -- TODO confirm).

    :param n_mels: number of mel bands requested from librosa
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Compute the mel spectrogram of ``data["samples"]``; return ``data``."""
        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # ``y`` is keyword-only in librosa >= 0.10; the keyword form is also
        # accepted by older releases.
        s = librosa.feature.melspectrogram(
            y=samples, sr=sample_rate, n_mels=self.n_mels
        )
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data
class ToTensor(object):
    """Converts one entry of ``data`` into a ``torch.FloatTensor``.

    :param np_name: key of the entry to convert
    :param tensor_name: key under which the tensor is stored
    :param normalize: optional ``(mean, std)`` pair; when given, the tensor
                      has ``mean`` subtracted and is then divided by ``std``
    """

    def __init__(self, np_name, tensor_name, normalize=None):
        self.np_name = np_name
        self.tensor_name = tensor_name
        self.normalize = normalize

    def __call__(self, data):
        """Store the (optionally normalized) tensor in ``data``; return ``data``."""
        source = data[self.np_name]
        result = torch.FloatTensor(source)
        if self.normalize is not None:
            mean, std = self.normalize
            # In-place updates on the freshly built tensor.
            result -= mean
            result /= std
        data[self.tensor_name] = result
        return data
class ToSTFT(object):
    """Computes the short time Fourier transform of ``data["samples"]``.

    Besides the transform itself (``data["stft"]``), the parameters used and
    the resulting shape are recorded under ``"n_fft"``, ``"hop_length"`` and
    ``"stft_shape"``.

    :param n_fft: FFT window size passed to ``librosa.stft``
    :param hop_length: hop length passed to ``librosa.stft``
    """

    def __init__(self, n_fft=2048, hop_length=512):
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, data):
        """Attach the STFT and its metadata to ``data``; return ``data``."""
        audio = data["samples"]
        data["n_fft"] = self.n_fft
        data["hop_length"] = self.hop_length
        spectrum = librosa.stft(
            audio, n_fft=self.n_fft, hop_length=self.hop_length
        )
        data["stft"] = spectrum
        data["stft_shape"] = spectrum.shape
        return data
class StretchAudioOnSTFT(object):
    """Stretches an audio on the frequency domain.

    :param max_scale: the stretch scale is drawn uniformly from
                      ``[-max_scale, max_scale]``; the rate passed to the
                      phase vocoder is ``1 + scale``
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Stretch ``data["stft"]`` when the transform fires."""
        if not should_apply_transform():
            return data

        stft = data["stft"]
        hop_length = data["hop_length"]
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is keyword-only in librosa >= 0.10; the keyword form is
        # also accepted by older releases.
        stft_stretch = librosa.core.phase_vocoder(
            stft, rate=1 + scale, hop_length=hop_length
        )
        data["stft"] = stft_stretch
        return data
class TimeshiftAudioOnSTFT(object):
    """Shifts ``data["stft"]`` along its second axis by a random number of
    columns, without multiplying with exp.

    Vacated columns are filled with zeros and the number of columns is
    preserved.

    :param max_shift: the shift is drawn from ``[-max_shift, max_shift]``
    """

    def __init__(self, max_shift=8):
        self.max_shift = max_shift

    def __call__(self, data):
        """Shift the STFT columns when the transform fires."""
        if not should_apply_transform():
            return data

        spectrum = data["stft"]
        shift = random.randint(-self.max_shift, self.max_shift)
        # At most one of the two pad widths is non-zero.
        left = -min(0, shift)
        right = max(0, shift)
        padded = np.pad(spectrum, ((0, 0), (left, right)), "constant")
        # Drop as many columns from the opposite side as were padded.
        data["stft"] = padded[:, :-left] if left else padded[:, right:]
        return data
class AddBackgroundNoiseOnSTFT(Dataset):
    """Blends a randomly chosen background noise into ``data["stft"]``.

    :param bg_dataset: sequence passed to ``random.choice``; each item must
                       expose its transform under the ``"stft"`` key
    :param max_percentage: upper bound of the noise weight, which is drawn
                           uniformly from ``[0, max_percentage]``
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Mix background noise into the STFT when the transform fires."""
        if should_apply_transform():
            background = random.choice(self.bg_dataset)["stft"]
            weight = random.uniform(0, self.max_percentage)
            data["stft"] = data["stft"] * (1 - weight) + background * weight
        return data
class FixSTFTDimension(object):
    """Restores the second-axis length of ``data["stft"]`` to the length
    recorded in ``data["stft_shape"]``.

    Meant to run after transforms such as stretching or time shifting: extra
    columns are dropped from the end and missing columns are zero-padded at
    the end.
    """

    def __call__(self, data):
        """Pad or truncate the STFT and return ``data``."""
        spectrum = data["stft"]
        current = spectrum.shape[1]
        wanted = data["stft_shape"][1]
        if current > wanted:
            spectrum = spectrum[:, :wanted]
        elif current < wanted:
            missing = wanted - current
            spectrum = np.pad(spectrum, ((0, 0), (0, missing)), "constant")
        data["stft"] = spectrum
        return data
class ToMelSpectrogramFromSTFT(object):
    """Creates the mel spectrogram from the short time fourier transform of a
    file.

    The power spectrum ``|stft| ** 2`` is projected onto a mel filter bank
    and converted to decibels relative to its maximum. The original authors
    describe the result as a 32x32 matrix (presumably for the default
    settings -- TODO confirm).

    :param n_mels: number of mel bands in the filter bank
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the mel spectrogram in ``data["mel_spectrogram"]``; return ``data``."""
        stft = data["stft"]
        sample_rate = data["sample_rate"]
        n_fft = data["n_fft"]
        # All arguments of ``librosa.filters.mel`` are keyword-only in
        # librosa >= 0.10; the keyword form is also accepted by older releases.
        mel_basis = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels
        )
        s = np.dot(mel_basis, np.abs(stft) ** 2.0)
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data
class DeleteSTFT(object):
    """Removes the ``"stft"`` entry from ``data``.

    Pytorch doesn't like complex numbers, so use this transform to drop the
    STFT once the mel spectrogram has been computed.
    """

    def __call__(self, data):
        """Delete ``data["stft"]`` in place and return ``data``."""
        del data["stft"]
        return data
class AudioFromSTFT(object):
    """Applies the inverse short time Fourier transform to ``data["stft"]``.

    The reconstructed audio is stored under ``data["istft_samples"]`` using
    the dtype of ``data["samples"]``.
    """

    def __call__(self, data):
        """Reconstruct audio from the STFT and return ``data``."""
        spectrum = data["stft"]
        target_dtype = data["samples"].dtype
        data["istft_samples"] = librosa.core.istft(spectrum, dtype=target_dtype)
        return data
class Unsqueeze(object):
    """Extracts one tensor from ``data`` and optionally adds a leading axis.

    Note that the tensor itself is returned, not the ``data`` dictionary.

    :param tensor_name: key of the tensor to extract
    :param model_type: when ``"resnet9"`` or ``"cnn"``, a new dimension of
                       size one is inserted at position 0
    """

    def __init__(self, tensor_name, model_type):
        self.model_type = model_type
        self.tensor_name = tensor_name

    def __call__(self, data):
        """Return the selected tensor, unsqueezed for the listed model types."""
        tensor = data[self.tensor_name]
        if self.model_type not in ["resnet9", "cnn"]:
            return tensor
        return torch.unsqueeze(tensor, 0)