Source code for nupic.research.frameworks.pytorch.audio_transforms

# ----------------------------------------------------------------------
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2018, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# following terms and conditions apply:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero Public License for more details.
#
# You should have received a copy of the GNU Affero Public License
# along with this program.  If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/
# ----------------------------------------------------------------------

"""
Adapted from https://github.com/tugstugi/pytorch-speech-commands Google
speech commands dataset.
"""

import random

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset


def should_apply_transform(prob=0.5):
    """Decide at random whether a transform should be applied.

    :param prob: Probability with which ``True`` is returned.
    :return: ``True`` when a fresh ``random.random()`` draw is below ``prob``.
    """
    draw = random.random()
    return draw < prob


class LoadAudio(object):
    """Loads an audio file into a numpy array.

    Reads ``data["path"]`` and stores the waveform in ``data["samples"]`` and
    its sampling rate in ``data["sample_rate"]``. A falsy path (e.g. an empty
    string or ``None``) yields ``sample_rate`` zero-valued float32 samples,
    i.e. one second of silence.

    :param sample_rate: Sampling rate requested from ``librosa.load``; also
                        the number of samples generated for silence.
    """

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def __call__(self, data):
        """Populate ``data`` with samples and sample rate; returns ``data``."""
        path = data["path"]
        if path:
            # ``sr`` is passed by keyword: recent librosa releases made every
            # argument after ``path`` keyword-only, so a positional sample
            # rate raises TypeError there. The keyword form works on older
            # releases as well.
            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
        else:
            # silence
            sample_rate = self.sample_rate
            samples = np.zeros(sample_rate, dtype=np.float32)
        data["samples"] = samples
        data["sample_rate"] = sample_rate
        return data


class FixAudioLength(object):
    """Pads or truncates ``data["samples"]`` to a fixed duration.

    :param time: Target duration in seconds; the target number of samples is
                 ``int(time * data["sample_rate"])``.
    """

    def __init__(self, time=1):
        self.time = time

    def __call__(self, data):
        """Adjust ``data["samples"]`` to the target length; returns ``data``."""
        audio = data["samples"]
        target = int(self.time * data["sample_rate"])
        missing = target - len(audio)
        if missing > 0:
            # Too short: append zeros at the end.
            data["samples"] = np.pad(audio, (0, missing), "constant")
        elif missing < 0:
            # Too long: keep only the leading ``target`` samples.
            data["samples"] = audio[:target]
        return data


class ChangeAmplitude(object):
    """Randomly rescales the amplitude of ``data["samples"]``.

    The transform only fires when ``should_apply_transform()`` says so; the
    samples are then multiplied by a gain drawn uniformly from
    ``amplitude_range``.

    :param amplitude_range: ``(low, high)`` bounds of the random gain.
    """

    def __init__(self, amplitude_range=(0.7, 1.1)):
        self.amplitude_range = amplitude_range

    def __call__(self, data):
        """Scale the samples by a random gain (or leave them); returns ``data``."""
        if should_apply_transform():
            data["samples"] = data["samples"] * random.uniform(
                *self.amplitude_range
            )
        return data


class AddNoise(object):
    """Blend random noise into the sample.

    A' = A * (1 - alpha) + alpha * noise

    noise is random uniform in the range [-max_val, max_val)

    Unlike the other random transforms in this module, this one is applied on
    every call.

    :param alpha: Blend weight of the noise; ``0.0`` leaves the values of the
                  samples unchanged.
    :param max_val: Bound of the uniform noise distribution.
    """

    def __init__(self, alpha=0.0, max_val=1.0):
        self.alpha = alpha
        self.max_val = max_val

    def __call__(self, data):
        """Replace ``data["samples"]`` with the noisy blend; returns ``data``."""
        samples = data["samples"]
        # Draw the noise with the same shape as the samples rather than as a
        # flat vector of ``samples.size`` values, so that arrays with more
        # than one dimension blend element-wise instead of failing to
        # broadcast. For 1-D input the draw is unchanged.
        noise_vector = np.random.uniform(
            -self.max_val, self.max_val, samples.shape
        )
        data["samples"] = samples * (1 - self.alpha) + noise_vector * self.alpha
        return data


class ChangeSpeedAndPitchAudio(object):
    """Randomly changes the playback speed of an audio clip.

    The samples are re-read by linear interpolation at positions spaced
    ``1 / (1 + scale)`` apart, with ``scale`` drawn uniformly from
    ``[-max_scale, max_scale]``. This changes the number of samples, and
    therefore the pitch as well as the speed.

    :param max_scale: Largest magnitude of the random speed change.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Resample ``data["samples"]`` (or leave it alone); returns ``data``."""
        if should_apply_transform():
            audio = data["samples"]
            count = len(audio)
            scale = random.uniform(-self.max_scale, self.max_scale)
            step = 1.0 / (1 + scale)
            source_positions = np.arange(0, count)
            target_positions = np.arange(0, count, step)
            resampled = np.interp(target_positions, source_positions, audio)
            data["samples"] = resampled.astype(np.float32)
        return data


class StretchAudio(object):
    """Stretches an audio randomly.

    When the transform fires (see ``should_apply_transform``) the samples are
    passed through ``librosa.effects.time_stretch`` with a rate of
    ``1 + scale``, where ``scale`` is drawn uniformly from
    ``[-max_scale, max_scale]``.

    :param max_scale: Largest magnitude of the random rate change.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Time-stretch ``data["samples"]`` (or leave it); returns ``data``."""
        if not should_apply_transform():
            return data

        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is keyword-only in recent librosa releases; passing it by
        # name also works on older releases.
        data["samples"] = librosa.effects.time_stretch(
            data["samples"], rate=1 + scale
        )
        return data


class TimeshiftAudio(object):
    """Shifts an audio randomly.

    When the transform fires (see ``should_apply_transform``) the samples are
    shifted by a random whole number of samples within
    ``+/- max_shift_seconds``. The vacated end is filled with zeros and the
    length of the audio is unchanged.

    :param max_shift_seconds: Largest shift, in seconds, in either direction.
    """

    def __init__(self, max_shift_seconds=0.2):
        self.max_shift_seconds = max_shift_seconds

    def __call__(self, data):
        """Shift ``data["samples"]`` in time (or leave it); returns ``data``."""
        if not should_apply_transform():
            return data

        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # ``sample_rate * max_shift_seconds`` is a float whenever
        # ``max_shift_seconds`` is one. ``random.randint`` requires integer
        # bounds (float arguments are rejected by newer Python versions), so
        # truncate to a whole number of samples first.
        max_shift = int(sample_rate * self.max_shift_seconds)
        shift = random.randint(-max_shift, max_shift)
        a = -min(0, shift)  # zeros to prepend when shifting later in time
        b = max(0, shift)  # zeros to append when shifting earlier in time
        samples = np.pad(samples, (a, b), "constant")
        # Drop as many samples from the opposite end as were padded, so the
        # result has the original length.
        data["samples"] = samples[: len(samples) - a] if a else samples[b:]
        return data


class AddBackgroundNoise(Dataset):
    """Mixes a randomly chosen background noise into the audio.

    When the transform fires (see ``should_apply_transform``) one item is
    picked from ``bg_dataset`` with ``random.choice`` and its ``"samples"``
    are blended into ``data["samples"]`` with a weight drawn uniformly from
    ``[0, max_percentage]``.

    :param bg_dataset: Sequence of dict-like items holding a ``"samples"``
                       entry.
    :param max_percentage: Upper bound of the noise blend weight.
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Blend background noise into the samples; returns ``data``."""
        if should_apply_transform():
            signal = data["samples"]
            background = random.choice(self.bg_dataset)["samples"]
            weight = random.uniform(0, self.max_percentage)
            data["samples"] = signal * (1 - weight) + background * weight
        return data


class ToMelSpectrogram(object):
    """Creates the mel spectrogram from an audio.

    The spectrogram is computed from ``data["samples"]`` at
    ``data["sample_rate"]``, converted to decibels relative to its maximum and
    stored in ``data["mel_spectrogram"]``.

    The result has ``n_mels`` rows.
    NOTE(review): the original docstring described the result as a 32x32
    matrix; presumably that holds for the default ``n_mels`` and one second of
    16 kHz audio - confirm.

    :param n_mels: Number of mel bands.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Add ``data["mel_spectrogram"]``; returns ``data``."""
        samples = data["samples"]
        sample_rate = data["sample_rate"]
        # The audio is passed as ``y=``: recent librosa releases accept
        # keyword arguments only for ``melspectrogram``.
        s = librosa.feature.melspectrogram(
            y=samples, sr=sample_rate, n_mels=self.n_mels
        )
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data


class ToTensor(object):
    """Converts one entry of ``data`` into a float tensor.

    :param np_name: Key of the entry to convert.
    :param tensor_name: Key under which the resulting tensor is stored.
    :param normalize: Optional ``(mean, std)`` pair; when given the tensor is
                      shifted by ``mean`` and divided by ``std``.
    """

    def __init__(self, np_name, tensor_name, normalize=None):
        self.np_name = np_name
        self.tensor_name = tensor_name
        self.normalize = normalize

    def __call__(self, data):
        """Store the (optionally normalized) tensor; returns ``data``."""
        result = torch.FloatTensor(data[self.np_name])
        if self.normalize is not None:
            mean, std = self.normalize
            result -= mean
            result /= std
        data[self.tensor_name] = result
        return data


class ToSTFT(object):
    """Computes the short time fourier transform of an audio.

    Besides the transform itself (``data["stft"]``) the parameters used and
    the resulting shape are recorded in ``data["n_fft"]``,
    ``data["hop_length"]`` and ``data["stft_shape"]``.

    :param n_fft: FFT window size passed to ``librosa.stft``.
    :param hop_length: Hop length passed to ``librosa.stft``.
    """

    def __init__(self, n_fft=2048, hop_length=512):
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, data):
        """Add the STFT and its parameters to ``data``; returns ``data``."""
        audio = data["samples"]
        data["n_fft"], data["hop_length"] = self.n_fft, self.hop_length
        spectrum = librosa.stft(
            audio, n_fft=self.n_fft, hop_length=self.hop_length
        )
        data["stft"] = spectrum
        data["stft_shape"] = spectrum.shape
        return data


class StretchAudioOnSTFT(object):
    """Stretches an audio on the frequency domain.

    When the transform fires (see ``should_apply_transform``)
    ``data["stft"]`` is replaced by the output of librosa's phase vocoder run
    at a rate of ``1 + scale``, with ``scale`` drawn uniformly from
    ``[-max_scale, max_scale]`` and the hop length taken from
    ``data["hop_length"]``.

    :param max_scale: Largest magnitude of the random rate change.
    """

    def __init__(self, max_scale=0.2):
        self.max_scale = max_scale

    def __call__(self, data):
        """Stretch ``data["stft"]`` in time (or leave it); returns ``data``."""
        if not should_apply_transform():
            return data

        stft = data["stft"]
        hop_length = data["hop_length"]
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is keyword-only in recent librosa releases; passing it by
        # name also works on older releases.
        stft_stretch = librosa.core.phase_vocoder(
            stft, rate=1 + scale, hop_length=hop_length
        )
        data["stft"] = stft_stretch
        return data


class TimeshiftAudioOnSTFT(object):
    """
    Shifts ``data["stft"]`` along its second axis by a random number of
    columns, a simple timeshift on the frequency domain without multiplying
    with exp. Vacated columns are zero filled and the shape is unchanged.

    :param max_shift: Largest shift, in columns, in either direction.
    """

    def __init__(self, max_shift=8):
        self.max_shift = max_shift

    def __call__(self, data):
        """Shift ``data["stft"]`` (or leave it alone); returns ``data``."""
        if should_apply_transform():
            spectrum = data["stft"]
            shift = random.randint(-self.max_shift, self.max_shift)
            lead = max(0, -shift)  # zero columns inserted at the front
            trail = max(0, shift)  # zero columns appended at the back
            padded = np.pad(spectrum, ((0, 0), (lead, trail)), "constant")
            # Trim the opposite side so the column count stays the same.
            data["stft"] = padded[:, :-lead] if lead else padded[:, trail:]
        return data


class AddBackgroundNoiseOnSTFT(Dataset):
    """Mixes a randomly chosen background noise into the STFT.

    When the transform fires (see ``should_apply_transform``) one item is
    picked from ``bg_dataset`` with ``random.choice`` and its ``"stft"`` is
    blended into ``data["stft"]`` with a weight drawn uniformly from
    ``[0, max_percentage]``.

    :param bg_dataset: Sequence of dict-like items holding an ``"stft"``
                       entry.
    :param max_percentage: Upper bound of the noise blend weight.
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Blend background noise into ``data["stft"]``; returns ``data``."""
        if should_apply_transform():
            background = random.choice(self.bg_dataset)["stft"]
            weight = random.uniform(0, self.max_percentage)
            data["stft"] = data["stft"] * (1 - weight) + background * weight
        return data


class FixSTFTDimension(object):
    """
    Restores the time axis (second axis) of ``data["stft"]`` to the length
    recorded in ``data["stft_shape"]``, truncating or zero padding at the end
    as needed. Applied after stretching, time shifting etc.
    """

    def __call__(self, data):
        """Pad or truncate ``data["stft"]``; returns ``data``."""
        spectrum = data["stft"]
        frames = spectrum.shape[1]
        wanted = data["stft_shape"][1]
        deficit = wanted - frames
        if deficit < 0:
            # Too many frames: keep the leading ``wanted`` columns.
            spectrum = spectrum[:, :wanted]
        elif deficit > 0:
            # Too few frames: append zero columns.
            spectrum = np.pad(spectrum, ((0, 0), (0, deficit)), "constant")

        data["stft"] = spectrum
        return data


class ToMelSpectrogramFromSTFT(object):
    """Creates the mel spectrogram from the short time fourier transform of a
    file.

    The squared magnitude of ``data["stft"]`` is projected onto a mel filter
    bank built from ``data["sample_rate"]`` and ``data["n_fft"]``, converted
    to decibels relative to its maximum and stored in
    ``data["mel_spectrogram"]``.

    The result has ``n_mels`` rows.
    NOTE(review): the original docstring described the result as a 32x32
    matrix; presumably that holds for the default settings and one second of
    16 kHz audio - confirm.

    :param n_mels: Number of mel bands.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Add ``data["mel_spectrogram"]``; returns ``data``."""
        stft = data["stft"]
        sample_rate = data["sample_rate"]
        n_fft = data["n_fft"]
        # ``librosa.filters.mel`` accepts keyword arguments only in recent
        # librosa releases; the keyword names are the same on older ones.
        mel_basis = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels
        )
        # Power spectrogram (|stft|^2) mapped onto the mel bands.
        s = np.dot(mel_basis, np.abs(stft) ** 2.0)
        data["mel_spectrogram"] = librosa.power_to_db(s, ref=np.max)
        return data


class DeleteSTFT(object):
    """
    Removes the ``"stft"`` entry from ``data``. Pytorch doesn't like complex
    numbers, so use this transform once the mel spectrogram has been
    computed.
    """

    def __call__(self, data):
        """Drop ``data["stft"]`` (``KeyError`` if absent); returns ``data``."""
        del data["stft"]
        return data


class AudioFromSTFT(object):
    """Inverse short time fourier transform.

    Reconstructs audio from ``data["stft"]`` and stores it in
    ``data["istft_samples"]`` using the dtype of ``data["samples"]``.
    """

    def __call__(self, data):
        """Add ``data["istft_samples"]``; returns ``data``."""
        stft = data["stft"]
        # Invert with the hop length recorded alongside the STFT when there
        # is one. Otherwise librosa falls back to its own default, which only
        # matches a forward transform that used that default hop.
        data["istft_samples"] = librosa.core.istft(
            stft,
            hop_length=data.get("hop_length"),
            dtype=data["samples"].dtype,
        )
        return data


class Unsqueeze(object):
    """Extracts one tensor from ``data``, adding a leading axis if needed.

    Unlike the other transforms in this module, the return value is the
    tensor itself rather than the ``data`` dict.

    :param tensor_name: Key of the tensor to extract from ``data``.
    :param model_type: A new dimension 0 is inserted when this is
                       ``"resnet9"`` or ``"cnn"``; otherwise the tensor is
                       returned as is.
    """

    def __init__(self, tensor_name, model_type):
        self.model_type = model_type
        self.tensor_name = tensor_name

    def __call__(self, data):
        """Return ``data[tensor_name]``, unsqueezed for the listed models."""
        tensor = data[self.tensor_name]
        if self.model_type in ("resnet9", "cnn"):
            return torch.unsqueeze(tensor, 0)
        return tensor
nupic.research

Navigation

Related Topics

Source code for nupic.research.frameworks.pytorch.audio_transforms