# ----------------------------------------------------------------------
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2018, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# following terms and conditions apply:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero Public License for more details.
#
# You should have received a copy of the GNU Affero Public License
# along with this program.  If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/
# ----------------------------------------------------------------------


"""
Adapted from https://github.com/tugstugi/pytorch-speech-commands Google speech
commands dataset.
"""

import gc
import itertools
import os
import pickle

import librosa
import numpy as np
from torch.utils.data import Dataset

__all__ = [
    "CLASSES",
    "SpeechCommandsDataset",
    "BackgroundNoiseDataset",
    "PreprocessedSpeechDataset",
]

CLASSES = tuple(
    "unknown, silence, zero, one, two, three, four, five, six, seven, eight, "
    "nine".split(", ")
)


class SpeechCommandsDataset(Dataset):
    """Google speech commands dataset.

    Only labels in CLASSES, plus silence, are treated as known classes. All
    other classes are used as 'unknown' samples. Similar to the Kaggle
    challenge here:
    https://www.kaggle.com/c/tensorflow-speech-recognition-challenge
    """

    def __init__(
        self,
        folder,
        transform=None,
        classes=CLASSES,
        silence_percentage=0.1,
        sample_rate=16000,
    ):
        all_classes = [
            d
            for d in os.listdir(folder)
            if os.path.isdir(os.path.join(folder, d)) and not d.startswith("_")
        ]
        for c in classes[2:]:
            assert c in all_classes

        # Map every folder name to a class index. Folders that are not in
        # `classes` are mapped to index 0 ("unknown")
        class_to_idx = {classes[i]: i for i in range(len(classes))}
        for c in all_classes:
            if c not in class_to_idx:
                print("Class", c, "assigned as unknown")
                class_to_idx[c] = 0

        # Eagerly load every audio file into memory
        data = []
        for c in all_classes:
            d = os.path.join(folder, c)
            target = class_to_idx[c]
            for f in os.listdir(d):
                path = os.path.join(d, f)
                samples, sample_rate = librosa.load(path, sr=sample_rate)
                audio = {"samples": samples, "sample_rate": sample_rate}
                data.append((audio, target))

        # Add one second of silence (all zeros) for a fixed percentage of
        # the dataset
        target = class_to_idx["silence"]
        samples = np.zeros(sample_rate, dtype=np.float32)
        silence = {"samples": samples, "sample_rate": sample_rate}
        data += [(silence, target)] * int(len(data) * silence_percentage)

        self.classes = classes
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Get item from dataset.

        :param index: index in the dataset
        :return: (audio, target) where target is index of the target class.
        :rtype: tuple[dict, int]
        """
        data, target = self.data[index]
        if self.transform is not None:
            data = self.transform(data)

        return data, target

    def make_weights_for_balanced_classes(self):
        """Compute one weight per sample, inversely proportional to class
        frequency. Adapted from
        https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
        """  # noqa: E501
        nclasses = len(self.classes)
        count = np.ones(nclasses)
        for item in self.data:
            count[item[1]] += 1

        n = float(sum(count))
        weight_per_class = n / count
        weight = np.zeros(len(self))
        for idx, item in enumerate(self.data):
            weight[idx] = weight_per_class[item[1]]
        return weight
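

# A minimal usage sketch, not part of the original module: the weights from
# make_weights_for_balanced_classes() are the kind of per-sample weights that
# torch.utils.data.WeightedRandomSampler expects, so rare classes are drawn
# about as often as common ones. The helper name, folder path, and batch size
# are hypothetical example values.
def _example_balanced_loader(folder="./speech_commands/train"):
    from torch.utils.data import DataLoader, WeightedRandomSampler

    dataset = SpeechCommandsDataset(folder)
    weights = dataset.make_weights_for_balanced_classes()
    # Draw len(weights) samples per epoch, with replacement, biased toward
    # under-represented classes
    sampler = WeightedRandomSampler(weights, num_samples=len(weights))
    return DataLoader(dataset, batch_size=64, sampler=sampler)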


class BackgroundNoiseDataset(Dataset):
    """Dataset for silence / background noise."""

    def __init__(self, folder, transform=None, sample_rate=16000, sample_length=1):
        audio_files = [
            d
            for d in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, d)) and d.endswith(".wav")
        ]
        samples = []
        for f in audio_files:
            path = os.path.join(folder, f)
            s, sr = librosa.load(path, sr=sample_rate)
            samples.append(s)

        # Concatenate all noise files and split them into fixed-length clips
        # of `sample_length` seconds, discarding the remainder
        samples = np.hstack(samples)
        c = int(sample_rate * sample_length)
        r = len(samples) // c
        self.samples = samples[: r * c].reshape(-1, c)
        self.sample_rate = sample_rate
        self.classes = CLASSES
        self.transform = transform
        self.path = folder

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        data = {
            "samples": self.samples[index],
            "sample_rate": self.sample_rate,
            "path": self.path,
        }
        if self.transform is not None:
            data = self.transform(data)

        return data
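

# A minimal usage sketch, not part of the original module: one plausible way
# to use the fixed-length noise clips is to mix a random clip into a speech
# sample as data augmentation. The helper name and the mixing ratio are
# hypothetical, illustrative choices.
def _example_add_background_noise(audio, noise_dataset, noise_level=0.1):
    import random

    noise = noise_dataset[random.randrange(len(noise_dataset))]["samples"]
    n = min(len(audio["samples"]), len(noise))
    mixed = audio["samples"].copy()
    # Linear blend of speech and noise over the overlapping region
    mixed[:n] = (1.0 - noise_level) * mixed[:n] + noise_level * noise[:n]
    return {"samples": mixed, "sample_rate": audio["sample_rate"]}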


class PreprocessedSpeechDataset(Dataset):
    """Google Speech Commands dataset preprocessed with all transforms already
    applied.

    Use the 'process_dataset.py' script to create the preprocessed dataset.
    """

    def __init__(self, root, subset, classes=CLASSES, silence_percentage=0.1):
        """
        :param root: Dataset root directory
        :param subset: Which dataset subset to use ("train", "test", "valid",
               "noise")
        :param classes: List of classes to load. See CLASSES for valid options
        :param silence_percentage: Percentage of the dataset to be filled with
               silence
        """
        self.classes = classes

        self._root = root
        self._subset = subset
        self._silence_percentage = silence_percentage

        self.data = None

        # Circular list of all epochs in this dataset
        epochs = sorted(int(e) for e in os.listdir(root) if e.isdigit())
        self._all_epochs = itertools.cycle(epochs)

        # Load first epoch
        self.next_epoch()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Get item from dataset.

        :param index: index in the dataset
        :return: (audio, target) where target is index of the target class.
        :rtype: tuple[dict, int]
        """
        return self.data[index]

    def next_epoch(self):
        """Load the next epoch from disk.

        :return: the epoch number that was loaded
        """
        epoch = next(self._all_epochs)
        folder = os.path.join(self._root, str(epoch), self._subset)
        self.data = []
        silence = None
        # Disable the garbage collector while unpickling many objects; this
        # speeds up bulk loading
        gc.disable()
        for filename in os.listdir(folder):
            # Each file holds the pre-transformed samples for one command
            command = os.path.splitext(os.path.basename(filename))[0]
            with open(os.path.join(folder, filename), "rb") as pkl_file:
                audio = pickle.load(pkl_file)

            # Keep 'silence' aside; it is appended below in proportion to
            # the dataset size
            if command == "silence":
                silence = audio
            else:
                target = self.classes.index(command)
                self.data.extend(itertools.product(audio, [target]))
        gc.enable()

        target = self.classes.index("silence")
        self.data += [(silence, target)] * int(
            len(self.data) * self._silence_percentage
        )
        return epoch

    def make_weights_for_balanced_classes(self):
        """Compute one weight per sample, inversely proportional to class
        frequency. Adapted from
        https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
        """  # noqa: E501
        nclasses = len(self.classes)
        count = np.ones(nclasses)
        for item in self.data:
            count[item[1]] += 1

        n = float(sum(count))
        weight_per_class = n / count
        weight = np.zeros(len(self.data))
        for idx, item in enumerate(self.data):
            weight[idx] = weight_per_class[item[1]]
        return weight

    @staticmethod
    def is_valid(folder, epoch=0):
        """Check if the given folder is a valid preprocessed dataset."""
        # Validate by checking for the training 'silence.pkl' on the given
        # epoch. This file is unique to our pre-processed dataset generated
        # by 'process_dataset.py'
        return os.path.exists(
            os.path.join(folder, str(epoch), "train", "silence.pkl")
        )
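

# A minimal usage sketch, not part of the original module: a training loop
# that validates the preprocessed root, then calls next_epoch() after every
# pass so the next set of pre-transformed files is read from disk. The helper
# name, root path, batch size, and epoch count are hypothetical example values.
def _example_train_on_preprocessed(root="./preprocessed", num_epochs=10):
    from torch.utils.data import DataLoader

    assert PreprocessedSpeechDataset.is_valid(root)
    dataset = PreprocessedSpeechDataset(root, subset="train")
    for _ in range(num_epochs):
        # Re-create the loader each pass because next_epoch() replaces
        # dataset.data, which can change the dataset length
        loader = DataLoader(dataset, batch_size=64, shuffle=True)
        for data, target in loader:
            pass  # Training step goes here
        dataset.next_epoch()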