
Source code for flash.audio.speech_recognition.data

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, Optional, Sequence, Type

from torch.utils.data import Dataset

from flash.audio.speech_recognition.input import (
    SpeechRecognitionCSVInput,
    SpeechRecognitionDatasetInput,
    SpeechRecognitionJSONInput,
    SpeechRecognitionPathsInput,
)
from flash.audio.speech_recognition.output_transform import SpeechRecognitionOutputTransform
from flash.core.data.data_module import DataModule
from flash.core.data.io.input import Input
from flash.core.data.io.input_transform import INPUT_TRANSFORM_TYPE, InputTransform
from flash.core.utilities.imports import _TOPIC_AUDIO_AVAILABLE
from flash.core.utilities.stages import RunningStage

# Skip doctests if requirements aren't available
if not _TOPIC_AUDIO_AVAILABLE:
    __doctest_skip__ = ["SpeechRecognitionData", "SpeechRecognitionData.*"]
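# ``__doctest_skip__`` is a convention understood by doctest collectors such as
# pytest-doctestplus: listing the class name and a glob over its members skips every
# example below when the audio requirements are missing, instead of failing on imports.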


class SpeechRecognitionData(DataModule):
    """The ``SpeechRecognitionData`` class is a :class:`~flash.core.data.data_module.DataModule` with a set of
    classmethods for loading data for speech recognition."""

    input_transform_cls = InputTransform
    output_transform_cls = SpeechRecognitionOutputTransform
    @classmethod
    def from_files(
        cls,
        train_files: Optional[Sequence[str]] = None,
        train_targets: Optional[Sequence[str]] = None,
        val_files: Optional[Sequence[str]] = None,
        val_targets: Optional[Sequence[str]] = None,
        test_files: Optional[Sequence[str]] = None,
        test_targets: Optional[Sequence[str]] = None,
        predict_files: Optional[Sequence[str]] = None,
        sampling_rate: int = 16000,
        input_cls: Type[Input] = SpeechRecognitionPathsInput,
        transform: INPUT_TRANSFORM_TYPE = InputTransform,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "SpeechRecognitionData":
        """Load the :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData` from lists of audio files
        and corresponding lists of targets.

        The supported file extensions are: ``.aiff``, ``.au``, ``.avr``, ``.caf``, ``.flac``, ``.mat``, ``.mat4``,
        ``.mat5``, ``.mpc2k``, ``.ogg``, ``.paf``, ``.pvf``, ``.rf64``, ``.ircam``, ``.voc``, ``.w64``, ``.wav``,
        ``.nist``, and ``.wavex``.
        To learn how to customize the transforms applied for each stage, read our
        :ref:`customizing transforms guide <customizing_transforms>`.

        Args:
            train_files: The list of audio files to use when training.
            train_targets: The list of targets (ground truth speech transcripts) to use when training.
            val_files: The list of audio files to use when validating.
            val_targets: The list of targets (ground truth speech transcripts) to use when validating.
            test_files: The list of audio files to use when testing.
            test_targets: The list of targets (ground truth speech transcripts) to use when testing.
            predict_files: The list of audio files to use when predicting.
            sampling_rate: Sampling rate to use when loading the audio files.
            input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
            transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
            transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
            data_module_kwargs: Additional keyword arguments to provide to the
                :class:`~flash.core.data.data_module.DataModule` constructor.

        Returns:
            The constructed :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData`.

        Examples
        ________

        .. testsetup::

            >>> import numpy as np
            >>> import soundfile as sf
            >>> samplerate = 1000
            >>> data = np.random.uniform(-1, 1, size=(samplerate, 2))
            >>> _ = [sf.write(f"speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> _ = [sf.write(f"predict_speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]

        .. doctest::

            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>> datamodule = SpeechRecognitionData.from_files(
            ...     train_files=["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...     train_targets=["some speech", "some other speech", "some more speech"],
            ...     predict_files=["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ...     batch_size=2,
            ... )
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        .. testcleanup::

            >>> import os
            >>> _ = [os.remove(f"speech_{i}.wav") for i in range(1, 4)]
            >>> _ = [os.remove(f"predict_speech_{i}.wav") for i in range(1, 4)]
        """
        ds_kw = {"sampling_rate": sampling_rate}

        return cls(
            input_cls(RunningStage.TRAINING, train_files, train_targets, **ds_kw),
            input_cls(RunningStage.VALIDATING, val_files, val_targets, **ds_kw),
            input_cls(RunningStage.TESTING, test_files, test_targets, **ds_kw),
            input_cls(RunningStage.PREDICTING, predict_files, **ds_kw),
            transform=transform,
            transform_kwargs=transform_kwargs,
            **data_module_kwargs,
        )
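    # NOTE: every classmethod in this class follows the construction pattern shown in
    # ``from_files`` above: one ``Input`` instance per ``RunningStage`` (training, validating,
    # testing, predicting), all handed to the ``DataModule`` constructor together with the
    # transform. A minimal sketch (the file name and target are illustrative, not shipped with
    # the library) of consuming the result through the standard ``LightningDataModule`` API:
    #
    #     datamodule = SpeechRecognitionData.from_files(
    #         train_files=["speech_1.wav"], train_targets=["some speech"], batch_size=1
    #     )
    #     batch = next(iter(datamodule.train_dataloader()))  # one collated training batch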
    @classmethod
    def from_csv(
        cls,
        input_field: str,
        target_field: Optional[str] = None,
        train_file: Optional[str] = None,
        val_file: Optional[str] = None,
        test_file: Optional[str] = None,
        predict_file: Optional[str] = None,
        sampling_rate: int = 16000,
        input_cls: Type[Input] = SpeechRecognitionCSVInput,
        transform: INPUT_TRANSFORM_TYPE = InputTransform,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "SpeechRecognitionData":
        """Load the :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData` from CSV files containing
        audio file paths and their corresponding targets.

        Input audio file paths will be extracted from the ``input_field`` column in the CSV files.
        The supported file extensions are: ``.aiff``, ``.au``, ``.avr``, ``.caf``, ``.flac``, ``.mat``, ``.mat4``,
        ``.mat5``, ``.mpc2k``, ``.ogg``, ``.paf``, ``.pvf``, ``.rf64``, ``.ircam``, ``.voc``, ``.w64``, ``.wav``,
        ``.nist``, and ``.wavex``.
        The targets will be extracted from the ``target_field`` column in the CSV files.
        To learn how to customize the transforms applied for each stage, read our
        :ref:`customizing transforms guide <customizing_transforms>`.

        Args:
            input_field: The field (column name) in the CSV files containing the audio file paths.
            target_field: The field (column name) in the CSV files containing the targets.
            train_file: The CSV file to use when training.
            val_file: The CSV file to use when validating.
            test_file: The CSV file to use when testing.
            predict_file: The CSV file to use when predicting.
            sampling_rate: Sampling rate to use when loading the audio files.
            input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
            transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
            transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
            data_module_kwargs: Additional keyword arguments to provide to the
                :class:`~flash.core.data.data_module.DataModule` constructor.

        Returns:
            The constructed :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData`.

        Examples
        ________

        The files can be in Comma Separated Values (CSV) format with either a ``.csv`` or ``.txt`` extension.

        .. testsetup::

            >>> import numpy as np
            >>> from pandas import DataFrame
            >>> import soundfile as sf
            >>> samplerate = 1000
            >>> data = np.random.uniform(-1, 1, size=(samplerate, 2))
            >>> _ = [sf.write(f"speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> _ = [sf.write(f"predict_speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> DataFrame.from_dict({
            ...     "speech_files": ["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...     "targets": ["some speech", "some other speech", "some more speech"],
            ... }).to_csv("train_data.csv", index=False)
            >>> DataFrame.from_dict({
            ...     "speech_files": ["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ... }).to_csv("predict_data.csv", index=False)

        The file ``train_data.csv`` contains the following:

        .. code-block::

            speech_files,targets
            speech_1.wav,some speech
            speech_2.wav,some other speech
            speech_3.wav,some more speech

        The file ``predict_data.csv`` contains the following:

        .. code-block::

            speech_files
            predict_speech_1.wav
            predict_speech_2.wav
            predict_speech_3.wav

        .. doctest::

            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>> datamodule = SpeechRecognitionData.from_csv(
            ...     "speech_files",
            ...     "targets",
            ...     train_file="train_data.csv",
            ...     predict_file="predict_data.csv",
            ...     batch_size=2,
            ... )
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        .. testcleanup::

            >>> import os
            >>> _ = [os.remove(f"speech_{i}.wav") for i in range(1, 4)]
            >>> _ = [os.remove(f"predict_speech_{i}.wav") for i in range(1, 4)]
            >>> os.remove("train_data.csv")
            >>> os.remove("predict_data.csv")

        Alternatively, the files can be in Tab Separated Values (TSV) format with a ``.tsv`` extension.

        .. testsetup::

            >>> import numpy as np
            >>> from pandas import DataFrame
            >>> import soundfile as sf
            >>> samplerate = 1000
            >>> data = np.random.uniform(-1, 1, size=(samplerate, 2))
            >>> _ = [sf.write(f"speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> _ = [sf.write(f"predict_speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> DataFrame.from_dict({
            ...     "speech_files": ["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...     "targets": ["some speech", "some other speech", "some more speech"],
            ... }).to_csv("train_data.tsv", sep="\\t", index=False)
            >>> DataFrame.from_dict({
            ...     "speech_files": ["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ... }).to_csv("predict_data.tsv", sep="\\t", index=False)

        The file ``train_data.tsv`` contains the following:

        .. code-block::

            speech_files    targets
            speech_1.wav    some speech
            speech_2.wav    some other speech
            speech_3.wav    some more speech

        The file ``predict_data.tsv`` contains the following:

        .. code-block::

            speech_files
            predict_speech_1.wav
            predict_speech_2.wav
            predict_speech_3.wav

        .. doctest::

            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>> datamodule = SpeechRecognitionData.from_csv(
            ...     "speech_files",
            ...     "targets",
            ...     train_file="train_data.tsv",
            ...     predict_file="predict_data.tsv",
            ...     batch_size=2,
            ... )
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        .. testcleanup::

            >>> import os
            >>> _ = [os.remove(f"speech_{i}.wav") for i in range(1, 4)]
            >>> _ = [os.remove(f"predict_speech_{i}.wav") for i in range(1, 4)]
            >>> os.remove("train_data.tsv")
            >>> os.remove("predict_data.tsv")
        """
        ds_kw = {"input_key": input_field, "sampling_rate": sampling_rate}

        return cls(
            input_cls(RunningStage.TRAINING, train_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.VALIDATING, val_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.TESTING, test_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.PREDICTING, predict_file, **ds_kw),
            transform=transform,
            transform_kwargs=transform_kwargs,
            **data_module_kwargs,
        )
    @classmethod
    def from_json(
        cls,
        input_field: str,
        target_field: Optional[str] = None,
        train_file: Optional[str] = None,
        val_file: Optional[str] = None,
        test_file: Optional[str] = None,
        predict_file: Optional[str] = None,
        sampling_rate: int = 16000,
        field: Optional[str] = None,
        input_cls: Type[Input] = SpeechRecognitionJSONInput,
        transform: INPUT_TRANSFORM_TYPE = InputTransform,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "SpeechRecognitionData":
        """Load the :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData` from JSON files containing
        audio file paths and their corresponding targets.

        Input audio file paths will be extracted from the ``input_field`` field in the JSON files.
        The supported file extensions are: ``.aiff``, ``.au``, ``.avr``, ``.caf``, ``.flac``, ``.mat``, ``.mat4``,
        ``.mat5``, ``.mpc2k``, ``.ogg``, ``.paf``, ``.pvf``, ``.rf64``, ``.ircam``, ``.voc``, ``.w64``, ``.wav``,
        ``.nist``, and ``.wavex``.
        The targets will be extracted from the ``target_field`` field in the JSON files.
        To learn how to customize the transforms applied for each stage, read our
        :ref:`customizing transforms guide <customizing_transforms>`.

        Args:
            input_field: The field in the JSON files containing the audio file paths.
            target_field: The field in the JSON files containing the targets.
            train_file: The JSON file to use when training.
            val_file: The JSON file to use when validating.
            test_file: The JSON file to use when testing.
            predict_file: The JSON file to use when predicting.
            sampling_rate: Sampling rate to use when loading the audio files.
            field: The field that holds the data in the JSON file.
            input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
            transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
            transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
            data_module_kwargs: Additional keyword arguments to provide to the
                :class:`~flash.core.data.data_module.DataModule` constructor.

        Returns:
            The constructed :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData`.

        Examples
        ________

        .. testsetup::

            >>> import numpy as np
            >>> from pandas import DataFrame
            >>> import soundfile as sf
            >>> samplerate = 1000
            >>> data = np.random.uniform(-1, 1, size=(samplerate, 2))
            >>> _ = [sf.write(f"speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> _ = [sf.write(f"predict_speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> DataFrame.from_dict({
            ...     "speech_files": ["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...     "targets": ["some speech", "some other speech", "some more speech"],
            ... }).to_json("train_data.json", orient="records", lines=True)
            >>> DataFrame.from_dict({
            ...     "speech_files": ["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ... }).to_json("predict_data.json", orient="records", lines=True)

        The file ``train_data.json`` contains the following:

        .. code-block::

            {"speech_files":"speech_1.wav","targets":"some speech"}
            {"speech_files":"speech_2.wav","targets":"some other speech"}
            {"speech_files":"speech_3.wav","targets":"some more speech"}

        The file ``predict_data.json`` contains the following:

        .. code-block::

            {"speech_files":"predict_speech_1.wav"}
            {"speech_files":"predict_speech_2.wav"}
            {"speech_files":"predict_speech_3.wav"}

        .. doctest::

            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>> datamodule = SpeechRecognitionData.from_json(
            ...     "speech_files",
            ...     "targets",
            ...     train_file="train_data.json",
            ...     predict_file="predict_data.json",
            ...     batch_size=2,
            ... )  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            <BLANKLINE>
            ...
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        .. testcleanup::

            >>> import os
            >>> _ = [os.remove(f"speech_{i}.wav") for i in range(1, 4)]
            >>> _ = [os.remove(f"predict_speech_{i}.wav") for i in range(1, 4)]
            >>> os.remove("train_data.json")
            >>> os.remove("predict_data.json")
        """
        ds_kw = {"input_key": input_field, "sampling_rate": sampling_rate, "field": field}

        return cls(
            input_cls(RunningStage.TRAINING, train_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.VALIDATING, val_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.TESTING, test_file, target_key=target_field, **ds_kw),
            input_cls(RunningStage.PREDICTING, predict_file, **ds_kw),
            transform=transform,
            transform_kwargs=transform_kwargs,
            **data_module_kwargs,
        )
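    # A hedged note on the ``field`` argument of ``from_json`` above: the doctest uses
    # line-delimited records, so ``field`` stays ``None``. If the JSON instead nests its records
    # under a top-level key, e.g.
    #
    #     {"data": [{"speech_files": "speech_1.wav", "targets": "some speech"}, ...]}
    #
    # then ``field="data"`` tells the loader where the records live (``field`` is assumed to be
    # forwarded to the underlying JSON reader, mirroring Hugging Face's
    # ``datasets.load_dataset("json", ..., field=...)``).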
    @classmethod
    def from_datasets(
        cls,
        train_dataset: Optional[Dataset] = None,
        val_dataset: Optional[Dataset] = None,
        test_dataset: Optional[Dataset] = None,
        predict_dataset: Optional[Dataset] = None,
        sampling_rate: int = 16000,
        input_cls: Type[Input] = SpeechRecognitionDatasetInput,
        transform: INPUT_TRANSFORM_TYPE = InputTransform,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "SpeechRecognitionData":
        """Load the :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData` from PyTorch Dataset
        objects.

        The Dataset objects should be one of the following:

        * A PyTorch Dataset where the ``__getitem__`` returns a tuple: ``(file_path, target)``
        * A PyTorch Dataset where the ``__getitem__`` returns a dict: ``{"input": file_path, "target": target}``

        The supported file extensions are: ``.aiff``, ``.au``, ``.avr``, ``.caf``, ``.flac``, ``.mat``, ``.mat4``,
        ``.mat5``, ``.mpc2k``, ``.ogg``, ``.paf``, ``.pvf``, ``.rf64``, ``.ircam``, ``.voc``, ``.w64``, ``.wav``,
        ``.nist``, and ``.wavex``.
        To learn how to customize the transforms applied for each stage, read our
        :ref:`customizing transforms guide <customizing_transforms>`.

        Args:
            train_dataset: The Dataset to use when training.
            val_dataset: The Dataset to use when validating.
            test_dataset: The Dataset to use when testing.
            predict_dataset: The Dataset to use when predicting.
            sampling_rate: Sampling rate to use when loading the audio files.
            input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
            transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
            transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
            data_module_kwargs: Additional keyword arguments to provide to the
                :class:`~flash.core.data.data_module.DataModule` constructor.

        Returns:
            The constructed :class:`~flash.audio.speech_recognition.data.SpeechRecognitionData`.

        Examples
        ________

        .. testsetup::

            >>> import numpy as np
            >>> import soundfile as sf
            >>> samplerate = 1000
            >>> data = np.random.uniform(-1, 1, size=(samplerate, 2))
            >>> _ = [sf.write(f"speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]
            >>> _ = [sf.write(f"predict_speech_{i}.wav", data, samplerate, subtype='PCM_24') for i in range(1, 4)]

        A PyTorch Dataset where the ``__getitem__`` returns a tuple: ``(file_path, target)``:

        .. doctest::

            >>> from torch.utils.data import Dataset
            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>>
            >>> class CustomDataset(Dataset):
            ...     def __init__(self, files, targets=None):
            ...         self.files = files
            ...         self.targets = targets
            ...     def __getitem__(self, index):
            ...         if self.targets is not None:
            ...             return self.files[index], self.targets[index]
            ...         return self.files[index]
            ...     def __len__(self):
            ...         return len(self.files)
            ...
            >>>
            >>> datamodule = SpeechRecognitionData.from_datasets(
            ...     train_dataset=CustomDataset(
            ...         ["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...         ["some speech", "some other speech", "some more speech"],
            ...     ),
            ...     predict_dataset=CustomDataset(
            ...         ["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ...     ),
            ...     batch_size=2,
            ... )
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        A PyTorch Dataset where the ``__getitem__`` returns a dict: ``{"input": file_path, "target": target}``:

        .. doctest::

            >>> from torch.utils.data import Dataset
            >>> from flash import Trainer
            >>> from flash.audio import SpeechRecognitionData, SpeechRecognition
            >>>
            >>> class CustomDataset(Dataset):
            ...     def __init__(self, files, targets=None):
            ...         self.files = files
            ...         self.targets = targets
            ...     def __getitem__(self, index):
            ...         if self.targets is not None:
            ...             return {"input": self.files[index], "target": self.targets[index]}
            ...         return {"input": self.files[index]}
            ...     def __len__(self):
            ...         return len(self.files)
            ...
            >>>
            >>> datamodule = SpeechRecognitionData.from_datasets(
            ...     train_dataset=CustomDataset(
            ...         ["speech_1.wav", "speech_2.wav", "speech_3.wav"],
            ...         ["some speech", "some other speech", "some more speech"],
            ...     ),
            ...     predict_dataset=CustomDataset(
            ...         ["predict_speech_1.wav", "predict_speech_2.wav", "predict_speech_3.wav"],
            ...     ),
            ...     batch_size=2,
            ... )
            >>> model = SpeechRecognition(backbone="patrickvonplaten/wav2vec2_tiny_random_robust")
            >>> trainer = Trainer(fast_dev_run=True)
            >>> trainer.fit(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Training...
            >>> trainer.predict(model, datamodule=datamodule)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Predicting...

        .. testcleanup::

            >>> import os
            >>> _ = [os.remove(f"speech_{i}.wav") for i in range(1, 4)]
            >>> _ = [os.remove(f"predict_speech_{i}.wav") for i in range(1, 4)]
        """
        ds_kw = {"sampling_rate": sampling_rate}

        return cls(
            input_cls(RunningStage.TRAINING, train_dataset, **ds_kw),
            input_cls(RunningStage.VALIDATING, val_dataset, **ds_kw),
            input_cls(RunningStage.TESTING, test_dataset, **ds_kw),
            input_cls(RunningStage.PREDICTING, predict_dataset, **ds_kw),
            transform=transform,
            transform_kwargs=transform_kwargs,
            **data_module_kwargs,
        )
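
# A hedged sketch of the ``transform``/``transform_kwargs`` hooks shared by every classmethod
# above: pass an ``InputTransform`` subclass as ``transform`` and its constructor arguments as
# ``transform_kwargs``. The subclass, its ``max_samples`` field, and the truncation step below
# are illustrative assumptions, not part of this module:
#
#     from dataclasses import dataclass
#     from typing import Callable
#
#     @dataclass
#     class TruncateTransform(InputTransform):
#         max_samples: int = 16000  # hypothetical knob, supplied via ``transform_kwargs``
#
#         def per_sample_transform(self) -> Callable:
#             def truncate(sample):
#                 # e.g. clip the loaded waveform to at most ``self.max_samples`` samples
#                 return sample
#             return truncate
#
#     datamodule = SpeechRecognitionData.from_files(
#         train_files=["speech_1.wav"],
#         train_targets=["some speech"],
#         transform=TruncateTransform,
#         transform_kwargs={"max_samples": 8000},
#         batch_size=1,
#     )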
