Shortcuts

Source code for flash.video.classification.data

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union

import pandas as pd
import torch
from torch.utils.data import Sampler

from flash.core.data.data_module import DataModule
from flash.core.data.io.input import Input
from flash.core.data.io.input_transform import INPUT_TRANSFORM_TYPE
from flash.core.data.utilities.classification import TargetFormatter
from flash.core.data.utilities.paths import PATH_TYPE
from flash.core.integrations.labelstudio.input import _parse_labelstudio_arguments, LabelStudioVideoClassificationInput
from flash.core.utilities.imports import (
    _FIFTYONE_AVAILABLE,
    _PYTORCHVIDEO_AVAILABLE,
    _VIDEO_EXTRAS_TESTING,
    _VIDEO_TESTING,
    requires,
)
from flash.core.utilities.stages import RunningStage
from flash.video.classification.input import (
    VideoClassificationCSVInput,
    VideoClassificationCSVPredictInput,
    VideoClassificationDataFrameInput,
    VideoClassificationDataFramePredictInput,
    VideoClassificationFiftyOneInput,
    VideoClassificationFiftyOnePredictInput,
    VideoClassificationFilesInput,
    VideoClassificationFoldersInput,
    VideoClassificationPathsPredictInput,
)
from flash.video.classification.input_transform import VideoClassificationInputTransform

if _FIFTYONE_AVAILABLE:
    SampleCollection = "fiftyone.core.collections.SampleCollection"
else:
    SampleCollection = None

if _PYTORCHVIDEO_AVAILABLE:
    from pytorchvideo.data.clip_sampling import ClipSampler
else:
    ClipSampler = None

# Skip doctests if requirements aren't available
__doctest_skip__ = []
if not _VIDEO_TESTING:
    __doctest_skip__ += [
        "VideoClassificationData",
        "VideoClassificationData.from_files",
        "VideoClassificationData.from_folders",
        "VideoClassificationData.from_data_frame",
        "VideoClassificationData.from_csv",
    ]
if not _VIDEO_EXTRAS_TESTING:
    __doctest_skip__ += ["VideoClassificationData.from_fiftyone"]


[docs]class VideoClassificationData(DataModule): """The ``VideoClassificationData`` class is a :class:`~flash.core.data.data_module.DataModule` with a set of classmethods for loading data for video classification.""" input_transform_cls = VideoClassificationInputTransform
[docs] @classmethod def from_files( cls, train_files: Optional[Sequence[str]] = None, train_targets: Optional[Sequence[Any]] = None, val_files: Optional[Sequence[str]] = None, val_targets: Optional[Sequence[Any]] = None, test_files: Optional[Sequence[str]] = None, test_targets: Optional[Sequence[Any]] = None, predict_files: Optional[Sequence[str]] = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, target_formatter: Optional[TargetFormatter] = None, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", input_cls: Type[Input] = VideoClassificationFilesInput, predict_input_cls: Type[Input] = VideoClassificationPathsPredictInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs, ) -> "VideoClassificationData": """Load the :class:`~flash.video.classification.data.VideoClassificationData` from lists of files and corresponding lists of targets. The supported file extensions are: ``.mp4``, and ``.avi``. The targets can be in any of our :ref:`supported classification target formats <formatting_classification_targets>`. To learn how to customize the transforms applied for each stage, read our :ref:`customizing transforms guide <customizing_transforms>`. Args: train_files: The list of video files to use when training. train_targets: The list of targets to use when training. val_files: The list of video files to use when validating. val_targets: The list of targets to use when validating. test_files: The list of video files to use when testing. test_targets: The list of targets to use when testing. predict_files: The list of video files to use when predicting. train_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when training. val_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when validating. test_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when testing. predict_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when predicting. target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to control how targets are handled. See :ref:`formatting_classification_targets` for more details. clip_sampler: The clip sampler to use. One of: ``"uniform"``, ``"random"``, ``"constant_clips_per_video"``. clip_duration: The duration of clips to sample. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: The decoder to use to decode videos. One of: ``"pyav"``, ``"torchvision"``. Not used for frame videos. input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data. predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data. transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms. data_module_kwargs: Additional keyword arguments to provide to the :class:`~flash.core.data.data_module.DataModule` constructor. Returns: The constructed :class:`~flash.video.classification.data.VideoClassificationData`. Examples ________ .. testsetup:: >>> import torch >>> from torchvision import io >>> data = torch.randint(255, (10, 64, 64, 3)) >>> _ = [io.write_video(f"video_{i}.mp4", data, 5, "libx264rgb", {"crf": "0"}) for i in range(1, 4)] >>> _ = [io.write_video(f"predict_video_{i}.mp4", data, 5, "libx264rgb", {"crf": "0"}) for i in range(1, 4)] .. doctest:: >>> from flash import Trainer >>> from flash.video import VideoClassifier, VideoClassificationData >>> datamodule = VideoClassificationData.from_files( ... train_files=["video_1.mp4", "video_2.mp4", "video_3.mp4"], ... train_targets=["cat", "dog", "cat"], ... predict_files=["predict_video_1.mp4", "predict_video_2.mp4", "predict_video_3.mp4"], ... transform_kwargs=dict(image_size=(244, 244)), ... batch_size=2, ... ) >>> datamodule.num_classes 2 >>> datamodule.labels ['cat', 'dog'] >>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes) >>> trainer = Trainer(fast_dev_run=True) >>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Training... >>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Predicting... .. testcleanup:: >>> import os >>> _ = [os.remove(f"video_{i}.mp4") for i in range(1, 4)] >>> _ = [os.remove(f"predict_video_{i}.mp4") for i in range(1, 4)] """ ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, decode_audio=decode_audio, decoder=decoder, ) train_input = input_cls( RunningStage.TRAINING, train_files, train_targets, transform=train_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ) target_formatter = getattr(train_input, "target_formatter", None) return cls( train_input, input_cls( RunningStage.VALIDATING, val_files, val_targets, transform=val_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), input_cls( RunningStage.TESTING, test_files, test_targets, transform=test_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), predict_input_cls(RunningStage.PREDICTING, predict_files, transform=predict_transform, **ds_kw), **data_module_kwargs, )
[docs] @classmethod def from_folders( cls, train_folder: Optional[str] = None, val_folder: Optional[str] = None, test_folder: Optional[str] = None, predict_folder: Optional[str] = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, target_formatter: Optional[TargetFormatter] = None, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", input_cls: Type[Input] = VideoClassificationFoldersInput, predict_input_cls: Type[Input] = VideoClassificationPathsPredictInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs, ) -> "VideoClassificationData": """Load the :class:`~flash.video.classification.data.VideoClassificationData` from folders containing videos. The supported file extensions are: ``.mp4``, and ``.avi``. For train, test, and validation data, the folders are expected to contain a sub-folder for each class. Here's the required structure: .. code-block:: train_folder ├── cat │ ├── video_1.mp4 │ ├── video_3.mp4 │ ... └── dog ├── video_2.mp4 ... For prediction, the folder is expected to contain the files for inference, like this: .. code-block:: predict_folder ├── predict_video_1.mp4 ├── predict_video_2.mp4 ├── predict_video_3.mp4 ... To learn how to customize the transforms applied for each stage, read our :ref:`customizing transforms guide <customizing_transforms>`. Args: train_folder: The folder containing videos to use when training. val_folder: The folder containing videos to use when validating. test_folder: The folder containing videos to use when testing. predict_folder: The folder containing videos to use when predicting. train_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when training. val_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when validating. test_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when testing. predict_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when predicting. target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to control how targets are handled. See :ref:`formatting_classification_targets` for more details. clip_sampler: The clip sampler to use. One of: ``"uniform"``, ``"random"``, ``"constant_clips_per_video"``. clip_duration: The duration of clips to sample. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: The decoder to use to decode videos. One of: ``"pyav"``, ``"torchvision"``. Not used for frame videos. input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data. predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data. transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms. data_module_kwargs: Additional keyword arguments to provide to the :class:`~flash.core.data.data_module.DataModule` constructor. Returns: The constructed :class:`~flash.video.classification.data.VideoClassificationData`. Examples ________ .. testsetup:: >>> import os >>> import torch >>> from torchvision import io >>> data = torch.randint(255, (10, 64, 64, 3)) >>> os.makedirs(os.path.join("train_folder", "cat"), exist_ok=True) >>> os.makedirs(os.path.join("train_folder", "dog"), exist_ok=True) >>> os.makedirs("predict_folder", exist_ok=True) >>> _ = [io.write_video( ... os.path.join("train_folder", label, f"video_{i + 1}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i, label in enumerate(["cat", "dog", "cat"])] >>> _ = [ ... io.write_video( ... os.path.join("predict_folder", f"predict_video_{i}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i in range(1, 4) ... ] .. doctest:: >>> from flash import Trainer >>> from flash.video import VideoClassifier, VideoClassificationData >>> datamodule = VideoClassificationData.from_folders( ... train_folder="train_folder", ... predict_folder="predict_folder", ... transform_kwargs=dict(image_size=(244, 244)), ... batch_size=2, ... ) >>> datamodule.num_classes 2 >>> datamodule.labels ['cat', 'dog'] >>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes) >>> trainer = Trainer(fast_dev_run=True) >>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Training... >>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Predicting... .. testcleanup:: >>> import shutil >>> shutil.rmtree("train_folder") >>> shutil.rmtree("predict_folder") """ ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, decode_audio=decode_audio, decoder=decoder, ) train_input = input_cls( RunningStage.TRAINING, train_folder, transform=train_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ) target_formatter = getattr(train_input, "target_formatter", None) return cls( train_input, input_cls( RunningStage.VALIDATING, val_folder, transform=val_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), input_cls( RunningStage.TESTING, test_folder, transform=test_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), predict_input_cls(RunningStage.PREDICTING, predict_folder, transform=predict_transform, **ds_kw), **data_module_kwargs, )
[docs] @classmethod def from_data_frame( cls, input_field: str, target_fields: Optional[Union[str, Sequence[str]]] = None, train_data_frame: Optional[pd.DataFrame] = None, train_videos_root: Optional[str] = None, train_resolver: Optional[Callable[[str, str], str]] = None, val_data_frame: Optional[pd.DataFrame] = None, val_videos_root: Optional[str] = None, val_resolver: Optional[Callable[[str, str], str]] = None, test_data_frame: Optional[pd.DataFrame] = None, test_videos_root: Optional[str] = None, test_resolver: Optional[Callable[[str, str], str]] = None, predict_data_frame: Optional[pd.DataFrame] = None, predict_videos_root: Optional[str] = None, predict_resolver: Optional[Callable[[str, str], str]] = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, target_formatter: Optional[TargetFormatter] = None, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", input_cls: Type[Input] = VideoClassificationDataFrameInput, predict_input_cls: Type[Input] = VideoClassificationDataFramePredictInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs: Any, ) -> "VideoClassificationData": """Load the :class:`~flash.video.classification.data.VideoClassificationData` from pandas DataFrame objects containing video file paths and their corresponding targets. Input video file paths will be extracted from the ``input_field`` in the DataFrame. The supported file extensions are: ``.mp4``, and ``.avi``. The targets will be extracted from the ``target_fields`` in the DataFrame and can be in any of our :ref:`supported classification target formats <formatting_classification_targets>`. To learn how to customize the transforms applied for each stage, read our :ref:`customizing transforms guide <customizing_transforms>`. Args: input_field: The field (column name) in the DataFrames containing the video file paths. target_fields: The field (column name) or list of fields in the DataFrames containing the targets. train_data_frame: The DataFrame to use when training. train_videos_root: The root directory containing train videos. train_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. val_data_frame: The DataFrame to use when validating. val_videos_root: The root directory containing validation videos. val_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. test_data_frame: The DataFrame to use when testing. test_videos_root: The root directory containing test videos. test_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. predict_data_frame: The DataFrame to use when predicting. predict_videos_root: The root directory containing predict videos. predict_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. train_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when training. val_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when validating. test_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when testing. predict_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when predicting. target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to control how targets are handled. See :ref:`formatting_classification_targets` for more details. clip_sampler: The clip sampler to use. One of: ``"uniform"``, ``"random"``, ``"constant_clips_per_video"``. clip_duration: The duration of clips to sample. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: The decoder to use to decode videos. One of: ``"pyav"``, ``"torchvision"``. Not used for frame videos. input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data. predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data. transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms. data_module_kwargs: Additional keyword arguments to provide to the :class:`~flash.core.data.data_module.DataModule` constructor. Returns: The constructed :class:`~flash.video.classification.data.VideoClassificationData`. Examples ________ .. testsetup:: >>> import os >>> import torch >>> from torchvision import io >>> data = torch.randint(255, (10, 64, 64, 3)) >>> os.makedirs("train_folder", exist_ok=True) >>> os.makedirs("predict_folder", exist_ok=True) >>> _ = [io.write_video( ... os.path.join("train_folder", f"video_{i}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i in range(1, 4)] >>> _ = [ ... io.write_video( ... os.path.join("predict_folder", f"predict_video_{i}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i in range(1, 4) ... ] .. doctest:: >>> from pandas import DataFrame >>> from flash import Trainer >>> from flash.video import VideoClassifier, VideoClassificationData >>> train_data_frame = DataFrame.from_dict( ... { ... "videos": ["video_1.mp4", "video_2.mp4", "video_3.mp4"], ... "targets": ["cat", "dog", "cat"], ... } ... ) >>> predict_data_frame = DataFrame.from_dict( ... { ... "videos": ["predict_video_1.mp4", "predict_video_2.mp4", "predict_video_3.mp4"], ... } ... ) >>> datamodule = VideoClassificationData.from_data_frame( ... "videos", ... "targets", ... train_data_frame=train_data_frame, ... train_videos_root="train_folder", ... predict_data_frame=predict_data_frame, ... predict_videos_root="predict_folder", ... transform_kwargs=dict(image_size=(244, 244)), ... batch_size=2, ... ) >>> datamodule.num_classes 2 >>> datamodule.labels ['cat', 'dog'] >>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes) >>> trainer = Trainer(fast_dev_run=True) >>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Training... >>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Predicting... .. testcleanup:: >>> import shutil >>> shutil.rmtree("train_folder") >>> shutil.rmtree("predict_folder") >>> del train_data_frame >>> del predict_data_frame """ ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, decode_audio=decode_audio, decoder=decoder, ) train_data = (train_data_frame, input_field, target_fields, train_videos_root, train_resolver) val_data = (val_data_frame, input_field, target_fields, val_videos_root, val_resolver) test_data = (test_data_frame, input_field, target_fields, test_videos_root, test_resolver) predict_data = (predict_data_frame, input_field, predict_videos_root, predict_resolver) train_input = input_cls( RunningStage.TRAINING, *train_data, transform=train_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ) target_formatter = getattr(train_input, "target_formatter", None) return cls( train_input, input_cls( RunningStage.VALIDATING, *val_data, transform=val_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), input_cls( RunningStage.TESTING, *test_data, transform=test_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), predict_input_cls(RunningStage.PREDICTING, *predict_data, transform=predict_transform, **ds_kw), **data_module_kwargs, )
[docs] @classmethod def from_csv( cls, input_field: str, target_fields: Optional[Union[str, List[str]]] = None, train_file: Optional[PATH_TYPE] = None, train_videos_root: Optional[PATH_TYPE] = None, train_resolver: Optional[Callable[[PATH_TYPE, Any], PATH_TYPE]] = None, val_file: Optional[PATH_TYPE] = None, val_videos_root: Optional[PATH_TYPE] = None, val_resolver: Optional[Callable[[PATH_TYPE, Any], PATH_TYPE]] = None, test_file: Optional[str] = None, test_videos_root: Optional[str] = None, test_resolver: Optional[Callable[[PATH_TYPE, Any], PATH_TYPE]] = None, predict_file: Optional[str] = None, predict_videos_root: Optional[str] = None, predict_resolver: Optional[Callable[[PATH_TYPE, Any], PATH_TYPE]] = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, target_formatter: Optional[TargetFormatter] = None, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", input_cls: Type[Input] = VideoClassificationCSVInput, predict_input_cls: Type[Input] = VideoClassificationCSVPredictInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs: Any, ) -> "VideoClassificationData": """Load the :class:`~flash.video.classification.data.VideoClassificationData` from CSV files containing video file paths and their corresponding targets. Input video file paths will be extracted from the ``input_field`` column in the CSV files. The supported file extensions are: ``.mp4``, and ``.avi``. The targets will be extracted from the ``target_fields`` in the CSV files and can be in any of our :ref:`supported classification target formats <formatting_classification_targets>`. To learn how to customize the transforms applied for each stage, read our :ref:`customizing transforms guide <customizing_transforms>`. Args: input_field: The field (column name) in the CSV files containing the video file paths. target_fields: The field (column name) or list of fields in the CSV files containing the targets. train_file: The CSV file to use when training. train_videos_root: The root directory containing train videos. train_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. val_file: The CSV file to use when validating. val_videos_root: The root directory containing validation videos. val_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. test_file: The CSV file to use when testing. test_videos_root: The root directory containing test videos. test_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. predict_file: The CSV file to use when predicting. predict_videos_root: The root directory containing predict videos. predict_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a video file path. train_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when training. val_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when validating. test_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when testing. predict_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when predicting. target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to control how targets are handled. See :ref:`formatting_classification_targets` for more details. clip_sampler: The clip sampler to use. One of: ``"uniform"``, ``"random"``, ``"constant_clips_per_video"``. clip_duration: The duration of clips to sample. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: The decoder to use to decode videos. One of: ``"pyav"``, ``"torchvision"``. Not used for frame videos. input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data. predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data. transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms. data_module_kwargs: Additional keyword arguments to provide to the :class:`~flash.core.data.data_module.DataModule` constructor. Returns: The constructed :class:`~flash.video.classification.data.VideoClassificationData`. Examples ________ .. testsetup:: >>> import os >>> import torch >>> from torchvision import io >>> from pandas import DataFrame >>> data = torch.randint(255, (10, 64, 64, 3)) >>> os.makedirs("train_folder", exist_ok=True) >>> os.makedirs("predict_folder", exist_ok=True) >>> _ = [io.write_video( ... os.path.join("train_folder", f"video_{i}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i in range(1, 4)] >>> _ = [ ... io.write_video( ... os.path.join("predict_folder", f"predict_video_{i}.mp4"), data, 5, "libx264rgb", {"crf": "0"} ... ) for i in range(1, 4) ... ] >>> DataFrame.from_dict({ ... "videos": ["video_1.mp4", "video_2.mp4", "video_3.mp4"], ... "targets": ["cat", "dog", "cat"], ... }).to_csv("train_data.csv", index=False) >>> DataFrame.from_dict({ ... "videos": ["predict_video_1.mp4", "predict_video_2.mp4", "predict_video_3.mp4"], ... }).to_csv("predict_data.csv", index=False) The file ``train_data.csv`` contains the following: .. code-block:: videos,targets video_1.mp4,cat video_2.mp4,dog video_3.mp4,cat The file ``predict_data.csv`` contains the following: .. code-block:: videos predict_video_1.mp4 predict_video_2.mp4 predict_video_3.mp4 .. doctest:: >>> from flash import Trainer >>> from flash.video import VideoClassifier, VideoClassificationData >>> datamodule = VideoClassificationData.from_csv( ... "videos", ... "targets", ... train_file="train_data.csv", ... train_videos_root="train_folder", ... predict_file="predict_data.csv", ... predict_videos_root="predict_folder", ... transform_kwargs=dict(image_size=(244, 244)), ... batch_size=2, ... ) >>> datamodule.num_classes 2 >>> datamodule.labels ['cat', 'dog'] >>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes) >>> trainer = Trainer(fast_dev_run=True) >>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Training... >>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Predicting... .. testcleanup:: >>> import shutil >>> shutil.rmtree("train_folder") >>> shutil.rmtree("predict_folder") >>> os.remove("train_data.csv") >>> os.remove("predict_data.csv") """ ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, decode_audio=decode_audio, decoder=decoder, ) train_data = (train_file, input_field, target_fields, train_videos_root, train_resolver) val_data = (val_file, input_field, target_fields, val_videos_root, val_resolver) test_data = (test_file, input_field, target_fields, test_videos_root, test_resolver) predict_data = (predict_file, input_field, predict_videos_root, predict_resolver) train_input = input_cls( RunningStage.TRAINING, *train_data, transform=train_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ) target_formatter = getattr(train_input, "target_formatter", None) return cls( train_input, input_cls( RunningStage.VALIDATING, *val_data, transform=val_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), input_cls( RunningStage.TESTING, *test_data, transform=test_transform, video_sampler=video_sampler, target_formatter=target_formatter, **ds_kw, ), predict_input_cls(RunningStage.PREDICTING, *predict_data, transform=predict_transform, **ds_kw), **data_module_kwargs, )
[docs] @classmethod @requires("fiftyone") def from_fiftyone( cls, train_dataset: Optional[SampleCollection] = None, val_dataset: Optional[SampleCollection] = None, test_dataset: Optional[SampleCollection] = None, predict_dataset: Optional[SampleCollection] = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, target_formatter: Optional[TargetFormatter] = None, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", label_field: str = "ground_truth", input_cls: Type[Input] = VideoClassificationFiftyOneInput, predict_input_cls: Type[Input] = VideoClassificationFiftyOnePredictInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs, ) -> "VideoClassificationData": """Load the :class:`~flash.video.classification.data.VideoClassificationData` from FiftyOne ``SampleCollection`` objects. The supported file extensions are: ``.mp4``, and ``.avi``. The targets will be extracted from the ``label_field`` in the ``SampleCollection`` objects and can be in any of our :ref:`supported classification target formats <formatting_classification_targets>`. To learn how to customize the transforms applied for each stage, read our :ref:`customizing transforms guide <customizing_transforms>`. Args: train_dataset: The ``SampleCollection`` to use when training. val_dataset: The ``SampleCollection`` to use when validating. test_dataset: The ``SampleCollection`` to use when testing. predict_dataset: The ``SampleCollection`` to use when predicting. label_field: The field in the ``SampleCollection`` objects containing the targets. train_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when training. val_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when validating. test_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when testing. predict_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use when predicting. target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to control how targets are handled. See :ref:`formatting_classification_targets` for more details. clip_sampler: The clip sampler to use. One of: ``"uniform"``, ``"random"``, ``"constant_clips_per_video"``. clip_duration: The duration of clips to sample. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: The decoder to use to decode videos. One of: ``"pyav"``, ``"torchvision"``. Not used for frame videos. input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data. predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data. transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms. data_module_kwargs: Additional keyword arguments to provide to the :class:`~flash.core.data.data_module.DataModule` constructor. Returns: The constructed :class:`~flash.video.classification.data.VideoClassificationData`. Examples ________ .. testsetup:: >>> import torch >>> from torchvision import io >>> data = torch.randint(255, (10, 64, 64, 3)) >>> _ = [io.write_video(f"video_{i}.mp4", data, 5, "libx264rgb", {"crf": "0"}) for i in range(1, 4)] >>> _ = [io.write_video(f"predict_video_{i}.mp4", data, 5, "libx264rgb", {"crf": "0"}) for i in range(1, 4)] .. doctest:: >>> import fiftyone as fo >>> from flash import Trainer >>> from flash.video import VideoClassifier, VideoClassificationData >>> train_dataset = fo.Dataset.from_videos( ... ["video_1.mp4", "video_2.mp4", "video_3.mp4"] ... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE <BLANKLINE> ... >>> samples = [train_dataset[filepath] for filepath in train_dataset.values("filepath")] >>> for sample, label in zip(samples, ["cat", "dog", "cat"]): ... sample["ground_truth"] = fo.Classification(label=label) ... sample.save() ... >>> predict_dataset = fo.Dataset.from_images( ... ["predict_video_1.mp4", "predict_video_2.mp4", "predict_video_3.mp4"] ... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE <BLANKLINE> ... >>> datamodule = VideoClassificationData.from_fiftyone( ... train_dataset=train_dataset, ... predict_dataset=predict_dataset, ... transform_kwargs=dict(image_size=(244, 244)), ... batch_size=2, ... ) >>> datamodule.num_classes 2 >>> datamodule.labels ['cat', 'dog'] >>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes) >>> trainer = Trainer(fast_dev_run=True) >>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Training... >>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Predicting... .. testcleanup:: >>> import os >>> _ = [os.remove(f"video_{i}.mp4") for i in range(1, 4)] >>> _ = [os.remove(f"predict_video_{i}.mp4") for i in range(1, 4)] >>> del train_dataset >>> del predict_dataset """ ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, decode_audio=decode_audio, decoder=decoder, ) train_input = input_cls( RunningStage.TRAINING, train_dataset, transform=train_transform, video_sampler=video_sampler, label_field=label_field, target_formatter=target_formatter, **ds_kw, ) target_formatter = getattr(train_input, "target_formatter", None) return cls( train_input, input_cls( RunningStage.VALIDATING, val_dataset, transform=val_transform, video_sampler=video_sampler, label_field=label_field, target_formatter=target_formatter, **ds_kw, ), input_cls( RunningStage.TESTING, test_dataset, transform=test_transform, video_sampler=video_sampler, label_field=label_field, target_formatter=target_formatter, **ds_kw, ), predict_input_cls(RunningStage.PREDICTING, predict_dataset, transform=predict_transform, **ds_kw), **data_module_kwargs, )
[docs] @classmethod def from_labelstudio( cls, export_json: str = None, train_export_json: str = None, val_export_json: str = None, test_export_json: str = None, predict_export_json: str = None, data_folder: str = None, train_data_folder: str = None, val_data_folder: str = None, test_data_folder: str = None, predict_data_folder: str = None, train_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, test_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, predict_transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform, val_split: Optional[float] = None, multi_label: Optional[bool] = False, clip_sampler: Union[str, "ClipSampler"] = "random", clip_duration: float = 2, clip_sampler_kwargs: Dict[str, Any] = None, video_sampler: Type[Sampler] = torch.utils.data.RandomSampler, decode_audio: bool = False, decoder: str = "pyav", input_cls: Type[Input] = LabelStudioVideoClassificationInput, transform_kwargs: Optional[Dict] = None, **data_module_kwargs, ) -> "VideoClassificationData": """Creates a :class:`~flash.core.data.data_module.DataModule` object from the given export file and data directory using the :class:`~flash.core.data.io.input.Input` of name :attr:`~flash.core.data.io.input.InputFormat.FOLDERS` from the passed or constructed :class:`~flash.core.data.io.input_transform.InputTransform`. Args: export_json: path to label studio export file train_export_json: path to label studio export file for train set, overrides export_json if specified val_export_json: path to label studio export file for validation test_export_json: path to label studio export file for test predict_export_json: path to label studio export file for predict data_folder: path to label studio data folder train_data_folder: path to label studio data folder for train data set, overrides data_folder if specified val_data_folder: path to label studio data folder for validation data test_data_folder: path to label studio data folder for test data predict_data_folder: path to label studio data folder for predict data train_transform: The dictionary of transforms to use during training which maps :class:`~flash.core.data.io.input_transform.InputTransform` hook names to callable transforms. val_transform: The dictionary of transforms to use during validation which maps :class:`~flash.core.data.io.input_transform.InputTransform` hook names to callable transforms. test_transform: The dictionary of transforms to use during testing which maps :class:`~flash.core.data.io.input_transform.InputTransform` hook names to callable transforms. predict_transform: The dictionary of transforms to use during predicting which maps :class:`~flash.core.data.io.input_transform.InputTransform` hook names to callable transforms. data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the :class:`~flash.core.data.data_module.DataModule`. input_transform: The :class:`~flash.core.data.io.input_transform.InputTransform` to pass to the :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.input_transform_cls`` will be constructed and used. val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. multi_label: Whether the label are multi encoded. clip_sampler: Defines how clips should be sampled from each video. clip_duration: Defines how long the sampled clips should be for each video. clip_sampler_kwargs: Additional keyword arguments to use when constructing the clip sampler. video_sampler: Sampler for the internal video container. This defines the order videos are decoded and, if necessary, the distributed split. decode_audio: If True, also decode audio from video. decoder: Defines what type of decoder used to decode a video. data_module_kwargs: Additional keyword arguments to use when constructing the datamodule. Returns: The constructed data module. Examples:: data_module = DataModule.from_labelstudio( export_json='project.json', data_folder='label-studio/media/upload', val_split=0.8, ) """ train_data, val_data, test_data, predict_data = _parse_labelstudio_arguments( export_json=export_json, train_export_json=train_export_json, val_export_json=val_export_json, test_export_json=test_export_json, predict_export_json=predict_export_json, data_folder=data_folder, train_data_folder=train_data_folder, val_data_folder=val_data_folder, test_data_folder=test_data_folder, predict_data_folder=predict_data_folder, val_split=val_split, multi_label=multi_label, ) ds_kw = dict( transform_kwargs=transform_kwargs, input_transforms_registry=cls.input_transforms_registry, clip_sampler=clip_sampler, clip_duration=clip_duration, clip_sampler_kwargs=clip_sampler_kwargs, video_sampler=video_sampler, decode_audio=decode_audio, decoder=decoder, ) train_input = input_cls(RunningStage.TRAINING, train_data, transform=train_transform, **ds_kw) ds_kw["parameters"] = getattr(train_input, "parameters", None) return cls( train_input, input_cls(RunningStage.VALIDATING, val_data, transform=val_transform, **ds_kw), input_cls(RunningStage.TESTING, test_data, transform=test_transform, **ds_kw), input_cls(RunningStage.PREDICTING, predict_data, transform=predict_transform, **ds_kw), **data_module_kwargs, )

© Copyright 2020-2021, PyTorch Lightning. Revision 8e9123c7.

Built with Sphinx using a theme provided by Read the Docs.
Read the Docs v: 0.7.1
Versions
latest
stable
0.7.1
0.7.0
0.6.0
0.5.2
0.5.1
0.5.0
0.4.0
0.3.2
0.3.1
0.3.0
0.2.3
0.2.2
0.2.1
0.2.0
0.1.0post1
docs-fix_typing
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.