Shortcuts

Source code for flash.text.seq2seq.core.input

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional

import flash
from flash.core.data.io.input import DataKeys, Input
from flash.core.data.utilities.paths import PATH_TYPE
from flash.core.utilities.imports import _TEXT_AVAILABLE, requires

if _TEXT_AVAILABLE:
    from datasets import Dataset, load_dataset
else:
    Dataset = object


class Seq2SeqInputBase(Input):
    """Shared loading logic for seq2seq inputs backed by a Hugging Face ``Dataset``.

    Subclasses build a ``Dataset`` from their own source (CSV, JSON, lists, ...) and delegate to
    :meth:`load_data` here to normalise the column layout.
    """

    @requires("text")
    def load_data(
        self,
        hf_dataset: Dataset,
        input_key: str,
        target_key: Optional[str] = None,
    ) -> Dataset:
        """Keep only the input / target columns and rename them to the Flash data keys.

        Args:
            hf_dataset: The dataset to normalise.
            input_key: Name of the column holding the source text. Renamed to ``DataKeys.INPUT``.
            target_key: Name of the column holding the target text, if any. When it is present in the
                dataset it is renamed to ``DataKeys.TARGET``; when it is ``None`` or absent, no target
                column is produced.

        Returns:
            The dataset restricted to the (renamed) input and target columns. When ``flash._IS_TESTING``
            is set, a plain ``list`` holding at most the first 40 samples is returned instead.
        """
        # Remove extra columns. ``remove_columns`` is documented to take a str or a list of str, so
        # build a list (in column order) rather than passing a set.
        wanted_columns = (input_key, target_key)
        extra_columns = [name for name in hf_dataset.column_names if name not in wanted_columns]
        hf_dataset = hf_dataset.remove_columns(extra_columns)

        if input_key != DataKeys.INPUT:
            hf_dataset = hf_dataset.rename_column(input_key, DataKeys.INPUT)

        # The target column is optional: only rename it when it actually exists.
        if target_key in hf_dataset.column_names and target_key != DataKeys.TARGET:
            hf_dataset = hf_dataset.rename_column(target_key, DataKeys.TARGET)

        if flash._IS_TESTING:
            # NOTE: the subset is materialised as a list of samples (not a ``Dataset``).
            # Clamp to the dataset length so small datasets do not request out-of-range indices.
            num_samples = min(40, len(hf_dataset))
            hf_dataset = list(hf_dataset.select(range(num_samples), keep_in_memory=True))

        return hf_dataset
class Seq2SeqCSVInput(Seq2SeqInputBase):
    """Seq2seq input that reads its samples from a CSV file."""

    @requires("text")
    def load_data(
        self,
        csv_file: PATH_TYPE,
        input_key: str,
        target_key: Optional[str] = None,
    ) -> Dataset:
        """Load ``csv_file`` through ``load_dataset("csv", ...)`` and normalise its columns.

        Args:
            csv_file: Path to the CSV file; it is converted with ``str`` before loading.
            input_key: Name of the column holding the source text.
            target_key: Name of the column holding the target text, if any.

        Returns:
            Whatever the base class ``load_data`` returns for the loaded ``"data"`` split.
        """
        # The file is registered under a single split named "data" and read back from it below.
        data_files = {"data": str(csv_file)}
        loaded_splits = load_dataset("csv", data_files=data_files)
        return super().load_data(loaded_splits["data"], input_key, target_key)
class Seq2SeqJSONInput(Seq2SeqInputBase):
    """Seq2seq input that reads its samples from a JSON file."""

    @requires("text")
    def load_data(
        self,
        json_file: PATH_TYPE,
        field: str,
        input_key: str,
        target_key: Optional[str] = None,
    ) -> Dataset:
        """Load ``json_file`` through ``load_dataset("json", ...)`` and normalise its columns.

        Args:
            json_file: Path to the JSON file; it is converted with ``str`` before loading.
            field: Forwarded unchanged as the ``field`` argument of ``load_dataset``.
            input_key: Name of the column holding the source text.
            target_key: Name of the column holding the target text, if any.

        Returns:
            Whatever the base class ``load_data`` returns for the loaded ``"data"`` split.
        """
        # The file is registered under a single split named "data" and read back from it below.
        data_files = {"data": str(json_file)}
        loaded_splits = load_dataset("json", data_files=data_files, field=field)
        return super().load_data(loaded_splits["data"], input_key, target_key)
class Seq2SeqListInput(Seq2SeqInputBase):
    """Seq2seq input built from in-memory lists of strings."""

    @requires("text")
    def load_data(
        self,
        inputs: List[str],
        targets: Optional[List[str]] = None,
    ) -> Dataset:
        """Wrap the given lists in a ``Dataset`` and normalise it via the base class.

        Args:
            inputs: The source texts, stored under ``DataKeys.INPUT``.
            targets: The target texts, stored under ``DataKeys.TARGET`` when provided.

        Returns:
            Whatever the base class ``load_data`` returns for the constructed dataset.
        """
        columns = {DataKeys.INPUT: inputs}
        if targets is not None:
            columns[DataKeys.TARGET] = targets
        hf_dataset = Dataset.from_dict(columns)

        # The columns already carry the Flash key names, so those names are passed as the keys.
        return super().load_data(hf_dataset, DataKeys.INPUT, DataKeys.TARGET)

© Copyright 2020-2021, PyTorch Lightning. Revision 8e9123c7.

Built with Sphinx using a theme provided by Read the Docs.
Read the Docs v: 0.7.1
Versions
latest
stable
0.7.1
0.7.0
0.6.0
0.5.2
0.5.1
0.5.0
0.4.0
0.3.2
0.3.1
0.3.0
0.2.3
0.2.2
0.2.1
0.2.0
0.1.0post1
docs-fix_typing
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.