
Commit 3da4519

kylesayrs and dsikka authored
Vision Datasets (#943)
* clean up CustomDataset
* chchchchanges
* wip: rename to processor, going through tests
* remove labels from calibration dataset rather than assuming that all tokenized datasets should not be given labels
* cleanup
* cleanup, etc
* fix typehinting
* add typechecking imports
* remove sparseml utilities
* use in model_load
* remove use of RECIPE_FILE_NAME
* rename to RECIPE_FILE_NAME, avoid circular import
* image dataset collation
* cleanup, do not handle case where processor is None
* remove qa ignore
* add documentation
* add data collator arg
* use default factory
* validate flickr
* discover bug, tests and multimodal working
* dataset split fallbacks
* cleanup, deprecate remove_columns argument
* silently assign tokenizer to processor
* replace tokenizer with processor
* typehinting, add not-implemented error
* remove todos
* update dataset manager api in tests
* Delete examples/multimodal_vision/qwen_vl2.py
* Delete examples/multimodal_vision/mllama.py
* handle columns better
* filter_tokenizer_args
* more tests
* remove duplicate file
* better help texts
* revert data split fallbacks
* handle non-fast tokenizers
* address nits, add logging
* add back copyrights
* correctly update helptext
* do not remove prompt key
* remove prompt key
* do not process tokenized datasets, including adding labels
* remove default chat template
* add back default templates with warning

---------

Signed-off-by: Kyle Sayers <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
1 parent 384059b commit 3da4519

File tree

21 files changed: +524 -481 lines

src/llmcompressor/transformers/finetune/data/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 from .custom import CustomDataset
 from .data_args import DataTrainingArguments
 from .evolcodealpaca import EvolCodeAlpacaDataset
+from .flickr_30k import Flickr30K
 from .gsm8k import GSM8KDataset
 from .open_platypus import OpenPlatypusDataset
 from .ptb import PtbDataset

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 220 additions & 139 deletions
Large diffs are not rendered by default.

src/llmcompressor/transformers/finetune/data/c4.py

Lines changed: 9 additions & 4 deletions
@@ -1,6 +1,11 @@
 from copy import deepcopy
+from typing import TYPE_CHECKING

 from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs


 @TextGenerationDataset.register(name="c4")
@@ -13,9 +18,9 @@ class C4Dataset(TextGenerationDataset):
     :param processor: processor or tokenizer to use on dataset
     """

-    def __init__(self, data_args, split, processor):
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "allenai/c4"
-        super().__init__(
-            text_column="text", data_args=data_args, split=split, processor=processor
-        )
+        data_args.text_column = "text"
+
+        super().__init__(data_args=data_args, split=split, processor=processor)
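The c4.py change illustrates the new registration pattern: subclasses mutate a copied `data_args` (dataset id, `text_column`) and defer all loading and preprocessing to the base class. A minimal sketch of registering an additional dataset under this pattern; the registry name and dataset id below are hypothetical:

```python
from copy import deepcopy

from llmcompressor.transformers.finetune.data import TextGenerationDataset


@TextGenerationDataset.register(name="my_corpus")  # hypothetical registry name
class MyCorpusDataset(TextGenerationDataset):
    def __init__(self, data_args, split, processor):
        # copy so the caller's data_args are not mutated, mirroring C4Dataset
        data_args = deepcopy(data_args)
        data_args.dataset = "my-org/my-corpus"  # hypothetical HF dataset id
        data_args.text_column = "text"
        super().__init__(data_args=data_args, split=split, processor=processor)
```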

src/llmcompressor/transformers/finetune/data/cnn_dailymail.py

Lines changed: 11 additions & 30 deletions
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from copy import deepcopy
-from typing import Optional
+from typing import TYPE_CHECKING

 from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs


 @TextGenerationDataset.register(name="cnn_dailymail")
@@ -29,39 +33,16 @@ class CNNDailyMailDataset(TextGenerationDataset):

     SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n"

-    def __init__(self, data_args, split, processor):
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "cnn_dailymail"
         data_args.dataset_config_name = "3.0.0"

-        super().__init__(
-            text_column="text", data_args=data_args, split=split, processor=processor
-        )
-
-    def get_raw_dataset(self, cache_dir: Optional[str] = None):
-        """
-        Load the raw dataset from Hugging Face, using cached copy if available.
-        Additionally reformats the entries to fit the template.
-
-        :param cache_dir: disk location to search for cached dataset
-        :return: the requested dataset
-        """
-        raw_dataset = super().get_raw_dataset(cache_dir=cache_dir)
+        super().__init__(data_args=data_args, split=split, processor=processor)

-        def restructure_fn(sample):
-            sample["text"] = self.SAMPLE_TEMPLATE.format(
+    def dataset_template(self, sample):
+        return {
+            "text": self.SAMPLE_TEMPLATE.format(
                 article=sample["article"], highlights=sample["highlights"]
             )
-
-            return sample
-
-        raw_dataset = self.map(
-            raw_dataset,
-            function=restructure_fn,
-            batched=False,
-            remove_columns=["article", "highlights", "id"],
-            num_proc=self.data_args.preprocessing_num_workers,
-            load_from_cache_file=not self.data_args.overwrite_cache,
-            desc="Restructuring CNN/DailyMail Dataset",
-        )
-        return raw_dataset
+        }
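Here the `get_raw_dataset` override and its hand-rolled `map` call collapse into a `dataset_template` hook. The base.py diff is not rendered above, but judging from the subclasses, the base class presumably maps each sample through `dataset_template` and keeps only the columns the template returns. A sketch of that assumed behavior using Hugging Face `datasets` (the helper name is illustrative, not from the commit):

```python
from datasets import Dataset


def apply_dataset_template(dataset: Dataset, dataset_template) -> Dataset:
    # map every sample through the template; dropping the original columns
    # means only the keys the template returns (e.g. "text") survive
    return dataset.map(
        dataset_template,
        batched=False,
        remove_columns=dataset.column_names,
        desc="Applying dataset template",
    )


# toy usage with a CNN/DailyMail-shaped sample
def cnn_template(sample):
    return {
        "text": f"Article:\n{sample['article']}\n\n"
        f"### Summarization:\n{sample['highlights']}\n"
    }


toy = Dataset.from_dict(
    {"article": ["Some article."], "highlights": ["A summary."], "id": ["0"]}
)
print(apply_dataset_template(toy, cnn_template)[0]["text"])
```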

src/llmcompressor/transformers/finetune/data/custom.py

Lines changed: 1 addition & 82 deletions
@@ -11,16 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from copy import deepcopy
-from typing import Dict, List, Union
-
-from datasets.dataset_dict import Dataset, DatasetDict
-
 from llmcompressor.transformers.finetune.data import TextGenerationDataset
-from llmcompressor.transformers.utils.preprocessing_functions import (
-    PreprocessingFunctionRegistry,
-)
-from llmcompressor.utils import import_from_path


 @TextGenerationDataset.register(name="custom", alias=["json", "csv"])
@@ -36,76 +27,4 @@ class CustomDataset(TextGenerationDataset):

     """

-    def __init__(self, data_args, split, processor):
-        data_args = deepcopy(data_args)
-        super().__init__(
-            text_column=data_args.text_column,
-            data_args=data_args,
-            split=split,
-            processor=processor,
-        )
-        self.preprocessing_func = data_args.preprocessing_func
-        self.remove_columns = data_args.remove_columns
-
-    def get_raw_dataset(self, *_ignore, **__ignore) -> Union[DatasetDict, Dataset]:
-        """Get the raw dataset and apply preprocessing func if provided"""
-
-        dataset = self.data_args.dataset
-        if isinstance(dataset, DatasetDict) or isinstance(dataset, Dataset):
-            # user passed in an already instantiated dataset, just use it directly
-            raw_dataset = dataset
-        else:
-            # dataset must be loaded from file or HF Hub
-            raw_dataset = super().get_raw_dataset()
-
-        if self.preprocessing_func is not None:
-            if callable(self.preprocessing_func):
-                func = self.preprocessing_func
-            elif ":" in self.preprocessing_func:
-                # load func_name from "/path/to/file.py:func_name"
-                func = import_from_path(self.preprocessing_func)
-            else:
-                # load from the registry
-                func = PreprocessingFunctionRegistry.get_value_from_registry(
-                    name=self.preprocessing_func
-                )
-
-            raw_dataset = self.map(
-                raw_dataset,
-                function=func,
-                batched=False,
-                num_proc=self.data_args.preprocessing_num_workers,
-                desc="Applying custom func to the custom dataset",
-            )
-
-        self.remove_columns = (
-            self.remove_columns or self.get_remove_columns_from_dataset(raw_dataset)
-        )
-
-        if self.remove_columns is not None:
-            raw_dataset = self.map(
-                raw_dataset,
-                batched=True,
-                remove_columns=self.remove_columns,
-                num_proc=self.data_args.preprocessing_num_workers,
-                desc="Removing unneeded columns",
-            )
-
-        return raw_dataset
-
-    def get_remove_columns_from_dataset(
-        self, raw_dataset: Union[DatasetDict, Dataset]
-    ) -> List[str]:
-        """Remove redandant columns from the dataset for processing"""
-
-        remove_columns = raw_dataset.column_names
-        if isinstance(remove_columns, Dict):
-            remove_columns = raw_dataset[list(raw_dataset.keys())[0]].column_names
-
-        remove_columns = set(remove_columns)
-        if self.text_column in remove_columns:
-            remove_columns.remove(self.text_column)
-        if self.PROMPT_KEY in remove_columns:
-            remove_columns.remove(self.PROMPT_KEY)
-
-        return list(remove_columns)
+    pass
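With the loading and column-pruning logic moved into the shared base class, `CustomDataset` reduces to a registration stub. Per the updated `preprocessing_func` help text in data_args.py below, users can still hook in custom preprocessing in three forms. A sketch, with an illustrative function body and registry name, and other loading arguments omitted:

```python
from llmcompressor.transformers import DataTrainingArguments


def wrap_in_chat(sample):
    # hypothetical preprocessing: wrap raw text as a single user turn
    return {"text": f"<|user|>\n{sample['text']}\n<|assistant|>\n"}


# 1. a callable applied directly to each sample
data_args = DataTrainingArguments(dataset="json", preprocessing_func=wrap_in_chat)

# 2. a name registered in preprocessing_functions.py
# data_args = DataTrainingArguments(dataset="json", preprocessing_func="my_registered_func")

# 3. a path of the form /path/to/file.py:func
# data_args = DataTrainingArguments(dataset="json", preprocessing_func="/path/to/file.py:wrap_in_chat")
```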

src/llmcompressor/transformers/finetune/data/data_args.py

Lines changed: 22 additions & 8 deletions
@@ -1,5 +1,7 @@
 from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from transformers import DefaultDataCollator


 @dataclass
@@ -31,26 +33,38 @@ class CustomDataTrainingArguments(DVCDatasetTrainingArguments):
         },
     )

-    text_column: Optional[str] = field(
+    text_column: str = field(
         default="text",
-        metadata={"help": "For custom datasets only. The text field key"},
+        metadata={
+            "help": (
+                "Optional key to be used as the `text` input to tokenizer/processor "
+                "after dataset preprocessing"
+            )
+        },
     )

     remove_columns: Union[None, str, List] = field(
         default=None,
-        metadata={"help": "Column names to remove after preprocessing custom datasets"},
+        metadata={"help": "Column names to remove after preprocessing (deprecated)"},
     )

     preprocessing_func: Union[None, str, Callable] = field(
         default=None,
         metadata={
             "help": (
-                "The preprocessing function to apply or the preprocessing func name in "
-                "src/llmcompressor/transformers/utils/preprocessing_functions.py"
+                "Typically a function which applies a chat template. Can take the form "
+                "of either a function to apply to the dataset, a name defined in "
+                "src/llmcompressor/transformers/utils/preprocessing_functions.py, or "
+                "a path to a function definition of the form /path/to/file.py:func"
             )
         },
     )

+    data_collator: Callable[[Any], Any] = field(
+        default_factory=lambda: DefaultDataCollator(),
+        metadata={"help": "The function used to form a batch from the dataset"},
+    )
+

 @dataclass
 class DataTrainingArguments(CustomDataTrainingArguments):
@@ -91,8 +105,8 @@ class DataTrainingArguments(CustomDataTrainingArguments):
             "help": "Whether or not to concatenate datapoints to fill max_seq_length"
         },
     )
-    raw_kwargs: Optional[Dict] = field(
-        default=None,
+    raw_kwargs: Dict = field(
+        default_factory=dict,
         metadata={"help": "Additional keyword args to pass to datasets load_data"},
     )
     splits: Union[None, str, List, Dict] = field(
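The new `data_collator` field defaults to `transformers.DefaultDataCollator`, and for processors that emit image features a custom collator can be passed instead. A sketch of what such a collator might look like; the feature keys (`input_ids`, `pixel_values`) and uniform shapes are assumptions, not guarantees from the commit:

```python
import torch

from llmcompressor.transformers import DataTrainingArguments


def vision_collator(features):
    # stack per-sample tensors into a batch; assumes every sample has the
    # same sequence length and image shape
    return {
        "input_ids": torch.tensor([f["input_ids"] for f in features]),
        "pixel_values": torch.stack(
            [torch.as_tensor(f["pixel_values"]) for f in features]
        ),
    }


data_args = DataTrainingArguments(dataset="flickr", data_collator=vision_collator)
```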

src/llmcompressor/transformers/finetune/data/evolcodealpaca.py

Lines changed: 17 additions & 33 deletions
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from copy import deepcopy
-from typing import Optional
+from typing import TYPE_CHECKING

 from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs


 @TextGenerationDataset.register(name="evolcodealpaca")
@@ -34,40 +38,20 @@ class EvolCodeAlpacaDataset(TextGenerationDataset):
         "\n\n### Response:\n"
     )

-    def __init__(self, data_args, split, processor):
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "theblackcat102/evol-codealpaca-v1"
-        super().__init__(
-            text_column="text", data_args=data_args, split=split, processor=processor
-        )
-
-    def get_raw_dataset(self, cache_dir: Optional[str] = None):
-        """
-        Load the raw dataset from Hugging Face, using cached copy if available.
-        Additionally reformats the entries to fit the alpaca template.
+        data_args.text_column = "text"

-        :param cache_dir: disk location to search for cached dataset
-        :return: the requested dataset
-        """
-        raw_dataset = super().get_raw_dataset(cache_dir=cache_dir)
+        super().__init__(data_args, split=split, processor=processor)

-        # helper fn for restructuring each dataset entry using the alpaca template
-        def restructure_fn(sample):
-            sample["text"] = self.EVOL_ALPACA_TEMPLATE.format(
-                instruction=sample["instruction"]
-            )
-            sample[self.PROMPT_KEY] = sample["text"]
-            if "output" in sample:
-                sample["text"] += sample["output"]
-            return sample
+    def dataset_template(self, sample):
+        prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"])
+        text = prompt
+        if "output" in sample:
+            text += sample["output"]

-        raw_dataset = self.map(
-            raw_dataset,
-            function=restructure_fn,
-            batched=False,
-            remove_columns=["output", "instruction"],
-            num_proc=self.data_args.preprocessing_num_workers,
-            load_from_cache_file=not self.data_args.overwrite_cache,
-            desc="Restructuring Evol Code Alpaca Dataset",
-        )
-        return raw_dataset
+        return {
+            "text": text,
+            self.PROMPT_KEY: prompt,
+        }
src/llmcompressor/transformers/finetune/data/flickr_30k.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+from copy import deepcopy
+from typing import TYPE_CHECKING
+
+from loguru import logger
+
+from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs
+
+
+@TextGenerationDataset.register(name="flickr", alias="flickr30k")
+class Flickr30K(TextGenerationDataset):
+    """
+    :param data_args: configuration settings for dataset loading
+    :param split: split from dataset to load, for instance `test` or `train[:5%]`
+    :param processor: processor or tokenizer to use on dataset
+    """
+
+    DEFAULT_CHAT_TEMPLATE = (
+        "{% for message in messages %}\n"
+        "{% if message['role'] == 'user' %}\n"
+        "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
+        "{% elif message['role'] == 'system' %}\n"
+        "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
+        "{% elif message['role'] == 'assistant' %}\n"
+        "{{ '<|assistant|>\n' + message['content'] + eos_token }}\n"
+        "{% endif %}\n"
+        "{% if loop.last and add_generation_prompt %}\n"
+        "{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+    )
+
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
+        data_args = deepcopy(data_args)
+        data_args.dataset = "lmms-lab/flickr30k"
+
+        super().__init__(data_args=data_args, split=split, processor=processor)
+
+        if (
+            self.tokenizer is not None
+            and getattr(self.tokenizer, "chat_template", None) is None
+        ):
+            # note that since tokenizer is a member of processor,
+            # this change affects processor.apply_chat_template
+            self.tokenizer.chat_template = self.DEFAULT_CHAT_TEMPLATE
+            logger.warning(
+                "tokenizer.chat_template is not set, using default chat template for "
+                f"{self.__class__.__name__}"
+            )
+
+    def dataset_template(self, sample):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What does the image show?"},
+                ],
+            }
+        ]
+        return {
+            "text": self.processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+            ),
+            "images": sample["image"],
+        }
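To exercise the new dataset, something like the following should work, patterned on llm-compressor's existing oneshot examples; the model id, recipe settings, and exact keyword arguments are assumptions and may differ by version:

```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

oneshot(
    model="Qwen/Qwen2-VL-2B-Instruct",  # any processor-backed vision LM
    dataset="flickr",  # resolves to Flickr30K through the registry above
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    max_seq_length=2048,
    num_calibration_samples=512,
)
```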
