add finetuning with streaming dataset example #945

Merged Feb 8, 2024 · 27 commits (changes shown from 19 commits)
3 changes: 3 additions & 0 deletions .github/workflows/code-quality.yaml
@@ -4,10 +4,13 @@ on:
branches:
- main
- release/**
# todo: remove this before merging
- add_finetuning_streaming_dataset_conversion
pull_request:
branches:
- main
- release/**
- add_finetuning_streaming_dataset_conversion
workflow_call:
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
2 changes: 1 addition & 1 deletion llmfoundry/data/finetuning/collator.py
@@ -106,7 +106,7 @@ def __init__(

def __call__(self, examples: List[Dict[str,
Any]]) -> Dict[str, torch.Tensor]:
- for check_key in ['input_ids', 'labels', 'attention_mask']:
+ for check_key in ['input_ids', 'labels']:
if check_key not in examples[0]:
raise KeyError(
f'Examples returned by dataset do not include required key: {check_key}'
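Context for the relaxed check: pre-tokenized streaming samples may carry only input_ids and labels. Before padding, an attention mask is just a vector of ones with one entry per token, so it can be reconstructed later if a sample omits it. A minimal illustration follows (a hypothetical helper, not code from this PR):

```python
from typing import Any, Dict


def ensure_attention_mask(example: Dict[str, Any]) -> Dict[str, Any]:
    """Hypothetical helper: for an unpadded example, the attention mask is
    simply a 1 for every token in input_ids, so it can be filled in if a
    pre-tokenized sample does not store it explicitly."""
    if 'attention_mask' not in example:
        example['attention_mask'] = [1] * len(example['input_ids'])
    return example
```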
1 change: 1 addition & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -152,6 +152,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
sampling_method=cfg.dataset.get('sampling_method', 'balanced'),
sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
batching_method=cfg.dataset.get('batching_method', 'random'),
max_seq_len=cfg.dataset.max_seq_len,
)

else:
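A hedged usage sketch of how this keyword reaches the streaming dataset: when the dataset config provides remote/local paths, build_finetuning_dataloader constructs a StreamingFinetuningDataset and now forwards max_seq_len to it. The paths, batch size, and the plain AutoTokenizer below are illustrative assumptions rather than values from this PR, and the function signature is taken from llm-foundry as of this era:

```python
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader

# Illustrative config; remote/local point at MDS shards of a finetuning dataset.
cfg = om.create({
    'name': 'finetuning',
    'dataset': {
        'remote': 's3://my-bucket/finetune-mds/',  # assumption: any MDS location
        'local': '/tmp/finetune-mds/',
        'split': 'train',
        'shuffle': True,
        'max_seq_len': 512,  # now forwarded to StreamingFinetuningDataset
        'decoder_only_format': True,
    },
    'drop_last': True,
    'num_workers': 2,
})

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

train_loader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size=8)
```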
11 changes: 11 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -42,6 +42,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:

import datasets as hf_datasets
import huggingface_hub as hf_hub
import numpy as np
from composer.utils import dist
from streaming import StreamingDataset
from transformers import PreTrainedTokenizerBase
@@ -332,6 +333,7 @@ def __init__(self,
sampling_method: str = 'balanced',
sampling_granularity: int = 1,
batching_method: str = 'random',
max_seq_len: int = 2048,
**kwargs: Any):

if len(kwargs) > 0:
@@ -371,10 +373,19 @@
)

self.tokenizer = tokenizer
self.max_seq_len = max_seq_len

# How to process a sample
def __getitem__(self, idx: int) -> Dict[str, Any]:
sample = super().__getitem__(idx)
if 'input_ids' in sample:
# already tokenized data
sample['input_ids'] = np.frombuffer(
sample['input_ids'],
dtype=np.int64)[:self.max_seq_len].tolist().copy()
sample['labels'] = np.frombuffer(sample['labels'],
dtype=np.int64).tolist().copy()
return sample
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)


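For orientation, here is a minimal sketch of the write side this __getitem__ branch expects: token IDs serialized as raw int64 bytes in MDS shards, matching the np.frombuffer(..., dtype=np.int64) decode above. This is not the PR's conversion script; the output path, tokenizer, and the prompt/response split (input_ids from the prompt, labels from the response, as tokenize_formatted_example would produce) are assumptions.

```python
import numpy as np
from streaming import MDSWriter
from transformers import AutoTokenizer

# Assumed on-disk layout: 'input_ids' holds the tokenized prompt and 'labels'
# the tokenized response, each stored as the raw bytes of an int64 array so
# that np.frombuffer(..., dtype=np.int64) in __getitem__ can decode them.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
columns = {'input_ids': 'bytes', 'labels': 'bytes'}
examples = [{'prompt': 'What color is the sky?', 'response': ' Blue.'}]

with MDSWriter(out='/tmp/example_data/train', columns=columns) as writer:
    for ex in examples:
        tokenized = tokenizer(text=ex['prompt'], text_target=ex['response'])
        writer.write({
            'input_ids': np.asarray(tokenized['input_ids'],
                                    dtype=np.int64).tobytes(),
            'labels': np.asarray(tokenized['labels'],
                                 dtype=np.int64).tobytes(),
        })
```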
77 additions & 0 deletions: new example streaming finetuning YAML
@@ -0,0 +1,77 @@
max_seq_len: 512
global_seed: 17

# Run Name
run_name: # If left blank, will be read from env var $RUN_NAME

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: gpt2
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights

# Tokenizer
tokenizer:
  name: gpt2
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    remote: /Users/ning.wang/projects/llm-foundry/scripts/data_prep/example_data/
    local: /tmp/
    split: train
    ############
    shuffle: true
    max_seq_len: ${max_seq_len}
    decoder_only_format: true
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${global_seed}
device_eval_batch_size: 8
device_train_microbatch_size: 8
# device_train_microbatch_size: auto
precision: fp32

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
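Assuming the standard llm-foundry workflow, a config like this would be launched with the composer launcher and scripts/train/train.py, passing this YAML as the argument and pointing remote/local at MDS shards such as those in the write-side sketch above; the script path is an assumption about the repository layout rather than something shown in this diff.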