add finetuning with streaming dataset example #945

Merged Feb 8, 2024 · 27 commits (changes shown from 19 commits)
3 changes: 3 additions & 0 deletions .github/workflows/code-quality.yaml
@@ -4,10 +4,13 @@ on:
branches:
- main
- release/**
# todo: remove this before merging
- add_finetuning_streaming_dataset_conversion
pull_request:
branches:
- main
- release/**
- add_finetuning_streaming_dataset_conversion
workflow_call:
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
2 changes: 1 addition & 1 deletion llmfoundry/data/finetuning/collator.py
@@ -106,7 +106,7 @@ def __init__(

def __call__(self, examples: List[Dict[str,
Any]]) -> Dict[str, torch.Tensor]:
- for check_key in ['input_ids', 'labels', 'attention_mask']:
+ for check_key in ['input_ids', 'labels']:
if check_key not in examples[0]:
raise KeyError(
f'Examples returned by dataset do not include required key: {check_key}'
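Context for the relaxed check: pre-tokenized streaming samples may carry only input_ids and labels. Before padding, an attention mask is just a vector of ones with one entry per token, so it can be reconstructed later if a sample omits it. A minimal illustration follows (a hypothetical helper, not code from this PR):

```python
from typing import Any, Dict


def ensure_attention_mask(example: Dict[str, Any]) -> Dict[str, Any]:
    """Hypothetical helper: for an unpadded example, the attention mask is
    simply a 1 for every token in input_ids, so it can be filled in if a
    pre-tokenized sample does not store it explicitly."""
    if 'attention_mask' not in example:
        example['attention_mask'] = [1] * len(example['input_ids'])
    return example
```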
1 change: 1 addition & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -152,6 +152,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
sampling_method=cfg.dataset.get('sampling_method', 'balanced'),
sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
batching_method=cfg.dataset.get('batching_method', 'random'),
max_seq_len=cfg.dataset.max_seq_len,
)

else:
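A hedged usage sketch of how this keyword reaches the streaming dataset: when the dataset config provides remote/local paths, build_finetuning_dataloader constructs a StreamingFinetuningDataset and now forwards max_seq_len to it. The paths, batch size, and the plain AutoTokenizer below are illustrative assumptions rather than values from this PR, and the function signature is taken from llm-foundry as of this era:

```python
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader

# Illustrative config; remote/local point at MDS shards of a finetuning dataset.
cfg = om.create({
    'name': 'finetuning',
    'dataset': {
        'remote': 's3://my-bucket/finetune-mds/',  # assumption: any MDS location
        'local': '/tmp/finetune-mds/',
        'split': 'train',
        'shuffle': True,
        'max_seq_len': 512,  # now forwarded to StreamingFinetuningDataset
        'decoder_only_format': True,
    },
    'drop_last': True,
    'num_workers': 2,
})

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

train_loader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size=8)
```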
11 changes: 11 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -42,6 +42,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:

import datasets as hf_datasets
import huggingface_hub as hf_hub
import numpy as np
from composer.utils import dist
from streaming import StreamingDataset
from transformers import PreTrainedTokenizerBase
@@ -332,6 +333,7 @@ def __init__(self,
sampling_method: str = 'balanced',
sampling_granularity: int = 1,
batching_method: str = 'random',
max_seq_len: int = 2048,
**kwargs: Any):

if len(kwargs) > 0:
@@ -371,10 +373,19 @@
)

self.tokenizer = tokenizer
self.max_seq_len = max_seq_len

# How to process a sample
def __getitem__(self, idx: int) -> Dict[str, Any]:
sample = super().__getitem__(idx)
if 'input_ids' in sample:
# already tokenized data
sample['input_ids'] = np.frombuffer(
sample['input_ids'],
dtype=np.int64)[:self.max_seq_len].tolist().copy()
sample['labels'] = np.frombuffer(sample['labels'],
dtype=np.int64).tolist().copy()
return sample
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)


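For orientation, here is a minimal sketch of the write side this __getitem__ branch expects: token IDs serialized as raw int64 bytes in MDS shards, matching the np.frombuffer(..., dtype=np.int64) decode above. This is not the PR's conversion script; the output path, tokenizer, and the prompt/response split (input_ids from the prompt, labels from the response, as tokenize_formatted_example would produce) are assumptions.

```python
import numpy as np
from streaming import MDSWriter
from transformers import AutoTokenizer

# Assumed on-disk layout: 'input_ids' holds the tokenized prompt and 'labels'
# the tokenized response, each stored as the raw bytes of an int64 array so
# that np.frombuffer(..., dtype=np.int64) in __getitem__ can decode them.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
columns = {'input_ids': 'bytes', 'labels': 'bytes'}
examples = [{'prompt': 'What color is the sky?', 'response': ' Blue.'}]

with MDSWriter(out='/tmp/example_data/train', columns=columns) as writer:
    for ex in examples:
        tokenized = tokenizer(text=ex['prompt'], text_target=ex['response'])
        writer.write({
            'input_ids': np.asarray(tokenized['input_ids'],
                                    dtype=np.int64).tobytes(),
            'labels': np.asarray(tokenized['labels'],
                                 dtype=np.int64).tobytes(),
        })
```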
77 additions & 0 deletions: new example streaming finetuning YAML
@@ -0,0 +1,77 @@
max_seq_len: 512
global_seed: 17

# Run Name
run_name: # If left blank, will be read from env var $RUN_NAME

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: gpt2
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights

# Tokenizer
tokenizer:
  name: gpt2
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    remote: /Users/ning.wang/projects/llm-foundry/scripts/data_prep/example_data/
    local: /tmp/
    split: train
    ############
    shuffle: true
    max_seq_len: ${max_seq_len}
    decoder_only_format: true
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${global_seed}
device_eval_batch_size: 8
device_train_microbatch_size: 8
# device_train_microbatch_size: auto
precision: fp32

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
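Assuming the standard llm-foundry workflow, a config like this would be launched with the composer launcher and scripts/train/train.py, passing this YAML as the argument and pointing remote/local at MDS shards such as those in the write-side sketch above; the script path is an assumption about the repository layout rather than something shown in this diff.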