Merged

Commits (69)
- `cd9ab1c` nemo ilql heads (cat-state, Dec 1, 2022)
- `102a74c` nemo (cat-state, Dec 1, 2022)
- `5cc6fb7` model loads (cat-state, Dec 2, 2022)
- `f451099` contiguous error (cat-state, Dec 4, 2022)
- `8f74179` it trains (cat-state, Dec 5, 2022)
- `d5dfd6b` dbg (cat-state, Dec 7, 2022)
- `cf7d873` dbg (cat-state, Dec 8, 2022)
- `689f558` it works somewhat (cat-state, Dec 8, 2022)
- `314a0da` runs but hangs (cat-state, Dec 15, 2022)
- `d1a64fb` OOM debug (cat-state, Dec 16, 2022)
- `65413c1` debug off-by-one in split idxes (cat-state, Jan 7, 2023)
- `1391a9a` ILQL on only one rank (cat-state, Jan 9, 2023)
- `afe4d71` Merge remote-tracking branch 'origin' into nemo-integrate (cat-state, Jan 9, 2023)
- `7f8a849` nemo readd (cat-state, Jan 9, 2023)
- `d050cbf` ilql generate (cat-state, Jan 9, 2023)
- `5edaf8f` ilql test inference (cat-state, Jan 9, 2023)
- `8f01942` generate hang (cat-state, Jan 9, 2023)
- `7cd68f3` dbg (cat-state, Jan 9, 2023)
- `c01ba71` add resume from ckpt (cat-state, Jan 13, 2023)
- `aedb7d7` feat(examples): add hh (maxreciprocate, Jan 14, 2023)
- `0701714` style(*_hh): satisfy isort (maxreciprocate, Jan 14, 2023)
- `b86867a` feat(hh): add more `stop_sequences` (maxreciprocate, Jan 14, 2023)
- `d3cea9a` ilql optimize index select (cat-state, Jan 15, 2023)
- `4aef72f` ilql gpt (cat-state, Jan 15, 2023)
- `396319a` Merge remote-tracking branch 'origin' into nemo-integrate (cat-state, Jan 15, 2023)
- `ff3d697` eval (cat-state, Jan 16, 2023)
- `f9c6b84` reduce dupe fns (cat-state, Jan 16, 2023)
- `a3a69b9` add config loading (cat-state, Jan 16, 2023)
- `7bc3dab` unused imports (cat-state, Jan 16, 2023)
- `65d96ba` update config (cat-state, Jan 16, 2023)
- `dd7499b` validation seq parallel works (cat-state, Jan 16, 2023)
- `c5aadf5` seperate nemo example (cat-state, Jan 16, 2023)
- `ace3f6a` remove unused (cat-state, Jan 17, 2023)
- `5d62965` add support for activation checkpointing (cat-state, Jan 19, 2023)
- `b11c9e9` relative file import examples (cat-state, Jan 19, 2023)
- `0cdb333` Merge branch 'main' of github.com:CarperAI/trlx into nemo-integrate (cat-state, Jan 19, 2023)
- `99673c9` hh (cat-state, Jan 20, 2023)
- `251247c` Merge remote-tracking branch 'origin/add-hh-example' into nemo-hh (cat-state, Jan 20, 2023)
- `b8813b4` hh configs (cat-state, Jan 20, 2023)
- `1d56f5f` sync (cat-state, Jan 21, 2023)
- `49d0edc` sync (cat-state, Jan 21, 2023)
- `c50c729` port fixes from hh branch (cat-state, Jan 22, 2023)
- `fbd1c25` PR fixes (cat-state, Jan 22, 2023)
- `d1addda` unused imports remove (cat-state, Jan 22, 2023)
- `65a25f2` change validation batch sampler (cat-state, Jan 24, 2023)
- `c900e94` Tidy up eval (cat-state, Jan 24, 2023)
- `0daa1cb` update readme for r1.14 (cat-state, Jan 25, 2023)
- `4ab9776` Add batched_index_select back after merge (cat-state, Jan 30, 2023)
- `7b30d17` improve docstr (cat-state, Jan 30, 2023)
- `376a1d5` try custom generate (cat-state, Jan 31, 2023)
- `8029266` jon's fix for dataloader crash (cat-state, Jan 31, 2023)
- `cbe17a2` fix bug with mutating nemo padding (cat-state, Feb 1, 2023)
- `ab2483c` log all metrics (cat-state, Feb 1, 2023)
- `00e060d` move metrics and fixes (cat-state, Feb 2, 2023)
- `fc80e91` qa (cat-state, Feb 2, 2023)
- `3850a7e` fmt (cat-state, Feb 2, 2023)
- `c75a370` typo (cat-state, Feb 2, 2023)
- `92f09a9` change save metric to not have slash (cat-state, Feb 3, 2023)
- `885dbf6` add inference script (cat-state, Feb 3, 2023)
- `6830e1a` find checkpoints (cat-state, Feb 3, 2023)
- `4c5057e` inference script (cat-state, Feb 3, 2023)
- `66a7f73` update readme (cat-state, Feb 3, 2023)
- `6a55597` Merge branch 'main' of github.com:CarperAI/trlx into nemo-integrate (cat-state, Feb 3, 2023)
- `aa62e47` fmt (cat-state, Feb 3, 2023)
- `42c8d7f` fmt (cat-state, Feb 3, 2023)
- `1025086` remove check (cat-state, Feb 3, 2023)
- `c96e434` update readme (cat-state, Feb 3, 2023)
- `b40091a` fix nits (cat-state, Feb 3, 2023)
- `d58278b` update trainer comment (cat-state, Feb 3, 2023)
10 changes: 10 additions & 0 deletions README.md
@@ -58,6 +58,16 @@ accelerate config # choose DeepSpeed option
accelerate launch examples/simulacra.py
```

#### Use NeMo-Megatron to launch distributed training

Follow the setup instructions in the [NeMo README](./trlx/trainer/nemo).

```bash
python examples/nemo_ilql_sentiments.py
```

For more usage, see the [NeMo README](./trlx/trainer/nemo).
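
The launched example itself is thin: it loads the NeMo trlx config and hands offline samples with sentiment scores to `trlx.train`; the NeMo trainer is selected purely through the config file. A minimal sketch, assuming the trlx API at the time of this PR (the dataset slice, eval prompt, and reward helper below are illustrative, not the exact script):

```python
# Simplified sketch of examples/nemo_ilql_sentiments.py
from datasets import load_dataset
from transformers import pipeline

import trlx
from trlx.data.configs import TRLConfig

# trainer: "NeMoILQLTrainer" in configs/nemo_ilql_config.yml picks the NeMo backend
config = TRLConfig.load_yaml("configs/nemo_ilql_config.yml")

sentiment_fn = pipeline("sentiment-analysis", "lvwerra/distilbert-imdb", top_k=2, truncation=True)

def positive_scores(texts):
    # top_k=2 returns both class scores per text; keep POSITIVE as the offline reward
    return [
        next(x["score"] for x in out if x["label"] == "POSITIVE")
        for out in sentiment_fn(texts)
    ]

imdb = load_dataset("imdb", split="train").select(range(1024))  # shortened for the sketch
samples = imdb["text"]

trlx.train(
    samples=samples,
    rewards=positive_scores(samples),
    eval_prompts=["The film was"] * 64,
    config=config,
)
```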

#### Use Ray Tune to launch hyperparameter sweep
```bash
python -m trlx.sweep --config configs/sweeps/ppo_sweep.yml examples/ppo_sentiments.py
144 changes: 144 additions & 0 deletions configs/nemo_configs/megatron_20b.yaml
@@ -0,0 +1,144 @@
name: megatron_gpt
restore_from_path: null # used when starting from a .nemo file

trainer:
devices: 8
num_nodes: 4
accelerator: gpu
precision: 16
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 200 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 1
val_check_interval: 20
# check_val_every_n_epoch: null
limit_val_batches: 2
limit_test_batches: 0
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0
benchmark: False

exp_manager:
# set this to save checkpoints
explicit_log_dir: ilql_sentiments_logs
exp_dir: null
name: megatron_gpt_20b_ilql_sentiments
create_tensorboard_logger: False
create_wandb_logger: True
wandb_logger_kwargs:
project: trlxnemo
name: megatron_gpt_20b_ilql_sentiments
resume_if_exists: False
resume_ignore_no_checkpoint: True
# set this to save checkpoints
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: reduced_train_loss
save_top_k: 1
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt-{reduced_train_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
log_step_timing: True
step_timing_kwargs:
sync_cuda: True
buffer_size: 5

model:
micro_batch_size: 4
global_batch_size: 512
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 1
resume_from_checkpoint: null # manually set the checkpoint file to load from
# model architecture
encoder_seq_length: 1024
max_position_embeddings: 2048
num_layers: 44
hidden_size: 6144
ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
num_attention_heads: 48
init_method_std: 0.007 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs


## Activation Checkpointing
activations_checkpoint_granularity: 'selective' # 'selective' or 'full'
activations_checkpoint_method: 'uniform' # 'uniform', 'block', not used with 'selective'
activations_checkpoint_num_layers: null # not used with 'selective'

## Sequence Parallelism
sequence_parallel: True

tokenizer:
library: 'megatron'
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# Megatron O2-style half-precision
# TODO: this causes hangs for some reason
megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
grad_allreduce_chunk_size_mb: 125
sync_batch_comm: False
# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

data:
data_prefix:
- dataset: hh
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
seq_length: ${model.encoder_seq_length}
skip_warmup: True
num_workers: 2
dataloader_type: cyclic
reset_position_ids: False # Reset position ids after end-of-document token
reset_attention_mask: False # Reset attention mask after end-of-document token
eod_mask_loss: False # Mask loss for the end of document tokens

# Nsys profiling options
nsys_profile:
enabled: False
start_step: 10 # Global batch to start profiling
end_step: 10 # Global batch to end profiling
ranks: [0, 4, 8, 12] # Global rank IDs to profile
gen_shape: False # Generate model and kernel details including input shapes

optim:
name: distributed_fused_adam
lr: 5.0e-5
weight_decay: 1.0e-6
betas:
- 0.9
- 0.95
sched:
name: CosineAnnealing
max_steps: 200
min_lr: 5.0e-5
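
A note on the `${multiply:...}` entries in this config: `multiply` is not a built-in OmegaConf resolver, so whatever loads the file is assumed to register one before resolving the interpolations. A small sketch of how the derived values come out (the resolver registration here mirrors that assumption rather than quoting the trainer code):

```python
# Sketch: resolving the ${multiply:...} interpolations in megatron_20b.yaml
from omegaconf import OmegaConf

# Assumed resolver; OmegaConf itself does not ship a "multiply"
OmegaConf.register_new_resolver("multiply", lambda a, b: a * b, replace=True)

cfg = OmegaConf.load("configs/nemo_configs/megatron_20b.yaml")

# ffn_hidden_size = 4 * hidden_size = 4 * 6144
print(cfg.model.ffn_hidden_size)  # 24576

# model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size = 4 * 1
print(cfg.exp_manager.checkpoint_callback_params.model_parallel_size)  # 4
```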
150 changes: 150 additions & 0 deletions configs/nemo_configs/megatron_65b.yaml
@@ -0,0 +1,150 @@
name: megatron_gpt
restore_from_path: null # used when starting from a .nemo file

trainer:
devices: 8
num_nodes: 16
accelerator: gpu
precision: bf16
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 1
val_check_interval: 10
limit_val_batches: 0.0
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0
benchmark: False

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_gpt_70b
create_wandb_logger: True
wandb_logger_kwargs:
project: trlx
name: ilql_sentiments_70b
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: False
checkpoint_callback_params:
monitor: val_loss
save_top_k: 1
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
log_step_timing: True
step_timing_kwargs:
sync_cuda: True
buffer_size: 5

model:
micro_batch_size: 8
global_batch_size: 128 #2048
tensor_model_parallel_size: 8
pipeline_model_parallel_size: 4 #2
resume_from_checkpoint: null # manually set the checkpoint file to load from

# model architecture
encoder_seq_length: 2048
max_position_embeddings: 2048
num_layers: 80
hidden_size: 8192
ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
num_attention_heads: 128
init_method_std: 0.007 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs

sync_batch_comm: True

## Activation Checkpointing
activations_checkpoint_granularity: 'selective' # 'selective' or 'full'
activations_checkpoint_method: 'uniform' # 'block' # 'uniform', 'block', not used with 'selective'
activations_checkpoint_num_layers: 1 # 2 # not used with 'selective'

## Sequence Parallelism
sequence_parallel: True

tokenizer:
library: 'megatron'
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
grad_allreduce_chunk_size_mb: 125

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
# Or see example below:
# data_prefix:
# - .5
# - /raid/data/pile/my-gpt3_00_text_document
# - .5
# - /raid/data/pile/my-gpt3_01_text_document
data_prefix:
ignored: ignored
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
seq_length: ${model.encoder_seq_length}
skip_warmup: True
num_workers: 2
dataloader_type: single # cyclic
reset_position_ids: False # Reset position ids after end-of-document token
reset_attention_mask: False # Reset attention mask after end-of-document token
eod_mask_loss: False # Mask loss for the end of document tokens

# Nsys profiling options
nsys_profile:
enabled: False
start_step: 10 # Global batch to start profiling
end_step: 10 # Global batch to end profiling
ranks: [0, 4, 8, 12] # Global rank IDs to profile
gen_shape: False # Generate model and kernel details including input shapes

optim:
name: distributed_fused_adam
lr: 1.1e-4
weight_decay: 0.1
betas:
- 0.9
- 0.95
sched:
name: CosineAnnealing
warmup_steps: 115
constant_steps: 12500
min_lr: 1.1e-5
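
For the 65B setup above, the parallelism arithmetic determines how the 128 GPUs are split. A short sketch with the values copied from this config (the decomposition itself is done internally by NeMo/Megatron; this only reproduces the numbers):

```python
# Sketch: parallelism and batch arithmetic implied by megatron_65b.yaml
devices, num_nodes = 8, 16                # trainer.devices, trainer.num_nodes
tensor_mp, pipeline_mp = 8, 4             # model.{tensor,pipeline}_model_parallel_size
micro_bs, global_bs = 8, 128              # model.{micro,global}_batch_size

world_size = devices * num_nodes          # 128 GPUs in total
model_parallel = tensor_mp * pipeline_mp  # 32 GPUs hold one model replica
data_parallel = world_size // model_parallel             # 4 replicas
microbatches = global_bs // (micro_bs * data_parallel)   # 4 microbatches per optimizer step

assert world_size % model_parallel == 0
assert global_bs % (micro_bs * data_parallel) == 0
print(world_size, model_parallel, data_parallel, microbatches)  # 128 32 4 4
```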
53 changes: 53 additions & 0 deletions configs/nemo_ilql_config.yml
@@ -0,0 +1,53 @@
train:
seq_length: 1024
batch_size: 512
epochs: 100
total_steps: 200
checkpoint_interval: 200
eval_interval: 20

pipeline: "PromptPipeline"
orchestrator: "OfflineOrchestrator"
trainer: "NeMoILQLTrainer"
trainer_kwargs:
pretrained_model: "/mnt/nvme/home/uwu/nemo-megatron-gpt-20B/"
megatron_cfg: "megatron_20b.yaml"
seed: 1000

model:
model_path: "gpt2"
num_layers_unfrozen: -1

tokenizer:
tokenizer_path: "gpt2"
truncation_side: "right"

optimizer:
name: "adamw"
kwargs:
lr: 5.0e-5
betas: [0.9, 0.95]
eps: 1.0e-8
weight_decay: 1.0e-6

scheduler:
name: "cosine_annealing"
kwargs:
T_max: 2000 # train.total_steps
eta_min: 1.0e-6

method:
name: "ilqlconfig"
tau: 0.7
gamma: 0.99
cql_scale: 0.1
awac_scale: 1
alpha: 0.001
beta: 0
steps_for_target_q_sync: 5
two_qs: True
gen_kwargs:
max_new_tokens: 56
top_k: 20
beta: 2
temperature: 0.9
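
The `pretrained_model` path in this config is machine specific, so in practice it gets overridden rather than edited in place. A minimal sketch, assuming `trainer_kwargs` remains a plain dict on the loaded `TRLConfig` (as in this file) and that `megatron_cfg` is resolved under `configs/nemo_configs/`:

```python
# Sketch: overriding the machine-specific trainer_kwargs before training
from trlx.data.configs import TRLConfig

config = TRLConfig.load_yaml("configs/nemo_ilql_config.yml")

# Point at a local .nemo checkpoint directory and pick the matching megatron config
config.train.trainer_kwargs["pretrained_model"] = "/path/to/nemo-megatron-gpt-20B/"
config.train.trainer_kwargs["megatron_cfg"] = "megatron_20b.yaml"  # assumed to resolve under configs/nemo_configs/

# config is then passed to trlx.train(..., config=config) as in the example script above
```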