Conversation

@undfined (Collaborator) commented on May 15, 2025

```python
ModelTrainConfig(
    model=TransformerConfig(
        d_model=4096,
        vocab_size=100352,
        n_layers=32,
        block=TransformerBlockConfig(
            attention=AttentionConfig(
                name='default',
                n_heads=32,
                n_kv_heads=None,
                bias=False,
                rope=RoPEConfig(name='default', theta=500000, full_precision=True, scaling=None),
                clip_qkv=None,
                qk_norm=LayerNormConfig(name='rms', eps=1e-06, elementwise_affine=None, bias=False, full_precision=None, dtype='float32'),
                dropout=None,
                use_flash=False,
                dtype='float32',
                sliding_window=None
            ),
            layer_norm=LayerNormConfig(name='rms', eps=1e-06, elementwise_affine=None, bias=False, full_precision=None, dtype='float32'),
            feed_forward=FeedForwardConfig(hidden_size=11008, name='default', bias=False, dtype='float32'),
            feed_forward_moe=None,
            name='reordered_norm',
            dropout=None
        ),
        lm_head=LMHeadConfig(
            name='default',
            layer_norm=LayerNormConfig(name='rms', eps=1e-06, elementwise_affine=None, bias=False, full_precision=None, dtype='float32'),
            bias=False,
            dtype='float32',
            loss_implementation='default'
        ),
        name='default',
        dtype='float32',
        init_method='normal',
        init_seed=0,
        init_std=0.02,
        freeze_params=None,
        block_overrides=None
    ),
    optim=SkipStepAdamWConfig(
        group_overrides=[OptimGroupOverride(params=['embeddings.weight'], opts={'weight_decay': 0.0})],
        compile=True,
        fixed_fields=('initial_lr',),
        lr=0.00016,
        betas=(0.9, 0.95),
        eps=1e-08,
        weight_decay=0.1,
        rolling_interval_length=128,
        sigma_factor=6,
        dtype=None
    ),
    dataset=NumpyDatasetConfig(
        tokenizer=TokenizerConfig(vocab_size=100278, eos_token_id=100257, pad_token_id=100277, bos_token_id=None, identifier='allenai/dolma2-tokenizer'),
        name='fsl',
        source_mixture_config=None,
        sequence_length=4096,
        max_target_sequence_length=8192,
        max_sequence_length=None,
        min_sequence_length=None,
        vsl_curriculum=None,
        paths=['weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy'],
        mix=None,
        mix_base_dir='/weka/oe-training-default/ai2-llm',
        dtype=None,
        metadata=None,
        include_instance_metadata=True,
        generate_doc_lengths=False,
        docs_per_instance=None,
        chunks_per_doc=None,
        seed=None,
        interleaving_exempt_paths=None,
        expand_glob=False,
        work_dir='/weka/oe-training-default/ai2-llm/ai2-tylerm/OLMo3-7b-2xC-dolma2/dataset-cache',
        instance_filter_config=None,
        label_mask_paths=None,
        long_doc_strategy=None
    ),
    data_loader=NumpyDataLoaderConfig(
        global_batch_size=4194304,
        seed=1337,
        work_dir='/weka/oe-training-default/ai2-llm/ai2-tylerm/OLMo3-7b-2xC-dolma2/dataset-cache',
        num_threads=None,
        num_workers=12,
        prefetch_factor=None,
        target_device_type=None
    ),
    trainer=TrainerConfig(
        save_folder='/weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/OLMo3-7b-2xC-dolma2',
        work_dir='/weka/oe-training-default/ai2-llm/ai2-tylerm/OLMo3-7b-2xC-dolma2/dataset-cache',
        load_path=None,
        load_strategy='if_available',
        checkpointer=CheckpointerConfig(work_dir=None, save_overwrite=None, pre_download=False, save_thread_count=None, load_thread_count=None, throttle_uploads=False),
        device=None,
        save_overwrite=True,
        max_duration=Duration(value=275503022080, unit='tokens'),
        cancel_check_interval=5,
        hard_stop=None,
        metrics_collect_interval=10,
        callbacks={
            'checkpointer': CheckpointerCallback(
                save_interval=1000,
                ephemeral_save_interval=100,
                pre_train_checkpoint=None,
                save_async=True,
                remove='ephemeral_only',
                enabled=True,
                _latest_checkpoint_step=-1,
                _latest_checkpoint_path='',
                _checkpoints=[],
                _ephemeral_checkpoints=[],
                _checkpoints_to_remove=[]
            ),
            'config_saver': ConfigSaverCallback(fname='config.json', save_data_paths=None, data_paths_fname=None, _config=None),
            'profiler': ProfilerCallback(skip_first=0, wait=1, warmup=5, active=3, repeat=1, enabled=False, _first_batch=True),
            'garbage_collector': GarbageCollectorCallback(gc_interval=1000, enabled=True, _start_state=None),
            'beaker': BeakerCallback(experiment_id=None, update_interval=None, description=None, enabled=None, config=None, result_dir='/results', _last_update=None),
            'batchwup': BatchSizeSchedulerCallback(
                batch_sizes=[4194304, 8388608, 16777216],
                schedule=[Duration(value=0, unit='tokens'), Duration(value=90915997286, unit='tokens'), Duration(value=275503022080, unit='tokens')]
            ),
            'wandb': WandBCallback(
                enabled=True,
                name='OLMo3-7b-2xC-dolma2',
                project='OLMo3-kitchen',
                entity=None,
                group='2f30212d',
                tags=None,
                notes=None,
                config=None,
                cancel_tags=['cancel', 'canceled', 'cancelled'],
                cancel_check_interval=10
            )
        },
        async_bookkeeping=None,
        no_checkpoints=False,
        no_evals=False
    ),
    train_module=TransformerTrainModuleConfig(
        rank_microbatch_size=8192,
        max_sequence_length=4096,
        optim=SkipStepAdamWConfig(
            group_overrides=[OptimGroupOverride(params=['embeddings.weight'], opts={'weight_decay': 0.0})],
            compile=True,
            fixed_fields=('initial_lr',),
            lr=0.00016,
            betas=(0.9, 0.95),
            eps=1e-08,
            weight_decay=0.1,
            rolling_interval_length=128,
            sigma_factor=6,
            dtype=None
        ),
        max_grad_norm=1.0,
        scheduler=WSD(
            lr_field='lr',
            initial_lr_field='initial_lr',
            units='tokens',
            warmup=8388608000,
            warmup_steps=None,
            warmup_fraction=None,
            decay=50000000000,
            decay_steps=None,
            decay_fraction=None,
            warmup_min_lr=0.0,
            decay_min_lr=0.0
        ),
        compile_model=True,
        float8_config=Float8Config(ao=None, ao_recipe=None, enabled=False),
        pp_config=None,
        dp_config=TransformerDataParallelConfig(
            name='hsdp',
            param_dtype='bfloat16',
            reduce_dtype='float32',
            num_replicas=None,
            shard_degree=None,
            wrapping_strategy='full',
            prefetch_factor=0
        ),
        tp_config=None,
        cp_config=None,
        ep_config=None,
        ac_config=None,
        z_loss_multiplier=1e-05,
        state_dict_save_opts=None,
        state_dict_load_opts=None,
        load_key_mapping=None,
        autocast_precision=None,
        label_ignore_index=-100
    ),
    init_seed=1337
)
```
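For context, the token budget lines up with the "2xC" in the run name. Under common assumptions for this kind of architecture (a gated SwiGLU-style feed-forward with three weight matrices, RMSNorm scale vectors, and tied input/output embeddings, none of which the dump states explicitly), the model comes out to about 6.89B parameters, and `max_duration` is then exactly 40 tokens per parameter, i.e. 2x the ~20 tokens/parameter Chinchilla rule of thumb. A quick sketch of that arithmetic:

```python
# Back-of-the-envelope parameter count for the TransformerConfig above.
# Assumptions (not stated in the dump): gated SwiGLU-style FFN with three
# weight matrices, RMSNorm scale vectors, tied input/output embeddings.
# Under these assumptions the token budget is exactly 40 tokens per parameter.

D_MODEL = 4096
VOCAB = 100_352
N_LAYERS = 32
FFN_HIDDEN = 11_008
MAX_TOKENS = 275_503_022_080

embeddings = VOCAB * D_MODEL                # tied with the LM head (assumed)
attention = 4 * D_MODEL * D_MODEL           # q, k, v, o projections, no bias
qk_norm = 2 * D_MODEL                       # RMSNorm scales on q and k
ffn = 3 * D_MODEL * FFN_HIDDEN              # gated FFN: up, gate, down (assumed)
block_norms = 2 * D_MODEL                   # pre-attention / pre-FFN RMSNorms
per_layer = attention + qk_norm + ffn + block_norms
total = embeddings + N_LAYERS * per_layer + D_MODEL  # + final LM-head RMSNorm

print(f"total params: {total:,}")                     # 6,887,575,552
print(f"tokens per param: {MAX_TOKENS / total:.1f}")  # 40.0
```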
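The data loader starts at a global batch size of 4,194,304 tokens (1,024 sequences of 4,096 tokens), and the `batchwup` callback doubles it at the listed token thresholds. A rough sketch of the resulting step counts, assuming the switch happens exactly at each threshold (the dump does not spell out `BatchSizeSchedulerCallback`'s exact semantics). Note that the last threshold coincides with `max_duration`, so the 16,777,216-token batch size would only take effect at the very end of the run:

```python
# Rough step-count arithmetic for the batch-size schedule above. Assumes the
# global batch size switches exactly at the listed token thresholds; that is an
# assumption about BatchSizeSchedulerCallback, not something the dump states.

SEQ_LEN = 4096
MAX_TOKENS = 275_503_022_080
BATCH_SIZES = [4_194_304, 8_388_608, 16_777_216]   # tokens per global batch
THRESHOLDS = [0, 90_915_997_286, 275_503_022_080]  # tokens seen when each applies

def approx_total_steps() -> int:
    steps = 0
    for i, batch_tokens in enumerate(BATCH_SIZES):
        start = THRESHOLDS[i]
        end = THRESHOLDS[i + 1] if i + 1 < len(THRESHOLDS) else MAX_TOKENS
        end = min(end, MAX_TOKENS)
        if end > start:
            steps += (end - start) // batch_tokens
    return steps

for batch_tokens in BATCH_SIZES:
    print(f"{batch_tokens:>11,} tokens/batch = {batch_tokens // SEQ_LEN:,} sequences of {SEQ_LEN}")
print(f"~{approx_total_steps():,} optimizer steps to reach {MAX_TOKENS:,} tokens")
```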
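The learning-rate schedule is WSD (warmup-stable-decay) measured in tokens: warm up over 8,388,608,000 tokens (exactly 2,000 steps at the initial batch size) to the peak lr of 1.6e-4, hold, then decay to 0 over 50,000,000,000 tokens. A minimal sketch of that shape, assuming linear warmup and linear decay and that the decay window covers the last 50B tokens of the run (the dump gives only the boundary values, not the interpolation olmo-core's WSD class actually uses):

```python
# Minimal sketch of the WSD (warmup-stable-decay) shape implied by the scheduler
# config above. Linear warmup, linear decay, and a decay window covering the
# final DECAY_TOKENS of training are assumptions, not taken from the dump.

PEAK_LR = 0.00016
WARMUP_TOKENS = 8_388_608_000
DECAY_TOKENS = 50_000_000_000
MAX_TOKENS = 275_503_022_080
WARMUP_MIN_LR = 0.0
DECAY_MIN_LR = 0.0

def wsd_lr(tokens_seen: int) -> float:
    if tokens_seen < WARMUP_TOKENS:                      # warmup phase
        frac = tokens_seen / WARMUP_TOKENS
        return WARMUP_MIN_LR + frac * (PEAK_LR - WARMUP_MIN_LR)
    decay_start = MAX_TOKENS - DECAY_TOKENS
    if tokens_seen < decay_start:                        # stable phase
        return PEAK_LR
    frac = min(1.0, (tokens_seen - decay_start) / DECAY_TOKENS)
    return PEAK_LR + frac * (DECAY_MIN_LR - PEAK_LR)     # decay phase

for t in (0, 4_000_000_000, 8_388_608_000, 100_000_000_000,
          230_000_000_000, 275_503_022_080):
    print(f"{t:>16,} tokens -> lr = {wsd_lr(t):.6g}")
```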
