
Commit 783b9cb

Merge branch 'main' into fix/typo-in-readme
2 parents: cfe4a3c + 6627eef

13 files changed: +201161, -50023 lines


CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -12,10 +12,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add GSM8K to in-loop evals (BPB over correct continuation)
 - Support for specifying custom dataset objects in the `data` section of the config file.
 - Added OLMo2-0425-1B configs for public usage.
-
+- Added a .csv file of olmo-mix1124 csvgz files.
 ### Fixed
 
 - Changed a Union definition to be compatible with Python 3.9
+- Changed hf_olmo conversion to use backwards-compatible logic via `OLMo.from_checkpoint`.
+- fix save_overwrite pass
 
 
 ## [v0.6.1](https://github.com/allenai/OLMo/releases/tag/v0.6.1) - 2025-01-22

README.md

Lines changed: 2 additions & 1 deletion
@@ -104,6 +104,7 @@ Stage 1 is the biggest stage, where we train on 4T or 5T tokens on largely web-b
 | Training config | [OLMo2-1B-stage1.yaml](configs/official-0425/OLMo2-1B-stage1.yaml) |[OLMo2-7B-stage1.yaml](configs/official-1124/OLMo2-7B-stage1.yaml) | [OLMo2-13B-stage1.yaml](configs/official-1124/OLMo2-13B-stage1.yaml) | |
 | WandB | [wandb.ai/OLMo2-1B](https://api.wandb.ai/links/ai2-llm/izdtrtu0)|[wandb.ai/OLMo2-7B](https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA) | [wandb.ai/OLMo2-13B](https://wandb.ai/ai2-llm/OLMo-2-1124-13B/reports/OLMo-2-13B-Nov-2024--VmlldzoxMDUzMjQxNg) |
 
+You can find the .csv.gz files containing the training data [here](configs/official-1124/provenance.csv).
 
 ### Stage 2 for the 1B
 
@@ -142,7 +143,7 @@ on 300B high quality tokens. Then we average ("soup") the models.
 | random seed 2662, 300B | [stage2-ingredient4-step11931-tokens300B](https://huggingface.co/allenai/OLMo-2-1124-13B/tree/stage2-ingredient4-step35773-tokens300B) | [OLMo2-13B-stage2-seed2662-300B.yaml](configs/official-1124/OLMo2-13B-stage2-seed2662-300B.yaml) | [wandb.ai/OLMo2-13B](https://wandb.ai/ai2-llm/OLMo-2-1124-13B/reports/OLMo-2-13B-Nov-2024--VmlldzoxMDUzMjQxNg) |
 | **final souped model** | [main](https://huggingface.co/allenai/OLMo-2-1124-13B/tree/main) | no config, we just averaged the weights in Python | |
 
-The training configs linked here are set up to download the latest checkpoint after stage 1, and start training from there.
+The training configs linked here are set up to download the latest checkpoints after stage 1, and start training from there.
 
 > Note: You can find all the information about the 32B in the [OLMo-core](https://github.com/allenai/OLMo-core) repository.
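The 13B table above says the final model was produced with "no config, we just averaged the weights in Python." As a rough illustration of that kind of soup, here is a minimal sketch that averages plain PyTorch state dicts; the helper and file names are illustrative, not taken from this repository, and it assumes all ingredients share the same architecture.

```python
import torch


def soup(checkpoint_paths):
    """Element-wise average of parameters from checkpoints with identical architectures."""
    state_dicts = [torch.load(p, map_location="cpu") for p in checkpoint_paths]
    averaged = {}
    for key in state_dicts[0]:
        # Stack the same tensor from every ingredient and take the mean.
        averaged[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
    return averaged


# Hypothetical usage with the four stage-2 ingredients:
# souped = soup(["ingredient1.pt", "ingredient2.pt", "ingredient3.pt", "ingredient4.pt"])
# torch.save(souped, "souped.pt")
```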

configs/official-1124/provenance.csv

Lines changed: 1120 additions & 0 deletions
Large diffs are not rendered by default.
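Since the new file is not rendered here, a quick way to inspect it locally is sketched below. The column layout is not visible in this diff, so the snippet reads rows generically instead of assuming header names.

```python
import csv

# Sketch: peek at the first few rows of the newly added provenance listing.
with open("configs/official-1124/provenance.csv", newline="") as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row)
        if i >= 4:
            break
```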

hf_olmo/convert_olmo_to_hf.py

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,7 @@
 from hf_olmo.tokenization_olmo_fast import OLMoTokenizerFast
 from olmo import ModelConfig, Tokenizer, TrainConfig
 from olmo.checkpoint import build_sharded_checkpointer
+from olmo.model import OLMo
 from olmo.util import _get_s3_client
 
 logger = logging.getLogger(__name__)
@@ -70,7 +71,8 @@ def write_model(checkpoint_dir: str, ignore_olmo_compatibility: bool = False):
     old_model_path = os.path.join(checkpoint_dir, "model.pt")
     new_model_path = os.path.join(checkpoint_dir, "pytorch_model.bin")
 
-    state_dict = torch.load(old_model_path, map_location="cpu")
+    # Loading the checkpoint using `OLMo.from_checkpoint` handles backwards compatibility logic.
+    state_dict = OLMo.from_checkpoint(checkpoint_dir).state_dict()
 
     # this takes care of the case where the model was saved with a different prefix,
     # typically due to unsharding.
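The second hunk is the substance of the changelog's "backwards-compatible logic" entry: instead of `torch.load`-ing `model.pt` directly, the converter now asks `OLMo.from_checkpoint` for the model and exports its state dict. A minimal sketch of that flow, using only what the hunk shows and omitting the key-prefix cleanup the real `write_model` performs afterwards (the helper name is illustrative):

```python
import os

import torch

from olmo.model import OLMo


def export_state_dict(checkpoint_dir: str) -> str:
    """Sketch: re-serialize an OLMo checkpoint as pytorch_model.bin for the HF wrapper."""
    # OLMo.from_checkpoint applies the library's backwards-compatibility fixes,
    # so older checkpoint layouts load the same way as current ones.
    state_dict = OLMo.from_checkpoint(checkpoint_dir).state_dict()
    new_model_path = os.path.join(checkpoint_dir, "pytorch_model.bin")
    torch.save(state_dict, new_model_path)
    return new_model_path
```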

hf_olmo/modeling_olmo.py

Lines changed: 4 additions & 3 deletions
@@ -3,7 +3,7 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
-from transformers import PreTrainedModel
+from transformers import GenerationMixin, PreTrainedModel
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.auto import AutoModelForCausalLM
@@ -38,7 +38,7 @@ def create_model_config_from_pretrained_config(config: OLMoConfig):
     return model_config
 
 
-class OLMoForCausalLM(PreTrainedModel):
+class OLMoForCausalLM(PreTrainedModel, GenerationMixin):
     """
     Extremely barebones HF model wrapper.
     """
@@ -143,7 +143,8 @@ def forward(
             hidden_states=hidden_states,
         )
 
-    def can_generate(self) -> bool:
+    @classmethod
+    def can_generate(cls) -> bool:
         return True
 
     def prepare_inputs_for_generation(
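Recent transformers releases no longer expose `.generate()` through `PreTrainedModel` alone (the test fixture below is bumped to 4.52.0.dev0), so the wrapper now mixes in `GenerationMixin` explicitly, and `can_generate` becomes a classmethod because transformers queries it on the class rather than on an instance. A hedged usage sketch; the checkpoint path is a placeholder, not something shipped in this commit:

```python
import hf_olmo  # noqa: F401  # importing registers the OLMo wrapper with the Auto classes
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder path: any checkpoint produced by hf_olmo/convert_olmo_to_hf.py.
checkpoint = "path/to/converted-olmo-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Language modeling is ", return_tensors="pt")
# .generate() is supplied by GenerationMixin; without the explicit mixin,
# newer transformers versions cannot generate with this wrapper.
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```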

olmo/checkpoint.py

Lines changed: 6 additions & 1 deletion
@@ -1926,18 +1926,23 @@ def save_checkpoint(
         (checkpoint_dir / "model").mkdir(exist_ok=True, parents=True)
         (checkpoint_dir / "optim").mkdir(exist_ok=True, parents=True)
         (checkpoint_dir / "train").mkdir(exist_ok=True, parents=True)
+        barrier()
 
         wait_for(
             lambda: (checkpoint_dir / "model").exists(), "Waiting for checkpoint model directory", timeout=10.0
         )
+
         wait_for(
             lambda: (checkpoint_dir / "optim").exists(), "Waiting for checkpoint optim directory", timeout=10.0
         )
+
         wait_for(
             lambda: (checkpoint_dir / "train").exists(), "Waiting for checkpoint train directory", timeout=10.0
         )
 
-        local_files_created = save_model_and_optim_state(checkpoint_dir, dist_model, optim)
+        local_files_created = save_model_and_optim_state(
+            checkpoint_dir, dist_model, optim, save_overwrite=self.cfg.save_overwrite
+        )
         if upload_to is not None:
             for path in local_files_created:
                 path = Path(path)
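Two fixes land in this hunk. The new `barrier()` keeps ranks from racing past directory creation: without it, a fast rank can reach the `wait_for` checks, or start writing shards, before the directories exist on a shared filesystem. Separately, `save_model_and_optim_state` now receives `save_overwrite=self.cfg.save_overwrite`, which is what the changelog's "fix save_overwrite pass" entry refers to. A minimal sketch of the synchronization pattern, assuming a standard `torch.distributed` setup (the helper name is illustrative):

```python
from pathlib import Path

import torch.distributed as dist


def prepare_checkpoint_dirs(checkpoint_dir: Path) -> None:
    """Sketch: create checkpoint subdirectories, then synchronize all ranks."""
    for sub in ("model", "optim", "train"):
        (checkpoint_dir / sub).mkdir(exist_ok=True, parents=True)
    # Without this barrier, ranks that did not create the directories may look
    # for them, or begin writing into them, before they are visible.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
```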

test_fixtures/test-olmo-model/config.json

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
   "rope_theta": 10000,
   "scale_emb_init": false,
   "scale_logits": false,
-  "transformers_version": "4.44.2",
+  "transformers_version": "4.52.0.dev0",
   "use_cache": true,
   "vocab_size": 50257,
   "weight_tying": true
Binary file not shown (838 Bytes).
