Commit d37b52d
Authored by Kyle Sayers (kylesayrs) · 1 parent 5bfb497

Separate trust_remote_code args (#152)

* add trust_remote_code
* add trust_remote_code_data and separate out trust_remote_code_model
* add trust_remote_code_data arg to applicable tests
* use data args, not kwargs

Co-authored-by: Kyle Sayers <[email protected]>

File tree: 5 files changed, +22 −8 lines

src/llmcompressor/transformers/finetune/data/data_args.py
src/llmcompressor/transformers/finetune/data/data_helpers.py
src/llmcompressor/transformers/finetune/model_args.py
src/llmcompressor/transformers/finetune/text_generation.py
tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py

src/llmcompressor/transformers/finetune/data/data_args.py (9 additions, 0 deletions)

@@ -164,3 +164,12 @@ class DataTrainingArguments(CustomDataTrainingArguments):
             ),
         },
     )
+    trust_remote_code_data: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether or not to allow for datasets defined on the Hub using "
+            "a dataset script. This option should only be set to True for "
+            "repositories you trust and in which you have read the code, as it "
+            "will execute code present on the Hub on your local machine."
+        },
+    )
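For context, a minimal sketch of how the new field is set when constructing data arguments, using the same dataset name as the tests updated below:

from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments

# Opt in to running a Hub dataset's loading script locally; leave this False
# (the default) unless you trust and have read the dataset repository's code.
data_args = DataTrainingArguments(
    dataset="open_platypus",
    trust_remote_code_data=True,
)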

src/llmcompressor/transformers/finetune/data/data_helpers.py (1 addition, 1 deletion)

@@ -80,12 +80,12 @@ def get_raw_dataset(
     :return: the requested dataset

     """
-
     raw_datasets = load_dataset(
         data_args.dataset,
         data_args.dataset_config_name,
         cache_dir=cache_dir,
         streaming=streaming,
+        trust_remote_code=data_args.trust_remote_code_data,
         **kwargs,
     )
     return raw_datasets
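The new field is forwarded straight to datasets.load_dataset, which only executes a Hub-hosted loading script when trust_remote_code=True. A minimal sketch of that underlying call (the repo id here is hypothetical):

from datasets import load_dataset

# For script-based datasets, the datasets library will not run the Hub
# script unless trust_remote_code=True is passed as an explicit opt-in.
ds = load_dataset("some-org/script-based-dataset", trust_remote_code=True)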

src/llmcompressor/transformers/finetune/model_args.py (1 addition, 1 deletion)

@@ -69,7 +69,7 @@ class ModelArguments:
             "model has a output word embedding layer."
         },
     )
-    trust_remote_code: bool = field(
+    trust_remote_code_model: bool = field(
         default=False,
         metadata={
             "help": "Whether or not to allow for custom models to execute their "

src/llmcompressor/transformers/finetune/text_generation.py (5 additions, 5 deletions)

@@ -146,14 +146,14 @@ def initialize_model_from_path(
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
         tie_word_embeddings=model_args.tie_word_embeddings,
-        trust_remote_code=model_args.trust_remote_code,
+        trust_remote_code=model_args.trust_remote_code_model,
     )
     teacher_config = (
         AutoConfig.from_pretrained(
             model_args.distill_teacher,
             use_auth_token=True if model_args.use_auth_token else None,
             tie_word_embeddings=model_args.tie_word_embeddings,
-            trust_remote_code=model_args.trust_remote_code,
+            trust_remote_code=model_args.trust_remote_code_model,
         )
         if model_args.distill_teacher
         else None

@@ -187,7 +187,7 @@ def initialize_model_from_path(
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
         "device_map": device_map,
-        "trust_remote_code": model_args.trust_remote_code,
+        "trust_remote_code": model_args.trust_remote_code_model,
     }
     teacher_device_map = None if fsdp_enabled else "auto"
     teacher_kwargs = {

@@ -196,7 +196,7 @@ def initialize_model_from_path(
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
         "device_map": teacher_device_map,
-        "trust_remote_code": model_args.trust_remote_code,
+        "trust_remote_code": model_args.trust_remote_code_model,
     }
     # this calls from_pretrained under the hood so should be FSDP safe
     model = SparseAutoModel.text_generation_from_pretrained(

@@ -227,7 +227,7 @@ def initialize_tokenizer_from_path(model_args, model, teacher):
         use_fast=True,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
-        trust_remote_code=model_args.trust_remote_code,
+        trust_remote_code=model_args.trust_remote_code_model,
     )

     return tokenizer
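Internally the model-side flag still maps onto transformers' standard trust_remote_code parameter; a minimal sketch mirroring the AutoConfig call above (the repo id is hypothetical):

from transformers import AutoConfig

# Equivalent to the updated call when model_args.trust_remote_code_model is True.
config = AutoConfig.from_pretrained(
    "some-org/custom-model",
    trust_remote_code=True,
)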

tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py (6 additions, 1 deletion)

@@ -158,6 +158,7 @@ def test_datasets(self, dataset_key, dataset_config, split, do_concat):
             dataset=dataset_key,
             dataset_config_name=dataset_config,
             concatenate_data=do_concat,
+            trust_remote_code_data=True,
         )
         manager = TextGenerationDataset.load_from_registry(
             data_args.dataset,

@@ -265,7 +266,11 @@ def prepare_fixture(self, tiny_llama_tokenizer):
     def test_split_loading(self, split_def):
         from llmcompressor.transformers.finetune.model_args import ModelArguments

-        data_args = DataTrainingArguments(dataset="open_platypus", splits=split_def)
+        data_args = DataTrainingArguments(
+            dataset="open_platypus",
+            splits=split_def,
+            trust_remote_code_data=True,
+        )
         training_args = TrainingArguments(do_train=True, output_dir="dummy")
         model_args = ModelArguments(model=None)
         stage_runner = StageRunner(
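Assuming the entry point parses these dataclasses with transformers' HfArgumentParser, as is conventional for such *Arguments classes (an assumption, not shown in this diff), each boolean field becomes its own CLI flag, so the two opt-ins stay independent:

from transformers import HfArgumentParser
from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments
from llmcompressor.transformers.finetune.model_args import ModelArguments

# Hypothetical invocation: model-code and dataset-script trust are separate flags.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments))
model_args, data_args = parser.parse_args_into_dataclasses(
    [
        "--model", "some-org/custom-model",
        "--dataset", "open_platypus",
        "--trust_remote_code_model", "true",
        "--trust_remote_code_data", "true",
    ]
)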
