
Commit 9418d1b

Parameter for loading transformer & tokenizer with local_files_only
Add local_files_only as a parameter to the FoundationCache, in case that winds up being useful. The NONE download_method now doesn't download anything, including HF transformers, rather than adding a new download mode for this.
1 parent 076f7e3 commit 9418d1b
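For context, local_files_only is the standard Hugging Face from_pretrained flag that this commit threads through Stanza's loaders. A rough sketch of its effect (not part of the diff; bert-base-cased is just an example model name):

    from transformers import AutoModel, AutoTokenizer

    # With local_files_only=True, transformers only looks in the local cache
    # (e.g. ~/.cache/huggingface) and raises an error instead of contacting the Hub.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", local_files_only=True)
    model = AutoModel.from_pretrained("bert-base-cased", local_files_only=True)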

File tree

stanza/models/common/bert_embedding.py
stanza/models/common/foundation_cache.py
stanza/models/coref/model.py
stanza/pipeline/core.py

4 files changed: +37 -32 lines changed

stanza/models/common/bert_embedding.py

Lines changed: 5 additions & 4 deletions
@@ -32,7 +32,7 @@ def update_max_length(model_name, tokenizer):
                       'NYTK/electra-small-discriminator-hungarian'):
         tokenizer.model_max_length = 512
 
-def load_tokenizer(model_name, tokenizer_kwargs=None):
+def load_tokenizer(model_name, tokenizer_kwargs=None, local_files_only=False):
     if model_name:
         # note that use_fast is the default
         try:
@@ -44,20 +44,21 @@ def load_tokenizer(model_name, tokenizer_kwargs=None):
             bert_args["add_prefix_space"] = True
         if tokenizer_kwargs:
             bert_args.update(tokenizer_kwargs)
+        bert_args['local_files_only'] = local_files_only
         bert_tokenizer = AutoTokenizer.from_pretrained(model_name, **bert_args)
         update_max_length(model_name, bert_tokenizer)
         return bert_tokenizer
     return None
 
-def load_bert(model_name):
+def load_bert(model_name, tokenizer_kwargs=None, local_files_only=False):
     if model_name:
         # such as: "vinai/phobert-base"
         try:
             from transformers import AutoModel
         except ImportError:
             raise ImportError("Please install transformers library for BERT support! Try `pip install transformers`.")
-        bert_model = AutoModel.from_pretrained(model_name)
-        bert_tokenizer = load_tokenizer(model_name)
+        bert_model = AutoModel.from_pretrained(model_name, local_files_only=local_files_only)
+        bert_tokenizer = load_tokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs, local_files_only=local_files_only)
         return bert_model, bert_tokenizer
     return None, None
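A minimal usage sketch of the updated loaders (not part of the diff; it assumes the model is already in the local HF cache):

    from stanza.models.common.bert_embedding import load_bert, load_tokenizer

    # Load both model and tokenizer strictly from the local HF cache;
    # transformers raises an error instead of downloading if they are missing.
    bert_model, bert_tokenizer = load_bert("vinai/phobert-base", local_files_only=True)

    # or just the tokenizer
    tokenizer = load_tokenizer("vinai/phobert-base", local_files_only=True)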

stanza/models/common/foundation_cache.py

Lines changed: 18 additions & 15 deletions
@@ -16,7 +16,7 @@
 BertRecord = namedtuple('BertRecord', ['model', 'tokenizer', 'peft_ids'])
 
 class FoundationCache:
-    def __init__(self, other=None):
+    def __init__(self, other=None, local_files_only=False):
         if other is None:
             self.bert = {}
             self.charlms = {}
@@ -29,12 +29,13 @@ def __init__(self, other=None):
             self.charlms = other.charlms
             self.pretrains = other.pretrains
             self.lock = other.lock
+        self.local_files_only=local_files_only
 
-    def load_bert(self, transformer_name):
-        m, t, _ = self.load_bert_with_peft(transformer_name, None)
+    def load_bert(self, transformer_name, local_files_only=None):
+        m, t, _ = self.load_bert_with_peft(transformer_name, None, local_files_only=local_files_only)
         return m, t
 
-    def load_bert_with_peft(self, transformer_name, peft_name):
+    def load_bert_with_peft(self, transformer_name, peft_name, local_files_only=None):
         """
         Load a transformer only once
 
@@ -44,7 +45,9 @@ def load_bert_with_peft(self, transformer_name, peft_name):
             return None, None, None
         with self.lock:
             if transformer_name not in self.bert:
-                model, tokenizer = bert_embedding.load_bert(transformer_name)
+                if local_files_only is None:
+                    local_files_only = self.local_files_only
+                model, tokenizer = bert_embedding.load_bert(transformer_name, local_files_only=local_files_only)
                 self.bert[transformer_name] = BertRecord(model, tokenizer, {})
             else:
                 logger.debug("Reusing bert %s", transformer_name)
@@ -98,26 +101,26 @@ class NoTransformerFoundationCache(FoundationCache):
     since it will then have the finetuned weights for other models
     which don't want them
     """
-    def load_bert(self, transformer_name):
-        return load_bert(transformer_name)
+    def load_bert(self, transformer_name, local_files_only=None):
+        return load_bert(transformer_name, local_files_only=self.local_files_only if local_files_only is None else local_files_only)
 
-    def load_bert_with_peft(self, transformer_name, peft_name):
-        return load_bert_with_peft(transformer_name, peft_name)
+    def load_bert_with_peft(self, transformer_name, peft_name, local_files_only=None):
+        return load_bert_with_peft(transformer_name, peft_name, local_files_only=self.local_files_only if local_files_only is None else local_files_only)
 
-def load_bert(model_name, foundation_cache=None):
+def load_bert(model_name, foundation_cache=None, local_files_only=None):
     """
     Load a bert, possibly using a foundation cache, ignoring the cache if None
     """
     if foundation_cache is None:
-        return bert_embedding.load_bert(model_name)
+        return bert_embedding.load_bert(model_name, local_files_only=local_files_only)
     else:
-        return foundation_cache.load_bert(model_name)
+        return foundation_cache.load_bert(model_name, local_files_only=local_files_only)
 
-def load_bert_with_peft(model_name, peft_name, foundation_cache=None):
+def load_bert_with_peft(model_name, peft_name, foundation_cache=None, local_files_only=None):
     if foundation_cache is None:
-        m, t = bert_embedding.load_bert(model_name)
+        m, t = bert_embedding.load_bert(model_name, local_files_only=local_files_only)
         return m, t, peft_name
-    return foundation_cache.load_bert_with_peft(model_name, peft_name)
+    return foundation_cache.load_bert_with_peft(model_name, peft_name, local_files_only=local_files_only)
 
 def load_charlm(charlm_file, foundation_cache=None, finetune=False):
     if not charlm_file:
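A hedged sketch of how the cache-level default and the per-call override interact, based on the code above (the second model name is just a placeholder):

    from stanza.models.common.foundation_cache import FoundationCache

    # Cache configured for offline use: every transformer loaded through it
    # falls back to local_files_only=True unless the caller says otherwise.
    cache = FoundationCache(local_files_only=True)

    # Uses the cache default (local files only)
    model, tokenizer = cache.load_bert("vinai/phobert-base")

    # Per-call override: explicitly allow a download for this one transformer
    model2, tokenizer2 = cache.load_bert("xlm-roberta-base", local_files_only=False)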

stanza/models/coref/model.py

Lines changed: 2 additions & 1 deletion
@@ -560,7 +560,8 @@ def _build_model(self, foundation_cache):
         tokenizer_kwargs = self.config.tokenizer_kwargs.get(base_bert_name, {})
         if tokenizer_kwargs:
             logger.debug(f"Using tokenizer kwargs: {tokenizer_kwargs}")
-        self.tokenizer = load_tokenizer(self.config.bert_model, tokenizer_kwargs)
+        # we just downloaded the tokenizer, so for simplicity, we don't make another request to HF
+        self.tokenizer = load_tokenizer(self.config.bert_model, tokenizer_kwargs, local_files_only=True)
 
         if self.config.bert_finetune or (hasattr(self.config, 'lora') and self.config.lora):
             self.bert = self.bert.train()

stanza/pipeline/core.py

Lines changed: 12 additions & 12 deletions
@@ -40,7 +40,7 @@ class DownloadMethod(Enum):
     """
     Determines a couple options on how to download resources for the pipeline.
 
-    NONE will not download anything, probably resulting in failure if the resources aren't already in place.
+    NONE will not download anything, including HF transformers, probably resulting in failure if the resources aren't already in place.
     REUSE_RESOURCES will reuse the existing resources.json and models, but will download any missing models.
     DOWNLOAD_RESOURCES will download a new resources.json and will overwrite any out of date models.
     """
@@ -201,16 +201,9 @@ def __init__(self,
         # set global logging level
         set_logging_level(logging_level, verbose)
 
-        # processors can use this to save on the effort of loading
-        # large sub-models, such as pretrained embeddings, bert, etc
-        if foundation_cache is None:
-            self.foundation_cache = FoundationCache()
-        else:
-            self.foundation_cache = foundation_cache
-
-        download_method = normalize_download_method(download_method)
-        if (download_method is DownloadMethod.DOWNLOAD_RESOURCES or
-            (download_method is DownloadMethod.REUSE_RESOURCES and not os.path.exists(os.path.join(self.dir, "resources.json")))):
+        self.download_method = normalize_download_method(download_method)
+        if (self.download_method is DownloadMethod.DOWNLOAD_RESOURCES or
+            (self.download_method is DownloadMethod.REUSE_RESOURCES and not os.path.exists(os.path.join(self.dir, "resources.json")))):
             logger.info("Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES")
             download_resources_json(self.dir,
                                     resources_url=resources_url,
@@ -219,6 +212,13 @@ def __init__(self,
                                     resources_filepath=resources_filepath,
                                     proxies=proxies)
 
+        # processors can use this to save on the effort of loading
+        # large sub-models, such as pretrained embeddings, bert, etc
+        if foundation_cache is None:
+            self.foundation_cache = FoundationCache(local_files_only=(self.download_method is DownloadMethod.NONE))
+        else:
+            self.foundation_cache = FoundationCache(foundation_cache, local_files_only=(self.download_method is DownloadMethod.NONE))
+
         # process different pipeline parameters
         lang, self.dir, package, processors = process_pipeline_parameters(lang, self.dir, package, processors)
 
@@ -241,7 +241,7 @@ def __init__(self,
         if lang in resources:
            self.load_list = maintain_processor_list(resources, lang, package, processors, maybe_add_mwt=(not kwargs.get("tokenize_pretokenized")))
            self.load_list = add_dependencies(resources, lang, self.load_list)
-            if download_method is not DownloadMethod.NONE:
+            if self.download_method is not DownloadMethod.NONE:
                 # skip processors which aren't downloaded from our collection
                 download_list = [x for x in self.load_list if x[0] in resources.get(lang, {})]
                 # skip variants
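Taken together with the FoundationCache change, a minimal sketch of the intended end-to-end behavior (an assumption about usage, not part of the diff; it presumes the Stanza models and the HF transformer are already on disk):

    import stanza
    from stanza.pipeline.core import DownloadMethod

    # DownloadMethod.NONE: no resources.json refresh, no Stanza model downloads,
    # and the pipeline's FoundationCache is built with local_files_only=True,
    # so HF transformers are also loaded only from the local cache.
    nlp = stanza.Pipeline("en", download_method=DownloadMethod.NONE)
    doc = nlp("Stanza runs fully offline here.")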
