From a38c345bf365a2205b3bfdda5b5aabd7d5918660 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 26 Jan 2021 15:26:01 +0100
Subject: [PATCH 1/4] Allow partial loading of a cached tokenizer

---
 src/transformers/file_utils.py              |  2 +-
 src/transformers/tokenization_utils_base.py | 33 +++++++++++++++------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index fd22962f0de1..fc4f73b686f6 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -1239,7 +1239,7 @@ def get_from_cache(
                 # the models might've been found if local_files_only=False
                 # Notify the user about that
                 if local_files_only:
-                    raise ValueError(
+                    raise FileNotFoundError(
                         "Cannot find the requested files in the cached path and outgoing traffic has been"
                         " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                         " to False."
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index f3161c710b9c..fe010c6519f9 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1730,20 +1730,26 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 
         # Get files from url, cache, or disk depending on the case
         resolved_vocab_files = {}
+        unresolved_files = []
         for file_id, file_path in vocab_files.items():
             if file_path is None:
                 resolved_vocab_files[file_id] = None
             else:
                 try:
-                    resolved_vocab_files[file_id] = cached_path(
-                        file_path,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        resume_download=resume_download,
-                        local_files_only=local_files_only,
-                        use_auth_token=use_auth_token,
-                    )
+                    try:
+                        resolved_vocab_files[file_id] = cached_path(
+                            file_path,
+                            cache_dir=cache_dir,
+                            force_download=force_download,
+                            proxies=proxies,
+                            resume_download=resume_download,
+                            local_files_only=local_files_only,
+                            use_auth_token=use_auth_token,
+                        )
+                    except FileNotFoundError:
+                        if local_files_only:
+                            unresolved_files.append(file_id)
+
                 except requests.exceptions.HTTPError as err:
                     if "404 Client Error" in str(err):
                         logger.debug(err)
@@ -1751,6 +1757,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
                     else:
                         raise err
 
+        if len(unresolved_files) > 0:
+            logger.warning(
+                f"Can't load following files from cache: {unresolved_files} and cannot check if these "
+                f"files are necessary for the tokenizer to operate."
+            )
+
         if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
             msg = (
                 f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
@@ -1760,6 +1772,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
             raise EnvironmentError(msg)
 
         for file_id, file_path in vocab_files.items():
+            if file_id not in resolved_vocab_files:
+                continue
+
             if file_path == resolved_vocab_files[file_id]:
                 logger.info("loading file {}".format(file_path))
             else:

From 87897ab25039be65f0862fca106367625f909f5b Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 26 Jan 2021 15:42:46 +0100
Subject: [PATCH 2/4] Log unresolved cached files at info level instead of warning

---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index fe010c6519f9..fd194340f3c2 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1758,7 +1758,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
                         raise err
 
         if len(unresolved_files) > 0:
-            logger.warning(
+            logger.info(
                 f"Can't load following files from cache: {unresolved_files} and cannot check if these "
                 f"files are necessary for the tokenizer to operate."
             )

From 6e5cd74e06a74bce1d2635fe9e1a87a06d226c05 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Tue, 26 Jan 2021 16:25:56 +0100
Subject: [PATCH 3/4] Drop redundant f-string prefix in tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index fd194340f3c2..0d1b5e6a3d04 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1760,7 +1760,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
         if len(unresolved_files) > 0:
             logger.info(
                 f"Can't load following files from cache: {unresolved_files} and cannot check if these "
-                f"files are necessary for the tokenizer to operate."
+                "files are necessary for the tokenizer to operate."
             )
 
         if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):

From a9669e53c8ec1920e11f2a0c5d3d1f4ad26cdaeb Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Wed, 27 Jan 2021 10:38:44 +0100
Subject: [PATCH 4/4] Raise error if not local_files_only

---
 src/transformers/tokenization_utils_base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 0d1b5e6a3d04..8544547d8226 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1746,9 +1746,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
                             local_files_only=local_files_only,
                             use_auth_token=use_auth_token,
                         )
-                    except FileNotFoundError:
+                    except FileNotFoundError as error:
                         if local_files_only:
                             unresolved_files.append(file_id)
+                        else:
+                            raise error
 
                 except requests.exceptions.HTTPError as err:
                     if "404 Client Error" in str(err):