From a38c345bf365a2205b3bfdda5b5aabd7d5918660 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 26 Jan 2021 15:26:01 +0100 Subject: [PATCH 1/4] Allow partial loading of a cached tokenizer --- src/transformers/file_utils.py | 2 +- src/transformers/tokenization_utils_base.py | 33 +++++++++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index fd22962f0de1..fc4f73b686f6 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -1239,7 +1239,7 @@ def get_from_cache( # the models might've been found if local_files_only=False # Notify the user about that if local_files_only: - raise ValueError( + raise FileNotFoundError( "Cannot find the requested files in the cached path and outgoing traffic has been" " disabled. To enable model look-ups and downloads online, set 'local_files_only'" " to False." diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index f3161c710b9c..fe010c6519f9 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1730,20 +1730,26 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} + unresolved_files = [] for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None else: try: - resolved_vocab_files[file_id] = cached_path( - file_path, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - ) + try: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + except FileNotFoundError: + if local_files_only: + unresolved_files.append(file_id) + except requests.exceptions.HTTPError as err: if "404 Client Error" in str(err): logger.debug(err) @@ -1751,6 +1757,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], else: raise err + if len(unresolved_files) > 0: + logger.warning( + f"Can't load following files from cache: {unresolved_files} and cannot check if these " + f"files are necessary for the tokenizer to operate." + ) + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): msg = ( f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n" @@ -1760,6 +1772,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], raise EnvironmentError(msg) for file_id, file_path in vocab_files.items(): + if file_id not in resolved_vocab_files: + continue + if file_path == resolved_vocab_files[file_id]: logger.info("loading file {}".format(file_path)) else: From 87897ab25039be65f0862fca106367625f909f5b Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 26 Jan 2021 15:42:46 +0100 Subject: [PATCH 2/4] Warning > Info --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index fe010c6519f9..fd194340f3c2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1758,7 +1758,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], raise err if len(unresolved_files) > 0: - logger.warning( + logger.info( f"Can't load following files from cache: {unresolved_files} and cannot check if these " f"files are necessary for the tokenizer to operate." ) From 6e5cd74e06a74bce1d2635fe9e1a87a06d226c05 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 26 Jan 2021 16:25:56 +0100 Subject: [PATCH 3/4] Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index fd194340f3c2..0d1b5e6a3d04 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1760,7 +1760,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], if len(unresolved_files) > 0: logger.info( f"Can't load following files from cache: {unresolved_files} and cannot check if these " - f"files are necessary for the tokenizer to operate." + "files are necessary for the tokenizer to operate." ) if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): From a9669e53c8ec1920e11f2a0c5d3d1f4ad26cdaeb Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 27 Jan 2021 10:38:44 +0100 Subject: [PATCH 4/4] Raise error if not local_files_only --- src/transformers/tokenization_utils_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0d1b5e6a3d04..8544547d8226 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1746,9 +1746,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], local_files_only=local_files_only, use_auth_token=use_auth_token, ) - except FileNotFoundError: + except FileNotFoundError as error: if local_files_only: unresolved_files.append(file_id) + else: + raise error except requests.exceptions.HTTPError as err: if "404 Client Error" in str(err):