diff --git a/docs/source/guides/upload.mdx b/docs/source/guides/upload.mdx index 02a8298f71..a2121c7a9f 100644 --- a/docs/source/guides/upload.mdx +++ b/docs/source/guides/upload.mdx @@ -79,6 +79,9 @@ Use the `allow_patterns` and `ignore_patterns` arguments to specify which files Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded. +Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` file is not taken into account. +This means you must use `allow_patterns` and `ignore_patterns` to specify which files to upload instead. + ```py >>> api.upload_folder( ... folder_path="/path/to/local/folder", diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index 3d57aac8db..62e3bfd4ac 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -198,6 +198,11 @@ def _validate_path_in_repo(path_in_repo: str) -> str: raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'") if path_in_repo.startswith("./"): path_in_repo = path_in_repo[2:] + if any(part == ".git" for part in path_in_repo.split("/")): + raise ValueError( + "Invalid `path_in_repo` in CommitOperation: cannot update files under a '.git/' folder (path:" + f" '{path_in_repo}')." + ) return path_in_repo diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 29d38e2d6e..349f6c2f5d 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -27,7 +27,7 @@ import requests from requests.exceptions import HTTPError -from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, get_session +from huggingface_hub.utils import IGNORE_GIT_FOLDER_PATTERNS, EntryNotFoundError, RepositoryNotFoundError, get_session from ._commit_api import ( CommitOperation, @@ -86,6 +86,7 @@ USERNAME_PLACEHOLDER = "hf_user" _REGEX_DISCUSSION_URL = re.compile(r".*/discussions/(\d+)$") + logger = logging.get_logger(__name__) @@ -2606,6 +2607,9 @@ def upload_folder( will delete any remote file under `experiment/logs/`. Note that the `.gitattributes` file will not be deleted even if it matches the patterns. + Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` + file is not taken into account. + Uses `HfApi.create_commit` under the hood. Args: @@ -2720,6 +2724,13 @@ def upload_folder( if path_in_repo is None: path_in_repo = "" + # Do not upload .git folder + if ignore_patterns is None: + ignore_patterns = [] + elif isinstance(ignore_patterns, str): + ignore_patterns = [ignore_patterns] + ignore_patterns += IGNORE_GIT_FOLDER_PATTERNS + commit_message = ( commit_message if commit_message is not None else f"Upload {path_in_repo} with huggingface_hub" ) diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index 0274cecb2f..bfdf881b87 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -44,7 +44,7 @@ from ._headers import build_hf_headers, get_token_to_send from ._hf_folder import HfFolder from ._http import configure_http_backend, get_session, http_backoff -from ._paths import filter_repo_objects +from ._paths import filter_repo_objects, IGNORE_GIT_FOLDER_PATTERNS from ._runtime import ( dump_environment_info, get_fastai_version, diff --git a/src/huggingface_hub/utils/_paths.py b/src/huggingface_hub/utils/_paths.py index 2eeb606713..0a994bf5e9 100644 --- a/src/huggingface_hub/utils/_paths.py +++ b/src/huggingface_hub/utils/_paths.py @@ -20,6 +20,8 @@ T = TypeVar("T") +IGNORE_GIT_FOLDER_PATTERNS = [".git", ".git/*", "*/.git", "**/.git/**"] + def filter_repo_objects( items: Iterable[T], diff --git a/tests/test_commit_api.py b/tests/test_commit_api.py index 618e3275b0..404474620c 100644 --- a/tests/test_commit_api.py +++ b/tests/test_commit_api.py @@ -56,6 +56,44 @@ def test_path_in_repo_invalid(self) -> None: CommitOperationDelete(path_in_repo=input) +class TestCommitOperationForbiddenPathInRepo(unittest.TestCase): + """Commit operations must throw an error on files in the .git/ folder. + + Server would error anyway so it's best to prevent early. + """ + + INVALID_PATHS_IN_REPO = { + ".git", + ".git/path/to/file", + "./.git/path/to/file", + "subfolder/path/.git/to/file", + "./subfolder/path/.git/to/file", + } + + VALID_PATHS_IN_REPO = { + ".gitignore", + "path/to/.gitignore", + "path/to/something.git", + "path/to/something.git/more", + } + + def test_cannot_update_file_in_git_folder(self): + for path in self.INVALID_PATHS_IN_REPO: + with self.subTest(msg=f"Add: '{path}'"): + with self.assertRaises(ValueError): + CommitOperationAdd(path_in_repo=path, path_or_fileobj=b"content") + with self.subTest(msg=f"Delete: '{path}'"): + with self.assertRaises(ValueError): + CommitOperationDelete(path_in_repo=path) + + def test_valid_path_in_repo_containing_git(self): + for path in self.VALID_PATHS_IN_REPO: + with self.subTest(msg=f"Add: '{path}'"): + CommitOperationAdd(path_in_repo=path, path_or_fileobj=b"content") + with self.subTest(msg=f"Delete: '{path}'"): + CommitOperationDelete(path_in_repo=path) + + class TestWarnOnOverwritingOperations(unittest.TestCase): add_file_ab = CommitOperationAdd(path_in_repo="a/b.txt", path_or_fileobj=b"data") add_file_abc = CommitOperationAdd(path_in_repo="a/b/c.md", path_or_fileobj=b"data") diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index c63e7b31b5..280d0cd50c 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -45,7 +45,7 @@ REPO_TYPE_SPACE, SPACES_SDK_TYPES, ) -from huggingface_hub.file_download import cached_download, hf_hub_download +from huggingface_hub.file_download import hf_hub_download from huggingface_hub.hf_api import ( CommitInfo, DatasetInfo, @@ -67,6 +67,7 @@ HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, + SoftTemporaryDirectory, logging, ) from huggingface_hub.utils.endpoint_helpers import ( @@ -281,13 +282,12 @@ def setUp(self) -> None: self.addCleanup(rmtree_with_retry, self.tmp_dir) @retry_endpoint - def test_upload_file_validation(self): - REPO_NAME = repo_name("upload") + def test_upload_file_validation(self) -> None: with self.assertRaises(ValueError, msg="Wrong repo type"): self._api.upload_file( path_or_fileobj=self.tmp_file, path_in_repo="README.md", - repo_id=f"{USER}/{REPO_NAME}", + repo_id="something", repo_type="this type does not exist", ) @@ -306,102 +306,61 @@ def test_commit_operation_validation(self): ) @retry_endpoint - def test_upload_file_str_path(self): - REPO_NAME = repo_name("str_path") - self._api.create_repo(repo_id=REPO_NAME) - try: - return_val = self._api.upload_file( - path_or_fileobj=self.tmp_file, - path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - ) - self.assertEqual( - return_val, - f"{self._api.endpoint}/{USER}/{REPO_NAME}/blob/main/temp/new_file.md", - ) - url = "{}/{user}/{repo}/resolve/main/temp/new_file.md".format( - ENDPOINT_STAGING, - user=USER, - repo=REPO_NAME, - ) - filepath = cached_download(url, force_download=True, legacy_cache_layout=True) - with open(filepath) as downloaded_file: - content = downloaded_file.read() - self.assertEqual(content, self.tmp_file_content) - - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) - - @retry_endpoint - def test_upload_file_pathlib_path(self): - """Regression test for https://github.com/huggingface/huggingface_hub/issues/1246.""" - repo_id = f"{USER}/{repo_name()}" - self._api.create_repo(repo_id=repo_id) - self._api.upload_file( - path_or_fileobj=Path(self.tmp_file), - path_in_repo="README.md", + @use_tmp_repo() + def test_upload_file_str_path(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + return_val = self._api.upload_file( + path_or_fileobj=self.tmp_file, + path_in_repo="temp/new_file.md", repo_id=repo_id, ) - self.assertIn("README.md", self._api.list_repo_files(repo_id=repo_id)) - self._api.delete_repo(repo_id=repo_id) + self.assertEqual(return_val, f"{repo_url}/blob/main/temp/new_file.md") - @retry_endpoint - def test_upload_file_fileobj(self): - REPO_NAME = repo_name("fileobj") - self._api.create_repo(repo_id=REPO_NAME) - try: - with open(self.tmp_file, "rb") as filestream: - return_val = self._api.upload_file( - path_or_fileobj=filestream, - path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - ) - self.assertEqual( - return_val, - f"{self._api.endpoint}/{USER}/{REPO_NAME}/blob/main/temp/new_file.md", - ) - url = "{}/{user}/{repo}/resolve/main/temp/new_file.md".format(ENDPOINT_STAGING, user=USER, repo=REPO_NAME) - filepath = cached_download(url, force_download=True, legacy_cache_layout=True) - with open(filepath) as downloaded_file: - content = downloaded_file.read() - self.assertEqual(content, self.tmp_file_content) + with SoftTemporaryDirectory() as cache_dir: + with open(hf_hub_download(repo_id=repo_id, filename="temp/new_file.md", cache_dir=cache_dir)) as f: + self.assertEqual(f.read(), self.tmp_file_content) - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) + @retry_endpoint + @use_tmp_repo() + def test_upload_file_pathlib_path(self, repo_url: RepoUrl) -> None: + """Regression test for https://github.com/huggingface/huggingface_hub/issues/1246.""" + self._api.upload_file(path_or_fileobj=Path(self.tmp_file), path_in_repo="README.md", repo_id=repo_url.repo_id) + self.assertIn("README.md", self._api.list_repo_files(repo_id=repo_url.repo_id)) @retry_endpoint - def test_upload_file_bytesio(self): - REPO_NAME = repo_name("bytesio") - self._api.create_repo(repo_id=REPO_NAME) - try: - filecontent = BytesIO(b"File content, but in bytes IO") + @use_tmp_repo() + def test_upload_file_fileobj(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + with open(self.tmp_file, "rb") as filestream: return_val = self._api.upload_file( - path_or_fileobj=filecontent, + path_or_fileobj=filestream, path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - ) - self.assertEqual( - return_val, - f"{self._api.endpoint}/{USER}/{REPO_NAME}/blob/main/temp/new_file.md", + repo_id=repo_id, ) + self.assertEqual(return_val, f"{repo_url}/blob/main/temp/new_file.md") - url = "{}/{user}/{repo}/resolve/main/temp/new_file.md".format(ENDPOINT_STAGING, user=USER, repo=REPO_NAME) - filepath = cached_download(url, force_download=True, legacy_cache_layout=True) - with open(filepath) as downloaded_file: - content = downloaded_file.read() - self.assertEqual(content, filecontent.getvalue().decode()) + with SoftTemporaryDirectory() as cache_dir: + with open(hf_hub_download(repo_id=repo_id, filename="temp/new_file.md", cache_dir=cache_dir)) as f: + self.assertEqual(f.read(), self.tmp_file_content) - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) + @retry_endpoint + @use_tmp_repo() + def test_upload_file_bytesio(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + content = BytesIO(b"File content, but in bytes IO") + return_val = self._api.upload_file( + path_or_fileobj=content, + path_in_repo="temp/new_file.md", + repo_id=repo_id, + ) + self.assertEqual(return_val, f"{repo_url}/blob/main/temp/new_file.md") + + with SoftTemporaryDirectory() as cache_dir: + with open(hf_hub_download(repo_id=repo_id, filename="temp/new_file.md", cache_dir=cache_dir)) as f: + self.assertEqual(f.read(), content.getvalue().decode()) @retry_endpoint - def test_create_repo_return_value(self): + def test_create_repo_return_value(self) -> None: REPO_NAME = repo_name("org") url = self._api.create_repo(repo_id=REPO_NAME) self.assertIsInstance(url, str) @@ -433,85 +392,38 @@ def test_create_repo_already_exists_but_no_write_permission(self): self._api.delete_repo(repo_id=repo_id, token=OTHER_TOKEN) @retry_endpoint - def test_upload_buffer(self): - REPO_NAME = repo_name("buffer") - self._api.create_repo(repo_id=REPO_NAME) - try: - buffer = BytesIO() - buffer.write(self.tmp_file_content.encode()) - return_val = self._api.upload_file( - path_or_fileobj=buffer.getvalue(), - path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - ) - self.assertEqual( - return_val, - f"{self._api.endpoint}/{USER}/{REPO_NAME}/blob/main/temp/new_file.md", - ) - - url = "{}/{user}/{repo}/resolve/main/temp/new_file.md".format(ENDPOINT_STAGING, user=USER, repo=REPO_NAME) - filepath = cached_download(url, force_download=True, legacy_cache_layout=True) - with open(filepath) as downloaded_file: - content = downloaded_file.read() - self.assertEqual(content, self.tmp_file_content) - - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) - - @retry_endpoint - def test_upload_file_create_pr(self): - REPO_NAME = repo_name("buffer") - pr_revision = quote("refs/pr/1", safe="") - self._api.create_repo(repo_id=REPO_NAME) - try: - buffer = BytesIO() - buffer.write(self.tmp_file_content.encode()) - return_val = self._api.upload_file( - path_or_fileobj=buffer.getvalue(), - path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - create_pr=True, - ) - self.assertEqual( - return_val, - f"{self._api.endpoint}/{USER}/{REPO_NAME}/blob/{pr_revision}/temp/new_file.md", - ) - - url = "{}/{user}/{repo}/resolve/{revision}/temp/new_file.md".format( - ENDPOINT_STAGING, revision=pr_revision, user=USER, repo=REPO_NAME - ) - filepath = cached_download(url, force_download=True, legacy_cache_layout=True) - with open(filepath) as downloaded_file: - content = downloaded_file.read() - self.assertEqual(content, self.tmp_file_content) + @use_tmp_repo() + def test_upload_file_create_pr(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + return_val = self._api.upload_file( + path_or_fileobj=self.tmp_file_content.encode(), + path_in_repo="temp/new_file.md", + repo_id=repo_id, + create_pr=True, + ) + self.assertEqual(return_val, f"{repo_url}/blob/{quote('refs/pr/1', safe='')}/temp/new_file.md") - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) + with SoftTemporaryDirectory() as cache_dir: + with open( + hf_hub_download( + repo_id=repo_id, filename="temp/new_file.md", revision="refs/pr/1", cache_dir=cache_dir + ) + ) as f: + self.assertEqual(f.read(), self.tmp_file_content) @retry_endpoint - def test_delete_file(self): - REPO_NAME = repo_name("delete") - self._api.create_repo(repo_id=REPO_NAME) - try: - self._api.upload_file( - path_or_fileobj=self.tmp_file, - path_in_repo="temp/new_file.md", - repo_id=f"{USER}/{REPO_NAME}", - ) - self._api.delete_file(path_in_repo="temp/new_file.md", repo_id=f"{USER}/{REPO_NAME}") - - with self.assertRaises(HTTPError): - # Should raise a 404 - hf_hub_download(f"{USER}/{REPO_NAME}", "temp/new_file.md") + @use_tmp_repo() + def test_delete_file(self, repo_url: RepoUrl) -> None: + self._api.upload_file( + path_or_fileobj=self.tmp_file, + path_in_repo="temp/new_file.md", + repo_id=repo_url.repo_id, + ) + self._api.delete_file(path_in_repo="temp/new_file.md", repo_id=repo_url.repo_id) - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) + with self.assertRaises(EntryNotFoundError): + # Should raise a 404 + hf_hub_download(repo_url.repo_id, "temp/new_file.md") def test_get_full_repo_name(self): repo_name_with_no_org = self._api.get_full_repo_name("model") @@ -611,6 +523,29 @@ def test_upload_folder_default_path_in_repo(self): # URL to root of repository self.assertEqual(url, f"{self._api.endpoint}/{USER}/{REPO_NAME}/tree/main/") + @retry_endpoint + @use_tmp_repo() + def test_upload_folder_git_folder_excluded(self, repo_url: RepoUrl) -> None: + # Simulate a folder with a .git folder + def _create_file(*parts) -> None: + path = Path(self.tmp_dir, *parts) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("content") + + _create_file(".git", "file.txt") + _create_file(".git", "folder", "file.txt") + _create_file("folder", ".git", "file.txt") + _create_file("folder", ".git", "folder", "file.txt") + _create_file(".git_something", "file.txt") + _create_file("file.git") + + # Upload folder and check that .git folder is excluded + self._api.upload_folder(folder_path=self.tmp_dir, repo_id=repo_url.repo_id) + self.assertEqual( + set(self._api.list_repo_files(repo_id=repo_url.repo_id)), + {".gitattributes", ".git_something/file.txt", "file.git", "temp", "nested/file.bin"}, + ) + @retry_endpoint def test_create_commit_create_pr(self): REPO_NAME = repo_name("create_commit_create_pr") diff --git a/tests/test_utils_paths.py b/tests/test_utils_paths.py index 8c968ad0f3..a521888cad 100644 --- a/tests/test_utils_paths.py +++ b/tests/test_utils_paths.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, Callable, List, Optional, Union -from huggingface_hub.utils import filter_repo_objects +from huggingface_hub.utils import IGNORE_GIT_FOLDER_PATTERNS, filter_repo_objects @dataclass @@ -101,3 +101,27 @@ def _check( ), expected_items, ) + + +class TestGitFolderExclusion(unittest.TestCase): + GIT_FOLDER_PATHS = [ + ".git", + ".git/file.txt", + ".git/folder/file.txt", + "path/to/folder/.git", + "path/to/folder/.git/file.txt", + "path/to/.git/folder/file.txt", + ] + + NOT_GIT_FOLDER_PATHS = [ + ".gitignore", + "path/foo.git/file.txt", + "path/.git_bar/file.txt", + "path/to/file.git", + ] + + def test_exclude_git_folder(self): + filtered_paths = filter_repo_objects( + items=self.GIT_FOLDER_PATHS + self.NOT_GIT_FOLDER_PATHS, ignore_patterns=IGNORE_GIT_FOLDER_PATTERNS + ) + self.assertListEqual(list(filtered_paths), self.NOT_GIT_FOLDER_PATHS)