Skip to content

Commit

Permalink
Raise issue if trying to upload .git/ folder + ignore .git/ folde…
Browse files Browse the repository at this point in the history
…r in `upload_folder` (#1408)

* More constrained

* error message

* ignore .git folder

* Add tests

* add docstring

* docs

* Update docs/source/guides/upload.mdx

Co-authored-by: lewtun <[email protected]>

* Update src/huggingface_hub/hf_api.py

---------

Co-authored-by: lewtun <[email protected]>
  • Loading branch information
Wauplin and lewtun authored Mar 24, 2023
1 parent 0d2fc77 commit ce2789c
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 165 deletions.
3 changes: 3 additions & 0 deletions docs/source/guides/upload.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ Use the `allow_patterns` and `ignore_patterns` arguments to specify which files
Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm).
If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded.

Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` file is not taken into account.
This means you must use `allow_patterns` and `ignore_patterns` to specify which files to upload instead.

```py
>>> api.upload_folder(
... folder_path="/path/to/local/folder",
Expand Down
5 changes: 5 additions & 0 deletions src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
if path_in_repo.startswith("./"):
path_in_repo = path_in_repo[2:]
if any(part == ".git" for part in path_in_repo.split("/")):
raise ValueError(
"Invalid `path_in_repo` in CommitOperation: cannot update files under a '.git/' folder (path:"
f" '{path_in_repo}')."
)
return path_in_repo


Expand Down
13 changes: 12 additions & 1 deletion src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import requests
from requests.exceptions import HTTPError

from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, get_session
from huggingface_hub.utils import IGNORE_GIT_FOLDER_PATTERNS, EntryNotFoundError, RepositoryNotFoundError, get_session

from ._commit_api import (
CommitOperation,
Expand Down Expand Up @@ -86,6 +86,7 @@
USERNAME_PLACEHOLDER = "hf_user"
_REGEX_DISCUSSION_URL = re.compile(r".*/discussions/(\d+)$")


logger = logging.get_logger(__name__)


Expand Down Expand Up @@ -2606,6 +2607,9 @@ def upload_folder(
will delete any remote file under `experiment/logs/`. Note that the `.gitattributes` file will not be deleted
even if it matches the patterns.
Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore`
file is not taken into account.
Uses `HfApi.create_commit` under the hood.
Args:
Expand Down Expand Up @@ -2720,6 +2724,13 @@ def upload_folder(
if path_in_repo is None:
path_in_repo = ""

# Do not upload .git folder
if ignore_patterns is None:
ignore_patterns = []
elif isinstance(ignore_patterns, str):
ignore_patterns = [ignore_patterns]
ignore_patterns += IGNORE_GIT_FOLDER_PATTERNS

commit_message = (
commit_message if commit_message is not None else f"Upload {path_in_repo} with huggingface_hub"
)
Expand Down
2 changes: 1 addition & 1 deletion src/huggingface_hub/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from ._headers import build_hf_headers, get_token_to_send
from ._hf_folder import HfFolder
from ._http import configure_http_backend, get_session, http_backoff
from ._paths import filter_repo_objects
from ._paths import filter_repo_objects, IGNORE_GIT_FOLDER_PATTERNS
from ._runtime import (
dump_environment_info,
get_fastai_version,
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/utils/_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

T = TypeVar("T")

IGNORE_GIT_FOLDER_PATTERNS = [".git", ".git/*", "*/.git", "**/.git/**"]


def filter_repo_objects(
items: Iterable[T],
Expand Down
38 changes: 38 additions & 0 deletions tests/test_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,44 @@ def test_path_in_repo_invalid(self) -> None:
CommitOperationDelete(path_in_repo=input)


class TestCommitOperationForbiddenPathInRepo(unittest.TestCase):
"""Commit operations must throw an error on files in the .git/ folder.
Server would error anyway so it's best to prevent early.
"""

INVALID_PATHS_IN_REPO = {
".git",
".git/path/to/file",
"./.git/path/to/file",
"subfolder/path/.git/to/file",
"./subfolder/path/.git/to/file",
}

VALID_PATHS_IN_REPO = {
".gitignore",
"path/to/.gitignore",
"path/to/something.git",
"path/to/something.git/more",
}

def test_cannot_update_file_in_git_folder(self):
for path in self.INVALID_PATHS_IN_REPO:
with self.subTest(msg=f"Add: '{path}'"):
with self.assertRaises(ValueError):
CommitOperationAdd(path_in_repo=path, path_or_fileobj=b"content")
with self.subTest(msg=f"Delete: '{path}'"):
with self.assertRaises(ValueError):
CommitOperationDelete(path_in_repo=path)

def test_valid_path_in_repo_containing_git(self):
for path in self.VALID_PATHS_IN_REPO:
with self.subTest(msg=f"Add: '{path}'"):
CommitOperationAdd(path_in_repo=path, path_or_fileobj=b"content")
with self.subTest(msg=f"Delete: '{path}'"):
CommitOperationDelete(path_in_repo=path)


class TestWarnOnOverwritingOperations(unittest.TestCase):
add_file_ab = CommitOperationAdd(path_in_repo="a/b.txt", path_or_fileobj=b"data")
add_file_abc = CommitOperationAdd(path_in_repo="a/b/c.md", path_or_fileobj=b"data")
Expand Down
Loading

0 comments on commit ce2789c

Please sign in to comment.