Commit e42a937

Merge branch 'main' into 1352-create-chunked-commits

Wauplin committed Mar 27, 2023
2 parents 9b696a8 + ce2789c commit e42a937

Showing 33 changed files with 1,094 additions and 1,212 deletions.
3 changes: 3 additions & 0 deletions docs/source/guides/upload.mdx
@@ -79,6 +79,9 @@ Use the `allow_patterns` and `ignore_patterns` arguments to specify which files
Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm).
If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded.

Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` file is not taken into account.
This means you must use `allow_patterns` and `ignore_patterns` to specify which files to upload instead.

```py
>>> api.upload_folder(
... folder_path="/path/to/local/folder",
...
```
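
For illustration only (not part of this diff), a minimal sketch of combining both filters; the folder path and `repo_id` are placeholders:

```py
>>> api.upload_folder(
...     folder_path="/path/to/local/folder",
...     repo_id="username/test-dataset",
...     repo_type="dataset",
...     allow_patterns="**/*.txt",       # upload text files only...
...     ignore_patterns="**/logs/*.txt", # ...except those under a logs/ folder
... )
```
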
18 changes: 18 additions & 0 deletions docs/source/package_reference/utilities.mdx
@@ -49,6 +49,24 @@ Using these shouldn't be necessary if you use `huggingface_hub` and you don't mo

[[autodoc]] logging.get_logger

## Configure HTTP backend

In some environments, you might want to configure how HTTP calls are made, for example if you are using a proxy.
`huggingface_hub` lets you configure this globally using [`configure_http_backend`]. All requests made to the Hub will
then use your settings. Under the hood, `huggingface_hub` uses `requests.Session`, so you might want to refer to the
[`requests` documentation](https://requests.readthedocs.io/en/latest/user/advanced) to learn more about the parameters
available.

Since `requests.Session` is not guaranteed to be thread-safe, `huggingface_hub` creates one session instance per thread.
Using sessions allows us to keep the connection open between HTTP calls and ultimately save time. If you are
integrating `huggingface_hub` in a third-party library and want to make a custom call to the Hub, use [`get_session`]
to get a Session configured by your users (i.e. replace any `requests.get(...)` call with `get_session().get(...)`).
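
As an illustration (not part of this diff), a minimal sketch of the pattern described above; the proxy addresses are placeholders:

```py
import requests

from huggingface_hub import configure_http_backend, get_session


# Factory returning a `requests.Session` with custom settings (here, a proxy).
# It is called once per thread, since sessions are not shared across threads.
def backend_factory() -> requests.Session:
    session = requests.Session()
    session.proxies = {"http": "http://10.10.1.10:3128", "https": "https://10.10.1.11:1080"}
    return session


# Register the factory globally: subsequent calls to the Hub will use it.
configure_http_backend(backend_factory=backend_factory)

# In a third-party library, reuse the user-configured session instead of
# calling `requests.get(...)` directly.
session = get_session()  # e.g. session.get("https://huggingface.co/api/models")
```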

[[autodoc]] configure_http_backend

[[autodoc]] get_session


## Handle HTTP errors

`huggingface_hub` defines its own HTTP errors to refine the `HTTPError` raised by
2 changes: 0 additions & 2 deletions src/huggingface_hub/README.md
@@ -78,8 +78,6 @@ programmatic way of creating a repo, deleting it (`⚠️ caution`), pushing a
single file to a repo or listing models from the Hub, you'll find helpers in
`hf_api.py`. Some example functionality available with the `HfApi` class:

* `set_access_token()`
* `unset_access_token()`
* `whoami()`
* `create_repo()`
* `list_repo_files()`
10 changes: 5 additions & 5 deletions src/huggingface_hub/__init__.py
@@ -46,7 +46,7 @@
from typing import TYPE_CHECKING


__version__ = "0.13.0.dev0"
__version__ = "0.14.0.dev0"

# Alphabetical order of definitions is ensured in tests
# WARNING: any comment added in this dictionary definition will be lost when
@@ -160,10 +160,8 @@
"repo_type_and_id_from_hf_id",
"request_space_hardware",
"restart_space",
"set_access_token",
"space_info",
"unlike",
"unset_access_token",
"update_repo_visibility",
"upload_file",
"upload_folder",
@@ -212,7 +210,9 @@
"HFCacheInfo",
"HfFolder",
"cached_assets_path",
"configure_http_backend",
"dump_environment_info",
"get_session",
"logging",
"scan_cache_dir",
],
@@ -424,10 +424,8 @@ def __dir__():
repo_type_and_id_from_hf_id, # noqa: F401
request_space_hardware, # noqa: F401
restart_space, # noqa: F401
set_access_token, # noqa: F401
space_info, # noqa: F401
unlike, # noqa: F401
unset_access_token, # noqa: F401
update_repo_visibility, # noqa: F401
upload_file, # noqa: F401
upload_folder, # noqa: F401
@@ -472,7 +470,9 @@ def __dir__():
HFCacheInfo, # noqa: F401
HfFolder, # noqa: F401
cached_assets_path, # noqa: F401
configure_http_backend, # noqa: F401
dump_environment_info, # noqa: F401
get_session, # noqa: F401
logging, # noqa: F401
scan_cache_dir, # noqa: F401
)
91 changes: 26 additions & 65 deletions src/huggingface_hub/_commit_api.py
@@ -11,11 +11,12 @@
from pathlib import Path, PurePosixPath
from typing import Any, BinaryIO, Dict, Iterable, Iterator, List, Optional, Union

import requests
from tqdm.contrib.concurrent import thread_map

from .constants import ENDPOINT
from .lfs import UploadInfo, _validate_batch_actions, lfs_upload, post_lfs_batch_info
from huggingface_hub import get_session

from .constants import ENDPOINT, HF_HUB_ENABLE_HF_TRANSFER
from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
from .utils import (
build_hf_headers,
chunk_iterable,
@@ -25,7 +26,6 @@
validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
from .utils._deprecation import _deprecate_method
from .utils._typing import Literal


@@ -130,14 +130,6 @@ def __post_init__(self) -> None:
else:
self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)

@_deprecate_method(version="0.14", message="Operation is validated at initialization.")
def validate(self) -> None:
pass

@_deprecate_method(version="0.14", message="Use `upload_info` property instead.")
def _upload_info(self) -> UploadInfo:
return self.upload_info

@contextmanager
def as_file(self, with_tqdm: bool = False) -> Iterator[BinaryIO]:
"""
@@ -206,6 +198,11 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
if path_in_repo.startswith("./"):
path_in_repo = path_in_repo[2:]
if any(part == ".git" for part in path_in_repo.split("/")):
raise ValueError(
"Invalid `path_in_repo` in CommitOperation: cannot update files under a '.git/' folder (path:"
f" '{path_in_repo}')."
)
return path_in_repo


@@ -338,64 +335,28 @@ def upload_lfs_files(
return

# Step 3: upload files concurrently according to these instructions
def _inner_upload_lfs_object(batch_action):
def _wrapped_lfs_upload(batch_action) -> None:
try:
operation = oid2addop[batch_action["oid"]]
return _upload_lfs_object(operation=operation, lfs_batch_action=batch_action, token=token)
lfs_upload(operation=operation, lfs_batch_action=batch_action, token=token)
except Exception as exc:
raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc

logger.debug(
f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
)
thread_map(
_inner_upload_lfs_object,
filtered_actions,
desc=f"Upload {len(filtered_actions)} LFS files",
max_workers=num_threads,
tqdm_class=hf_tqdm,
)


def _upload_lfs_object(operation: CommitOperationAdd, lfs_batch_action: dict, token: Optional[str]):
"""
Handles uploading a given object to the Hub with the LFS protocol.
Defers to [`~utils.lfs.lfs_upload`] for the actual upload logic.
Can be a No-op if the content of the file is already present on the hub
large file storage.
Args:
operation (`CommitOperationAdd`):
The add operation triggering this upload
lfs_batch_action (`dict`):
Upload instructions from the LFS batch endpoint for this object.
See [`~utils.lfs.post_lfs_batch_info`] for more details.
token (`str`, *optional*):
A [user access token](https://hf.co/settings/tokens) to authenticate requests against the Hub
Raises: `ValueError` if `lfs_batch_action` is improperly formatted
"""
_validate_batch_actions(lfs_batch_action)
upload_info = operation.upload_info
actions = lfs_batch_action.get("actions")
if actions is None:
# The file was already uploaded
logger.debug(f"Content of file {operation.path_in_repo} is already present upstream - skipping upload")
return
upload_action = lfs_batch_action["actions"].get("upload")
verify_action = lfs_batch_action["actions"].get("verify")
with operation.as_file(with_tqdm=True) as fileobj:
logger.debug(f"Uploading {operation.path_in_repo} as LFS file...")
lfs_upload(
fileobj=fileobj,
upload_action=upload_action,
verify_action=verify_action,
upload_info=upload_info,
token=token,
if HF_HUB_ENABLE_HF_TRANSFER:
logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
for action in filtered_actions:
_wrapped_lfs_upload(action)
else:
logger.debug(
f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
)
thread_map(
_wrapped_lfs_upload,
filtered_actions,
desc=f"Upload {len(filtered_actions)} LFS files",
max_workers=num_threads,
tqdm_class=hf_tqdm,
)
logger.debug(f"{operation.path_in_repo}: Upload successful")


def _validate_preupload_info(preupload_info: dict):
@@ -468,7 +429,7 @@ def fetch_upload_modes(
]
}

resp = requests.post(
resp = get_session().post(
f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
json=payload,
headers=headers,
8 changes: 1 addition & 7 deletions src/huggingface_hub/_login.py
@@ -15,7 +15,7 @@
import os
import subprocess
from getpass import getpass
from typing import List, Optional
from typing import Optional

from .commands._cli_utils import ANSI
from .commands.delete_cache import _ask_for_confirmation_no_tui
@@ -30,7 +30,6 @@
set_git_credential,
unset_git_credential,
)
from .utils._deprecation import _deprecate_method


logger = logging.get_logger(__name__)
@@ -295,8 +294,3 @@ def _set_store_as_git_credential_helper_globally() -> None:
run_subprocess("git config --global credential.helper store")
except subprocess.CalledProcessError as exc:
raise EnvironmentError(exc.stderr)


@_deprecate_method(version="0.14", message="Please use `list_credential_helpers` instead.")
def _currently_setup_credential_helpers(directory: Optional[str] = None) -> List[str]:
return list_credential_helpers(directory)
8 changes: 3 additions & 5 deletions src/huggingface_hub/commands/lfs.py
@@ -23,12 +23,10 @@
from argparse import _SubParsersAction
from typing import Dict, List, Optional

import requests

from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND, SliceFileObj

from ..utils import hf_raise_for_status, logging
from ..utils import get_session, hf_raise_for_status, logging


logger = logging.get_logger(__name__)
@@ -172,7 +170,7 @@ def run(self):
seek_from=i * chunk_size,
read_limit=chunk_size,
) as data:
r = requests.put(presigned_url, data=data)
r = get_session().put(presigned_url, data=data)
hf_raise_for_status(r)
parts.append(
{
@@ -192,7 +190,7 @@
)
# Not precise but that's ok.

r = requests.post(
r = get_session().post(
completion_url,
json={
"oid": oid,
3 changes: 0 additions & 3 deletions src/huggingface_hub/commands/user.py
@@ -33,9 +33,6 @@
logout,
notebook_login,
)
from .._login import (
_currently_setup_credential_helpers as currently_setup_credential_helpers, # noqa: F401 # for backward compatibility
)
from ..utils import HfFolder
from ._cli_utils import ANSI

18 changes: 13 additions & 5 deletions src/huggingface_hub/file_download.py
@@ -477,8 +477,7 @@ def http_get(
if HF_HUB_ENABLE_HF_TRANSFER:
try:
# Download file using an external Rust-based package. Download is faster
# (~2x speed-up) but support less features (no error handling, no retries,
# no progress bars).
# (~2x speed-up) but supports fewer features (no progress bars).
from hf_transfer import download

logger.debug(f"Download {url} using HF_TRANSFER.")
@@ -539,6 +538,15 @@ def http_get(
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)

if total is not None and total != temp_file.tell():
raise EnvironmentError(
f"Consistency check failed: file should be of size {total} but has size"
f" {temp_file.tell()} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
" pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
" know by opening an issue on https://github.com/huggingface/huggingface_hub."
)

progress.close()


@@ -670,7 +678,7 @@ def cached_download(
timeout=etag_timeout,
)
hf_raise_for_status(r)
etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
etag = r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")
# We favor a custom header indicating the etag of the linked resource, and
# we fallback to the regular etag header.
# If we don't have any of those, raise an error.
@@ -1506,8 +1514,8 @@ def get_hf_file_metadata(
etag=_normalize_etag(
# We favor a custom header indicating the etag of the linked resource, and
# we fallback to the regular etag header.
r.headers.get("ETag")
or r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG)
r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG)
or r.headers.get("ETag")
),
# Either from response headers (if redirected) or defaults to request url
# Do not use directly `url`, as `_request_wrapper` might have followed relative