Skip to content

Commit

Permalink
add support for every + add tests + add doc
Browse files Browse the repository at this point in the history
  • Loading branch information
Wauplin committed Sep 5, 2023
1 parent 566a243 commit acc570b
Show file tree
Hide file tree
Showing 3 changed files with 367 additions and 137 deletions.
53 changes: 53 additions & 0 deletions docs/source/en/guides/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,59 @@ but before that, all previous logs on the repo on deleted. All of this in a sing
... )
```

## Upload from the CLI

You can also upload files to the Hub directly from your terminal using the `huggingface-cli upload` command. Internally
it uses the same [`upload_file`] and [`upload_folder`] helpers described above.

You can either upload a single file or an entire folder:

```bash
# Usage: huggingface-cli upload <repo_id> [local_path] [path_in_repo]
>>> huggingface-cli upload Wauplin/my-cool-model ./models/model.safetensors model.safetensors
https://huggingface.co/Wauplin/my-cool-model/blob/main/model.safetensors

>>> huggingface-cli upload Wauplin/my-cool-model ./models .
https://huggingface.co/Wauplin/my-cool-model/tree/main
```

`local_path` and `path_in_repo` are optional and can be implicitly inferred. By default, `local_path` will be set to
the current directory and `path_in_repo` will be set to the path of `local_path` relative to the current directory.
If `path_in_repo` cannot be inferred (for example, when `local_path` is an absolute path), an error is raised.

```bash
# Upload file (implicit path_in_repo)
huggingface-cli upload my-cool-model model.safetensors

# Upload directory (implicit path_in_repo)
huggingface-cli upload my-cool-model ./models

# Upload directory (implicit local_path, implicit path_in_repo)
huggingface-cli upload my-cool-model
```

By default, the token saved locally (using `huggingface-cli login`) will be used. If you want to authenticate explicitly,
use the `--token` option:

```bash
huggingface-cli upload my-cool-model --token=hf_****
```

When uploading a folder, you can use the `--include` and `--exclude` arguments to filter the files to upload. You can
also use `--delete` to delete existing files on the Hub.

```bash
# Sync local Space with Hub (upload new files except those under logs/, delete removed files)
huggingface-cli upload Wauplin/space-example --repo-type=space --exclude="/logs/*" --delete="*" --commit-message="Sync local Space with Hub"
```

Finally, you can also schedule a job that will upload your files regularly (see [scheduled uploads](#scheduled-uploads)).

```bash
# Upload new logs every 10 minutes
huggingface-cli upload training-model logs/ --every=10
```

## Advanced features

In most cases, you won't need more than [`upload_file`] and [`upload_folder`] to upload your files to the Hub.
Expand Down
223 changes: 149 additions & 74 deletions src/huggingface_hub/commands/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,61 @@
"""Contains command to upload a repo or file with the CLI.
Usage:
huggingface-cli upload repo_id
huggingface-cli upload repo_id [path] [path-in-repo]
# Upload file (implicit path in repo)
huggingface-cli upload my-cool-model ./my-cool-model.safetensors
# Upload file (explicit path in repo)
huggingface-cli upload my-cool-model ./my-cool-model.safetensors model.safetensors
# Upload directory (implicit paths)
huggingface-cli upload my-cool-model
# Upload directory (explicit local path, explicit path in repo)
huggingface-cli upload my-cool-model ./models/my-cool-model .
# Upload filtered directory (example: tensorboard logs except for the last run)
huggingface-cli upload my-cool-model ./model/training /logs --include "*.tfevents.*" --exclude "*20230905*"
# Upload private dataset
huggingface-cli upload Wauplin/my-cool-dataset ./data . --repo-type=dataset --private
# Upload with token
huggingface-cli upload Wauplin/my-cool-model --token=hf_****
# Sync local Space with Hub (upload new files, delete removed files)
huggingface-cli upload Wauplin/space-example --repo-type=space --exclude="/logs/*" --delete="*" --commit-message="Sync local Space with Hub"
# Schedule commits every 30 minutes
huggingface-cli upload Wauplin/my-cool-model --every=30
"""
import os
import time
import warnings
from argparse import Namespace, _SubParsersAction
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi
from huggingface_hub import logging
from huggingface_hub._commit_scheduler import CommitScheduler
from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.hf_api import create_repo, upload_file, upload_folder
from huggingface_hub.utils import disable_progress_bars, enable_progress_bars


class UploadCommand(BaseHuggingfaceCLICommand):
@staticmethod
def register_subcommand(parser: _SubParsersAction):
upload_parser = parser.add_parser(
"upload",
help="Upload a repo or a repo file to huggingface.co",
)
upload_parser = parser.add_parser("upload", help="Upload a file or a folder to a repo on the Hub")
upload_parser.add_argument(
"repo_id",
type=str,
help="The ID of the repo to upload to (e.g. `username/repo-name`).",
"repo_id", type=str, help="The ID of the repo to upload to (e.g. `username/repo-name`)."
)
upload_parser.add_argument(
"path",
nargs="?",
help="Local path. (optional)",
"local_path", nargs="?", help="Local path to the file or folder to upload. Defaults to current directory."
)
upload_parser.add_argument(
"path_in_repo",
nargs="?",
help="Path in repo. (optional)",
help="Path of the file or folder in the repo. Defaults to the relative path of the file or folder.",
)
upload_parser.add_argument(
"--repo-type",
Expand All @@ -59,128 +80,182 @@ def register_subcommand(parser: _SubParsersAction):
upload_parser.add_argument(
"--revision",
type=str,
help="The revision of the repo to upload.",
help="An optional Git revision id which can be a branch name, a tag, or a commit hash.",
)
upload_parser.add_argument(
"--include",
nargs="+",
type=str,
help="Glob patterns to match files to upload.",
"--private",
action="store_true",
help=(
"Whether to create a private repo if repo doesn't exist on the Hub. Ignored if the repo already"
" exists."
),
)
upload_parser.add_argument("--include", nargs="*", type=str, help="Glob patterns to match files to upload.")
upload_parser.add_argument(
"--exclude",
nargs="+",
type=str,
help="Glob patterns to exclude from files to upload.",
"--exclude", nargs="*", type=str, help="Glob patterns to exclude from files to upload."
)
upload_parser.add_argument(
"--delete",
nargs="+",
nargs="*",
type=str,
help="Glob patterns for file to be deleted from the repo while committing.",
)
upload_parser.add_argument(
"--commit-message",
type=str,
help="The summary / title / first line of the generated commit.",
)
upload_parser.add_argument(
"--commit-description",
type=str,
help="The description of the generated commit.",
"--commit-message", type=str, help="The summary / title / first line of the generated commit."
)
upload_parser.add_argument("--commit-description", type=str, help="The description of the generated commit.")
upload_parser.add_argument(
"--create-pr",
action="store_true",
help="Whether to create a PR.",
"--create-pr", action="store_true", help="Whether to upload content as a new Pull Request."
)
upload_parser.add_argument(
"--every",
action="store_true",
help="Run a CommitScheduler instead of a single commit.",
type=float,
help="If set, a background job is scheduled to create commits every `every` minutes.",
)
upload_parser.add_argument(
"--token",
type=str,
help="A User Access Token generated from https://huggingface.co/settings/tokens",
"--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens"
)
upload_parser.add_argument(
"--quiet",
action="store_true",
help="If True, progress bars are disabled and only the path to the uploaded files is printed.",
)
upload_parser.add_argument("--verbose", action="store_true", help="If True, more logs are printed.")
upload_parser.set_defaults(func=UploadCommand)

def __init__(self, args: Namespace) -> None:
self.api = HfApi(token=args.token)
self.repo_id: str = args.repo_id
self.path: str = args.path
self.path_in_repo: str = args.path_in_repo
self.repo_type: Optional[str] = args.repo_type
self.revision: Optional[str] = args.revision
self.include: List[str] = args.include
self.exclude: List[str] = args.exclude
self.delete: List[str] = args.delete
self.private: bool = args.private

self.include: Optional[List[str]] = args.include
self.exclude: Optional[List[str]] = args.exclude
self.delete: Optional[List[str]] = args.delete

self.commit_message: Optional[str] = args.commit_message
self.commit_description: Optional[str] = args.commit_description
self.create_pr: bool = args.create_pr
self.every: bool = args.every
self.token: Optional[str] = args.token
self.quiet: bool = args.quiet

# Quiet/verbose mode
self.quiet: bool = args.quiet # disable warnings and progress bars
self.verbose: bool = args.verbose # set verbosity to INFO
if self.quiet and self.verbose:
raise ValueError("Cannot set both `--quiet` and `--verbose`.")

# Possibly implicit `path` and `path_in_repo`
self.local_path: str = args.local_path if args.local_path is not None else "."
self.path_in_repo: str
if args.path_in_repo is not None:
self.path_in_repo = args.path_in_repo
else: # Implicit path_in_repo => relative to current directory
try:
self.path_in_repo = Path(self.local_path).relative_to(".").as_posix()
except ValueError as e:
raise ValueError(
"Cannot determine `path_in_repo` implicitly. Please set `--path-in-repo=...` and retry."
) from e

if args.every is not None and args.every <= 0:
raise ValueError(f"`every` must be a positive value (got '{args.every}')")
self.every: Optional[float] = args.every

def run(self) -> None:
if self.quiet:
disable_progress_bars()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
print(self._upload()) # Print path to uploaded files
print(self._upload())
enable_progress_bars()
else:
print(self._upload()) # Print path to uploaded files
if self.verbose:
logging.set_verbosity_info()
print(self._upload())

def _upload(self) -> str:
self.path = "." if self.path is None else self.path
if os.path.isfile(self.local_path):
if self.include is not None and len(self.include) > 0:
warnings.warn("Ignoring `--include` since a single file is uploaded.")
if self.exclude is not None and len(self.exclude) > 0:
warnings.warn("Ignoring `--exclude` since a single file is uploaded.")
if self.delete is not None and len(self.delete) > 0:
warnings.warn("Ignoring `--delete` since a single file is uploaded.")

self.path_in_repo = (
self.path_in_repo
if self.path_in_repo
else (os.path.relpath(self.path).replace("\\", "/") if self.path != "." else "/")
)
# Schedule commits if `every` is set
if self.every is not None:
if os.path.isfile(self.local_path):
# If file => watch entire folder + use allow_patterns
folder_path = os.path.dirname(self.local_path)
path_in_repo = (
self.path_in_repo[: -len(self.local_path)] # remove filename from path_in_repo
if self.path_in_repo.endswith(self.local_path)
else self.path_in_repo
)
allow_patterns = [self.local_path]
ignore_patterns = []
else:
folder_path = self.local_path
path_in_repo = self.path_in_repo
allow_patterns = self.include or []
ignore_patterns = self.exclude or []
if self.delete is not None and len(self.delete) > 0:
warnings.warn("Ignoring `--delete` when uploading with scheduled commits.")

# File or Folder based uploading
if os.path.isfile(self.path):
if self.include or self.exclude or self.delete:
raise ValueError("--include / --exclude / --delete cannot be used with a file path.")

return self.api.upload_file(
path_or_fileobj=self.path,
path_in_repo=self.path_in_repo,
scheduler = CommitScheduler(
folder_path=folder_path,
repo_id=self.repo_id,
repo_type=self.repo_type,
revision=self.revision,
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns,
path_in_repo=path_in_repo,
private=self.private,
every=self.every,
token=self.token,
)
print(f"Scheduling commits every {self.every} minutes to {scheduler.repo_id}.")
try: # Block main thread until KeyboardInterrupt
while True:
time.sleep(100)
except KeyboardInterrupt:
scheduler.stop()
return "Stopped scheduled commits."

# Otherwise, create repo and proceed with the upload
if not os.path.isfile(self.local_path) and not os.path.isdir(self.local_path):
raise FileNotFoundError(f"No such file or directory: '{self.local_path}'.")
repo_id = create_repo(
repo_id=self.repo_id, repo_type=self.repo_type, exist_ok=True, private=self.private, token=self.token
).repo_id

# File-based upload
if os.path.isfile(self.local_path):
return upload_file(
path_or_fileobj=self.local_path,
path_in_repo=self.path_in_repo,
repo_id=repo_id,
repo_type=self.repo_type,
revision=self.revision,
token=self.token,
commit_message=self.commit_message,
commit_description=self.commit_description,
create_pr=self.create_pr,
run_as_future=self.every,
)

elif os.path.isdir(self.path):
return self.api.upload_folder(
folder_path=self.path,
# Folder-based upload
else:
return upload_folder(
folder_path=self.local_path,
path_in_repo=self.path_in_repo,
repo_id=self.repo_id,
token=self.token,
repo_id=repo_id,
repo_type=self.repo_type,
revision=self.revision,
token=self.token,
commit_message=self.commit_message,
commit_description=self.commit_description,
create_pr=self.create_pr,
allow_patterns=self.include,
ignore_patterns=self.exclude,
delete_patterns=self.delete,
run_as_future=self.every,
)

else:
raise ValueError(f"Provided PATH: {self.path} does not exist.")
Loading

0 comments on commit acc570b

Please sign in to comment.