Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

👨‍💻 Configure HF Hub URL with environment variable #815

Merged
merged 8 commits into from
Apr 4, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_version() -> str:

install_requires = [
"filelock",
"requests",
"requests>=2.27",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What makes us require a very recent version of requests? This potentially creates dependency conflicts for users trying to install our library alongside other packages which potentially don't support latest requests versions. What's the oldest version we can work with here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minimal required version of requests is 2.27.0 because we use JSONDecodeError, and it was introduced only in 2.27.0: https://docs.python-requests.org/en/latest/community/updates/#id2

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When was 2.27.0 released? and do we know of libraries that pin an upper bound on the version of requests they use?

Copy link
Contributor Author

@SBrandeis SBrandeis Apr 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2.27.0 was released on 2022-03-01 2022-01-03

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

requests.JSONDecodeError was introduced by psf/requests#5856

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, in my opinion this is ok to ship this version pin, given that most libraries would probably not pin an upper bound on the version of requests they use. And we can always relax this constraint if we realize it's affecting users?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's merge this with this pinned version, I'll open a PR today to fix it and remove the constraint before we release a new version. The pinned version is way too recent. cc @LysandreJik

"tqdm",
"pyyaml",
"typing-extensions>=3.7.4.3", # to be able to import TypeAlias
Expand All @@ -27,11 +27,7 @@ def get_version() -> str:
"torch",
]

extras["tensorflow"] = [
"tensorflow",
"pydot",
"graphviz"
]
extras["tensorflow"] = ["tensorflow", "pydot", "graphviz"]

extras["testing"] = [
"pytest",
Expand Down
5 changes: 3 additions & 2 deletions src/huggingface_hub/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from urllib.parse import urlparse


# Possible values for env variables
Expand All @@ -24,10 +25,10 @@
os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES
)

ENDPOINT = (
ENDPOINT = os.getenv("HF_HUB_URL") or (
"https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co"
)

ENDPOINT_DOMAIN = urlparse(ENDPOINT).hostname or "huggingface.co"

HUGGINGFACE_CO_URL_TEMPLATE = ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"

Expand Down
10 changes: 6 additions & 4 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from .constants import (
ENDPOINT,
ENDPOINT_DOMAIN,
REPO_TYPES,
REPO_TYPES_MAPPING,
REPO_TYPES_URL_PREFIXES,
Expand Down Expand Up @@ -80,7 +81,7 @@ def _validate_repo_id_deprecation(repo_id, name, organization):
return name, organization


def repo_type_and_id_from_hf_id(hf_id: str):
def repo_type_and_id_from_hf_id(hf_id: str, _hf_co_domain: Optional[str] = None):
adrinjalali marked this conversation as resolved.
Show resolved Hide resolved
"""
Returns the repo type and ID from a huggingface.co URL linking to a
repository
Expand All @@ -95,15 +96,16 @@ def repo_type_and_id_from_hf_id(hf_id: str):
- <namespace>/<repo_id>
- <repo_id>
"""
is_hf_url = "huggingface.co" in hf_id and "@" not in hf_id
hf_co_domain = _hf_co_domain if _hf_co_domain is not None else ENDPOINT_DOMAIN
is_hf_url = hf_co_domain in hf_id and "@" not in hf_id
url_segments = hf_id.split("/")
is_hf_id = len(url_segments) <= 3

if is_hf_url:
namespace, repo_id = url_segments[-2:]
if namespace == "huggingface.co":
if namespace == hf_co_domain:
namespace = None
if len(url_segments) > 2 and "huggingface.co" not in url_segments[-3]:
if len(url_segments) > 2 and hf_co_domain not in url_segments[-3]:
repo_type = url_segments[-3]
else:
repo_type = None
Expand Down
14 changes: 10 additions & 4 deletions src/huggingface_hub/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,15 @@
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import urlparse

from tqdm.auto import tqdm

from huggingface_hub.constants import REPO_TYPES_URL_PREFIXES, REPOCARD_NAME
from huggingface_hub.constants import (
ENDPOINT_DOMAIN,
REPO_TYPES_URL_PREFIXES,
REPOCARD_NAME,
)
from huggingface_hub.repocard import metadata_load, metadata_save

from .hf_api import ENDPOINT, HfApi, HfFolder, repo_type_and_id_from_hf_id
Expand Down Expand Up @@ -633,7 +638,7 @@ def clone_from(self, repo_url: str, use_auth_token: Union[bool, str, None] = Non
)
api = HfApi()

if "huggingface.co" in repo_url or (
if ENDPOINT_DOMAIN in repo_url or (
"http" not in repo_url and len(repo_url.split("/")) <= 2
):
repo_type, namespace, repo_id = repo_type_and_id_from_hf_id(repo_url)
Expand All @@ -655,7 +660,8 @@ def clone_from(self, repo_url: str, use_auth_token: Union[bool, str, None] = Non
repo_id = f"{namespace}/{repo_id}"
repo_url += repo_id

repo_url = repo_url.replace("https://", f"https://user:{token}@")
scheme = urlparse(repo_url).scheme
repo_url = repo_url.replace(f"{scheme}://", f"{scheme}://user:{token}@")

if namespace == user or namespace in valid_organisations:
api.create_repo(
Expand All @@ -671,7 +677,7 @@ def clone_from(self, repo_url: str, use_auth_token: Union[bool, str, None] = Non
repo_url += repo_id

# For error messages, it's cleaner to show the repo url without the token.
clean_repo_url = re.sub(r"https://.*@", "https://", repo_url)
clean_repo_url = re.sub(r"(https?)://.*@", r"\1://", repo_url)
try:
subprocess.run(
"git lfs install".split(),
Expand Down
5 changes: 4 additions & 1 deletion tests/test_hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,4 +1277,7 @@ def test_repo_type_and_id_from_hf_id(self):
}

for key, value in possible_values.items():
self.assertEqual(repo_type_and_id_from_hf_id(key), tuple(value))
self.assertEqual(
repo_type_and_id_from_hf_id(key, _hf_co_domain="huggingface.co"),
tuple(value),
)