[DONE] aistudio, hf hub, bos update download (#7608)
* try fix

* fix hf download bug ...

* update config download bug

* fix

* add subfolder

* update

* priority: local first, then builtin, then aistudio, then hf hub, then bos (see the sketch after the changed-files summary below)

* update the chat template file lookup path

* update

* fix subfolder && add tests

* fix

* update

* fix tokenizer_config_file_dir_list

* subfolder test

* fix from_pretrained() load hf sharded model

* update logic

* update use_safetensors

* update

* fix resolve_weight_file_from_hf_hub

* update the legacy bos download method

* update download from hf hub

* update logging

* update

* disable proxy

* update

* update

* fix image process

---------

Co-authored-by: CrazyBoyM <[email protected]>
Co-authored-by: Ke Bai <[email protected]>
3 people authored Jan 4, 2024
1 parent 7d789a8 commit f93e7da
Showing 22 changed files with 1,916 additions and 302 deletions.
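
The priority bullet in the commit message is the thread running through the diffs below. Here is a minimal sketch of that lookup order, not the actual PaddleNLP implementation: builtin_models and the three downloader callables are illustrative stand-ins for helpers that live across several modules.

import os
from typing import Callable, Dict

def resolve_file(
    name_or_path: str,
    filename: str,
    builtin_models: Dict[str, str],
    aistudio_dl: Callable[[str, str], str],
    hf_dl: Callable[[str, str], str],
    bos_dl: Callable[[str, str], str],
    from_aistudio: bool = False,
    from_hf_hub: bool = False,
) -> str:
    # 1. local: the caller passed a directory that already holds the file
    candidate = os.path.join(name_or_path, filename)
    if os.path.isfile(candidate):
        return candidate
    # 2. builtin: model names bundled with the library
    if name_or_path in builtin_models:
        return builtin_models[name_or_path]
    # 3. AI Studio, when explicitly requested
    if from_aistudio:
        return aistudio_dl(name_or_path, filename)
    # 4. HF hub, when explicitly requested
    if from_hf_hub:
        return hf_dl(name_or_path, filename)
    # 5. otherwise fall back to the BOS community mirror
    return bos_dl(name_or_path, filename)

Passing the downloaders in keeps the sketch self-contained; the real code wires these branches directly into each from_pretrained, as the first diff shows.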
34 changes: 24 additions & 10 deletions paddlenlp/generation/configuration_utils.py
@@ -27,6 +27,7 @@
 from paddlenlp.transformers.utils import resolve_cache_dir
 from paddlenlp.utils.log import logger

+from ..transformers.aistudio_utils import aistudio_download
 from ..utils import GENERATION_CONFIG_NAME
 from ..utils.downloader import (
     COMMUNITY_MODEL_PREFIX,
@@ -336,6 +337,7 @@ def from_pretrained(
         cls,
         pretrained_model_name_or_path: Union[str, os.PathLike],
         from_hf_hub: bool = False,
+        from_aistudio: bool = False,
         config_file_name: Optional[Union[str, os.PathLike]] = None,
         cache_dir: Optional[Union[str, os.PathLike]] = None,
         force_download: bool = False,
@@ -404,12 +406,11 @@ def from_pretrained(
         ```"""
         config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME

-        subfolder = kwargs.pop("subfolder", None)
-
-        config_path = os.path.join(pretrained_model_name_or_path, config_file_name)
-        config_path = str(config_path)
+        subfolder = kwargs.pop("subfolder", "")
+        if subfolder is None:
+            subfolder = ""

-        cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)

         # 1. get the configuration file from local file, eg: /cache/path/model_config.json
         if os.path.isfile(pretrained_model_name_or_path):
@@ -418,24 +419,37 @@ def from_pretrained(
         # 2. get the configuration file from url, eg: https://ip/path/to/model_config.json
         elif is_url(pretrained_model_name_or_path):
             resolved_config_file = get_path_from_url_with_filelock(
-                pretrained_model_name_or_path, cache_dir, check_exist=not force_download
+                pretrained_model_name_or_path,
+                cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder),
+                check_exist=not force_download,
             )
         # 3. get the configuration file from local dir with default name, eg: /local/path
         elif os.path.isdir(pretrained_model_name_or_path):
-            configuration_file = os.path.join(pretrained_model_name_or_path, GENERATION_CONFIG_NAME)
+            configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, config_file_name)
             if os.path.exists(configuration_file):
                 resolved_config_file = configuration_file
             else:
                 # try to detect old-school config file
                 raise FileNotFoundError("please make sure there is `generation_config.json` under the dir")

-        # 4. get the configuration file from HF hub
+        # 4. get the configuration file from aistudio
+        elif from_aistudio:
+            resolved_config_file = aistudio_download(
+                repo_id=pretrained_model_name_or_path,
+                filename=config_file_name,
+                cache_dir=cache_dir,
+                subfolder=subfolder,
+            )
+        # 5. get the configuration file from HF hub
         elif from_hf_hub:
             resolved_config_file = resolve_hf_generation_config_path(
                 repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
             )
         else:
-            community_url = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, GENERATION_CONFIG_NAME])
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, config_file_name]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+            community_url = "/".join(url_list)
             if url_file_exists(community_url):
                 resolved_config_file = get_path_from_url_with_filelock(
                     community_url, cache_dir, check_exist=not force_download
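
A usage sketch for the updated method above; it assumes GenerationConfig is importable from paddlenlp.generation, and the repo id is a placeholder.

from paddlenlp.generation import GenerationConfig

# Branch 4 in the rewritten method: fetch generation_config.json from AI Studio.
config = GenerationConfig.from_pretrained(
    "someuser/somemodel",   # placeholder repo id
    from_aistudio=True,
    subfolder="",           # new kwarg; joined into cache paths and download URLs
)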
1 change: 1 addition & 0 deletions paddlenlp/transformers/__init__.py
@@ -209,6 +209,7 @@
 from .auto.modeling import *
 from .auto.tokenizer import *
 from .auto.processing import *
+from .auto.image_processing import *
 from .auto.configuration import *
 from .codegen.modeling import *
 from .codegen.tokenizer import *
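
The new wildcard import suggests the auto image-processing classes are now re-exported at the package root. A sketch, assuming the module exposes an AutoImageProcessor class (a name inferred from the module path, not confirmed by this diff):

from paddlenlp.transformers import AutoImageProcessor

# Placeholder repo id; resolved through the same download fallbacks as AutoConfig.
processor = AutoImageProcessor.from_pretrained("someuser/somemodel")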
27 changes: 25 additions & 2 deletions paddlenlp/transformers/aistudio_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Optional
+
 from aistudio_sdk.hub import download

@@ -23,11 +25,32 @@ class EntryNotFoundError(Exception):
     pass


-def aistudio_download(repo_id: str, filename: str):
-    # TODO: add arguments such as cache_dir, revision, etc.
+def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -> str:
+    if subfolder is not None and subfolder != "":
+        weights_name = "/".join([subfolder, weights_name])
+    return weights_name
+
+
+def aistudio_download(
+    repo_id: str,
+    filename: str = None,
+    cache_dir: Optional[str] = None,
+    subfolder: Optional[str] = "",
+    revision: Optional[str] = None,
+    **kwargs,
+):
+    if revision is None:
+        revision = "master"
+    filename = _add_subfolder(filename, subfolder)
+    download_kwargs = {}
+    if revision is not None:
+        download_kwargs["revision"] = revision
+    if cache_dir is not None:
+        download_kwargs["cache_dir"] = cache_dir
     res = download(
         repo_id=repo_id,
         filename=filename,
+        **download_kwargs,
     )
     if "path" in res:
         return res["path"]
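
Calling the extended helper might look like the following; the repo id and filename are placeholders, and per the code above cache_dir and revision are only forwarded to aistudio_sdk when set.

from paddlenlp.transformers.aistudio_utils import aistudio_download

path = aistudio_download(
    repo_id="someuser/somemodel",   # placeholder AI Studio repo
    filename="config.json",
    subfolder="fp16",               # prepended by _add_subfolder -> "fp16/config.json"
    revision=None,                  # defaults to "master" inside the helper
)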
34 changes: 19 additions & 15 deletions paddlenlp/transformers/auto/configuration.py
@@ -159,12 +159,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs):
             config = AutoConfig.from_pretrained("bert-base-uncased")
             config.save_pretrained('./bert-base-uncased')
         """
-        subfolder = kwargs.get("subfolder", None)
-        from_aistudio = kwargs.get("from_aistudio", False)
-        from_hf_hub = kwargs.get("from_hf_hub", False)
-        cache_dir = resolve_cache_dir(
-            pretrained_model_name_or_path, from_hf_hub=from_hf_hub, cache_dir=kwargs.pop("cache_dir", None)
-        )
+        subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+        from_aistudio = kwargs.pop("from_aistudio", False)
+        from_hf_hub = kwargs.pop("from_hf_hub", False)
+        cache_dir = kwargs.pop("cache_dir", None)
+        cache_dir = resolve_cache_dir(from_hf_hub=from_hf_hub, from_aistudio=from_aistudio, cache_dir=cache_dir)

         if not cls.name2class:
             cls.name2class = {}
@@ -182,10 +183,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs):

         # From local dir path
         elif os.path.isdir(pretrained_model_name_or_path):
-            config_file = os.path.join(pretrained_model_name_or_path, cls.config_file)
+            config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_file)
             if not os.path.exists(config_file):
                 # try to load legacy config file
-                legacy_config_file = os.path.join(pretrained_model_name_or_path, cls.legacy_config_file)
+                legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_config_file)
                 if not os.path.exists(legacy_config_file):
                     raise ValueError(
                         f"config file<{cls.config_file}> or legacy config file<{cls.legacy_config_file}> not found"
@@ -203,6 +204,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs):
             file = aistudio_download(
                 repo_id=pretrained_model_name_or_path,
                 filename=cls.config_file,
+                subfolder=subfolder,
+                cache_dir=cache_dir,
             )
             return cls.from_pretrained(os.path.dirname(file))
         elif from_hf_hub:
@@ -219,15 +222,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs):

         # Assuming from community-contributed pretrained models
         else:
-            # support subfolder
-            if subfolder is not None:
-                pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder)
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file]
+            legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+                legacy_url_list.insert(2, subfolder)
+            community_config_path = "/".join(url_list)
+            legacy_community_config_path = "/".join(legacy_url_list)

-            community_config_path = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file])
             if not url_file_exists(community_config_path):
-                legacy_community_config_path = "/".join(
-                    [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
-                )
                 if not url_file_exists(legacy_community_config_path):
                     raise RuntimeError(
                         f"Can't load Config for '{pretrained_model_name_or_path}'.\n"
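
With the rewrite above, a community-hosted config inside a subfolder resolves via COMMUNITY_MODEL_PREFIX/<model name>/<subfolder>/<config file>, and the download cache moves under cache_dir/<model name>/<subfolder>. A sketch with placeholder names:

from paddlenlp.transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "someuser/somemodel",   # placeholder community model
    subfolder="fp16",       # inserted into the URL between the model name and the file name
)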
(Diffs for the remaining 18 changed files are not shown here.)
