Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX - bug download SUSY dataset #34

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

Badr-MOUFAD
Copy link
Collaborator

closes #33

@mathurinm
Copy link
Owner

There seems to be an error in the .xz file that prevents this from working (see also #33 (comment))

In [3]: libsvmdata.fetch_libsvm("SUSY", verbose=True)
Dataset: SUSY
Downloading data from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/SUSY.xz (585.8 MB)

file_sizes: 100%|████████████████████████████| 614M/614M [01:30<00:00, 6.82MB/s]
Successfully downloaded file to /home/mathurin/data/libsvm/binary/SUSY.xz
Loading svmlight file...
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [3], line 1
----> 1 libsvmdata.fetch_libsvm("SUSY", verbose=True)

File ~/workspace/libsvmdata/libsvmdata/datasets.py:440, in fetch_libsvm(dataset, replace, normalize, min_nnz, verbose)
    438 if verbose:
    439     print("Dataset: %s" % dataset)
--> 440 X, y = _get_X_y(dataset, multilabel, replace=replace, verbose=verbose)
    442 # removing columns with to few non zero entries when using sparse X
    443 if sparse.issparse(X) and min_nnz != 0:

File ~/workspace/libsvmdata/libsvmdata/datasets.py:353, in _get_X_y(dataset, multilabel, replace, verbose)
    351     print("Loading svmlight file...")
    352 with open(tmp_path, 'rb') as f:
--> 353     X, y = load_svmlight_file(
    354         f, n_features=n_features_total, multilabel=multilabel)
    356 tmp_path.unlink()
    357 # if X's density is more than 0.5, store it in dense format:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:172, in load_svmlight_file(f, n_features, dtype, multilabel, zero_based, query_id, offset, length)
     45 def load_svmlight_file(
     46     f,
     47     *,
   (...)
     54     length=-1,
     55 ):
     56     """Load datasets in the svmlight / libsvm format into sparse CSR matrix.
     57 
     58     This format is a text-based format, with one sample per line. It does
   (...)
    169         X, y = get_data()
    170     """
    171     return tuple(
--> 172         load_svmlight_files(
    173             [f],
    174             n_features=n_features,
    175             dtype=dtype,
    176             multilabel=multilabel,
    177             zero_based=zero_based,
    178             query_id=query_id,
    179             offset=offset,
    180             length=length,
    181         )
    182     )

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:334, in load_svmlight_files(files, n_features, dtype, multilabel, zero_based, query_id, offset, length)
    331 if (offset != 0 or length > 0) and n_features is None:
    332     raise ValueError("n_features is required when offset or length is specified.")
--> 334 r = [
    335     _open_and_load(
    336         f,
    337         dtype,
    338         multilabel,
    339         bool(zero_based),
    340         bool(query_id),
    341         offset=offset,
    342         length=length,
    343     )
    344     for f in files
    345 ]
    347 if (
    348     zero_based is False
    349     or zero_based == "auto"
    350     and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
    351 ):
    352     for _, indices, _, _, _ in r:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:335, in <listcomp>(.0)
    331 if (offset != 0 or length > 0) and n_features is None:
    332     raise ValueError("n_features is required when offset or length is specified.")
    334 r = [
--> 335     _open_and_load(
    336         f,
    337         dtype,
    338         multilabel,
    339         bool(zero_based),
    340         bool(query_id),
    341         offset=offset,
    342         length=length,
    343     )
    344     for f in files
    345 ]
    347 if (
    348     zero_based is False
    349     or zero_based == "auto"
    350     and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
    351 ):
    352     for _, indices, _, _, _ in r:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:208, in _open_and_load(f, dtype, multilabel, zero_based, query_id, offset, length)
    206 def _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1):
    207     if hasattr(f, "read"):
--> 208         actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file(
    209             f, dtype, multilabel, zero_based, query_id, offset, length
    210         )
    211     else:
    212         with closing(_gen_open(f)) as f:

File sklearn/datasets/_svmlight_format_fast.pyx:76, in sklearn.datasets._svmlight_format_fast._load_svmlight_file()

ValueError: could not convert string to float: b'\xfd7zXZ\x00\x00\x04\xe6\xd6\xb4F\x02\x00!\x01\x16\x00\x00\x00t/\xe5\xa3\xe2\xd4\xb6\xef\xfe]\x00\x18\x08\x02\x88]S\x0e\\\x92\xf1B\xf1\x89\x1c\x18\xc3k^\x85\xe5\x91y\xd4\xdfK"\xe4\xfd\xe6\x16\xf1D\xc1e\xf5>\x80U\xd6\xfe\x18\x96,P\xb2\x96\xe3U\xc2\xc2\xdd\x17)\xdbm\xc3N\xa4\x1eC\xb8^\xcc~\xde\xef\xa7\x11Z\xc9\x81\xb8\xa6u\xddw\xb0\x8d\xdc;\xcb\xbdq~\x8d|C\x9f\xb6'

Copy link
Owner

@mathurinm mathurinm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above

@Badr-MOUFAD
Copy link
Collaborator Author

I looked up the SUSY dataset but can't see the error

@mathurinm
Copy link
Owner

mathurinm commented Feb 16, 2023 via email

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

SUSY dataset fails
2 participants