Skip to content

Commit

Permalink
Remove redundant prepare_module (#2597)
Browse files Browse the repository at this point in the history
* Remove redundant prepare_module

* Remove redundant base_path
  • Loading branch information
albertvillanova authored Jul 7, 2021
1 parent 00aa516 commit 5ef78ed
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 17 deletions.
2 changes: 1 addition & 1 deletion src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ def as_streaming_dataset(
from .utils.streaming_download_manager import StreamingDownloadManager

dl_manager = StreamingDownloadManager(
- base_path=base_path,
+ base_path=base_path or self.base_path,
download_config=DownloadConfig(use_auth_token=use_auth_token),
dataset_name=self.name,
data_dir=self.config.data_dir,
Expand Down
17 changes: 1 addition & 16 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,20 +808,6 @@ def load_dataset(
f"using 'pip install datasets[streaming]' or 'pip install aiohttp' for instance"
)
# Download/copy dataset processing script
- module_path, hash, resolved_file_path = prepare_module(
- path,
- script_version=script_version,
- download_config=download_config,
- download_mode=download_mode,
- dataset=True,
- return_resolved_file_path=True,
- use_auth_token=use_auth_token,
- )
- # Set the base path for downloads as the parent of the script location
- if resolved_file_path is not None:
- base_path = url_or_path_parent(resolved_file_path)
- else:
- base_path = None

# Create a dataset builder
builder_instance = load_dataset_builder(
Expand All @@ -841,10 +827,9 @@ def load_dataset(
# Return iterable dataset in case of streaming
if streaming:
# this extends the open and os.path.join functions for data streaming
- extend_module_for_streaming(module_path, use_auth_token=use_auth_token)
+ extend_module_for_streaming(builder_instance.__module__, use_auth_token=use_auth_token)
return builder_instance.as_streaming_dataset(
split=split,
- base_path=base_path,
use_auth_token=use_auth_token,
)

Expand Down

1 comment on commit 5ef78ed

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008359 / 0.011353 (-0.002994) 0.003722 / 0.011008 (-0.007286) 0.027525 / 0.038508 (-0.010983) 0.031394 / 0.023109 (0.008285) 0.288873 / 0.275898 (0.012975) 0.318178 / 0.323480 (-0.005302) 0.007591 / 0.007986 (-0.000395) 0.004778 / 0.004328 (0.000450) 0.008101 / 0.004250 (0.003850) 0.039188 / 0.037052 (0.002135) 0.286912 / 0.258489 (0.028423) 0.323565 / 0.293841 (0.029724) 0.020459 / 0.128546 (-0.108087) 0.007255 / 0.075646 (-0.068391) 0.221114 / 0.419271 (-0.198157) 0.040246 / 0.043533 (-0.003287) 0.278622 / 0.255139 (0.023483) 0.313242 / 0.283200 (0.030042) 0.079505 / 0.141683 (-0.062178) 1.406379 / 1.452155 (-0.045776) 1.433162 / 1.492716 (-0.059554)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.013110 / 0.018006 (-0.004896) 0.524742 / 0.000490 (0.524252) 0.004036 / 0.000200 (0.003837) 0.000071 / 0.000054 (0.000017)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037122 / 0.037411 (-0.000289) 0.023413 / 0.014526 (0.008888) 0.029448 / 0.176557 (-0.147109) 0.126686 / 0.737135 (-0.610450) 0.032337 / 0.296338 (-0.264002)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.309854 / 0.215209 (0.094645) 3.104817 / 2.077655 (1.027162) 1.510309 / 1.504120 (0.006190) 1.356088 / 1.541195 (-0.185107) 1.421038 / 1.468490 (-0.047452) 0.272764 / 4.584777 (-4.312013) 4.243997 / 3.745712 (0.498285) 4.362071 / 5.269862 (-0.907791) 1.569823 / 4.565676 (-2.995853) 0.036586 / 0.424275 (-0.387690) 0.005419 / 0.007607 (-0.002188) 0.459236 / 0.226044 (0.233192) 4.646674 / 2.268929 (2.377745) 2.165309 / 55.444624 (-53.279316) 1.839682 / 6.876477 (-5.036795) 1.913008 / 2.142072 (-0.229065) 0.419946 / 4.805227 (-4.385281) 0.097868 / 6.500664 (-6.402797) 0.052323 / 0.075469 (-0.023146)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 9.836732 / 1.841788 (7.994944) 12.622970 / 8.074308 (4.548662) 23.390901 / 10.191392 (13.199509) 0.624613 / 0.680424 (-0.055811) 0.448771 / 0.534201 (-0.085430) 0.200359 / 0.579283 (-0.378924) 0.500512 / 0.434364 (0.066148) 0.181320 / 0.540337 (-0.359018) 0.880822 / 1.386936 (-0.506114)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008973 / 0.011353 (-0.002380) 0.003804 / 0.011008 (-0.007204) 0.031089 / 0.038508 (-0.007420) 0.035004 / 0.023109 (0.011895) 0.290036 / 0.275898 (0.014138) 0.326178 / 0.323480 (0.002698) 0.007626 / 0.007986 (-0.000359) 0.004877 / 0.004328 (0.000549) 0.008712 / 0.004250 (0.004461) 0.040587 / 0.037052 (0.003535) 0.286738 / 0.258489 (0.028249) 0.324567 / 0.293841 (0.030726) 0.023272 / 0.128546 (-0.105274) 0.007843 / 0.075646 (-0.067803) 0.249918 / 0.419271 (-0.169354) 0.045511 / 0.043533 (0.001978) 0.294083 / 0.255139 (0.038944) 0.315151 / 0.283200 (0.031951) 0.083383 / 0.141683 (-0.058299) 1.590511 / 1.452155 (0.138357) 1.620386 / 1.492716 (0.127670)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.044333 / 0.018006 (0.026327) 0.505282 / 0.000490 (0.504792) 0.011205 / 0.000200 (0.011005) 0.000488 / 0.000054 (0.000433)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.032068 / 0.037411 (-0.005343) 0.021204 / 0.014526 (0.006678) 0.027632 / 0.176557 (-0.148925) 0.113510 / 0.737135 (-0.623625) 0.029391 / 0.296338 (-0.266947)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.305355 / 0.215209 (0.090146) 3.044541 / 2.077655 (0.966887) 1.496892 / 1.504120 (-0.007228) 1.341953 / 1.541195 (-0.199242) 1.406649 / 1.468490 (-0.061842) 0.271300 / 4.584777 (-4.313477) 4.148198 / 3.745712 (0.402486) 2.558928 / 5.269862 (-2.710934) 0.992907 / 4.565676 (-3.572770) 0.031928 / 0.424275 (-0.392347) 0.004848 / 0.007607 (-0.002760) 0.398906 / 0.226044 (0.172861) 4.002641 / 2.268929 (1.733713) 1.919874 / 55.444624 (-53.524751) 1.612389 / 6.876477 (-5.264088) 1.679758 / 2.142072 (-0.462315) 0.366778 / 4.805227 (-4.438449) 0.087852 / 6.500664 (-6.412812) 0.046282 / 0.075469 (-0.029187)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 9.744700 / 1.841788 (7.902912) 12.284201 / 8.074308 (4.209892) 23.222322 / 10.191392 (13.030930) 0.646537 / 0.680424 (-0.033886) 0.440984 / 0.534201 (-0.093217) 0.199758 / 0.579283 (-0.379525) 0.506245 / 0.434364 (0.071881) 0.155933 / 0.540337 (-0.384404) 0.825706 / 1.386936 (-0.561230)

CML watermark

Please sign in to comment.