Training slows down significantly for small dataset sizes #8113
-
### 🐛 Bug

When the dataset size is small (i.e. comparable to the minibatch size), training slows down significantly.

- No GPU, batch size 64, dataset size 1024: 185 iterations/second
- 1 GPU, batch size 64, dataset size 1024: 110 iterations/second
- 1 GPU, batch size 800, dataset size 1024: 19 iterations/second

### Please reproduce using the BoringModel

```python
import sys
from argparse import ArgumentParser

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks.progress import ProgressBar, reset, tqdm


class CustomProgressBar(ProgressBar):

    def init_train_tqdm(self) -> tqdm:
        """Override this to customize the tqdm bar for training."""
        bar = tqdm(
            desc='Training',
            initial=self.trainer.global_step,
            position=(2 * self.process_position),
            disable=self.is_disabled,
            leave=True,
            dynamic_ncols=True,
            file=sys.stdout,
            smoothing=0,
        )
        return bar

    def on_train_start(self, trainer, pl_module):
        # Skip ProgressBar's implementation so the bar tracks global steps up to
        # max_steps instead of resetting per epoch.
        super(ProgressBar, self).on_train_start(trainer, pl_module)
        self.main_progress_bar = self.init_train_tqdm()
        self.prev_train_gs = -1
        reset(self.main_progress_bar, self.trainer.max_steps)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
        super(ProgressBar, self).on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx)
        if self.prev_train_gs != self.trainer.global_step and self._should_update(self.trainer.global_step, self.trainer.max_steps):
            self._update_bar(self.main_progress_bar)
            self.main_progress_bar.set_postfix(trainer.progress_bar_dict)
        self.prev_train_gs = self.trainer.global_step

    def on_train_epoch_start(self, trainer, pl_module):
        # Skip ProgressBar's per-epoch reset of the bar.
        super(ProgressBar, self).on_train_epoch_start(trainer, pl_module)

    def on_train_end(self, trainer, pl_module):
        super(ProgressBar, self).on_train_end(trainer, pl_module)


class RandomDataset(Dataset):
    """Dataset of `length` random vectors of dimension `size`."""

    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    """Minimal three-layer model used to reproduce the slowdown."""

    def __init__(self, train_data, test_data, bs):
        super().__init__()
        self.layer1 = torch.nn.Linear(32, 32)
        self.layer2 = torch.nn.Linear(32, 32)
        self.layer3 = torch.nn.Linear(32, 2)
        self.train_data = train_data
        self.test_data = test_data
        self.bs = bs

    def forward(self, x):
        return self.layer3(torch.relu(self.layer2(torch.relu(self.layer1(x)))))

    def loss(self, batch, prediction):
        # An arbitrary loss so that the model weights update during `Trainer.fit` calls
        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

    def step(self, x):
        x = self.forward(x)
        out = torch.nn.functional.mse_loss(x, torch.ones_like(x))
        return out

    def training_step(self, batch, batch_idx):
        output = self.forward(batch)
        loss = self.loss(batch, output)
        return {"loss": loss}

    def training_step_end(self, training_step_outputs):
        return training_step_outputs

    def training_epoch_end(self, outputs) -> None:
        torch.stack([x["loss"] for x in outputs]).mean()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            list(self.layer1.parameters()) + list(self.layer2.parameters()) + list(self.layer3.parameters()),
            lr=0.001,
        )
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]

    def train_dataloader(self):
        train_loader = DataLoader(self.train_data, shuffle=True, num_workers=1, batch_size=self.bs)
        return train_loader


parser = ArgumentParser()
parser.add_argument("--gpus", type=int, default=0)
parser.add_argument("--num_processes", type=int, default=1)
parser.add_argument("--dataset_size", type=int, default=1024)
parser.add_argument("--mb_size", type=int, default=64)
args = parser.parse_args()


def test_run():
    # data
    train_data = torch.randn(args.dataset_size, 32)
    test_data = torch.randn(256, 32)

    # model
    model = BoringModel(train_data, test_data, bs=args.mb_size)
    trainer = Trainer(
        gpus=args.gpus,
        logger=False,
        max_steps=5000,
        limit_val_batches=0,
        num_processes=args.num_processes,
        weights_summary=None,
        reload_dataloaders_every_epoch=False,
        callbacks=[CustomProgressBar()],
    )

    # fit
    trainer.fit(model)
    print(f"{trainer.accelerator_backend=}")
    print(f"{trainer.gpus=}")
    print(f"{trainer.num_processes=}")
    print(f"{trainer.global_step=}")


if __name__ == "__main__":
    test_run()
```

### To Reproduce

Run the following command:

### Expected behavior

Iterations/second is unaffected by dataset size.

### Environment
### Additional context

My guess is that this is caused by inter-epoch reloading of the dataset. The code should be restructured to pre-load a fixed number of minibatches ahead, rather than caring about where epoch boundaries fall.
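If per-epoch dataloader setup is indeed the cause, one quick check (a sketch assuming PyTorch >= 1.7, not something tried in the report) is to keep the loader workers alive across epochs:

```python
# Hypothetical tweak to the repro's train_dataloader: persistent_workers keeps the
# worker processes alive across epochs instead of respawning them, which removes
# most of the per-epoch startup cost when num_workers > 0.
def train_dataloader(self):
    return DataLoader(
        self.train_data,
        shuffle=True,
        num_workers=1,
        batch_size=self.bs,
        persistent_workers=True,
    )
```

If the iterations/second gap shrinks with this change, the epoch-boundary reload hypothesis is likely right.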
-
Dear @jbuckman, could you use `profiler="simple"` or `profiler="advanced"` to explore the source of the problem? Best,
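For reference, a minimal sketch of what that looks like; only `profiler` is new relative to the Trainer arguments already in the report:

```python
from pytorch_lightning import Trainer

# Same Trainer as in the repro script, with the built-in profiler turned on
# to break down where the time per training step is going.
trainer = Trainer(
    gpus=1,
    logger=False,
    max_steps=5000,
    limit_val_batches=0,
    profiler="simple",  # or profiler="advanced" for cProfile-level detail
)
```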
-
I would say the issue here is that you always have to assume some fixed overhead, and for a small dataset that initial phase is dominant compared to the full run. You can see a parallel with a car driving 100 or 1000 meters: in both cases you have to start from zero, and the longer you go, the more you benefit from not having to start again.
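One way to make that fixed cost visible, as a sketch rather than anything posted in this thread, is a callback that times each epoch:

```python
import time

from pytorch_lightning.callbacks import Callback


class EpochTimer(Callback):
    """Print wall-clock time per epoch so the fixed per-epoch cost is visible."""

    def on_train_epoch_start(self, trainer, pl_module):
        self._start = time.monotonic()

    def on_train_epoch_end(self, trainer, pl_module, *args):
        # *args absorbs the extra `outputs` argument some Lightning versions pass.
        print(f"epoch {trainer.current_epoch}: {time.monotonic() - self._start:.3f}s")
```

Adding it to `callbacks=[EpochTimer(), CustomProgressBar()]` in the repro script would show whether the time is lost at epoch boundaries or spread evenly across steps.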
-
Hey @Borda, thanks!