Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nomerge] Potential perf opt on bbox kernels #6872

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions torchvision/prototype/transforms/functional/_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def horizontal_flip_bounding_box(
bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
).reshape(-1, 4)

# TODO: performance improvement by using inplace on intermediate results
bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]]

return convert_format_bounding_box(
Expand Down Expand Up @@ -81,6 +82,7 @@ def vertical_flip_bounding_box(
bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
).reshape(-1, 4)

# TODO: performance improvement by using inplace on intermediate results
bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]]

return convert_format_bounding_box(
Expand Down Expand Up @@ -172,6 +174,7 @@ def resize_bounding_box(
new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size)
ratios = torch.tensor((new_width / old_width, new_height / old_height), device=bounding_box.device)
return (
# TODO: check if this is faster than repeating the ratios to shape of 4 and multiplying directly
bounding_box.reshape(-1, 2, 2).mul(ratios).to(bounding_box.dtype).reshape(bounding_box.shape),
(new_height, new_width),
)
Expand Down Expand Up @@ -356,6 +359,7 @@ def _affine_bounding_box_xyxy(
# 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
# and compute bounding box from 4 transformed points:
transformed_points = transformed_points.reshape(-1, 4, 2)
# TODO: check if aminmax could help here
out_bbox_mins, _ = torch.min(transformed_points, dim=1)
out_bbox_maxs, _ = torch.max(transformed_points, dim=1)
out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1)
Expand All @@ -376,6 +380,7 @@ def _affine_bounding_box_xyxy(
)
new_points = torch.matmul(points, transposed_affine_matrix)
tr, _ = torch.min(new_points, dim=0, keepdim=True)
# TODO: performance improvement by using inplace on intermediate results
# Translate bounding boxes
out_bboxes[:, 0::2] = out_bboxes[:, 0::2] - tr[:, 0]
out_bboxes[:, 1::2] = out_bboxes[:, 1::2] - tr[:, 1]
Expand Down Expand Up @@ -759,6 +764,7 @@ def pad_bounding_box(

bounding_box = bounding_box.clone()

# TODO: avoid repeated indexing and perform concurrently. `output.add_(torch.tensor([left, top, left, top]))` is 3x faster
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A quick check of:

def single(inpt, left, top):
    """Shift box coordinates by (left, top) using one indexed write per column.

    Benchmark variant: performs four separate advanced-indexing in-place adds
    on a clone of *inpt* (x-coords get ``left``, y-coords get ``top``).
    Returns the shifted clone; *inpt* itself is untouched.
    """
    shifted = inpt.clone()
    # Columns alternate x, y, x, y — add the matching offset to each in turn.
    for col, offset in enumerate((left, top, left, top)):
        shifted[..., col] += offset
    return shifted


def concurrent(inpt, left, top):
    """Shift box coordinates by (left, top) with a single vectorized in-place add.

    Benchmark variant: builds one offset tensor ``[left, top, left, top]`` and
    broadcasts it over the last dimension of a clone of *inpt* in one
    ``add_`` call. Returns the shifted clone; *inpt* itself is untouched.
    """
    result = inpt.clone()
    offsets = torch.tensor([left, top, left, top])
    result.add_(offsets)
    return result

Shows:

[-------- Pad cpu torch.float32 --------]
               |  single   |  concurrent 
1 threads: ------------------------------
      (16, 4)  |     60    |       20    
6 threads: ------------------------------
      (16, 4)  |     58    |       15    

Times are in microseconds (us).

There might be multiple opportunities for speed-ups like this.

# this works without conversion since padding only affects xy coordinates
bounding_box[..., 0] += left
bounding_box[..., 1] += top
Expand Down Expand Up @@ -815,6 +821,7 @@ def crop_bounding_box(
bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
)

# TODO: avoid repeated indexing and perform concurrently.
# Crop or implicit pad if left and/or top have negative values:
bounding_box[..., 0::2] -= left
bounding_box[..., 1::2] -= top
Expand Down Expand Up @@ -946,6 +953,7 @@ def perspective_bounding_box(
# x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
# y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)

# TODO: Investigate potential optimizations by in-placing intermediate results, aminmax etc
numer_points = torch.matmul(points, theta1.T)
denom_points = torch.matmul(points, theta2.T)
transformed_points = numer_points / denom_points
Expand Down Expand Up @@ -1062,6 +1070,7 @@ def elastic_bounding_box(
# Or add spatial_size arg and check displacement shape
spatial_size = displacement.shape[-3], displacement.shape[-2]

# TODO: Investigate potential optimizations by in-placing intermediate results, aminmax etc
id_grid = _FT._create_identity_grid(list(spatial_size)).to(bounding_box.device)
# We construct an approximation of inverse grid as inv_grid = id_grid - displacement
# This is not an exact inverse of the grid
Expand Down