@@ -123,8 +123,8 @@ scalar_t upsample_get_value_bounded(
123
123
int access_y = max (min (y, dim.y - 1 ), 0L );
124
124
int access_x = max (min (x, dim.x - 1 ), 0L );
125
125
return data
126
- [n * strides.w + c * strides.z + access_y * strides.y +
127
- access_x * strides.x ];
126
+ [n * strides.x + c * strides.y + access_y * strides.z +
127
+ access_x * strides.w ];
128
128
}
129
129
130
130
template <typename scalar_t >
@@ -136,7 +136,7 @@ scalar_t upsample_get_value_bounded(
136
136
long c,
137
137
long x) {
138
138
int access_x = max (min (x, dim - 1 ), 0L );
139
- return data[n * strides.z + c * strides.y + access_x * strides.x ];
139
+ return data[n * strides.x + c * strides.y + access_x * strides.z ];
140
140
}
141
141
142
142
template <typename scalar_t >
@@ -153,8 +153,8 @@ void upsample_increment_value_bounded(
153
153
int access_x = max (min (x, dim.x - 1 ), 0L );
154
154
AtomicType<scalar_t >::atomic_add (
155
155
data,
156
- n * strides.w + c * strides.z + access_y * strides.y +
157
- access_x * strides.x ,
156
+ n * strides.x + c * strides.y + access_y * strides.z +
157
+ access_x * strides.w ,
158
158
value);
159
159
}
160
160
@@ -200,24 +200,24 @@ kernel void upsample_linear1d(
200
200
constant ulong3& output_strides [[buffer(3 )]],
201
201
constant long3& input_sizes [[buffer(4 )]],
202
202
constant long3& output_sizes [[buffer(5 )]],
203
- constant float& scale [[buffer(6 )]],
203
+ constant float2& scales [[buffer(6 )]],
204
204
constant bool& align_corners [[buffer(7 )]],
205
205
uint thread_index [[thread_position_in_grid]]) {
206
206
auto output_x = thread_index;
207
207
auto real_x = area_pixel_compute_source_index (
208
- scale , output_x, align_corners, /* cubic=*/ false );
208
+ scales. x , output_x, align_corners, /* cubic=*/ false );
209
209
auto t_x = fract (real_x);
210
210
211
- for (int n = 0 ; n < output_sizes.z ; n++) {
211
+ for (int n = 0 ; n < output_sizes.x ; n++) {
212
212
for (int c = 0 ; c < output_sizes.y ; c++) {
213
213
auto i00 = upsample_get_value_bounded<T>(
214
- inputData, input_sizes.x , input_strides, n, c, real_x);
214
+ inputData, input_sizes.z , input_strides, n, c, real_x);
215
215
auto i01 = upsample_get_value_bounded<T>(
216
- inputData, input_sizes.x , input_strides, n, c, real_x + 1 );
216
+ inputData, input_sizes.z , input_strides, n, c, real_x + 1 );
217
217
auto res = linear_interp (i00, i01, t_x);
218
218
outputData
219
- [n * output_strides.z + c * output_strides.y +
220
- output_x * output_strides.x ] = static_cast <T>(res);
219
+ [n * output_strides.x + c * output_strides.y +
220
+ output_x * output_strides.z ] = static_cast <T>(res);
221
221
}
222
222
}
223
223
}
@@ -232,26 +232,26 @@ kernel void upsample_bilinear2d(
232
232
constant float2& scales [[buffer(6 )]],
233
233
constant bool& align_corners [[buffer(7 )]],
234
234
uint thread_index [[thread_position_in_grid]]) {
235
- auto output_x = thread_index % output_sizes.x ;
236
- auto output_y = thread_index / output_sizes.x ;
235
+ auto output_x = thread_index % output_sizes.w ;
236
+ auto output_y = thread_index / output_sizes.w ;
237
237
auto real_x = area_pixel_compute_source_index (
238
238
scales.x , output_x, align_corners, /* cubic=*/ false );
239
239
auto t_x = fract (real_x);
240
240
241
241
auto real_y = area_pixel_compute_source_index (
242
242
scales.y , output_y, align_corners, /* cubic=*/ false );
243
243
auto t_y = fract (real_y);
244
- for (int n = 0 ; n < output_sizes.w ; n++) {
245
- for (int c = 0 ; c < output_sizes.z ; c++) {
244
+ for (int n = 0 ; n < output_sizes.x ; n++) {
245
+ for (int c = 0 ; c < output_sizes.y ; c++) {
246
246
auto i00 = upsample_get_value_bounded<T>(
247
- inputData, input_sizes.xy , input_strides, n, c, real_y, real_x);
247
+ inputData, input_sizes.wz , input_strides, n, c, real_y, real_x);
248
248
auto i01 = upsample_get_value_bounded<T>(
249
- inputData, input_sizes.xy , input_strides, n, c, real_y, real_x + 1 );
249
+ inputData, input_sizes.wz , input_strides, n, c, real_y, real_x + 1 );
250
250
auto i10 = upsample_get_value_bounded<T>(
251
- inputData, input_sizes.xy , input_strides, n, c, real_y + 1 , real_x);
251
+ inputData, input_sizes.wz , input_strides, n, c, real_y + 1 , real_x);
252
252
auto i11 = upsample_get_value_bounded<T>(
253
253
inputData,
254
- input_sizes.xy ,
254
+ input_sizes.wz ,
255
255
input_strides,
256
256
n,
257
257
c,
@@ -261,8 +261,8 @@ kernel void upsample_bilinear2d(
261
261
auto i1_l = linear_interp (i10, i11, t_x);
262
262
auto res = linear_interp (i0_l, i1_l, t_y);
263
263
outputData
264
- [n * output_strides.w + c * output_strides.z +
265
- output_x * output_strides.x + output_y * output_strides.y ] =
264
+ [n * output_strides.x + c * output_strides.y +
265
+ output_y * output_strides.z + output_x * output_strides.w ] =
266
266
static_cast <T>(res);
267
267
}
268
268
}
@@ -283,36 +283,36 @@ kernel void upsample_bilinear2d_aa(
283
283
constant float2& scales [[buffer(6 )]],
284
284
constant bool& align_corners [[buffer(7 )]],
285
285
uint thread_index [[thread_position_in_grid]]) {
286
- auto output_x = thread_index % output_sizes.x ;
287
- auto output_y = thread_index / output_sizes.x ;
286
+ auto output_x = thread_index % output_sizes.w ;
287
+ auto output_y = thread_index / output_sizes.w ;
288
288
(void )align_corners; // Align corners is unused for AA algorithm
289
289
auto x_center = area_pixel_compute_source_index (
290
290
scales.x , output_x, /* align_corners=*/ false , /* cubic=*/ false );
291
291
auto y_center = area_pixel_compute_source_index (
292
292
scales.y , output_y, /* align_corners=*/ false , /* cubic=*/ false );
293
293
auto clamped_scales = max (1.0 , scales);
294
294
auto x_min = max (0L , long (floor (x_center - clamped_scales.x + 1 )));
295
- auto x_max = min (input_sizes.x , long (ceil (x_center + clamped_scales.x )));
295
+ auto x_max = min (input_sizes.w , long (ceil (x_center + clamped_scales.x )));
296
296
auto y_min = max (0L , long (floor (y_center - clamped_scales.y + 1 )));
297
- auto y_max = min (input_sizes.y , long (ceil (y_center + clamped_scales.y )));
298
- for (int n = 0 ; n < output_sizes.w ; n++) {
299
- for (int c = 0 ; c < output_sizes.z ; c++) {
297
+ auto y_max = min (input_sizes.z , long (ceil (y_center + clamped_scales.y )));
298
+ for (int n = 0 ; n < output_sizes.x ; n++) {
299
+ for (int c = 0 ; c < output_sizes.y ; c++) {
300
300
float res = 0.0 ;
301
301
float ws = 0.0 ;
302
302
constant auto * input =
303
- inputData + n * input_strides.w + c * input_strides.z ;
303
+ inputData + n * input_strides.x + c * input_strides.y ;
304
304
for (auto y = y_min; y < y_max; ++y) {
305
305
auto dy = bilinear_functor ((y - y_center) / clamped_scales.y );
306
306
for (auto x = x_min; x < x_max; ++x) {
307
307
auto dx = bilinear_functor ((x - x_center) / clamped_scales.x );
308
- auto val = input[x * input_strides.x + y * input_strides.y ];
308
+ auto val = input[x * input_strides.w + y * input_strides.z ];
309
309
res += val * dx * dy;
310
310
ws += dx * dy;
311
311
}
312
312
}
313
313
outputData
314
- [n * output_strides.w + c * output_strides.z +
315
- output_x * output_strides.x + output_y * output_strides.y ] =
314
+ [n * output_strides.x + c * output_strides.y +
315
+ output_y * output_strides.z + output_x * output_strides.w ] =
316
316
static_cast <T>(res / ws);
317
317
}
318
318
}
@@ -329,8 +329,8 @@ kernel void upsample_bicubic2d(
329
329
constant float2& scales [[buffer(6 )]],
330
330
constant bool& align_corners [[buffer(7 )]],
331
331
uint thread_index [[thread_position_in_grid]]) {
332
- auto output_x = thread_index % output_sizes.x ;
333
- auto output_y = thread_index / output_sizes.x ;
332
+ auto output_x = thread_index % output_sizes.w ;
333
+ auto output_y = thread_index / output_sizes.w ;
334
334
auto real_x = area_pixel_compute_source_index (
335
335
scales.x , output_x, align_corners, /* cubic=*/ true );
336
336
int in_x = floor (real_x);
@@ -340,38 +340,38 @@ kernel void upsample_bicubic2d(
340
340
scales.y , output_y, align_corners, /* cubic=*/ true );
341
341
int in_y = floor (real_y);
342
342
auto t_y = real_y - in_y;
343
- for (int n = 0 ; n < output_sizes.w ; n++) {
344
- for (int c = 0 ; c < output_sizes.z ; c++) {
343
+ for (int n = 0 ; n < output_sizes.x ; n++) {
344
+ for (int c = 0 ; c < output_sizes.y ; c++) {
345
345
float coefficients[4 ];
346
346
for (int k = 0 ; k < 4 ; k++) {
347
347
coefficients[k] = cubic_interp1d (
348
348
upsample_get_value_bounded<T>(
349
349
inputData,
350
- input_sizes.xy ,
350
+ input_sizes.wz ,
351
351
input_strides,
352
352
n,
353
353
c,
354
354
in_y - 1 + k,
355
355
in_x - 1 ),
356
356
upsample_get_value_bounded<T>(
357
357
inputData,
358
- input_sizes.xy ,
358
+ input_sizes.wz ,
359
359
input_strides,
360
360
n,
361
361
c,
362
362
in_y - 1 + k,
363
363
in_x + 0 ),
364
364
upsample_get_value_bounded<T>(
365
365
inputData,
366
- input_sizes.xy ,
366
+ input_sizes.wz ,
367
367
input_strides,
368
368
n,
369
369
c,
370
370
in_y - 1 + k,
371
371
in_x + 1 ),
372
372
upsample_get_value_bounded<T>(
373
373
inputData,
374
- input_sizes.xy ,
374
+ input_sizes.wz ,
375
375
input_strides,
376
376
n,
377
377
c,
@@ -386,8 +386,8 @@ kernel void upsample_bicubic2d(
386
386
coefficients[3 ],
387
387
t_y));
388
388
outputData
389
- [n * output_strides.w + c * output_strides.z +
390
- output_x * output_strides.x + output_y * output_strides.y ] = inp;
389
+ [n * output_strides.x + c * output_strides.y +
390
+ output_y * output_strides.z + output_x * output_strides.w ] = inp;
391
391
}
392
392
}
393
393
}
@@ -403,8 +403,8 @@ kernel void upsample_bicubic2d_backward(
403
403
constant float2& scales [[buffer(6 )]],
404
404
constant bool& align_corners [[buffer(7 )]],
405
405
uint thread_index [[thread_position_in_grid]]) {
406
- auto output_x = thread_index % output_sizes.x ;
407
- auto output_y = thread_index / output_sizes.x ;
406
+ auto output_x = thread_index % output_sizes.w ;
407
+ auto output_y = thread_index / output_sizes.w ;
408
408
auto real_x = area_pixel_compute_source_index<float >(
409
409
scales.x , output_x, align_corners, /* cubic=*/ true );
410
410
int input_x = floor (real_x);
@@ -421,16 +421,16 @@ kernel void upsample_bicubic2d_backward(
421
421
get_cubic_upsampling_coefficients (x_coeffs, t_x);
422
422
get_cubic_upsampling_coefficients (y_coeffs, t_y);
423
423
424
- for (int n = 0 ; n < output_sizes.w ; n++) {
425
- for (int c = 0 ; c < output_sizes.z ; ++c) {
424
+ for (int n = 0 ; n < output_sizes.x ; n++) {
425
+ for (int c = 0 ; c < output_sizes.y ; ++c) {
426
426
auto out_value = gradOutputData
427
- [n * output_strides.w + c * output_strides.z +
428
- output_x * output_strides.x + output_y * output_strides.y ];
427
+ [n * output_strides.x + c * output_strides.y +
428
+ output_y * output_strides.z + output_x * output_strides.w ];
429
429
for (int i = 0 ; i < 4 ; i++) {
430
430
for (int j = 0 ; j < 4 ; j++) {
431
431
upsample_increment_value_bounded<T>(
432
432
gradInputData,
433
- input_sizes.xy ,
433
+ input_sizes.wz ,
434
434
input_strides,
435
435
n,
436
436
c,
@@ -478,7 +478,7 @@ kernel void upsample_bicubic2d_backward(
478
478
constant ulong3 & output_strides [[buffer(3 )]], \
479
479
constant long3 & input_sizes [[buffer(4 )]], \
480
480
constant long3 & output_sizes [[buffer(5 )]], \
481
- constant float & scale [[buffer(6 )]], \
481
+ constant float2 & scales [[buffer(6 )]], \
482
482
constant bool & align_corners [[buffer(7 )]], \
483
483
uint thread_index [[thread_position_in_grid]])
484
484
0 commit comments