Description
PyTorch PR: pytorch/pytorch#153096
PyTorch CI: https://github.com/pytorch/pytorch/actions/runs/14909681029/job/41882713332
Failing test:
python test/inductor/test_compile_subprocess.py GPUTests.test_upsample_bicubic2d_xpu
(AssertionError: Tensor-likes are not close!)

GPUTests.test_upsample_bicubic2d_xpu started to fail after commit f4eb329.
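For reference, a minimal sketch of the pattern the test exercises, distilled from the fn/main in the generated reproducer below. The shapes, align_corners flags, and tolerances are taken from that reproducer; the torch.compile wrapper and the random input are assumptions about how the test drives it, not the actual test body:

import torch

aten = torch.ops.aten

def fn(a):
    # Two bicubic upsamples, one with align_corners=True and one with align_corners=False,
    # matching the calls in the reproducer's main().
    return (
        aten.upsample_bicubic2d(a, (128, 128), True),
        aten.upsample_bicubic2d(a, (128, 256), False),
    )

a = torch.randn(4, 3, 64, 32, device="xpu", dtype=torch.float32)
compiled = torch.compile(fn)
# Compare the Inductor-compiled outputs against eager, using the reproducer's tolerances.
torch.testing.assert_close(compiled(a), fn(a), atol=0.05, rtol=0.05)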
Reproducer:
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
from torch._C import _xpu_getCurrentRawStream as get_raw_stream
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
assert_alignment = torch._C._dynamo.guards.assert_alignment
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()
empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
# kernel path: /tmp/tmpe36k26mw/xc/cxcmlzu2y7jefjxlsuswz7djr574ritanone4cm544hpb7lof5tn.py
# Topologically Sorted Source Nodes: [upsample_bicubic2d], Original ATen: [aten.floor, aten.arange, aten._to_copy, aten.mul, aten._unsafe_index, aten.sub, aten.clamp, aten.add, aten.rsub]
# Source node to ATen node mapping:
# upsample_bicubic2d => _unsafe_index, _unsafe_index_1, _unsafe_index_10, _unsafe_index_11, _unsafe_index_12, _unsafe_index_13, _unsafe_index_14, _unsafe_index_15, _unsafe_index_2, _unsafe_index_3, _unsafe_index_4, _unsafe_index_5, _unsafe_index_6, _unsafe_index_7, _unsafe_index_8, _unsafe_index_9, add_10, add_11, add_12, add_13, add_14, add_15, add_16, add_17, add_18, add_19, add_20, add_21, add_22, add_23, add_24, add_25, add_26, add_27, add_28, add_4, add_5, add_6, add_7, add_8, add_9, clamp_max, clamp_max_1, clamp_min, clamp_min_1, convert_element_type_1, floor, floor_1, iota_1, mul, mul_10, mul_11, mul_12, mul_13, mul_14, mul_15, mul_16, mul_17, mul_18, mul_19, mul_2, mul_20, mul_21, mul_22, mul_23, mul_24, mul_25, mul_26, mul_27, mul_28, mul_29, mul_3, mul_30, mul_31, mul_32, mul_33, mul_34, mul_35, mul_36, mul_37, mul_38, mul_39, mul_4, mul_40, mul_41, mul_42, mul_43, mul_44, mul_45, mul_5, mul_6, mul_7, mul_8, mul_9, sub, sub_1, sub_10, sub_11, sub_12, sub_13, sub_14, sub_15, sub_16, sub_17, sub_18, sub_19, sub_4, sub_5, sub_6, sub_7, sub_8, sub_9
# Graph fragment:
# %floor_1 : [num_users=2] = call_function[target=torch.ops.aten.floor.default](args = (%unsqueeze,), kwargs = {})
# %iota_1 : [num_users=1] = call_function[target=torch.ops.prims.iota.default](args = (128,), kwargs = {start: 0, step: 1, dtype: torch.int64, device: xpu:0, requires_grad: False})
# %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%iota_1, torch.float32), kwargs = {})
# %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, 0.2440944881889764), kwargs = {})
# %floor : [num_users=2] = call_function[target=torch.ops.aten.floor.default](args = (%mul,), kwargs = {})
# %_unsafe_index : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_2, %clamp_max_3]), kwargs = {})
# %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul, %floor), kwargs = {})
# %clamp_min_1 : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub_1, 0.0), kwargs = {})
# %clamp_max_1 : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.default](args = (%clamp_min_1, 1.0), kwargs = {})
# %add_4 : [num_users=3] = call_function[target=torch.ops.aten.add.Tensor](args = (%clamp_max_1, 1.0), kwargs = {})
# %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_4, -0.75), kwargs = {})
# %sub_4 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_2, -3.75), kwargs = {})
# %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_4, %add_4), kwargs = {})
# %add_5 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, -6.0), kwargs = {})
# %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_5, %add_4), kwargs = {})
# %sub_5 : [num_users=4] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_4, -3.0), kwargs = {})
# %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index, %sub_5), kwargs = {})
# %_unsafe_index_1 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_4, %clamp_max_5]), kwargs = {})
# %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%clamp_max_1, 1.25), kwargs = {})
# %sub_6 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_5, 2.25), kwargs = {})
# %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_6, %clamp_max_1), kwargs = {})
# %mul_7 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_6, %clamp_max_1), kwargs = {})
# %add_6 : [num_users=4] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_7, 1), kwargs = {})
# %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_1, %add_6), kwargs = {})
# %add_14 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_26, %mul_27), kwargs = {})
# %_unsafe_index_2 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_6, %clamp_max_7]), kwargs = {})
# %sub_7 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %clamp_max_1), kwargs = {})
# %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_7, 1.25), kwargs = {})
# %sub_8 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_8, 2.25), kwargs = {})
# %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_8, %sub_7), kwargs = {})
# %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_9, %sub_7), kwargs = {})
# %add_7 : [num_users=4] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_10, 1), kwargs = {})
# %mul_28 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_2, %add_7), kwargs = {})
# %add_15 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_14, %mul_28), kwargs = {})
# %_unsafe_index_3 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_8, %clamp_max_9]), kwargs = {})
# %sub_9 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (2.0, %clamp_max_1), kwargs = {})
# %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_9, -0.75), kwargs = {})
# %sub_10 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_11, -3.75), kwargs = {})
# %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_10, %sub_9), kwargs = {})
# %add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_12, -6.0), kwargs = {})
# %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_8, %sub_9), kwargs = {})
# %sub_11 : [num_users=4] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_13, -3.0), kwargs = {})
# %mul_29 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_3, %sub_11), kwargs = {})
# %add_16 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_15, %mul_29), kwargs = {})
# %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%unsqueeze, %floor_1), kwargs = {})
# %clamp_min : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub, 0.0), kwargs = {})
# %clamp_max : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.default](args = (%clamp_min, 1.0), kwargs = {})
# %add_9 : [num_users=3] = call_function[target=torch.ops.aten.add.Tensor](args = (%clamp_max, 1.0), kwargs = {})
# %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_9, -0.75), kwargs = {})
# %sub_12 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_14, -3.75), kwargs = {})
# %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_12, %add_9), kwargs = {})
# %add_10 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_15, -6.0), kwargs = {})
# %mul_16 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_10, %add_9), kwargs = {})
# %sub_13 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_16, -3.0), kwargs = {})
# %mul_42 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_16, %sub_13), kwargs = {})
# %_unsafe_index_4 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_10, %clamp_max_11]), kwargs = {})
# %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_4, %sub_5), kwargs = {})
# %_unsafe_index_5 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_12, %clamp_max_13]), kwargs = {})
# %mul_31 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_5, %add_6), kwargs = {})
# %add_17 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_30, %mul_31), kwargs = {})
# %_unsafe_index_6 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_14, %clamp_max_15]), kwargs = {})
# %mul_32 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_6, %add_7), kwargs = {})
# %add_18 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_17, %mul_32), kwargs = {})
# %_unsafe_index_7 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_16, %clamp_max_17]), kwargs = {})
# %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_7, %sub_11), kwargs = {})
# %add_19 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_18, %mul_33), kwargs = {})
# %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%clamp_max, 1.25), kwargs = {})
# %sub_14 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_17, 2.25), kwargs = {})
# %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_14, %clamp_max), kwargs = {})
# %mul_19 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_18, %clamp_max), kwargs = {})
# %add_11 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_19, 1), kwargs = {})
# %mul_43 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_19, %add_11), kwargs = {})
# %add_26 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_42, %mul_43), kwargs = {})
# %_unsafe_index_8 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_18, %clamp_max_19]), kwargs = {})
# %mul_34 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_8, %sub_5), kwargs = {})
# %_unsafe_index_9 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_20, %clamp_max_21]), kwargs = {})
# %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_9, %add_6), kwargs = {})
# %add_20 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_34, %mul_35), kwargs = {})
# %_unsafe_index_10 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_22, %clamp_max_23]), kwargs = {})
# %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_10, %add_7), kwargs = {})
# %add_21 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_20, %mul_36), kwargs = {})
# %_unsafe_index_11 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_24, %clamp_max_25]), kwargs = {})
# %mul_37 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_11, %sub_11), kwargs = {})
# %add_22 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_21, %mul_37), kwargs = {})
# %sub_15 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %clamp_max), kwargs = {})
# %mul_20 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_15, 1.25), kwargs = {})
# %sub_16 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_20, 2.25), kwargs = {})
# %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_16, %sub_15), kwargs = {})
# %mul_22 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_21, %sub_15), kwargs = {})
# %add_12 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_22, 1), kwargs = {})
# %mul_44 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_22, %add_12), kwargs = {})
# %add_27 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_26, %mul_44), kwargs = {})
# %_unsafe_index_12 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_26, %clamp_max_27]), kwargs = {})
# %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_12, %sub_5), kwargs = {})
# %_unsafe_index_13 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_28, %clamp_max_29]), kwargs = {})
# %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_13, %add_6), kwargs = {})
# %add_23 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_38, %mul_39), kwargs = {})
# %_unsafe_index_14 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_30, %clamp_max_31]), kwargs = {})
# %mul_40 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_14, %add_7), kwargs = {})
# %add_24 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_23, %mul_40), kwargs = {})
# %_unsafe_index_15 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_32, %clamp_max_33]), kwargs = {})
# %mul_41 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_15, %sub_11), kwargs = {})
# %add_25 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_24, %mul_41), kwargs = {})
# %sub_17 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (2.0, %clamp_max), kwargs = {})
# %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_17, -0.75), kwargs = {})
# %sub_18 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_23, -3.75), kwargs = {})
# %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_18, %sub_17), kwargs = {})
# %add_13 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_24, -6.0), kwargs = {})
# %mul_25 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_13, %sub_17), kwargs = {})
# %sub_19 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_25, -3.0), kwargs = {})
# %mul_45 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_25, %sub_19), kwargs = {})
# %add_28 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_27, %mul_45), kwargs = {})
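# Note: the two kernels below evaluate the Keys bicubic convolution kernel with A = -0.75.
# The weight for the two inner taps is 1.25*|t|**3 - 2.25*|t|**2 + 1 and for the two outer
# taps it is -0.75*|t|**3 + 3.75*|t|**2 - 6*|t| + 3, which is where the 1.25 / 2.25 / -0.75 /
# -3.75 / -6.0 / -3.0 constants in the Triton code come from. This first kernel uses the
# align_corners=True mapping src = dst * (in_size - 1) / (out_size - 1), i.e. the scales
# 0.49606299212598426 = 63/127 (height) and 0.2440944881889764 = 31/127 (width).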
triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_0 = async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_0', '''
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
@triton_heuristics.pointwise(
size_hints={'x': 262144},
filename=__file__,
triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='xpu', index=0, multi_processor_count=56, cc={'architecture': 13136561920, 'driver_version': '1.6.32567+18', 'gpu_eu_count': 448, 'gpu_subslice_count': 56, 'has_atomic64': True, 'has_bfloat16_conversions': True, 'has_fp16': True, 'has_fp64': True, 'has_subgroup_2d_block_io': True, 'has_subgroup_matrix_multiply_accumulate': True, 'has_subgroup_matrix_multiply_accumulate_tensor_float32': False, 'max_compute_units': 448, 'max_num_sub_groups': 64, 'max_work_group_size': 1024, 'name': 'Intel(R) Data Center GPU Max 1100', 'platform_name': 'Intel(R) oneAPI Unified Runtime over Level-Zero', 'sub_group_sizes': [16, 32], 'total_memory': 51539607552, 'type': 'gpu', 'vendor': 'Intel(R) Corporation', 'version': '12.60.7'}, major=None, regs_per_multiprocessor=None, max_threads_per_multi_processor=None, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': '9C4F292F97BD56CD901940BC672A5DEDD5EC916EC29FD8FD818A1266B7DD4045', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_0(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 196608
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = tl.full([XBLOCK], True, tl.int1)
x1 = ((xindex // 128) % 128)
x0 = (xindex % 128)
x2 = xindex // 16384
x4 = xindex
tmp0 = x1
tmp1 = tmp0.to(tl.float32)
tmp2 = 0.49606299212598426
tmp3 = tmp1 * tmp2
tmp4 = libdevice.floor(tmp3)
tmp5 = tmp4.to(tl.int32)
tmp6 = tl.full([1], 1, tl.int64)
tmp7 = tmp5 - tmp6
tmp8 = tl.full([1], 0, tl.int64)
tmp9 = triton_helpers.maximum(tmp7, tmp8)
tmp10 = tl.full([1], 63, tl.int64)
tmp11 = triton_helpers.minimum(tmp9, tmp10)
tmp12 = x0
tmp13 = tmp12.to(tl.float32)
tmp14 = 0.2440944881889764
tmp15 = tmp13 * tmp14
tmp16 = libdevice.floor(tmp15)
tmp17 = tmp16.to(tl.int32)
tmp18 = tmp17 - tmp6
tmp19 = triton_helpers.maximum(tmp18, tmp8)
tmp20 = tl.full([1], 31, tl.int64)
tmp21 = triton_helpers.minimum(tmp19, tmp20)
tmp22 = tl.load(in_ptr0 + (tmp21 + 32*tmp11 + 2048*x2), None, eviction_policy='evict_last')
tmp23 = tmp15 - tmp16
tmp24 = 0.0
tmp25 = triton_helpers.maximum(tmp23, tmp24)
tmp26 = 1.0
tmp27 = triton_helpers.minimum(tmp25, tmp26)
tmp28 = tmp27 + tmp26
tmp29 = -0.75
tmp30 = tmp28 * tmp29
tmp31 = -3.75
tmp32 = tmp30 - tmp31
tmp33 = tmp32 * tmp28
tmp34 = -6.0
tmp35 = tmp33 + tmp34
tmp36 = tmp35 * tmp28
tmp37 = -3.0
tmp38 = tmp36 - tmp37
tmp39 = tmp22 * tmp38
tmp40 = triton_helpers.maximum(tmp17, tmp8)
tmp41 = triton_helpers.minimum(tmp40, tmp20)
tmp42 = tl.load(in_ptr0 + (tmp41 + 32*tmp11 + 2048*x2), None, eviction_policy='evict_last')
tmp43 = 1.25
tmp44 = tmp27 * tmp43
tmp45 = 2.25
tmp46 = tmp44 - tmp45
tmp47 = tmp46 * tmp27
tmp48 = tmp47 * tmp27
tmp49 = tmp48 + tmp26
tmp50 = tmp42 * tmp49
tmp51 = tmp39 + tmp50
tmp52 = tmp17 + tmp6
tmp53 = triton_helpers.maximum(tmp52, tmp8)
tmp54 = triton_helpers.minimum(tmp53, tmp20)
tmp55 = tl.load(in_ptr0 + (tmp54 + 32*tmp11 + 2048*x2), None, eviction_policy='evict_last')
tmp56 = tmp26 - tmp27
tmp57 = tmp56 * tmp43
tmp58 = tmp57 - tmp45
tmp59 = tmp58 * tmp56
tmp60 = tmp59 * tmp56
tmp61 = tmp60 + tmp26
tmp62 = tmp55 * tmp61
tmp63 = tmp51 + tmp62
tmp64 = tl.full([1], 2, tl.int64)
tmp65 = tmp17 + tmp64
tmp66 = triton_helpers.maximum(tmp65, tmp8)
tmp67 = triton_helpers.minimum(tmp66, tmp20)
tmp68 = tl.load(in_ptr0 + (tmp67 + 32*tmp11 + 2048*x2), None, eviction_policy='evict_last')
tmp69 = 2.0
tmp70 = tmp69 - tmp27
tmp71 = tmp70 * tmp29
tmp72 = tmp71 - tmp31
tmp73 = tmp72 * tmp70
tmp74 = tmp73 + tmp34
tmp75 = tmp74 * tmp70
tmp76 = tmp75 - tmp37
tmp77 = tmp68 * tmp76
tmp78 = tmp63 + tmp77
tmp79 = triton_helpers.maximum(tmp5, tmp8)
tmp80 = triton_helpers.minimum(tmp79, tmp10)
tmp81 = tl.load(in_ptr0 + (tmp21 + 32*tmp80 + 2048*x2), None, eviction_policy='evict_last')
tmp82 = tmp81 * tmp38
tmp83 = tl.load(in_ptr0 + (tmp41 + 32*tmp80 + 2048*x2), None, eviction_policy='evict_last')
tmp84 = tmp83 * tmp49
tmp85 = tmp82 + tmp84
tmp86 = tl.load(in_ptr0 + (tmp54 + 32*tmp80 + 2048*x2), None, eviction_policy='evict_last')
tmp87 = tmp86 * tmp61
tmp88 = tmp85 + tmp87
tmp89 = tl.load(in_ptr0 + (tmp67 + 32*tmp80 + 2048*x2), None, eviction_policy='evict_last')
tmp90 = tmp89 * tmp76
tmp91 = tmp88 + tmp90
tmp92 = tmp3 - tmp4
tmp93 = triton_helpers.maximum(tmp92, tmp24)
tmp94 = triton_helpers.minimum(tmp93, tmp26)
tmp95 = tmp94 + tmp26
tmp96 = tmp95 * tmp29
tmp97 = tmp96 - tmp31
tmp98 = tmp97 * tmp95
tmp99 = tmp98 + tmp34
tmp100 = tmp99 * tmp95
tmp101 = tmp100 - tmp37
tmp102 = tmp78 * tmp101
tmp103 = tmp94 * tmp43
tmp104 = tmp103 - tmp45
tmp105 = tmp104 * tmp94
tmp106 = tmp105 * tmp94
tmp107 = tmp106 + tmp26
tmp108 = tmp91 * tmp107
tmp109 = tmp102 + tmp108
tmp110 = tmp5 + tmp6
tmp111 = triton_helpers.maximum(tmp110, tmp8)
tmp112 = triton_helpers.minimum(tmp111, tmp10)
tmp113 = tl.load(in_ptr0 + (tmp21 + 32*tmp112 + 2048*x2), None, eviction_policy='evict_last')
tmp114 = tmp113 * tmp38
tmp115 = tl.load(in_ptr0 + (tmp41 + 32*tmp112 + 2048*x2), None, eviction_policy='evict_last')
tmp116 = tmp115 * tmp49
tmp117 = tmp114 + tmp116
tmp118 = tl.load(in_ptr0 + (tmp54 + 32*tmp112 + 2048*x2), None, eviction_policy='evict_last')
tmp119 = tmp118 * tmp61
tmp120 = tmp117 + tmp119
tmp121 = tl.load(in_ptr0 + (tmp67 + 32*tmp112 + 2048*x2), None, eviction_policy='evict_last')
tmp122 = tmp121 * tmp76
tmp123 = tmp120 + tmp122
tmp124 = tmp5 + tmp64
tmp125 = triton_helpers.maximum(tmp124, tmp8)
tmp126 = triton_helpers.minimum(tmp125, tmp10)
tmp127 = tl.load(in_ptr0 + (tmp21 + 32*tmp126 + 2048*x2), None, eviction_policy='evict_last')
tmp128 = tmp127 * tmp38
tmp129 = tl.load(in_ptr0 + (tmp41 + 32*tmp126 + 2048*x2), None, eviction_policy='evict_last')
tmp130 = tmp129 * tmp49
tmp131 = tmp128 + tmp130
tmp132 = tl.load(in_ptr0 + (tmp54 + 32*tmp126 + 2048*x2), None, eviction_policy='evict_last')
tmp133 = tmp132 * tmp61
tmp134 = tmp131 + tmp133
tmp135 = tl.load(in_ptr0 + (tmp67 + 32*tmp126 + 2048*x2), None, eviction_policy='evict_last')
tmp136 = tmp135 * tmp76
tmp137 = tmp134 + tmp136
tmp138 = tmp26 - tmp94
tmp139 = tmp138 * tmp43
tmp140 = tmp139 - tmp45
tmp141 = tmp140 * tmp138
tmp142 = tmp141 * tmp138
tmp143 = tmp142 + tmp26
tmp144 = tmp123 * tmp143
tmp145 = tmp109 + tmp144
tmp146 = tmp69 - tmp94
tmp147 = tmp146 * tmp29
tmp148 = tmp147 - tmp31
tmp149 = tmp148 * tmp146
tmp150 = tmp149 + tmp34
tmp151 = tmp150 * tmp146
tmp152 = tmp151 - tmp37
tmp153 = tmp137 * tmp152
tmp154 = tmp145 + tmp153
tl.store(in_out_ptr0 + (x4), tmp154, None)
''', device_str='xpu')
# kernel path: /tmp/tmpe36k26mw/ne/cnelxk2d7rfu3352ax455ioflfva4f7qk3fnbnotmwmqhyn4mp5e.py
# Topologically Sorted Source Nodes: [upsample_bicubic2d_1], Original ATen: [aten.floor, aten.arange, aten._to_copy, aten.add, aten.mul, aten.sub, aten._unsafe_index, aten.clamp, aten.rsub]
# Source node to ATen node mapping:
# upsample_bicubic2d_1 => _unsafe_index_16, _unsafe_index_17, _unsafe_index_18, _unsafe_index_19, _unsafe_index_20, _unsafe_index_21, _unsafe_index_22, _unsafe_index_23, _unsafe_index_24, _unsafe_index_25, _unsafe_index_26, _unsafe_index_27, _unsafe_index_28, _unsafe_index_29, _unsafe_index_30, _unsafe_index_31, add_29, add_35, add_36, add_37, add_38, add_39, add_40, add_41, add_42, add_43, add_44, add_45, add_46, add_47, add_48, add_49, add_50, add_51, add_52, add_53, add_54, add_55, add_56, add_57, add_58, add_59, clamp_max_34, clamp_max_35, clamp_min_34, clamp_min_35, convert_element_type_5, floor_2, floor_3, iota_3, mul_46, mul_48, mul_49, mul_50, mul_51, mul_52, mul_53, mul_54, mul_55, mul_56, mul_57, mul_58, mul_59, mul_60, mul_61, mul_62, mul_63, mul_64, mul_65, mul_66, mul_67, mul_68, mul_69, mul_70, mul_71, mul_72, mul_73, mul_74, mul_75, mul_76, mul_77, mul_78, mul_79, mul_80, mul_81, mul_82, mul_83, mul_84, mul_85, mul_86, mul_87, mul_88, mul_89, mul_90, mul_91, sub_20, sub_22, sub_23, sub_26, sub_27, sub_28, sub_29, sub_30, sub_31, sub_32, sub_33, sub_34, sub_35, sub_36, sub_37, sub_38, sub_39, sub_40, sub_41
# Graph fragment:
# %floor_3 : [num_users=2] = call_function[target=torch.ops.aten.floor.default](args = (%unsqueeze_1,), kwargs = {})
# %iota_3 : [num_users=1] = call_function[target=torch.ops.prims.iota.default](args = (256,), kwargs = {start: 0, step: 1, dtype: torch.int64, device: xpu:0, requires_grad: False})
# %convert_element_type_5 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%iota_3, torch.float32), kwargs = {})
# %add_29 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%convert_element_type_5, 0.5), kwargs = {})
# %mul_46 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_29, 0.125), kwargs = {})
# %sub_20 : [num_users=2] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_46, 0.5), kwargs = {})
# %floor_2 : [num_users=2] = call_function[target=torch.ops.aten.floor.default](args = (%sub_20,), kwargs = {})
# %_unsafe_index_16 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_36, %clamp_max_37]), kwargs = {})
# %sub_23 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%sub_20, %floor_2), kwargs = {})
# %clamp_min_35 : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub_23, 0.0), kwargs = {})
# %clamp_max_35 : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.default](args = (%clamp_min_35, 1.0), kwargs = {})
# %add_35 : [num_users=3] = call_function[target=torch.ops.aten.add.Tensor](args = (%clamp_max_35, 1.0), kwargs = {})
# %mul_48 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_35, -0.75), kwargs = {})
# %sub_26 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_48, -3.75), kwargs = {})
# %mul_49 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_26, %add_35), kwargs = {})
# %add_36 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_49, -6.0), kwargs = {})
# %mul_50 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_36, %add_35), kwargs = {})
# %sub_27 : [num_users=4] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_50, -3.0), kwargs = {})
# %mul_72 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_16, %sub_27), kwargs = {})
# %_unsafe_index_17 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_38, %clamp_max_39]), kwargs = {})
# %mul_51 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%clamp_max_35, 1.25), kwargs = {})
# %sub_28 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_51, 2.25), kwargs = {})
# %mul_52 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_28, %clamp_max_35), kwargs = {})
# %mul_53 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_52, %clamp_max_35), kwargs = {})
# %add_37 : [num_users=4] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_53, 1), kwargs = {})
# %mul_73 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_17, %add_37), kwargs = {})
# %add_45 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_72, %mul_73), kwargs = {})
# %_unsafe_index_18 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_40, %clamp_max_41]), kwargs = {})
# %sub_29 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %clamp_max_35), kwargs = {})
# %mul_54 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_29, 1.25), kwargs = {})
# %sub_30 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_54, 2.25), kwargs = {})
# %mul_55 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_30, %sub_29), kwargs = {})
# %mul_56 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_55, %sub_29), kwargs = {})
# %add_38 : [num_users=4] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_56, 1), kwargs = {})
# %mul_74 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_18, %add_38), kwargs = {})
# %add_46 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_45, %mul_74), kwargs = {})
# %_unsafe_index_19 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_42, %clamp_max_43]), kwargs = {})
# %sub_31 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (2.0, %clamp_max_35), kwargs = {})
# %mul_57 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_31, -0.75), kwargs = {})
# %sub_32 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_57, -3.75), kwargs = {})
# %mul_58 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_32, %sub_31), kwargs = {})
# %add_39 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_58, -6.0), kwargs = {})
# %mul_59 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_39, %sub_31), kwargs = {})
# %sub_33 : [num_users=4] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_59, -3.0), kwargs = {})
# %mul_75 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_19, %sub_33), kwargs = {})
# %add_47 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_46, %mul_75), kwargs = {})
# %sub_22 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%unsqueeze_1, %floor_3), kwargs = {})
# %clamp_min_34 : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub_22, 0.0), kwargs = {})
# %clamp_max_34 : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.default](args = (%clamp_min_34, 1.0), kwargs = {})
# %add_40 : [num_users=3] = call_function[target=torch.ops.aten.add.Tensor](args = (%clamp_max_34, 1.0), kwargs = {})
# %mul_60 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_40, -0.75), kwargs = {})
# %sub_34 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_60, -3.75), kwargs = {})
# %mul_61 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_34, %add_40), kwargs = {})
# %add_41 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_61, -6.0), kwargs = {})
# %mul_62 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_41, %add_40), kwargs = {})
# %sub_35 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_62, -3.0), kwargs = {})
# %mul_88 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_47, %sub_35), kwargs = {})
# %_unsafe_index_20 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_44, %clamp_max_45]), kwargs = {})
# %mul_76 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_20, %sub_27), kwargs = {})
# %_unsafe_index_21 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_46, %clamp_max_47]), kwargs = {})
# %mul_77 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_21, %add_37), kwargs = {})
# %add_48 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_76, %mul_77), kwargs = {})
# %_unsafe_index_22 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_48, %clamp_max_49]), kwargs = {})
# %mul_78 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_22, %add_38), kwargs = {})
# %add_49 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_48, %mul_78), kwargs = {})
# %_unsafe_index_23 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_50, %clamp_max_51]), kwargs = {})
# %mul_79 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_23, %sub_33), kwargs = {})
# %add_50 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_49, %mul_79), kwargs = {})
# %mul_63 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%clamp_max_34, 1.25), kwargs = {})
# %sub_36 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_63, 2.25), kwargs = {})
# %mul_64 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_36, %clamp_max_34), kwargs = {})
# %mul_65 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_64, %clamp_max_34), kwargs = {})
# %add_42 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_65, 1), kwargs = {})
# %mul_89 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_50, %add_42), kwargs = {})
# %add_57 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_88, %mul_89), kwargs = {})
# %_unsafe_index_24 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_52, %clamp_max_53]), kwargs = {})
# %mul_80 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_24, %sub_27), kwargs = {})
# %_unsafe_index_25 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_54, %clamp_max_55]), kwargs = {})
# %mul_81 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_25, %add_37), kwargs = {})
# %add_51 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_80, %mul_81), kwargs = {})
# %_unsafe_index_26 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_56, %clamp_max_57]), kwargs = {})
# %mul_82 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_26, %add_38), kwargs = {})
# %add_52 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_51, %mul_82), kwargs = {})
# %_unsafe_index_27 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_58, %clamp_max_59]), kwargs = {})
# %mul_83 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_27, %sub_33), kwargs = {})
# %add_53 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_52, %mul_83), kwargs = {})
# %sub_37 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %clamp_max_34), kwargs = {})
# %mul_66 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_37, 1.25), kwargs = {})
# %sub_38 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_66, 2.25), kwargs = {})
# %mul_67 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_38, %sub_37), kwargs = {})
# %mul_68 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_67, %sub_37), kwargs = {})
# %add_43 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_68, 1), kwargs = {})
# %mul_90 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_53, %add_43), kwargs = {})
# %add_58 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_57, %mul_90), kwargs = {})
# %_unsafe_index_28 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_60, %clamp_max_61]), kwargs = {})
# %mul_84 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_28, %sub_27), kwargs = {})
# %_unsafe_index_29 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_62, %clamp_max_63]), kwargs = {})
# %mul_85 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_29, %add_37), kwargs = {})
# %add_54 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_84, %mul_85), kwargs = {})
# %_unsafe_index_30 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_64, %clamp_max_65]), kwargs = {})
# %mul_86 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_30, %add_38), kwargs = {})
# %add_55 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_54, %mul_86), kwargs = {})
# %_unsafe_index_31 : [num_users=1] = call_function[target=torch.ops.aten._unsafe_index.Tensor](args = (%arg0_1, [None, None, %clamp_max_66, %clamp_max_67]), kwargs = {})
# %mul_87 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_unsafe_index_31, %sub_33), kwargs = {})
# %add_56 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_55, %mul_87), kwargs = {})
# %sub_39 : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (2.0, %clamp_max_34), kwargs = {})
# %mul_69 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_39, -0.75), kwargs = {})
# %sub_40 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_69, -3.75), kwargs = {})
# %mul_70 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_40, %sub_39), kwargs = {})
# %add_44 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_70, -6.0), kwargs = {})
# %mul_71 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_44, %sub_39), kwargs = {})
# %sub_41 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul_71, -3.0), kwargs = {})
# %mul_91 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_56, %sub_41), kwargs = {})
# %add_59 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%add_58, %mul_91), kwargs = {})
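# Note: this second kernel is the align_corners=False variant of the same bicubic weights
# (A = -0.75); the source coordinate is computed as src = (dst + 0.5) * scale - 0.5 with
# scale = 0.5 for the height (64/128) and scale = 0.125 for the width (32/256).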
triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_1 = async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_1', '''
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
@triton_heuristics.pointwise(
size_hints={'x': 524288},
filename=__file__,
triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='xpu', index=0, multi_processor_count=56, cc={'architecture': 13136561920, 'driver_version': '1.6.32567+18', 'gpu_eu_count': 448, 'gpu_subslice_count': 56, 'has_atomic64': True, 'has_bfloat16_conversions': True, 'has_fp16': True, 'has_fp64': True, 'has_subgroup_2d_block_io': True, 'has_subgroup_matrix_multiply_accumulate': True, 'has_subgroup_matrix_multiply_accumulate_tensor_float32': False, 'max_compute_units': 448, 'max_num_sub_groups': 64, 'max_work_group_size': 1024, 'name': 'Intel(R) Data Center GPU Max 1100', 'platform_name': 'Intel(R) oneAPI Unified Runtime over Level-Zero', 'sub_group_sizes': [16, 32], 'total_memory': 51539607552, 'type': 'gpu', 'vendor': 'Intel(R) Corporation', 'version': '12.60.7'}, major=None, regs_per_multiprocessor=None, max_threads_per_multi_processor=None, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': '9C4F292F97BD56CD901940BC672A5DEDD5EC916EC29FD8FD818A1266B7DD4045', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_1(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 393216
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = tl.full([XBLOCK], True, tl.int1)
x1 = ((xindex // 256) % 128)
x0 = (xindex % 256)
x2 = xindex // 32768
x4 = xindex
tmp0 = x1
tmp1 = tmp0.to(tl.float32)
tmp2 = 0.5
tmp3 = tmp1 + tmp2
tmp4 = tmp3 * tmp2
tmp5 = tmp4 - tmp2
tmp6 = libdevice.floor(tmp5)
tmp7 = tmp6.to(tl.int32)
tmp8 = tl.full([1], 1, tl.int64)
tmp9 = tmp7 - tmp8
tmp10 = tl.full([1], 0, tl.int64)
tmp11 = triton_helpers.maximum(tmp9, tmp10)
tmp12 = tl.full([1], 63, tl.int64)
tmp13 = triton_helpers.minimum(tmp11, tmp12)
tmp14 = x0
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15 + tmp2
tmp17 = 0.125
tmp18 = tmp16 * tmp17
tmp19 = tmp18 - tmp2
tmp20 = libdevice.floor(tmp19)
tmp21 = tmp20.to(tl.int32)
tmp22 = tmp21 - tmp8
tmp23 = triton_helpers.maximum(tmp22, tmp10)
tmp24 = tl.full([1], 31, tl.int64)
tmp25 = triton_helpers.minimum(tmp23, tmp24)
tmp26 = tl.load(in_ptr0 + (tmp25 + 32*tmp13 + 2048*x2), None, eviction_policy='evict_last')
tmp27 = tmp19 - tmp20
tmp28 = 0.0
tmp29 = triton_helpers.maximum(tmp27, tmp28)
tmp30 = 1.0
tmp31 = triton_helpers.minimum(tmp29, tmp30)
tmp32 = tmp31 + tmp30
tmp33 = -0.75
tmp34 = tmp32 * tmp33
tmp35 = -3.75
tmp36 = tmp34 - tmp35
tmp37 = tmp36 * tmp32
tmp38 = -6.0
tmp39 = tmp37 + tmp38
tmp40 = tmp39 * tmp32
tmp41 = -3.0
tmp42 = tmp40 - tmp41
tmp43 = tmp26 * tmp42
tmp44 = triton_helpers.maximum(tmp21, tmp10)
tmp45 = triton_helpers.minimum(tmp44, tmp24)
tmp46 = tl.load(in_ptr0 + (tmp45 + 32*tmp13 + 2048*x2), None, eviction_policy='evict_last')
tmp47 = 1.25
tmp48 = tmp31 * tmp47
tmp49 = 2.25
tmp50 = tmp48 - tmp49
tmp51 = tmp50 * tmp31
tmp52 = tmp51 * tmp31
tmp53 = tmp52 + tmp30
tmp54 = tmp46 * tmp53
tmp55 = tmp43 + tmp54
tmp56 = tmp21 + tmp8
tmp57 = triton_helpers.maximum(tmp56, tmp10)
tmp58 = triton_helpers.minimum(tmp57, tmp24)
tmp59 = tl.load(in_ptr0 + (tmp58 + 32*tmp13 + 2048*x2), None, eviction_policy='evict_last')
tmp60 = tmp30 - tmp31
tmp61 = tmp60 * tmp47
tmp62 = tmp61 - tmp49
tmp63 = tmp62 * tmp60
tmp64 = tmp63 * tmp60
tmp65 = tmp64 + tmp30
tmp66 = tmp59 * tmp65
tmp67 = tmp55 + tmp66
tmp68 = tl.full([1], 2, tl.int64)
tmp69 = tmp21 + tmp68
tmp70 = triton_helpers.maximum(tmp69, tmp10)
tmp71 = triton_helpers.minimum(tmp70, tmp24)
tmp72 = tl.load(in_ptr0 + (tmp71 + 32*tmp13 + 2048*x2), None, eviction_policy='evict_last')
tmp73 = 2.0
tmp74 = tmp73 - tmp31
tmp75 = tmp74 * tmp33
tmp76 = tmp75 - tmp35
tmp77 = tmp76 * tmp74
tmp78 = tmp77 + tmp38
tmp79 = tmp78 * tmp74
tmp80 = tmp79 - tmp41
tmp81 = tmp72 * tmp80
tmp82 = tmp67 + tmp81
tmp83 = triton_helpers.maximum(tmp7, tmp10)
tmp84 = triton_helpers.minimum(tmp83, tmp12)
tmp85 = tl.load(in_ptr0 + (tmp25 + 32*tmp84 + 2048*x2), None, eviction_policy='evict_last')
tmp86 = tmp85 * tmp42
tmp87 = tl.load(in_ptr0 + (tmp45 + 32*tmp84 + 2048*x2), None, eviction_policy='evict_last')
tmp88 = tmp87 * tmp53
tmp89 = tmp86 + tmp88
tmp90 = tl.load(in_ptr0 + (tmp58 + 32*tmp84 + 2048*x2), None, eviction_policy='evict_last')
tmp91 = tmp90 * tmp65
tmp92 = tmp89 + tmp91
tmp93 = tl.load(in_ptr0 + (tmp71 + 32*tmp84 + 2048*x2), None, eviction_policy='evict_last')
tmp94 = tmp93 * tmp80
tmp95 = tmp92 + tmp94
tmp96 = tmp5 - tmp6
tmp97 = triton_helpers.maximum(tmp96, tmp28)
tmp98 = triton_helpers.minimum(tmp97, tmp30)
tmp99 = tmp98 + tmp30
tmp100 = tmp99 * tmp33
tmp101 = tmp100 - tmp35
tmp102 = tmp101 * tmp99
tmp103 = tmp102 + tmp38
tmp104 = tmp103 * tmp99
tmp105 = tmp104 - tmp41
tmp106 = tmp82 * tmp105
tmp107 = tmp98 * tmp47
tmp108 = tmp107 - tmp49
tmp109 = tmp108 * tmp98
tmp110 = tmp109 * tmp98
tmp111 = tmp110 + tmp30
tmp112 = tmp95 * tmp111
tmp113 = tmp106 + tmp112
tmp114 = tmp7 + tmp8
tmp115 = triton_helpers.maximum(tmp114, tmp10)
tmp116 = triton_helpers.minimum(tmp115, tmp12)
tmp117 = tl.load(in_ptr0 + (tmp25 + 32*tmp116 + 2048*x2), None, eviction_policy='evict_last')
tmp118 = tmp117 * tmp42
tmp119 = tl.load(in_ptr0 + (tmp45 + 32*tmp116 + 2048*x2), None, eviction_policy='evict_last')
tmp120 = tmp119 * tmp53
tmp121 = tmp118 + tmp120
tmp122 = tl.load(in_ptr0 + (tmp58 + 32*tmp116 + 2048*x2), None, eviction_policy='evict_last')
tmp123 = tmp122 * tmp65
tmp124 = tmp121 + tmp123
tmp125 = tl.load(in_ptr0 + (tmp71 + 32*tmp116 + 2048*x2), None, eviction_policy='evict_last')
tmp126 = tmp125 * tmp80
tmp127 = tmp124 + tmp126
tmp128 = tmp7 + tmp68
tmp129 = triton_helpers.maximum(tmp128, tmp10)
tmp130 = triton_helpers.minimum(tmp129, tmp12)
tmp131 = tl.load(in_ptr0 + (tmp25 + 32*tmp130 + 2048*x2), None, eviction_policy='evict_last')
tmp132 = tmp131 * tmp42
tmp133 = tl.load(in_ptr0 + (tmp45 + 32*tmp130 + 2048*x2), None, eviction_policy='evict_last')
tmp134 = tmp133 * tmp53
tmp135 = tmp132 + tmp134
tmp136 = tl.load(in_ptr0 + (tmp58 + 32*tmp130 + 2048*x2), None, eviction_policy='evict_last')
tmp137 = tmp136 * tmp65
tmp138 = tmp135 + tmp137
tmp139 = tl.load(in_ptr0 + (tmp71 + 32*tmp130 + 2048*x2), None, eviction_policy='evict_last')
tmp140 = tmp139 * tmp80
tmp141 = tmp138 + tmp140
tmp142 = tmp30 - tmp98
tmp143 = tmp142 * tmp47
tmp144 = tmp143 - tmp49
tmp145 = tmp144 * tmp142
tmp146 = tmp145 * tmp142
tmp147 = tmp146 + tmp30
tmp148 = tmp127 * tmp147
tmp149 = tmp113 + tmp148
tmp150 = tmp73 - tmp98
tmp151 = tmp150 * tmp33
tmp152 = tmp151 - tmp35
tmp153 = tmp152 * tmp150
tmp154 = tmp153 + tmp38
tmp155 = tmp154 * tmp150
tmp156 = tmp155 - tmp41
tmp157 = tmp141 * tmp156
tmp158 = tmp149 + tmp157
tl.store(in_out_ptr0 + (x4), tmp158, None)
''', device_str='xpu')
async_compile.wait(globals())
del async_compile
def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (4, 3, 64, 32), (6144, 2048, 32, 1))
    with torch.xpu._DeviceGuard(0):
        torch.xpu.set_device(0)
        buf0 = empty_strided_xpu((4, 3, 128, 128), (49152, 16384, 128, 1), torch.float32)
        buf1 = buf0; del buf0  # reuse
        buf2 = buf1; del buf1  # reuse
        buf6 = buf2; del buf2  # reuse
        buf13 = buf6; del buf6  # reuse
        # Topologically Sorted Source Nodes: [upsample_bicubic2d], Original ATen: [aten.floor, aten.arange, aten._to_copy, aten.mul, aten._unsafe_index, aten.sub, aten.clamp, aten.add, aten.rsub]
        stream0 = get_raw_stream(0)
        triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_0.run(buf13, arg0_1, 196608, stream=stream0)
        buf14 = empty_strided_xpu((4, 3, 128, 256), (98304, 32768, 256, 1), torch.float32)
        buf15 = buf14; del buf14  # reuse
        buf16 = buf15; del buf15  # reuse
        buf20 = buf16; del buf16  # reuse
        buf27 = buf20; del buf20  # reuse
        # Topologically Sorted Source Nodes: [upsample_bicubic2d_1], Original ATen: [aten.floor, aten.arange, aten._to_copy, aten.add, aten.mul, aten.sub, aten._unsafe_index, aten.clamp, aten.rsub]
        stream0 = get_raw_stream(0)
        triton_poi_fused__to_copy__unsafe_index_add_arange_clamp_floor_mul_rsub_sub_1.run(buf27, arg0_1, 393216, stream=stream0)
        del arg0_1
    return (buf13, buf27, )

def main():
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    torch.manual_seed(42)
    arg0_1 = rand_strided((4, 3, 64, 32), (6144, 2048, 32, 1), device='xpu:0', dtype=torch.float32)

    def fn(a):
        return (
            aten.upsample_bicubic2d(a, (128, 128), True),
            aten.upsample_bicubic2d(a, (128, 256), False),
        )

    res = call([arg0_1])
    ref = fn(arg0_1)
    torch.testing.assert_close(res[0], ref[0], atol=0.05, rtol=0.05)
    torch.testing.assert_close(res[1], ref[1], atol=0.05, rtol=0.05)

if __name__ == "__main__":
    main()
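Running the script above on an XPU build that includes f4eb329 is expected to reproduce the mismatch: call() runs the two Inductor-generated kernels and main() compares their outputs against eager aten.upsample_bicubic2d with atol=rtol=0.05, which is the comparison that raises "Tensor-likes are not close!" in the test.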