// Source: VirtualFramework/kernelsCache/ConcatImageFromPlanar-4e4df6c2.v.01070000BBCFD912.NVIDIA GeForce RTX 3070 Ti.560.94_64.bin
// Last modified: 2025-03-07 17:23:20 +08:00
// Web-viewer metadata at capture time: 86 lines, 2.2 KiB, Plaintext.
//
// Web-viewer warning at capture time: this file contains invisible Unicode
// characters that are indistinguishable to humans but may be processed
// differently by a computer. If this is intentional, the warning can be
// safely ignored.

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
.version 8.5
.target sm_86, texmode_independent
.address_size 64
// .globl ConcatImageFromPlanarKernel
// ---------------------------------------------------------------------------
// ConcatImageFromPlanarKernel
//
// One thread per (x, y) position. Threads with x > 2047 or y > 2047 exit
// without touching memory. Every other thread:
//   * reads three f32 values from param_0 at element index (y * 2048 + x),
//     at byte offsets +0, +16777216 and +33554432 from that element
//     (16777216 = 2048 * 2048 * 4, i.e. one 2048x2048 plane of f32),
//   * reads one v4.f32 from param_1 at a coordinate clamped to [0, 2047],
//   * writes one v4.f32 to param_2 at element index (y * 2048 + x):
//       { plane0, plane1, plane2, w }
//     where w is the 4th component of the param_1 value if all four of its
//     components are finite, and 0.0 if any of them is +/-Inf or NaN.
//
// Parameters (all 64-bit pointers into the global address space):
//   param_0  4-byte aligned;  read as scalar f32.
//   param_1  16-byte aligned; read as v4.f32.
//   param_2  16-byte aligned; written as v4.f32.
//
// NOTE(review): presumably this is an OpenCL kernel and %envreg3 / %envreg4
// carry the global work offset for dimensions 0 / 1 -- %envreg contents are
// driver-defined, so confirm against the NVIDIA OpenCL runtime.
// ---------------------------------------------------------------------------
.entry ConcatImageFromPlanarKernel(
.param .u64 .ptr .global .align 4 ConcatImageFromPlanarKernel_param_0,
.param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_1,
.param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_2
)
{
// Virtual register declarations (predicates, f32, 32-bit and 64-bit).
.reg .pred %p<11>;
.reg .f32 %f<17>;
.reg .b32 %r<23>;
.reg .b64 %rd<10>;
// Load the three pointer arguments: %rd1 = param_0, %rd2 = param_1, %rd3 = param_2.
ld.param.u64 %rd1, [ConcatImageFromPlanarKernel_param_0];
ld.param.u64 %rd2, [ConcatImageFromPlanarKernel_param_1];
ld.param.u64 %rd3, [ConcatImageFromPlanarKernel_param_2];
// x coordinate: %r1 = %ntid.x * %ctaid.x + %tid.x + %envreg3.
mov.b32 %r3, %envreg3;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r6, %r3;
mad.lo.s32 %r1, %r5, %r4, %r7;
// y coordinate: %r2 = %ntid.y * %ctaid.y + %tid.y + %envreg4.
mov.u32 %r8, %ctaid.y;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %tid.y;
mov.b32 %r11, %envreg4;
add.s32 %r12, %r10, %r11;
mad.lo.s32 %r2, %r9, %r8, %r12;
// Bounds guard: skip to the exit label when x > 2047 or y > 2047 (signed compare).
setp.gt.s32 %p1, %r1, 2047;
setp.gt.s32 %p2, %r2, 2047;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_2;
// Linear element index %r14 = (y << 11) + x = y * 2048 + x.
// NOTE(review): only the upper bound was checked above, and this index is not
// clamped at zero the way the param_1 index below is -- confirm x and y can
// never be negative here.
shl.b32 %r13, %r2, 11;
add.s32 %r14, %r13, %r1;
// %rd5 = param_0 + index * 4 (sign-extending widening multiply, 4-byte elements).
mul.wide.s32 %rd4, %r14, 4;
add.s64 %rd5, %rd1, %rd4;
// Clamp x and y to [0, 2047] for the param_1 lookup. After the guard above
// both are already <= 2047, so only the lower clamp can change the value.
max.s32 %r15, %r1, 0;
min.s32 %r16, %r15, 2047;
max.s32 %r17, %r2, 0;
min.s32 %r18, %r17, 2047;
// Clamped index %r20 = (cy << 11) | cx; OR equals ADD here because cx fits in
// the low 11 bits that the shift leaves clear.
shl.b32 %r19, %r18, 11;
or.b32 %r20, %r19, %r16;
// %rd7 = param_1 + clamped index * 16 (16-byte v4.f32 elements).
mul.wide.u32 %rd6, %r20, 16;
add.s64 %rd7, %rd2, %rd6;
// Read the four-component value through the non-coherent (.nc) load path.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd7];
// Non-finite test: for each component, |c| >= +Inf or unordered
// (0f7F800000 is +Inf; .geu is also true when the operand is NaN).
abs.ftz.f32 %f6, %f1;
abs.ftz.f32 %f8, %f2;
abs.ftz.f32 %f10, %f3;
abs.ftz.f32 %f12, %f4;
setp.geu.ftz.f32 %p4, %f10, 0f7F800000;
setp.geu.ftz.f32 %p5, %f12, 0f7F800000;
setp.geu.ftz.f32 %p6, %f8, 0f7F800000;
setp.geu.ftz.f32 %p7, %f6, 0f7F800000;
// %r22 = -1 if any of the four components is non-finite, otherwise 0.
or.pred %p8, %p7, %p6;
selp.b32 %r21, -1, 0, %p8;
or.pred %p9, %p5, %p4;
selp.b32 %r22, -1, %r21, %p9;
// %p10 is true only when every component is finite (%r22 == 0).
setp.gt.s32 %p10, %r22, -1;
// %rd9 = param_2 + index * 16 (same unclamped index as the param_0 reads).
mul.wide.s32 %rd8, %r14, 16;
add.s64 %rd9, %rd3, %rd8;
// 4th output component: the 4th param_1 component when all are finite, else 0.0.
selp.f32 %f13, %f4, 0f00000000, %p10;
// Gather the three param_0 values, one 2048*2048*4-byte plane apart:
// %f16 = offset 0, %f15 = offset 16777216, %f14 = offset 33554432.
ld.global.nc.f32 %f14, [%rd5+33554432];
ld.global.nc.f32 %f15, [%rd5+16777216];
ld.global.nc.f32 %f16, [%rd5];
// Store the interleaved result as one 16-byte vector write.
st.global.v4.f32 [%rd9], {%f16, %f15, %f14, %f13};
// Common exit for out-of-range threads and threads that completed the store.
$L__BB0_2:
ret;
}