//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//

// Module-level directives. The stray `|` / `||` lines that were interleaved
// here were residue from a web file viewer, not PTX, and have been removed.
.version 8.5                            // PTX ISA version this module is written against
.target sm_86, texmode_independent      // target architecture and texturing mode
.address_size 64                        // 64-bit addresses (matches the .u64 pointer params below)

// .globl ConcatImageFromPlanarKernel

//-----------------------------------------------------------------------------
// ConcatImageFromPlanarKernel
//
// One thread handles one (x, y) position of a 2048 x 2048 grid and writes one
// 4 x f32 vector to param_2. The vector is assembled from:
//   * three f32 values read from param_0 at the same linear index in three
//     consecutive planes (each plane is 2048 * 2048 * 4 = 16777216 bytes), and
//   * the 4th component of the 4 x f32 texel read from param_1, replaced by
//     0.0 when any of that texel's four components is +/-Inf or NaN.
//
// Parameters (all 64-bit pointers into .global memory):
//   param_0  (.align 4)   planar f32 source, three planes back to back
//   param_1  (.align 16)  4 x f32 source; only its 4th component reaches the
//                         output, components 1-3 are used for the finiteness
//                         test only
//   param_2  (.align 16)  4 x f32 destination
//
// Coordinates:
//   x = %ntid.x * %ctaid.x + %tid.x + %envreg3
//   y = %ntid.y * %ctaid.y + %tid.y + %envreg4
//   NOTE(review): %envreg3 / %envreg4 are driver-defined registers; they are
//   presumably the OpenCL global work offsets for dimensions 0 and 1 - confirm.
//
// The instruction stream below is compiler output and is kept byte-for-byte;
// only comments were added and the web-viewer `|` / `||` residue removed.
//-----------------------------------------------------------------------------
.entry ConcatImageFromPlanarKernel(
    .param .u64 .ptr .global .align 4 ConcatImageFromPlanarKernel_param_0,
    .param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_1,
    .param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_2
)
{
    .reg .pred %p<11>;
    .reg .f32 %f<17>;
    .reg .b32 %r<23>;
    .reg .b64 %rd<10>;


    // ---- Load the three buffer base addresses -----------------------------
    ld.param.u64 %rd1, [ConcatImageFromPlanarKernel_param_0];
    ld.param.u64 %rd2, [ConcatImageFromPlanarKernel_param_1];
    ld.param.u64 %rd3, [ConcatImageFromPlanarKernel_param_2];

    // ---- x = ntid.x * ctaid.x + tid.x + envreg3  ->  %r1 -------------------
    mov.b32 %r3, %envreg3;
    mov.u32 %r4, %ctaid.x;
    mov.u32 %r5, %ntid.x;
    mov.u32 %r6, %tid.x;
    add.s32 %r7, %r6, %r3;
    mad.lo.s32 %r1, %r5, %r4, %r7;

    // ---- y = ntid.y * ctaid.y + tid.y + envreg4  ->  %r2 -------------------
    mov.u32 %r8, %ctaid.y;
    mov.u32 %r9, %ntid.y;
    mov.u32 %r10, %tid.y;
    mov.b32 %r11, %envreg4;
    add.s32 %r12, %r10, %r11;
    mad.lo.s32 %r2, %r9, %r8, %r12;

    // ---- Bounds guard: leave if x > 2047 or y > 2047 -----------------------
    // NOTE(review): both compares are signed and only test the upper bound, so
    // a negative x or y is NOT rejected here; it would yield a negative,
    // sign-extended index for the param_0 loads and the param_2 store below.
    // Confirm the launch configuration can never produce negative ids.
    setp.gt.s32 %p1, %r1, 2047;
    setp.gt.s32 %p2, %r2, 2047;
    or.pred %p3, %p1, %p2;
    @%p3 bra $L__BB0_2;

    // ---- Linear index idx = (y << 11) + x  (row pitch 2048)  ->  %r14 ------
    shl.b32 %r13, %r2, 11;
    add.s32 %r14, %r13, %r1;
    // %rd5 = param_0 + idx * 4 (signed widening multiply)
    mul.wide.s32 %rd4, %r14, 4;
    add.s64 %rd5, %rd1, %rd4;

    // ---- param_1 address uses coordinates clamped to [0, 2047] -------------
    max.s32 %r15, %r1, 0;
    min.s32 %r16, %r15, 2047;           // cx
    max.s32 %r17, %r2, 0;
    min.s32 %r18, %r17, 2047;           // cy
    shl.b32 %r19, %r18, 11;
    // OR equals ADD here: cx fits in the low 11 bits, which (cy << 11) leaves 0.
    or.b32 %r20, %r19, %r16;
    // %rd7 = param_1 + clamped_idx * 16 (unsigned widening multiply)
    mul.wide.u32 %rd6, %r20, 16;
    add.s64 %rd7, %rd2, %rd6;
    ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd7];

    // ---- Finiteness test on all four components of the param_1 texel -------
    // 0f7F800000 is +Inf. setp.geu is true for ">=" OR unordered, so each
    // predicate is set when |component| is Inf or the component is NaN.
    abs.ftz.f32 %f6, %f1;
    abs.ftz.f32 %f8, %f2;
    abs.ftz.f32 %f10, %f3;
    abs.ftz.f32 %f12, %f4;
    setp.geu.ftz.f32 %p4, %f10, 0f7F800000;
    setp.geu.ftz.f32 %p5, %f12, 0f7F800000;
    setp.geu.ftz.f32 %p6, %f8, 0f7F800000;
    setp.geu.ftz.f32 %p7, %f6, 0f7F800000;
    or.pred %p8, %p7, %p6;              // component 1 or 2 non-finite
    selp.b32 %r21, -1, 0, %p8;
    or.pred %p9, %p5, %p4;              // component 3 or 4 non-finite
    selp.b32 %r22, -1, %r21, %p9;       // %r22 = -1 if any component non-finite, else 0
    setp.gt.s32 %p10, %r22, -1;         // %p10 = all four components finite

    // ---- Destination address: %rd9 = param_2 + idx * 16 --------------------
    mul.wide.s32 %rd8, %r14, 16;
    add.s64 %rd9, %rd3, %rd8;

    // 4th output component: the texel's 4th component if it is fully finite, else 0.0
    selp.f32 %f13, %f4, 0f00000000, %p10;

    // ---- Gather the three planar values from param_0 -----------------------
    // Plane stride is 16777216 bytes = 2048 * 2048 * sizeof(f32).
    ld.global.nc.f32 %f14, [%rd5+33554432];   // plane 2
    ld.global.nc.f32 %f15, [%rd5+16777216];   // plane 1
    ld.global.nc.f32 %f16, [%rd5];            // plane 0

    // ---- Store {plane0, plane1, plane2, filtered 4th component} ------------
    st.global.v4.f32 [%rd9], {%f16, %f15, %f14, %f13};

$L__BB0_2:
    ret;

}