1328 lines
37 KiB
Plaintext
1328 lines
37 KiB
Plaintext
//
|
||
// Generated by NVIDIA NVVM Compiler
|
||
//
|
||
// Compiler Build ID: UNKNOWN
|
||
// Unknown Toolkit Version
|
||
// Based on NVVM 7.0.1
|
||
//
|
||
|
||
.version 8.5
|
||
.target sm_86, texmode_independent
|
||
.address_size 64
|
||
|
||
// .globl BackwardPUE
|
||
|
||
// Entry point BackwardPUE(param_0, param_1, param_2).
//   param_0: .u64 global pointer (declared 16-byte aligned); one v4.f32 is loaded from it per thread.
//   param_1: .u64 global pointer (declared 16-byte aligned); one v4.f32 is stored to it per thread.
//   param_2: .u32 value; when it is 0 the kernel branches straight to the final ret and stores nothing.
// Each thread handles the element at linear index (y << 11) + x, with x in %r1 and y in %r2.
// Threads with x > 2047 or y > 2047 exit without loading or storing.
.entry BackwardPUE(
|
||
.param .u64 .ptr .global .align 16 BackwardPUE_param_0,
|
||
.param .u64 .ptr .global .align 16 BackwardPUE_param_1,
|
||
.param .u32 BackwardPUE_param_2
|
||
)
|
||
{
|
||
.reg .pred %p<81>;
|
||
.reg .f32 %f<366>;
|
||
.reg .b32 %r<60>;
|
||
.reg .b64 %rd<9>;
|
||
|
||
|
||
// Load the input pointer and param_2; leave immediately when param_2 == 0.
ld.param.u64 %rd2, [BackwardPUE_param_0];
|
||
ld.param.u32 %r3, [BackwardPUE_param_2];
|
||
setp.eq.s32 %p4, %r3, 0;
|
||
@%p4 bra $L__BB0_60;
|
||
|
||
// x (%r1) = %tid.x + %envreg3 + %ntid.x * %ctaid.x
// NOTE(review): %envreg3 / %envreg4 are presumably the per-dimension global work offsets supplied by the driver - confirm.
mov.b32 %r4, %envreg3;
|
||
mov.u32 %r5, %ctaid.x;
|
||
mov.u32 %r6, %ntid.x;
|
||
mov.u32 %r7, %tid.x;
|
||
add.s32 %r8, %r7, %r4;
|
||
mad.lo.s32 %r1, %r6, %r5, %r8;
|
||
// y (%r2) = %tid.y + %envreg4 + %ntid.y * %ctaid.y
mov.u32 %r9, %ctaid.y;
|
||
mov.u32 %r10, %ntid.y;
|
||
mov.u32 %r11, %tid.y;
|
||
mov.b32 %r12, %envreg4;
|
||
add.s32 %r13, %r11, %r12;
|
||
mad.lo.s32 %r2, %r10, %r9, %r13;
|
||
// Bounds check with signed compares: only the upper bound (2047) is tested for x and y.
setp.gt.s32 %p5, %r1, 2047;
|
||
setp.gt.s32 %p6, %r2, 2047;
|
||
or.pred %p7, %p5, %p6;
|
||
@%p7 bra $L__BB0_60;
|
||
|
||
// Linear element index %r15 = (y << 11) + x; byte offset = index * 16 (four f32 per element).
// %rd1 keeps the sign-extended index for the store address at the end of the kernel.
shl.b32 %r14, %r2, 11;
|
||
add.s32 %r15, %r14, %r1;
|
||
cvt.s64.s32 %rd1, %r15;
|
||
mul.wide.s32 %rd4, %r15, 16;
|
||
add.s64 %rd5, %rd2, %rd4;
|
||
// Vector load of the four components through the non-coherent (.nc) path.
ld.global.nc.v4.f32 {%f68, %f69, %f70, %f71}, [%rd5];
|
||
// Non-finite screening: setp.geu against +Inf (0f7F800000) is true for |v| == Inf and for NaN (unordered).
abs.ftz.f32 %f73, %f68;
|
||
abs.ftz.f32 %f75, %f69;
|
||
abs.ftz.f32 %f77, %f70;
|
||
abs.ftz.f32 %f79, %f71;
|
||
setp.geu.ftz.f32 %p8, %f77, 0f7F800000;
|
||
setp.geu.ftz.f32 %p9, %f79, 0f7F800000;
|
||
setp.geu.ftz.f32 %p10, %f75, 0f7F800000;
|
||
setp.geu.ftz.f32 %p11, %f73, 0f7F800000;
|
||
// %r17 = -1 if any of the four components is Inf/NaN, else 0; %p14 is true only when all four are finite.
or.pred %p12, %p11, %p10;
|
||
selp.b32 %r16, -1, 0, %p12;
|
||
or.pred %p13, %p9, %p8;
|
||
selp.b32 %r17, -1, %r16, %p13;
|
||
setp.gt.s32 %p14, %r17, -1;
|
||
// If any component was non-finite, all four working values (%f1..%f4) become 0.0.
selp.f32 %f4, %f71, 0f00000000, %p14;
|
||
selp.f32 %f3, %f70, 0f00000000, %p14;
|
||
selp.f32 %f2, %f69, 0f00000000, %p14;
|
||
selp.f32 %f1, %f68, 0f00000000, %p14;
|
||
// Scale the first three components by 0f3FF69738 (about 1.9265). The scaled values %f6/%f8/%f10
// are used only in the segment-selection compares that follow; the arithmetic uses %f1/%f2/%f3.
mul.ftz.f32 %f6, %f1, 0f3FF69738;
|
||
mul.ftz.f32 %f8, %f2, 0f3FF69738;
|
||
mul.ftz.f32 %f10, %f3, 0f3FF69738;
|
||
// ---- First component: %f1 -> %f357, segment chosen by %f6 = %f1 * 0f3FF69738 ----
//   %f6 <= 0f3B123EA5 (about 0.00223) : linear segment       ($L__BB0_3)
//   %f6 <= 0f3EBDF067 (about 0.371)   : power segment        ($L__BB0_5)
//   otherwise                         : exponential segment  ($L__BB0_20)
// setp.gtu is also true for unordered operands, so a NaN would take the "otherwise" path.
setp.gtu.ftz.f32 %p15, %f6, 0f3B123EA5;
|
||
@%p15 bra $L__BB0_4;
|
||
bra.uni $L__BB0_3;
|
||
|
||
$L__BB0_4:
|
||
setp.gtu.ftz.f32 %p16, %f6, 0f3EBDF067;
|
||
@%p16 bra $L__BB0_20;
|
||
bra.uni $L__BB0_5;
|
||
|
||
// Exponential segment: %f357 = ex2((%f1 * 0f411FFF18 + 0fC0A5DF51) * 0f3FB8AA3B) + 0fBBCD22E0,
// i.e. roughly exp(9.9998 * v - 5.1835) - 0.00626 (0f3FB8AA3B is log2(e)).
$L__BB0_20:
|
||
fma.rn.ftz.f32 %f165, %f1, 0f411FFF18, 0fC0A5DF51;
|
||
mul.ftz.f32 %f166, %f165, 0f3FB8AA3B;
|
||
ex2.approx.ftz.f32 %f167, %f166;
|
||
add.ftz.f32 %f357, %f167, 0fBBCD22E0;
|
||
bra.uni $L__BB0_21;
|
||
|
||
// Linear segment: %f357 = %f1 * 0f3AB2B981 (about 0.0013636).
$L__BB0_3:
|
||
mul.ftz.f32 %f357, %f1, 0f3AB2B981;
|
||
bra.uni $L__BB0_21;
|
||
|
||
// Power segment. base %f12 = %f1 * 0f3F95D17C + 0fBAEA3C23 (about 1.1705 * v - 0.001787);
// exponent y = 0f40145BFF (about 2.3181).
// NOTE(review): everything from here to $L__BB0_19 has the shape of an inlined powf(base, y)
// expansion (log of |base|, multiply by y, exp, then special-case fix-ups) - confirm against the kernel source.
$L__BB0_5:
|
||
fma.rn.ftz.f32 %f12, %f1, 0f3F95D17C, 0fBAEA3C23;
|
||
// %f13 = |y - 2 * trunc(y / 2)| (0f3F945BFF is y / 2); compared with 1.0 later to form %p23.
mov.f32 %f83, 0f3F945BFF;
|
||
cvt.rzi.f32.f32 %f84, %f83;
|
||
add.ftz.f32 %f85, %f84, %f84;
|
||
mov.f32 %f86, 0f40145BFF;
|
||
sub.ftz.f32 %f87, %f86, %f85;
|
||
abs.ftz.f32 %f13, %f87;
|
||
// Split |base| into a mantissa m (%f88, exponent bits forced to 0x3F800000 so m is in [1,2))
// and an exponent e (%f90 = raw exponent field - 127).
abs.ftz.f32 %f14, %f12;
|
||
mov.b32 %r18, %f14;
|
||
and.b32 %r19, %r18, 8388607;
|
||
or.b32 %r20, %r19, 1065353216;
|
||
mov.b32 %f88, %r20;
|
||
shr.u32 %r21, %r18, 23;
|
||
cvt.rn.f32.u32 %f89, %r21;
|
||
add.ftz.f32 %f90, %f89, 0fC2FE0000;
|
||
// If m > 0f3FB504F3 (about sqrt(2)), continue with m / 2 and e + 1 instead.
setp.gt.ftz.f32 %p17, %f88, 0f3FB504F3;
|
||
mul.ftz.f32 %f91, %f88, 0f3F000000;
|
||
add.ftz.f32 %f92, %f90, 0f3F800000;
|
||
selp.f32 %f93, %f92, %f90, %p17;
|
||
selp.f32 %f94, %f91, %f88, %p17;
|
||
// s (%f97) = 2 * (m - 1) / (m + 1) using an approximate reciprocal; a polynomial in s*s (%f98)
// follows, and the surrounding add/sub pairs carry the rounding error as a separate low part.
add.ftz.f32 %f95, %f94, 0fBF800000;
|
||
add.ftz.f32 %f81, %f94, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f80,%f81;
|
||
// end inline asm
|
||
add.ftz.f32 %f96, %f95, %f95;
|
||
mul.ftz.f32 %f97, %f80, %f96;
|
||
mul.ftz.f32 %f98, %f97, %f97;
|
||
mov.f32 %f99, 0f3C4CAF63;
|
||
mov.f32 %f100, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f101, %f100, %f98, %f99;
|
||
mov.f32 %f102, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f103, %f101, %f98, %f102;
|
||
mul.rn.ftz.f32 %f104, %f103, %f98;
|
||
mul.rn.ftz.f32 %f105, %f104, %f97;
|
||
sub.ftz.f32 %f106, %f95, %f97;
|
||
add.ftz.f32 %f107, %f106, %f106;
|
||
neg.ftz.f32 %f108, %f97;
|
||
fma.rn.ftz.f32 %f109, %f108, %f95, %f107;
|
||
mul.rn.ftz.f32 %f110, %f80, %f109;
|
||
add.ftz.f32 %f111, %f105, %f97;
|
||
sub.ftz.f32 %f112, %f97, %f111;
|
||
add.ftz.f32 %f113, %f105, %f112;
|
||
add.ftz.f32 %f114, %f110, %f113;
|
||
add.ftz.f32 %f115, %f111, %f114;
|
||
sub.ftz.f32 %f116, %f111, %f115;
|
||
add.ftz.f32 %f117, %f114, %f116;
|
||
// Add e * ln(2), with ln(2) split into a high part (0f3F317200) and a low part (0f35BFBE8E).
// The sum is kept as the high/low pair (%f127, %f129).
mov.f32 %f118, 0f3F317200;
|
||
mul.rn.ftz.f32 %f119, %f93, %f118;
|
||
mov.f32 %f120, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f121, %f93, %f120;
|
||
add.ftz.f32 %f122, %f119, %f115;
|
||
sub.ftz.f32 %f123, %f119, %f122;
|
||
add.ftz.f32 %f124, %f115, %f123;
|
||
add.ftz.f32 %f125, %f117, %f124;
|
||
add.ftz.f32 %f126, %f121, %f125;
|
||
add.ftz.f32 %f127, %f122, %f126;
|
||
sub.ftz.f32 %f128, %f122, %f127;
|
||
add.ftz.f32 %f129, %f126, %f128;
|
||
// Multiply the pair by y (%f130; a pre-scaled constant is selected only if |y| > 0f77F684DF).
// The fma with the negated product recovers the multiply's rounding error; result pair is (%f137, %f140).
abs.ftz.f32 %f15, %f86;
|
||
setp.gt.ftz.f32 %p18, %f15, 0f77F684DF;
|
||
selp.f32 %f130, 0f39945BFF, 0f40145BFF, %p18;
|
||
mul.rn.ftz.f32 %f131, %f130, %f127;
|
||
neg.ftz.f32 %f132, %f131;
|
||
fma.rn.ftz.f32 %f133, %f130, %f127, %f132;
|
||
fma.rn.ftz.f32 %f134, %f130, %f129, %f133;
|
||
mov.f32 %f135, 0f00000000;
|
||
fma.rn.ftz.f32 %f136, %f135, %f127, %f134;
|
||
add.rn.ftz.f32 %f137, %f131, %f136;
|
||
neg.ftz.f32 %f138, %f137;
|
||
add.rn.ftz.f32 %f139, %f131, %f138;
|
||
add.rn.ftz.f32 %f140, %f139, %f136;
|
||
// When the high part's bit pattern equals 1118925336 (0x42B17218), step it down by one ulp
// and add 0f37000000 to the low part instead.
mov.b32 %r22, %f137;
|
||
setp.eq.s32 %p19, %r22, 1118925336;
|
||
add.s32 %r23, %r22, -1;
|
||
mov.b32 %f141, %r23;
|
||
add.ftz.f32 %f142, %f140, 0f37000000;
|
||
selp.f32 %f16, %f142, %f140, %p19;
|
||
selp.f32 %f143, %f141, %f137, %p19;
|
||
// Exponential of the high part t (%f143): n = trunc(t * log2(e)), magnitude clamped to 126
// (0f42FC0000 / 1123811328), then r = t - n * ln(2) applied in two fma steps.
mov.f32 %f144, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f145, %f143, %f144;
|
||
cvt.rzi.f32.f32 %f146, %f145;
|
||
abs.ftz.f32 %f147, %f146;
|
||
setp.gt.ftz.f32 %p20, %f147, 0f42FC0000;
|
||
mov.b32 %r24, %f146;
|
||
and.b32 %r25, %r24, -2147483648;
|
||
or.b32 %r26, %r25, 1123811328;
|
||
mov.b32 %f148, %r26;
|
||
selp.f32 %f149, %f148, %f146, %p20;
|
||
mov.f32 %f150, 0fBF317218;
|
||
fma.rn.ftz.f32 %f151, %f149, %f150, %f143;
|
||
mov.f32 %f152, 0f3102E308;
|
||
fma.rn.ftz.f32 %f153, %f149, %f152, %f151;
|
||
mul.ftz.f32 %f154, %f153, 0f3FB8AA3B;
|
||
// Build the power-of-two scale by shifting the low bits of (n + 0f4B40007F) into the exponent
// field, then multiply ex2.approx(r * log2(e)) by it.
add.ftz.f32 %f155, %f149, 0f4B40007F;
|
||
mov.b32 %r27, %f155;
|
||
shl.b32 %r28, %r27, 23;
|
||
mov.b32 %f156, %r28;
|
||
ex2.approx.ftz.f32 %f157, %f154;
|
||
mul.ftz.f32 %f17, %f157, %f156;
|
||
// If the result is +Inf keep +Inf; otherwise fold in the low part: %f356 = %f17 + %f17 * %f16.
setp.eq.ftz.f32 %p21, %f17, 0f7F800000;
|
||
mov.f32 %f356, 0f7F800000;
|
||
@%p21 bra $L__BB0_7;
|
||
|
||
fma.rn.ftz.f32 %f356, %f17, %f16, %f17;
|
||
|
||
// Special cases. %p23 = (%f13 == 1.0); %p1 = (base < 0) && %p23. Branch on base == 0.
$L__BB0_7:
|
||
setp.lt.ftz.f32 %p22, %f12, 0f00000000;
|
||
setp.eq.ftz.f32 %p23, %f13, 0f3F800000;
|
||
and.pred %p1, %p22, %p23;
|
||
setp.eq.ftz.f32 %p24, %f12, 0f00000000;
|
||
@%p24 bra $L__BB0_11;
|
||
bra.uni $L__BB0_8;
|
||
|
||
// base == 0: result is base + base when %p23 holds, otherwise +0.0.
$L__BB0_11:
|
||
add.ftz.f32 %f162, %f12, %f12;
|
||
selp.f32 %f356, %f162, 0f00000000, %p23;
|
||
bra.uni $L__BB0_12;
|
||
|
||
// base >= 0 (or unordered) skips ahead. For base < 0: flip the sign bit when %p1 holds,
// and replace the result with the NaN pattern 0f7FFFFFFF when trunc(y) != y.
$L__BB0_8:
|
||
setp.geu.ftz.f32 %p25, %f12, 0f00000000;
|
||
@%p25 bra $L__BB0_12;
|
||
|
||
mov.f32 %f158, 0f40145BFF;
|
||
cvt.rzi.f32.f32 %f159, %f158;
|
||
setp.eq.ftz.f32 %p26, %f159, 0f40145BFF;
|
||
mov.b32 %r29, %f356;
|
||
xor.b32 %r30, %r29, -2147483648;
|
||
mov.b32 %f160, %r30;
|
||
selp.f32 %f356, %f160, %f356, %p1;
|
||
@%p26 bra $L__BB0_12;
|
||
|
||
mov.f32 %f356, 0f7FFFFFFF;
|
||
|
||
// If the bit pattern of |base| + |y| is below 2139095040 (0x7F800000) neither operand is
// Inf/NaN, so the remaining fix-ups are skipped.
$L__BB0_12:
|
||
add.ftz.f32 %f163, %f14, %f15;
|
||
mov.b32 %r31, %f163;
|
||
setp.lt.s32 %p28, %r31, 2139095040;
|
||
@%p28 bra $L__BB0_19;
|
||
|
||
// Either operand NaN (setp.gtu against +Inf is true only when unordered) -> $L__BB0_18.
setp.gtu.ftz.f32 %p29, %f14, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p30, %f15, 0f7F800000;
|
||
or.pred %p31, %p29, %p30;
|
||
@%p31 bra $L__BB0_18;
|
||
bra.uni $L__BB0_14;
|
||
|
||
// NaN operand: result = base + y.
$L__BB0_18:
|
||
add.ftz.f32 %f356, %f12, 0f40145BFF;
|
||
bra.uni $L__BB0_19;
|
||
|
||
// |y| == Inf -> $L__BB0_17, otherwise check |base| == Inf at $L__BB0_15.
$L__BB0_14:
|
||
setp.eq.ftz.f32 %p32, %f15, 0f7F800000;
|
||
@%p32 bra $L__BB0_17;
|
||
bra.uni $L__BB0_15;
|
||
|
||
// |y| == Inf: +Inf when |base| > 1, else 0.0; overridden with 1.0 when base == -1.
$L__BB0_17:
|
||
setp.gt.ftz.f32 %p34, %f14, 0f3F800000;
|
||
selp.f32 %f164, 0f7F800000, 0f00000000, %p34;
|
||
setp.eq.ftz.f32 %p35, %f12, 0fBF800000;
|
||
selp.f32 %f356, 0f3F800000, %f164, %p35;
|
||
bra.uni $L__BB0_19;
|
||
|
||
// |base| == Inf: -Inf when %p1 holds, else +Inf; any other value leaves %f356 unchanged.
$L__BB0_15:
|
||
setp.neu.ftz.f32 %p33, %f14, 0f7F800000;
|
||
@%p33 bra $L__BB0_19;
|
||
|
||
selp.f32 %f356, 0fFF800000, 0f7F800000, %p1;
|
||
|
||
// base == 1.0 always yields 1.0; otherwise take the value computed above. Result is %f357.
$L__BB0_19:
|
||
setp.eq.ftz.f32 %p36, %f12, 0f3F800000;
|
||
selp.f32 %f357, 0f3F800000, %f356, %p36;
|
||
|
||
// ---- Second component: %f2 -> %f361, segment chosen by %f8 = %f2 * 0f3FF69738 ----
//   %f8 <= 0f3B123EA5 (about 0.00223) : linear segment       ($L__BB0_22)
//   %f8 <= 0f3EBDF067 (about 0.371)   : power segment        ($L__BB0_24)
//   otherwise                         : exponential segment  ($L__BB0_39)
// The instruction sequence matches the first-component path ($L__BB0_4 .. $L__BB0_19) with
// different register numbers and labels.
$L__BB0_21:
|
||
setp.gtu.ftz.f32 %p37, %f8, 0f3B123EA5;
|
||
@%p37 bra $L__BB0_23;
|
||
bra.uni $L__BB0_22;
|
||
|
||
$L__BB0_23:
|
||
setp.gtu.ftz.f32 %p38, %f8, 0f3EBDF067;
|
||
@%p38 bra $L__BB0_39;
|
||
bra.uni $L__BB0_24;
|
||
|
||
// Exponential segment: %f361 = ex2((%f2 * 0f411FFF18 + 0fC0A5DF51) * 0f3FB8AA3B) + 0fBBCD22E0.
$L__BB0_39:
|
||
fma.rn.ftz.f32 %f253, %f2, 0f411FFF18, 0fC0A5DF51;
|
||
mul.ftz.f32 %f254, %f253, 0f3FB8AA3B;
|
||
ex2.approx.ftz.f32 %f255, %f254;
|
||
add.ftz.f32 %f361, %f255, 0fBBCD22E0;
|
||
bra.uni $L__BB0_40;
|
||
|
||
// Linear segment: %f361 = %f2 * 0f3AB2B981.
$L__BB0_22:
|
||
mul.ftz.f32 %f361, %f2, 0f3AB2B981;
|
||
bra.uni $L__BB0_40;
|
||
|
||
// Power segment. base %f31 = %f2 * 0f3F95D17C + 0fBAEA3C23; exponent y = 0f40145BFF (about 2.3181).
// NOTE(review): the code up to $L__BB0_38 has the shape of an inlined powf(base, y) expansion - confirm.
$L__BB0_24:
|
||
fma.rn.ftz.f32 %f31, %f2, 0f3F95D17C, 0fBAEA3C23;
|
||
// %f32 = |y - 2 * trunc(y / 2)|; compared with 1.0 later to form %p45.
mov.f32 %f171, 0f3F945BFF;
|
||
cvt.rzi.f32.f32 %f172, %f171;
|
||
add.ftz.f32 %f173, %f172, %f172;
|
||
mov.f32 %f174, 0f40145BFF;
|
||
sub.ftz.f32 %f175, %f174, %f173;
|
||
abs.ftz.f32 %f32, %f175;
|
||
// Split |base| into mantissa m (%f176, in [1,2)) and exponent e (%f178 = raw exponent field - 127).
abs.ftz.f32 %f33, %f31;
|
||
mov.b32 %r32, %f33;
|
||
and.b32 %r33, %r32, 8388607;
|
||
or.b32 %r34, %r33, 1065353216;
|
||
mov.b32 %f176, %r34;
|
||
shr.u32 %r35, %r32, 23;
|
||
cvt.rn.f32.u32 %f177, %r35;
|
||
add.ftz.f32 %f178, %f177, 0fC2FE0000;
|
||
// If m > 0f3FB504F3 (about sqrt(2)), continue with m / 2 and e + 1.
setp.gt.ftz.f32 %p39, %f176, 0f3FB504F3;
|
||
mul.ftz.f32 %f179, %f176, 0f3F000000;
|
||
add.ftz.f32 %f180, %f178, 0f3F800000;
|
||
selp.f32 %f181, %f180, %f178, %p39;
|
||
selp.f32 %f182, %f179, %f176, %p39;
|
||
// s (%f185) = 2 * (m - 1) / (m + 1) via approximate reciprocal, then a polynomial in s*s (%f186)
// with the rounding error carried as a separate low part.
add.ftz.f32 %f183, %f182, 0fBF800000;
|
||
add.ftz.f32 %f169, %f182, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f168,%f169;
|
||
// end inline asm
|
||
add.ftz.f32 %f184, %f183, %f183;
|
||
mul.ftz.f32 %f185, %f168, %f184;
|
||
mul.ftz.f32 %f186, %f185, %f185;
|
||
mov.f32 %f187, 0f3C4CAF63;
|
||
mov.f32 %f188, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f189, %f188, %f186, %f187;
|
||
mov.f32 %f190, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f191, %f189, %f186, %f190;
|
||
mul.rn.ftz.f32 %f192, %f191, %f186;
|
||
mul.rn.ftz.f32 %f193, %f192, %f185;
|
||
sub.ftz.f32 %f194, %f183, %f185;
|
||
add.ftz.f32 %f195, %f194, %f194;
|
||
neg.ftz.f32 %f196, %f185;
|
||
fma.rn.ftz.f32 %f197, %f196, %f183, %f195;
|
||
mul.rn.ftz.f32 %f198, %f168, %f197;
|
||
add.ftz.f32 %f199, %f193, %f185;
|
||
sub.ftz.f32 %f200, %f185, %f199;
|
||
add.ftz.f32 %f201, %f193, %f200;
|
||
add.ftz.f32 %f202, %f198, %f201;
|
||
add.ftz.f32 %f203, %f199, %f202;
|
||
sub.ftz.f32 %f204, %f199, %f203;
|
||
add.ftz.f32 %f205, %f202, %f204;
|
||
// Add e * ln(2) using the high (0f3F317200) / low (0f35BFBE8E) split; result pair is (%f215, %f217).
mov.f32 %f206, 0f3F317200;
|
||
mul.rn.ftz.f32 %f207, %f181, %f206;
|
||
mov.f32 %f208, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f209, %f181, %f208;
|
||
add.ftz.f32 %f210, %f207, %f203;
|
||
sub.ftz.f32 %f211, %f207, %f210;
|
||
add.ftz.f32 %f212, %f203, %f211;
|
||
add.ftz.f32 %f213, %f205, %f212;
|
||
add.ftz.f32 %f214, %f209, %f213;
|
||
add.ftz.f32 %f215, %f210, %f214;
|
||
sub.ftz.f32 %f216, %f210, %f215;
|
||
add.ftz.f32 %f217, %f214, %f216;
|
||
// Multiply the pair by y (%f218); result pair is (%f225, %f228).
abs.ftz.f32 %f34, %f174;
|
||
setp.gt.ftz.f32 %p40, %f34, 0f77F684DF;
|
||
selp.f32 %f218, 0f39945BFF, 0f40145BFF, %p40;
|
||
mul.rn.ftz.f32 %f219, %f218, %f215;
|
||
neg.ftz.f32 %f220, %f219;
|
||
fma.rn.ftz.f32 %f221, %f218, %f215, %f220;
|
||
fma.rn.ftz.f32 %f222, %f218, %f217, %f221;
|
||
mov.f32 %f223, 0f00000000;
|
||
fma.rn.ftz.f32 %f224, %f223, %f215, %f222;
|
||
add.rn.ftz.f32 %f225, %f219, %f224;
|
||
neg.ftz.f32 %f226, %f225;
|
||
add.rn.ftz.f32 %f227, %f219, %f226;
|
||
add.rn.ftz.f32 %f228, %f227, %f224;
|
||
// When the high part's bit pattern equals 1118925336 (0x42B17218), step it down one ulp and
// add 0f37000000 to the low part.
mov.b32 %r36, %f225;
|
||
setp.eq.s32 %p41, %r36, 1118925336;
|
||
add.s32 %r37, %r36, -1;
|
||
mov.b32 %f229, %r37;
|
||
add.ftz.f32 %f230, %f228, 0f37000000;
|
||
selp.f32 %f35, %f230, %f228, %p41;
|
||
selp.f32 %f231, %f229, %f225, %p41;
|
||
// Exponential of the high part: n = trunc(t * log2(e)) clamped to magnitude 126, r = t - n * ln(2).
mov.f32 %f232, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f233, %f231, %f232;
|
||
cvt.rzi.f32.f32 %f234, %f233;
|
||
abs.ftz.f32 %f235, %f234;
|
||
setp.gt.ftz.f32 %p42, %f235, 0f42FC0000;
|
||
mov.b32 %r38, %f234;
|
||
and.b32 %r39, %r38, -2147483648;
|
||
or.b32 %r40, %r39, 1123811328;
|
||
mov.b32 %f236, %r40;
|
||
selp.f32 %f237, %f236, %f234, %p42;
|
||
mov.f32 %f238, 0fBF317218;
|
||
fma.rn.ftz.f32 %f239, %f237, %f238, %f231;
|
||
mov.f32 %f240, 0f3102E308;
|
||
fma.rn.ftz.f32 %f241, %f237, %f240, %f239;
|
||
mul.ftz.f32 %f242, %f241, 0f3FB8AA3B;
|
||
// Build the power-of-two scale from (n + 0f4B40007F) << 23 and multiply ex2.approx(r * log2(e)) by it.
add.ftz.f32 %f243, %f237, 0f4B40007F;
|
||
mov.b32 %r41, %f243;
|
||
shl.b32 %r42, %r41, 23;
|
||
mov.b32 %f244, %r42;
|
||
ex2.approx.ftz.f32 %f245, %f242;
|
||
mul.ftz.f32 %f36, %f245, %f244;
|
||
// If the result is +Inf keep +Inf; otherwise fold in the low part: %f360 = %f36 + %f36 * %f35.
setp.eq.ftz.f32 %p43, %f36, 0f7F800000;
|
||
mov.f32 %f360, 0f7F800000;
|
||
@%p43 bra $L__BB0_26;
|
||
|
||
fma.rn.ftz.f32 %f360, %f36, %f35, %f36;
|
||
|
||
// Special cases. %p45 = (%f32 == 1.0); %p2 = (base < 0) && %p45. Branch on base == 0.
$L__BB0_26:
|
||
setp.lt.ftz.f32 %p44, %f31, 0f00000000;
|
||
setp.eq.ftz.f32 %p45, %f32, 0f3F800000;
|
||
and.pred %p2, %p44, %p45;
|
||
setp.eq.ftz.f32 %p46, %f31, 0f00000000;
|
||
@%p46 bra $L__BB0_30;
|
||
bra.uni $L__BB0_27;
|
||
|
||
// base == 0: result is base + base when %p45 holds, otherwise +0.0.
$L__BB0_30:
|
||
add.ftz.f32 %f250, %f31, %f31;
|
||
selp.f32 %f360, %f250, 0f00000000, %p45;
|
||
bra.uni $L__BB0_31;
|
||
|
||
// base >= 0 (or unordered) skips ahead. For base < 0: flip the sign bit when %p2 holds,
// and replace the result with the NaN pattern 0f7FFFFFFF when trunc(y) != y.
$L__BB0_27:
|
||
setp.geu.ftz.f32 %p47, %f31, 0f00000000;
|
||
@%p47 bra $L__BB0_31;
|
||
|
||
mov.f32 %f246, 0f40145BFF;
|
||
cvt.rzi.f32.f32 %f247, %f246;
|
||
setp.eq.ftz.f32 %p48, %f247, 0f40145BFF;
|
||
mov.b32 %r43, %f360;
|
||
xor.b32 %r44, %r43, -2147483648;
|
||
mov.b32 %f248, %r44;
|
||
selp.f32 %f360, %f248, %f360, %p2;
|
||
@%p48 bra $L__BB0_31;
|
||
|
||
mov.f32 %f360, 0f7FFFFFFF;
|
||
|
||
// If the bit pattern of |base| + |y| is below 2139095040 (0x7F800000) neither operand is
// Inf/NaN, so the remaining fix-ups are skipped.
$L__BB0_31:
|
||
add.ftz.f32 %f251, %f33, %f34;
|
||
mov.b32 %r45, %f251;
|
||
setp.lt.s32 %p50, %r45, 2139095040;
|
||
@%p50 bra $L__BB0_38;
|
||
|
||
// Either operand NaN -> $L__BB0_37.
setp.gtu.ftz.f32 %p51, %f33, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p52, %f34, 0f7F800000;
|
||
or.pred %p53, %p51, %p52;
|
||
@%p53 bra $L__BB0_37;
|
||
bra.uni $L__BB0_33;
|
||
|
||
// NaN operand: result = base + y.
$L__BB0_37:
|
||
add.ftz.f32 %f360, %f31, 0f40145BFF;
|
||
bra.uni $L__BB0_38;
|
||
|
||
// |y| == Inf -> $L__BB0_36, otherwise check |base| == Inf at $L__BB0_34.
$L__BB0_33:
|
||
setp.eq.ftz.f32 %p54, %f34, 0f7F800000;
|
||
@%p54 bra $L__BB0_36;
|
||
bra.uni $L__BB0_34;
|
||
|
||
// |y| == Inf: +Inf when |base| > 1, else 0.0; overridden with 1.0 when base == -1.
$L__BB0_36:
|
||
setp.gt.ftz.f32 %p56, %f33, 0f3F800000;
|
||
selp.f32 %f252, 0f7F800000, 0f00000000, %p56;
|
||
setp.eq.ftz.f32 %p57, %f31, 0fBF800000;
|
||
selp.f32 %f360, 0f3F800000, %f252, %p57;
|
||
bra.uni $L__BB0_38;
|
||
|
||
// |base| == Inf: -Inf when %p2 holds, else +Inf; any other value leaves %f360 unchanged.
$L__BB0_34:
|
||
setp.neu.ftz.f32 %p55, %f33, 0f7F800000;
|
||
@%p55 bra $L__BB0_38;
|
||
|
||
selp.f32 %f360, 0fFF800000, 0f7F800000, %p2;
|
||
|
||
// base == 1.0 always yields 1.0; otherwise take the value computed above. Result is %f361.
$L__BB0_38:
|
||
setp.eq.ftz.f32 %p58, %f31, 0f3F800000;
|
||
selp.f32 %f361, 0f3F800000, %f360, %p58;
|
||
|
||
// ---- Third component: %f3 -> %f365, segment chosen by %f10 = %f3 * 0f3FF69738 ----
//   %f10 <= 0f3B123EA5 (about 0.00223) : linear segment       ($L__BB0_41)
//   %f10 <= 0f3EBDF067 (about 0.371)   : power segment        ($L__BB0_43)
//   otherwise                          : exponential segment  ($L__BB0_58)
// The instruction sequence matches the first-component path ($L__BB0_4 .. $L__BB0_19) with
// different register numbers and labels.
$L__BB0_40:
|
||
setp.gtu.ftz.f32 %p59, %f10, 0f3B123EA5;
|
||
@%p59 bra $L__BB0_42;
|
||
bra.uni $L__BB0_41;
|
||
|
||
$L__BB0_42:
|
||
setp.gtu.ftz.f32 %p60, %f10, 0f3EBDF067;
|
||
@%p60 bra $L__BB0_58;
|
||
bra.uni $L__BB0_43;
|
||
|
||
// Exponential segment: %f365 = ex2((%f3 * 0f411FFF18 + 0fC0A5DF51) * 0f3FB8AA3B) + 0fBBCD22E0.
$L__BB0_58:
|
||
fma.rn.ftz.f32 %f341, %f3, 0f411FFF18, 0fC0A5DF51;
|
||
mul.ftz.f32 %f342, %f341, 0f3FB8AA3B;
|
||
ex2.approx.ftz.f32 %f343, %f342;
|
||
add.ftz.f32 %f365, %f343, 0fBBCD22E0;
|
||
bra.uni $L__BB0_59;
|
||
|
||
// Linear segment: %f365 = %f3 * 0f3AB2B981.
$L__BB0_41:
|
||
mul.ftz.f32 %f365, %f3, 0f3AB2B981;
|
||
bra.uni $L__BB0_59;
|
||
|
||
// Power segment. base %f50 = %f3 * 0f3F95D17C + 0fBAEA3C23; exponent y = 0f40145BFF (about 2.3181).
// NOTE(review): the code up to $L__BB0_57 has the shape of an inlined powf(base, y) expansion - confirm.
$L__BB0_43:
|
||
fma.rn.ftz.f32 %f50, %f3, 0f3F95D17C, 0fBAEA3C23;
|
||
// %f51 = |y - 2 * trunc(y / 2)|; compared with 1.0 later to form %p67.
mov.f32 %f259, 0f3F945BFF;
|
||
cvt.rzi.f32.f32 %f260, %f259;
|
||
add.ftz.f32 %f261, %f260, %f260;
|
||
mov.f32 %f262, 0f40145BFF;
|
||
sub.ftz.f32 %f263, %f262, %f261;
|
||
abs.ftz.f32 %f51, %f263;
|
||
// Split |base| into mantissa m (%f264, in [1,2)) and exponent e (%f266 = raw exponent field - 127).
abs.ftz.f32 %f52, %f50;
|
||
mov.b32 %r46, %f52;
|
||
and.b32 %r47, %r46, 8388607;
|
||
or.b32 %r48, %r47, 1065353216;
|
||
mov.b32 %f264, %r48;
|
||
shr.u32 %r49, %r46, 23;
|
||
cvt.rn.f32.u32 %f265, %r49;
|
||
add.ftz.f32 %f266, %f265, 0fC2FE0000;
|
||
// If m > 0f3FB504F3 (about sqrt(2)), continue with m / 2 and e + 1.
setp.gt.ftz.f32 %p61, %f264, 0f3FB504F3;
|
||
mul.ftz.f32 %f267, %f264, 0f3F000000;
|
||
add.ftz.f32 %f268, %f266, 0f3F800000;
|
||
selp.f32 %f269, %f268, %f266, %p61;
|
||
selp.f32 %f270, %f267, %f264, %p61;
|
||
// s (%f273) = 2 * (m - 1) / (m + 1) via approximate reciprocal, then a polynomial in s*s (%f274)
// with the rounding error carried as a separate low part.
add.ftz.f32 %f271, %f270, 0fBF800000;
|
||
add.ftz.f32 %f257, %f270, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f256,%f257;
|
||
// end inline asm
|
||
add.ftz.f32 %f272, %f271, %f271;
|
||
mul.ftz.f32 %f273, %f256, %f272;
|
||
mul.ftz.f32 %f274, %f273, %f273;
|
||
mov.f32 %f275, 0f3C4CAF63;
|
||
mov.f32 %f276, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f277, %f276, %f274, %f275;
|
||
mov.f32 %f278, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f279, %f277, %f274, %f278;
|
||
mul.rn.ftz.f32 %f280, %f279, %f274;
|
||
mul.rn.ftz.f32 %f281, %f280, %f273;
|
||
sub.ftz.f32 %f282, %f271, %f273;
|
||
add.ftz.f32 %f283, %f282, %f282;
|
||
neg.ftz.f32 %f284, %f273;
|
||
fma.rn.ftz.f32 %f285, %f284, %f271, %f283;
|
||
mul.rn.ftz.f32 %f286, %f256, %f285;
|
||
add.ftz.f32 %f287, %f281, %f273;
|
||
sub.ftz.f32 %f288, %f273, %f287;
|
||
add.ftz.f32 %f289, %f281, %f288;
|
||
add.ftz.f32 %f290, %f286, %f289;
|
||
add.ftz.f32 %f291, %f287, %f290;
|
||
sub.ftz.f32 %f292, %f287, %f291;
|
||
add.ftz.f32 %f293, %f290, %f292;
|
||
// Add e * ln(2) using the high (0f3F317200) / low (0f35BFBE8E) split; result pair is (%f303, %f305).
mov.f32 %f294, 0f3F317200;
|
||
mul.rn.ftz.f32 %f295, %f269, %f294;
|
||
mov.f32 %f296, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f297, %f269, %f296;
|
||
add.ftz.f32 %f298, %f295, %f291;
|
||
sub.ftz.f32 %f299, %f295, %f298;
|
||
add.ftz.f32 %f300, %f291, %f299;
|
||
add.ftz.f32 %f301, %f293, %f300;
|
||
add.ftz.f32 %f302, %f297, %f301;
|
||
add.ftz.f32 %f303, %f298, %f302;
|
||
sub.ftz.f32 %f304, %f298, %f303;
|
||
add.ftz.f32 %f305, %f302, %f304;
|
||
// Multiply the pair by y (%f306); result pair is (%f313, %f316).
abs.ftz.f32 %f53, %f262;
|
||
setp.gt.ftz.f32 %p62, %f53, 0f77F684DF;
|
||
selp.f32 %f306, 0f39945BFF, 0f40145BFF, %p62;
|
||
mul.rn.ftz.f32 %f307, %f306, %f303;
|
||
neg.ftz.f32 %f308, %f307;
|
||
fma.rn.ftz.f32 %f309, %f306, %f303, %f308;
|
||
fma.rn.ftz.f32 %f310, %f306, %f305, %f309;
|
||
mov.f32 %f311, 0f00000000;
|
||
fma.rn.ftz.f32 %f312, %f311, %f303, %f310;
|
||
add.rn.ftz.f32 %f313, %f307, %f312;
|
||
neg.ftz.f32 %f314, %f313;
|
||
add.rn.ftz.f32 %f315, %f307, %f314;
|
||
add.rn.ftz.f32 %f316, %f315, %f312;
|
||
// When the high part's bit pattern equals 1118925336 (0x42B17218), step it down one ulp and
// add 0f37000000 to the low part.
mov.b32 %r50, %f313;
|
||
setp.eq.s32 %p63, %r50, 1118925336;
|
||
add.s32 %r51, %r50, -1;
|
||
mov.b32 %f317, %r51;
|
||
add.ftz.f32 %f318, %f316, 0f37000000;
|
||
selp.f32 %f54, %f318, %f316, %p63;
|
||
selp.f32 %f319, %f317, %f313, %p63;
|
||
// Exponential of the high part: n = trunc(t * log2(e)) clamped to magnitude 126, r = t - n * ln(2).
mov.f32 %f320, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f321, %f319, %f320;
|
||
cvt.rzi.f32.f32 %f322, %f321;
|
||
abs.ftz.f32 %f323, %f322;
|
||
setp.gt.ftz.f32 %p64, %f323, 0f42FC0000;
|
||
mov.b32 %r52, %f322;
|
||
and.b32 %r53, %r52, -2147483648;
|
||
or.b32 %r54, %r53, 1123811328;
|
||
mov.b32 %f324, %r54;
|
||
selp.f32 %f325, %f324, %f322, %p64;
|
||
mov.f32 %f326, 0fBF317218;
|
||
fma.rn.ftz.f32 %f327, %f325, %f326, %f319;
|
||
mov.f32 %f328, 0f3102E308;
|
||
fma.rn.ftz.f32 %f329, %f325, %f328, %f327;
|
||
mul.ftz.f32 %f330, %f329, 0f3FB8AA3B;
|
||
// Build the power-of-two scale from (n + 0f4B40007F) << 23 and multiply ex2.approx(r * log2(e)) by it.
add.ftz.f32 %f331, %f325, 0f4B40007F;
|
||
mov.b32 %r55, %f331;
|
||
shl.b32 %r56, %r55, 23;
|
||
mov.b32 %f332, %r56;
|
||
ex2.approx.ftz.f32 %f333, %f330;
|
||
mul.ftz.f32 %f55, %f333, %f332;
|
||
// If the result is +Inf keep +Inf; otherwise fold in the low part: %f364 = %f55 + %f55 * %f54.
setp.eq.ftz.f32 %p65, %f55, 0f7F800000;
|
||
mov.f32 %f364, 0f7F800000;
|
||
@%p65 bra $L__BB0_45;
|
||
|
||
fma.rn.ftz.f32 %f364, %f55, %f54, %f55;
|
||
|
||
// Special cases. %p67 = (%f51 == 1.0); %p3 = (base < 0) && %p67. Branch on base == 0.
$L__BB0_45:
|
||
setp.lt.ftz.f32 %p66, %f50, 0f00000000;
|
||
setp.eq.ftz.f32 %p67, %f51, 0f3F800000;
|
||
and.pred %p3, %p66, %p67;
|
||
setp.eq.ftz.f32 %p68, %f50, 0f00000000;
|
||
@%p68 bra $L__BB0_49;
|
||
bra.uni $L__BB0_46;
|
||
|
||
// base == 0: result is base + base when %p67 holds, otherwise +0.0.
$L__BB0_49:
|
||
add.ftz.f32 %f338, %f50, %f50;
|
||
selp.f32 %f364, %f338, 0f00000000, %p67;
|
||
bra.uni $L__BB0_50;
|
||
|
||
// base >= 0 (or unordered) skips ahead. For base < 0: flip the sign bit when %p3 holds,
// and replace the result with the NaN pattern 0f7FFFFFFF when trunc(y) != y.
$L__BB0_46:
|
||
setp.geu.ftz.f32 %p69, %f50, 0f00000000;
|
||
@%p69 bra $L__BB0_50;
|
||
|
||
mov.f32 %f334, 0f40145BFF;
|
||
cvt.rzi.f32.f32 %f335, %f334;
|
||
setp.eq.ftz.f32 %p70, %f335, 0f40145BFF;
|
||
mov.b32 %r57, %f364;
|
||
xor.b32 %r58, %r57, -2147483648;
|
||
mov.b32 %f336, %r58;
|
||
selp.f32 %f364, %f336, %f364, %p3;
|
||
@%p70 bra $L__BB0_50;
|
||
|
||
mov.f32 %f364, 0f7FFFFFFF;
|
||
|
||
// If the bit pattern of |base| + |y| is below 2139095040 (0x7F800000) neither operand is
// Inf/NaN, so the remaining fix-ups are skipped.
$L__BB0_50:
|
||
add.ftz.f32 %f339, %f52, %f53;
|
||
mov.b32 %r59, %f339;
|
||
setp.lt.s32 %p72, %r59, 2139095040;
|
||
@%p72 bra $L__BB0_57;
|
||
|
||
// Either operand NaN -> $L__BB0_56.
setp.gtu.ftz.f32 %p73, %f52, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p74, %f53, 0f7F800000;
|
||
or.pred %p75, %p73, %p74;
|
||
@%p75 bra $L__BB0_56;
|
||
bra.uni $L__BB0_52;
|
||
|
||
// NaN operand: result = base + y.
$L__BB0_56:
|
||
add.ftz.f32 %f364, %f50, 0f40145BFF;
|
||
bra.uni $L__BB0_57;
|
||
|
||
// |y| == Inf -> $L__BB0_55, otherwise check |base| == Inf at $L__BB0_53.
$L__BB0_52:
|
||
setp.eq.ftz.f32 %p76, %f53, 0f7F800000;
|
||
@%p76 bra $L__BB0_55;
|
||
bra.uni $L__BB0_53;
|
||
|
||
// |y| == Inf: +Inf when |base| > 1, else 0.0; overridden with 1.0 when base == -1.
$L__BB0_55:
|
||
setp.gt.ftz.f32 %p78, %f52, 0f3F800000;
|
||
selp.f32 %f340, 0f7F800000, 0f00000000, %p78;
|
||
setp.eq.ftz.f32 %p79, %f50, 0fBF800000;
|
||
selp.f32 %f364, 0f3F800000, %f340, %p79;
|
||
bra.uni $L__BB0_57;
|
||
|
||
// |base| == Inf: -Inf when %p3 holds, else +Inf; any other value leaves %f364 unchanged.
$L__BB0_53:
|
||
setp.neu.ftz.f32 %p77, %f52, 0f7F800000;
|
||
@%p77 bra $L__BB0_57;
|
||
|
||
selp.f32 %f364, 0fFF800000, 0f7F800000, %p3;
|
||
|
||
// base == 1.0 always yields 1.0; otherwise take the value computed above. Result is %f365.
$L__BB0_57:
|
||
setp.eq.ftz.f32 %p80, %f50, 0f3F800000;
|
||
selp.f32 %f365, 0f3F800000, %f364, %p80;
|
||
|
||
// Join point after the three per-component curves: clamp the results and store them.
$L__BB0_59:
|
||
ld.param.u64 %rd8, [BackwardPUE_param_1];
|
||
// Clamp every output to [0.0, 0f7F7FFFFF] (0f7F7FFFFF is the largest finite f32): max with 0 first, then min.
// The fourth component (%f4) never goes through the curve; it is only clamped here.
mov.f32 %f344, 0f00000000;
|
||
max.f32 %f345, %f357, %f344;
|
||
mov.f32 %f346, 0f7F7FFFFF;
|
||
max.f32 %f347, %f361, %f344;
|
||
max.f32 %f348, %f365, %f344;
|
||
max.f32 %f349, %f4, %f344;
|
||
min.f32 %f350, %f348, %f346;
|
||
min.f32 %f351, %f347, %f346;
|
||
min.f32 %f352, %f345, %f346;
|
||
min.f32 %f353, %f349, %f346;
|
||
// Output address = param_1 + (element index << 4); components are stored in the order they were loaded.
shl.b64 %rd6, %rd1, 4;
|
||
add.s64 %rd7, %rd8, %rd6;
|
||
st.global.v4.f32 [%rd7], {%f352, %f351, %f350, %f353};
|
||
|
||
// Common exit; also the target of the param_2 == 0 and out-of-range early-out branches.
$L__BB0_60:
|
||
ret;
|
||
|
||
}
|
||
// .globl ForwardPUE
|
||
.entry ForwardPUE(
|
||
.param .u64 .ptr .global .align 16 ForwardPUE_param_0,
|
||
.param .u64 .ptr .global .align 16 ForwardPUE_param_1,
|
||
.param .u32 ForwardPUE_param_2
|
||
)
|
||
{
|
||
.reg .pred %p<81>;
|
||
.reg .f32 %f<359>;
|
||
.reg .b32 %r<60>;
|
||
.reg .b64 %rd<8>;
|
||
|
||
|
||
ld.param.u64 %rd2, [ForwardPUE_param_0];
|
||
ld.param.u64 %rd3, [ForwardPUE_param_1];
|
||
ld.param.u32 %r3, [ForwardPUE_param_2];
|
||
mov.b32 %r4, %envreg3;
|
||
mov.u32 %r5, %ctaid.x;
|
||
mov.u32 %r6, %ntid.x;
|
||
mov.u32 %r7, %tid.x;
|
||
add.s32 %r8, %r7, %r4;
|
||
mad.lo.s32 %r1, %r6, %r5, %r8;
|
||
mov.u32 %r9, %ctaid.y;
|
||
mov.u32 %r10, %ntid.y;
|
||
mov.u32 %r11, %tid.y;
|
||
mov.b32 %r12, %envreg4;
|
||
add.s32 %r13, %r11, %r12;
|
||
mad.lo.s32 %r2, %r10, %r9, %r13;
|
||
setp.gt.s32 %p4, %r1, 2047;
|
||
setp.gt.s32 %p5, %r2, 2047;
|
||
or.pred %p6, %p4, %p5;
|
||
@%p6 bra $L__BB1_61;
|
||
|
||
shl.b32 %r14, %r2, 11;
|
||
add.s32 %r15, %r14, %r1;
|
||
cvt.s64.s32 %rd1, %r15;
|
||
mul.wide.s32 %rd4, %r15, 16;
|
||
add.s64 %rd5, %rd2, %rd4;
|
||
ld.global.nc.v4.f32 {%f68, %f69, %f70, %f71}, [%rd5];
|
||
abs.ftz.f32 %f73, %f68;
|
||
abs.ftz.f32 %f75, %f69;
|
||
abs.ftz.f32 %f77, %f70;
|
||
abs.ftz.f32 %f79, %f71;
|
||
setp.geu.ftz.f32 %p7, %f77, 0f7F800000;
|
||
setp.geu.ftz.f32 %p8, %f79, 0f7F800000;
|
||
setp.geu.ftz.f32 %p9, %f75, 0f7F800000;
|
||
setp.geu.ftz.f32 %p10, %f73, 0f7F800000;
|
||
or.pred %p11, %p10, %p9;
|
||
selp.b32 %r16, -1, 0, %p11;
|
||
or.pred %p12, %p8, %p7;
|
||
selp.b32 %r17, -1, %r16, %p12;
|
||
setp.gt.s32 %p13, %r17, -1;
|
||
selp.f32 %f4, %f71, 0f00000000, %p13;
|
||
selp.f32 %f358, %f70, 0f00000000, %p13;
|
||
selp.f32 %f357, %f69, 0f00000000, %p13;
|
||
selp.f32 %f356, %f68, 0f00000000, %p13;
|
||
setp.eq.s32 %p14, %r3, 0;
|
||
@%p14 bra $L__BB1_60;
|
||
|
||
setp.gtu.ftz.f32 %p15, %f356, 0f35D3FDC0;
|
||
@%p15 bra $L__BB1_4;
|
||
bra.uni $L__BB1_3;
|
||
|
||
$L__BB1_4:
|
||
setp.gtu.ftz.f32 %p16, %f356, 0f3D03ED56;
|
||
@%p16 bra $L__BB1_20;
|
||
bra.uni $L__BB1_5;
|
||
|
||
$L__BB1_20:
|
||
add.ftz.f32 %f166, %f356, 0f3BCD22E0;
|
||
lg2.approx.ftz.f32 %f167, %f166;
|
||
fma.rn.ftz.f32 %f347, %f167, 0f3E08BDF0, 0f3F7FA592;
|
||
bra.uni $L__BB1_21;
|
||
|
||
$L__BB1_3:
|
||
mul.ftz.f32 %f347, %f356, 0f44B09ACE;
|
||
bra.uni $L__BB1_21;
|
||
|
||
$L__BB1_5:
|
||
mov.f32 %f83, 0f3E5CDE7E;
|
||
cvt.rzi.f32.f32 %f84, %f83;
|
||
add.ftz.f32 %f85, %f84, %f84;
|
||
mov.f32 %f86, 0f3EDCDE7E;
|
||
sub.ftz.f32 %f87, %f86, %f85;
|
||
abs.ftz.f32 %f9, %f87;
|
||
abs.ftz.f32 %f10, %f356;
|
||
mov.b32 %r18, %f10;
|
||
and.b32 %r19, %r18, 8388607;
|
||
or.b32 %r20, %r19, 1065353216;
|
||
mov.b32 %f88, %r20;
|
||
shr.u32 %r21, %r18, 23;
|
||
cvt.rn.f32.u32 %f89, %r21;
|
||
add.ftz.f32 %f90, %f89, 0fC2FE0000;
|
||
setp.gt.ftz.f32 %p17, %f88, 0f3FB504F3;
|
||
mul.ftz.f32 %f91, %f88, 0f3F000000;
|
||
add.ftz.f32 %f92, %f90, 0f3F800000;
|
||
selp.f32 %f93, %f92, %f90, %p17;
|
||
selp.f32 %f94, %f91, %f88, %p17;
|
||
add.ftz.f32 %f95, %f94, 0fBF800000;
|
||
add.ftz.f32 %f81, %f94, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f80,%f81;
|
||
// end inline asm
|
||
add.ftz.f32 %f96, %f95, %f95;
|
||
mul.ftz.f32 %f97, %f80, %f96;
|
||
mul.ftz.f32 %f98, %f97, %f97;
|
||
mov.f32 %f99, 0f3C4CAF63;
|
||
mov.f32 %f100, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f101, %f100, %f98, %f99;
|
||
mov.f32 %f102, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f103, %f101, %f98, %f102;
|
||
mul.rn.ftz.f32 %f104, %f103, %f98;
|
||
mul.rn.ftz.f32 %f105, %f104, %f97;
|
||
sub.ftz.f32 %f106, %f95, %f97;
|
||
add.ftz.f32 %f107, %f106, %f106;
|
||
neg.ftz.f32 %f108, %f97;
|
||
fma.rn.ftz.f32 %f109, %f108, %f95, %f107;
|
||
mul.rn.ftz.f32 %f110, %f80, %f109;
|
||
add.ftz.f32 %f111, %f105, %f97;
|
||
sub.ftz.f32 %f112, %f97, %f111;
|
||
add.ftz.f32 %f113, %f105, %f112;
|
||
add.ftz.f32 %f114, %f110, %f113;
|
||
add.ftz.f32 %f115, %f111, %f114;
|
||
sub.ftz.f32 %f116, %f111, %f115;
|
||
add.ftz.f32 %f117, %f114, %f116;
|
||
mov.f32 %f118, 0f3F317200;
|
||
mul.rn.ftz.f32 %f119, %f93, %f118;
|
||
mov.f32 %f120, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f121, %f93, %f120;
|
||
add.ftz.f32 %f122, %f119, %f115;
|
||
sub.ftz.f32 %f123, %f119, %f122;
|
||
add.ftz.f32 %f124, %f115, %f123;
|
||
add.ftz.f32 %f125, %f117, %f124;
|
||
add.ftz.f32 %f126, %f121, %f125;
|
||
add.ftz.f32 %f127, %f122, %f126;
|
||
sub.ftz.f32 %f128, %f122, %f127;
|
||
add.ftz.f32 %f129, %f126, %f128;
|
||
abs.ftz.f32 %f11, %f86;
|
||
setp.gt.ftz.f32 %p18, %f11, 0f77F684DF;
|
||
selp.f32 %f130, 0f385CDE7E, 0f3EDCDE7E, %p18;
|
||
mul.rn.ftz.f32 %f131, %f130, %f127;
|
||
neg.ftz.f32 %f132, %f131;
|
||
fma.rn.ftz.f32 %f133, %f130, %f127, %f132;
|
||
fma.rn.ftz.f32 %f134, %f130, %f129, %f133;
|
||
mov.f32 %f135, 0f00000000;
|
||
fma.rn.ftz.f32 %f136, %f135, %f127, %f134;
|
||
add.rn.ftz.f32 %f137, %f131, %f136;
|
||
neg.ftz.f32 %f138, %f137;
|
||
add.rn.ftz.f32 %f139, %f131, %f138;
|
||
add.rn.ftz.f32 %f140, %f139, %f136;
|
||
mov.b32 %r22, %f137;
|
||
setp.eq.s32 %p19, %r22, 1118925336;
|
||
add.s32 %r23, %r22, -1;
|
||
mov.b32 %f141, %r23;
|
||
add.ftz.f32 %f142, %f140, 0f37000000;
|
||
selp.f32 %f12, %f142, %f140, %p19;
|
||
selp.f32 %f143, %f141, %f137, %p19;
|
||
mov.f32 %f144, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f145, %f143, %f144;
|
||
cvt.rzi.f32.f32 %f146, %f145;
|
||
abs.ftz.f32 %f147, %f146;
|
||
setp.gt.ftz.f32 %p20, %f147, 0f42FC0000;
|
||
mov.b32 %r24, %f146;
|
||
and.b32 %r25, %r24, -2147483648;
|
||
or.b32 %r26, %r25, 1123811328;
|
||
mov.b32 %f148, %r26;
|
||
selp.f32 %f149, %f148, %f146, %p20;
|
||
mov.f32 %f150, 0fBF317218;
|
||
fma.rn.ftz.f32 %f151, %f149, %f150, %f143;
|
||
mov.f32 %f152, 0f3102E308;
|
||
fma.rn.ftz.f32 %f153, %f149, %f152, %f151;
|
||
mul.ftz.f32 %f154, %f153, 0f3FB8AA3B;
|
||
add.ftz.f32 %f155, %f149, 0f4B40007F;
|
||
mov.b32 %r27, %f155;
|
||
shl.b32 %r28, %r27, 23;
|
||
mov.b32 %f156, %r28;
|
||
ex2.approx.ftz.f32 %f157, %f154;
|
||
mul.ftz.f32 %f13, %f157, %f156;
|
||
setp.eq.ftz.f32 %p21, %f13, 0f7F800000;
|
||
mov.f32 %f346, 0f7F800000;
|
||
@%p21 bra $L__BB1_7;
|
||
|
||
fma.rn.ftz.f32 %f346, %f13, %f12, %f13;
|
||
|
||
$L__BB1_7:
|
||
setp.lt.ftz.f32 %p22, %f356, 0f00000000;
|
||
setp.eq.ftz.f32 %p23, %f9, 0f3F800000;
|
||
and.pred %p1, %p22, %p23;
|
||
setp.eq.ftz.f32 %p24, %f356, 0f00000000;
|
||
@%p24 bra $L__BB1_11;
|
||
bra.uni $L__BB1_8;
|
||
|
||
$L__BB1_11:
|
||
add.ftz.f32 %f162, %f356, %f356;
|
||
selp.f32 %f346, %f162, 0f00000000, %p23;
|
||
bra.uni $L__BB1_12;
|
||
|
||
$L__BB1_8:
|
||
setp.geu.ftz.f32 %p25, %f356, 0f00000000;
|
||
@%p25 bra $L__BB1_12;
|
||
|
||
mov.f32 %f158, 0f3EDCDE7E;
|
||
cvt.rzi.f32.f32 %f159, %f158;
|
||
setp.eq.ftz.f32 %p26, %f159, 0f3EDCDE7E;
|
||
mov.b32 %r29, %f346;
|
||
xor.b32 %r30, %r29, -2147483648;
|
||
mov.b32 %f160, %r30;
|
||
selp.f32 %f346, %f160, %f346, %p1;
|
||
@%p26 bra $L__BB1_12;
|
||
|
||
mov.f32 %f346, 0f7FFFFFFF;
|
||
|
||
$L__BB1_12:
|
||
add.ftz.f32 %f163, %f10, %f11;
|
||
mov.b32 %r31, %f163;
|
||
setp.lt.s32 %p28, %r31, 2139095040;
|
||
@%p28 bra $L__BB1_19;
|
||
|
||
setp.gtu.ftz.f32 %p29, %f10, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p30, %f11, 0f7F800000;
|
||
or.pred %p31, %p29, %p30;
|
||
@%p31 bra $L__BB1_18;
|
||
bra.uni $L__BB1_14;
|
||
|
||
$L__BB1_18:
|
||
add.ftz.f32 %f346, %f356, 0f3EDCDE7E;
|
||
bra.uni $L__BB1_19;
|
||
|
||
$L__BB1_14:
|
||
setp.eq.ftz.f32 %p32, %f11, 0f7F800000;
|
||
@%p32 bra $L__BB1_17;
|
||
bra.uni $L__BB1_15;
|
||
|
||
$L__BB1_17:
|
||
setp.gt.ftz.f32 %p34, %f10, 0f3F800000;
|
||
selp.f32 %f164, 0f7F800000, 0f00000000, %p34;
|
||
setp.eq.ftz.f32 %p35, %f356, 0fBF800000;
|
||
selp.f32 %f346, 0f3F800000, %f164, %p35;
|
||
bra.uni $L__BB1_19;
|
||
|
||
$L__BB1_15:
|
||
setp.neu.ftz.f32 %p33, %f10, 0f7F800000;
|
||
@%p33 bra $L__BB1_19;
|
||
|
||
selp.f32 %f346, 0fFF800000, 0f7F800000, %p1;
|
||
|
||
$L__BB1_19:
|
||
fma.rn.ftz.f32 %f165, %f346, 0f3FD2ADE4, 0f3B40C470;
|
||
setp.eq.ftz.f32 %p36, %f356, 0f3F800000;
|
||
selp.f32 %f347, 0f3FD30E46, %f165, %p36;
|
||
|
||
$L__BB1_21:
|
||
setp.gtu.ftz.f32 %p37, %f357, 0f35D3FDC0;
|
||
@%p37 bra $L__BB1_23;
|
||
bra.uni $L__BB1_22;
|
||
|
||
$L__BB1_23:
|
||
setp.gtu.ftz.f32 %p38, %f357, 0f3D03ED56;
|
||
@%p38 bra $L__BB1_39;
|
||
bra.uni $L__BB1_24;
|
||
|
||
$L__BB1_39:
|
||
add.ftz.f32 %f254, %f357, 0f3BCD22E0;
|
||
lg2.approx.ftz.f32 %f255, %f254;
|
||
fma.rn.ftz.f32 %f351, %f255, 0f3E08BDF0, 0f3F7FA592;
|
||
bra.uni $L__BB1_40;
|
||
|
||
$L__BB1_22:
|
||
mul.ftz.f32 %f351, %f357, 0f44B09ACE;
|
||
bra.uni $L__BB1_40;
|
||
|
||
$L__BB1_24:
|
||
mov.f32 %f171, 0f3E5CDE7E;
|
||
cvt.rzi.f32.f32 %f172, %f171;
|
||
add.ftz.f32 %f173, %f172, %f172;
|
||
mov.f32 %f174, 0f3EDCDE7E;
|
||
sub.ftz.f32 %f175, %f174, %f173;
|
||
abs.ftz.f32 %f27, %f175;
|
||
abs.ftz.f32 %f28, %f357;
|
||
mov.b32 %r32, %f28;
|
||
and.b32 %r33, %r32, 8388607;
|
||
or.b32 %r34, %r33, 1065353216;
|
||
mov.b32 %f176, %r34;
|
||
shr.u32 %r35, %r32, 23;
|
||
cvt.rn.f32.u32 %f177, %r35;
|
||
add.ftz.f32 %f178, %f177, 0fC2FE0000;
|
||
setp.gt.ftz.f32 %p39, %f176, 0f3FB504F3;
|
||
mul.ftz.f32 %f179, %f176, 0f3F000000;
|
||
add.ftz.f32 %f180, %f178, 0f3F800000;
|
||
selp.f32 %f181, %f180, %f178, %p39;
|
||
selp.f32 %f182, %f179, %f176, %p39;
|
||
add.ftz.f32 %f183, %f182, 0fBF800000;
|
||
add.ftz.f32 %f169, %f182, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f168,%f169;
|
||
// end inline asm
|
||
add.ftz.f32 %f184, %f183, %f183;
|
||
mul.ftz.f32 %f185, %f168, %f184;
|
||
mul.ftz.f32 %f186, %f185, %f185;
|
||
mov.f32 %f187, 0f3C4CAF63;
|
||
mov.f32 %f188, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f189, %f188, %f186, %f187;
|
||
mov.f32 %f190, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f191, %f189, %f186, %f190;
|
||
mul.rn.ftz.f32 %f192, %f191, %f186;
|
||
mul.rn.ftz.f32 %f193, %f192, %f185;
|
||
sub.ftz.f32 %f194, %f183, %f185;
|
||
add.ftz.f32 %f195, %f194, %f194;
|
||
neg.ftz.f32 %f196, %f185;
|
||
fma.rn.ftz.f32 %f197, %f196, %f183, %f195;
|
||
mul.rn.ftz.f32 %f198, %f168, %f197;
|
||
add.ftz.f32 %f199, %f193, %f185;
|
||
sub.ftz.f32 %f200, %f185, %f199;
|
||
add.ftz.f32 %f201, %f193, %f200;
|
||
add.ftz.f32 %f202, %f198, %f201;
|
||
add.ftz.f32 %f203, %f199, %f202;
|
||
sub.ftz.f32 %f204, %f199, %f203;
|
||
add.ftz.f32 %f205, %f202, %f204;
|
||
mov.f32 %f206, 0f3F317200;
|
||
mul.rn.ftz.f32 %f207, %f181, %f206;
|
||
mov.f32 %f208, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f209, %f181, %f208;
|
||
add.ftz.f32 %f210, %f207, %f203;
|
||
sub.ftz.f32 %f211, %f207, %f210;
|
||
add.ftz.f32 %f212, %f203, %f211;
|
||
add.ftz.f32 %f213, %f205, %f212;
|
||
add.ftz.f32 %f214, %f209, %f213;
|
||
add.ftz.f32 %f215, %f210, %f214;
|
||
sub.ftz.f32 %f216, %f210, %f215;
|
||
add.ftz.f32 %f217, %f214, %f216;
|
||
abs.ftz.f32 %f29, %f174;
|
||
setp.gt.ftz.f32 %p40, %f29, 0f77F684DF;
|
||
selp.f32 %f218, 0f385CDE7E, 0f3EDCDE7E, %p40;
|
||
mul.rn.ftz.f32 %f219, %f218, %f215;
|
||
neg.ftz.f32 %f220, %f219;
|
||
fma.rn.ftz.f32 %f221, %f218, %f215, %f220;
|
||
fma.rn.ftz.f32 %f222, %f218, %f217, %f221;
|
||
mov.f32 %f223, 0f00000000;
|
||
fma.rn.ftz.f32 %f224, %f223, %f215, %f222;
|
||
add.rn.ftz.f32 %f225, %f219, %f224;
|
||
neg.ftz.f32 %f226, %f225;
|
||
add.rn.ftz.f32 %f227, %f219, %f226;
|
||
add.rn.ftz.f32 %f228, %f227, %f224;
|
||
mov.b32 %r36, %f225;
|
||
setp.eq.s32 %p41, %r36, 1118925336;
|
||
add.s32 %r37, %r36, -1;
|
||
mov.b32 %f229, %r37;
|
||
add.ftz.f32 %f230, %f228, 0f37000000;
|
||
selp.f32 %f30, %f230, %f228, %p41;
|
||
selp.f32 %f231, %f229, %f225, %p41;
|
||
mov.f32 %f232, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f233, %f231, %f232;
|
||
cvt.rzi.f32.f32 %f234, %f233;
|
||
abs.ftz.f32 %f235, %f234;
|
||
setp.gt.ftz.f32 %p42, %f235, 0f42FC0000;
|
||
mov.b32 %r38, %f234;
|
||
and.b32 %r39, %r38, -2147483648;
|
||
or.b32 %r40, %r39, 1123811328;
|
||
mov.b32 %f236, %r40;
|
||
selp.f32 %f237, %f236, %f234, %p42;
|
||
mov.f32 %f238, 0fBF317218;
|
||
fma.rn.ftz.f32 %f239, %f237, %f238, %f231;
|
||
mov.f32 %f240, 0f3102E308;
|
||
fma.rn.ftz.f32 %f241, %f237, %f240, %f239;
|
||
mul.ftz.f32 %f242, %f241, 0f3FB8AA3B;
|
||
add.ftz.f32 %f243, %f237, 0f4B40007F;
|
||
mov.b32 %r41, %f243;
|
||
shl.b32 %r42, %r41, 23;
|
||
mov.b32 %f244, %r42;
|
||
ex2.approx.ftz.f32 %f245, %f242;
|
||
mul.ftz.f32 %f31, %f245, %f244;
|
||
setp.eq.ftz.f32 %p43, %f31, 0f7F800000;
|
||
mov.f32 %f350, 0f7F800000;
|
||
@%p43 bra $L__BB1_26;
|
||
|
||
fma.rn.ftz.f32 %f350, %f31, %f30, %f31;
|
||
|
||
$L__BB1_26:
|
||
setp.lt.ftz.f32 %p44, %f357, 0f00000000;
|
||
setp.eq.ftz.f32 %p45, %f27, 0f3F800000;
|
||
and.pred %p2, %p44, %p45;
|
||
setp.eq.ftz.f32 %p46, %f357, 0f00000000;
|
||
@%p46 bra $L__BB1_30;
|
||
bra.uni $L__BB1_27;
|
||
|
||
$L__BB1_30:
|
||
add.ftz.f32 %f250, %f357, %f357;
|
||
selp.f32 %f350, %f250, 0f00000000, %p45;
|
||
bra.uni $L__BB1_31;
|
||
|
||
$L__BB1_27:
|
||
setp.geu.ftz.f32 %p47, %f357, 0f00000000;
|
||
@%p47 bra $L__BB1_31;
|
||
|
||
mov.f32 %f246, 0f3EDCDE7E;
|
||
cvt.rzi.f32.f32 %f247, %f246;
|
||
setp.eq.ftz.f32 %p48, %f247, 0f3EDCDE7E;
|
||
mov.b32 %r43, %f350;
|
||
xor.b32 %r44, %r43, -2147483648;
|
||
mov.b32 %f248, %r44;
|
||
selp.f32 %f350, %f248, %f350, %p2;
|
||
@%p48 bra $L__BB1_31;
|
||
|
||
mov.f32 %f350, 0f7FFFFFFF;
|
||
|
||
$L__BB1_31:
|
||
add.ftz.f32 %f251, %f28, %f29;
|
||
mov.b32 %r45, %f251;
|
||
setp.lt.s32 %p50, %r45, 2139095040;
|
||
@%p50 bra $L__BB1_38;
|
||
|
||
setp.gtu.ftz.f32 %p51, %f28, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p52, %f29, 0f7F800000;
|
||
or.pred %p53, %p51, %p52;
|
||
@%p53 bra $L__BB1_37;
|
||
bra.uni $L__BB1_33;
|
||
|
||
$L__BB1_37:
|
||
add.ftz.f32 %f350, %f357, 0f3EDCDE7E;
|
||
bra.uni $L__BB1_38;
|
||
|
||
$L__BB1_33:
|
||
setp.eq.ftz.f32 %p54, %f29, 0f7F800000;
|
||
@%p54 bra $L__BB1_36;
|
||
bra.uni $L__BB1_34;
|
||
|
||
$L__BB1_36:
|
||
setp.gt.ftz.f32 %p56, %f28, 0f3F800000;
|
||
selp.f32 %f252, 0f7F800000, 0f00000000, %p56;
|
||
setp.eq.ftz.f32 %p57, %f357, 0fBF800000;
|
||
selp.f32 %f350, 0f3F800000, %f252, %p57;
|
||
bra.uni $L__BB1_38;
|
||
|
||
$L__BB1_34:
|
||
setp.neu.ftz.f32 %p55, %f28, 0f7F800000;
|
||
@%p55 bra $L__BB1_38;
|
||
|
||
selp.f32 %f350, 0fFF800000, 0f7F800000, %p2;
|
||
|
||
$L__BB1_38:
|
||
fma.rn.ftz.f32 %f253, %f350, 0f3FD2ADE4, 0f3B40C470;
|
||
setp.eq.ftz.f32 %p58, %f357, 0f3F800000;
|
||
selp.f32 %f351, 0f3FD30E46, %f253, %p58;
|
||
|
||
$L__BB1_40:
|
||
setp.gtu.ftz.f32 %p59, %f358, 0f35D3FDC0;
|
||
@%p59 bra $L__BB1_42;
|
||
bra.uni $L__BB1_41;
|
||
|
||
$L__BB1_42:
|
||
setp.gtu.ftz.f32 %p60, %f358, 0f3D03ED56;
|
||
@%p60 bra $L__BB1_58;
|
||
bra.uni $L__BB1_43;
|
||
|
||
$L__BB1_58:
|
||
add.ftz.f32 %f342, %f358, 0f3BCD22E0;
|
||
lg2.approx.ftz.f32 %f343, %f342;
|
||
fma.rn.ftz.f32 %f355, %f343, 0f3E08BDF0, 0f3F7FA592;
|
||
bra.uni $L__BB1_59;
|
||
|
||
$L__BB1_41:
|
||
mul.ftz.f32 %f355, %f358, 0f44B09ACE;
|
||
bra.uni $L__BB1_59;
|
||
|
||
$L__BB1_43:
|
||
mov.f32 %f259, 0f3E5CDE7E;
|
||
cvt.rzi.f32.f32 %f260, %f259;
|
||
add.ftz.f32 %f261, %f260, %f260;
|
||
mov.f32 %f262, 0f3EDCDE7E;
|
||
sub.ftz.f32 %f263, %f262, %f261;
|
||
abs.ftz.f32 %f45, %f263;
|
||
abs.ftz.f32 %f46, %f358;
|
||
mov.b32 %r46, %f46;
|
||
and.b32 %r47, %r46, 8388607;
|
||
or.b32 %r48, %r47, 1065353216;
|
||
mov.b32 %f264, %r48;
|
||
shr.u32 %r49, %r46, 23;
|
||
cvt.rn.f32.u32 %f265, %r49;
|
||
add.ftz.f32 %f266, %f265, 0fC2FE0000;
|
||
setp.gt.ftz.f32 %p61, %f264, 0f3FB504F3;
|
||
mul.ftz.f32 %f267, %f264, 0f3F000000;
|
||
add.ftz.f32 %f268, %f266, 0f3F800000;
|
||
selp.f32 %f269, %f268, %f266, %p61;
|
||
selp.f32 %f270, %f267, %f264, %p61;
|
||
add.ftz.f32 %f271, %f270, 0fBF800000;
|
||
add.ftz.f32 %f257, %f270, 0f3F800000;
|
||
// begin inline asm
|
||
rcp.approx.ftz.f32 %f256,%f257;
|
||
// end inline asm
|
||
add.ftz.f32 %f272, %f271, %f271;
|
||
mul.ftz.f32 %f273, %f256, %f272;
|
||
mul.ftz.f32 %f274, %f273, %f273;
|
||
mov.f32 %f275, 0f3C4CAF63;
|
||
mov.f32 %f276, 0f3B18F0FE;
|
||
fma.rn.ftz.f32 %f277, %f276, %f274, %f275;
|
||
mov.f32 %f278, 0f3DAAAABD;
|
||
fma.rn.ftz.f32 %f279, %f277, %f274, %f278;
|
||
mul.rn.ftz.f32 %f280, %f279, %f274;
|
||
mul.rn.ftz.f32 %f281, %f280, %f273;
|
||
sub.ftz.f32 %f282, %f271, %f273;
|
||
add.ftz.f32 %f283, %f282, %f282;
|
||
neg.ftz.f32 %f284, %f273;
|
||
fma.rn.ftz.f32 %f285, %f284, %f271, %f283;
|
||
mul.rn.ftz.f32 %f286, %f256, %f285;
|
||
add.ftz.f32 %f287, %f281, %f273;
|
||
sub.ftz.f32 %f288, %f273, %f287;
|
||
add.ftz.f32 %f289, %f281, %f288;
|
||
add.ftz.f32 %f290, %f286, %f289;
|
||
add.ftz.f32 %f291, %f287, %f290;
|
||
sub.ftz.f32 %f292, %f287, %f291;
|
||
add.ftz.f32 %f293, %f290, %f292;
|
||
mov.f32 %f294, 0f3F317200;
|
||
mul.rn.ftz.f32 %f295, %f269, %f294;
|
||
mov.f32 %f296, 0f35BFBE8E;
|
||
mul.rn.ftz.f32 %f297, %f269, %f296;
|
||
add.ftz.f32 %f298, %f295, %f291;
|
||
sub.ftz.f32 %f299, %f295, %f298;
|
||
add.ftz.f32 %f300, %f291, %f299;
|
||
add.ftz.f32 %f301, %f293, %f300;
|
||
add.ftz.f32 %f302, %f297, %f301;
|
||
add.ftz.f32 %f303, %f298, %f302;
|
||
sub.ftz.f32 %f304, %f298, %f303;
|
||
add.ftz.f32 %f305, %f302, %f304;
|
||
abs.ftz.f32 %f47, %f262;
|
||
setp.gt.ftz.f32 %p62, %f47, 0f77F684DF;
|
||
selp.f32 %f306, 0f385CDE7E, 0f3EDCDE7E, %p62;
|
||
mul.rn.ftz.f32 %f307, %f306, %f303;
|
||
neg.ftz.f32 %f308, %f307;
|
||
fma.rn.ftz.f32 %f309, %f306, %f303, %f308;
|
||
fma.rn.ftz.f32 %f310, %f306, %f305, %f309;
|
||
mov.f32 %f311, 0f00000000;
|
||
fma.rn.ftz.f32 %f312, %f311, %f303, %f310;
|
||
add.rn.ftz.f32 %f313, %f307, %f312;
|
||
neg.ftz.f32 %f314, %f313;
|
||
add.rn.ftz.f32 %f315, %f307, %f314;
|
||
add.rn.ftz.f32 %f316, %f315, %f312;
|
||
mov.b32 %r50, %f313;
|
||
setp.eq.s32 %p63, %r50, 1118925336;
|
||
add.s32 %r51, %r50, -1;
|
||
mov.b32 %f317, %r51;
|
||
add.ftz.f32 %f318, %f316, 0f37000000;
|
||
selp.f32 %f48, %f318, %f316, %p63;
|
||
selp.f32 %f319, %f317, %f313, %p63;
|
||
mov.f32 %f320, 0f3FB8AA3B;
|
||
mul.rn.ftz.f32 %f321, %f319, %f320;
|
||
cvt.rzi.f32.f32 %f322, %f321;
|
||
abs.ftz.f32 %f323, %f322;
|
||
setp.gt.ftz.f32 %p64, %f323, 0f42FC0000;
|
||
mov.b32 %r52, %f322;
|
||
and.b32 %r53, %r52, -2147483648;
|
||
or.b32 %r54, %r53, 1123811328;
|
||
mov.b32 %f324, %r54;
|
||
selp.f32 %f325, %f324, %f322, %p64;
|
||
mov.f32 %f326, 0fBF317218;
|
||
fma.rn.ftz.f32 %f327, %f325, %f326, %f319;
|
||
mov.f32 %f328, 0f3102E308;
|
||
fma.rn.ftz.f32 %f329, %f325, %f328, %f327;
|
||
mul.ftz.f32 %f330, %f329, 0f3FB8AA3B;
|
||
add.ftz.f32 %f331, %f325, 0f4B40007F;
|
||
mov.b32 %r55, %f331;
|
||
shl.b32 %r56, %r55, 23;
|
||
mov.b32 %f332, %r56;
|
||
ex2.approx.ftz.f32 %f333, %f330;
|
||
mul.ftz.f32 %f49, %f333, %f332;
|
||
setp.eq.ftz.f32 %p65, %f49, 0f7F800000;
|
||
mov.f32 %f354, 0f7F800000;
|
||
@%p65 bra $L__BB1_45;
|
||
|
||
fma.rn.ftz.f32 %f354, %f49, %f48, %f49;
|
||
|
||
$L__BB1_45:
|
||
setp.lt.ftz.f32 %p66, %f358, 0f00000000;
|
||
setp.eq.ftz.f32 %p67, %f45, 0f3F800000;
|
||
and.pred %p3, %p66, %p67;
|
||
setp.eq.ftz.f32 %p68, %f358, 0f00000000;
|
||
@%p68 bra $L__BB1_49;
|
||
bra.uni $L__BB1_46;
|
||
|
||
$L__BB1_49:
|
||
add.ftz.f32 %f338, %f358, %f358;
|
||
selp.f32 %f354, %f338, 0f00000000, %p67;
|
||
bra.uni $L__BB1_50;
|
||
|
||
$L__BB1_46:
|
||
setp.geu.ftz.f32 %p69, %f358, 0f00000000;
|
||
@%p69 bra $L__BB1_50;
|
||
|
||
mov.f32 %f334, 0f3EDCDE7E;
|
||
cvt.rzi.f32.f32 %f335, %f334;
|
||
setp.eq.ftz.f32 %p70, %f335, 0f3EDCDE7E;
|
||
mov.b32 %r57, %f354;
|
||
xor.b32 %r58, %r57, -2147483648;
|
||
mov.b32 %f336, %r58;
|
||
selp.f32 %f354, %f336, %f354, %p3;
|
||
@%p70 bra $L__BB1_50;
|
||
|
||
mov.f32 %f354, 0f7FFFFFFF;
|
||
|
||
$L__BB1_50:
|
||
add.ftz.f32 %f339, %f46, %f47;
|
||
mov.b32 %r59, %f339;
|
||
setp.lt.s32 %p72, %r59, 2139095040;
|
||
@%p72 bra $L__BB1_57;
|
||
|
||
setp.gtu.ftz.f32 %p73, %f46, 0f7F800000;
|
||
setp.gtu.ftz.f32 %p74, %f47, 0f7F800000;
|
||
or.pred %p75, %p73, %p74;
|
||
@%p75 bra $L__BB1_56;
|
||
bra.uni $L__BB1_52;
|
||
|
||
$L__BB1_56:
|
||
add.ftz.f32 %f354, %f358, 0f3EDCDE7E;
|
||
bra.uni $L__BB1_57;
|
||
|
||
$L__BB1_52:
|
||
setp.eq.ftz.f32 %p76, %f47, 0f7F800000;
|
||
@%p76 bra $L__BB1_55;
|
||
bra.uni $L__BB1_53;
|
||
|
||
$L__BB1_55:
|
||
setp.gt.ftz.f32 %p78, %f46, 0f3F800000;
|
||
selp.f32 %f340, 0f7F800000, 0f00000000, %p78;
|
||
setp.eq.ftz.f32 %p79, %f358, 0fBF800000;
|
||
selp.f32 %f354, 0f3F800000, %f340, %p79;
|
||
bra.uni $L__BB1_57;
|
||
|
||
$L__BB1_53:
|
||
setp.neu.ftz.f32 %p77, %f46, 0f7F800000;
|
||
@%p77 bra $L__BB1_57;
|
||
|
||
selp.f32 %f354, 0fFF800000, 0f7F800000, %p3;
|
||
|
||
$L__BB1_57:
|
||
fma.rn.ftz.f32 %f341, %f354, 0f3FD2ADE4, 0f3B40C470;
|
||
setp.eq.ftz.f32 %p80, %f358, 0f3F800000;
|
||
selp.f32 %f355, 0f3FD30E46, %f341, %p80;
|
||
|
||
$L__BB1_59:
|
||
mul.ftz.f32 %f356, %f347, 0f3F04E259;
|
||
mul.ftz.f32 %f357, %f351, 0f3F04E259;
|
||
mul.ftz.f32 %f358, %f355, 0f3F04E259;
|
||
|
||
$L__BB1_60:
|
||
shl.b64 %rd6, %rd1, 4;
|
||
add.s64 %rd7, %rd3, %rd6;
|
||
st.global.v4.f32 [%rd7], {%f356, %f357, %f358, %f4};
|
||
|
||
$L__BB1_61:
|
||
ret;
|
||
|
||
}
|
||
|
||
|