//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
// NOTE(review): this is compiler output. Any functional change should be made
// in the kernel source and recompiled rather than edited here.
//

.version 8.5
.target sm_86, texmode_independent
.address_size 64

	// .globl	BackwardPUE

//-----------------------------------------------------------------------------
// BackwardPUE(param_0, param_1, param_2)
//
//   param_0  .u64 global pointer, 16-byte aligned. One v4.f32 is read per
//            work-item through the non-coherent path (ld.global.nc).
//   param_1  .u64 global pointer, 16-byte aligned. One v4.f32 is written per
//            work-item.
//   param_2  .u32 flag. When it is 0 the kernel returns immediately and
//            stores nothing.
//            NOTE(review): ForwardPUE stores the sanitised input in that case;
//            confirm the asymmetry is intended in the kernel source.
//
// Indexing
//   x = ntid.x * ctaid.x + tid.x + envreg3
//   y = ntid.y * ctaid.y + tid.y + envreg4
//   (envreg3/envreg4 are presumably the OpenCL global work offsets - TODO
//   confirm against the driver's envreg layout.)
//   Work-items with x > 2047 or y > 2047 (signed compare) exit early.
//   Element index = y * 2048 + x, byte offset = index * 16.
//   NOTE(review): the 2048 extent is hard-coded and only the upper bound is
//   checked.
//
// Per-element computation
//   If any of the four loaded components is +/-Inf or NaN, all four are
//   replaced by 0. Then, for each of the first three components c, with
//   t = c * 0f3FF69738 (~1.9265):
//     t not above 0f3B123EA5 (~0.0022315):
//         r = c * 0f3AB2B981 (~0.0013636)
//     t not above 0f3EBDF067 (~0.37097):
//         r = b ^ 0f40145BFF (~2.3181), b = c * 0f3F95D17C (~1.1705)
//                                          + 0fBAEA3C23 (~-0.0017871)
//         (the long instruction run looks like the compiler's inlined pow()
//         expansion with its special-case handling)
//     otherwise:
//         r = ex2((c * 0f411FFF18 (~9.9998) + 0fC0A5DF51 (~-5.1835))
//                 * 0f3FB8AA3B (~log2 e)) + 0fBBCD22E0 (~-0.0062603)
//   The three results and the fourth component are clamped with
//   max(., 0) / min(., 0f7F7FFFFF) and stored as one v4.f32.
//-----------------------------------------------------------------------------
.entry BackwardPUE(
	.param .u64 .ptr .global .align 16 BackwardPUE_param_0,
	.param .u64 .ptr .global .align 16 BackwardPUE_param_1,
	.param .u32 BackwardPUE_param_2
)
{
	.reg .pred %p<81>;
	.reg .f32 %f<366>;
	.reg .b32 %r<60>;
	.reg .b64 %rd<9>;

	ld.param.u64 %rd2, [BackwardPUE_param_0];
	ld.param.u32 %r3, [BackwardPUE_param_2];
	// Flag == 0: leave without touching the output.
	setp.eq.s32 %p4, %r3, 0;
	@%p4 bra $L__BB0_60;
	// %r1 = x index, %r2 = y index (see header).
	mov.b32 %r4, %envreg3;
	mov.u32 %r5, %ctaid.x;
	mov.u32 %r6, %ntid.x;
	mov.u32 %r7, %tid.x;
	add.s32 %r8, %r7, %r4;
	mad.lo.s32 %r1, %r6, %r5, %r8;
	mov.u32 %r9, %ctaid.y;
	mov.u32 %r10, %ntid.y;
	mov.u32 %r11, %tid.y;
	mov.b32 %r12, %envreg4;
	add.s32 %r13, %r11, %r12;
	mad.lo.s32 %r2, %r10, %r9, %r13;
	// Bounds guard: exit when x > 2047 or y > 2047.
	setp.gt.s32 %p5, %r1, 2047;
	setp.gt.s32 %p6, %r2, 2047;
	or.pred %p7, %p5, %p6;
	@%p7 bra $L__BB0_60;
	// %r15 = y * 2048 + x; %rd1 keeps it for the store address.
	shl.b32 %r14, %r2, 11;
	add.s32 %r15, %r14, %r1;
	cvt.s64.s32 %rd1, %r15;
	mul.wide.s32 %rd4, %r15, 16;
	add.s64 %rd5, %rd2, %rd4;
	ld.global.nc.v4.f32 {%f68, %f69, %f70, %f71}, [%rd5];
	// Sanitise: |v| >= +Inf (or unordered, i.e. NaN) on any component makes
	// %r17 = -1, %p14 false, and all four components become 0.
	abs.ftz.f32 %f73, %f68;
	abs.ftz.f32 %f75, %f69;
	abs.ftz.f32 %f77, %f70;
	abs.ftz.f32 %f79, %f71;
	setp.geu.ftz.f32 %p8, %f77, 0f7F800000;
	setp.geu.ftz.f32 %p9, %f79, 0f7F800000;
	setp.geu.ftz.f32 %p10, %f75, 0f7F800000;
	setp.geu.ftz.f32 %p11, %f73, 0f7F800000;
	or.pred %p12, %p11, %p10;
	selp.b32 %r16, -1, 0, %p12;
	or.pred %p13, %p9, %p8;
	selp.b32 %r17, -1, %r16, %p13;
	setp.gt.s32 %p14, %r17, -1;
	selp.f32 %f4, %f71, 0f00000000, %p14;
	selp.f32 %f3, %f70, 0f00000000, %p14;
	selp.f32 %f2, %f69, 0f00000000, %p14;
	selp.f32 %f1, %f68, 0f00000000, %p14;
	// Scaled copies (~1.9265 * c) are used only for the segment tests.
	mul.ftz.f32 %f6, %f1, 0f3FF69738;
	mul.ftz.f32 %f8, %f2, 0f3FF69738;
	mul.ftz.f32 %f10, %f3, 0f3FF69738;

	// ---- component 0 (%f1 -> %f357) ----
	setp.gtu.ftz.f32 %p15, %f6, 0f3B123EA5;
	@%p15 bra $L__BB0_4;
	bra.uni $L__BB0_3;

$L__BB0_4:
	setp.gtu.ftz.f32 %p16, %f6, 0f3EBDF067;
	@%p16 bra $L__BB0_20;
	bra.uni $L__BB0_5;

$L__BB0_20:
	// Upper segment: ex2(log2e * (a*c + b)) + offset.
	fma.rn.ftz.f32 %f165, %f1, 0f411FFF18, 0fC0A5DF51;
	mul.ftz.f32 %f166, %f165, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f167, %f166;
	add.ftz.f32 %f357, %f167, 0fBBCD22E0;
	bra.uni $L__BB0_21;

$L__BB0_3:
	// Lower segment: plain scale.
	mul.ftz.f32 %f357, %f1, 0f3AB2B981;
	bra.uni $L__BB0_21;

$L__BB0_5:
	// Middle segment: %f12 = base; exponent constant is 0f40145BFF.
	fma.rn.ftz.f32 %f12, %f1, 0f3F95D17C, 0fBAEA3C23;
	// %f13 = |y - 2*trunc(y/2)| on constants; compared with 1.0 further down
	// (looks like the "exponent is an odd integer" test of pow()).
	mov.f32 %f83, 0f3F945BFF;
	cvt.rzi.f32.f32 %f84, %f83;
	add.ftz.f32 %f85, %f84, %f84;
	mov.f32 %f86, 0f40145BFF;
	sub.ftz.f32 %f87, %f86, %f85;
	abs.ftz.f32 %f13, %f87;
	// Split |base| into mantissa %f88 in [1,2) and exponent %f90 = e - 127.
	abs.ftz.f32 %f14, %f12;
	mov.b32 %r18, %f14;
	and.b32 %r19, %r18, 8388607;
	or.b32 %r20, %r19, 1065353216;
	mov.b32 %f88, %r20;
	shr.u32 %r21, %r18, 23;
	cvt.rn.f32.u32 %f89, %r21;
	add.ftz.f32 %f90, %f89, 0fC2FE0000;
	// Mantissa above 0f3FB504F3 (~sqrt 2): halve it and bump the exponent.
	setp.gt.ftz.f32 %p17, %f88, 0f3FB504F3;
	mul.ftz.f32 %f91, %f88, 0f3F000000;
	add.ftz.f32 %f92, %f90, 0f3F800000;
	selp.f32 %f93, %f92, %f90, %p17;
	selp.f32 %f94, %f91, %f88, %p17;
	// %f97 = 2*(m - 1)/(m + 1) using the approximate reciprocal.
	add.ftz.f32 %f95, %f94, 0fBF800000;
	add.ftz.f32 %f81, %f94, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f80,%f81;
	// end inline asm
	add.ftz.f32 %f96, %f95, %f95;
	mul.ftz.f32 %f97, %f80, %f96;
	mul.ftz.f32 %f98, %f97, %f97;
	// Polynomial in %f98 plus correction terms; the add/sub pairs appear to
	// carry a high/low (compensated) representation of the logarithm.
	mov.f32 %f99, 0f3C4CAF63;
	mov.f32 %f100, 0f3B18F0FE;
	fma.rn.ftz.f32 %f101, %f100, %f98, %f99;
	mov.f32 %f102, 0f3DAAAABD;
	fma.rn.ftz.f32 %f103, %f101, %f98, %f102;
	mul.rn.ftz.f32 %f104, %f103, %f98;
	mul.rn.ftz.f32 %f105, %f104, %f97;
	sub.ftz.f32 %f106, %f95, %f97;
	add.ftz.f32 %f107, %f106, %f106;
	neg.ftz.f32 %f108, %f97;
	fma.rn.ftz.f32 %f109, %f108, %f95, %f107;
	mul.rn.ftz.f32 %f110, %f80, %f109;
	add.ftz.f32 %f111, %f105, %f97;
	sub.ftz.f32 %f112, %f97, %f111;
	add.ftz.f32 %f113, %f105, %f112;
	add.ftz.f32 %f114, %f110, %f113;
	add.ftz.f32 %f115, %f111, %f114;
	sub.ftz.f32 %f116, %f111, %f115;
	add.ftz.f32 %f117, %f114, %f116;
	// Exponent contribution: %f93 times a hi/lo constant pair summing to ~ln 2.
	mov.f32 %f118, 0f3F317200;
	mul.rn.ftz.f32 %f119, %f93, %f118;
	mov.f32 %f120, 0f35BFBE8E;
	mul.rn.ftz.f32 %f121, %f93, %f120;
	add.ftz.f32 %f122, %f119, %f115;
	sub.ftz.f32 %f123, %f119, %f122;
	add.ftz.f32 %f124, %f115, %f123;
	add.ftz.f32 %f125, %f117, %f124;
	add.ftz.f32 %f126, %f121, %f125;
	add.ftz.f32 %f127, %f122, %f126;
	sub.ftz.f32 %f128, %f122, %f127;
	add.ftz.f32 %f129, %f126, %f128;
	// Multiply the (hi %f127, lo %f129) logarithm by the exponent constant.
	// Both operands of the |y| test are compile-time constants here.
	abs.ftz.f32 %f15, %f86;
	setp.gt.ftz.f32 %p18, %f15, 0f77F684DF;
	selp.f32 %f130, 0f39945BFF, 0f40145BFF, %p18;
	mul.rn.ftz.f32 %f131, %f130, %f127;
	neg.ftz.f32 %f132, %f131;
	fma.rn.ftz.f32 %f133, %f130, %f127, %f132;
	fma.rn.ftz.f32 %f134, %f130, %f129, %f133;
	mov.f32 %f135, 0f00000000;
	fma.rn.ftz.f32 %f136, %f135, %f127, %f134;
	add.rn.ftz.f32 %f137, %f131, %f136;
	neg.ftz.f32 %f138, %f137;
	add.rn.ftz.f32 %f139, %f131, %f138;
	add.rn.ftz.f32 %f140, %f139, %f136;
	// Special-case one exact bit pattern of the product (1118925336): step it
	// down by one ulp and compensate in the low part.
	mov.b32 %r22, %f137;
	setp.eq.s32 %p19, %r22, 1118925336;
	add.s32 %r23, %r22, -1;
	mov.b32 %f141, %r23;
	add.ftz.f32 %f142, %f140, 0f37000000;
	selp.f32 %f16, %f142, %f140, %p19;
	selp.f32 %f143, %f141, %f137, %p19;
	// Exponential of %f143: n = trunc(x * log2e) limited to +/-126
	// (0f42FC0000), remainder reduced with a -ln2 hi/lo pair, ex2.approx on
	// the remainder, then scaled by a power of two built by shifting
	// (n + 127) into the exponent field.
	mov.f32 %f144, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f145, %f143, %f144;
	cvt.rzi.f32.f32 %f146, %f145;
	abs.ftz.f32 %f147, %f146;
	setp.gt.ftz.f32 %p20, %f147, 0f42FC0000;
	mov.b32 %r24, %f146;
	and.b32 %r25, %r24, -2147483648;
	or.b32 %r26, %r25, 1123811328;
	mov.b32 %f148, %r26;
	selp.f32 %f149, %f148, %f146, %p20;
	mov.f32 %f150, 0fBF317218;
	fma.rn.ftz.f32 %f151, %f149, %f150, %f143;
	mov.f32 %f152, 0f3102E308;
	fma.rn.ftz.f32 %f153, %f149, %f152, %f151;
	mul.ftz.f32 %f154, %f153, 0f3FB8AA3B;
	add.ftz.f32 %f155, %f149, 0f4B40007F;
	mov.b32 %r27, %f155;
	shl.b32 %r28, %r27, 23;
	mov.b32 %f156, %r28;
	ex2.approx.ftz.f32 %f157, %f154;
	mul.ftz.f32 %f17, %f157, %f156;
	// Result is +Inf if the exponential overflowed, else apply the low-part
	// correction: %f17 * (1 + %f16).
	setp.eq.ftz.f32 %p21, %f17, 0f7F800000;
	mov.f32 %f356, 0f7F800000;
	@%p21 bra $L__BB0_7;
	fma.rn.ftz.f32 %f356, %f17, %f16, %f17;

$L__BB0_7:
	// Special cases on the base. %p1 = base < 0 and odd-integer-exponent test.
	setp.lt.ftz.f32 %p22, %f12, 0f00000000;
	setp.eq.ftz.f32 %p23, %f13, 0f3F800000;
	and.pred %p1, %p22, %p23;
	setp.eq.ftz.f32 %p24, %f12, 0f00000000;
	@%p24 bra $L__BB0_11;
	bra.uni $L__BB0_8;

$L__BB0_11:
	// base == 0.
	add.ftz.f32 %f162, %f12, %f12;
	selp.f32 %f356, %f162, 0f00000000, %p23;
	bra.uni $L__BB0_12;

$L__BB0_8:
	// base < 0: flip the sign when %p1 holds; if the exponent constant is not
	// equal to its own truncation the result becomes 0f7FFFFFFF (a NaN bit
	// pattern).
	setp.geu.ftz.f32 %p25, %f12, 0f00000000;
	@%p25 bra $L__BB0_12;
	mov.f32 %f158, 0f40145BFF;
	cvt.rzi.f32.f32 %f159, %f158;
	setp.eq.ftz.f32 %p26, %f159, 0f40145BFF;
	mov.b32 %r29, %f356;
	xor.b32 %r30, %r29, -2147483648;
	mov.b32 %f160, %r30;
	selp.f32 %f356, %f160, %f356, %p1;
	@%p26 bra $L__BB0_12;
	mov.f32 %f356, 0f7FFFFFFF;

$L__BB0_12:
	// Inf/NaN operands: skipped when the bits of |base| + |y| are below
	// 0x7F800000 (2139095040).
	add.ftz.f32 %f163, %f14, %f15;
	mov.b32 %r31, %f163;
	setp.lt.s32 %p28, %r31, 2139095040;
	@%p28 bra $L__BB0_19;
	setp.gtu.ftz.f32 %p29, %f14, 0f7F800000;
	setp.gtu.ftz.f32 %p30, %f15, 0f7F800000;
	or.pred %p31, %p29, %p30;
	@%p31 bra $L__BB0_18;
	bra.uni $L__BB0_14;

$L__BB0_18:
	add.ftz.f32 %f356, %f12, 0f40145BFF;
	bra.uni $L__BB0_19;

$L__BB0_14:
	setp.eq.ftz.f32 %p32, %f15, 0f7F800000;
	@%p32 bra $L__BB0_17;
	bra.uni $L__BB0_15;

$L__BB0_17:
	setp.gt.ftz.f32 %p34, %f14, 0f3F800000;
	selp.f32 %f164, 0f7F800000, 0f00000000, %p34;
	setp.eq.ftz.f32 %p35, %f12, 0fBF800000;
	selp.f32 %f356, 0f3F800000, %f164, %p35;
	bra.uni $L__BB0_19;

$L__BB0_15:
	setp.neu.ftz.f32 %p33, %f14, 0f7F800000;
	@%p33 bra $L__BB0_19;
	selp.f32 %f356, 0fFF800000, 0f7F800000, %p1;

$L__BB0_19:
	// base == 1 yields exactly 1.
	setp.eq.ftz.f32 %p36, %f12, 0f3F800000;
	selp.f32 %f357, 0f3F800000, %f356, %p36;

$L__BB0_21:
	// ---- component 1 (%f2 -> %f361): same sequence as component 0 ----
	setp.gtu.ftz.f32 %p37, %f8, 0f3B123EA5;
	@%p37 bra $L__BB0_23;
	bra.uni $L__BB0_22;

$L__BB0_23:
	setp.gtu.ftz.f32 %p38, %f8, 0f3EBDF067;
	@%p38 bra $L__BB0_39;
	bra.uni $L__BB0_24;

$L__BB0_39:
	fma.rn.ftz.f32 %f253, %f2, 0f411FFF18, 0fC0A5DF51;
	mul.ftz.f32 %f254, %f253, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f255, %f254;
	add.ftz.f32 %f361, %f255, 0fBBCD22E0;
	bra.uni $L__BB0_40;

$L__BB0_22:
	mul.ftz.f32 %f361, %f2, 0f3AB2B981;
	bra.uni $L__BB0_40;

$L__BB0_24:
	// Middle segment for component 1; %f31 = base.
	fma.rn.ftz.f32 %f31, %f2, 0f3F95D17C, 0fBAEA3C23;
	mov.f32 %f171, 0f3F945BFF;
	cvt.rzi.f32.f32 %f172, %f171;
	add.ftz.f32 %f173, %f172, %f172;
	mov.f32 %f174, 0f40145BFF;
	sub.ftz.f32 %f175, %f174, %f173;
	abs.ftz.f32 %f32, %f175;
	abs.ftz.f32 %f33, %f31;
	mov.b32 %r32, %f33;
	and.b32 %r33, %r32, 8388607;
	or.b32 %r34, %r33, 1065353216;
	mov.b32 %f176, %r34;
	shr.u32 %r35, %r32, 23;
	cvt.rn.f32.u32 %f177, %r35;
	add.ftz.f32 %f178, %f177, 0fC2FE0000;
	setp.gt.ftz.f32 %p39, %f176, 0f3FB504F3;
	mul.ftz.f32 %f179, %f176, 0f3F000000;
	add.ftz.f32 %f180, %f178, 0f3F800000;
	selp.f32 %f181, %f180, %f178, %p39;
	selp.f32 %f182, %f179, %f176, %p39;
	add.ftz.f32 %f183, %f182, 0fBF800000;
	add.ftz.f32 %f169, %f182, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f168,%f169;
	// end inline asm
	add.ftz.f32 %f184, %f183, %f183;
	mul.ftz.f32 %f185, %f168, %f184;
	mul.ftz.f32 %f186, %f185, %f185;
	mov.f32 %f187, 0f3C4CAF63;
	mov.f32 %f188, 0f3B18F0FE;
	fma.rn.ftz.f32 %f189, %f188, %f186, %f187;
	mov.f32 %f190, 0f3DAAAABD;
	fma.rn.ftz.f32 %f191, %f189, %f186, %f190;
	mul.rn.ftz.f32 %f192, %f191, %f186;
	mul.rn.ftz.f32 %f193, %f192, %f185;
	sub.ftz.f32 %f194, %f183, %f185;
	add.ftz.f32 %f195, %f194, %f194;
	neg.ftz.f32 %f196, %f185;
	fma.rn.ftz.f32 %f197, %f196, %f183, %f195;
	mul.rn.ftz.f32 %f198, %f168, %f197;
	add.ftz.f32 %f199, %f193, %f185;
	sub.ftz.f32 %f200, %f185, %f199;
	add.ftz.f32 %f201, %f193, %f200;
	add.ftz.f32 %f202, %f198, %f201;
	add.ftz.f32 %f203, %f199, %f202;
	sub.ftz.f32 %f204, %f199, %f203;
	add.ftz.f32 %f205, %f202, %f204;
	mov.f32 %f206, 0f3F317200;
	mul.rn.ftz.f32 %f207, %f181, %f206;
	mov.f32 %f208, 0f35BFBE8E;
	mul.rn.ftz.f32 %f209, %f181, %f208;
	add.ftz.f32 %f210, %f207, %f203;
	sub.ftz.f32 %f211, %f207, %f210;
	add.ftz.f32 %f212, %f203, %f211;
	add.ftz.f32 %f213, %f205, %f212;
	add.ftz.f32 %f214, %f209, %f213;
	add.ftz.f32 %f215, %f210, %f214;
	sub.ftz.f32 %f216, %f210, %f215;
	add.ftz.f32 %f217, %f214, %f216;
	abs.ftz.f32 %f34, %f174;
	setp.gt.ftz.f32 %p40, %f34, 0f77F684DF;
	selp.f32 %f218, 0f39945BFF, 0f40145BFF, %p40;
	mul.rn.ftz.f32 %f219, %f218, %f215;
	neg.ftz.f32 %f220, %f219;
	fma.rn.ftz.f32 %f221, %f218, %f215, %f220;
	fma.rn.ftz.f32 %f222, %f218, %f217, %f221;
	mov.f32 %f223, 0f00000000;
	fma.rn.ftz.f32 %f224, %f223, %f215, %f222;
	add.rn.ftz.f32 %f225, %f219, %f224;
	neg.ftz.f32 %f226, %f225;
	add.rn.ftz.f32 %f227, %f219, %f226;
	add.rn.ftz.f32 %f228, %f227, %f224;
	mov.b32 %r36, %f225;
	setp.eq.s32 %p41, %r36, 1118925336;
	add.s32 %r37, %r36, -1;
	mov.b32 %f229, %r37;
	add.ftz.f32 %f230, %f228, 0f37000000;
	selp.f32 %f35, %f230, %f228, %p41;
	selp.f32 %f231, %f229, %f225, %p41;
	mov.f32 %f232, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f233, %f231, %f232;
	cvt.rzi.f32.f32 %f234, %f233;
	abs.ftz.f32 %f235, %f234;
	setp.gt.ftz.f32 %p42, %f235, 0f42FC0000;
	mov.b32 %r38, %f234;
	and.b32 %r39, %r38, -2147483648;
	or.b32 %r40, %r39, 1123811328;
	mov.b32 %f236, %r40;
	selp.f32 %f237, %f236, %f234, %p42;
	mov.f32 %f238, 0fBF317218;
	fma.rn.ftz.f32 %f239, %f237, %f238, %f231;
	mov.f32 %f240, 0f3102E308;
	fma.rn.ftz.f32 %f241, %f237, %f240, %f239;
	mul.ftz.f32 %f242, %f241, 0f3FB8AA3B;
	add.ftz.f32 %f243, %f237, 0f4B40007F;
	mov.b32 %r41, %f243;
	shl.b32 %r42, %r41, 23;
	mov.b32 %f244, %r42;
	ex2.approx.ftz.f32 %f245, %f242;
	mul.ftz.f32 %f36, %f245, %f244;
	setp.eq.ftz.f32 %p43, %f36, 0f7F800000;
	mov.f32 %f360, 0f7F800000;
	@%p43 bra $L__BB0_26;
	fma.rn.ftz.f32 %f360, %f36, %f35, %f36;

$L__BB0_26:
	setp.lt.ftz.f32 %p44, %f31, 0f00000000;
	setp.eq.ftz.f32 %p45, %f32, 0f3F800000;
	and.pred %p2, %p44, %p45;
	setp.eq.ftz.f32 %p46, %f31, 0f00000000;
	@%p46 bra $L__BB0_30;
	bra.uni $L__BB0_27;

$L__BB0_30:
	add.ftz.f32 %f250, %f31, %f31;
	selp.f32 %f360, %f250, 0f00000000, %p45;
	bra.uni $L__BB0_31;

$L__BB0_27:
	setp.geu.ftz.f32 %p47, %f31, 0f00000000;
	@%p47 bra $L__BB0_31;
	mov.f32 %f246, 0f40145BFF;
	cvt.rzi.f32.f32 %f247, %f246;
	setp.eq.ftz.f32 %p48, %f247, 0f40145BFF;
	mov.b32 %r43, %f360;
	xor.b32 %r44, %r43, -2147483648;
	mov.b32 %f248, %r44;
	selp.f32 %f360, %f248, %f360, %p2;
	@%p48 bra $L__BB0_31;
	mov.f32 %f360, 0f7FFFFFFF;

$L__BB0_31:
	add.ftz.f32 %f251, %f33, %f34;
	mov.b32 %r45, %f251;
	setp.lt.s32 %p50, %r45, 2139095040;
	@%p50 bra $L__BB0_38;
	setp.gtu.ftz.f32 %p51, %f33, 0f7F800000;
	setp.gtu.ftz.f32 %p52, %f34, 0f7F800000;
	or.pred %p53, %p51, %p52;
	@%p53 bra $L__BB0_37;
	bra.uni $L__BB0_33;

$L__BB0_37:
	add.ftz.f32 %f360, %f31, 0f40145BFF;
	bra.uni $L__BB0_38;

$L__BB0_33:
	setp.eq.ftz.f32 %p54, %f34, 0f7F800000;
	@%p54 bra $L__BB0_36;
	bra.uni $L__BB0_34;

$L__BB0_36:
	setp.gt.ftz.f32 %p56, %f33, 0f3F800000;
	selp.f32 %f252, 0f7F800000, 0f00000000, %p56;
	setp.eq.ftz.f32 %p57, %f31, 0fBF800000;
	selp.f32 %f360, 0f3F800000, %f252, %p57;
	bra.uni $L__BB0_38;

$L__BB0_34:
	setp.neu.ftz.f32 %p55, %f33, 0f7F800000;
	@%p55 bra $L__BB0_38;
	selp.f32 %f360, 0fFF800000, 0f7F800000, %p2;

$L__BB0_38:
	setp.eq.ftz.f32 %p58, %f31, 0f3F800000;
	selp.f32 %f361, 0f3F800000, %f360, %p58;

$L__BB0_40:
	// ---- component 2 (%f3 -> %f365): same sequence as component 0 ----
	setp.gtu.ftz.f32 %p59, %f10, 0f3B123EA5;
	@%p59 bra $L__BB0_42;
	bra.uni $L__BB0_41;

$L__BB0_42:
	setp.gtu.ftz.f32 %p60, %f10, 0f3EBDF067;
	@%p60 bra $L__BB0_58;
	bra.uni $L__BB0_43;

$L__BB0_58:
	fma.rn.ftz.f32 %f341, %f3, 0f411FFF18, 0fC0A5DF51;
	mul.ftz.f32 %f342, %f341, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f343, %f342;
	add.ftz.f32 %f365, %f343, 0fBBCD22E0;
	bra.uni $L__BB0_59;

$L__BB0_41:
	mul.ftz.f32 %f365, %f3, 0f3AB2B981;
	bra.uni $L__BB0_59;

$L__BB0_43:
	// Middle segment for component 2; %f50 = base.
	fma.rn.ftz.f32 %f50, %f3, 0f3F95D17C, 0fBAEA3C23;
	mov.f32 %f259, 0f3F945BFF;
	cvt.rzi.f32.f32 %f260, %f259;
	add.ftz.f32 %f261, %f260, %f260;
	mov.f32 %f262, 0f40145BFF;
	sub.ftz.f32 %f263, %f262, %f261;
	abs.ftz.f32 %f51, %f263;
	abs.ftz.f32 %f52, %f50;
	mov.b32 %r46, %f52;
	and.b32 %r47, %r46, 8388607;
	or.b32 %r48, %r47, 1065353216;
	mov.b32 %f264, %r48;
	shr.u32 %r49, %r46, 23;
	cvt.rn.f32.u32 %f265, %r49;
	add.ftz.f32 %f266, %f265, 0fC2FE0000;
	setp.gt.ftz.f32 %p61, %f264, 0f3FB504F3;
	mul.ftz.f32 %f267, %f264, 0f3F000000;
	add.ftz.f32 %f268, %f266, 0f3F800000;
	selp.f32 %f269, %f268, %f266, %p61;
	selp.f32 %f270, %f267, %f264, %p61;
	add.ftz.f32 %f271, %f270, 0fBF800000;
	add.ftz.f32 %f257, %f270, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f256,%f257;
	// end inline asm
	add.ftz.f32 %f272, %f271, %f271;
	mul.ftz.f32 %f273, %f256, %f272;
	mul.ftz.f32 %f274, %f273, %f273;
	mov.f32 %f275, 0f3C4CAF63;
	mov.f32 %f276, 0f3B18F0FE;
	fma.rn.ftz.f32 %f277, %f276, %f274, %f275;
	mov.f32 %f278, 0f3DAAAABD;
	fma.rn.ftz.f32 %f279, %f277, %f274, %f278;
	mul.rn.ftz.f32 %f280, %f279, %f274;
	mul.rn.ftz.f32 %f281, %f280, %f273;
	sub.ftz.f32 %f282, %f271, %f273;
	add.ftz.f32 %f283, %f282, %f282;
	neg.ftz.f32 %f284, %f273;
	fma.rn.ftz.f32 %f285, %f284, %f271, %f283;
	mul.rn.ftz.f32 %f286, %f256, %f285;
	add.ftz.f32 %f287, %f281, %f273;
	sub.ftz.f32 %f288, %f273, %f287;
	add.ftz.f32 %f289, %f281, %f288;
	add.ftz.f32 %f290, %f286, %f289;
	add.ftz.f32 %f291, %f287, %f290;
	sub.ftz.f32 %f292, %f287, %f291;
	add.ftz.f32 %f293, %f290, %f292;
	mov.f32 %f294, 0f3F317200;
	mul.rn.ftz.f32 %f295, %f269, %f294;
	mov.f32 %f296, 0f35BFBE8E;
	mul.rn.ftz.f32 %f297, %f269, %f296;
	add.ftz.f32 %f298, %f295, %f291;
	sub.ftz.f32 %f299, %f295, %f298;
	add.ftz.f32 %f300, %f291, %f299;
	add.ftz.f32 %f301, %f293, %f300;
	add.ftz.f32 %f302, %f297, %f301;
	add.ftz.f32 %f303, %f298, %f302;
	sub.ftz.f32 %f304, %f298, %f303;
	add.ftz.f32 %f305, %f302, %f304;
	abs.ftz.f32 %f53, %f262;
	setp.gt.ftz.f32 %p62, %f53, 0f77F684DF;
	selp.f32 %f306, 0f39945BFF, 0f40145BFF, %p62;
	mul.rn.ftz.f32 %f307, %f306, %f303;
	neg.ftz.f32 %f308, %f307;
	fma.rn.ftz.f32 %f309, %f306, %f303, %f308;
	fma.rn.ftz.f32 %f310, %f306, %f305, %f309;
	mov.f32 %f311, 0f00000000;
	fma.rn.ftz.f32 %f312, %f311, %f303, %f310;
	add.rn.ftz.f32 %f313, %f307, %f312;
	neg.ftz.f32 %f314, %f313;
	add.rn.ftz.f32 %f315, %f307, %f314;
	add.rn.ftz.f32 %f316, %f315, %f312;
	mov.b32 %r50, %f313;
	setp.eq.s32 %p63, %r50, 1118925336;
	add.s32 %r51, %r50, -1;
	mov.b32 %f317, %r51;
	add.ftz.f32 %f318, %f316, 0f37000000;
	selp.f32 %f54, %f318, %f316, %p63;
	selp.f32 %f319, %f317, %f313, %p63;
	mov.f32 %f320, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f321, %f319, %f320;
	cvt.rzi.f32.f32 %f322, %f321;
	abs.ftz.f32 %f323, %f322;
	setp.gt.ftz.f32 %p64, %f323, 0f42FC0000;
	mov.b32 %r52, %f322;
	and.b32 %r53, %r52, -2147483648;
	or.b32 %r54, %r53, 1123811328;
	mov.b32 %f324, %r54;
	selp.f32 %f325, %f324, %f322, %p64;
	mov.f32 %f326, 0fBF317218;
	fma.rn.ftz.f32 %f327, %f325, %f326, %f319;
	mov.f32 %f328, 0f3102E308;
	fma.rn.ftz.f32 %f329, %f325, %f328, %f327;
	mul.ftz.f32 %f330, %f329, 0f3FB8AA3B;
	add.ftz.f32 %f331, %f325, 0f4B40007F;
	mov.b32 %r55, %f331;
	shl.b32 %r56, %r55, 23;
	mov.b32 %f332, %r56;
	ex2.approx.ftz.f32 %f333, %f330;
	mul.ftz.f32 %f55, %f333, %f332;
	setp.eq.ftz.f32 %p65, %f55, 0f7F800000;
	mov.f32 %f364, 0f7F800000;
	@%p65 bra $L__BB0_45;
	fma.rn.ftz.f32 %f364, %f55, %f54, %f55;

$L__BB0_45:
	setp.lt.ftz.f32 %p66, %f50, 0f00000000;
	setp.eq.ftz.f32 %p67, %f51, 0f3F800000;
	and.pred %p3, %p66, %p67;
	setp.eq.ftz.f32 %p68, %f50, 0f00000000;
	@%p68 bra $L__BB0_49;
	bra.uni $L__BB0_46;

$L__BB0_49:
	add.ftz.f32 %f338, %f50, %f50;
	selp.f32 %f364, %f338, 0f00000000, %p67;
	bra.uni $L__BB0_50;

$L__BB0_46:
	setp.geu.ftz.f32 %p69, %f50, 0f00000000;
	@%p69 bra $L__BB0_50;
	mov.f32 %f334, 0f40145BFF;
	cvt.rzi.f32.f32 %f335, %f334;
	setp.eq.ftz.f32 %p70, %f335, 0f40145BFF;
	mov.b32 %r57, %f364;
	xor.b32 %r58, %r57, -2147483648;
	mov.b32 %f336, %r58;
	selp.f32 %f364, %f336, %f364, %p3;
	@%p70 bra $L__BB0_50;
	mov.f32 %f364, 0f7FFFFFFF;

$L__BB0_50:
	add.ftz.f32 %f339, %f52, %f53;
	mov.b32 %r59, %f339;
	setp.lt.s32 %p72, %r59, 2139095040;
	@%p72 bra $L__BB0_57;
	setp.gtu.ftz.f32 %p73, %f52, 0f7F800000;
	setp.gtu.ftz.f32 %p74, %f53, 0f7F800000;
	or.pred %p75, %p73, %p74;
	@%p75 bra $L__BB0_56;
	bra.uni $L__BB0_52;

$L__BB0_56:
	add.ftz.f32 %f364, %f50, 0f40145BFF;
	bra.uni $L__BB0_57;

$L__BB0_52:
	setp.eq.ftz.f32 %p76, %f53, 0f7F800000;
	@%p76 bra $L__BB0_55;
	bra.uni $L__BB0_53;

$L__BB0_55:
	setp.gt.ftz.f32 %p78, %f52, 0f3F800000;
	selp.f32 %f340, 0f7F800000, 0f00000000, %p78;
	setp.eq.ftz.f32 %p79, %f50, 0fBF800000;
	selp.f32 %f364, 0f3F800000, %f340, %p79;
	bra.uni $L__BB0_57;

$L__BB0_53:
	setp.neu.ftz.f32 %p77, %f52, 0f7F800000;
	@%p77 bra $L__BB0_57;
	selp.f32 %f364, 0fFF800000, 0f7F800000, %p3;

$L__BB0_57:
	setp.eq.ftz.f32 %p80, %f50, 0f3F800000;
	selp.f32 %f365, 0f3F800000, %f364, %p80;

$L__BB0_59:
	// Clamp all four outputs to [0, 0f7F7FFFFF] and store them at
	// param_1 + index * 16 in component order 0, 1, 2, 3.
	ld.param.u64 %rd8, [BackwardPUE_param_1];
	mov.f32 %f344, 0f00000000;
	max.f32 %f345, %f357, %f344;
	mov.f32 %f346, 0f7F7FFFFF;
	max.f32 %f347, %f361, %f344;
	max.f32 %f348, %f365, %f344;
	max.f32 %f349, %f4, %f344;
	min.f32 %f350, %f348, %f346;
	min.f32 %f351, %f347, %f346;
	min.f32 %f352, %f345, %f346;
	min.f32 %f353, %f349, %f346;
	shl.b64 %rd6, %rd1, 4;
	add.s64 %rd7, %rd8, %rd6;
	st.global.v4.f32 [%rd7], {%f352, %f351, %f350, %f353};

$L__BB0_60:
	ret;

}
	// .globl	ForwardPUE

//-----------------------------------------------------------------------------
// ForwardPUE(param_0, param_1, param_2)
//
//   param_0  .u64 global pointer, 16-byte aligned. One v4.f32 is read per
//            work-item through the non-coherent path (ld.global.nc).
//   param_1  .u64 global pointer, 16-byte aligned. One v4.f32 is written per
//            work-item.
//   param_2  .u32 flag. When it is 0 the sanitised input is stored unchanged.
//
// Indexing and the Inf/NaN sanitisation are the same as in BackwardPUE:
// x/y > 2047 exits, element index = y * 2048 + x, and any non-finite
// component zeroes all four.
//
// Per-element computation when the flag is non-zero, for each of the first
// three components c:
//     c not above 0f35D3FDC0 (~1.5795e-6):
//         r = c * 0f44B09ACE (~1412.8)
//     c not above 0f3D03ED56 (~0.032209):
//         r = (c ^ 0f3EDCDE7E (~0.43138)) * 0f3FD2ADE4 (~1.6459)
//             + 0f3B40C470 (~0.0029414)
//         (the long instruction run looks like the compiler's inlined pow()
//         expansion with its special-case handling)
//     otherwise:
//         r = lg2(c + 0f3BCD22E0 (~0.0062603)) * 0f3E08BDF0 (~0.13354)
//             + 0f3F7FA592 (~0.99862)
//   Each r is then multiplied by 0f3F04E259 (~0.51908). The fourth component
//   is stored as loaded (after sanitisation). No clamping is applied here.
//-----------------------------------------------------------------------------
.entry ForwardPUE(
	.param .u64 .ptr .global .align 16 ForwardPUE_param_0,
	.param .u64 .ptr .global .align 16 ForwardPUE_param_1,
	.param .u32 ForwardPUE_param_2
)
{
	.reg .pred %p<81>;
	.reg .f32 %f<359>;
	.reg .b32 %r<60>;
	.reg .b64 %rd<8>;

	ld.param.u64 %rd2, [ForwardPUE_param_0];
	ld.param.u64 %rd3, [ForwardPUE_param_1];
	ld.param.u32 %r3, [ForwardPUE_param_2];
	// %r1 = x index, %r2 = y index (see header).
	mov.b32 %r4, %envreg3;
	mov.u32 %r5, %ctaid.x;
	mov.u32 %r6, %ntid.x;
	mov.u32 %r7, %tid.x;
	add.s32 %r8, %r7, %r4;
	mad.lo.s32 %r1, %r6, %r5, %r8;
	mov.u32 %r9, %ctaid.y;
	mov.u32 %r10, %ntid.y;
	mov.u32 %r11, %tid.y;
	mov.b32 %r12, %envreg4;
	add.s32 %r13, %r11, %r12;
	mad.lo.s32 %r2, %r10, %r9, %r13;
	// Bounds guard: exit when x > 2047 or y > 2047.
	setp.gt.s32 %p4, %r1, 2047;
	setp.gt.s32 %p5, %r2, 2047;
	or.pred %p6, %p4, %p5;
	@%p6 bra $L__BB1_61;
	// %r15 = y * 2048 + x; %rd1 keeps it for the store address.
	shl.b32 %r14, %r2, 11;
	add.s32 %r15, %r14, %r1;
	cvt.s64.s32 %rd1, %r15;
	mul.wide.s32 %rd4, %r15, 16;
	add.s64 %rd5, %rd2, %rd4;
	ld.global.nc.v4.f32 {%f68, %f69, %f70, %f71}, [%rd5];
	// Sanitise: any component with |v| >= +Inf or NaN zeroes all four.
	abs.ftz.f32 %f73, %f68;
	abs.ftz.f32 %f75, %f69;
	abs.ftz.f32 %f77, %f70;
	abs.ftz.f32 %f79, %f71;
	setp.geu.ftz.f32 %p7, %f77, 0f7F800000;
	setp.geu.ftz.f32 %p8, %f79, 0f7F800000;
	setp.geu.ftz.f32 %p9, %f75, 0f7F800000;
	setp.geu.ftz.f32 %p10, %f73, 0f7F800000;
	or.pred %p11, %p10, %p9;
	selp.b32 %r16, -1, 0, %p11;
	or.pred %p12, %p8, %p7;
	selp.b32 %r17, -1, %r16, %p12;
	setp.gt.s32 %p13, %r17, -1;
	selp.f32 %f4, %f71, 0f00000000, %p13;
	selp.f32 %f358, %f70, 0f00000000, %p13;
	selp.f32 %f357, %f69, 0f00000000, %p13;
	selp.f32 %f356, %f68, 0f00000000, %p13;
	// Flag == 0: skip the transform and store the sanitised values.
	setp.eq.s32 %p14, %r3, 0;
	@%p14 bra $L__BB1_60;

	// ---- component 0 (%f356 -> %f347) ----
	setp.gtu.ftz.f32 %p15, %f356, 0f35D3FDC0;
	@%p15 bra $L__BB1_4;
	bra.uni $L__BB1_3;

$L__BB1_4:
	setp.gtu.ftz.f32 %p16, %f356, 0f3D03ED56;
	@%p16 bra $L__BB1_20;
	bra.uni $L__BB1_5;

$L__BB1_20:
	// Upper segment: lg2(c + offset) * scale + bias.
	add.ftz.f32 %f166, %f356, 0f3BCD22E0;
	lg2.approx.ftz.f32 %f167, %f166;
	fma.rn.ftz.f32 %f347, %f167, 0f3E08BDF0, 0f3F7FA592;
	bra.uni $L__BB1_21;

$L__BB1_3:
	// Lower segment: plain scale.
	mul.ftz.f32 %f347, %f356, 0f44B09ACE;
	bra.uni $L__BB1_21;

$L__BB1_5:
	// Middle segment: base is %f356 itself; exponent constant is 0f3EDCDE7E.
	// %f9 = |y - 2*trunc(y/2)| on constants (compared with 1.0 further down).
	mov.f32 %f83, 0f3E5CDE7E;
	cvt.rzi.f32.f32 %f84, %f83;
	add.ftz.f32 %f85, %f84, %f84;
	mov.f32 %f86, 0f3EDCDE7E;
	sub.ftz.f32 %f87, %f86, %f85;
	abs.ftz.f32 %f9, %f87;
	// Split |base| into mantissa %f88 in [1,2) and exponent %f90 = e - 127.
	abs.ftz.f32 %f10, %f356;
	mov.b32 %r18, %f10;
	and.b32 %r19, %r18, 8388607;
	or.b32 %r20, %r19, 1065353216;
	mov.b32 %f88, %r20;
	shr.u32 %r21, %r18, 23;
	cvt.rn.f32.u32 %f89, %r21;
	add.ftz.f32 %f90, %f89, 0fC2FE0000;
	setp.gt.ftz.f32 %p17, %f88, 0f3FB504F3;
	mul.ftz.f32 %f91, %f88, 0f3F000000;
	add.ftz.f32 %f92, %f90, 0f3F800000;
	selp.f32 %f93, %f92, %f90, %p17;
	selp.f32 %f94, %f91, %f88, %p17;
	add.ftz.f32 %f95, %f94, 0fBF800000;
	add.ftz.f32 %f81, %f94, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f80,%f81;
	// end inline asm
	// Logarithm polynomial and high/low accumulation (same instruction
	// pattern as the middle segment of BackwardPUE).
	add.ftz.f32 %f96, %f95, %f95;
	mul.ftz.f32 %f97, %f80, %f96;
	mul.ftz.f32 %f98, %f97, %f97;
	mov.f32 %f99, 0f3C4CAF63;
	mov.f32 %f100, 0f3B18F0FE;
	fma.rn.ftz.f32 %f101, %f100, %f98, %f99;
	mov.f32 %f102, 0f3DAAAABD;
	fma.rn.ftz.f32 %f103, %f101, %f98, %f102;
	mul.rn.ftz.f32 %f104, %f103, %f98;
	mul.rn.ftz.f32 %f105, %f104, %f97;
	sub.ftz.f32 %f106, %f95, %f97;
	add.ftz.f32 %f107, %f106, %f106;
	neg.ftz.f32 %f108, %f97;
	fma.rn.ftz.f32 %f109, %f108, %f95, %f107;
	mul.rn.ftz.f32 %f110, %f80, %f109;
	add.ftz.f32 %f111, %f105, %f97;
	sub.ftz.f32 %f112, %f97, %f111;
	add.ftz.f32 %f113, %f105, %f112;
	add.ftz.f32 %f114, %f110, %f113;
	add.ftz.f32 %f115, %f111, %f114;
	sub.ftz.f32 %f116, %f111, %f115;
	add.ftz.f32 %f117, %f114, %f116;
	mov.f32 %f118, 0f3F317200;
	mul.rn.ftz.f32 %f119, %f93, %f118;
	mov.f32 %f120, 0f35BFBE8E;
	mul.rn.ftz.f32 %f121, %f93, %f120;
	add.ftz.f32 %f122, %f119, %f115;
	sub.ftz.f32 %f123, %f119, %f122;
	add.ftz.f32 %f124, %f115, %f123;
	add.ftz.f32 %f125, %f117, %f124;
	add.ftz.f32 %f126, %f121, %f125;
	add.ftz.f32 %f127, %f122, %f126;
	sub.ftz.f32 %f128, %f122, %f127;
	add.ftz.f32 %f129, %f126, %f128;
	// Multiply the logarithm by the exponent constant.
	abs.ftz.f32 %f11, %f86;
	setp.gt.ftz.f32 %p18, %f11, 0f77F684DF;
	selp.f32 %f130, 0f385CDE7E, 0f3EDCDE7E, %p18;
	mul.rn.ftz.f32 %f131, %f130, %f127;
	neg.ftz.f32 %f132, %f131;
	fma.rn.ftz.f32 %f133, %f130, %f127, %f132;
	fma.rn.ftz.f32 %f134, %f130, %f129, %f133;
	mov.f32 %f135, 0f00000000;
	fma.rn.ftz.f32 %f136, %f135, %f127, %f134;
	add.rn.ftz.f32 %f137, %f131, %f136;
	neg.ftz.f32 %f138, %f137;
	add.rn.ftz.f32 %f139, %f131, %f138;
	add.rn.ftz.f32 %f140, %f139, %f136;
	mov.b32 %r22, %f137;
	setp.eq.s32 %p19, %r22, 1118925336;
	add.s32 %r23, %r22, -1;
	mov.b32 %f141, %r23;
	add.ftz.f32 %f142, %f140, 0f37000000;
	selp.f32 %f12, %f142, %f140, %p19;
	selp.f32 %f143, %f141, %f137, %p19;
	// Exponential of %f143 via ex2.approx and a power-of-two scale.
	mov.f32 %f144, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f145, %f143, %f144;
	cvt.rzi.f32.f32 %f146, %f145;
	abs.ftz.f32 %f147, %f146;
	setp.gt.ftz.f32 %p20, %f147, 0f42FC0000;
	mov.b32 %r24, %f146;
	and.b32 %r25, %r24, -2147483648;
	or.b32 %r26, %r25, 1123811328;
	mov.b32 %f148, %r26;
	selp.f32 %f149, %f148, %f146, %p20;
	mov.f32 %f150, 0fBF317218;
	fma.rn.ftz.f32 %f151, %f149, %f150, %f143;
	mov.f32 %f152, 0f3102E308;
	fma.rn.ftz.f32 %f153, %f149, %f152, %f151;
	mul.ftz.f32 %f154, %f153, 0f3FB8AA3B;
	add.ftz.f32 %f155, %f149, 0f4B40007F;
	mov.b32 %r27, %f155;
	shl.b32 %r28, %r27, 23;
	mov.b32 %f156, %r28;
	ex2.approx.ftz.f32 %f157, %f154;
	mul.ftz.f32 %f13, %f157, %f156;
	setp.eq.ftz.f32 %p21, %f13, 0f7F800000;
	mov.f32 %f346, 0f7F800000;
	@%p21 bra $L__BB1_7;
	fma.rn.ftz.f32 %f346, %f13, %f12, %f13;

$L__BB1_7:
	// Special cases on the base (zero, negative, Inf/NaN).
	setp.lt.ftz.f32 %p22, %f356, 0f00000000;
	setp.eq.ftz.f32 %p23, %f9, 0f3F800000;
	and.pred %p1, %p22, %p23;
	setp.eq.ftz.f32 %p24, %f356, 0f00000000;
	@%p24 bra $L__BB1_11;
	bra.uni $L__BB1_8;

$L__BB1_11:
	add.ftz.f32 %f162, %f356, %f356;
	selp.f32 %f346, %f162, 0f00000000, %p23;
	bra.uni $L__BB1_12;

$L__BB1_8:
	setp.geu.ftz.f32 %p25, %f356, 0f00000000;
	@%p25 bra $L__BB1_12;
	mov.f32 %f158, 0f3EDCDE7E;
	cvt.rzi.f32.f32 %f159, %f158;
	setp.eq.ftz.f32 %p26, %f159, 0f3EDCDE7E;
	mov.b32 %r29, %f346;
	xor.b32 %r30, %r29, -2147483648;
	mov.b32 %f160, %r30;
	selp.f32 %f346, %f160, %f346, %p1;
	@%p26 bra $L__BB1_12;
	mov.f32 %f346, 0f7FFFFFFF;

$L__BB1_12:
	add.ftz.f32 %f163, %f10, %f11;
	mov.b32 %r31, %f163;
	setp.lt.s32 %p28, %r31, 2139095040;
	@%p28 bra $L__BB1_19;
	setp.gtu.ftz.f32 %p29, %f10, 0f7F800000;
	setp.gtu.ftz.f32 %p30, %f11, 0f7F800000;
	or.pred %p31, %p29, %p30;
	@%p31 bra $L__BB1_18;
	bra.uni $L__BB1_14;

$L__BB1_18:
	add.ftz.f32 %f346, %f356, 0f3EDCDE7E;
	bra.uni $L__BB1_19;

$L__BB1_14:
	setp.eq.ftz.f32 %p32, %f11, 0f7F800000;
	@%p32 bra $L__BB1_17;
	bra.uni $L__BB1_15;

$L__BB1_17:
	setp.gt.ftz.f32 %p34, %f10, 0f3F800000;
	selp.f32 %f164, 0f7F800000, 0f00000000, %p34;
	setp.eq.ftz.f32 %p35, %f356, 0fBF800000;
	selp.f32 %f346, 0f3F800000, %f164, %p35;
	bra.uni $L__BB1_19;

$L__BB1_15:
	setp.neu.ftz.f32 %p33, %f10, 0f7F800000;
	@%p33 bra $L__BB1_19;
	selp.f32 %f346, 0fFF800000, 0f7F800000, %p1;

$L__BB1_19:
	// r = pow * scale + bias; when the base is exactly 1 the pre-folded
	// constant 0f3FD30E46 (~1.6489) is used instead.
	fma.rn.ftz.f32 %f165, %f346, 0f3FD2ADE4, 0f3B40C470;
	setp.eq.ftz.f32 %p36, %f356, 0f3F800000;
	selp.f32 %f347, 0f3FD30E46, %f165, %p36;

$L__BB1_21:
	// ---- component 1 (%f357 -> %f351): same sequence as component 0 ----
	setp.gtu.ftz.f32 %p37, %f357, 0f35D3FDC0;
	@%p37 bra $L__BB1_23;
	bra.uni $L__BB1_22;

$L__BB1_23:
	setp.gtu.ftz.f32 %p38, %f357, 0f3D03ED56;
	@%p38 bra $L__BB1_39;
	bra.uni $L__BB1_24;

$L__BB1_39:
	add.ftz.f32 %f254, %f357, 0f3BCD22E0;
	lg2.approx.ftz.f32 %f255, %f254;
	fma.rn.ftz.f32 %f351, %f255, 0f3E08BDF0, 0f3F7FA592;
	bra.uni $L__BB1_40;

$L__BB1_22:
	mul.ftz.f32 %f351, %f357, 0f44B09ACE;
	bra.uni $L__BB1_40;

$L__BB1_24:
	mov.f32 %f171, 0f3E5CDE7E;
	cvt.rzi.f32.f32 %f172, %f171;
	add.ftz.f32 %f173, %f172, %f172;
	mov.f32 %f174, 0f3EDCDE7E;
	sub.ftz.f32 %f175, %f174, %f173;
	abs.ftz.f32 %f27, %f175;
	abs.ftz.f32 %f28, %f357;
	mov.b32 %r32, %f28;
	and.b32 %r33, %r32, 8388607;
	or.b32 %r34, %r33, 1065353216;
	mov.b32 %f176, %r34;
	shr.u32 %r35, %r32, 23;
	cvt.rn.f32.u32 %f177, %r35;
	add.ftz.f32 %f178, %f177, 0fC2FE0000;
	setp.gt.ftz.f32 %p39, %f176, 0f3FB504F3;
	mul.ftz.f32 %f179, %f176, 0f3F000000;
	add.ftz.f32 %f180, %f178, 0f3F800000;
	selp.f32 %f181, %f180, %f178, %p39;
	selp.f32 %f182, %f179, %f176, %p39;
	add.ftz.f32 %f183, %f182, 0fBF800000;
	add.ftz.f32 %f169, %f182, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f168,%f169;
	// end inline asm
	add.ftz.f32 %f184, %f183, %f183;
	mul.ftz.f32 %f185, %f168, %f184;
	mul.ftz.f32 %f186, %f185, %f185;
	mov.f32 %f187, 0f3C4CAF63;
	mov.f32 %f188, 0f3B18F0FE;
	fma.rn.ftz.f32 %f189, %f188, %f186, %f187;
	mov.f32 %f190, 0f3DAAAABD;
	fma.rn.ftz.f32 %f191, %f189, %f186, %f190;
	mul.rn.ftz.f32 %f192, %f191, %f186;
	mul.rn.ftz.f32 %f193, %f192, %f185;
	sub.ftz.f32 %f194, %f183, %f185;
	add.ftz.f32 %f195, %f194, %f194;
	neg.ftz.f32 %f196, %f185;
	fma.rn.ftz.f32 %f197, %f196, %f183, %f195;
	mul.rn.ftz.f32 %f198, %f168, %f197;
	add.ftz.f32 %f199, %f193, %f185;
	sub.ftz.f32 %f200, %f185, %f199;
	add.ftz.f32 %f201, %f193, %f200;
	add.ftz.f32 %f202, %f198, %f201;
	add.ftz.f32 %f203, %f199, %f202;
	sub.ftz.f32 %f204, %f199, %f203;
	add.ftz.f32 %f205, %f202, %f204;
	mov.f32 %f206, 0f3F317200;
	mul.rn.ftz.f32 %f207, %f181, %f206;
	mov.f32 %f208, 0f35BFBE8E;
	mul.rn.ftz.f32 %f209, %f181, %f208;
	add.ftz.f32 %f210, %f207, %f203;
	sub.ftz.f32 %f211, %f207, %f210;
	add.ftz.f32 %f212, %f203, %f211;
	add.ftz.f32 %f213, %f205, %f212;
	add.ftz.f32 %f214, %f209, %f213;
	add.ftz.f32 %f215, %f210, %f214;
	sub.ftz.f32 %f216, %f210, %f215;
	add.ftz.f32 %f217, %f214, %f216;
	abs.ftz.f32 %f29, %f174;
	setp.gt.ftz.f32 %p40, %f29, 0f77F684DF;
	selp.f32 %f218, 0f385CDE7E, 0f3EDCDE7E, %p40;
	mul.rn.ftz.f32 %f219, %f218, %f215;
	neg.ftz.f32 %f220, %f219;
	fma.rn.ftz.f32 %f221, %f218, %f215, %f220;
	fma.rn.ftz.f32 %f222, %f218, %f217, %f221;
	mov.f32 %f223, 0f00000000;
	fma.rn.ftz.f32 %f224, %f223, %f215, %f222;
	add.rn.ftz.f32 %f225, %f219, %f224;
	neg.ftz.f32 %f226, %f225;
	add.rn.ftz.f32 %f227, %f219, %f226;
	add.rn.ftz.f32 %f228, %f227, %f224;
	mov.b32 %r36, %f225;
	setp.eq.s32 %p41, %r36, 1118925336;
	add.s32 %r37, %r36, -1;
	mov.b32 %f229, %r37;
	add.ftz.f32 %f230, %f228, 0f37000000;
	selp.f32 %f30, %f230, %f228, %p41;
	selp.f32 %f231, %f229, %f225, %p41;
	mov.f32 %f232, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f233, %f231, %f232;
	cvt.rzi.f32.f32 %f234, %f233;
	abs.ftz.f32 %f235, %f234;
	setp.gt.ftz.f32 %p42, %f235, 0f42FC0000;
	mov.b32 %r38, %f234;
	and.b32 %r39, %r38, -2147483648;
	or.b32 %r40, %r39, 1123811328;
	mov.b32 %f236, %r40;
	selp.f32 %f237, %f236, %f234, %p42;
	mov.f32 %f238, 0fBF317218;
	fma.rn.ftz.f32 %f239, %f237, %f238, %f231;
	mov.f32 %f240, 0f3102E308;
	fma.rn.ftz.f32 %f241, %f237, %f240, %f239;
	mul.ftz.f32 %f242, %f241, 0f3FB8AA3B;
	add.ftz.f32 %f243, %f237, 0f4B40007F;
	mov.b32 %r41, %f243;
	shl.b32 %r42, %r41, 23;
	mov.b32 %f244, %r42;
	ex2.approx.ftz.f32 %f245, %f242;
	mul.ftz.f32 %f31, %f245, %f244;
	setp.eq.ftz.f32 %p43, %f31, 0f7F800000;
	mov.f32 %f350, 0f7F800000;
	@%p43 bra $L__BB1_26;
	fma.rn.ftz.f32 %f350, %f31, %f30, %f31;

$L__BB1_26:
	setp.lt.ftz.f32 %p44, %f357, 0f00000000;
	setp.eq.ftz.f32 %p45, %f27, 0f3F800000;
	and.pred %p2, %p44, %p45;
	setp.eq.ftz.f32 %p46, %f357, 0f00000000;
	@%p46 bra $L__BB1_30;
	bra.uni $L__BB1_27;

$L__BB1_30:
	add.ftz.f32 %f250, %f357, %f357;
	selp.f32 %f350, %f250, 0f00000000, %p45;
	bra.uni $L__BB1_31;

$L__BB1_27:
	setp.geu.ftz.f32 %p47, %f357, 0f00000000;
	@%p47 bra $L__BB1_31;
	mov.f32 %f246, 0f3EDCDE7E;
	cvt.rzi.f32.f32 %f247, %f246;
	setp.eq.ftz.f32 %p48, %f247, 0f3EDCDE7E;
	mov.b32 %r43, %f350;
	xor.b32 %r44, %r43, -2147483648;
	mov.b32 %f248, %r44;
	selp.f32 %f350, %f248, %f350, %p2;
	@%p48 bra $L__BB1_31;
	mov.f32 %f350, 0f7FFFFFFF;

$L__BB1_31:
	add.ftz.f32 %f251, %f28, %f29;
	mov.b32 %r45, %f251;
	setp.lt.s32 %p50, %r45, 2139095040;
	@%p50 bra $L__BB1_38;
	setp.gtu.ftz.f32 %p51, %f28, 0f7F800000;
	setp.gtu.ftz.f32 %p52, %f29, 0f7F800000;
	or.pred %p53, %p51, %p52;
	@%p53 bra $L__BB1_37;
	bra.uni $L__BB1_33;

$L__BB1_37:
	add.ftz.f32 %f350, %f357, 0f3EDCDE7E;
	bra.uni $L__BB1_38;

$L__BB1_33:
	setp.eq.ftz.f32 %p54, %f29, 0f7F800000;
	@%p54 bra $L__BB1_36;
	bra.uni $L__BB1_34;

$L__BB1_36:
	setp.gt.ftz.f32 %p56, %f28, 0f3F800000;
	selp.f32 %f252, 0f7F800000, 0f00000000, %p56;
	setp.eq.ftz.f32 %p57, %f357, 0fBF800000;
	selp.f32 %f350, 0f3F800000, %f252, %p57;
	bra.uni $L__BB1_38;

$L__BB1_34:
	setp.neu.ftz.f32 %p55, %f28, 0f7F800000;
	@%p55 bra $L__BB1_38;
	selp.f32 %f350, 0fFF800000, 0f7F800000, %p2;

$L__BB1_38:
	fma.rn.ftz.f32 %f253, %f350, 0f3FD2ADE4, 0f3B40C470;
	setp.eq.ftz.f32 %p58, %f357, 0f3F800000;
	selp.f32 %f351, 0f3FD30E46, %f253, %p58;

$L__BB1_40:
	// ---- component 2 (%f358 -> %f355): same sequence as component 0 ----
	setp.gtu.ftz.f32 %p59, %f358, 0f35D3FDC0;
	@%p59 bra $L__BB1_42;
	bra.uni $L__BB1_41;

$L__BB1_42:
	setp.gtu.ftz.f32 %p60, %f358, 0f3D03ED56;
	@%p60 bra $L__BB1_58;
	bra.uni $L__BB1_43;

$L__BB1_58:
	add.ftz.f32 %f342, %f358, 0f3BCD22E0;
	lg2.approx.ftz.f32 %f343, %f342;
	fma.rn.ftz.f32 %f355, %f343, 0f3E08BDF0, 0f3F7FA592;
	bra.uni $L__BB1_59;

$L__BB1_41:
	mul.ftz.f32 %f355, %f358, 0f44B09ACE;
	bra.uni $L__BB1_59;

$L__BB1_43:
	mov.f32 %f259, 0f3E5CDE7E;
	cvt.rzi.f32.f32 %f260, %f259;
	add.ftz.f32 %f261, %f260, %f260;
	mov.f32 %f262, 0f3EDCDE7E;
	sub.ftz.f32 %f263, %f262, %f261;
	abs.ftz.f32 %f45, %f263;
	abs.ftz.f32 %f46, %f358;
	mov.b32 %r46, %f46;
	and.b32 %r47, %r46, 8388607;
	or.b32 %r48, %r47, 1065353216;
	mov.b32 %f264, %r48;
	shr.u32 %r49, %r46, 23;
	cvt.rn.f32.u32 %f265, %r49;
	add.ftz.f32 %f266, %f265, 0fC2FE0000;
	setp.gt.ftz.f32 %p61, %f264, 0f3FB504F3;
	mul.ftz.f32 %f267, %f264, 0f3F000000;
	add.ftz.f32 %f268, %f266, 0f3F800000;
	selp.f32 %f269, %f268, %f266, %p61;
	selp.f32 %f270, %f267, %f264, %p61;
	add.ftz.f32 %f271, %f270, 0fBF800000;
	add.ftz.f32 %f257, %f270, 0f3F800000;
	// begin inline asm
	rcp.approx.ftz.f32 %f256,%f257;
	// end inline asm
	add.ftz.f32 %f272, %f271, %f271;
	mul.ftz.f32 %f273, %f256, %f272;
	mul.ftz.f32 %f274, %f273, %f273;
	mov.f32 %f275, 0f3C4CAF63;
	mov.f32 %f276, 0f3B18F0FE;
	fma.rn.ftz.f32 %f277, %f276, %f274, %f275;
	mov.f32 %f278, 0f3DAAAABD;
	fma.rn.ftz.f32 %f279, %f277, %f274, %f278;
	mul.rn.ftz.f32 %f280, %f279, %f274;
	mul.rn.ftz.f32 %f281, %f280, %f273;
	sub.ftz.f32 %f282, %f271, %f273;
	add.ftz.f32 %f283, %f282, %f282;
	neg.ftz.f32 %f284, %f273;
	fma.rn.ftz.f32 %f285, %f284, %f271, %f283;
	mul.rn.ftz.f32 %f286, %f256, %f285;
	add.ftz.f32 %f287, %f281, %f273;
	sub.ftz.f32 %f288, %f273, %f287;
	add.ftz.f32 %f289, %f281, %f288;
	add.ftz.f32 %f290, %f286, %f289;
	add.ftz.f32 %f291, %f287, %f290;
	sub.ftz.f32 %f292, %f287, %f291;
	add.ftz.f32 %f293, %f290, %f292;
	mov.f32 %f294, 0f3F317200;
	mul.rn.ftz.f32 %f295, %f269, %f294;
	mov.f32 %f296, 0f35BFBE8E;
	mul.rn.ftz.f32 %f297, %f269, %f296;
	add.ftz.f32 %f298, %f295, %f291;
	sub.ftz.f32 %f299, %f295, %f298;
	add.ftz.f32 %f300, %f291, %f299;
	add.ftz.f32 %f301, %f293, %f300;
	add.ftz.f32 %f302, %f297, %f301;
	add.ftz.f32 %f303, %f298, %f302;
	sub.ftz.f32 %f304, %f298, %f303;
	add.ftz.f32 %f305, %f302, %f304;
	abs.ftz.f32 %f47, %f262;
	setp.gt.ftz.f32 %p62, %f47, 0f77F684DF;
	selp.f32 %f306, 0f385CDE7E, 0f3EDCDE7E, %p62;
	mul.rn.ftz.f32 %f307, %f306, %f303;
	neg.ftz.f32 %f308, %f307;
	fma.rn.ftz.f32 %f309, %f306, %f303, %f308;
	fma.rn.ftz.f32 %f310, %f306, %f305, %f309;
	mov.f32 %f311, 0f00000000;
	fma.rn.ftz.f32 %f312, %f311, %f303, %f310;
	add.rn.ftz.f32 %f313, %f307, %f312;
	neg.ftz.f32 %f314, %f313;
	add.rn.ftz.f32 %f315, %f307, %f314;
	add.rn.ftz.f32 %f316, %f315, %f312;
	mov.b32 %r50, %f313;
	setp.eq.s32 %p63, %r50, 1118925336;
	add.s32 %r51, %r50, -1;
	mov.b32 %f317, %r51;
	add.ftz.f32 %f318, %f316, 0f37000000;
	selp.f32 %f48, %f318, %f316, %p63;
	selp.f32 %f319, %f317, %f313, %p63;
	mov.f32 %f320, 0f3FB8AA3B;
	mul.rn.ftz.f32 %f321, %f319, %f320;
	cvt.rzi.f32.f32 %f322, %f321;
	abs.ftz.f32 %f323, %f322;
	setp.gt.ftz.f32 %p64, %f323, 0f42FC0000;
	mov.b32 %r52, %f322;
	and.b32 %r53, %r52, -2147483648;
	or.b32 %r54, %r53, 1123811328;
	mov.b32 %f324, %r54;
	selp.f32 %f325, %f324, %f322, %p64;
	mov.f32 %f326, 0fBF317218;
	fma.rn.ftz.f32 %f327, %f325, %f326, %f319;
	mov.f32 %f328, 0f3102E308;
	fma.rn.ftz.f32 %f329, %f325, %f328, %f327;
	mul.ftz.f32 %f330, %f329, 0f3FB8AA3B;
	add.ftz.f32 %f331, %f325, 0f4B40007F;
	mov.b32 %r55, %f331;
	shl.b32 %r56, %r55, 23;
	mov.b32 %f332, %r56;
	ex2.approx.ftz.f32 %f333, %f330;
	mul.ftz.f32 %f49, %f333, %f332;
	setp.eq.ftz.f32 %p65, %f49, 0f7F800000;
	mov.f32 %f354, 0f7F800000;
	@%p65 bra $L__BB1_45;
	fma.rn.ftz.f32 %f354, %f49, %f48, %f49;

$L__BB1_45:
	setp.lt.ftz.f32 %p66, %f358, 0f00000000;
	setp.eq.ftz.f32 %p67, %f45, 0f3F800000;
	and.pred %p3, %p66, %p67;
	setp.eq.ftz.f32 %p68, %f358, 0f00000000;
	@%p68 bra $L__BB1_49;
	bra.uni $L__BB1_46;

$L__BB1_49:
	add.ftz.f32 %f338, %f358, %f358;
	selp.f32 %f354, %f338, 0f00000000, %p67;
	bra.uni $L__BB1_50;

$L__BB1_46:
	setp.geu.ftz.f32 %p69, %f358, 0f00000000;
	@%p69 bra $L__BB1_50;
	mov.f32 %f334, 0f3EDCDE7E;
	cvt.rzi.f32.f32 %f335, %f334;
	setp.eq.ftz.f32 %p70, %f335, 0f3EDCDE7E;
	mov.b32 %r57, %f354;
	xor.b32 %r58, %r57, -2147483648;
	mov.b32 %f336, %r58;
	selp.f32 %f354, %f336, %f354, %p3;
	@%p70 bra $L__BB1_50;
	mov.f32 %f354, 0f7FFFFFFF;

$L__BB1_50:
	add.ftz.f32 %f339, %f46, %f47;
	mov.b32 %r59, %f339;
	setp.lt.s32 %p72, %r59, 2139095040;
	@%p72 bra $L__BB1_57;
	setp.gtu.ftz.f32 %p73, %f46, 0f7F800000;
	setp.gtu.ftz.f32 %p74, %f47, 0f7F800000;
	or.pred %p75, %p73, %p74;
	@%p75 bra $L__BB1_56;
	bra.uni $L__BB1_52;

$L__BB1_56:
	add.ftz.f32 %f354, %f358, 0f3EDCDE7E;
	bra.uni $L__BB1_57;

$L__BB1_52:
	setp.eq.ftz.f32 %p76, %f47, 0f7F800000;
	@%p76 bra $L__BB1_55;
	bra.uni $L__BB1_53;

$L__BB1_55:
	setp.gt.ftz.f32 %p78, %f46, 0f3F800000;
	selp.f32 %f340, 0f7F800000, 0f00000000, %p78;
	setp.eq.ftz.f32 %p79, %f358, 0fBF800000;
	selp.f32 %f354, 0f3F800000, %f340, %p79;
	bra.uni $L__BB1_57;

$L__BB1_53:
	setp.neu.ftz.f32 %p77, %f46, 0f7F800000;
	@%p77 bra $L__BB1_57;
	selp.f32 %f354, 0fFF800000, 0f7F800000, %p3;

$L__BB1_57:
	fma.rn.ftz.f32 %f341, %f354, 0f3FD2ADE4, 0f3B40C470;
	setp.eq.ftz.f32 %p80, %f358, 0f3F800000;
	selp.f32 %f355, 0f3FD30E46, %f341, %p80;

$L__BB1_59:
	// Final scale (~0.51908) of the three transformed components; the
	// results overwrite %f356..%f358 so both paths share the store below.
	mul.ftz.f32 %f356, %f347, 0f3F04E259;
	mul.ftz.f32 %f357, %f351, 0f3F04E259;
	mul.ftz.f32 %f358, %f355, 0f3F04E259;

$L__BB1_60:
	// Store at param_1 + index * 16; %f4 is the sanitised fourth component.
	shl.b64 %rd6, %rd1, 4;
	add.s64 %rd7, %rd3, %rd6;
	st.global.v4.f32 [%rd7], {%f356, %f357, %f358, %f4};

$L__BB1_61:
	ret;

}