Use mixed Horner scheme in Math.exp/expm1 to improve instruction parallelization (#311)

This commit is contained in:
Max Graey 2018-11-09 18:56:42 +02:00 committed by Daniel Wirtz
parent 54311fd0ab
commit 410036e445
8 changed files with 7763 additions and 6904 deletions

View File

@ -422,8 +422,10 @@ export namespace NativeMath {
} else if (hx > 0x3E300000) {
hi = x;
} else return 1.0 + x;
var xx = x * x;
var c = x - xx * (P1 + xx * (P2 + xx * (P3 + xx * (P4 + xx * P5))));
var xs = x * x;
// Plain Horner form, kept for reference: c = x - xs * (P1 + xs * (P2 + xs * (P3 + xs * (P4 + xs * P5))))
var xq = xs * xs;
var c = x - (xs * P1 + xq * ((P2 + xs * P3) + xq * (P4 + xs * P5)));
var y = 1.0 + (x * c / (2 - c) - lo + hi);
if (k == 0) return y;
return scalbn(y, k);
@ -464,7 +466,9 @@ export namespace NativeMath {
} else if (hx < 0x3C900000) return x;
var hfx = 0.5 * x;
var hxs = x * hfx;
var r1 = 1.0 + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))));
// Plain Horner form, kept for reference: r1 = 1.0 + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))))
var hxq = hxs * hxs;
var r1 = (1.0 + hxs * Q1) + hxq * ((Q2 + hxs * Q3) + hxq * (Q4 + hxs * Q5));
t = 3.0 - r1 * hfx;
var e = hxs * ((r1 - t) / (6.0 - x * t));
if (k == 0) return x - (x * e - hxs);

View File

@ -3457,7 +3457,7 @@
if
i32.const 0
i32.const 552
i32.const 955
i32.const 959
i32.const 4
call $~lib/env/abort
unreachable
@ -5173,7 +5173,7 @@
if
i32.const 0
i32.const 552
i32.const 964
i32.const 968
i32.const 24
call $~lib/env/abort
unreachable

View File

@ -4512,7 +4512,7 @@
if
i32.const 0
i32.const 552
i32.const 955
i32.const 959
i32.const 4
call $~lib/env/abort
unreachable
@ -7609,7 +7609,7 @@
if
i32.const 0
i32.const 552
i32.const 964
i32.const 968
i32.const 24
call $~lib/env/abort
unreachable

View File

@ -1695,7 +1695,6 @@
return
end
end
f64.const 1
get_local $0
f64.const 0.5
get_local $0
@ -1703,12 +1702,22 @@
tee_local $8
f64.mul
tee_local $2
f64.const -0.03333333333333313
get_local $2
f64.mul
set_local $1
f64.const 3
f64.const 1
get_local $2
f64.const -0.03333333333333313
f64.mul
f64.add
get_local $1
f64.const 1.5873015872548146e-03
get_local $2
f64.const -7.93650757867488e-05
get_local $2
f64.mul
f64.add
get_local $1
f64.const 4.008217827329362e-06
get_local $2
f64.const -2.0109921818362437e-07
@ -1718,20 +1727,14 @@
f64.add
f64.mul
f64.add
f64.mul
f64.add
f64.mul
f64.add
tee_local $9
set_local $1
get_local $2
get_local $9
f64.const 3
get_local $1
get_local $8
f64.mul
f64.sub
tee_local $1
set_local $1
get_local $2
get_local $9
get_local $1
f64.sub
f64.const 6
get_local $0
@ -1954,10 +1957,12 @@
)
(func $~lib/math/NativeMath.exp (; 25 ;) (type $FF) (param $0 f64) (result f64)
(local $1 i32)
(local $2 i32)
(local $3 f64)
(local $4 i32)
(local $2 f64)
(local $3 i32)
(local $4 f64)
(local $5 f64)
(local $6 i32)
(local $7 f64)
get_local $0
i64.reinterpret/f64
i64.const 32
@ -1966,7 +1971,7 @@
tee_local $1
i32.const 31
i32.shr_u
set_local $4
set_local $6
get_local $1
i32.const 2147483647
i32.and
@ -2017,22 +2022,22 @@
i32.trunc_s/f64
else
i32.const 1
get_local $4
get_local $6
i32.const 1
i32.shl
i32.sub
end
tee_local $2
tee_local $3
f64.convert_s/i32
tee_local $0
f64.const 0.6931471803691238
f64.mul
f64.sub
tee_local $3
tee_local $4
get_local $0
f64.const 1.9082149292705877e-10
f64.mul
tee_local $5
tee_local $7
f64.sub
set_local $0
else
@ -2046,23 +2051,30 @@
return
end
get_local $0
set_local $3
set_local $4
end
f64.const 1
get_local $0
get_local $0
get_local $0
get_local $0
f64.mul
tee_local $0
tee_local $2
get_local $2
f64.mul
set_local $5
f64.const 1
get_local $0
get_local $0
get_local $2
f64.const 0.16666666666666602
get_local $0
f64.mul
get_local $5
f64.const -2.7777777777015593e-03
get_local $0
get_local $2
f64.const 6.613756321437934e-05
get_local $0
f64.mul
f64.add
get_local $5
f64.const -1.6533902205465252e-06
get_local $0
get_local $2
f64.const 4.1381367970572385e-08
f64.mul
f64.add
@ -2070,9 +2082,6 @@
f64.add
f64.mul
f64.add
f64.mul
f64.add
f64.mul
f64.sub
tee_local $0
f64.mul
@ -2080,20 +2089,20 @@
get_local $0
f64.sub
f64.div
get_local $5
get_local $7
f64.sub
get_local $3
get_local $4
f64.add
f64.add
set_local $0
get_local $2
get_local $3
i32.eqz
if
get_local $0
return
end
get_local $0
get_local $2
get_local $3
call $~lib/math/NativeMath.scalbn
)
(func $~lib/math/NativeMath.cosh (; 26 ;) (type $FF) (param $0 f64) (result f64)

View File

@ -1950,7 +1950,8 @@
(local $12 f64)
(local $13 f64)
(local $14 f64)
(local $15 i32)
(local $15 f64)
(local $16 i32)
get_local $0
i64.reinterpret/f64
set_local $1
@ -2057,14 +2058,22 @@
get_local $9
f64.mul
set_local $10
get_local $10
get_local $10
f64.mul
set_local $11
f64.const 1
get_local $10
f64.const -0.03333333333333313
get_local $10
f64.mul
f64.add
get_local $11
f64.const 1.5873015872548146e-03
get_local $10
f64.const -7.93650757867488e-05
get_local $10
f64.mul
f64.add
get_local $11
f64.const 4.008217827329362e-06
get_local $10
f64.const -2.0109921818362437e-07
@ -2074,19 +2083,15 @@
f64.add
f64.mul
f64.add
f64.mul
f64.add
f64.mul
f64.add
set_local $11
set_local $12
f64.const 3
get_local $11
get_local $12
get_local $9
f64.mul
f64.sub
set_local $6
get_local $10
get_local $11
get_local $12
get_local $6
f64.sub
f64.const 6
@ -2096,14 +2101,14 @@
f64.sub
f64.div
f64.mul
set_local $12
set_local $13
get_local $3
i32.const 0
i32.eq
if
get_local $0
get_local $0
get_local $12
get_local $13
f64.mul
get_local $10
f64.sub
@ -2111,24 +2116,24 @@
return
end
get_local $0
get_local $12
get_local $13
get_local $5
f64.sub
f64.mul
get_local $5
f64.sub
set_local $12
get_local $12
set_local $13
get_local $13
get_local $10
f64.sub
set_local $12
set_local $13
get_local $3
i32.const -1
i32.eq
if
f64.const 0.5
get_local $0
get_local $12
get_local $13
f64.sub
f64.mul
f64.const 0.5
@ -2144,7 +2149,7 @@
f64.lt
if
f64.const -2
get_local $12
get_local $13
get_local $0
f64.const 0.5
f64.add
@ -2155,7 +2160,7 @@
f64.const 1
f64.const 2
get_local $0
get_local $12
get_local $13
f64.sub
f64.mul
f64.add
@ -2170,13 +2175,13 @@
set_local $1
get_local $1
f64.reinterpret/i64
set_local $13
set_local $14
get_local $3
i32.const 0
i32.lt_s
tee_local $15
tee_local $16
if (result i32)
get_local $15
get_local $16
else
get_local $3
i32.const 56
@ -2184,28 +2189,28 @@
end
if
get_local $0
get_local $12
get_local $13
f64.sub
f64.const 1
f64.add
set_local $14
set_local $15
get_local $3
i32.const 1024
i32.eq
if
get_local $14
get_local $15
f64.const 2
f64.mul
f64.const 8988465674311579538646525e283
f64.mul
set_local $14
set_local $15
else
get_local $15
get_local $14
get_local $13
f64.mul
set_local $14
set_local $15
end
get_local $14
get_local $15
f64.const 1
f64.sub
return
@ -2219,29 +2224,29 @@
set_local $1
get_local $1
f64.reinterpret/i64
set_local $14
set_local $15
get_local $3
i32.const 20
i32.lt_s
if
f64.const 1
get_local $14
get_local $15
f64.sub
get_local $12
get_local $13
f64.sub
set_local $14
set_local $15
else
f64.const 1
get_local $12
get_local $14
get_local $13
get_local $15
f64.add
f64.sub
set_local $14
set_local $15
end
get_local $0
get_local $14
get_local $15
f64.add
get_local $13
get_local $14
f64.mul
)
(func $~lib/math/NativeMath.scalbn (; 26 ;) (type $FiF) (param $0 f64) (param $1 i32) (result f64)
@ -2344,6 +2349,7 @@
(local $6 f64)
(local $7 f64)
(local $8 f64)
(local $9 f64)
get_local $0
i64.reinterpret/f64
i64.const 32
@ -2450,14 +2456,21 @@
get_local $0
f64.mul
set_local $6
get_local $6
get_local $6
f64.mul
set_local $7
get_local $0
get_local $6
f64.const 0.16666666666666602
get_local $6
f64.mul
get_local $7
f64.const -2.7777777777015593e-03
get_local $6
f64.const 6.613756321437934e-05
get_local $6
f64.mul
f64.add
get_local $7
f64.const -1.6533902205465252e-06
get_local $6
f64.const 4.1381367970572385e-08
@ -2467,17 +2480,14 @@
f64.add
f64.mul
f64.add
f64.mul
f64.add
f64.mul
f64.sub
set_local $7
set_local $8
f64.const 1
get_local $0
get_local $7
get_local $8
f64.mul
f64.const 2
get_local $7
get_local $8
f64.sub
f64.div
get_local $4
@ -2485,15 +2495,15 @@
get_local $3
f64.add
f64.add
set_local $8
set_local $9
get_local $5
i32.const 0
i32.eq
if
get_local $8
get_local $9
return
end
get_local $8
get_local $9
get_local $5
call $~lib/math/NativeMath.scalbn
)

File diff suppressed because it is too large Load Diff

View File

@ -1181,6 +1181,141 @@ assert(test_exp(-1.0397214889526365, 0.353553136702178472, 0.252727240324020386,
assert(test_exp(1.03972101211547852, 2.82842780717661224, -0.418413937091827393, INEXACT));
assert(test_exp(1.03972148895263672, 2.82842915587641164, -0.226183772087097168, INEXACT));
// some vectors from crlibm
// NOTE(review): arguments presumably mirror test_expf(value, expected, error, flags);
// test_exp's own declaration is not visible here — confirm.
assert(test_exp( f64.MIN_VALUE, 1.0, 0.0, INEXACT)); // smallest denorm positive
assert(test_exp(-f64.MIN_VALUE, 1.0, 0.0, INEXACT)); // smallest denorm negative
// Overflow boundary: the next two inputs differ by one in their bit pattern. The first
// expects a finite result (0x7FEF...), the second expects Infinity with OVERFLOW.
assert(test_exp(
reinterpret<f64>(0x40862E42FEFA39EF),
reinterpret<f64>(0x7FEFFFFFFFFFFF2A),
reinterpret<f64>(0xBFBB0E2640000000),
INEXACT
));
assert(test_exp(reinterpret<f64>(0x40862E42FEFA39F0), Infinity, 0.0, INEXACT | OVERFLOW));
// Underflow-to-zero boundary: adjacent inputs; the first expects f64.MIN_VALUE,
// the second expects 0.0. Both are flagged UNDERFLOW.
assert(test_exp(
reinterpret<f64>(0xC0874910D52D3051),
f64.MIN_VALUE,
reinterpret<f64>(0x3FE0000000000000),
INEXACT | UNDERFLOW
));
assert(test_exp(
reinterpret<f64>(0xC0874910D52D3052),
0.0,
reinterpret<f64>(0xBFE0000000000000),
INEXACT | UNDERFLOW
));
// Normal/subnormal boundary: adjacent inputs; the first expects a result with exponent
// field 0x001 (INEXACT only), the second a result with exponent field 0 (adds UNDERFLOW).
assert(test_exp(
reinterpret<f64>(0xC086232BDD7ABCD2),
reinterpret<f64>(0x001000000000007C),
reinterpret<f64>(0x3FD0C013E0000000),
INEXACT
));
// NOTE(review): the third argument below repeats the expected-result bit pattern, unlike
// every other vector in this group — possibly a copy-paste slip; verify the intended value.
assert(test_exp(
reinterpret<f64>(0xC086232BDD7ABCD3),
reinterpret<f64>(0x000FFFFFFFFFFE7C),
reinterpret<f64>(0x000FFFFFFFFFFE7C),
INEXACT | UNDERFLOW
));
// Remaining vectors: the third argument is always 0.5 (0x3FE0000000000000) or
// -0.5 (0xBFE0000000000000); the trailing comment gives the input value in decimal.
assert(test_exp(
reinterpret<f64>(0x3FE005AE04256BAB),
reinterpret<f64>(0x3FFA65D89ABF3D1F),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 5.006933289508784801213892023952e-01
assert(test_exp(
reinterpret<f64>(0x3FE41C9E095CD545),
reinterpret<f64>(0x3FFDFF1D425DE879),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 6.284933264602520219810344315192e-01
assert(test_exp(
reinterpret<f64>(0x3FEACCFBE46B4EF0),
reinterpret<f64>(0x40027C2E4BC1EE70),
reinterpret<f64>(0xBFE0000000000000),
INEXACT
)); // 8.375224553405740124389922129922e-01
assert(test_exp(
reinterpret<f64>(0x3FEB3738E335EA89),
reinterpret<f64>(0x4002B9F331610FB0),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 8.504909932810998940411195690103e-01
assert(test_exp(
reinterpret<f64>(0x3FFA083788425AB6),
reinterpret<f64>(0x40145ABE6A4C4281),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 1.627006084692465659458093796275e+00
assert(test_exp(
reinterpret<f64>(0x3FFACA7AE8DA5A7B),
reinterpret<f64>(0x401557D4ACD7E557),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 1.674433621961411544631914694037e+00
assert(test_exp(
reinterpret<f64>(0x401AA1B465630FA4),
reinterpret<f64>(0x4088576653F47E5E),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 6.657914718791207775439033866860e+00
assert(test_exp(
reinterpret<f64>(0x40260BB5FB993B99),
reinterpret<f64>(0x40EDE96D34FCCCFE),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 1.102287279363172167734319373267e+01
assert(test_exp(
reinterpret<f64>(0x4026D2883E37B4D7),
reinterpret<f64>(0x40F60D75C9585CA5),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 1.141119570188531717747082439018e+01
assert(test_exp(
reinterpret<f64>(0x402796C771AF1E4B),
reinterpret<f64>(0x41002D419F8E15F2),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 1.179449038756060552657345397165e+01
assert(test_exp(
reinterpret<f64>(0x4079CD6B6D99965B),
reinterpret<f64>(0x65284208270E2E4C),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // 4.128387275695328639812942128628e+02
assert(test_exp(
reinterpret<f64>(0x407FEE02D3D0EC9A),
reinterpret<f64>(0x6E006CCF59E5ED14),
reinterpret<f64>(0xBFE0000000000000),
INEXACT
)); // 5.108756902848341496792272664607e+02
assert(test_exp(
reinterpret<f64>(0xBD1DF00000000070),
reinterpret<f64>(0x3FEFFFFFFFFFFF11),
reinterpret<f64>(0x3FE0000000000000),
INEXACT
)); // -2.658984143977285255283151746406e-14
assert(test_exp(
reinterpret<f64>(0xBD1E900000000075),
reinterpret<f64>(0x3FEFFFFFFFFFFF0B),
reinterpret<f64>(0xBFE0000000000000),
INEXACT
)); // -2.714495295208544660026143771835e-14
// Mathf.exp ///////////////////////////////////////////////////////////////////////////////////////
function test_expf(value: f32, expected: f32, error: f32, flags: i32): bool {

File diff suppressed because it is too large Load Diff