Use mixed Horner scheme in Math.exp/expm1 to improve instruction parallelization (#311)

2025-06-10 05:21:27 +00:00 · 2018-11-09 18:56:42 +02:00 · 2018-11-09 18:56:42 +02:00 · 410036e445
commit 410036e445
parent 54311fd0ab
8 changed files with 7763 additions and 6904 deletions
--- a/std/assembly/math.ts
+++ b/std/assembly/math.ts
@ -422,8 +422,10 @@ export namespace NativeMath {
    } else if (hx > 0x3E300000) {
      hi = x;
    } else return 1.0 + x;
-    var xx = x * x;
-    var c = x - xx * (P1 + xx * (P2 + xx * (P3 + xx * (P4 + xx * P5))));
+    var xs = x * x;
+    // var c = x - xs * (P1 + xs * (P2 + xs * (P3 + xs * (P4 + xs * P5))));
+    var xq = xs * xs;
+    var c = x - (xs * P1 + xq * ((P2 + xs * P3) + xq * (P4 + xs * P5)));
    var y = 1.0 + (x * c / (2 - c) - lo + hi);
    if (k == 0) return y;
    return scalbn(y, k);
@ -464,7 +466,9 @@ export namespace NativeMath {
    } else if (hx < 0x3C900000) return x;
    var hfx = 0.5 * x;
    var hxs = x * hfx;
-    var r1 = 1.0 + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))));
+    // var r1 = 1.0 + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))));
+    var hxq = hxs * hxs;
+    var r1 = (1.0 + hxs * Q1) + hxq * ((Q2 + hxs * Q3) + hxq * (Q4 + hxs * Q5));
    t = 3.0 - r1 * hfx;
    var e = hxs * ((r1 - t) / (6.0 - x * t));
    if (k == 0) return x - (x * e - hxs);
--- a/tests/compiler/std/array.optimized.wat
+++ b/tests/compiler/std/array.optimized.wat
@ -3457,7 +3457,7 @@
  if
   i32.const 0
   i32.const 552
-   i32.const 955
+   i32.const 959
   i32.const 4
   call $~lib/env/abort
   unreachable
@ -5173,7 +5173,7 @@
  if
   i32.const 0
   i32.const 552
-   i32.const 964
+   i32.const 968
   i32.const 24
   call $~lib/env/abort
   unreachable
--- a/tests/compiler/std/array.untouched.wat
+++ b/tests/compiler/std/array.untouched.wat
@ -4512,7 +4512,7 @@
  if
   i32.const 0
   i32.const 552
-   i32.const 955
+   i32.const 959
   i32.const 4
   call $~lib/env/abort
   unreachable
@ -7609,7 +7609,7 @@
  if
   i32.const 0
   i32.const 552
-   i32.const 964
+   i32.const 968
   i32.const 24
   call $~lib/env/abort
   unreachable
--- a/tests/compiler/std/libm.optimized.wat
+++ b/tests/compiler/std/libm.optimized.wat
@ -1695,7 +1695,6 @@
    return
   end
  end
-  f64.const 1
  get_local $0
  f64.const 0.5
  get_local $0
@ -1703,12 +1702,22 @@
  tee_local $8
  f64.mul
  tee_local $2
-  f64.const -0.03333333333333313
  get_local $2
+  f64.mul
+  set_local $1
+  f64.const 3
+  f64.const 1
+  get_local $2
+  f64.const -0.03333333333333313
+  f64.mul
+  f64.add
+  get_local $1
  f64.const 1.5873015872548146e-03
  get_local $2
  f64.const -7.93650757867488e-05
-  get_local $2
+  f64.mul
+  f64.add
+  get_local $1
  f64.const 4.008217827329362e-06
  get_local $2
  f64.const -2.0109921818362437e-07
@ -1718,20 +1727,14 @@
  f64.add
  f64.mul
  f64.add
-  f64.mul
-  f64.add
-  f64.mul
-  f64.add
  tee_local $9
-  set_local $1
-  get_local $2
-  get_local $9
-  f64.const 3
-  get_local $1
  get_local $8
  f64.mul
  f64.sub
-  tee_local $1
+  set_local $1
+  get_local $2
+  get_local $9
+  get_local $1
  f64.sub
  f64.const 6
  get_local $0
@ -1954,10 +1957,12 @@
 )
 (func $~lib/math/NativeMath.exp (; 25 ;) (type $FF) (param $0 f64) (result f64)
  (local $1 i32)
-  (local $2 i32)
-  (local $3 f64)
-  (local $4 i32)
+  (local $2 f64)
+  (local $3 i32)
+  (local $4 f64)
  (local $5 f64)
+  (local $6 i32)
+  (local $7 f64)
  get_local $0
  i64.reinterpret/f64
  i64.const 32
@ -1966,7 +1971,7 @@
  tee_local $1
  i32.const 31
  i32.shr_u
-  set_local $4
+  set_local $6
  get_local $1
  i32.const 2147483647
  i32.and
@ -2017,22 +2022,22 @@
    i32.trunc_s/f64
   else    
    i32.const 1
-    get_local $4
+    get_local $6
    i32.const 1
    i32.shl
    i32.sub
   end
-   tee_local $2
+   tee_local $3
   f64.convert_s/i32
   tee_local $0
   f64.const 0.6931471803691238
   f64.mul
   f64.sub
-   tee_local $3
+   tee_local $4
   get_local $0
   f64.const 1.9082149292705877e-10
   f64.mul
-   tee_local $5
+   tee_local $7
   f64.sub
   set_local $0
  else   
@ -2046,23 +2051,30 @@
    return
   end
   get_local $0
-   set_local $3
+   set_local $4
  end
-  f64.const 1
-  get_local $0
-  get_local $0
  get_local $0
  get_local $0
  f64.mul
-  tee_local $0
+  tee_local $2
+  get_local $2
+  f64.mul
+  set_local $5
+  f64.const 1
+  get_local $0
+  get_local $0
+  get_local $2
  f64.const 0.16666666666666602
-  get_local $0
+  f64.mul
+  get_local $5
  f64.const -2.7777777777015593e-03
-  get_local $0
+  get_local $2
  f64.const 6.613756321437934e-05
-  get_local $0
+  f64.mul
+  f64.add
+  get_local $5
  f64.const -1.6533902205465252e-06
-  get_local $0
+  get_local $2
  f64.const 4.1381367970572385e-08
  f64.mul
  f64.add
@ -2070,9 +2082,6 @@
  f64.add
  f64.mul
  f64.add
-  f64.mul
-  f64.add
-  f64.mul
  f64.sub
  tee_local $0
  f64.mul
@ -2080,20 +2089,20 @@
  get_local $0
  f64.sub
  f64.div
-  get_local $5
+  get_local $7
  f64.sub
-  get_local $3
+  get_local $4
  f64.add
  f64.add
  set_local $0
-  get_local $2
+  get_local $3
  i32.eqz
  if
   get_local $0
   return
  end
  get_local $0
-  get_local $2
+  get_local $3
  call $~lib/math/NativeMath.scalbn
 )
 (func $~lib/math/NativeMath.cosh (; 26 ;) (type $FF) (param $0 f64) (result f64)
--- a/tests/compiler/std/libm.untouched.wat
+++ b/tests/compiler/std/libm.untouched.wat
@ -1950,7 +1950,8 @@
  (local $12 f64)
  (local $13 f64)
  (local $14 f64)
-  (local $15 i32)
+  (local $15 f64)
+  (local $16 i32)
  get_local $0
  i64.reinterpret/f64
  set_local $1
@ -2057,14 +2058,22 @@
  get_local $9
  f64.mul
  set_local $10
+  get_local $10
+  get_local $10
+  f64.mul
+  set_local $11
  f64.const 1
  get_local $10
  f64.const -0.03333333333333313
-  get_local $10
+  f64.mul
+  f64.add
+  get_local $11
  f64.const 1.5873015872548146e-03
  get_local $10
  f64.const -7.93650757867488e-05
-  get_local $10
+  f64.mul
+  f64.add
+  get_local $11
  f64.const 4.008217827329362e-06
  get_local $10
  f64.const -2.0109921818362437e-07
@ -2074,19 +2083,15 @@
  f64.add
  f64.mul
  f64.add
-  f64.mul
-  f64.add
-  f64.mul
-  f64.add
-  set_local $11
+  set_local $12
  f64.const 3
-  get_local $11
+  get_local $12
  get_local $9
  f64.mul
  f64.sub
  set_local $6
  get_local $10
-  get_local $11
+  get_local $12
  get_local $6
  f64.sub
  f64.const 6
@ -2096,14 +2101,14 @@
  f64.sub
  f64.div
  f64.mul
-  set_local $12
+  set_local $13
  get_local $3
  i32.const 0
  i32.eq
  if
   get_local $0
   get_local $0
-   get_local $12
+   get_local $13
   f64.mul
   get_local $10
   f64.sub
@ -2111,24 +2116,24 @@
   return
  end
  get_local $0
-  get_local $12
+  get_local $13
  get_local $5
  f64.sub
  f64.mul
  get_local $5
  f64.sub
-  set_local $12
-  get_local $12
+  set_local $13
+  get_local $13
  get_local $10
  f64.sub
-  set_local $12
+  set_local $13
  get_local $3
  i32.const -1
  i32.eq
  if
   f64.const 0.5
   get_local $0
-   get_local $12
+   get_local $13
   f64.sub
   f64.mul
   f64.const 0.5
@ -2144,7 +2149,7 @@
   f64.lt
   if
    f64.const -2
-    get_local $12
+    get_local $13
    get_local $0
    f64.const 0.5
    f64.add
@ -2155,7 +2160,7 @@
   f64.const 1
   f64.const 2
   get_local $0
-   get_local $12
+   get_local $13
   f64.sub
   f64.mul
   f64.add
@ -2170,13 +2175,13 @@
  set_local $1
  get_local $1
  f64.reinterpret/i64
-  set_local $13
+  set_local $14
  get_local $3
  i32.const 0
  i32.lt_s
-  tee_local $15
+  tee_local $16
  if (result i32)
-   get_local $15
+   get_local $16
  else   
   get_local $3
   i32.const 56
@ -2184,28 +2189,28 @@
  end
  if
   get_local $0
-   get_local $12
+   get_local $13
   f64.sub
   f64.const 1
   f64.add
-   set_local $14
+   set_local $15
   get_local $3
   i32.const 1024
   i32.eq
   if
-    get_local $14
+    get_local $15
    f64.const 2
    f64.mul
    f64.const 8988465674311579538646525e283
    f64.mul
-    set_local $14
+    set_local $15
   else    
+    get_local $15
    get_local $14
-    get_local $13
    f64.mul
-    set_local $14
+    set_local $15
   end
-   get_local $14
+   get_local $15
   f64.const 1
   f64.sub
   return
@ -2219,29 +2224,29 @@
  set_local $1
  get_local $1
  f64.reinterpret/i64
-  set_local $14
+  set_local $15
  get_local $3
  i32.const 20
  i32.lt_s
  if
   f64.const 1
-   get_local $14
+   get_local $15
   f64.sub
-   get_local $12
+   get_local $13
   f64.sub
-   set_local $14
+   set_local $15
  else   
   f64.const 1
-   get_local $12
-   get_local $14
+   get_local $13
+   get_local $15
   f64.add
   f64.sub
-   set_local $14
+   set_local $15
  end
  get_local $0
-  get_local $14
+  get_local $15
  f64.add
-  get_local $13
+  get_local $14
  f64.mul
 )
 (func $~lib/math/NativeMath.scalbn (; 26 ;) (type $FiF) (param $0 f64) (param $1 i32) (result f64)
@ -2344,6 +2349,7 @@
  (local $6 f64)
  (local $7 f64)
  (local $8 f64)
+  (local $9 f64)
  get_local $0
  i64.reinterpret/f64
  i64.const 32
@ -2450,14 +2456,21 @@
  get_local $0
  f64.mul
  set_local $6
+  get_local $6
+  get_local $6
+  f64.mul
+  set_local $7
  get_local $0
  get_local $6
  f64.const 0.16666666666666602
-  get_local $6
+  f64.mul
+  get_local $7
  f64.const -2.7777777777015593e-03
  get_local $6
  f64.const 6.613756321437934e-05
-  get_local $6
+  f64.mul
+  f64.add
+  get_local $7
  f64.const -1.6533902205465252e-06
  get_local $6
  f64.const 4.1381367970572385e-08
@ -2467,17 +2480,14 @@
  f64.add
  f64.mul
  f64.add
-  f64.mul
-  f64.add
-  f64.mul
  f64.sub
-  set_local $7
+  set_local $8
  f64.const 1
  get_local $0
-  get_local $7
+  get_local $8
  f64.mul
  f64.const 2
-  get_local $7
+  get_local $8
  f64.sub
  f64.div
  get_local $4
@ -2485,15 +2495,15 @@
  get_local $3
  f64.add
  f64.add
-  set_local $8
+  set_local $9
  get_local $5
  i32.const 0
  i32.eq
  if
-   get_local $8
+   get_local $9
   return
  end
-  get_local $8
+  get_local $9
  get_local $5
  call $~lib/math/NativeMath.scalbn
 )
--- a/tests/compiler/std/math.optimized.wat
+++ b/tests/compiler/std/math.optimized.wat
--- a/tests/compiler/std/math.ts
+++ b/tests/compiler/std/math.ts
@ -1181,6 +1181,141 @@ assert(test_exp(-1.0397214889526365, 0.353553136702178472, 0.252727240324020386,
 assert(test_exp(1.03972101211547852, 2.82842780717661224, -0.418413937091827393, INEXACT));
 assert(test_exp(1.03972148895263672, 2.82842915587641164, -0.226183772087097168, INEXACT));

+// some vectors from crlibm
+assert(test_exp( f64.MIN_VALUE, 1.0, 0.0, INEXACT)); // smallest denorm positive
+assert(test_exp(-f64.MIN_VALUE, 1.0, 0.0, INEXACT)); // smallest denorm negative
+
+assert(test_exp(
+  reinterpret<f64>(0x40862E42FEFA39EF),
+  reinterpret<f64>(0x7FEFFFFFFFFFFF2A),
+  reinterpret<f64>(0xBFBB0E2640000000),
+  INEXACT
+));
+
+assert(test_exp(reinterpret<f64>(0x40862E42FEFA39F0), Infinity, 0.0, INEXACT | OVERFLOW));
+assert(test_exp(
+  reinterpret<f64>(0xC0874910D52D3051),
+  f64.MIN_VALUE,
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT | UNDERFLOW
+));
+
+assert(test_exp(
+  reinterpret<f64>(0xC0874910D52D3052),
+  0.0,
+  reinterpret<f64>(0xBFE0000000000000),
+  INEXACT | UNDERFLOW
+));
+
+assert(test_exp(
+  reinterpret<f64>(0xC086232BDD7ABCD2),
+  reinterpret<f64>(0x001000000000007C),
+  reinterpret<f64>(0x3FD0C013E0000000),
+  INEXACT
+));
+
+assert(test_exp(
+  reinterpret<f64>(0xC086232BDD7ABCD3),
+  reinterpret<f64>(0x000FFFFFFFFFFE7C),
+  reinterpret<f64>(0x000FFFFFFFFFFE7C),
+  INEXACT | UNDERFLOW
+));
+
+assert(test_exp(
+  reinterpret<f64>(0x3FE005AE04256BAB),
+  reinterpret<f64>(0x3FFA65D89ABF3D1F),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 5.006933289508784801213892023952e-01
+
+assert(test_exp(
+  reinterpret<f64>(0x3FE41C9E095CD545),
+  reinterpret<f64>(0x3FFDFF1D425DE879),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 6.284933264602520219810344315192e-01
+
+assert(test_exp(
+  reinterpret<f64>(0x3FEACCFBE46B4EF0),
+  reinterpret<f64>(0x40027C2E4BC1EE70),
+  reinterpret<f64>(0xBFE0000000000000),
+  INEXACT
+)); // 8.375224553405740124389922129922e-01
+assert(test_exp(
+  reinterpret<f64>(0x3FEB3738E335EA89),
+  reinterpret<f64>(0x4002B9F331610FB0),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 8.504909932810998940411195690103e-01
+assert(test_exp(
+  reinterpret<f64>(0x3FFA083788425AB6),
+  reinterpret<f64>(0x40145ABE6A4C4281),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 1.627006084692465659458093796275e+00
+assert(test_exp(
+  reinterpret<f64>(0x3FFACA7AE8DA5A7B),
+  reinterpret<f64>(0x401557D4ACD7E557),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 1.674433621961411544631914694037e+00
+
+assert(test_exp(
+  reinterpret<f64>(0x401AA1B465630FA4),
+  reinterpret<f64>(0x4088576653F47E5E),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 6.657914718791207775439033866860e+00
+
+assert(test_exp(
+  reinterpret<f64>(0x40260BB5FB993B99),
+  reinterpret<f64>(0x40EDE96D34FCCCFE),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 1.102287279363172167734319373267e+01
+
+assert(test_exp(
+  reinterpret<f64>(0x4026D2883E37B4D7),
+  reinterpret<f64>(0x40F60D75C9585CA5),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 1.141119570188531717747082439018e+01
+
+assert(test_exp(
+  reinterpret<f64>(0x402796C771AF1E4B),
+  reinterpret<f64>(0x41002D419F8E15F2),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 1.179449038756060552657345397165e+01
+
+assert(test_exp(
+  reinterpret<f64>(0x4079CD6B6D99965B),
+  reinterpret<f64>(0x65284208270E2E4C),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // 4.128387275695328639812942128628e+02
+
+assert(test_exp(
+  reinterpret<f64>(0x407FEE02D3D0EC9A),
+  reinterpret<f64>(0x6E006CCF59E5ED14),
+  reinterpret<f64>(0xBFE0000000000000),
+  INEXACT
+)); // 5.108756902848341496792272664607e+02
+
+assert(test_exp(
+  reinterpret<f64>(0xBD1DF00000000070),
+  reinterpret<f64>(0x3FEFFFFFFFFFFF11),
+  reinterpret<f64>(0x3FE0000000000000),
+  INEXACT
+)); // -2.658984143977285255283151746406e-14
+
+assert(test_exp(
+  reinterpret<f64>(0xBD1E900000000075),
+  reinterpret<f64>(0x3FEFFFFFFFFFFF0B),
+  reinterpret<f64>(0xBFE0000000000000),
+  INEXACT
+)); // -2.714495295208544660026143771835e-14
+
 // Mathf.exp ///////////////////////////////////////////////////////////////////////////////////////

 function test_expf(value: f32, expected: f32, error: f32, flags: i32): bool {
--- a/tests/compiler/std/math.untouched.wat
+++ b/tests/compiler/std/math.untouched.wat