Add lightweight paths for memcpy & memset for shrink level != 0 (#601)

This commit is contained in:
Max Graey 2019-05-24 17:00:02 +03:00 committed by Daniel Wirtz
parent af00bdeefe
commit 7cd04b65ef
22 changed files with 26977 additions and 1492 deletions

View File

@ -1,154 +1,163 @@
// export function memcpy(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memcpy.c
// var w: u32, x: u32;
// Copies `n` bytes from `src` to `dest`. Both arguments are raw addresses that are
// read/written through the `load`/`store` built-ins; nothing is returned.
// No overlap handling is done here: memmove in this file only delegates to memcpy
// when the two ranges do not overlap.
// Two strategies are compiled depending on ASC_SHRINK_LEVEL:
//  - > 1: a plain byte loop (presumably to keep the emitted code small),
//  - otherwise: the word-wise algorithm ported from musl referenced below.
export function memcpy(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memcpy.c
// compact variant: one byte per iteration, no alignment handling
if (ASC_SHRINK_LEVEL > 1) {
while (n) {
store<u8>(dest++, load<u8>(src++));
--n;
}
} else {
// w and x alternately hold the previous/next source word in the shifted copy
// used further down when dest is not 4-byte aligned
let w: u32, x: u32;
// // copy 1 byte each until src is aligned to 4 bytes
// while (n && (src & 3)) {
// store<u8>(dest++, load<u8>(src++));
// n--;
// }
// copy 1 byte each until src is aligned to 4 bytes
while (n && (src & 3)) {
store<u8>(dest++, load<u8>(src++));
n--;
}
// // if dst is aligned to 4 bytes as well, copy 4 bytes each
// if ((dest & 3) == 0) {
// while (n >= 16) {
// store<u32>(dest , load<u32>(src ));
// store<u32>(dest + 4, load<u32>(src + 4));
// store<u32>(dest + 8, load<u32>(src + 8));
// store<u32>(dest + 12, load<u32>(src + 12));
// src += 16; dest += 16; n -= 16;
// }
// if (n & 8) {
// store<u32>(dest , load<u32>(src ));
// store<u32>(dest + 4, load<u32>(src + 4));
// dest += 8; src += 8;
// }
// if (n & 4) {
// store<u32>(dest, load<u32>(src));
// dest += 4; src += 4;
// }
// if (n & 2) { // drop to 2 bytes each
// store<u16>(dest, load<u16>(src));
// dest += 2; src += 2;
// }
// if (n & 1) { // drop to 1 byte
// store<u8>(dest++, load<u8>(src++));
// }
// return;
// }
// if dst is aligned to 4 bytes as well, copy 4 bytes each
if ((dest & 3) == 0) {
// main loop: four word copies (16 bytes) per iteration
while (n >= 16) {
store<u32>(dest , load<u32>(src ));
store<u32>(dest + 4, load<u32>(src + 4));
store<u32>(dest + 8, load<u32>(src + 8));
store<u32>(dest + 12, load<u32>(src + 12));
src += 16; dest += 16; n -= 16;
}
// fewer than 16 bytes remain: each set bit of n selects one tail copy
// of that size (8, 4, 2, then 1 byte); n itself is no longer updated
if (n & 8) {
store<u32>(dest , load<u32>(src ));
store<u32>(dest + 4, load<u32>(src + 4));
dest += 8; src += 8;
}
if (n & 4) {
store<u32>(dest, load<u32>(src));
dest += 4; src += 4;
}
if (n & 2) { // drop to 2 bytes each
store<u16>(dest, load<u16>(src));
dest += 2; src += 2;
}
if (n & 1) { // drop to 1 byte
store<u8>(dest++, load<u8>(src++));
}
return;
}
// // if dst is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each
// // doing shifts if faster when copying enough bytes (here: 32 or more)
// if (n >= 32) {
// switch (dest & 3) {
// // known to be != 0
// case 1: {
// w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// n -= 3;
// while (n >= 17) {
// x = load<u32>(src + 1);
// store<u32>(dest, w >> 24 | x << 8);
// w = load<u32>(src + 5);
// store<u32>(dest + 4, x >> 24 | w << 8);
// x = load<u32>(src + 9);
// store<u32>(dest + 8, w >> 24 | x << 8);
// w = load<u32>(src + 13);
// store<u32>(dest + 12, x >> 24 | w << 8);
// src += 16; dest += 16; n -= 16;
// }
// break;
// }
// case 2: {
// w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// n -= 2;
// while (n >= 18) {
// x = load<u32>(src + 2);
// store<u32>(dest, w >> 16 | x << 16);
// w = load<u32>(src + 6);
// store<u32>(dest + 4, x >> 16 | w << 16);
// x = load<u32>(src + 10);
// store<u32>(dest + 8, w >> 16 | x << 16);
// w = load<u32>(src + 14);
// store<u32>(dest + 12, x >> 16 | w << 16);
// src += 16; dest += 16; n -= 16;
// }
// break;
// }
// case 3: {
// w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++));
// n -= 1;
// while (n >= 19) {
// x = load<u32>(src + 3);
// store<u32>(dest, w >> 8 | x << 24);
// w = load<u32>(src + 7);
// store<u32>(dest + 4, x >> 8 | w << 24);
// x = load<u32>(src + 11);
// store<u32>(dest + 8, w >> 8 | x << 24);
// w = load<u32>(src + 15);
// store<u32>(dest + 12, x >> 8 | w << 24);
// src += 16; dest += 16; n -= 16;
// }
// break;
// }
// }
// }
// if dst is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each
// doing shifts is faster when copying enough bytes (here: 32 or more)
// (src is 4-byte aligned at this point: the byte loop above only stops early when
// n reaches 0, which cannot satisfy n >= 32)
if (n >= 32) {
switch (dest & 3) {
// known to be != 0
case 1: {
// keep the first source word, then copy 3 single bytes so that dest becomes
// 4-byte aligned; src then sits 3 bytes past a word boundary
w = load<u32>(src);
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
n -= 3;
// each stored word joins the leftover top byte of the previous source word with
// the low 3 bytes of the next one (assumes little-endian loads/stores).
// The last load reads through src + 16, hence the n >= 17 bound.
while (n >= 17) {
x = load<u32>(src + 1);
store<u32>(dest, w >> 24 | x << 8);
w = load<u32>(src + 5);
store<u32>(dest + 4, x >> 24 | w << 8);
x = load<u32>(src + 9);
store<u32>(dest + 8, w >> 24 | x << 8);
w = load<u32>(src + 13);
store<u32>(dest + 12, x >> 24 | w << 8);
src += 16; dest += 16; n -= 16;
}
break;
}
case 2: {
// same scheme with a 2-byte offset: 2 leading bytes, then half-word splices.
// The last load reads through src + 17, hence the n >= 18 bound.
w = load<u32>(src);
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
n -= 2;
while (n >= 18) {
x = load<u32>(src + 2);
store<u32>(dest, w >> 16 | x << 16);
w = load<u32>(src + 6);
store<u32>(dest + 4, x >> 16 | w << 16);
x = load<u32>(src + 10);
store<u32>(dest + 8, w >> 16 | x << 16);
w = load<u32>(src + 14);
store<u32>(dest + 12, x >> 16 | w << 16);
src += 16; dest += 16; n -= 16;
}
break;
}
case 3: {
// same scheme with a 1-byte lead-in: 3 leftover bytes of the previous word
// plus the low byte of the next one.
// The last load reads through src + 18, hence the n >= 19 bound.
w = load<u32>(src);
store<u8>(dest++, load<u8>(src++));
n -= 1;
while (n >= 19) {
x = load<u32>(src + 3);
store<u32>(dest, w >> 8 | x << 24);
w = load<u32>(src + 7);
store<u32>(dest + 4, x >> 8 | w << 24);
x = load<u32>(src + 11);
store<u32>(dest + 8, w >> 8 | x << 24);
w = load<u32>(src + 15);
store<u32>(dest + 12, x >> 8 | w << 24);
src += 16; dest += 16; n -= 16;
}
break;
}
}
}
// // copy remaining bytes one by one
// if (n & 16) {
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// }
// if (n & 8) {
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// }
// if (n & 4) {
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// }
// if (n & 2) {
// store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++));
// }
// if (n & 1) {
// store<u8>(dest++, load<u8>(src++));
// }
// }
// copy remaining bytes one by one
// (fewer than 32 bytes are left here, so bits 16/8/4/2/1 of n cover the whole
// remainder; each set bit runs an unrolled sequence of that many byte copies)
if (n & 16) {
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
}
if (n & 8) {
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
}
if (n & 4) {
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
}
if (n & 2) {
store<u8>(dest++, load<u8>(src++));
store<u8>(dest++, load<u8>(src++));
}
if (n & 1) {
store<u8>(dest++, load<u8>(src++));
}
}
}
// @ts-ignore: decorator
@inline
export function memmove(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memmove.c
if (dest === src) return;
// if (src + n <= dest || dest + n <= src) {
// memcpy(dest, src, n);
// return;
// }
if (ASC_SHRINK_LEVEL < 1) {
if (src + n <= dest || dest + n <= src) {
memcpy(dest, src, n);
return;
}
}
if (dest < src) {
if ((src & 7) == (dest & 7)) {
while (dest & 7) {
@ -187,62 +196,68 @@ export function memmove(dest: usize, src: usize, n: usize): void { // see: musl/
// Fills `n` bytes starting at address `dest` with the byte value `c`.
// With ASC_SHRINK_LEVEL > 1 a plain byte loop is used; otherwise head and tail are
// written first and the middle is filled with progressively wider stores.
// NOTE(review): in this view every statement group of the non-shrink path appears
// twice (one copy declares its locals with `let`, the other with `var`) and the first
// `while (n >= 32)` loop is never closed — this looks like the old and new versions
// of the body interleaved. Confirm against the real file before relying on it.
// @ts-ignore: decorator
@inline
export function memset(dest: usize, c: u8, n: usize): void { // see: musl/src/string/memset.c
// compact variant: one byte per iteration
if (ASC_SHRINK_LEVEL > 1) {
while (n) {
store<u8>(dest++, c);
--n;
}
} else {
// fill head and tail with minimal branching
if (!n) return;
store<u8>(dest, c);
store<u8>(dest + n - 1, c);
if (n <= 2) return;
// fill head and tail with minimal branching
if (!n) return;
store<u8>(dest, c);
store<u8>(dest + n - 1, c);
if (n <= 2) return;
store<u8>(dest + 1, c);
store<u8>(dest + 2, c);
store<u8>(dest + n - 2, c);
store<u8>(dest + n - 3, c);
if (n <= 6) return;
store<u8>(dest + 3, c);
store<u8>(dest + n - 4, c);
if (n <= 8) return;
store<u8>(dest + 1, c);
store<u8>(dest + 2, c);
store<u8>(dest + n - 2, c);
store<u8>(dest + n - 3, c);
if (n <= 6) return;
store<u8>(dest + 3, c);
store<u8>(dest + n - 4, c);
if (n <= 8) return;
// advance pointer to align it at 4-byte boundary
// (-dest & 3 is the distance to the next multiple of 4; n &= -4 rounds the remaining
// length down to a multiple of 4 — the first and last 4 bytes were written above)
let k: usize = -dest & 3;
dest += k;
n -= k;
n &= -4;
// advance pointer to align it at 4-byte boundary
var k: usize = -dest & 3;
dest += k;
n -= k;
n &= -4;
// 0xFFFFFFFF / 255 == 0x01010101, so the product repeats c in all four bytes
let c32: u32 = <u32>-1 / 255 * c;
var c32: u32 = <u32>-1 / 255 * c;
// fill head/tail up to 28 bytes each in preparation
store<u32>(dest, c32);
store<u32>(dest + n - 4, c32);
if (n <= 8) return;
store<u32>(dest + 4, c32);
store<u32>(dest + 8, c32);
store<u32>(dest + n - 12, c32);
store<u32>(dest + n - 8, c32);
if (n <= 24) return;
store<u32>(dest + 12, c32);
store<u32>(dest + 16, c32);
store<u32>(dest + 20, c32);
store<u32>(dest + 24, c32);
store<u32>(dest + n - 28, c32);
store<u32>(dest + n - 24, c32);
store<u32>(dest + n - 20, c32);
store<u32>(dest + n - 16, c32);
// fill head/tail up to 28 bytes each in preparation
store<u32>(dest, c32);
store<u32>(dest + n - 4, c32);
if (n <= 8) return;
store<u32>(dest + 4, c32);
store<u32>(dest + 8, c32);
store<u32>(dest + n - 12, c32);
store<u32>(dest + n - 8, c32);
if (n <= 24) return;
store<u32>(dest + 12, c32);
store<u32>(dest + 16, c32);
store<u32>(dest + 20, c32);
store<u32>(dest + 24, c32);
store<u32>(dest + n - 28, c32);
store<u32>(dest + n - 24, c32);
store<u32>(dest + n - 20, c32);
store<u32>(dest + n - 16, c32);
// align to a multiple of 8
// (dest is 4-byte aligned here, so skipping 24 or 28 bytes lands on an 8-byte boundary;
// the skipped bytes were already filled by the stores above)
k = 24 + (dest & 4);
dest += k;
n -= k;
// align to a multiple of 8
k = 24 + (dest & 4);
dest += k;
n -= k;
// copy 32 bytes each
// c64 carries the 4-byte pattern in both its low and high half
var c64: u64 = <u64>c32 | (<u64>c32 << 32);
while (n >= 32) {
store<u64>(dest, c64);
store<u64>(dest + 8, c64);
store<u64>(dest + 16, c64);
store<u64>(dest + 24, c64);
n -= 32;
dest += 32;
// copy 32 bytes each
let c64: u64 = <u64>c32 | (<u64>c32 << 32);
while (n >= 32) {
store<u64>(dest, c64);
store<u64>(dest + 8, c64);
store<u64>(dest + 16, c64);
store<u64>(dest + 24, c64);
n -= 32;
dest += 32;
}
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff