Add lightweight paths for memcpy & memset for shrink level != 0 (#601)

This commit is contained in:
Max Graey 2019-05-24 17:00:02 +03:00 committed by Daniel Wirtz
parent af00bdeefe
commit 7cd04b65ef
22 changed files with 26977 additions and 1492 deletions

View File

@ -1,154 +1,163 @@
// export function memcpy(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memcpy.c export function memcpy(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memcpy.c
// var w: u32, x: u32; if (ASC_SHRINK_LEVEL > 1) {
while (n) {
store<u8>(dest++, load<u8>(src++));
--n;
}
} else {
let w: u32, x: u32;
// // copy 1 byte each until src is aligned to 4 bytes // copy 1 byte each until src is aligned to 4 bytes
// while (n && (src & 3)) { while (n && (src & 3)) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// n--; n--;
// } }
// // if dst is aligned to 4 bytes as well, copy 4 bytes each // if dst is aligned to 4 bytes as well, copy 4 bytes each
// if ((dest & 3) == 0) { if ((dest & 3) == 0) {
// while (n >= 16) { while (n >= 16) {
// store<u32>(dest , load<u32>(src )); store<u32>(dest , load<u32>(src ));
// store<u32>(dest + 4, load<u32>(src + 4)); store<u32>(dest + 4, load<u32>(src + 4));
// store<u32>(dest + 8, load<u32>(src + 8)); store<u32>(dest + 8, load<u32>(src + 8));
// store<u32>(dest + 12, load<u32>(src + 12)); store<u32>(dest + 12, load<u32>(src + 12));
// src += 16; dest += 16; n -= 16; src += 16; dest += 16; n -= 16;
// } }
// if (n & 8) { if (n & 8) {
// store<u32>(dest , load<u32>(src )); store<u32>(dest , load<u32>(src ));
// store<u32>(dest + 4, load<u32>(src + 4)); store<u32>(dest + 4, load<u32>(src + 4));
// dest += 8; src += 8; dest += 8; src += 8;
// } }
// if (n & 4) { if (n & 4) {
// store<u32>(dest, load<u32>(src)); store<u32>(dest, load<u32>(src));
// dest += 4; src += 4; dest += 4; src += 4;
// } }
// if (n & 2) { // drop to 2 bytes each if (n & 2) { // drop to 2 bytes each
// store<u16>(dest, load<u16>(src)); store<u16>(dest, load<u16>(src));
// dest += 2; src += 2; dest += 2; src += 2;
// } }
// if (n & 1) { // drop to 1 byte if (n & 1) { // drop to 1 byte
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// return; return;
// } }
// // if dst is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each // if dst is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each
// // doing shifts is faster when copying enough bytes (here: 32 or more) // doing shifts is faster when copying enough bytes (here: 32 or more)
// if (n >= 32) { if (n >= 32) {
// switch (dest & 3) { switch (dest & 3) {
// // known to be != 0 // known to be != 0
// case 1: { case 1: {
// w = load<u32>(src); w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// n -= 3; n -= 3;
// while (n >= 17) { while (n >= 17) {
// x = load<u32>(src + 1); x = load<u32>(src + 1);
// store<u32>(dest, w >> 24 | x << 8); store<u32>(dest, w >> 24 | x << 8);
// w = load<u32>(src + 5); w = load<u32>(src + 5);
// store<u32>(dest + 4, x >> 24 | w << 8); store<u32>(dest + 4, x >> 24 | w << 8);
// x = load<u32>(src + 9); x = load<u32>(src + 9);
// store<u32>(dest + 8, w >> 24 | x << 8); store<u32>(dest + 8, w >> 24 | x << 8);
// w = load<u32>(src + 13); w = load<u32>(src + 13);
// store<u32>(dest + 12, x >> 24 | w << 8); store<u32>(dest + 12, x >> 24 | w << 8);
// src += 16; dest += 16; n -= 16; src += 16; dest += 16; n -= 16;
// } }
// break; break;
// } }
// case 2: { case 2: {
// w = load<u32>(src); w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// n -= 2; n -= 2;
// while (n >= 18) { while (n >= 18) {
// x = load<u32>(src + 2); x = load<u32>(src + 2);
// store<u32>(dest, w >> 16 | x << 16); store<u32>(dest, w >> 16 | x << 16);
// w = load<u32>(src + 6); w = load<u32>(src + 6);
// store<u32>(dest + 4, x >> 16 | w << 16); store<u32>(dest + 4, x >> 16 | w << 16);
// x = load<u32>(src + 10); x = load<u32>(src + 10);
// store<u32>(dest + 8, w >> 16 | x << 16); store<u32>(dest + 8, w >> 16 | x << 16);
// w = load<u32>(src + 14); w = load<u32>(src + 14);
// store<u32>(dest + 12, x >> 16 | w << 16); store<u32>(dest + 12, x >> 16 | w << 16);
// src += 16; dest += 16; n -= 16; src += 16; dest += 16; n -= 16;
// } }
// break; break;
// } }
// case 3: { case 3: {
// w = load<u32>(src); w = load<u32>(src);
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// n -= 1; n -= 1;
// while (n >= 19) { while (n >= 19) {
// x = load<u32>(src + 3); x = load<u32>(src + 3);
// store<u32>(dest, w >> 8 | x << 24); store<u32>(dest, w >> 8 | x << 24);
// w = load<u32>(src + 7); w = load<u32>(src + 7);
// store<u32>(dest + 4, x >> 8 | w << 24); store<u32>(dest + 4, x >> 8 | w << 24);
// x = load<u32>(src + 11); x = load<u32>(src + 11);
// store<u32>(dest + 8, w >> 8 | x << 24); store<u32>(dest + 8, w >> 8 | x << 24);
// w = load<u32>(src + 15); w = load<u32>(src + 15);
// store<u32>(dest + 12, x >> 8 | w << 24); store<u32>(dest + 12, x >> 8 | w << 24);
// src += 16; dest += 16; n -= 16; src += 16; dest += 16; n -= 16;
// } }
// break; break;
// } }
// } }
// } }
// // copy remaining bytes one by one // copy remaining bytes one by one
// if (n & 16) { if (n & 16) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// if (n & 8) { if (n & 8) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// if (n & 4) { if (n & 4) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// if (n & 2) { if (n & 2) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// if (n & 1) { if (n & 1) {
// store<u8>(dest++, load<u8>(src++)); store<u8>(dest++, load<u8>(src++));
// } }
// } }
}
// @ts-ignore: decorator // @ts-ignore: decorator
@inline @inline
export function memmove(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memmove.c export function memmove(dest: usize, src: usize, n: usize): void { // see: musl/src/string/memmove.c
if (dest === src) return; if (dest === src) return;
// if (src + n <= dest || dest + n <= src) { if (ASC_SHRINK_LEVEL < 1) {
// memcpy(dest, src, n); if (src + n <= dest || dest + n <= src) {
// return; memcpy(dest, src, n);
// } return;
}
}
if (dest < src) { if (dest < src) {
if ((src & 7) == (dest & 7)) { if ((src & 7) == (dest & 7)) {
while (dest & 7) { while (dest & 7) {
@ -187,62 +196,68 @@ export function memmove(dest: usize, src: usize, n: usize): void { // see: musl/
// @ts-ignore: decorator // @ts-ignore: decorator
@inline @inline
export function memset(dest: usize, c: u8, n: usize): void { // see: musl/src/string/memset export function memset(dest: usize, c: u8, n: usize): void { // see: musl/src/string/memset
if (ASC_SHRINK_LEVEL > 1) {
while (n) {
store<u8>(dest++, c);
--n;
}
} else {
// fill head and tail with minimal branching
if (!n) return;
store<u8>(dest, c);
store<u8>(dest + n - 1, c);
if (n <= 2) return;
// fill head and tail with minimal branching store<u8>(dest + 1, c);
if (!n) return; store<u8>(dest + 2, c);
store<u8>(dest, c); store<u8>(dest + n - 2, c);
store<u8>(dest + n - 1, c); store<u8>(dest + n - 3, c);
if (n <= 2) return; if (n <= 6) return;
store<u8>(dest + 3, c);
store<u8>(dest + n - 4, c);
if (n <= 8) return;
store<u8>(dest + 1, c); // advance pointer to align it at 4-byte boundary
store<u8>(dest + 2, c); let k: usize = -dest & 3;
store<u8>(dest + n - 2, c); dest += k;
store<u8>(dest + n - 3, c); n -= k;
if (n <= 6) return; n &= -4;
store<u8>(dest + 3, c);
store<u8>(dest + n - 4, c);
if (n <= 8) return;
// advance pointer to align it at 4-byte boundary let c32: u32 = <u32>-1 / 255 * c;
var k: usize = -dest & 3;
dest += k;
n -= k;
n &= -4;
var c32: u32 = <u32>-1 / 255 * c; // fill head/tail up to 28 bytes each in preparation
store<u32>(dest, c32);
store<u32>(dest + n - 4, c32);
if (n <= 8) return;
store<u32>(dest + 4, c32);
store<u32>(dest + 8, c32);
store<u32>(dest + n - 12, c32);
store<u32>(dest + n - 8, c32);
if (n <= 24) return;
store<u32>(dest + 12, c32);
store<u32>(dest + 16, c32);
store<u32>(dest + 20, c32);
store<u32>(dest + 24, c32);
store<u32>(dest + n - 28, c32);
store<u32>(dest + n - 24, c32);
store<u32>(dest + n - 20, c32);
store<u32>(dest + n - 16, c32);
// fill head/tail up to 28 bytes each in preparation // align to a multiple of 8
store<u32>(dest, c32); k = 24 + (dest & 4);
store<u32>(dest + n - 4, c32); dest += k;
if (n <= 8) return; n -= k;
store<u32>(dest + 4, c32);
store<u32>(dest + 8, c32);
store<u32>(dest + n - 12, c32);
store<u32>(dest + n - 8, c32);
if (n <= 24) return;
store<u32>(dest + 12, c32);
store<u32>(dest + 16, c32);
store<u32>(dest + 20, c32);
store<u32>(dest + 24, c32);
store<u32>(dest + n - 28, c32);
store<u32>(dest + n - 24, c32);
store<u32>(dest + n - 20, c32);
store<u32>(dest + n - 16, c32);
// align to a multiple of 8 // copy 32 bytes each
k = 24 + (dest & 4); let c64: u64 = <u64>c32 | (<u64>c32 << 32);
dest += k; while (n >= 32) {
n -= k; store<u64>(dest, c64);
store<u64>(dest + 8, c64);
// copy 32 bytes each store<u64>(dest + 16, c64);
var c64: u64 = <u64>c32 | (<u64>c32 << 32); store<u64>(dest + 24, c64);
while (n >= 32) { n -= 32;
store<u64>(dest, c64); dest += 32;
store<u64>(dest + 8, c64); }
store<u64>(dest + 16, c64);
store<u64>(dest + 24, c64);
n -= 32;
dest += 32;
} }
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff