/**
 * Copies `n` bytes from `src` to `dest`. The two regions must not overlap;
 * use `move_memory` when they might.
 *
 * Based on musl's implementation of memcpy.
 * Not a future instruction and sufficiently covered by the upcoming
 * move_memory intrinsic.
 *
 * @param dest Address of the first byte to write.
 * @param src  Address of the first byte to read.
 * @param n    Number of bytes to copy.
 */
function copy_memory(dest: usize, src: usize, n: usize): void {
  var w: u32, x: u32;

  // copy 1 byte each until src is aligned to 4 bytes
  while (n && src % 4) {
    store<u8>(dest++, load<u8>(src++));
    n--;
  }

  // if dest is aligned to 4 bytes as well, copy 4 bytes each
  if (dest % 4 == 0) {
    while (n >= 16) {
      store<u32>(dest     , load<u32>(src     ));
      store<u32>(dest +  4, load<u32>(src +  4));
      store<u32>(dest +  8, load<u32>(src +  8));
      store<u32>(dest + 12, load<u32>(src + 12));
      src += 16; dest += 16; n -= 16;
    }
    // fewer than 16 bytes remain: each set bit of n selects one fixed-size copy
    if (n & 8) {
      store<u32>(dest    , load<u32>(src    ));
      store<u32>(dest + 4, load<u32>(src + 4));
      dest += 8; src += 8;
    }
    if (n & 4) {
      store<u32>(dest, load<u32>(src));
      dest += 4; src += 4;
    }
    if (n & 2) { // drop to 2 bytes each
      store<u16>(dest, load<u16>(src));
      dest += 2; src += 2;
    }
    if (n & 1) { // drop to 1 byte
      store<u8>(dest++, load<u8>(src++));
    }
    return;
  }

  // if dest is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each
  // doing shifts is faster when copying enough bytes (here: 32 or more)
  if (n >= 32) {
    switch (dest % 4) {
      // known to be != 0
      case 1:
        // preload the aligned source word, then copy 3 bytes so dest becomes aligned
        w = load<u32>(src);
        store<u8>(dest++, load<u8>(src++));
        store<u8>(dest++, load<u8>(src++));
        store<u8>(dest++, load<u8>(src++));
        n -= 3;
        while (n >= 17) {
          // each output word = last byte of the previous source word + first 3 of the next
          x = load<u32>(src + 1);
          store<u32>(dest, w >> 24 | x << 8);
          w = load<u32>(src + 5);
          store<u32>(dest + 4, x >> 24 | w << 8);
          x = load<u32>(src + 9);
          store<u32>(dest + 8, w >> 24 | x << 8);
          w = load<u32>(src + 13);
          store<u32>(dest + 12, x >> 24 | w << 8);
          src += 16; dest += 16; n -= 16;
        }
        break;
      case 2:
        w = load<u32>(src);
        store<u8>(dest++, load<u8>(src++));
        store<u8>(dest++, load<u8>(src++));
        n -= 2;
        while (n >= 18) {
          x = load<u32>(src + 2);
          store<u32>(dest, w >> 16 | x << 16);
          w = load<u32>(src + 6);
          store<u32>(dest + 4, x >> 16 | w << 16);
          x = load<u32>(src + 10);
          store<u32>(dest + 8, w >> 16 | x << 16);
          w = load<u32>(src + 14);
          store<u32>(dest + 12, x >> 16 | w << 16);
          src += 16; dest += 16; n -= 16;
        }
        break;
      case 3:
        w = load<u32>(src);
        store<u8>(dest++, load<u8>(src++));
        n -= 1;
        while (n >= 19) {
          x = load<u32>(src + 3);
          store<u32>(dest, w >> 8 | x << 24);
          w = load<u32>(src + 7);
          store<u32>(dest + 4, x >> 8 | w << 24);
          x = load<u32>(src + 11);
          store<u32>(dest + 8, w >> 8 | x << 24);
          w = load<u32>(src + 15);
          store<u32>(dest + 12, x >> 8 | w << 24);
          src += 16; dest += 16; n -= 16;
        }
        break;
    }
  }

  // copy remaining bytes one by one
  if (n & 16) {
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
  }
  if (n & 8) {
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
  }
  if (n & 4) {
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
  }
  if (n & 2) {
    store<u8>(dest++, load<u8>(src++));
    store<u8>(dest++, load<u8>(src++));
  }
  if (n & 1) {
    store<u8>(dest++, load<u8>(src++));
  }
}

/**
 * Copies `n` bytes from `src` to `dest`, producing the correct result even
 * when the two regions overlap.
 *
 * Based on musl's implementation of memmove.
 * Becomes obsolete once https://github.com/WebAssembly/bulk-memory-operations lands.
 *
 * @param dest Address of the first byte to write.
 * @param src  Address of the first byte to read.
 * @param n    Number of bytes to move.
 */
export function move_memory(dest: usize, src: usize, n: usize): void {
  if (dest == src) return;

  // disjoint regions can take the faster non-overlapping copy
  if (src + n <= dest || dest + n <= src) {
    copy_memory(dest, src, n);
    return;
  }

  if (dest < src) {
    // dest precedes src: copy front to back so unread source bytes are not clobbered
    if (src % 8 == dest % 8) {
      while (dest % 8) {
        if (!n) return;
        --n;
        store<u8>(dest++, load<u8>(src++));
      }
      while (n >= 8) {
        store<u64>(dest, load<u64>(src));
        n -= 8; dest += 8; src += 8;
      }
    }
    while (n) {
      store<u8>(dest++, load<u8>(src++));
      --n;
    }
  } else {
    // dest follows src: copy back to front
    if (src % 8 == dest % 8) {
      while ((dest + n) % 8) {
        if (!n) return;
        --n;
        store<u8>(dest + n, load<u8>(src + n));
      }
      while (n >= 8) {
        n -= 8;
        store<u64>(dest + n, load<u64>(src + n));
      }
    }
    while (n) {
      --n;
      store<u8>(dest + n, load<u8>(src + n));
    }
  }
}

/**
 * Fills `n` bytes starting at `dest` with the byte value `c`.
 *
 * Based on musl's implementation of memset.
 * Becomes obsolete once https://github.com/WebAssembly/bulk-memory-operations lands.
 *
 * @param dest Address of the first byte to fill.
 * @param c    Byte value to write.
 * @param n    Number of bytes to fill.
 */
export function set_memory(dest: usize, c: u8, n: usize): void {
  // fill head and tail with minimal branching
  // (head and tail stores may overlap for small n; they write the same value)
  if (!n) return;
  store<u8>(dest, c);
  store<u8>(dest + n - 1, c);
  if (n <= 2) return;

  store<u8>(dest + 1, c);
  store<u8>(dest + 2, c);
  store<u8>(dest + n - 2, c);
  store<u8>(dest + n - 3, c);
  if (n <= 6) return;

  store<u8>(dest + 3, c);
  store<u8>(dest + n - 4, c);
  if (n <= 8) return;

  // advance pointer to align it at 4-byte boundary; the bytes skipped here and
  // the up-to-3 tail bytes dropped by the mask were already written above
  var k: usize = -dest & 3;
  dest += k;
  n -= k;
  n &= -4;

  // replicate the byte into all four lanes of a 32-bit word
  var c32: u32 = 0x01010101 * <u32>c;

  // fill head/tail up to 28 bytes each in preparation
  store<u32>(dest, c32);
  store<u32>(dest + n - 4, c32);
  if (n <= 8) return;

  store<u32>(dest + 4, c32);
  store<u32>(dest + 8, c32);
  store<u32>(dest + n - 12, c32);
  store<u32>(dest + n - 8, c32);
  if (n <= 24) return;

  store<u32>(dest + 12, c32);
  store<u32>(dest + 16, c32);
  store<u32>(dest + 20, c32);
  store<u32>(dest + 24, c32);
  store<u32>(dest + n - 28, c32);
  store<u32>(dest + n - 24, c32);
  store<u32>(dest + n - 20, c32);
  store<u32>(dest + n - 16, c32);

  // align to a multiple of 8
  k = 24 + (dest & 4);
  dest += k;
  n -= k;

  // copy 32 bytes each; widen before shifting so the upper half is populated
  var c64: u64 = <u64>c32 | (<u64>c32 << 32);
  while (n >= 32) {
    store<u64>(dest, c64);
    store<u64>(dest + 8, c64);
    store<u64>(dest + 16, c64);
    store<u64>(dest + 24, c64);
    n -= 32;
    dest += 32;
  }
}

/**
 * Compares the first `n` bytes at `vl` and `vr` as unsigned bytes.
 *
 * Based on musl's implementation of memcmp.
 * Provided because there's no proposed alternative.
 *
 * @param vl Address of the left-hand region.
 * @param vr Address of the right-hand region.
 * @param n  Number of bytes to compare.
 * @returns 0 if the regions are equal over `n` bytes, otherwise the difference
 *          (left minus right) of the first pair of differing bytes.
 */
export function compare_memory(vl: usize, vr: usize, n: usize): i32 {
  if (vl == vr) return 0;
  while (n && load<u8>(vl) == load<u8>(vr)) {
    n--;
    vl++;
    vr++;
  }
  return n ? <i32>load<u8>(vl) - <i32>load<u8>(vr) : 0;
}