Improve text encoding API

This commit is contained in:
dcode
2019-03-28 18:57:35 +01:00
parent 0dcfcc7935
commit aa38d06c31
15 changed files with 3304 additions and 1577 deletions

194
std/assembly/encoding.ts Normal file
View File

@ -0,0 +1,194 @@
import { ALLOCATE, REGISTER, REALLOCATE, MAX_BYTELENGTH } from "./runtime";
import { E_INVALIDLENGTH } from "./util/error";
/** UTF16 encoding. */
export namespace UTF16 {

  /** Computes the number of bytes a string occupies once encoded as UTF16. */
  export function length(str: string): i32 {
    // Each UTF16 code unit is two bytes wide.
    return str.length << 1;
  }

  /** Copies the code units of a string into a newly allocated buffer. */
  export function encode(str: string): ArrayBuffer {
    var byteLength = <usize>str.length << 1;
    var out = ALLOCATE(byteLength);
    memory.copy(out, changetype<usize>(str), byteLength);
    return REGISTER<ArrayBuffer>(out);
  }

  /** Creates a string from the UTF16 contents of a buffer. */
  export function decode(buf: ArrayBuffer): string {
    var ptr = changetype<usize>(buf);
    return decodeRaw(ptr, buf.byteLength);
  }

  /**
   * Creates a string from `len` bytes of UTF16 data located at `buf`.
   * Throws a RangeError if `len`, interpreted as unsigned, exceeds MAX_BYTELENGTH.
   */
  // @ts-ignore: decorator
  @unsafe
  export function decodeRaw(buf: usize, len: i32): string {
    var byteLength = <usize>len;
    if (byteLength > <usize>MAX_BYTELENGTH) throw new RangeError(E_INVALIDLENGTH);
    var out = ALLOCATE(byteLength);
    memory.copy(out, buf, byteLength);
    return REGISTER<string>(out);
  }
}
/** UTF8 encoding. */
export namespace UTF8 {

  /**
   * Calculates the length of a string when encoded as a UTF8 buffer.
   * @param str String to measure.
   * @param delimited If true, one extra byte is counted for a terminating zero.
   * @returns Number of bytes the encoded string occupies.
   */
  export function length(str: string, delimited: bool = false): i32 {
    var strOff = changetype<usize>(str);
    var strEnd = changetype<usize>(str) + (<usize>str.length << 1);
    var bufLen = delimited ? 1 : 0;
    while (strOff < strEnd) {
      let c = <u32>load<u16>(strOff);
      if (c < 128) {
        bufLen += 1; strOff += 2;
      } else if (c < 2048) {
        bufLen += 2; strOff += 2;
      } else {
        // A high surrogate directly followed by a low surrogate forms one 4-byte sequence.
        if ((c & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
          if ((<u32>load<u16>(strOff, 2) & 0xFC00) == 0xDC00) {
            strOff += 4; bufLen += 4;
            continue;
          }
        }
        // Every other code unit, including an unpaired surrogate, takes 3 bytes.
        strOff += 2; bufLen += 3;
      }
    }
    return bufLen;
  }

  /**
   * Encodes a string as a UTF8 buffer.
   * @param str String to encode.
   * @param delimited If true, a terminating zero byte is appended.
   * @returns Buffer sized by `length(str, delimited)` holding the encoded bytes.
   */
  export function encode(str: string, delimited: bool = false): ArrayBuffer {
    var strOff = changetype<usize>(str);
    var strEnd = changetype<usize>(str) + (<usize>str.length << 1);
    var buf = ALLOCATE(<usize>length(str, delimited));
    var bufOff = changetype<usize>(buf);
    while (strOff < strEnd) {
      let c1 = <u32>load<u16>(strOff);
      if (c1 < 128) {
        store<u8>(bufOff, c1);
        bufOff += 1; strOff += 2;
      } else if (c1 < 2048) {
        store<u8>(bufOff, c1 >> 6 | 192);
        store<u8>(bufOff, c1 & 63 | 128, 1);
        bufOff += 2; strOff += 2;
      } else {
        if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
          let c2 = <u32>load<u16>(strOff, 2);
          if ((c2 & 0xFC00) == 0xDC00) {
            // Combine the surrogate pair into a single code point.
            c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
            store<u8>(bufOff, c1 >> 18 | 240);
            store<u8>(bufOff, c1 >> 12 & 63 | 128, 1);
            store<u8>(bufOff, c1 >> 6 & 63 | 128, 2);
            store<u8>(bufOff, c1 & 63 | 128, 3);
            strOff += 4; bufOff += 4;
            continue;
          }
        }
        store<u8>(bufOff, c1 >> 12 | 224);
        store<u8>(bufOff, c1 >> 6 & 63 | 128, 1);
        store<u8>(bufOff, c1 & 63 | 128, 2);
        strOff += 2; bufOff += 3;
      }
    }
    assert(strOff == strEnd);
    if (delimited) store<u8>(bufOff, 0);
    return REGISTER<ArrayBuffer>(buf);
  }

  /**
   * Decodes a UTF8 buffer to a string.
   * @param buf Buffer holding the UTF8 data.
   * @param delimited If true, decoding stops at the first zero byte.
   */
  export function decode(buf: ArrayBuffer, delimited: bool = false): string {
    return delimited
      ? decodeRawDelimited(changetype<usize>(buf), buf.byteLength)
      : decodeRaw(changetype<usize>(buf), buf.byteLength);
  }

  /**
   * Decodes exactly `len` bytes of UTF8 data located at `buf` to a string.
   * Decoding stops early at a multi-byte sequence that is cut off by the end
   * of the input, mirroring `decodeRawDelimited`.
   * @throws RangeError if `len` is negative or too large for the worst-case
   *   output of two bytes per input byte to fit in MAX_BYTELENGTH.
   */
  // @ts-ignore: decorator
  @unsafe
  export function decodeRaw(buf: usize, len: i32): string {
    // A negative length becomes a huge unsigned value and is rejected here as well.
    if (<usize>len > (<usize>MAX_BYTELENGTH >> 1)) throw new RangeError(E_INVALIDLENGTH);
    var bufOff = buf;
    var bufEnd = buf + <usize>len;
    var str = ALLOCATE(<usize>len << 1); // max is one u16 char per u8 byte
    var strOff = str;
    while (bufOff < bufEnd) {
      let cp = <u32>load<u8>(bufOff++);
      if (cp < 128) {
        store<u16>(strOff, cp);
        strOff += 2;
      } else if (cp > 191 && cp < 224) {
        if (bufOff >= bufEnd) break; // truncated 2-byte sequence
        store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
        strOff += 2;
      } else if (cp > 239) { // cp is a byte value, so no upper bound is needed
        // Without this check a lone lead byte would read past the input and
        // write 4 bytes into an output sized for a single input byte.
        if (bufOff + 3 > bufEnd) break; // truncated 4-byte sequence
        cp = (
          (cp & 7) << 18 |
          (load<u8>(bufOff) & 63) << 12 |
          (load<u8>(bufOff, 1) & 63) << 6 |
          load<u8>(bufOff, 2) & 63
        ) - 0x10000;
        bufOff += 3;
        store<u16>(strOff, 0xD800 + (cp >> 10));
        store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
        strOff += 4;
      } else {
        if (bufOff + 2 > bufEnd) break; // truncated 3-byte sequence
        store<u16>(strOff,
          (cp & 15) << 12 |
          (load<u8>(bufOff) & 63) << 6 |
          load<u8>(bufOff, 1) & 63
        );
        bufOff += 2; strOff += 2;
      }
    }
    // Shrink the worst-case allocation to the bytes actually written.
    return REGISTER<string>(REALLOCATE(str, strOff - str));
  }

  /**
   * Decodes zero-terminated UTF8 data located at `buf` to a string, reading
   * at most `maxLen` bytes. Decoding stops at the first zero byte, at the
   * limit, or at a multi-byte sequence that is cut off by the limit.
   */
  // @ts-ignore: decorator
  @unsafe
  export function decodeRawDelimited(buf: usize, maxLen: i32 = MAX_BYTELENGTH): string {
    var bufOff = buf;
    var bufLim = buf + <usize>maxLen;
    // Guard wraparound; equality is allowed so that an empty input decodes to an empty string.
    assert(bufLim >= bufOff);
    var str = ALLOCATE(16); // optimize for small strings
    var strLen = <usize>0;
    while (bufOff < bufLim) {
      let cp = <u32>load<u8>(bufOff++);
      if (cp < 128) {
        if (!cp) break; // terminating zero
        str = REALLOCATE(str, strLen + 2);
        store<u16>(str + strLen, cp);
        strLen += 2;
      } else if (cp > 191 && cp < 224) {
        if (bufOff >= bufLim) break;
        str = REALLOCATE(str, strLen + 2);
        store<u16>(str + strLen, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
        strLen += 2;
      } else if (cp > 239) { // cp is a byte value, so no upper bound is needed
        if (bufOff + 3 > bufLim) break;
        cp = (
          (cp & 7) << 18 |
          (load<u8>(bufOff) & 63) << 12 |
          (load<u8>(bufOff, 1) & 63) << 6 |
          load<u8>(bufOff, 2) & 63
        ) - 0x10000;
        bufOff += 3;
        str = REALLOCATE(str, strLen + 4);
        let strOff = str + strLen;
        store<u16>(strOff, 0xD800 + (cp >> 10));
        store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
        strLen += 4;
      } else {
        if (bufOff + 2 > bufLim) break;
        str = REALLOCATE(str, strLen + 2);
        store<u16>(str + strLen,
          (cp & 15) << 12 |
          (load<u8>(bufOff) & 63) << 6 |
          load<u8>(bufOff, 1) & 63
        );
        bufOff += 2; strLen += 2;
      }
    }
    return REGISTER<string>(REALLOCATE(str, strLen));
  }
}

View File

@ -1230,7 +1230,6 @@ declare class String {
static fromCodePoints(arr: i32[]): string;
readonly length: i32;
readonly lengthUTF8: i32;
charAt(index: u32): string;
charCodeAt(index: u32): u16;
@ -1253,8 +1252,21 @@ declare class String {
slice(beginIndex: i32, endIndex?: i32): string;
split(separator?: string, limit?: i32): string[];
toString(): string;
static fromUTF8(ptr: usize, len: usize): string;
toUTF8(): usize;
}
/** UTF16 encoding. */
declare namespace UTF16 {
  /** Calculates the length of a string when encoded as a UTF16 buffer. */
  export function length(str: string): i32;
  /** Encodes a string as a UTF16 buffer. */
  export function encode(str: string): ArrayBuffer;
  /** Decodes a UTF16 buffer to a string. The implementation takes no `delimited` flag. */
  export function decode(buf: ArrayBuffer): string;
  /** Decodes `len` bytes of UTF16 data at `buf` to a string. */
  export function decodeRaw(buf: usize, len: i32): string; // unsafe
}
/** UTF8 encoding. */
declare namespace UTF8 {
/** Calculates the length of a string when encoded as a UTF8 buffer, plus one byte if `delimited`. */
export function length(str: string, delimited?: bool): i32;
/** Encodes a string as a UTF8 buffer, appending a zero byte if `delimited`. */
export function encode(str: string, delimited?: bool): ArrayBuffer;
/** Decodes a UTF8 buffer to a string, stopping at the first zero byte if `delimited`. */
export function decode(buf: ArrayBuffer, delimited?: bool): string;
/** Decodes `len` bytes of UTF8 data at `buf` to a string. */
export function decodeRaw(buf: usize, len: i32): string; // unsafe
/** Decodes zero-terminated UTF8 data at `buf` to a string, reading at most `maxLen` bytes. */
export function decodeRawDelimited(buf: usize, maxLen?: i32): string; // unsafe
}
/** Class for representing a runtime error. Base class of all errors. */

View File

@ -4,6 +4,7 @@ import { ALLOCATE, REGISTER, HEADER, HEADER_SIZE, MAKEARRAY, ArrayBufferView } f
import { MAX_SIZE_32 } from "./util/allocator";
import { compareImpl, parse, CharCode, isWhiteSpaceOrLineTerminator } from "./util/string";
import { E_INVALIDLENGTH } from "./util/error";
import { UTF8 } from "./encoding";
@sealed export abstract class String {
@ -408,112 +409,6 @@ import { E_INVALIDLENGTH } from "./util/error";
/** Returns this string itself. */
toString(): String {
return this;
}
/** Number of bytes this string occupies as UTF8, including a terminating zero byte. */
get lengthUTF8(): i32 {
  var cur = changetype<usize>(this);
  var stop = cur + (<usize>this.length << 1);
  var total = 1; // null terminated
  while (cur < stop) {
    let unit = <u32>load<u16>(cur);
    if (unit < 128) {
      total += 1; cur += 2;
      continue;
    }
    if (unit < 2048) {
      total += 2; cur += 2;
      continue;
    }
    // A high surrogate directly followed by a low surrogate counts as one 4-byte sequence.
    if ((unit & 0xFC00) == 0xD800 && cur + 2 < stop) {
      if ((<u32>load<u16>(cur, 2) & 0xFC00) == 0xDC00) {
        total += 4; cur += 4;
        continue;
      }
    }
    total += 3; cur += 2;
  }
  return total;
}
/**
 * Decodes `len` bytes of UTF8 data starting at `ptr` into a new string.
 * Returns the empty string when `len` is zero. Asserts that every multi-byte
 * sequence lies completely within the given length.
 */
static fromUTF8(ptr: usize, len: usize): string {
if (len < 1) return changetype<string>("");
var ptrPos = <usize>0;
// Temporary buffer sized for the worst case of two output bytes per input byte.
var buf = memory.allocate(<usize>len << 1);
var bufPos = <usize>0;
while (ptrPos < len) {
let cp = <u32>load<u8>(ptr + ptrPos++);
if (cp < 128) {
// Single byte
store<u16>(buf + bufPos, cp);
bufPos += 2;
} else if (cp > 191 && cp < 224) {
// Lead byte of a 2-byte sequence
assert(ptrPos + 1 <= len);
store<u16>(buf + bufPos, (cp & 31) << 6 | load<u8>(ptr + ptrPos++) & 63);
bufPos += 2;
// NOTE(review): cp is loaded as a byte, so `cp < 365` is always true here.
} else if (cp > 239 && cp < 365) {
// Lead byte of a 4-byte sequence, stored as a surrogate pair
assert(ptrPos + 3 <= len);
cp = (
(cp & 7) << 18 |
(load<u8>(ptr + ptrPos++) & 63) << 12 |
(load<u8>(ptr + ptrPos++) & 63) << 6 |
load<u8>(ptr + ptrPos++) & 63
) - 0x10000;
store<u16>(buf + bufPos, 0xD800 + (cp >> 10));
bufPos += 2;
store<u16>(buf + bufPos, 0xDC00 + (cp & 1023));
bufPos += 2;
} else {
// Everything else is handled as the lead byte of a 3-byte sequence
assert(ptrPos + 2 <= len);
store<u16>(buf + bufPos,
(cp & 15) << 12 |
(load<u8>(ptr + ptrPos++) & 63) << 6 |
load<u8>(ptr + ptrPos++) & 63
);
bufPos += 2;
}
}
assert(ptrPos == len);
// Copy the decoded code units into an exactly sized allocation and release the temporary buffer.
var out = ALLOCATE(bufPos);
memory.copy(changetype<usize>(out), buf, bufPos);
memory.free(buf);
return REGISTER<string>(out);
}
/**
 * Encodes this string as zero-terminated UTF8 into memory obtained from
 * `memory.allocate` and returns the address of the first byte.
 */
toUTF8(): usize {
  var src = changetype<usize>(this);
  var srcEnd = src + (<usize>this.length << 1);
  var start = memory.allocate(<usize>this.lengthUTF8);
  var dst = start;
  while (src < srcEnd) {
    let unit = <u32>load<u16>(src);
    if (unit < 128) {
      store<u8>(dst, unit);
      dst += 1; src += 2;
      continue;
    }
    if (unit < 2048) {
      store<u8>(dst, unit >> 6 | 192);
      store<u8>(dst, unit & 63 | 128, 1);
      dst += 2; src += 2;
      continue;
    }
    if ((unit & 0xFC00) == 0xD800 && src + 2 < srcEnd) {
      let next = <u32>load<u16>(src, 2);
      if ((next & 0xFC00) == 0xDC00) {
        // Combine the surrogate pair into a single code point.
        let cp = 0x10000 + ((unit & 0x03FF) << 10) + (next & 0x03FF);
        store<u8>(dst, cp >> 18 | 240);
        store<u8>(dst, cp >> 12 & 63 | 128, 1);
        store<u8>(dst, cp >> 6 & 63 | 128, 2);
        store<u8>(dst, cp & 63 | 128, 3);
        dst += 4; src += 4;
        continue;
      }
    }
    store<u8>(dst, unit >> 12 | 224);
    store<u8>(dst, unit >> 6 & 63 | 128, 1);
    store<u8>(dst, unit & 63 | 128, 2);
    dst += 3; src += 2;
  }
  // Terminating zero byte
  store<u8>(dst, 0);
  return start;
}
}
// @ts-ignore: nolib