diff --git a/README.md b/README.md index 56e7fb5..f019d27 100644 --- a/README.md +++ b/README.md @@ -374,8 +374,9 @@ and the JVM: * WebAssembly has a nice data section for byte arrays whereas the JVM does not. Right now we use a single-byte-char string constant (i.e. ISO-8859 charset). This saves class file size, but this means we call `String::getBytes` on init to load bytes from the string constant. Due to the JVM using an unsigned 16-bit int as the string constant - length, the maximum length is 65536, so we chunk data sections into as many max-65500-byte lengths we need to load it - all. + length, the maximum byte length is 65535. Since the string constants are stored as UTF-8 constants, they can be up to + four bytes a character. Therefore, we populate memory in data chunks no larger than 16300 (nice round number to make + sure that even in the worst case of 4 bytes per char in UTF-8 view, we're still under the max). * The JVM makes no guarantees about trailing bits being preserved on NaN floating point representations like WebAssembly does. This causes some mismatch on WebAssembly tests depending on how the JVM "feels" (I haven't dug into why some bit patterns stay and some don't when NaNs are passed through methods). diff --git a/compiler/src/main/kotlin/asmble/compile/jvm/ByteBufferMem.kt b/compiler/src/main/kotlin/asmble/compile/jvm/ByteBufferMem.kt index 87cfa02..9a1c3ba 100644 --- a/compiler/src/main/kotlin/asmble/compile/jvm/ByteBufferMem.kt +++ b/compiler/src/main/kotlin/asmble/compile/jvm/ByteBufferMem.kt @@ -49,8 +49,10 @@ open class ByteBufferMem(val direct: Boolean = true) : Mem { TypeInsnNode(Opcodes.CHECKCAST, memType.asmName) ).addInsns( // We're going to do this as an LDC string in ISO-8859 and read it back at runtime. However, - // due to JVM limits, we can't have a string > 65536 chars, so I'll chunk it every 65500 chars. - bytes.chunked(65500).flatMap { bytes -> + // due to JVM limits, we can't have a string > 65536 chars. 
We chunk into 16300 because when + // converting to UTF-8 const it can be up to 4 bytes per char, so this makes sure it doesn't + // overflow. + bytes.chunked(16300).flatMap { bytes -> sequenceOf( LdcInsnNode(bytes.toString(Charsets.ISO_8859_1)), LdcInsnNode("ISO-8859-1"), diff --git a/compiler/src/test/kotlin/asmble/compile/jvm/LargeDataTest.kt b/compiler/src/test/kotlin/asmble/compile/jvm/LargeDataTest.kt index 0c189df..b938d39 100644 --- a/compiler/src/test/kotlin/asmble/compile/jvm/LargeDataTest.kt +++ b/compiler/src/test/kotlin/asmble/compile/jvm/LargeDataTest.kt @@ -12,7 +12,9 @@ import kotlin.test.assertEquals class LargeDataTest : TestBase() { @Test fun testLargeData() { - // This previously failed because string constants can't be longer than 65536 chars + // This previously failed because string constants can't be longer than 65536 chars. + // We create a byte array across the whole gamut of bytes to test UTF-8 encoding. + val bytesExpected = ByteArray(70000) { ((it % 255) - Byte.MIN_VALUE).toByte() } val mod = Node.Module( memories = listOf(Node.Type.Memory( limits = Node.ResizableLimits(initial = 2, maximum = 2) @@ -20,7 +22,7 @@ class LargeDataTest : TestBase() { data = listOf(Node.Data( index = 0, offset = listOf(Node.Instr.I32Const(0)), - data = ByteArray(70000) { 'a'.toByte() } + data = bytesExpected )) ) val ctx = ClsContext( @@ -35,9 +37,9 @@ class LargeDataTest : TestBase() { val field = cls.getDeclaredField("memory").apply { isAccessible = true } val buf = field[cls.newInstance()] as ByteBuffer // Grab all + 1 and check values - val bytes = ByteArray(70001).also { buf.get(0, it) } - bytes.forEachIndexed { index, byte -> - assertEquals(if (index == 70000) 0.toByte() else 'a'.toByte(), byte) + val bytesActual = ByteArray(70001).also { buf.get(0, it) } + bytesActual.forEachIndexed { index, byte -> + assertEquals(if (index == 70000) 0.toByte() else bytesExpected[index], byte) } } } \ No newline at end of file