diff --git a/README.md b/README.md index ad0e0a7..3e7d123 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The result will be: 70 : i32 -Which is how the test harness prints an integer. +Which is how the test harness prints an integer. See the [examples](examples) directory for more examples. ## CLI Usage diff --git a/build.gradle b/build.gradle index 9d17f24..83b5df1 100644 --- a/build.gradle +++ b/build.gradle @@ -13,7 +13,7 @@ buildscript { } dependencies { classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" - classpath 'me.champeau.gradle:jmh-gradle-plugin:0.4.4' + classpath 'me.champeau.gradle:jmh-gradle-plugin:0.4.5' } } @@ -109,14 +109,25 @@ project(':examples') { project(':examples:rust-regex') { apply plugin: 'application' + apply plugin: 'me.champeau.gradle.jmh' ext.wasmCompiledClassName = 'asmble.generated.RustRegex' dependencies { compile files('build/wasm-classes') + testCompile 'junit:junit:4.12' } compileJava { dependsOn compileRustWasm } mainClassName = 'asmble.examples.rustregex.Main' + test { + testLogging.showStandardStreams = true + testLogging.events 'PASSED', 'SKIPPED' + } + jmh { + iterations = 5 + warmupIterations = 5 + fork = 3 + } } project(':examples:rust-simple') { diff --git a/examples/README.md b/examples/README.md index e186001..2d05481 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,9 +1,11 @@ ## Examples +Below are some examples. + ### Rust -In order of complexity: +Compile Rust to WASM and then to the JVM. In order of complexity: * [rust-simple](rust-simple) * [rust-string](rust-string) -* rust-regex \ No newline at end of file +* [rust-regex](rust-regex) \ No newline at end of file diff --git a/examples/rust-regex/README.md b/examples/rust-regex/README.md index b222de5..ae37aad 100644 --- a/examples/rust-regex/README.md +++ b/examples/rust-regex/README.md @@ -1,8 +1,8 @@ ### Example: Rust Regex -This shows an example of using the Rust regex library on the JVM. 
This builds on [rust-simple](../rust-simple) and -[rust-string](../rust-string). There is also a simple benchmark checking the performance compared to the built-in Java -regex engine. +This shows an example of using the Rust regex library on the JVM compiled via WASM. This builds on +the [rust-simple](../rust-simple) and [rust-string](../rust-string) examples. There is also a simple benchmark checking +the performance compared to the built-in Java regex engine. #### Main @@ -20,6 +20,121 @@ In release mode, the generated class is 903KB w/ ~575 methods. The output: 'Twain' count in Java: 811 'Twain' count in Rust: 811 +#### Tests + +I wanted to compare the Java regex engine with the Rust regex engine. Before running benchmarks, I wrote a +[unit test](src/test/java/asmble/examples/rustregex/RegexTest.java) to test parity. I used the examples from the +aforementioned [blog post](https://rust-leipzig.github.io/regex/2017/03/28/comparison-of-regex-engines/) to test with. +The test simply confirms the Java regex library and the Rust regex library produce the same match counts across the +Mark Twain corpus. 
To run the test, execute: + + gradlew --no-daemon :examples:rust-regex:test + +Here is my output of the test part: + + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: Twain] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: (?i)Twain] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: [a-z]shing] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: Huck[a-zA-Z]+|Saw[a-zA-Z]+] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: \b\w+nn\b] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: [a-q][^u-z]{13}x] SKIPPED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: Tom|Sawyer|Huckleberry|Finn] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: (?i)Tom|Sawyer|Huckleberry|Finn] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: .{0,2}(Tom|Sawyer|Huckleberry|Finn)] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: .{2,4}(Tom|Sawyer|Huckleberry|Finn)] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: Tom.{10,25}river|river.{10,25}Tom] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: [a-zA-Z]+ing] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: \s[a-zA-Z]{0,12}ing\s] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: ([A-Za-z]awyer|[A-Za-z]inn)\s] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: ["'][^"']{0,30}[?!\.]["']] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: ?|?] PASSED + asmble.examples.rustregex.RegexTest > checkJavaVersusRust[pattern: \p{Sm}] PASSED + +As mentioned in the blog post, `[a-q][^u-z]{13}x` is a very slow pattern for Rust, so I skipped it (but it does produce +the same count if you're willing to wait a couple of minutes). 
Also, `?|?` is actually `∞|✓`, it's just not printable +unicode in the text output I used. + #### Benchmarks -TODO: JMH benchmarks \ No newline at end of file +With the accuracy confirmed, it was now time to benchmark the two engines. I wrote a +[JMH benchmark](src/jmh/java/asmble/examples/rustregex/RegexBenchmark.java) to test the same patterns as the unit test +checks. It precompiles the patterns and preloads the target string on the Rust side before checking simple match count. +As with any benchmark, this is just my empirical data and everyone else's will be different. To run the benchmark, +execute (it takes a while to run): + + gradlew --no-daemon :examples:rust-regex:jmh + +Here are my results (reordered and with added linebreaks for readability, higher score is better): + + Benchmark (patternString) Mode Cnt Score Error Units + + RegexBenchmark.javaRegexCheck Twain thrpt 15 29.756 ± 1.169 ops/s + RegexBenchmark.rustRegexCheck Twain thrpt 15 55.012 ± 0.677 ops/s + + RegexBenchmark.javaRegexCheck (?i)Twain thrpt 15 6.181 ± 0.560 ops/s + RegexBenchmark.rustRegexCheck (?i)Twain thrpt 15 1.333 ± 0.029 ops/s + + RegexBenchmark.javaRegexCheck [a-z]shing thrpt 15 6.138 ± 0.937 ops/s + RegexBenchmark.rustRegexCheck [a-z]shing thrpt 15 12.352 ± 0.103 ops/s + + RegexBenchmark.javaRegexCheck Huck[a-zA-Z]+|Saw[a-zA-Z]+ thrpt 15 4.774 ± 0.330 ops/s + RegexBenchmark.rustRegexCheck Huck[a-zA-Z]+|Saw[a-zA-Z]+ thrpt 15 56.079 ± 0.487 ops/s + + RegexBenchmark.javaRegexCheck \b\w+nn\b thrpt 15 2.703 ± 0.086 ops/s + RegexBenchmark.rustRegexCheck \b\w+nn\b thrpt 15 0.131 ± 0.001 ops/s + + RegexBenchmark.javaRegexCheck Tom|Sawyer|Huckleberry|Finn thrpt 15 2.633 ± 0.033 ops/s + RegexBenchmark.rustRegexCheck Tom|Sawyer|Huckleberry|Finn thrpt 15 14.388 ± 0.138 ops/s + + RegexBenchmark.javaRegexCheck (?i)Tom|Sawyer|Huckleberry|Finn thrpt 15 3.178 ± 0.045 ops/s + RegexBenchmark.rustRegexCheck (?i)Tom|Sawyer|Huckleberry|Finn thrpt 15 8.882 ± 0.110 ops/s + + RegexBenchmark.javaRegexCheck 
.{0,2}(Tom|Sawyer|Huckleberry|Finn) thrpt 15 1.191 ± 0.010 ops/s + RegexBenchmark.rustRegexCheck .{0,2}(Tom|Sawyer|Huckleberry|Finn) thrpt 15 0.572 ± 0.012 ops/s + + RegexBenchmark.javaRegexCheck .{2,4}(Tom|Sawyer|Huckleberry|Finn) thrpt 15 1.017 ± 0.024 ops/s + RegexBenchmark.rustRegexCheck .{2,4}(Tom|Sawyer|Huckleberry|Finn) thrpt 15 0.584 ± 0.008 ops/s + + RegexBenchmark.javaRegexCheck Tom.{10,25}river|river.{10,25}Tom thrpt 15 5.326 ± 0.050 ops/s + RegexBenchmark.rustRegexCheck Tom.{10,25}river|river.{10,25}Tom thrpt 15 15.705 ± 0.247 ops/s + + RegexBenchmark.javaRegexCheck [a-zA-Z]+ing thrpt 15 1.768 ± 0.057 ops/s + RegexBenchmark.rustRegexCheck [a-zA-Z]+ing thrpt 15 1.001 ± 0.012 ops/s + + RegexBenchmark.javaRegexCheck \s[a-zA-Z]{0,12}ing\s thrpt 15 4.020 ± 0.111 ops/s + RegexBenchmark.rustRegexCheck \s[a-zA-Z]{0,12}ing\s thrpt 15 0.416 ± 0.004 ops/s + + RegexBenchmark.javaRegexCheck ([A-Za-z]awyer|[A-Za-z]inn)\s thrpt 15 2.441 ± 0.024 ops/s + RegexBenchmark.rustRegexCheck ([A-Za-z]awyer|[A-Za-z]inn)\s thrpt 15 0.591 ± 0.004 ops/s + + RegexBenchmark.javaRegexCheck ["'][^"']{0,30}[?!\.]["'] thrpt 15 20.466 ± 0.309 ops/s + RegexBenchmark.rustRegexCheck ["'][^"']{0,30}[?!\.]["'] thrpt 15 2.459 ± 0.024 ops/s + + RegexBenchmark.javaRegexCheck ?|? thrpt 15 15.856 ± 0.158 ops/s + RegexBenchmark.rustRegexCheck ?|? thrpt 15 14.657 ± 0.177 ops/s + + RegexBenchmark.javaRegexCheck \p{Sm} thrpt 15 22.156 ± 0.406 ops/s + RegexBenchmark.rustRegexCheck \p{Sm} thrpt 15 0.592 ± 0.005 ops/s + +To keep from making this a big long post like most benchmark posts tend to be, here is a bulleted list of notes: + +* I ran this on a Win 10 box, 1.8GHz i7-8550U HP laptop. I used latest Zulu JDK 8. For JMH, I set it at 3 forks, 5 + warmup iterations, and 5 measurement iterations (that's why `cnt` above is 15 = 5 measurements * 3 forks). It took a + bit over 25 minutes to complete. +* All of the tests had the Java and Rust patterns precompiled. 
In Rust's case, I also placed the UTF-8 string on the + accessible heap before the benchmark started to be fair. +* Like the unit test, I excluded `[a-q][^u-z]{13}x` because Rust is really slow at it (Java wins by a mile here). Also + like the unit test, `?|?` is actually `∞|✓`. +* Of the ones tested, Rust is faster in 6 and Java is faster in the other 10. And where Rust is faster, it is much faster. + This is quite decent since the Rust+WASM version uses `ByteBuffer`s everywhere, has some overflow checks, and in + general there are some impedance mismatches between the WASM bytecode and the JVM bytecode. +* Notice the low error numbers on the Rust versions. The error number is the deviation between invocations. This shows + the WASM-to-JVM ends up quite deterministic (or maybe, that there is just too much cruft to JIT, heh). +* If I were more serious about it, I'd check with other OS's, add more iterations, tweak some compiler options, include + regex pattern compilation speed benchmarks, and so on. But I just needed simple proof that speed is reasonable. + +Overall, this shows running Rust on the JVM to be entirely reasonable for certain types of workloads. There are still +memory concerns, but they are not terrible. If given the choice, use a JVM language of course; the safety benefits of Rust don't +outweigh the problems of Rust-to-WASM-to-JVM such as build complexity, security concerns (`ByteBuffer` is where all +memory lives), debugging concerns, etc. But if you have a library in Rust, exposing it to the JVM sans-JNI is a doable +feat if you must. 
\ No newline at end of file
diff --git a/examples/rust-regex/src/jmh/java/asmble/examples/rustregex/RegexBenchmark.java b/examples/rust-regex/src/jmh/java/asmble/examples/rustregex/RegexBenchmark.java
new file mode 100644
index 0000000..6fe6de7
--- /dev/null
+++ b/examples/rust-regex/src/jmh/java/asmble/examples/rustregex/RegexBenchmark.java
@@ -0,0 +1,85 @@
+package asmble.examples.rustregex;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.IOException;
+
+/**
+ * JMH benchmark that counts regex matches over the Twain text with the Java library
+ * ({@link JavaLib}) and with the Rust library ({@link RustLib}).
+ *
+ * <p>The text is loaded and both patterns are compiled in {@link #init()}, so each benchmark
+ * method only performs a single {@code matchCount} call.
+ */
+@State(Scope.Thread)
+public class RegexBenchmark {
+    /** Pattern under test, taken from the list below. */
+    @Param({
+        "Twain",
+        "(?i)Twain",
+        "[a-z]shing",
+        "Huck[a-zA-Z]+|Saw[a-zA-Z]+",
+        "\\b\\w+nn\\b",
+        // Too slow
+        // "[a-q][^u-z]{13}x",
+        "Tom|Sawyer|Huckleberry|Finn",
+        "(?i)Tom|Sawyer|Huckleberry|Finn",
+        ".{0,2}(Tom|Sawyer|Huckleberry|Finn)",
+        ".{2,4}(Tom|Sawyer|Huckleberry|Finn)",
+        "Tom.{10,25}river|river.{10,25}Tom",
+        "[a-zA-Z]+ing",
+        "\\s[a-zA-Z]{0,12}ing\\s",
+        "([A-Za-z]awyer|[A-Za-z]inn)\\s",
+        "[\"'][^\"']{0,30}[?!\\.][\"']",
+        "\u221E|\u2713",
+        "\\p{Sm}"
+    })
+    private String patternString;
+
+    private String twainString;
+    private JavaLib javaLib;
+    private JavaLib.JavaPattern precompiledJavaPattern;
+    private RustLib rustLib;
+    private RustLib.Ptr preparedRustTarget;
+    private RustLib.RustPattern precompiledRustPattern;
+
+    /**
+     * Loads the text, compiles the current pattern with both libraries and prepares the Rust target.
+     *
+     * @throws IOException if setup fails with an I/O error (presumably while loading the text)
+     */
+    @Setup
+    public void init() throws IOException {
+        // JMH is not handling this right, so we replace inline
+        if ("?|?".equals(patternString)) {
+            patternString = "\u221E|\u2713";
+        }
+        twainString = Main.loadTwainText();
+        javaLib = new JavaLib();
+        precompiledJavaPattern = javaLib.compile(patternString);
+        rustLib = new RustLib();
+        preparedRustTarget = rustLib.prepareTarget(twainString);
+        precompiledRustPattern = rustLib.compile(patternString);
+    }
+
+    /**
+     * Counts matches of the precompiled Java pattern in the text.
+     *
+     * @param blackhole receives the count so the call cannot be optimized away as dead code
+     */
+    @Benchmark
+    public void javaRegexCheck(Blackhole blackhole) {
+        blackhole.consume(precompiledJavaPattern.matchCount(twainString));
+    }
+
+    /**
+     * Counts matches of the precompiled Rust pattern in the prepared target.
+     *
+     * @param blackhole receives the count so the call cannot be optimized away as dead code
+     */
+    @Benchmark
+    public void rustRegexCheck(Blackhole blackhole) {
+        blackhole.consume(precompiledRustPattern.matchCount(preparedRustTarget));
+    }
+}
\ No newline at end of file
diff --git
a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java index 04ac8c8..616643d 100644 --- a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java +++ b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java @@ -7,9 +7,9 @@ import java.nio.charset.StandardCharsets; public class RustLib implements RegexLib { - // 300 pages is good for now + // 600 pages is enough for our use private static final int PAGE_SIZE = 65536; - private static final int MAX_MEMORY = 300 * PAGE_SIZE; + private static final int MAX_MEMORY = 600 * PAGE_SIZE; private final RustRegex rustRegex;
diff --git a/examples/rust-regex/src/test/java/asmble/examples/rustregex/RegexTest.java b/examples/rust-regex/src/test/java/asmble/examples/rustregex/RegexTest.java
new file mode 100644
index 0000000..323ad7f
--- /dev/null
+++ b/examples/rust-regex/src/test/java/asmble/examples/rustregex/RegexTest.java
@@ -0,0 +1,79 @@
+package asmble.examples.rustregex;
+
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.io.IOException;
+
+/**
+ * Parameterized parity test: for every pattern, the Rust library must report the same match
+ * count over the Twain text as the Java library.
+ */
+@RunWith(Parameterized.class)
+public class RegexTest {
+    // Too slow to run regularly
+    private static final String TOO_SLOW = "[a-q][^u-z]{13}x";
+
+    /** Returns the patterns to check, one test case per pattern. */
+    @Parameterized.Parameters(name = "pattern: {0}")
+    public static String[] data() {
+        return new String[] {
+            "Twain",
+            "(?i)Twain",
+            "[a-z]shing",
+            "Huck[a-zA-Z]+|Saw[a-zA-Z]+",
+            "\\b\\w+nn\\b",
+            // Kept in the list so it shows up as skipped; the constant keeps the skip check in sync
+            TOO_SLOW,
+            "Tom|Sawyer|Huckleberry|Finn",
+            "(?i)Tom|Sawyer|Huckleberry|Finn",
+            ".{0,2}(Tom|Sawyer|Huckleberry|Finn)",
+            ".{2,4}(Tom|Sawyer|Huckleberry|Finn)",
+            "Tom.{10,25}river|river.{10,25}Tom",
+            "[a-zA-Z]+ing",
+            "\\s[a-zA-Z]{0,12}ing\\s",
+            "([A-Za-z]awyer|[A-Za-z]inn)\\s",
+            "[\"'][^\"']{0,30}[?!\\.][\"']",
+            "\u221E|\u2713",
+            "\\p{Sm}"
+        };
+    }
+
+    // Shared by all test cases; initialized once in setUpClass()
+    private static RustLib rustLib;
+    private static String twainText;
+    private static RustLib.Ptr preparedRustTarget;
+
+    /**
+     * Loads the text and prepares the Rust target once for the whole class.
+     *
+     * @throws IOException if setup fails with an I/O error (presumably while loading the text)
+     */
+    @BeforeClass
+    public static void setUpClass() throws IOException {
+        twainText = Main.loadTwainText();
+        rustLib = new RustLib();
+        preparedRustTarget = rustLib.prepareTarget(twainText);
+    }
+
+    private final String pattern;
+
+    public RegexTest(String pattern) {
+        this.pattern = pattern;
+    }
+
+    /** Asserts that the Java and Rust match counts agree for the current pattern. */
+    @Test
+    public void checkJavaVersusRust() {
+        Assume.assumeFalse("Skipped for being too slow", pattern.equals(TOO_SLOW));
+        int expected = new JavaLib().compile(pattern).matchCount(twainText);
+        Assert.assertEquals(
+            expected,
+            rustLib.compile(pattern).matchCount(preparedRustTarget)
+        );
+    }
+}