diff --git a/examples/rust-regex/README.md b/examples/rust-regex/README.md index c24810e..b222de5 100644 --- a/examples/rust-regex/README.md +++ b/examples/rust-regex/README.md @@ -1,3 +1,25 @@ ### Example: Rust Regex -TODO: The regex crate does not yet build with wasm32-unknown-unknown \ No newline at end of file +This shows an example of using the Rust regex library on the JVM. This builds on [rust-simple](../rust-simple) and +[rust-string](../rust-string). There is also a simple benchmark checking the performance compared to the built-in Java +regex engine. + +#### Main + +In this version, we include the `regex` crate. The main loads a ~15k text file Project Gutenberg collection of Mark +Twain works (taken from [this blog post](https://rust-leipzig.github.io/regex/2017/03/28/comparison-of-regex-engines/) +that does Rust regex performance benchmarks). Both the Java and Rust regex engines are abstracted into a common +interface. When run, it checks how many times the word "Twain" appears via both regex engines. + +To run it yourself, run the following from the root `asmble` dir: + + gradlew --no-daemon :examples:rust-regex:run + +In release mode, the generated class is 903KB w/ ~575 methods. 
The output: + + 'Twain' count in Java: 811 + 'Twain' count in Rust: 811 + +#### Benchmarks + +TODO: JMH benchmarks \ No newline at end of file diff --git a/examples/rust-regex/src/lib.rs b/examples/rust-regex/src/lib.rs index f2895c3..8d16042 100644 --- a/examples/rust-regex/src/lib.rs +++ b/examples/rust-regex/src/lib.rs @@ -8,15 +8,31 @@ use std::mem; use std::str; #[no_mangle] -pub extern "C" fn compile_pattern(str_ptr: *mut u8, len: usize) -> *const Regex { +pub extern "C" fn compile_pattern(str_ptr: *mut u8, len: usize) -> *mut Regex { unsafe { let bytes = Vec::<u8>::from_raw_parts(str_ptr, len, len); - let s = str::from_utf8(&bytes).unwrap(); - let r = Regex::new(s).unwrap(); - let raw_r = &r as *const Regex; - mem::forget(s); + let s = str::from_utf8_unchecked(&bytes); + let r = Box::new(Regex::new(s).unwrap()); + Box::into_raw(r) + } +} + +#[no_mangle] +pub extern "C" fn dispose_pattern(r: *mut Regex) { + unsafe { + let _r = Box::from_raw(r); + } +} + +#[no_mangle] +pub extern "C" fn match_count(r: *mut Regex, str_ptr: *mut u8, len: usize) -> usize { + unsafe { + let bytes = Vec::<u8>::from_raw_parts(str_ptr, len, len); + let s = str::from_utf8_unchecked(&bytes); + let r = Box::from_raw(r); + let count = r.find_iter(s).count(); mem::forget(r); - raw_r + count } } diff --git a/examples/rust-regex/src/main/java/asmble/examples/rustregex/JavaLib.java b/examples/rust-regex/src/main/java/asmble/examples/rustregex/JavaLib.java index 387d873..1aab77f 100644 --- a/examples/rust-regex/src/main/java/asmble/examples/rustregex/JavaLib.java +++ b/examples/rust-regex/src/main/java/asmble/examples/rustregex/JavaLib.java @@ -3,21 +3,26 @@ package asmble.examples.rustregex; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class JavaLib implements RegexLib { +public class JavaLib implements RegexLib<String> { @Override public JavaPattern compile(String str) { return new JavaPattern(str); } - public static class JavaPattern implements RegexPattern { + @Override + 
public String prepareTarget(String target) { + return target; + } - final Pattern pattern; + public class JavaPattern implements RegexPattern<String> { - JavaPattern(String pattern) { + private final Pattern pattern; + + private JavaPattern(String pattern) { this(Pattern.compile(pattern)); } - JavaPattern(Pattern pattern) { + private JavaPattern(Pattern pattern) { this.pattern = pattern; } diff --git a/examples/rust-regex/src/main/java/asmble/examples/rustregex/Main.java b/examples/rust-regex/src/main/java/asmble/examples/rustregex/Main.java index 89d1f6c..e1577d7 100644 --- a/examples/rust-regex/src/main/java/asmble/examples/rustregex/Main.java +++ b/examples/rust-regex/src/main/java/asmble/examples/rustregex/Main.java @@ -7,13 +7,16 @@ import java.nio.charset.StandardCharsets; public class Main { - // 20 pages is good for now - private static final int PAGE_SIZE = 65536; - private static final int MAX_MEMORY = 20 * PAGE_SIZE; - public static void main(String[] args) throws Exception { - String twainText = loadTwainText(); - System.out.println("Appearances of 'Twain': " + new JavaLib().compile("Twain").matchCount(twainText)); + String twainString = loadTwainText(); + System.out.println("'Twain' count in Java: " + matchCount(twainString, "Twain", new JavaLib())); + System.out.println("'Twain' count in Rust: " + matchCount(twainString, "Twain", new RustLib())); + } + + public static <T> int matchCount(String target, String pattern, RegexLib<T> lib) { + RegexLib.RegexPattern<T> compiledPattern = lib.compile(pattern); + T preparedTarget = lib.prepareTarget(target); + return compiledPattern.matchCount(preparedTarget); } public static String loadTwainText() throws IOException { diff --git a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RegexLib.java b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RegexLib.java index d5e96b6..959fdd7 100644 --- a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RegexLib.java +++ 
b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RegexLib.java @@ -1,10 +1,12 @@ package asmble.examples.rustregex; -public interface RegexLib { +public interface RegexLib<T> { - RegexPattern compile(String str); + RegexPattern<T> compile(String str); - interface RegexPattern { - int matchCount(String target); + T prepareTarget(String target); + + interface RegexPattern<T> { + int matchCount(T target); } } diff --git a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java index 8b06013..04ac8c8 100644 --- a/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java +++ b/examples/rust-regex/src/main/java/asmble/examples/rustregex/RustLib.java @@ -1,19 +1,83 @@ package asmble.examples.rustregex; -public class RustLib implements RegexLib { +import asmble.generated.RustRegex; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + +public class RustLib implements RegexLib<RustLib.Ptr> { + + // 300 pages is good for now + private static final int PAGE_SIZE = 65536; + private static final int MAX_MEMORY = 300 * PAGE_SIZE; + + private final RustRegex rustRegex; + + public RustLib() { + rustRegex = new RustRegex(MAX_MEMORY); + } @Override public RustPattern compile(String str) { - // TODO - return null; + return new RustPattern(str); + } + + @Override + public Ptr prepareTarget(String target) { + return ptrFromString(target); } - public class RustPattern implements RegexPattern { + private Ptr ptrFromString(String str) { + byte[] bytes = str.getBytes(StandardCharsets.UTF_8); + Ptr ptr = new Ptr(bytes.length); + ptr.put(bytes); + return ptr; + } + + public class RustPattern implements RegexPattern<Ptr> { + + private final int pointer; + + private RustPattern(String pattern) { + Ptr ptr = ptrFromString(pattern); + pointer = rustRegex.compile_pattern(ptr.offset, ptr.size); + } @Override - public int matchCount(String target) { - // TODO - return 0; + protected 
void finalize() throws Throwable { + rustRegex.dispose_pattern(pointer); + } + + @Override + public int matchCount(Ptr target) { + return rustRegex.match_count(pointer, target.offset, target.size); + } + } + + public class Ptr { + + final int offset; + final int size; + + Ptr(int offset, int size) { + this.offset = offset; + this.size = size; + } + + Ptr(int size) { + this(rustRegex.alloc(size), size); + } + + void put(byte[] bytes) { + // Yeah, yeah, not thread safe + ByteBuffer memory = rustRegex.getMemory(); + memory.position(offset); + memory.put(bytes); + } + + @Override + protected void finalize() throws Throwable { + rustRegex.dealloc(offset, size); } } }