More work on Rust regex example.

Issue #9
This commit is contained in:
Chad Retz 2017-12-05 23:05:27 -06:00
parent 4febf34e69
commit 1418ba86cb
6 changed files with 141 additions and 29 deletions

View File

@ -1,3 +1,25 @@
### Example: Rust Regex
TODO: The regex crate does not yet build with wasm32-unknown-unknown
This shows an example of using the Rust regex library on the JVM. This builds on [rust-simple](../rust-simple) and
[rust-string](../rust-string). There is also a simple benchmark checking the performance compared to the built-in Java
regex engine.
#### Main
In this version, we include the `regex` crate. The main class loads a ~15k text file containing a Project Gutenberg
collection of Mark Twain works (taken from [this blog post](https://rust-leipzig.github.io/regex/2017/03/28/comparison-of-regex-engines/),
which benchmarks Rust regex performance). Both the Java and Rust regex engines are abstracted behind a common
interface. When run, it counts how many times the word "Twain" appears using each of the two regex engines.
To run it yourself, run the following from the root `asmble` dir:
gradlew --no-daemon :examples:rust-regex:run
In release mode, the generated class is 903KB w/ ~575 methods. The output:
'Twain' count in Java: 811
'Twain' count in Rust: 811
#### Benchmarks
TODO: JMH benchmarks

View File

@ -8,15 +8,31 @@ use std::mem;
use std::str;
#[no_mangle]
pub extern "C" fn compile_pattern(str_ptr: *mut u8, len: usize) -> *const Regex {
pub extern "C" fn compile_pattern(str_ptr: *mut u8, len: usize) -> *mut Regex {
unsafe {
let bytes = Vec::<u8>::from_raw_parts(str_ptr, len, len);
let s = str::from_utf8(&bytes).unwrap();
let r = Regex::new(s).unwrap();
let raw_r = &r as *const Regex;
mem::forget(s);
let s = str::from_utf8_unchecked(&bytes);
let r = Box::new(Regex::new(s).unwrap());
Box::into_raw(r)
}
}
/// Frees a regex that was handed out as a raw pointer created with `Box::into_raw`.
///
/// Passing a null pointer is a no-op.
///
/// # Safety contract (not enforced by the signature)
///
/// A non-null `r` must originate from `Box::into_raw` on a `Box<Regex>` and must not be used or
/// disposed again after this call.
#[no_mangle]
pub extern "C" fn dispose_pattern(r: *mut Regex) {
    if r.is_null() {
        return;
    }
    // SAFETY: `r` is non-null and, per the contract above, uniquely owns a boxed `Regex`.
    unsafe {
        drop(Box::from_raw(r));
    }
}
#[no_mangle]
pub extern "C" fn match_count(r: *mut Regex, str_ptr: *mut u8, len: usize) -> usize {
unsafe {
let bytes = Vec::<u8>::from_raw_parts(str_ptr, len, len);
let s = str::from_utf8_unchecked(&bytes);
let r = Box::from_raw(r);
let count = r.find_iter(s).count();
mem::forget(r);
raw_r
count
}
}

View File

@ -3,21 +3,26 @@ package asmble.examples.rustregex;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class JavaLib implements RegexLib {
public class JavaLib implements RegexLib<String> {
/**
 * Compiles the given pattern string by constructing a {@link JavaPattern} for it.
 *
 * @param str the regular expression source
 * @return a new {@code JavaPattern} built from {@code str}
 */
@Override
public JavaPattern compile(String str) {
return new JavaPattern(str);
}
public static class JavaPattern implements RegexPattern {
/**
 * Returns the target unchanged; this implementation matches directly against the
 * {@code String}, so no conversion is performed.
 *
 * @param target the text to be searched
 * @return the same {@code target} instance
 */
@Override
public String prepareTarget(String target) {
return target;
}
final Pattern pattern;
public class JavaPattern implements RegexPattern<String> {
JavaPattern(String pattern) {
private final Pattern pattern;
private JavaPattern(String pattern) {
this(Pattern.compile(pattern));
}
JavaPattern(Pattern pattern) {
private JavaPattern(Pattern pattern) {
this.pattern = pattern;
}

View File

@ -7,13 +7,16 @@ import java.nio.charset.StandardCharsets;
public class Main {
// 20 pages is good for now
private static final int PAGE_SIZE = 65536;
private static final int MAX_MEMORY = 20 * PAGE_SIZE;
/**
 * Loads the Twain text once and prints how many times "Twain" occurs in it according to the
 * Java regex engine and then the Rust regex engine.
 *
 * @param args unused
 * @throws Exception if loading the text or running either engine fails
 */
public static void main(String[] args) throws Exception {
    String twainString = loadTwainText();
    System.out.println("'Twain' count in Java: " + matchCount(twainString, "Twain", new JavaLib()));
    System.out.println("'Twain' count in Rust: " + matchCount(twainString, "Twain", new RustLib()));
}
/**
 * Counts the matches of {@code pattern} in {@code target} using the supplied regex library.
 *
 * @param target  the text to search
 * @param pattern the regular expression source
 * @param lib     the engine used to compile the pattern and prepare the target
 * @param <T>     the engine-specific representation of a prepared target
 * @return the match count reported by the compiled pattern
 */
public static <T> int matchCount(String target, String pattern, RegexLib<T> lib) {
    // The receiver is evaluated before the argument, so the pattern is still compiled first and
    // the target prepared second.
    return lib.compile(pattern).matchCount(lib.prepareTarget(target));
}
public static String loadTwainText() throws IOException {

View File

@ -1,10 +1,12 @@
package asmble.examples.rustregex;
/**
 * Common abstraction over a regex engine so different engines can be driven by the same code.
 *
 * @param <T> the engine-specific representation of a target text that patterns match against
 */
public interface RegexLib<T> {

    /**
     * Compiles a regular expression.
     *
     * @param str the regular expression source
     * @return the compiled pattern
     */
    RegexPattern<T> compile(String str);

    /**
     * Converts a target string into the representation this engine matches against.
     *
     * @param target the text to be searched
     * @return the engine-specific form of {@code target}
     */
    T prepareTarget(String target);

    /**
     * A compiled regular expression.
     *
     * @param <T> the engine-specific representation of a target text
     */
    interface RegexPattern<T> {

        /**
         * Counts the matches of this pattern in the given prepared target.
         *
         * @param target a target produced by {@link RegexLib#prepareTarget(String)}
         * @return the number of matches
         */
        int matchCount(T target);
    }
}

View File

@ -1,19 +1,83 @@
package asmble.examples.rustregex;
public class RustLib implements RegexLib {
import asmble.generated.RustRegex;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
public class RustLib implements RegexLib<RustLib.Ptr> {
// 300 pages is good for now
private static final int PAGE_SIZE = 65536;
private static final int MAX_MEMORY = 300 * PAGE_SIZE;
private final RustRegex rustRegex;
public RustLib() {
rustRegex = new RustRegex(MAX_MEMORY);
}
@Override
public RustPattern compile(String str) {
// TODO
return null;
return new RustPattern(str);
}
@Override
public Ptr prepareTarget(String target) {
return ptrFromString(target);
}
public class RustPattern implements RegexPattern {
private Ptr ptrFromString(String str) {
byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
Ptr ptr = new Ptr(bytes.length);
ptr.put(bytes);
return ptr;
}
public class RustPattern implements RegexPattern<Ptr> {
private final int pointer;
private RustPattern(String pattern) {
Ptr ptr = ptrFromString(pattern);
pointer = rustRegex.compile_pattern(ptr.offset, ptr.size);
}
@Override
public int matchCount(String target) {
// TODO
return 0;
protected void finalize() throws Throwable {
rustRegex.dispose_pattern(pointer);
}
@Override
public int matchCount(Ptr target) {
return rustRegex.match_count(pointer, target.offset, target.size);
}
}
public class Ptr {
final int offset;
final int size;
Ptr(int offset, int size) {
this.offset = offset;
this.size = size;
}
Ptr(int size) {
this(rustRegex.alloc(size), size);
}
void put(byte[] bytes) {
// Yeah, yeah, not thread safe
ByteBuffer memory = rustRegex.getMemory();
memory.position(offset);
memory.put(bytes);
}
@Override
protected void finalize() throws Throwable {
rustRegex.dealloc(offset, size);
}
}
}