feat: Allow the tokenizer to contain custom skip regexes/literals

cc #14
This commit is contained in:
Markus Westerlind 2020-03-02 10:41:01 +01:00
parent 16194d5719
commit ee2f7060e9
10 changed files with 202 additions and 107 deletions

View File

@ -0,0 +1,12 @@
// Test grammar: a sequence of numbers separated by whitespace and comments.
grammar;
// Custom lexer specification. Patterns mapped to `{ }` are skip patterns:
// they are matched by the tokenizer but are not mapped to any terminal.
match {
r"[0-9]+" => NUM,
// Whitespace.
r"\s*" => { },
// Line comment: `//` up to the end of the line, plus the line breaks that follow.
r"//[^\n\r]*[\n\r]*" => { },
// Block comment: `/* ... */`.
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
}
// Zero or more `NUM` tokens, collected as the matched input slices.
pub(crate) Term: Vec<&'input str> = {
<NUM*>,
};

View File

@ -143,6 +143,8 @@ lalrpop_mod!(
dyn_argument dyn_argument
); );
lalrpop_mod!(comments);
pub fn use_cfg_created_parser() { pub fn use_cfg_created_parser() {
cfg::CreatedParser::new(); cfg::CreatedParser::new();
} }
@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
Use ./snap.sh to generate a new snapshot of the lrgrammar", Use ./snap.sh to generate a new snapshot of the lrgrammar",
); );
} }
// Checks that the `comments` grammar yields only the numbers, ignoring the
// whitespace and comments placed between them.
#[test]
fn comments() {
// Numbers separated by plain spaces.
assert_eq!(
comments::TermParser::new().parse("22 3 5 13").unwrap(),
vec!["22", "3", "5", "13"]
);
// Numbers separated by a block comment, line breaks and line comments,
// including a line comment that runs to the end of the input.
assert_eq!(
comments::TermParser::new()
.parse(
"22 /* 123 */ 3 5
// abc
13 // "
)
.unwrap(),
vec!["22", "3", "5", "13"]
);
}

View File

@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
} }
} }
/// A compiled token regex paired with a flag saying whether its matches are
/// skipped instead of being returned as tokens.
struct RegexEntry {
/// Compiled regular expression; the matcher uses it to find the match length.
regex: regex::Regex,
/// When `true`, text matched by `regex` is consumed without yielding a token.
skip: bool,
}
pub struct MatcherBuilder { pub struct MatcherBuilder {
regex_set: regex::RegexSet, regex_set: regex::RegexSet,
regex_vec: Vec<regex::Regex>, regex_vec: Vec<RegexEntry>,
} }
impl MatcherBuilder { impl MatcherBuilder {
pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error> pub fn new<S>(
exprs: impl IntoIterator<Item = (S, bool)>,
) -> Result<MatcherBuilder, regex::Error>
where where
S: AsRef<str>, S: AsRef<str>,
{ {
let exprs = exprs.into_iter(); let exprs = exprs.into_iter();
let mut regex_vec = Vec::with_capacity(exprs.size_hint().0); let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
let mut first_error = None; let mut first_error = None;
let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| { let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
regex_vec.push(match regex::Regex::new(s.as_ref()) { regex_vec.push(match regex::Regex::new(s.as_ref()) {
Ok(regex) => regex, Ok(regex) => RegexEntry { regex, skip },
Err(err) => { Err(err) => {
first_error = Some(err); first_error = Some(err);
return None; return None;
@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
text: &'input str, text: &'input str,
consumed: usize, consumed: usize,
regex_set: &'builder regex::RegexSet, regex_set: &'builder regex::RegexSet,
regex_vec: &'builder Vec<regex::Regex>, regex_vec: &'builder Vec<RegexEntry>,
_marker: PhantomData<fn() -> E>, _marker: PhantomData<fn() -> E>,
} }
@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>; type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let text = self.text.trim_start(); loop {
let whitespace = self.text.len() - text.len(); let text = self.text;
let start_offset = self.consumed + whitespace; let start_offset = self.consumed;
if text.is_empty() { eprintln!("{:?}", text);
self.text = text; if text.is_empty() {
self.consumed = start_offset; self.consumed = start_offset;
None return None;
} else {
let matches = self.regex_set.matches(text);
if !matches.matched_any() {
Some(Err(ParseError::InvalidToken {
location: start_offset,
}))
} else { } else {
let mut longest_match = 0; let matches = self.regex_set.matches(text);
let mut index = 0; if !matches.matched_any() {
for i in matches.iter() { return Some(Err(ParseError::InvalidToken {
let match_ = self.regex_vec[i].find(text).unwrap(); location: start_offset,
let len = match_.end(); }));
if len >= longest_match { } else {
longest_match = len; let mut longest_match = 0;
index = i; let mut index = 0;
let mut skip = false;
for i in matches.iter() {
let entry = &self.regex_vec[i];
let match_ = entry.regex.find(text).unwrap();
let len = match_.end();
if len >= longest_match {
longest_match = len;
index = i;
skip = entry.skip;
}
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
// Skip-mapped patterns (whitespace, comments, ...) yield no token; an empty skip match is an error
if skip {
if longest_match == 0 {
return Some(Err(ParseError::InvalidToken {
location: start_offset,
}));
}
continue;
}
return Some(Ok((start_offset, Token(index, result), end_offset)));
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
Some(Ok((start_offset, Token(index, result), end_offset)))
} }
} }
} }

View File

@ -109,7 +109,29 @@ impl MatchItem {
} }
pub type MatchSymbol = TerminalLiteral; pub type MatchSymbol = TerminalLiteral;
pub type MatchMapping = TerminalString;
/// Right-hand side of a `match` entry: either a terminal, or `{ }` meaning
/// that the matched text is skipped.
#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
pub enum MatchMapping {
/// The entry maps to this terminal.
Terminal(TerminalString),
/// The entry maps to nothing; displayed as `{ }`.
Skip,
}
impl Debug for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
Self::Terminal(term) => write!(fmt, "{:?}", term),
Self::Skip => write!(fmt, "{{ }}"),
}
}
}
impl Display for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
Self::Terminal(term) => write!(fmt, "{}", term),
Self::Skip => write!(fmt, "{{ }}"),
}
}
}
/// Intern tokens are not typed by the user: they are synthesized in /// Intern tokens are not typed by the user: they are synthesized in
/// the absence of an "extern" declaration with information about the /// the absence of an "extern" declaration with information about the
@ -158,7 +180,7 @@ pub struct MatchEntry {
/// NB: This field must go first, so that `PartialOrd` sorts by precedence first! /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
pub precedence: usize, pub precedence: usize,
pub match_literal: TerminalLiteral, pub match_literal: TerminalLiteral,
pub user_name: TerminalString, pub user_name: MatchMapping,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]

View File

@ -1,6 +1,6 @@
//! Generates an iterator type `Matcher` that looks roughly like //! Generates an iterator type `Matcher` that looks roughly like
use grammar::parse_tree::InternToken; use grammar::parse_tree::{InternToken, MatchMapping};
use grammar::repr::{Grammar, TerminalLiteral}; use grammar::repr::{Grammar, TerminalLiteral};
use lexer::re; use lexer::re;
use rust::RustWrite; use rust::RustWrite;
@ -25,35 +25,48 @@ pub fn compile<W: Write>(
// create a vector of rust string literals with the text of each // create a vector of rust string literals with the text of each
// regular expression // regular expression
let regex_strings: Vec<String> = { let regex_strings = intern_token
intern_token .match_entries
.match_entries .iter()
.iter() .map(|match_entry| {
.map(|match_entry| match match_entry.match_literal { (
TerminalLiteral::Quoted(ref s) => re::parse_literal(&s), match match_entry.match_literal {
TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(), TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
}) TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
.map(|regex| { },
// make sure all regex are anchored at the beginning of the input match match_entry.user_name {
format!("^({})", regex) MatchMapping::Terminal(_) => false,
}) MatchMapping::Skip => true,
.map(|regex_str| { },
// create a rust string with text of the regex; the Debug impl )
// will add quotes and escape })
format!("{:?}", regex_str) .map(|(regex, skip)| {
}) // make sure all regex are anchored at the beginning of the input
.collect() (format!("^({})", regex), skip)
}; })
.map(|(regex_str, skip)| {
// create a rust string with text of the regex; the Debug impl
// will add quotes and escape
(format!("{:?}", regex_str), skip)
});
rust!(out, "let {}strs: &[&str] = &[", prefix); let mut contains_skip = false;
for literal in &regex_strings {
rust!(out, "{},", literal); rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
for (literal, skip) in regex_strings {
rust!(out, "({}, {}),", literal, skip);
contains_skip |= skip;
} }
if !contains_skip {
rust!(out, r#"(r"^(\s*)", true),"#);
}
rust!(out, "];"); rust!(out, "];");
rust!( rust!(
out, out,
"{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()", "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
p = prefix p = prefix
); );

View File

@ -5,8 +5,8 @@ use collections::{map, Map};
use grammar::consts::CFG; use grammar::consts::CFG;
use grammar::parse_tree as pt; use grammar::parse_tree as pt;
use grammar::parse_tree::{ use grammar::parse_tree::{
read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path, read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
TerminalString, Path, TerminalString,
}; };
use grammar::pattern::{Pattern, PatternKind}; use grammar::pattern::{Pattern, PatternKind};
use grammar::repr as r; use grammar::repr as r;
@ -79,26 +79,29 @@ impl<'s> LowerState<'s> {
})), })),
}; };
self.conversions self.conversions
.extend(data.match_entries.iter().enumerate().map( .extend(data.match_entries.iter().enumerate().filter_map(
|(index, match_entry)| { |(index, match_entry)| match &match_entry.user_name {
let pattern = Pattern { MatchMapping::Terminal(user_name) => {
span, let pattern = Pattern {
kind: PatternKind::TupleStruct( span,
internal_token_path.clone(), kind: PatternKind::TupleStruct(
vec![ internal_token_path.clone(),
Pattern { vec![
span, Pattern {
kind: PatternKind::Usize(index), span,
}, kind: PatternKind::Usize(index),
Pattern { },
span, Pattern {
kind: PatternKind::Choose(input_str.clone()), span,
}, kind: PatternKind::Choose(input_str.clone()),
], },
), ],
}; ),
};
(match_entry.user_name.clone(), pattern) Some((user_name.clone(), pattern))
}
MatchMapping::Skip => None,
}, },
)); ));
self.intern_token = Some(data); self.intern_token = Some(data);

View File

@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
.flat_map(|match_token| &match_token.contents) .flat_map(|match_token| &match_token.contents)
.flat_map(|match_contents| &match_contents.items) .flat_map(|match_contents| &match_contents.items)
.filter_map(|item| match *item { .filter_map(|item| match *item {
MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => { MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
Some((item.span(), id.clone(), Def::Terminal)) Some((item.span(), id.clone(), Def::Terminal))
} }
_ => None, _ => None,

View File

@ -133,7 +133,7 @@ impl MatchBlock {
match_block.add_match_entry( match_block.add_match_entry(
precedence, precedence,
sym.clone(), sym.clone(),
TerminalString::Literal(sym.clone()), MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
span, span,
)?; )?;
} }
@ -162,7 +162,7 @@ impl MatchBlock {
&mut self, &mut self,
match_group_precedence: usize, match_group_precedence: usize,
sym: TerminalLiteral, sym: TerminalLiteral,
user_name: TerminalString, user_name: MatchMapping,
span: Span, span: Span,
) -> NormResult<()> { ) -> NormResult<()> {
if let Some(_old_span) = self.spans.insert(sym.clone(), span) { if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@ -170,7 +170,9 @@ impl MatchBlock {
} }
// NB: It's legal for multiple regex to produce same terminal. // NB: It's legal for multiple regex to produce same terminal.
self.match_user_names.insert(user_name.clone()); if let MatchMapping::Terminal(user_name) = &user_name {
self.match_user_names.insert(user_name.clone());
}
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: match_group_precedence * 2 + sym.base_precedence(), precedence: match_group_precedence * 2 + sym.base_precedence(),
@ -203,7 +205,7 @@ impl MatchBlock {
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: sym.base_precedence(), precedence: sym.base_precedence(),
match_literal: sym.clone(), match_literal: sym.clone(),
user_name: TerminalString::Literal(sym.clone()), user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
}); });
self.spans.insert(sym, span); self.spans.insert(sym, span);
@ -328,29 +330,26 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
// one of precedences, that are parallel with `literals`. // one of precedences, that are parallel with `literals`.
let mut regexs = Vec::with_capacity(match_entries.len()); let mut regexs = Vec::with_capacity(match_entries.len());
let mut precedences = Vec::with_capacity(match_entries.len()); let mut precedences = Vec::with_capacity(match_entries.len());
{ for match_entry in &match_entries {
for match_entry in &match_entries { precedences.push(Precedence(match_entry.precedence));
precedences.push(Precedence(match_entry.precedence)); match match_entry.match_literal {
match match_entry.match_literal { TerminalLiteral::Quoted(ref s) => {
TerminalLiteral::Quoted(ref s) => { regexs.push(re::parse_literal(&s));
regexs.push(re::parse_literal(&s)); }
} TerminalLiteral::Regex(ref s) => {
TerminalLiteral::Regex(ref s) => { match re::parse_regex(&s) {
match re::parse_regex(&s) { Ok(regex) => regexs.push(regex),
Ok(regex) => regexs.push(regex), Err(error) => {
Err(error) => { let literal_span = spans[&match_entry.match_literal];
let literal_span = spans[&match_entry.match_literal]; // FIXME -- take offset into account for
// FIXME -- take offset into account for // span; this requires knowing how many #
// span; this requires knowing how many # // the user used, which we do not track
// the user used, which we do not track return_err!(literal_span, "invalid regular expression: {}", error);
return_err!(literal_span, "invalid regular expression: {}", error);
}
} }
} }
} }
} }
Ok(()) }
}?;
let dfa = match dfa::build_dfa(&regexs, &precedences) { let dfa = match dfa::build_dfa(&regexs, &precedences) {
Ok(dfa) => dfa, Ok(dfa) => dfa,

View File

@ -3,8 +3,8 @@ use super::{NormError, NormResult};
use grammar::consts::{ERROR, LOCATION}; use grammar::consts::{ERROR, LOCATION};
use grammar::parse_tree::{ use grammar::parse_tree::{
ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString, ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
Path, Span, SymbolKind, TypeParameter, TypeRef, NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
}; };
use grammar::repr::{NominalTypeRepr, TypeRepr, Types}; use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type); let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
for match_entry in &intern_token.match_entries { for match_entry in &intern_token.match_entries {
types.add_term_type(match_entry.user_name.clone(), input_str.clone()); if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
types.add_term_type(user_name.clone(), input_str.clone());
}
} }
types types

View File

@ -22,7 +22,7 @@ mod test;
pub enum Top { pub enum Top {
Grammar(Grammar), Grammar(Grammar),
Pattern(Pattern<TypeRef>), Pattern(Pattern<TypeRef>),
MatchMapping(TerminalString), MatchMapping(MatchMapping),
TypeRef(TypeRef), TypeRef(TypeRef),
GrammarWhereClauses(Vec<WhereClause<TypeRef>>), GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
} }