perf(parse_table): Avoid generating unused rows in the matrix

Only the states at the beginning of a reduction are actually used; the rows for all other states will never be accessed (in fact, no zero elements of the GOTO table are ever accessed, so we might be able to do even better without compromising performance). By reordering the states so that the states needed in the GOTO table occupy the lowest indices, we can avoid generating the remaining part of the GOTO table. For the LALRPOP parser itself this is a reduction of about 40% (~100kB) in the size of the parse tables (not counting the generated code).
2025-04-25 03:02:15 +00:00 · 2020-04-30 00:58:11 +02:00 · 2020-04-30 00:58:11 +02:00 · 688b91930a
commit 688b91930a
parent cb1924632b
8 changed files with 1339 additions and 1961 deletions
--- a/lalrpop/src/build/mod.rs
+++ b/lalrpop/src/build/mod.rs
@ -376,7 +376,7 @@ fn emit_recursive_ascent(
            lr1::generate_report(&mut output_report_file, &lr1result)?;
        }

-        let mut states = match lr1result {
+        let states = match lr1result {
            Ok(states) => states,
            Err(error) => {
                let messages = lr1::report_error(&grammar, &error);
@ -385,34 +385,6 @@ fn emit_recursive_ascent(
            }
        };

-        let mut start_states = vec![false; states.len()];
-        for (index, state) in states.iter_mut().enumerate() {
-            debug_assert!(state.index.0 == index);
-            if grammar
-                .nonterminals
-                .keys()
-                .any(|nonterminal| state.gotos.get(&nonterminal).is_some())
-            {
-                start_states[index] = true;
-            }
-        }
-        states.sort_by_key(|state| start_states[state.index.0]);
-
-        let mut state_rewrite = vec![0; states.len()];
-        for (new_index, state) in states.iter_mut().enumerate() {
-            state_rewrite[state.index.0] = new_index;
-            state.index.0 = new_index;
-        }
-
-        for state in &mut states {
-            for goto in state.gotos.values_mut() {
-                goto.0 = state_rewrite[goto.0];
-            }
-            for shift in state.shifts.values_mut() {
-                shift.0 = state_rewrite[shift.0];
-            }
-        }
-
        match grammar.algorithm.codegen {
            r::LrCodeGeneration::RecursiveAscent => lr1::codegen::ascent::compile(
                &grammar,
--- a/lalrpop/src/lr1/build/mod.rs
+++ b/lalrpop/src/lr1/build/mod.rs
@ -7,9 +7,8 @@ use crate::lr1::core::*;
 use crate::lr1::first;
 use crate::lr1::lane_table::*;
 use crate::lr1::lookahead::*;
-use std::env;
-use std::rc::Rc;
 use crate::tls::Tls;
+use std::env;

 #[cfg(test)]
 mod test;
@ -254,9 +253,7 @@ impl<'grammar, L: LookaheadBuild> LR<'grammar, L> {
            .map(|(lr0_item, lookahead)| lr0_item.with_lookahead(lookahead))
            .collect();

-        Items {
-            vec: Rc::new(final_items),
-        }
+        Items { vec: final_items }
    }
 }

--- a/lalrpop/src/lr1/build_lalr/mod.rs
+++ b/lalrpop/src/lr1/build_lalr/mod.rs
@ -2,13 +2,12 @@

 use crate::collections::{map, Map, Multimap};
 use crate::grammar::repr::*;
-use itertools::Itertools;
 use crate::lr1::build;
 use crate::lr1::core::*;
 use crate::lr1::lookahead::*;
-use std::mem;
-use std::rc::Rc;
 use crate::tls::Tls;
+use itertools::Itertools;
+use std::mem;

 #[cfg(test)]
 mod test;
@ -39,11 +38,13 @@ pub fn build_lalr_states<'grammar>(
        return Ok(lr_states);
    }

-    profile! {
+    let lr1_states = profile! {
        &Tls::session(),
        "LALR(1) state collapse",
        collapse_to_lalr_states(&lr_states)
-    }
+    }?;
+
+    Ok(lr1_states)
 }

 pub fn collapse_to_lalr_states<'grammar>(lr_states: &[LR1State<'grammar>]) -> LR1Result<'grammar> {
@ -138,9 +139,7 @@ pub fn collapse_to_lalr_states<'grammar>(lr_states: &[LR1State<'grammar>]) -> LR
        .into_iter()
        .map(|lr| State {
            index: lr.index,
-            items: Items {
-                vec: Rc::new(lr.items),
-            },
+            items: Items { vec: lr.items },
            shifts: lr.shifts,
            reductions: lr.reductions.into_iter().map(|(p, ts)| (ts, p)).collect(),
            gotos: lr.gotos,
--- a/lalrpop/src/lr1/codegen/parse_table.rs
+++ b/lalrpop/src/lr1/codegen/parse_table.rs
@ -527,9 +527,11 @@ impl<'ascent, 'grammar, W: Write> CodeGenerator<'ascent, 'grammar, W, TableDrive
            self.prefix,
            self.custom.state_type
        );
+
+        let mut row = Vec::new();
        for (index, state) in self.states.iter().enumerate() {
            rust!(self.out, "// State {}", index);
-            let iterator = self.grammar.nonterminals.keys().map(|nonterminal| {
+            row.extend(self.grammar.nonterminals.keys().map(|nonterminal| {
                if let Some(&new_state) = state.gotos.get(&nonterminal) {
                    (
                        new_state.0 as i32 + 1,
@ -538,8 +540,12 @@ impl<'ascent, 'grammar, W: Write> CodeGenerator<'ascent, 'grammar, W, TableDrive
                } else {
                    (0, Comment::Error(nonterminal))
                }
-            });
-            self.out.write_table_row(iterator)?;
+            }));
+            // The remaining rows are all errors and are never accessed, so we may omit them from the table
+            if row.iter().all(|t| t.0 == 0) {
+                break;
+            }
+            self.out.write_table_row(row.drain(..))?;
        }
        rust!(self.out, "];");

--- a/lalrpop/src/lr1/core/mod.rs
+++ b/lalrpop/src/lr1/core/mod.rs
@ -5,7 +5,6 @@ use crate::grammar::repr::*;
 use crate::util::Prefix;
 use itertools::Itertools;
 use std::fmt::{Debug, Display, Error, Formatter};
-use std::rc::Rc;

 use super::lookahead::*;

@ -123,7 +122,7 @@ pub struct StateIndex(pub usize);

 #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Items<'grammar, L: Lookahead> {
-    pub vec: Rc<Vec<Item<'grammar, L>>>,
+    pub vec: Vec<Item<'grammar, L>>,
 }

 #[allow(dead_code)]
--- a/lalrpop/src/lr1/lane_table/construct/mod.rs
+++ b/lalrpop/src/lr1/lane_table/construct/mod.rs
@ -1,7 +1,6 @@
 //!

 use crate::collections::{Map, Set};
-use ena::unify::InPlaceUnificationTable;
 use crate::grammar::repr::*;
 use crate::lr1::build;
 use crate::lr1::core::*;
@ -11,7 +10,7 @@ use crate::lr1::lane_table::table::context_set::OverlappingLookahead;
 use crate::lr1::lane_table::table::{ConflictIndex, LaneTable};
 use crate::lr1::lookahead::{Lookahead, TokenSet};
 use crate::lr1::state_graph::StateGraph;
-use std::rc::Rc;
+use ena::unify::InPlaceUnificationTable;

 mod merge;
 use self::merge::Merge;
@ -120,9 +119,7 @@ impl<'grammar> LaneTableConstruct<'grammar> {
                    .collect();
                State {
                    index: s.index,
-                    items: Items {
-                        vec: Rc::new(items),
-                    },
+                    items: Items { vec: items },
                    shifts: s.shifts,
                    reductions,
                    gotos: s.gotos,
--- a/lalrpop/src/lr1/mod.rs
+++ b/lalrpop/src/lr1/mod.rs
@ -28,11 +28,15 @@ pub fn build_states<'grammar>(
    grammar: &'grammar Grammar,
    start: NonterminalString,
 ) -> LR1Result<'grammar> {
-    if !grammar.algorithm.lalr {
-        build::build_lr1_states(grammar, start)
+    let mut lr1_states = if !grammar.algorithm.lalr {
+        build::build_lr1_states(grammar, start)?
    } else {
-        build_lalr::build_lalr_states(grammar, start)
-    }
+        build_lalr::build_lalr_states(grammar, start)?
+    };
+
+    rewrite_state_indices(grammar, &mut lr1_states);
+
+    Ok(lr1_states)
 }

 pub fn generate_report<'grammar, W: Write + 'grammar>(
@ -41,3 +45,37 @@ pub fn generate_report<'grammar, W: Write + 'grammar>(
 ) -> io::Result<()> {
    report::generate_report(out, lr1result)
 }
+
+/// Reorders `states` so that every state with at least one goto entry comes first, then renumbers
+/// the states and remaps all goto/shift targets, so the goto table can omit the trailing rows.
+fn rewrite_state_indices(grammar: &Grammar, states: &mut [core::LR1State]) {
+    let mut start_states = vec![false; states.len()];
+    for (index, state) in states.iter_mut().enumerate() {
+        debug_assert!(state.index.0 == index);
+        if grammar
+            .nonterminals
+            .keys()
+            .any(|nonterminal| state.gotos.get(&nonterminal).is_some())
+        {
+            start_states[index] = true;
+        }
+    }
+
+    // Stable sort on the negated flag puts goto states first; state 0 stays first as long as it has gotos
+    states.sort_by_key(|state| !start_states[state.index.0]);
+
+    let mut state_rewrite = vec![0; states.len()];
+    for (new_index, state) in states.iter_mut().enumerate() {
+        state_rewrite[state.index.0] = new_index;
+        state.index.0 = new_index;
+    }
+
+    for state in states {
+        for goto in state.gotos.values_mut() {
+            goto.0 = state_rewrite[goto.0];
+        }
+        for shift in state.shifts.values_mut() {
+            shift.0 = state_rewrite[shift.0];
+        }
+    }
+}
--- a/lalrpop/src/parser/lrgrammar.rs
+++ b/lalrpop/src/parser/lrgrammar.rs