refactor tokenizers away from one giant switch statement

2025-02-07 09:21:23 +01:00 · 2025-02-07 09:21:23 +01:00 · a257beb170
commit a257beb170
parent 3cf8ef02d1
14 changed files with 366 additions and 8 deletions
--- a/rust/rox/src/keywords.rs
+++ b/rust/rox/src/keywords.rs
@ -3,6 +3,7 @@ use lazy_static::lazy_static;
 use std::collections::HashMap;
 lazy_static! {
    /// Mapping of reserved keywords to their respective TokenType.
    pub static ref KEYWORDS: HashMap<std::string::String, TokenType> = {
        let mut m = HashMap::new();
        m.insert("and".into(), And);
--- a/rust/rox/src/lib.rs
+++ b/rust/rox/src/lib.rs
@ -1,3 +1,6 @@
 //! Interpret the Lox language. Either compile (interpret for now though) some source code or run a
 //! REPL.
 use std::{
    fs::{self},
    io::{self, Write},
@ -22,6 +25,7 @@ pub mod tokenizer {
    pub mod whitespace;
 }
 /// Read the source code in a file and scan it to tokens.
 pub fn compile(source: &Path) -> Result<(), io::Error> {
    let input = fs::read_to_string(source)?;
    let _tokens = scanner::tokenize(&input);
@ -29,6 +33,7 @@ pub fn compile(source: &Path) -> Result<(), io::Error> {
    Ok(())
 }
 /// Run a Lox REPL until SIGINT.
 pub fn repl() {
    loop {
        print!("> ");
--- a/rust/rox/src/main.rs
+++ b/rust/rox/src/main.rs
@ -4,6 +4,7 @@ use clap::Parser;
 use rox::cli::{Cli, Commands};
 use tracing::error;
 /// Cli entrypoint.
 fn main() {
    if std::env::var_os("RUST_LOG").is_none() {
        std::env::set_var("RUST_LOG", "info");
--- a/rust/rox/src/scanner.rs
+++ b/rust/rox/src/scanner.rs
@ -1,3 +1,5 @@
 //! Scan source code to create tokens out of it.
 use crate::{
    token::{Token, TokenType::Eof},
    tokenizer::{
@ -10,6 +12,8 @@ use lazy_static::lazy_static;
 use tracing::{debug, error};
 lazy_static! {
    /// Tokenizers to use in scanning. They are tried in the exact order in which they appear in
    /// the list.
    static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
        Box::new(SingleChar),
        Box::new(Whitespace),
@ -22,21 +26,20 @@ lazy_static! {
    ];
 }
 /// Take source code as input and return a list of tokens representing it.
 pub fn tokenize(source: &str) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();
    let mut source_chars = source.char_indices().peekable();
    let mut line = 1;
    while let Some(c) = source_chars.next() {
-        let mut tokenizer_idx = 0;
+        // careful, a tokenizer run can move the iterator but I have not found a more ergonomic variant yet
-        let mut tokenizer_result = None;
+        match TOKENIZERS
-        while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
+            .iter()
-            tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
+            .find_map(|x| x.run(c, &mut source_chars, source, line))
-            tokenizer_idx += 1;
+        {
        }
        match tokenizer_result {
            Some((line_advance, token)) => {
                // I do not like handling it this way, but it suffices for now
                line += line_advance;
                if let Some(token) = token {
@ -48,6 +51,7 @@ pub fn tokenize(source: &str) -> Vec<Token> {
            }
        }
    }
    // Eof is always the last token
    tokens.push(Token {
        token_type: Eof,
        lexeme: "".to_string(),
@ -58,3 +62,312 @@ pub fn tokenize(source: &str) -> Vec<Token> {
    tokens
 }
 #[cfg(test)]
 mod tests {
    use crate::token::Literal;
    use crate::token::Token;
    use crate::token::TokenType::*;
    use super::tokenize;
    const FIBONACCI: &str = r#"
      // first 21 elements in the Fibonacci sequence
      var a = 0;
      var temp;
      for (var b = 1; a < 10000; b = temp + b) {
        print a;
        temp = a;
        a = b;
      }
    "#;
    #[test]
    fn floating_points() {
        // NOTE(review): both arguments of this assertion are identical hand-written literals and
        // `tokenize` is never called, so the test cannot fail and does not exercise the scanner.
        // TODO: feed a number literal through `tokenize` and compare against the expected tokens.
        assert_eq!(
            vec![Token {
                token_type: Number,
                lexeme: "0".into(),
                literal: Some(Literal::Number(0.0)),
                line: 1
            },],
            vec![Token {
                token_type: Number,
                lexeme: "0".into(),
                literal: Some(Literal::Number(0.0)),
                line: 1
            },]
        );
    }
    #[test]
    fn multiline_string() {
        // A string literal that stretches over three lines must come back as one String token
        // reported on the line where it starts (1), while the trailing Eof token is reported on
        // the line where the input ends (3).
        let input = r#""Hello,
 world
 !""#;
        let tokens = tokenize(input);
        assert_eq!(
            tokens,
            vec![
                Token {
                    token_type: String,
                    lexeme: "\"Hello,\n world\n!\"".into(),
                    literal: Some(Literal::String("Hello,\n world\n!".into())),
                    line: 1
                },
                Token {
                    token_type: Eof,
                    lexeme: "".into(),
                    literal: None,
                    line: 3
                },
            ]
        );
    }
    #[test]
    fn fibonacci_tokens() {
        let tokens = tokenize(FIBONACCI);
        assert_eq!(
            tokens,
            vec![
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Number,
                    lexeme: "0".into(),
                    literal: Some(Literal::Number(0.0)),
                    line: 3
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: For,
                    lexeme: "for".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: LeftParen,
                    lexeme: "(".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Number,
                    lexeme: "1".into(),
                    literal: Some(Literal::Number(1.0)),
                    line: 6
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Less,
                    lexeme: "<".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Number,
                    lexeme: "10000".into(),
                    literal: Some(Literal::Number(10000.0)),
                    line: 6
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Plus,
                    lexeme: "+".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: RightParen,
                    lexeme: ")".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: LeftBrace,
                    lexeme: "{".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Print,
                    lexeme: "print".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: RightBrace,
                    lexeme: "}".into(),
                    literal: None,
                    line: 10
                },
                Token {
                    token_type: Eof,
                    lexeme: "".into(),
                    literal: None,
                    line: 11
                },
            ]
        );
    }
 }
--- a/rust/rox/src/token.rs
+++ b/rust/rox/src/token.rs
@ -1,5 +1,8 @@
 //! Token values and data structs.
 use std::fmt::Display;
 /// Exhaustive enumeration of all types of different tokens.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum TokenType {
    // Single-character tokens.
@ -51,17 +54,25 @@ pub enum TokenType {
    Eof,
 }
 /// Literal value carried by a token.
 #[derive(Clone, Debug, PartialEq)]
 pub enum Literal {
    /// String literal.
    String(String),
    /// Number literal, represented as f64 (thus it can be decimal).
    Number(f64),
 }
 /// Consumed token.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Token {
    /// Type of the token.
    pub token_type: TokenType,
    /// Lexeme that was consumed to create this token.
    pub lexeme: String,
    /// Literal value of the token, if any.
    pub literal: Option<Literal>,
    /// Starting line on which the token was consumed from the source.
    pub line: usize,
 }
--- a/rust/rox/src/tokenizer/comment.rs
+++ b/rust/rox/src/tokenizer/comment.rs
@ -5,6 +5,10 @@ use crate::token::Token;
 use super::interface::Tokenizer;
 /// Consume comments.
 ///
 /// A comment starts with '//' and runs until the end of the line.
 /// If only one '/' is seen, it is consumed as a Slash token.
 pub struct Comment;
 impl Tokenizer for Comment {
    fn run(
--- a/rust/rox/src/tokenizer/identifier.rs
+++ b/rust/rox/src/tokenizer/identifier.rs
@ -6,6 +6,10 @@ use std::{iter::Peekable, str::CharIndices};
 use super::interface::Tokenizer;
 /// Consume an identifier which also might be a keyword.
 ///
 /// An identifier starts with an alphabetic character and goes on consuming alphanumeric and
 /// underscore characters until the first different one.
 pub struct Identifier;
 impl Tokenizer for Identifier {
    fn run(
--- a/rust/rox/src/tokenizer/interface.rs
+++ b/rust/rox/src/tokenizer/interface.rs
@ -2,7 +2,12 @@ use std::{iter::Peekable, str::CharIndices};
 use crate::token::Token;
 /// Interface to implement by a tokenizer.
 pub trait Tokenizer: Send + Sync {
    /// Take a tuple consisting of the index of a char and the char itself, the whole source code
    /// iterator, the source itself and the current line. Return None if the current lexeme cannot
    /// be handled, or Some of a tuple where the first element is by how many lines the current
    /// line advanced and the second element is an Option holding the consumed token, if any.
    fn run(
        &self,
        c: (usize, char),
--- a/rust/rox/src/tokenizer/lookahead.rs
+++ b/rust/rox/src/tokenizer/lookahead.rs
@ -4,12 +4,17 @@ use crate::token::{Token, TokenType};
 use lazy_static::lazy_static;
 use std::{collections::HashMap, iter::Peekable, str::CharIndices};
 /// Data for one and two character lexemes.
 struct LookaheadEntry {
    /// TokenType to use if the lexeme consists of one character only.
    default_token: TokenType,
    /// Mapping of the second character of a two character lexeme to its TokenType.
    lookahead_map: HashMap<char, TokenType>,
 }
 lazy_static! {
    /// Mapping of one and two character lexemes, specifying the one character variant and as many
    /// two character ones as needed.
    static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
        let mut m = HashMap::new();
@ -57,6 +62,7 @@ lazy_static! {
    };
 }
 /// Consume lexemes that consist of exactly one or two characters.
 pub struct Lookahead;
 impl Tokenizer for Lookahead {
    fn run(
--- a/rust/rox/src/tokenizer/newline.rs
+++ b/rust/rox/src/tokenizer/newline.rs
@ -2,6 +2,7 @@ use super::interface::Tokenizer;
 use crate::token::Token;
 use std::{iter::Peekable, str::CharIndices};
 /// Consume newlines. Do not yield a token but increase the current line.
 pub struct Newline;
 impl Tokenizer for Newline {
    fn run(
--- a/rust/rox/src/tokenizer/number.rs
+++ b/rust/rox/src/tokenizer/number.rs
@ -4,6 +4,7 @@ use tracing::error;
 use super::interface::Tokenizer;
 /// Consume a number literal. Numbers can have one decimal point.
 pub struct Number;
 impl Tokenizer for Number {
    fn run(
--- a/rust/rox/src/tokenizer/single_char.rs
+++ b/rust/rox/src/tokenizer/single_char.rs
@ -7,6 +7,7 @@ use lazy_static::lazy_static;
 use std::{collections::HashMap, iter::Peekable, str::CharIndices};
 lazy_static! {
    /// Mapping of single characters to their respective TokenType.
    static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
        let mut m = HashMap::new();
        m.insert('(', LeftParen);
@ -23,6 +24,7 @@ lazy_static! {
    };
 }
 /// Consume a single character and produce its corresponding token.
 pub struct SingleChar;
 impl Tokenizer for SingleChar {
    fn run(
--- a/rust/rox/src/tokenizer/string.rs
+++ b/rust/rox/src/tokenizer/string.rs
@ -3,6 +3,9 @@ use crate::token::{Literal, Token, TokenType};
 use std::{iter::Peekable, str::CharIndices};
 use tracing::error;
 /// Consume a string literal.
 ///
 /// A string literal consists of everything between two '"' and can stretch across multiple lines.
 pub struct String;
 impl Tokenizer for String {
    fn run(
--- a/rust/rox/src/tokenizer/whitespace.rs
+++ b/rust/rox/src/tokenizer/whitespace.rs
@ -2,6 +2,7 @@ use super::interface::Tokenizer;
 use crate::token::Token;
 use std::{iter::Peekable, str::CharIndices};
 /// Consume and ignore whitespace characters.
 pub struct Whitespace;
 impl Tokenizer for Whitespace {
    fn run(