refactor tokenizers away from one giant switch statement

Sebastian Hugentobler 2025-02-07 09:21:23 +01:00
parent 3cf8ef02d1
commit a257beb170
14 changed files with 366 additions and 8 deletions

View File

@@ -3,6 +3,7 @@ use lazy_static::lazy_static;
use std::collections::HashMap;
lazy_static! {
/// Mapping of reserved keywords to their respective TokenType.
pub static ref KEYWORDS: HashMap<std::string::String, TokenType> = {
let mut m = HashMap::new();
m.insert("and".into(), And);

View File

@@ -1,3 +1,6 @@
//! Interpret the Lox language. Either compile (for now, this only interprets) some source code or
//! run a REPL.
use std::{
fs::{self},
io::{self, Write},
@@ -22,6 +25,7 @@ pub mod tokenizer {
pub mod whitespace;
}
/// Read the source code from a file and scan it into tokens.
pub fn compile(source: &Path) -> Result<(), io::Error> {
let input = fs::read_to_string(source)?;
let _tokens = scanner::tokenize(&input);
@@ -29,6 +33,7 @@ pub fn compile(source: &Path) -> Result<(), io::Error> {
Ok(())
}
/// Run a Lox REPL until SIGINT.
pub fn repl() {
loop {
print!("> ");

View File

@@ -4,6 +4,7 @@ use clap::Parser;
use rox::cli::{Cli, Commands};
use tracing::error;
/// Cli entrypoint.
fn main() {
if std::env::var_os("RUST_LOG").is_none() {
std::env::set_var("RUST_LOG", "info");

View File

@@ -1,3 +1,5 @@
//! Scan source code to create tokens out of it.
use crate::{
token::{Token, TokenType::Eof},
tokenizer::{
@@ -10,6 +12,8 @@ use lazy_static::lazy_static;
use tracing::{debug, error};
lazy_static! {
/// Tokenizers to use in scanning. They are tried in the exact order in which they appear in
/// the list.
static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
Box::new(SingleChar),
Box::new(Whitespace),
@@ -22,21 +26,20 @@ lazy_static! {
];
}
/// Take source code as input and return a list of tokens representing it.
pub fn tokenize(source: &str) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new();
let mut source_chars = source.char_indices().peekable();
let mut line = 1;
while let Some(c) = source_chars.next() {
let mut tokenizer_idx = 0;
let mut tokenizer_result = None;
while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
tokenizer_idx += 1;
}
match tokenizer_result {
// Careful: a tokenizer run can advance the iterator; I have not found a more ergonomic approach yet
match TOKENIZERS
.iter()
.find_map(|x| x.run(c, &mut source_chars, source, line))
{
Some((line_advance, token)) => {
// I do not like handling it this way, but it suffices for now
line += line_advance;
if let Some(token) = token {
@@ -48,6 +51,7 @@ pub fn tokenize(source: &str) -> Vec<Token> {
}
}
}
// Eof is always the last token
tokens.push(Token {
token_type: Eof,
lexeme: "".to_string(),
@@ -58,3 +62,312 @@ pub fn tokenize(source: &str) -> Vec<Token> {
tokens
}
#[cfg(test)]
mod tests {
use crate::token::Literal;
use crate::token::Token;
use crate::token::TokenType::*;
use super::tokenize;
const FIBONACCI: &str = r#"
// first 21 elements in the Fibonacci sequence
var a = 0;
var temp;

for (var b = 1; a < 10000; b = temp + b) {
print a;
temp = a;
a = b;
}
"#;
#[test]
fn floating_points() {
let tokens = tokenize("0");
assert_eq!(
tokens,
vec![
Token {
token_type: Number,
lexeme: "0".into(),
literal: Some(Literal::Number(0.0)),
line: 1
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 1
},
]
);
}
#[test]
fn multiline_string() {
let input = r#""Hello,
world
!""#;
let tokens = tokenize(input);
assert_eq!(
tokens,
vec![
Token {
token_type: String,
lexeme: "\"Hello,\n world\n!\"".into(),
literal: Some(Literal::String("Hello,\n world\n!".into())),
line: 1
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 3
},
]
);
}
#[test]
fn fibonacci_tokens() {
let tokens = tokenize(FIBONACCI);
assert_eq!(
tokens,
vec![
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 3
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 3
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 3
},
Token {
token_type: Number,
lexeme: "0".into(),
literal: Some(Literal::Number(0.0)),
line: 3
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 3
},
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 4
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 4
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 4
},
Token {
token_type: For,
lexeme: "for".into(),
literal: None,
line: 6
},
Token {
token_type: LeftParen,
lexeme: "(".into(),
literal: None,
line: 6
},
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 6
},
Token {
token_type: Number,
lexeme: "1".into(),
literal: Some(Literal::Number(1.0)),
line: 6
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 6
},
Token {
token_type: Less,
lexeme: "<".into(),
literal: None,
line: 6
},
Token {
token_type: Number,
lexeme: "10000".into(),
literal: Some(Literal::Number(10000.0)),
line: 6
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 6
},
Token {
token_type: Plus,
lexeme: "+".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: RightParen,
lexeme: ")".into(),
literal: None,
line: 6
},
Token {
token_type: LeftBrace,
lexeme: "{".into(),
literal: None,
line: 6
},
Token {
token_type: Print,
lexeme: "print".into(),
literal: None,
line: 7
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 7
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 7
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 8
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 8
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 8
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 8
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 9
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 9
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 9
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 9
},
Token {
token_type: RightBrace,
lexeme: "}".into(),
literal: None,
line: 10
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 11
},
]
);
}
}

View File

@@ -1,5 +1,8 @@
//! Token values and data structs.
use std::fmt::Display;
/// Exhaustive enumeration of all types of different tokens.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TokenType {
// Single-character tokens.
@@ -51,17 +54,25 @@ pub enum TokenType {
Eof,
}
/// Literal value.
#[derive(Clone, Debug, PartialEq)]
pub enum Literal {
/// String literal.
String(String),
/// Number literal, represented as an f64 (so it can hold decimal values).
Number(f64),
}
/// Consumed token.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
/// Type of the token.
pub token_type: TokenType,
/// Lexeme that was consumed to create this token.
pub lexeme: String,
/// Literal value of the token, if any.
pub literal: Option<Literal>,
/// Starting line on which the token was consumed from the source.
pub line: usize,
}

View File

@@ -5,6 +5,10 @@ use crate::token::Token;
use super::interface::Tokenizer;
/// Consume comments.
///
/// A comment starts with '//' and runs until the end of the line.
/// If only one '/' is seen, it is consumed as a Slash token.
pub struct Comment;
impl Tokenizer for Comment {
fn run(

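The body of run is not part of this hunk. Purely as an illustration of the documented behaviour, a minimal sketch could look like the following; the run signature is reconstructed from the call site in scanner.rs (see the interface.rs hunk below), and the exact handling in the real implementation may differ:

use std::{iter::Peekable, str::CharIndices};
use crate::token::{Token, TokenType};
use super::interface::Tokenizer;

impl Tokenizer for Comment {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if c.1 != '/' {
            return None; // not a slash at all, let another tokenizer try
        }
        if chars.peek().is_some_and(|&(_, next)| next == '/') {
            // '//': skip to the end of the line and emit no token; the newline
            // itself is left for the Newline tokenizer to count
            while chars.peek().is_some_and(|&(_, next)| next != '\n') {
                chars.next();
            }
            Some((0, None))
        } else {
            // a lone '/' becomes a Slash token
            Some((
                0,
                Some(Token {
                    token_type: TokenType::Slash,
                    lexeme: "/".into(),
                    literal: None,
                    line,
                }),
            ))
        }
    }
}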
View File

@@ -6,6 +6,10 @@ use std::{iter::Peekable, str::CharIndices};
use super::interface::Tokenizer;
/// Consume an identifier which also might be a keyword.
///
/// An identifier starts with an alphabetic character and keeps consuming alphanumeric and
/// underscore characters until the first character that is neither.
pub struct Identifier;
impl Tokenizer for Identifier {
fn run(

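Again only the signature is visible in the hunk. A sketch of the documented behaviour might look like this; the run signature is reconstructed from scanner.rs, and the import path for the KEYWORDS map from the first hunk is a placeholder since the real module path is not visible in this diff:

use std::{iter::Peekable, str::CharIndices};
use crate::keywords::KEYWORDS; // placeholder path for the KEYWORDS map in the first hunk
use crate::token::{Token, TokenType};
use super::interface::Tokenizer;

impl Tokenizer for Identifier {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if !c.1.is_alphabetic() {
            return None;
        }
        // consume alphanumeric and underscore characters after the first one
        let mut end = c.0 + c.1.len_utf8();
        while let Some(&(idx, next)) = chars.peek() {
            if next.is_alphanumeric() || next == '_' {
                chars.next();
                end = idx + next.len_utf8();
            } else {
                break;
            }
        }
        let lexeme = &source[c.0..end];
        // reserved keywords get their own TokenType, everything else is an Identifier
        let token_type = KEYWORDS.get(lexeme).copied().unwrap_or(TokenType::Identifier);
        Some((
            0,
            Some(Token {
                token_type,
                lexeme: lexeme.to_string(),
                literal: None,
                line,
            }),
        ))
    }
}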
View File

@@ -2,7 +2,12 @@ use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;
/// Interface to implement by a tokenizer.
pub trait Tokenizer: Send + Sync {
/// Take a tuple of a char's index and the char itself, the whole source code iterator, the
/// source itself and the current line. Return None if the tokenizer cannot handle the current
/// lexeme, otherwise Some of a tuple whose first element is how many lines were consumed and
/// whose second element optionally holds the produced token.
fn run(
&self,
c: (usize, char),

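The hunk cuts off in the middle of the run signature. Reconstructed from the doc comment above and from the call site in scanner.rs (tokenizers are invoked as x.run(c, &mut source_chars, source, line) and the result is matched as an Option of a line advance plus an optional token), the trait presumably looks roughly like this sketch rather than being the literal code:

use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;

pub trait Tokenizer: Send + Sync {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)>;
}

Returning Option<(usize, Option<Token>)> lets a tokenizer claim a lexeme without emitting a token (comments, whitespace, newlines) while still reporting how many lines it consumed.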
View File

@@ -4,12 +4,17 @@ use crate::token::{Token, TokenType};
use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
/// Data for one and two character lexemes.
struct LookaheadEntry {
/// TokenType if a lexeme is a one character one.
default_token: TokenType,
/// Mapping of second level character to a TokenType.
lookahead_map: HashMap<char, TokenType>,
}
lazy_static! {
/// Mapping of one and two character lexemes, specifying the one character variant and as many
/// two character ones as needed.
static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
let mut m = HashMap::new();
@@ -57,6 +62,7 @@ lazy_static! {
};
}
/// Consume lexemes that consist of exactly one or two characters.
pub struct Lookahead;
impl Tokenizer for Lookahead {
fn run(

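Given the LOOKAHEAD_TOKENS map above, the dispatch in run stays small: look up the first character, then peek one character ahead to decide between the two character token and the one character default (for example '<' alone presumably maps to Less, while "<=" maps to its two character counterpart). A sketch using the run signature from the interface.rs hunk above and the items defined in this file:

impl Tokenizer for Lookahead {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // only characters with a LookaheadEntry are handled by this tokenizer
        let entry = LOOKAHEAD_TOKENS.get(&c.1)?;
        let (token_type, lexeme) = match chars.peek().copied() {
            Some((_, next)) if entry.lookahead_map.contains_key(&next) => {
                // the next character completes a known two character lexeme
                chars.next();
                (entry.lookahead_map[&next], format!("{}{}", c.1, next))
            }
            _ => (entry.default_token, c.1.to_string()),
        };
        Some((
            0,
            Some(Token {
                token_type,
                lexeme,
                literal: None,
                line,
            }),
        ))
    }
}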
View File

@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
/// Consume newlines. Do not yield a token but increase the current line.
pub struct Newline;
impl Tokenizer for Newline {
fn run(

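Since Newline only bumps the line counter, run can be tiny; a sketch with the signature from interface.rs:

impl Tokenizer for Newline {
    fn run(
        &self,
        c: (usize, char),
        _chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        _line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // no token is produced, but the scanner's line counter advances by one
        if c.1 == '\n' {
            Some((1, None))
        } else {
            None
        }
    }
}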
View File

@@ -4,6 +4,7 @@ use tracing::error;
use super::interface::Tokenizer;
/// Consume a number literal. Numbers can have one decimal point.
pub struct Number;
impl Tokenizer for Number {
fn run(

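A sketch of a run that enforces the single decimal point rule, using the signature from interface.rs and the file's imports (tracing::error is visible above; Token, TokenType and Literal are assumed to be in scope). The treatment of a trailing '.' and of parse failures is a guess, not taken from the actual implementation:

impl Tokenizer for Number {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if !c.1.is_ascii_digit() {
            return None;
        }
        let mut end = c.0 + 1;
        let mut seen_dot = false;
        // consume digits and at most one '.', all of which are one byte wide
        while let Some(&(idx, next)) = chars.peek() {
            if next.is_ascii_digit() || (next == '.' && !seen_dot) {
                seen_dot |= next == '.';
                chars.next();
                end = idx + 1;
            } else {
                break;
            }
        }
        let lexeme = &source[c.0..end];
        let literal = match lexeme.parse::<f64>() {
            Ok(n) => Some(Literal::Number(n)),
            Err(e) => {
                error!("could not parse '{}' as a number: {}", lexeme, e);
                None
            }
        };
        Some((
            0,
            Some(Token {
                token_type: TokenType::Number,
                lexeme: lexeme.to_string(),
                literal,
                line,
            }),
        ))
    }
}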
View File

@@ -7,6 +7,7 @@ use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
lazy_static! {
/// Mapping of single characters to their respective TokenType.
static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
let mut m = HashMap::new();
m.insert('(', LeftParen);
@@ -23,6 +24,7 @@ lazy_static! {
};
}
/// Consume a single character and produce its corresponding token.
pub struct SingleChar;
impl Tokenizer for SingleChar {
fn run(

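With the SINGLE_CHAR_TOKENS map above, run reduces to a lookup; a sketch with the signature from interface.rs:

impl Tokenizer for SingleChar {
    fn run(
        &self,
        c: (usize, char),
        _chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // characters missing from SINGLE_CHAR_TOKENS are left to the other tokenizers
        let token_type = SINGLE_CHAR_TOKENS.get(&c.1).copied()?;
        Some((
            0,
            Some(Token {
                token_type,
                lexeme: c.1.to_string(),
                literal: None,
                line,
            }),
        ))
    }
}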
View File

@@ -3,6 +3,9 @@ use crate::token::{Literal, Token, TokenType};
use std::{iter::Peekable, str::CharIndices};
use tracing::error;
/// Consume a string literal.
///
/// A string literal consists of everything between two '"' and can stretch across multiple lines.
pub struct String;
impl Tokenizer for String {
fn run(

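The multiline_string test in scanner.rs above pins down the interesting parts of the contract: the lexeme keeps the surrounding quotes, the literal drops them, the token carries the starting line, and the returned line advance accounts for the newlines inside the string. A sketch consistent with that, using the signature from interface.rs and this file's imports; the unterminated string handling is a guess:

impl Tokenizer for String {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if c.1 != '"' {
            return None;
        }
        let mut line_advance = 0;
        let mut closed = false;
        let mut end = c.0 + 1;
        // consume everything up to and including the closing '"', counting newlines
        for (idx, next) in chars.by_ref() {
            end = idx + next.len_utf8();
            if next == '\n' {
                line_advance += 1;
            } else if next == '"' {
                closed = true;
                break;
            }
        }
        if !closed {
            error!("unterminated string starting on line {}", line);
            return Some((line_advance, None));
        }
        let lexeme = &source[c.0..end];
        Some((
            line_advance,
            Some(Token {
                token_type: TokenType::String,
                lexeme: lexeme.to_string(),
                // the literal is the lexeme without the surrounding quotes
                literal: Some(Literal::String(lexeme[1..lexeme.len() - 1].to_string())),
                line,
            }),
        ))
    }
}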
View File

@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
/// Consume and ignore whitespace characters.
pub struct Whitespace;
impl Tokenizer for Whitespace {
fn run(