refactor tokenizers away from one giant switch statement

Sebastian Hugentobler 2025-02-07 09:21:23 +01:00
parent 3cf8ef02d1
commit a257beb170
14 changed files with 366 additions and 8 deletions

View File

@@ -3,6 +3,7 @@ use lazy_static::lazy_static;
use std::collections::HashMap;
lazy_static! {
/// Mapping of reserved keywords to their respective TokenType.
pub static ref KEYWORDS: HashMap<std::string::String, TokenType> = {
let mut m = HashMap::new();
m.insert("and".into(), And);

View File

@@ -1,3 +1,6 @@
//! Interpret the Lox language. Either compile (for now, this only interprets) some source code or
//! run a REPL.
use std::{
fs::{self},
io::{self, Write},
@@ -22,6 +25,7 @@ pub mod tokenizer {
pub mod whitespace;
}
/// Read the source code from a file and scan it into tokens.
pub fn compile(source: &Path) -> Result<(), io::Error> {
let input = fs::read_to_string(source)?;
let _tokens = scanner::tokenize(&input);
@@ -29,6 +33,7 @@ pub fn compile(source: &Path) -> Result<(), io::Error> {
Ok(())
}
/// Run a Lox REPL until SIGINT.
pub fn repl() {
loop {
print!("> ");

View File

@@ -4,6 +4,7 @@ use clap::Parser;
use rox::cli::{Cli, Commands};
use tracing::error;
/// Cli entrypoint.
fn main() {
if std::env::var_os("RUST_LOG").is_none() {
std::env::set_var("RUST_LOG", "info");

View File

@@ -1,3 +1,5 @@
//! Scan source code to create tokens out of it.
use crate::{
token::{Token, TokenType::Eof},
tokenizer::{
@@ -10,6 +12,8 @@ use lazy_static::lazy_static;
use tracing::{debug, error};
lazy_static! {
/// Tokenizers to use in scanning. They are tried in the exact order in which they appear in
/// the list.
static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
Box::new(SingleChar),
Box::new(Whitespace),
@@ -22,21 +26,20 @@ lazy_static! {
];
}
/// Take source code as input and return a list of tokens representing it.
pub fn tokenize(source: &str) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new();
let mut source_chars = source.char_indices().peekable();
let mut line = 1;
while let Some(c) = source_chars.next() {
let mut tokenizer_idx = 0;
let mut tokenizer_result = None;
while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
tokenizer_idx += 1;
}
match tokenizer_result {
// Careful: a tokenizer run can advance the iterator; I have not found a more ergonomic approach yet
match TOKENIZERS
.iter()
.find_map(|x| x.run(c, &mut source_chars, source, line))
{
Some((line_advance, token)) => {
// I do not like handling it this way, but it suffices for now
line += line_advance;
if let Some(token) = token {
@@ -48,6 +51,7 @@ pub fn tokenize(source: &str) -> Vec<Token> {
}
}
}
// Eof is always the last token
tokens.push(Token {
token_type: Eof,
lexeme: "".to_string(),
@@ -58,3 +62,312 @@ pub fn tokenize(source: &str) -> Vec<Token> {
tokens
}
#[cfg(test)]
mod tests {
use crate::token::Literal;
use crate::token::Token;
use crate::token::TokenType::*;
use super::tokenize;
const FIBONACCI: &str = r#"
// first 21 elements in the Fibonacci sequence
var a = 0;
var temp;

for (var b = 1; a < 10000; b = temp + b) {
print a;
temp = a;
a = b;
}
"#;
#[test]
fn floating_points() {
let tokens = tokenize("0");
assert_eq!(
tokens,
vec![
Token {
token_type: Number,
lexeme: "0".into(),
literal: Some(Literal::Number(0.0)),
line: 1
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 1
},
]
);
}
#[test]
fn multiline_string() {
let input = r#""Hello,
world
!""#;
let tokens = tokenize(input);
assert_eq!(
tokens,
vec![
Token {
token_type: String,
lexeme: "\"Hello,\n world\n!\"".into(),
literal: Some(Literal::String("Hello,\n world\n!".into())),
line: 1
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 3
},
]
);
}
#[test]
fn fibonacci_tokens() {
let tokens = tokenize(FIBONACCI);
assert_eq!(
tokens,
vec![
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 3
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 3
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 3
},
Token {
token_type: Number,
lexeme: "0".into(),
literal: Some(Literal::Number(0.0)),
line: 3
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 3
},
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 4
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 4
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 4
},
Token {
token_type: For,
lexeme: "for".into(),
literal: None,
line: 6
},
Token {
token_type: LeftParen,
lexeme: "(".into(),
literal: None,
line: 6
},
Token {
token_type: Var,
lexeme: "var".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 6
},
Token {
token_type: Number,
lexeme: "1".into(),
literal: Some(Literal::Number(1.0)),
line: 6
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 6
},
Token {
token_type: Less,
lexeme: "<".into(),
literal: None,
line: 6
},
Token {
token_type: Number,
lexeme: "10000".into(),
literal: Some(Literal::Number(10000.0)),
line: 6
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 6
},
Token {
token_type: Plus,
lexeme: "+".into(),
literal: None,
line: 6
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 6
},
Token {
token_type: RightParen,
lexeme: ")".into(),
literal: None,
line: 6
},
Token {
token_type: LeftBrace,
lexeme: "{".into(),
literal: None,
line: 6
},
Token {
token_type: Print,
lexeme: "print".into(),
literal: None,
line: 7
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 7
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 7
},
Token {
token_type: Identifier,
lexeme: "temp".into(),
literal: None,
line: 8
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 8
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 8
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 8
},
Token {
token_type: Identifier,
lexeme: "a".into(),
literal: None,
line: 9
},
Token {
token_type: Equal,
lexeme: "=".into(),
literal: None,
line: 9
},
Token {
token_type: Identifier,
lexeme: "b".into(),
literal: None,
line: 9
},
Token {
token_type: Semicolon,
lexeme: ";".into(),
literal: None,
line: 9
},
Token {
token_type: RightBrace,
lexeme: "}".into(),
literal: None,
line: 10
},
Token {
token_type: Eof,
lexeme: "".into(),
literal: None,
line: 11
},
]
);
}
}

View File

@@ -1,5 +1,8 @@
//! Token values and data structs.
use std::fmt::Display;
/// Exhaustive enumeration of all types of different tokens.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TokenType {
// Single-character tokens.
@@ -51,17 +54,25 @@ pub enum TokenType {
Eof,
}
/// Literal value.
#[derive(Clone, Debug, PartialEq)]
pub enum Literal {
/// String literal.
String(String),
/// Number literal, represented as an f64 (so it can hold decimal values).
Number(f64),
}
/// Consumed token.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
/// Type of the token.
pub token_type: TokenType,
/// Lexeme that was consumed to create this token.
pub lexeme: String,
/// Literal value of the token, if any.
pub literal: Option<Literal>,
/// Starting line on which the token was consumed from the source.
pub line: usize,
}

View File

@@ -5,6 +5,10 @@ use crate::token::Token;
use super::interface::Tokenizer;
/// Consume comments.
///
/// A comment starts with '//' and runs until the end of the line.
/// If only one '/' is seen, it is consumed as a Slash token.
pub struct Comment;
impl Tokenizer for Comment {
fn run(

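The body of run is not part of this hunk. Purely as an illustration of the documented behaviour, a minimal sketch could look like the following; the run signature is reconstructed from the call site in scanner.rs (see the interface.rs hunk below), and the exact handling in the real implementation may differ:

use std::{iter::Peekable, str::CharIndices};
use crate::token::{Token, TokenType};
use super::interface::Tokenizer;

impl Tokenizer for Comment {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if c.1 != '/' {
            return None; // not a slash at all, let another tokenizer try
        }
        if chars.peek().is_some_and(|&(_, next)| next == '/') {
            // '//': skip to the end of the line and emit no token; the newline
            // itself is left for the Newline tokenizer to count
            while chars.peek().is_some_and(|&(_, next)| next != '\n') {
                chars.next();
            }
            Some((0, None))
        } else {
            // a lone '/' becomes a Slash token
            Some((
                0,
                Some(Token {
                    token_type: TokenType::Slash,
                    lexeme: "/".into(),
                    literal: None,
                    line,
                }),
            ))
        }
    }
}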
View File

@@ -6,6 +6,10 @@ use std::{iter::Peekable, str::CharIndices};
use super::interface::Tokenizer;
/// Consume an identifier which also might be a keyword.
///
/// An identifier starts with an alphabetic character and keeps consuming alphanumeric and
/// underscore characters until the first character that is neither.
pub struct Identifier;
impl Tokenizer for Identifier {
fn run(

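Again only the signature is visible in the hunk. A sketch of the documented behaviour might look like this; the run signature is reconstructed from scanner.rs, and the import path for the KEYWORDS map from the first hunk is a placeholder since the real module path is not visible in this diff:

use std::{iter::Peekable, str::CharIndices};
use crate::keywords::KEYWORDS; // placeholder path for the KEYWORDS map in the first hunk
use crate::token::{Token, TokenType};
use super::interface::Tokenizer;

impl Tokenizer for Identifier {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if !c.1.is_alphabetic() {
            return None;
        }
        // consume alphanumeric and underscore characters after the first one
        let mut end = c.0 + c.1.len_utf8();
        while let Some(&(idx, next)) = chars.peek() {
            if next.is_alphanumeric() || next == '_' {
                chars.next();
                end = idx + next.len_utf8();
            } else {
                break;
            }
        }
        let lexeme = &source[c.0..end];
        // reserved keywords get their own TokenType, everything else is an Identifier
        let token_type = KEYWORDS.get(lexeme).copied().unwrap_or(TokenType::Identifier);
        Some((
            0,
            Some(Token {
                token_type,
                lexeme: lexeme.to_string(),
                literal: None,
                line,
            }),
        ))
    }
}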
View File

@@ -2,7 +2,12 @@ use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;
/// Interface to implement by a tokenizer.
pub trait Tokenizer: Send + Sync {
/// Take a tuple of a char's index and the char itself, the whole source code iterator, the
/// source itself and the current line. Return None if the tokenizer cannot handle the current
/// lexeme, otherwise Some of a tuple whose first element is how many lines were consumed and
/// whose second element optionally holds the produced token.
fn run(
&self,
c: (usize, char),

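The hunk cuts off in the middle of the run signature. Reconstructed from the doc comment above and from the call site in scanner.rs (tokenizers are invoked as x.run(c, &mut source_chars, source, line) and the result is matched as an Option of a line advance plus an optional token), the trait presumably looks roughly like this sketch rather than being the literal code:

use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;

pub trait Tokenizer: Send + Sync {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)>;
}

Returning Option<(usize, Option<Token>)> lets a tokenizer claim a lexeme without emitting a token (comments, whitespace, newlines) while still reporting how many lines it consumed.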
View File

@@ -4,12 +4,17 @@ use crate::token::{Token, TokenType};
use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
/// Data for one and two character lexemes.
struct LookaheadEntry {
/// TokenType if a lexeme is a one character one.
default_token: TokenType,
/// Mapping of second level character to a TokenType.
lookahead_map: HashMap<char, TokenType>,
}
lazy_static! {
/// Mapping of one and two character lexemes, specifying the one character variant and as many
/// two character ones as needed.
static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
let mut m = HashMap::new();
@@ -57,6 +62,7 @@ lazy_static! {
};
}
/// Consume lexemes that consist of exactly one or two characters.
pub struct Lookahead;
impl Tokenizer for Lookahead {
fn run(

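Given the LOOKAHEAD_TOKENS map above, the dispatch in run stays small: look up the first character, then peek one character ahead to decide between the two character token and the one character default (for example '<' alone presumably maps to Less, while "<=" maps to its two character counterpart). A sketch using the run signature from the interface.rs hunk above and the items defined in this file:

impl Tokenizer for Lookahead {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // only characters with a LookaheadEntry are handled by this tokenizer
        let entry = LOOKAHEAD_TOKENS.get(&c.1)?;
        let (token_type, lexeme) = match chars.peek().copied() {
            Some((_, next)) if entry.lookahead_map.contains_key(&next) => {
                // the next character completes a known two character lexeme
                chars.next();
                (entry.lookahead_map[&next], format!("{}{}", c.1, next))
            }
            _ => (entry.default_token, c.1.to_string()),
        };
        Some((
            0,
            Some(Token {
                token_type,
                lexeme,
                literal: None,
                line,
            }),
        ))
    }
}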
View File

@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
/// Consume newlines. Do not yield a token but increase the current line.
pub struct Newline;
impl Tokenizer for Newline {
fn run(

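Since Newline only bumps the line counter, run can be tiny; a sketch with the signature from interface.rs:

impl Tokenizer for Newline {
    fn run(
        &self,
        c: (usize, char),
        _chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        _line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // no token is produced, but the scanner's line counter advances by one
        if c.1 == '\n' {
            Some((1, None))
        } else {
            None
        }
    }
}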
View File

@@ -4,6 +4,7 @@ use tracing::error;
use super::interface::Tokenizer;
/// Consume a number literal. Numbers can have one decimal point.
pub struct Number;
impl Tokenizer for Number {
fn run(

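A sketch of a run that enforces the single decimal point rule, using the signature from interface.rs and the file's imports (tracing::error is visible above; Token, TokenType and Literal are assumed to be in scope). The treatment of a trailing '.' and of parse failures is a guess, not taken from the actual implementation:

impl Tokenizer for Number {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if !c.1.is_ascii_digit() {
            return None;
        }
        let mut end = c.0 + 1;
        let mut seen_dot = false;
        // consume digits and at most one '.', all of which are one byte wide
        while let Some(&(idx, next)) = chars.peek() {
            if next.is_ascii_digit() || (next == '.' && !seen_dot) {
                seen_dot |= next == '.';
                chars.next();
                end = idx + 1;
            } else {
                break;
            }
        }
        let lexeme = &source[c.0..end];
        let literal = match lexeme.parse::<f64>() {
            Ok(n) => Some(Literal::Number(n)),
            Err(e) => {
                error!("could not parse '{}' as a number: {}", lexeme, e);
                None
            }
        };
        Some((
            0,
            Some(Token {
                token_type: TokenType::Number,
                lexeme: lexeme.to_string(),
                literal,
                line,
            }),
        ))
    }
}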
View File

@@ -7,6 +7,7 @@ use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
lazy_static! {
/// Mapping of single characters to their respective TokenType.
static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
let mut m = HashMap::new();
m.insert('(', LeftParen);
@@ -23,6 +24,7 @@ lazy_static! {
};
}
/// Consume a single character and produce its corresponding token.
pub struct SingleChar;
impl Tokenizer for SingleChar {
fn run(

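With the SINGLE_CHAR_TOKENS map above, run reduces to a lookup; a sketch with the signature from interface.rs:

impl Tokenizer for SingleChar {
    fn run(
        &self,
        c: (usize, char),
        _chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // characters missing from SINGLE_CHAR_TOKENS are left to the other tokenizers
        let token_type = SINGLE_CHAR_TOKENS.get(&c.1).copied()?;
        Some((
            0,
            Some(Token {
                token_type,
                lexeme: c.1.to_string(),
                literal: None,
                line,
            }),
        ))
    }
}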
View File

@@ -3,6 +3,9 @@ use crate::token::{Literal, Token, TokenType};
use std::{iter::Peekable, str::CharIndices};
use tracing::error;
/// Consume a string literal.
///
/// A string literal consists of everything between two '"' and can stretch across multiple lines.
pub struct String;
impl Tokenizer for String {
fn run(

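The multiline_string test in scanner.rs above pins down the interesting parts of the contract: the lexeme keeps the surrounding quotes, the literal drops them, the token carries the starting line, and the returned line advance accounts for the newlines inside the string. A sketch consistent with that, using the signature from interface.rs and this file's imports; the unterminated string handling is a guess:

impl Tokenizer for String {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if c.1 != '"' {
            return None;
        }
        let mut line_advance = 0;
        let mut closed = false;
        let mut end = c.0 + 1;
        // consume everything up to and including the closing '"', counting newlines
        for (idx, next) in chars.by_ref() {
            end = idx + next.len_utf8();
            if next == '\n' {
                line_advance += 1;
            } else if next == '"' {
                closed = true;
                break;
            }
        }
        if !closed {
            error!("unterminated string starting on line {}", line);
            return Some((line_advance, None));
        }
        let lexeme = &source[c.0..end];
        Some((
            line_advance,
            Some(Token {
                token_type: TokenType::String,
                lexeme: lexeme.to_string(),
                // the literal is the lexeme without the surrounding quotes
                literal: Some(Literal::String(lexeme[1..lexeme.len() - 1].to_string())),
                line,
            }),
        ))
    }
}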
View File

@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
/// Consume and ignore whitespace characters.
pub struct Whitespace;
impl Tokenizer for Whitespace {
fn run(