refactor tokenizers away from one giant switch statement
parent 3cf8ef02d1
commit a257beb170
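The gist of the refactor: scanning no longer funnels every character through one giant `match`; it walks an ordered list of `Tokenizer` trait objects and takes the first one that claims the character. A condensed, self-contained sketch of that pattern (toy tokenizers and a toy `run` signature; the real trait appears in the hunks below):

```rust
// Minimal sketch of the dispatch pattern this commit adopts. Names are
// shortened; the real `run` also receives the source iterator, the source
// text and the current line.
trait Tokenizer {
    /// Return None if this tokenizer does not recognize `c`.
    fn run(&self, c: char) -> Option<String>;
}

struct Digit;
impl Tokenizer for Digit {
    fn run(&self, c: char) -> Option<String> {
        c.is_ascii_digit().then(|| format!("number({c})"))
    }
}

struct Punct;
impl Tokenizer for Punct {
    fn run(&self, c: char) -> Option<String> {
        c.is_ascii_punctuation().then(|| format!("punct({c})"))
    }
}

fn main() {
    // Tried in order; the first Some wins, exactly like the TOKENIZERS list below.
    let tokenizers: Vec<Box<dyn Tokenizer>> = vec![Box::new(Digit), Box::new(Punct)];
    for c in "7;".chars() {
        let tok = tokenizers.iter().find_map(|t| t.run(c));
        println!("{c} -> {tok:?}");
    }
}
```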
@ -3,6 +3,7 @@ use lazy_static::lazy_static;
use std::collections::HashMap;

lazy_static! {
    /// Mapping of reserved keywords to their respective TokenType.
    pub static ref KEYWORDS: HashMap<std::string::String, TokenType> = {
        let mut m = HashMap::new();
        m.insert("and".into(), And);
@ -1,3 +1,6 @@
//! Interpret the Lox language. Either compile (for now: just interpret) some source code or run a
//! REPL.

use std::{
    fs::{self},
    io::{self, Write},
@ -22,6 +25,7 @@ pub mod tokenizer {
    pub mod whitespace;
}

/// Read the source code in a file and scan it to tokens.
pub fn compile(source: &Path) -> Result<(), io::Error> {
    let input = fs::read_to_string(source)?;
    let _tokens = scanner::tokenize(&input);
@ -29,6 +33,7 @@ pub fn compile(source: &Path) -> Result<(), io::Error> {
    Ok(())
}

/// Run a Lox REPL until SIGINT.
pub fn repl() {
    loop {
        print!("> ");
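The hunk cuts off right at the prompt. A plausible completion of the loop — flushing the prompt (`print!` alone does not flush) and reading a line back — could look like the following; the `tokenize` parameter stands in for the crate's scanner and is an assumption, not the commit's code:

```rust
use std::io::{self, BufRead, Write};

// Hypothetical completion of the REPL loop: flush the prompt, read one line,
// hand it to the scanner, repeat until EOF (or SIGINT kills the process).
fn repl(tokenize: impl Fn(&str)) {
    let stdin = io::stdin();
    loop {
        print!("> ");
        io::stdout().flush().expect("could not flush stdout");
        let mut line = String::new();
        match stdin.lock().read_line(&mut line) {
            Ok(0) => break, // EOF
            Ok(_) => tokenize(&line),
            Err(e) => {
                eprintln!("read error: {e}");
                break;
            }
        }
    }
}
```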
@ -4,6 +4,7 @@ use clap::Parser;
use rox::cli::{Cli, Commands};
use tracing::error;

/// CLI entrypoint.
fn main() {
    if std::env::var_os("RUST_LOG").is_none() {
        std::env::set_var("RUST_LOG", "info");
@ -1,3 +1,5 @@
//! Scan source code to create tokens out of it.

use crate::{
    token::{Token, TokenType::Eof},
    tokenizer::{
@ -10,6 +12,8 @@ use lazy_static::lazy_static;
use tracing::{debug, error};

lazy_static! {
    /// Tokenizers to use in scanning. They are tried in the exact order in which they appear in
    /// the list.
    static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
        Box::new(SingleChar),
        Box::new(Whitespace),
@ -22,21 +26,20 @@ lazy_static! {
    ];
}

/// Take source code as input and return a list of tokens representing it.
pub fn tokenize(source: &str) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();
    let mut source_chars = source.char_indices().peekable();

    let mut line = 1;
    while let Some(c) = source_chars.next() {
-        let mut tokenizer_idx = 0;
-        let mut tokenizer_result = None;
-        while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
-            tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
-            tokenizer_idx += 1;
-        }
-
-        match tokenizer_result {
+        // careful: a tokenizer run can advance the iterator; I have not found a more ergonomic variant yet
+        match TOKENIZERS
+            .iter()
+            .find_map(|x| x.run(c, &mut source_chars, source, line))
+        {
            Some((line_advance, token)) => {
                // I do not like handling it this way, but it suffices for now
                line += line_advance;

                if let Some(token) = token {
@ -48,6 +51,7 @@ pub fn tokenize(source: &str) -> Vec<Token> {
            }
        }
    }
    // Eof is always the last token
    tokens.push(Token {
        token_type: Eof,
        lexeme: "".to_string(),
@ -58,3 +62,312 @@ pub fn tokenize(source: &str) -> Vec<Token> {

    tokens
}
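Why the swap is behavior-preserving: the removed index loop and the new `find_map` both stop at the first tokenizer that returns `Some`, so later tokenizers never run for a character an earlier one has claimed. A toy demonstration (closures stand in for the tokenizer objects):

```rust
// Toy demonstration that the removed index loop and the new find_map are
// equivalent: both short-circuit at the first candidate that returns Some.
fn main() {
    let candidates: Vec<Box<dyn Fn(char) -> Option<&'static str>>> = vec![
        Box::new(|c| (c == 'a').then_some("first")),
        Box::new(|c| (c == 'a').then_some("second, never reached for 'a'")),
    ];

    // Old shape: manual index loop with an early-out condition.
    let mut idx = 0;
    let mut result = None;
    while idx < candidates.len() && result.is_none() {
        result = candidates[idx]('a');
        idx += 1;
    }

    // New shape: find_map stops after the first Some.
    let via_find_map = candidates.iter().find_map(|f| f('a'));

    assert_eq!(result, via_find_map); // both are Some("first")
    println!("{result:?}");
}
```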
#[cfg(test)]
mod tests {
    use crate::token::Literal;
    use crate::token::Token;
    use crate::token::TokenType::*;

    use super::tokenize;

    const FIBONACCI: &str = r#"
// first 21 elements in the Fibonacci sequence
var a = 0;
var temp;

for (var b = 1; a < 10000; b = temp + b) {
    print a;
    temp = a;
    a = b;
}
"#;

    #[test]
    fn floating_points() {
        assert_eq!(
            tokenize("0"),
            vec![
                Token {
                    token_type: Number,
                    lexeme: "0".into(),
                    literal: Some(Literal::Number(0.0)),
                    line: 1
                },
                Token {
                    token_type: Eof,
                    lexeme: "".into(),
                    literal: None,
                    line: 1
                },
            ]
        );
    }

    #[test]
    fn multiline_string() {
        let input = r#""Hello,
 world
!""#;
        let tokens = tokenize(input);

        assert_eq!(
            tokens,
            vec![
                Token {
                    token_type: String,
                    lexeme: "\"Hello,\n world\n!\"".into(),
                    literal: Some(Literal::String("Hello,\n world\n!".into())),
                    line: 1
                },
                Token {
                    token_type: Eof,
                    lexeme: "".into(),
                    literal: None,
                    line: 3
                },
            ]
        );
    }

    #[test]
    fn fibonacci_tokens() {
        let tokens = tokenize(FIBONACCI);
        assert_eq!(
            tokens,
            vec![
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Number,
                    lexeme: "0".into(),
                    literal: Some(Literal::Number(0.0)),
                    line: 3
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 3
                },
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 4
                },
                Token {
                    token_type: For,
                    lexeme: "for".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: LeftParen,
                    lexeme: "(".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Var,
                    lexeme: "var".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Number,
                    lexeme: "1".into(),
                    literal: Some(Literal::Number(1.0)),
                    line: 6
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Less,
                    lexeme: "<".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Number,
                    lexeme: "10000".into(),
                    literal: Some(Literal::Number(10000.0)),
                    line: 6
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Plus,
                    lexeme: "+".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: RightParen,
                    lexeme: ")".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: LeftBrace,
                    lexeme: "{".into(),
                    literal: None,
                    line: 6
                },
                Token {
                    token_type: Print,
                    lexeme: "print".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 7
                },
                Token {
                    token_type: Identifier,
                    lexeme: "temp".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 8
                },
                Token {
                    token_type: Identifier,
                    lexeme: "a".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Equal,
                    lexeme: "=".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Identifier,
                    lexeme: "b".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: Semicolon,
                    lexeme: ";".into(),
                    literal: None,
                    line: 9
                },
                Token {
                    token_type: RightBrace,
                    lexeme: "}".into(),
                    literal: None,
                    line: 10
                },
                Token {
                    token_type: Eof,
                    lexeme: "".into(),
                    literal: None,
                    line: 11
                },
            ]
        );
    }
}

@ -1,5 +1,8 @@
//! Token values and data structs.

use std::fmt::Display;

/// Exhaustive enumeration of all types of different tokens.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TokenType {
    // Single-character tokens.
@ -51,17 +54,25 @@ pub enum TokenType {
    Eof,
}

/// Literal value.
#[derive(Clone, Debug, PartialEq)]
pub enum Literal {
    /// String literal.
    String(String),
    /// Number literal, represented as f64 (thus it can be decimal).
    Number(f64),
}

/// Consumed token.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
    /// Type of the token.
    pub token_type: TokenType,
    /// Lexeme that was consumed to create this token.
    pub lexeme: String,
    /// Literal value of the token, if any.
    pub literal: Option<Literal>,
    /// Starting line on which the token was consumed from the source.
    pub line: usize,
}
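The new `use std::fmt::Display;` suggests a `Display` impl further down, outside the hunk's context. For orientation, the struct is cheap to build by hand; this self-contained snippet (types copied from the hunk, trimmed to two variants) shows why the tests above expect `Literal::Number(0.0)` for the lexeme `"0"`:

```rust
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum TokenType { Number, Eof }

#[derive(Clone, Debug, PartialEq)]
enum Literal { String(String), Number(f64) }

#[derive(Clone, Debug, PartialEq)]
struct Token {
    token_type: TokenType,
    lexeme: String,
    literal: Option<Literal>,
    line: usize,
}

fn main() {
    // The lexeme keeps the raw text; the literal carries the parsed value.
    // Integer lexemes still become f64 literals, hence Number(0.0) for "0".
    let zero = Token {
        token_type: TokenType::Number,
        lexeme: "0".into(),
        literal: Some(Literal::Number(0.0)),
        line: 1,
    };
    println!("{zero:?}");
}
```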
@ -5,6 +5,10 @@ use crate::token::Token;

use super::interface::Tokenizer;

/// Consume comments.
///
/// A comment starts with '//' and runs until the end of the line.
/// If only one '/' is seen, it is consumed as a Slash token.
pub struct Comment;
impl Tokenizer for Comment {
    fn run(
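The `run` body lies outside the hunk; what the doc comment describes could look roughly like this, reduced to a free function over the same `Peekable<CharIndices>`. Stopping *before* the newline is an assumption, chosen so the `Newline` tokenizer still sees it and can bump the line counter:

```rust
use std::iter::Peekable;
use std::str::CharIndices;

// Sketch of the branching in the doc comment. Returns true if a comment was
// consumed, false if the '/' was a lone Slash.
fn consume_comment(chars: &mut Peekable<CharIndices<'_>>) -> bool {
    // The scanner already consumed the first '/'; check for a second one.
    if !matches!(chars.peek(), Some((_, '/'))) {
        return false; // lone '/': the caller should emit a Slash token
    }
    // It is a comment: skip everything up to (but not including) the newline.
    while matches!(chars.peek(), Some((_, c)) if *c != '\n') {
        chars.next();
    }
    true
}

fn main() {
    let src = "// a comment\nx";
    let mut chars = src.char_indices().peekable();
    chars.next(); // the first '/', consumed by the scanner's main loop
    assert!(consume_comment(&mut chars));
    assert_eq!(chars.next().map(|(_, c)| c), Some('\n'));
}
```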
@ -6,6 +6,10 @@ use std::{iter::Peekable, str::CharIndices};

use super::interface::Tokenizer;

/// Consume an identifier which also might be a keyword.
///
/// An identifier starts with an alphabetic character and goes on consuming alphanumeric and
/// underscore characters until the first different one.
pub struct Identifier;
impl Tokenizer for Identifier {
    fn run(
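A sketch of that rule combined with the `KEYWORDS` map from the first hunk; a plain `HashMap` stands in for the lazy_static, and the token names are strings here:

```rust
use std::collections::HashMap;
use std::iter::Peekable;
use std::str::CharIndices;

// Start from an alphabetic char, keep consuming alphanumerics and '_',
// then check the finished word against the keyword map.
fn scan_word(first: char, chars: &mut Peekable<CharIndices<'_>>) -> String {
    let mut word = String::from(first);
    while let Some((_, c)) = chars.peek() {
        if c.is_alphanumeric() || *c == '_' {
            word.push(*c);
            chars.next();
        } else {
            break;
        }
    }
    word
}

fn main() {
    let keywords: HashMap<String, &str> = HashMap::from([("and".to_string(), "And")]);
    for src in ["and", "andover"] {
        let mut chars = src.char_indices().peekable();
        let (_, first) = chars.next().unwrap();
        let word = scan_word(first, &mut chars);
        // A known keyword gets its own TokenType; anything else is an Identifier.
        match keywords.get(&word) {
            Some(tt) => println!("{word} -> keyword {tt}"),
            None => println!("{word} -> Identifier"),
        }
    }
}
```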
@ -2,7 +2,12 @@ use std::{iter::Peekable, str::CharIndices};

use crate::token::Token;

/// Interface to implement by a tokenizer.
pub trait Tokenizer: Send + Sync {
    /// Take a tuple consisting of the index of a char and the char itself, the whole source code
    /// iterator, the source itself and the current line. Return None if this tokenizer cannot
    /// handle the current lexeme, or Some of a tuple whose first element is how far the current
    /// line moved and whose second element is an Option holding the consumed token, if any.
    fn run(
        &self,
        c: (usize, char),
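The rest of the signature is cut off in the hunk. Reconstructed from the call site in scanner.rs (`x.run(c, &mut source_chars, source, line)`) and the documented return contract — parameter names and exact types are guesses — with a `Newline`-style tokenizer as the smallest possible impl:

```rust
use std::iter::Peekable;
use std::str::CharIndices;

// Stand-in Token so the example compiles on its own.
#[derive(Debug)]
struct Token;

// Reconstructed trait shape; treat names and types as assumptions.
trait Tokenizer: Send + Sync {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        source: &str,
        line: usize,
    ) -> Option<(usize, Option<Token>)>;
}

/// Newline handles exactly one character, yields no token, and reports
/// that the line counter moved by one.
struct Newline;
impl Tokenizer for Newline {
    fn run(
        &self,
        c: (usize, char),
        _chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        _line: usize,
    ) -> Option<(usize, Option<Token>)> {
        // None at the top level means "not my lexeme, try the next tokenizer".
        (c.1 == '\n').then_some((1, None))
    }
}

fn main() {
    let src = "\n";
    let mut chars = src.char_indices().peekable();
    let c = chars.next().unwrap();
    println!("{:?}", Newline.run(c, &mut chars, src, 1)); // Some((1, None))
}
```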
@ -4,12 +4,17 @@ use crate::token::{Token, TokenType};
use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};

/// Data for one and two character lexemes.
struct LookaheadEntry {
    /// TokenType if a lexeme is a one character one.
    default_token: TokenType,
    /// Mapping of second level character to a TokenType.
    lookahead_map: HashMap<char, TokenType>,
}

lazy_static! {
    /// Mapping of one and two character lexemes, specifying the one character variant and as many
    /// two character ones as needed.
    static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
        let mut m = HashMap::new();

@ -57,6 +62,7 @@ lazy_static! {
    };
}

/// Consume lexemes that consist of exactly one or two characters.
pub struct Lookahead;
impl Tokenizer for Lookahead {
    fn run(
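How an entry plays out, using `!` as the classic one-or-two-character case. Bang/BangEqual are Lox's usual token names; the truncated enum above does not show them, so treat them as assumed:

```rust
use std::collections::HashMap;

// Sketch of how a LookaheadEntry drives one-or-two-character matching;
// strings stand in for TokenType.
struct LookaheadEntry {
    default_token: &'static str,
    lookahead_map: HashMap<char, &'static str>,
}

fn main() {
    let bang = LookaheadEntry {
        default_token: "Bang",
        lookahead_map: HashMap::from([('=', "BangEqual")]),
    };

    for src in ["!x", "!=x"] {
        let mut chars = src.chars().peekable();
        chars.next(); // the '!' itself, already consumed by the scanner
        // Peek one character: if it completes a two-char lexeme, consume it
        // too and use the mapped token; otherwise fall back to the default.
        let token = match chars.peek().and_then(|c| bang.lookahead_map.get(c)) {
            Some(tt) => {
                chars.next();
                *tt
            }
            None => bang.default_token,
        };
        println!("{src:?} starts with {token}");
    }
}
```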
@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};

/// Consume newlines. Do not yield a token but increase the current line.
pub struct Newline;
impl Tokenizer for Newline {
    fn run(
@ -4,6 +4,7 @@ use tracing::error;

use super::interface::Tokenizer;

/// Consume a number literal. Numbers can have one decimal point.
pub struct Number;
impl Tokenizer for Number {
    fn run(
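A sketch of the "one decimal point" rule: consume digits, accept at most a single `.`, then parse the accumulated text as `f64`. The real `run` also builds a `Token` and reports errors via `tracing::error`; here the function just returns the parsed value:

```rust
use std::iter::Peekable;
use std::str::CharIndices;

// Consume digits and at most one '.', then parse the slice as f64.
fn scan_number(first: char, chars: &mut Peekable<CharIndices<'_>>) -> f64 {
    let mut text = String::from(first);
    let mut seen_dot = false;
    while let Some((_, c)) = chars.peek().copied() {
        if c.is_ascii_digit() {
            text.push(c);
            chars.next();
        } else if c == '.' && !seen_dot {
            seen_dot = true;
            text.push(c);
            chars.next();
        } else {
            break;
        }
    }
    text.parse().expect("only digits and one '.' were consumed")
}

fn main() {
    let src = "10000.5;";
    let mut chars = src.char_indices().peekable();
    let (_, first) = chars.next().unwrap();
    assert_eq!(scan_number(first, &mut chars), 10000.5);
}
```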
@ -7,6 +7,7 @@ use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};

lazy_static! {
    /// Mapping of single characters to their respective TokenType.
    static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
        let mut m = HashMap::new();
        m.insert('(', LeftParen);
@ -23,6 +24,7 @@ lazy_static! {
    };
}

/// Consume a single character and produce its corresponding token.
pub struct SingleChar;
impl Tokenizer for SingleChar {
    fn run(
@ -3,6 +3,9 @@ use crate::token::{Literal, Token, TokenType};
use std::{iter::Peekable, str::CharIndices};
use tracing::error;

/// Consume a string literal.
///
/// A string literal consists of everything between two '"' and can stretch across multiple lines.
pub struct String;
impl Tokenizer for String {
    fn run(
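A sketch of the multiline behaviour the `multiline_string` test above exercises: the token keeps its starting line, and the newlines inside the literal come back as the line advance for the scanner to add afterwards (unterminated-string error reporting is elided):

```rust
// `source` starts right after the opening '"'.
fn scan_string(source: &str) -> Option<(usize, String)> {
    let end = source.find('"')?; // None here would mean an unterminated string
    let literal = &source[..end];
    // Count the newlines inside the literal; the scanner adds this to `line`.
    let line_advance = literal.matches('\n').count();
    Some((line_advance, literal.to_string()))
}

fn main() {
    let input = "Hello,\n world\n!\" trailing";
    let (advance, lit) = scan_string(input).expect("terminated string");
    assert_eq!(advance, 2); // matches the test: token on line 1, Eof on line 3
    assert_eq!(lit, "Hello,\n world\n!");
}
```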
@ -2,6 +2,7 @@ use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};

/// Consume and ignore whitespace characters.
pub struct Whitespace;
impl Tokenizer for Whitespace {
    fn run(