diff --git a/rust/rox/src/keywords.rs b/rust/rox/src/keywords.rs
index 23743a5..7e86ce4 100644
--- a/rust/rox/src/keywords.rs
+++ b/rust/rox/src/keywords.rs
@@ -3,6 +3,7 @@ use lazy_static::lazy_static;
 use std::collections::HashMap;
 
 lazy_static! {
+    /// Mapping of reserved keywords to their respective TokenType.
     pub static ref KEYWORDS: HashMap<String, TokenType> = {
         let mut m = HashMap::new();
         m.insert("and".into(), And);
diff --git a/rust/rox/src/lib.rs b/rust/rox/src/lib.rs
index adf569e..9b2becd 100644
--- a/rust/rox/src/lib.rs
+++ b/rust/rox/src/lib.rs
@@ -1,3 +1,6 @@
+//! Interpret the Lox language. Either compile (for now, just interpret) some source code or run a
+//! REPL.
+
 use std::{
     fs::{self},
     io::{self, Write},
@@ -22,6 +25,7 @@ pub mod tokenizer {
     pub mod whitespace;
 }
 
+/// Read the source code from a file and scan it into tokens.
 pub fn compile(source: &Path) -> Result<(), io::Error> {
     let input = fs::read_to_string(source)?;
     let _tokens = scanner::tokenize(&input);
@@ -29,6 +33,7 @@ pub fn compile(source: &Path) -> Result<(), io::Error> {
     Ok(())
 }
 
+/// Run a Lox REPL until SIGINT.
 pub fn repl() {
     loop {
         print!("> ");
diff --git a/rust/rox/src/main.rs b/rust/rox/src/main.rs
index 4c38f72..5dda64e 100644
--- a/rust/rox/src/main.rs
+++ b/rust/rox/src/main.rs
@@ -4,6 +4,7 @@ use clap::Parser;
 use rox::cli::{Cli, Commands};
 use tracing::error;
 
+/// CLI entrypoint.
 fn main() {
     if std::env::var_os("RUST_LOG").is_none() {
         std::env::set_var("RUST_LOG", "info");
diff --git a/rust/rox/src/scanner.rs b/rust/rox/src/scanner.rs
index 4a94dc8..f9655c3 100644
--- a/rust/rox/src/scanner.rs
+++ b/rust/rox/src/scanner.rs
@@ -1,3 +1,5 @@
+//! Scan source code to create tokens out of it.
+
 use crate::{
     token::{Token, TokenType::Eof},
     tokenizer::{
@@ -10,6 +12,8 @@ use lazy_static::lazy_static;
 use tracing::{debug, error};
 
 lazy_static! {
+    /// Tokenizers to use in scanning. They are tried in the exact order in which they appear in
+    /// the list.
     static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
         Box::new(SingleChar),
         Box::new(Whitespace),
@@ -22,21 +26,20 @@ lazy_static! {
     ];
 }
 
+/// Take source code as input and return a list of tokens representing it.
 pub fn tokenize(source: &str) -> Vec<Token> {
     let mut tokens: Vec<Token> = Vec::new();
     let mut source_chars = source.char_indices().peekable();
     let mut line = 1;
 
     while let Some(c) = source_chars.next() {
-        let mut tokenizer_idx = 0;
-        let mut tokenizer_result = None;
-        while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
-            tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
-            tokenizer_idx += 1;
-        }
-
-        match tokenizer_result {
+        // Careful: a tokenizer run can move the iterator, but a more ergonomic variant has not been found yet.
+        match TOKENIZERS
+            .iter()
+            .find_map(|x| x.run(c, &mut source_chars, source, line))
+        {
             Some((line_advance, token)) => {
+                // I do not like handling it this way, but it suffices for now.
                 line += line_advance;
 
                 if let Some(token) = token {
@@ -48,6 +51,7 @@ pub fn tokenize(source: &str) -> Vec<Token> {
             }
         }
     }
+    // Eof is always the last token
     tokens.push(Token {
         token_type: Eof,
         lexeme: "".to_string(),
@@ -58,3 +62,312 @@ pub fn tokenize(source: &str) -> Vec<Token> {
 
     tokens
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::token::Literal;
+    use crate::token::Token;
+    use crate::token::TokenType::*;
+
+    use super::tokenize;
+
+    const FIBONACCI: &str = r#"
+        // first 21 elements in the Fibonacci sequence
+        var a = 0;
+        var temp;
+
+        for (var b = 1; a < 10000; b = temp + b) {
+            print a;
+            temp = a;
+            a = b;
+        }
+    "#;
+
+    #[test]
+    fn floating_points() {
+        assert_eq!(
+            tokenize("0"),
+            vec![
+                Token {
+                    token_type: Number,
+                    lexeme: "0".into(),
+                    literal: Some(Literal::Number(0.0)),
+                    line: 1
+                },
+                Token {
+                    token_type: Eof,
+                    lexeme: "".into(),
+                    literal: None,
+                    line: 1
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn multiline_string() {
+        let input = r#""Hello,
+ world
+!""#;
+        let tokens = tokenize(input);
+
+        assert_eq!(
+            tokens,
+            vec![
+                Token {
+                    token_type: String,
+                    lexeme: "\"Hello,\n world\n!\"".into(),
+                    literal: Some(Literal::String("Hello,\n world\n!".into())),
+                    line: 1
+                },
+                Token {
+                    token_type: Eof,
+                    lexeme: "".into(),
+                    literal: None,
+                    line: 3
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn fibonacci_tokens() {
+        let tokens = tokenize(FIBONACCI);
+        assert_eq!(
+            tokens,
+            vec![
+                Token {
+                    token_type: Var,
+                    lexeme: "var".into(),
+                    literal: None,
+                    line: 3
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "a".into(),
+                    literal: None,
+                    line: 3
+                },
+                Token {
+                    token_type: Equal,
+                    lexeme: "=".into(),
+                    literal: None,
+                    line: 3
+                },
+                Token {
+                    token_type: Number,
+                    lexeme: "0".into(),
+                    literal: Some(Literal::Number(0.0)),
+                    line: 3
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 3
+                },
+                Token {
+                    token_type: Var,
+                    lexeme: "var".into(),
+                    literal: None,
+                    line: 4
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "temp".into(),
+                    literal: None,
+                    line: 4
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 4
+                },
+                Token {
+                    token_type: For,
+                    lexeme: "for".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: LeftParen,
+                    lexeme: "(".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Var,
+                    lexeme: "var".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "b".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Equal,
+                    lexeme: "=".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Number,
+                    lexeme: "1".into(),
+                    literal: Some(Literal::Number(1.0)),
+                    line: 6
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "a".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Less,
+                    lexeme: "<".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Number,
+                    lexeme: "10000".into(),
+                    literal: Some(Literal::Number(10000.0)),
+                    line: 6
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "b".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Equal,
+                    lexeme: "=".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "temp".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Plus,
+                    lexeme: "+".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "b".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: RightParen,
+                    lexeme: ")".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: LeftBrace,
+                    lexeme: "{".into(),
+                    literal: None,
+                    line: 6
+                },
+                Token {
+                    token_type: Print,
+                    lexeme: "print".into(),
+                    literal: None,
+                    line: 7
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "a".into(),
+                    literal: None,
+                    line: 7
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 7
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "temp".into(),
+                    literal: None,
+                    line: 8
+                },
+                Token {
+                    token_type: Equal,
+                    lexeme: "=".into(),
+                    literal: None,
+                    line: 8
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "a".into(),
+                    literal: None,
+                    line: 8
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 8
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "a".into(),
+                    literal: None,
+                    line: 9
+                },
+                Token {
+                    token_type: Equal,
+                    lexeme: "=".into(),
+                    literal: None,
+                    line: 9
+                },
+                Token {
+                    token_type: Identifier,
+                    lexeme: "b".into(),
+                    literal: None,
+                    line: 9
+                },
+                Token {
+                    token_type: Semicolon,
+                    lexeme: ";".into(),
+                    literal: None,
+                    line: 9
+                },
+                Token {
+                    token_type: RightBrace,
+                    lexeme: "}".into(),
+                    literal: None,
+                    line: 10
+                },
+                Token {
+                    token_type: Eof,
+                    lexeme: "".into(),
+                    literal: None,
+                    line: 11
+                },
+            ]
+        );
+    }
+}
diff --git a/rust/rox/src/token.rs b/rust/rox/src/token.rs
index 192afa6..a286bb3 100644
--- a/rust/rox/src/token.rs
+++ b/rust/rox/src/token.rs
@@ -1,5 +1,8 @@
+//! Token values and data structs.
+
 use std::fmt::Display;
 
+/// Exhaustive enumeration of all token types.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum TokenType {
     // Single-character tokens.
@@ -51,17 +54,25 @@ pub enum TokenType {
     Eof,
 }
 
+/// Literal value.
 #[derive(Clone, Debug, PartialEq)]
 pub enum Literal {
+    /// String literal.
     String(String),
+    /// Number literal, represented as f64 (thus it can be decimal).
     Number(f64),
 }
 
+/// Consumed token.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Token {
+    /// Type of the token.
     pub token_type: TokenType,
+    /// Lexeme that was consumed to create this token.
     pub lexeme: String,
+    /// Literal value of the token, if any.
     pub literal: Option<Literal>,
+    /// Starting line on which the token was consumed from the source.
     pub line: usize,
 }
 
diff --git a/rust/rox/src/tokenizer/comment.rs b/rust/rox/src/tokenizer/comment.rs
index 0a1d4c2..1e01267 100644
--- a/rust/rox/src/tokenizer/comment.rs
+++ b/rust/rox/src/tokenizer/comment.rs
@@ -5,6 +5,10 @@ use crate::token::Token;
 
 use super::interface::Tokenizer;
 
+/// Consume comments.
+///
+/// A comment starts with '//' and runs until the end of the line.
+/// If only one '/' is seen, it is consumed as a Slash token.
 pub struct Comment;
 impl Tokenizer for Comment {
     fn run(
diff --git a/rust/rox/src/tokenizer/identifier.rs b/rust/rox/src/tokenizer/identifier.rs
index 31952bf..e4aea6d 100644
--- a/rust/rox/src/tokenizer/identifier.rs
+++ b/rust/rox/src/tokenizer/identifier.rs
@@ -6,6 +6,10 @@ use std::{iter::Peekable, str::CharIndices};
 
 use super::interface::Tokenizer;
 
+/// Consume an identifier, which might also be a keyword.
+///
+/// An identifier starts with an alphabetic character and keeps consuming alphanumeric and
+/// underscore characters until the first character that does not match.
 pub struct Identifier;
 impl Tokenizer for Identifier {
     fn run(
diff --git a/rust/rox/src/tokenizer/interface.rs b/rust/rox/src/tokenizer/interface.rs
index 7b1e63b..2d851a0 100644
--- a/rust/rox/src/tokenizer/interface.rs
+++ b/rust/rox/src/tokenizer/interface.rs
@@ -2,7 +2,12 @@ use std::{iter::Peekable, str::CharIndices};
 
 use crate::token::Token;
 
+/// Interface to be implemented by every tokenizer.
 pub trait Tokenizer: Send + Sync {
+    /// Take a tuple consisting of the index of a char and the char itself, the whole source code
+    /// iterator, the source itself and the current line. Return None if the tokenizer cannot
+    /// handle the current lexeme, or Some tuple whose first element is how far the current line
+    /// advanced and whose second element optionally holds the consumed token.
     fn run(
         &self,
         c: (usize, char),
diff --git a/rust/rox/src/tokenizer/lookahead.rs b/rust/rox/src/tokenizer/lookahead.rs
index 6e0294e..3277180 100644
--- a/rust/rox/src/tokenizer/lookahead.rs
+++ b/rust/rox/src/tokenizer/lookahead.rs
@@ -4,12 +4,17 @@ use crate::token::{Token, TokenType};
 use lazy_static::lazy_static;
 use std::{collections::HashMap, iter::Peekable, str::CharIndices};
 
+/// Data for one and two character lexemes.
 struct LookaheadEntry {
+    /// TokenType if the lexeme is the one character variant.
     default_token: TokenType,
+    /// Mapping from the second character to a TokenType.
     lookahead_map: HashMap<char, TokenType>,
 }
 
 lazy_static! {
+    /// Mapping of one and two character lexemes, specifying the one character variant and as many
+    /// two character ones as needed.
     static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
         let mut m = HashMap::new();
 
@@ -57,6 +62,7 @@ lazy_static! {
     };
 }
 
+/// Consume lexemes that consist of exactly one or two characters.
 pub struct Lookahead;
 impl Tokenizer for Lookahead {
     fn run(
diff --git a/rust/rox/src/tokenizer/newline.rs b/rust/rox/src/tokenizer/newline.rs
index 74827d3..33abaa2 100644
--- a/rust/rox/src/tokenizer/newline.rs
+++ b/rust/rox/src/tokenizer/newline.rs
@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
 use crate::token::Token;
 use std::{iter::Peekable, str::CharIndices};
 
+/// Consume newlines. Do not yield a token but increase the current line.
 pub struct Newline;
 impl Tokenizer for Newline {
     fn run(
diff --git a/rust/rox/src/tokenizer/number.rs b/rust/rox/src/tokenizer/number.rs
index b6766cd..a437bad 100644
--- a/rust/rox/src/tokenizer/number.rs
+++ b/rust/rox/src/tokenizer/number.rs
@@ -4,6 +4,7 @@ use tracing::error;
 
 use super::interface::Tokenizer;
 
+/// Consume a number literal. Numbers can have one decimal point.
 pub struct Number;
 impl Tokenizer for Number {
     fn run(
diff --git a/rust/rox/src/tokenizer/single_char.rs b/rust/rox/src/tokenizer/single_char.rs
index 18e04b3..efdd55a 100644
--- a/rust/rox/src/tokenizer/single_char.rs
+++ b/rust/rox/src/tokenizer/single_char.rs
@@ -7,6 +7,7 @@ use lazy_static::lazy_static;
 use std::{collections::HashMap, iter::Peekable, str::CharIndices};
 
 lazy_static! {
+    /// Mapping of single characters to their respective TokenType.
     static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
         let mut m = HashMap::new();
         m.insert('(', LeftParen);
@@ -23,6 +24,7 @@ lazy_static! {
     };
 }
 
+/// Consume a single character and produce its corresponding token.
 pub struct SingleChar;
 impl Tokenizer for SingleChar {
     fn run(
diff --git a/rust/rox/src/tokenizer/string.rs b/rust/rox/src/tokenizer/string.rs
index 07d6309..2c30e7f 100644
--- a/rust/rox/src/tokenizer/string.rs
+++ b/rust/rox/src/tokenizer/string.rs
@@ -3,6 +3,9 @@ use crate::token::{Literal, Token, TokenType};
 use std::{iter::Peekable, str::CharIndices};
 use tracing::error;
 
+/// Consume a string literal.
+///
+/// A string literal consists of everything between two '"' and can stretch across multiple lines.
 pub struct String;
 impl Tokenizer for String {
     fn run(
diff --git a/rust/rox/src/tokenizer/whitespace.rs b/rust/rox/src/tokenizer/whitespace.rs
index a47f5c2..a462217 100644
--- a/rust/rox/src/tokenizer/whitespace.rs
+++ b/rust/rox/src/tokenizer/whitespace.rs
@@ -2,6 +2,7 @@ use super::interface::Tokenizer;
 use crate::token::Token;
 use std::{iter::Peekable, str::CharIndices};
 
+/// Consume and ignore whitespace characters.
 pub struct Whitespace;
 impl Tokenizer for Whitespace {
     fn run(
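
The scanner change above dispatches through the TOKENIZERS list with iter().find_map, so the first tokenizer that returns Some wins and the order of the list matters. The following standalone sketch, which is not part of the rox crate, shows that dispatch pattern in isolation: the closures merely stand in for the Box<dyn Tokenizer> trait objects, and every name in it (the closure list, the sample characters, the returned descriptions) is invented purely for illustration.

// Standalone illustration of the ordered find_map dispatch used in scanner::tokenize.
// Nothing here is rox API; each closure returns Some if it can handle the character,
// or None so the next one in the list gets a try.
fn main() {
    let tokenizers: Vec<Box<dyn Fn(char) -> Option<String>>> = vec![
        Box::new(|c| (c == ';').then(|| "Semicolon".to_string())),
        Box::new(|c| c.is_ascii_digit().then(|| format!("Digit({c})"))),
        Box::new(|c| c.is_alphabetic().then(|| format!("IdentifierStart({c})"))),
    ];

    for c in "x1;?".chars() {
        // find_map stops at the first closure that returns Some, mirroring how the
        // TOKENIZERS are tried in the exact order in which they appear in the list.
        match tokenizers.iter().find_map(|t| t(c)) {
            Some(kind) => println!("{c} -> {kind}"),
            None => println!("{c} -> no tokenizer matched"),
        }
    }
}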