diff --git a/rust/rox/Cargo.lock b/rust/rox/Cargo.lock
index 3103be5..169f10a 100644
--- a/rust/rox/Cargo.lock
+++ b/rust/rox/Cargo.lock
@@ -247,6 +247,8 @@ name = "rox"
 version = "0.1.0"
 dependencies = [
  "clap",
+ "lazy_static",
+ "thiserror",
  "tracing",
  "tracing-subscriber",
 ]
@@ -283,6 +285,26 @@ dependencies = [
  "unicode-ident",
 ]
+
+[[package]]
+name = "thiserror"
+version = "2.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 
 [[package]]
 name = "thread_local"
 version = "1.1.8"
diff --git a/rust/rox/Cargo.toml b/rust/rox/Cargo.toml
index 9c77e69..9b1332a 100644
--- a/rust/rox/Cargo.toml
+++ b/rust/rox/Cargo.toml
@@ -7,5 +7,7 @@ authors = ["Sebastian Hugentobler "]
 
 [dependencies]
 clap = { version = "4.5.28", features = ["derive"] }
+lazy_static = "1.5.0"
+thiserror = "2.0.11"
 tracing = "0.1.41"
 tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
diff --git a/rust/rox/src/keywords.rs b/rust/rox/src/keywords.rs
new file mode 100644
index 0000000..23743a5
--- /dev/null
+++ b/rust/rox/src/keywords.rs
@@ -0,0 +1,26 @@
+use crate::token::TokenType::{self, *};
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+    pub static ref KEYWORDS: HashMap<String, TokenType> = {
+        let mut m = HashMap::new();
+        m.insert("and".into(), And);
+        m.insert("class".into(), Class);
+        m.insert("else".into(), Else);
+        m.insert("false".into(), False);
+        m.insert("for".into(), For);
+        m.insert("fun".into(), Fun);
+        m.insert("if".into(), If);
+        m.insert("nil".into(), Nil);
+        m.insert("or".into(), Or);
+        m.insert("print".into(), Print);
+        m.insert("return".into(), Return);
+        m.insert("super".into(), Super);
+        m.insert("this".into(), This);
+        m.insert("true".into(), True);
+        m.insert("var".into(), Var);
+        m.insert("while".into(), While);
+        m
+    };
+}
diff --git a/rust/rox/src/lib.rs b/rust/rox/src/lib.rs
index fe8e646..adf569e 100644
--- a/rust/rox/src/lib.rs
+++ b/rust/rox/src/lib.rs
@@ -1,6 +1,46 @@
-use std::path::Path;
+use std::{
+    fs::{self},
+    io::{self, Write},
+    path::Path,
+};
+
+use tracing::error;
 
 pub mod cli;
+pub mod keywords;
+pub mod scanner;
+pub mod token;
+pub mod tokenizer {
+    pub mod comment;
+    pub mod identifier;
+    pub mod interface;
+    pub mod lookahead;
+    pub mod newline;
+    pub mod number;
+    pub mod single_char;
+    pub mod string;
+    pub mod whitespace;
+}
 
-pub fn compile(source: &Path) {}
-pub fn repl() {}
+pub fn compile(source: &Path) -> Result<(), io::Error> {
+    let input = fs::read_to_string(source)?;
+    let _tokens = scanner::tokenize(&input);
+
+    Ok(())
+}
+
+pub fn repl() {
+    loop {
+        print!("> ");
+        let _ = io::stdout().flush();
+
+        let mut input = String::new();
+        match io::stdin().read_line(&mut input) {
+            // EOF (e.g. ctrl-d): leave the repl instead of looping forever
+            Ok(0) => break,
+            Ok(_) => {}
+            Err(e) => error!("{}", e),
+        }
+        let input = input.trim().to_string();
+        let _tokens = scanner::tokenize(&input);
+    }
+}
diff --git a/rust/rox/src/main.rs b/rust/rox/src/main.rs
index 4686b71..4c38f72 100644
--- a/rust/rox/src/main.rs
+++ b/rust/rox/src/main.rs
@@ -1,5 +1,8 @@
+use std::process::exit;
+
 use clap::Parser;
 use rox::cli::{Cli, Commands};
+use tracing::error;
 
 fn main() {
     if std::env::var_os("RUST_LOG").is_none() {
@@ -11,7 +14,21 @@
     match &cli.command {
         Commands::Compile(compile_config) => {
-            rox::compile(&compile_config.source);
+            if !compile_config.source.exists() {
+                error!(
+                    "{} does not exist",
+                    &compile_config.source.to_string_lossy()
+                );
+                exit(1);
+            }
+
+            if let Err(e) = rox::compile(&compile_config.source) {
+                error!(
+                    "failed to compile {}: {}",
+                    &compile_config.source.to_string_lossy(),
+                    e
+                );
+            }
         }
         Commands::Repl => {
             rox::repl();
         }
diff --git a/rust/rox/src/scanner.rs b/rust/rox/src/scanner.rs
new file mode 100644
index 0000000..4a94dc8
--- /dev/null
+++ b/rust/rox/src/scanner.rs
@@ -0,0 +1,60 @@
+use crate::{
+    token::{Token, TokenType::Eof},
+    tokenizer::{
+        comment::Comment, identifier::Identifier, interface::Tokenizer, lookahead::Lookahead,
+        newline::Newline, number::Number, single_char::SingleChar, string::String,
+        whitespace::Whitespace,
+    },
+};
+use lazy_static::lazy_static;
+use tracing::{debug, error};
+
+lazy_static! {
+    static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
+        Box::new(SingleChar),
+        Box::new(Whitespace),
+        Box::new(Newline),
+        Box::new(Lookahead),
+        Box::new(Comment),
+        Box::new(String),
+        Box::new(Number),
+        Box::new(Identifier),
+    ];
+}
+
+pub fn tokenize(source: &str) -> Vec<Token> {
+    let mut tokens: Vec<Token> = Vec::new();
+    let mut source_chars = source.char_indices().peekable();
+
+    let mut line = 1;
+    while let Some(c) = source_chars.next() {
+        let mut tokenizer_idx = 0;
+        let mut tokenizer_result = None;
+        while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
+            tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
+            tokenizer_idx += 1;
+        }
+
+        match tokenizer_result {
+            Some((line_advance, token)) => {
+                line += line_advance;
+
+                if let Some(token) = token {
+                    tokens.push(token);
+                }
+            }
+            None => {
+                error!("unexpected character: {}", c.1)
+            }
+        }
+    }
+    tokens.push(Token {
+        token_type: Eof,
+        lexeme: "".to_string(),
+        literal: None,
+        line,
+    });
+    debug!("{:?}", tokens);
+
+    tokens
+}
diff --git a/rust/rox/src/token.rs b/rust/rox/src/token.rs
new file mode 100644
index 0000000..192afa6
--- /dev/null
+++ b/rust/rox/src/token.rs
@@ -0,0 +1,76 @@
+use std::fmt::Display;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum TokenType {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One or two character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier,
+    String,
+    Number,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    False,
+    Fun,
+    For,
+    If,
+    Nil,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    True,
+    Var,
+    While,
+
+    Eof,
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum Literal {
+    String(String),
+    Number(f64),
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub struct Token {
+    pub token_type: TokenType,
+    pub lexeme: String,
+    pub literal: Option<Literal>,
+    pub line: usize,
+}
+
+impl Display for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}: {:?} {} {:?}",
+            self.line, self.token_type, self.lexeme, self.literal
+        )
+    }
+}
diff --git a/rust/rox/src/tokenizer/comment.rs b/rust/rox/src/tokenizer/comment.rs
new file mode 100644
index 0000000..0a1d4c2
--- /dev/null
+++ b/rust/rox/src/tokenizer/comment.rs
@@ -0,0 +1,39 @@
+use crate::token::TokenType::Slash;
+use std::{iter::Peekable, str::CharIndices};
+
+use crate::token::Token;
+
+use super::interface::Tokenizer;
+
+pub struct Comment;
+impl Tokenizer for Comment {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        match c.1 {
+            '/' => {
+                let (line_advance, token) = if chars.next_if(|(_, peek)| *peek == '/').is_some() {
+                    while chars.next_if(|(_, peek)| *peek != '\n').is_some() {}
+                    chars.next();
+                    (1, None)
+                } else {
+                    (
+                        0,
+                        Some(Token {
+                            token_type: Slash,
+                            lexeme: source[c.0..=c.0].to_string(),
+                            literal: None,
+                            line,
+                        }),
+                    )
+                };
+                Some((line_advance, token))
+            }
+            _ => None,
+        }
+    }
+}
diff --git a/rust/rox/src/tokenizer/identifier.rs b/rust/rox/src/tokenizer/identifier.rs
new file mode 100644
index 0000000..31952bf
--- /dev/null
+++ b/rust/rox/src/tokenizer/identifier.rs
@@ -0,0 +1,42 @@
+use crate::{
+    keywords::KEYWORDS,
+    token::{Token, TokenType},
+};
+use std::{iter::Peekable, str::CharIndices};
+
+use super::interface::Tokenizer;
+
+pub struct Identifier;
+impl Tokenizer for Identifier {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        if c.1.is_ascii_alphabetic() || c.1 == '_' {
+            let mut end_idx = c.0;
+            while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_ascii_alphanumeric() || *x == '_') {
+                end_idx = idx;
+            }
+
+            let lexeme = source[c.0..=end_idx].to_string();
+            let token_type = match KEYWORDS.get(&lexeme) {
+                Some(token_type) => *token_type,
+                None => TokenType::Identifier,
+            };
+            Some((
+                0,
+                Some(Token {
+                    token_type,
+                    lexeme,
+                    literal: None,
+                    line,
+                }),
+            ))
+        } else {
+            None
+        }
+    }
+}
diff --git a/rust/rox/src/tokenizer/interface.rs b/rust/rox/src/tokenizer/interface.rs
new file mode 100644
index 0000000..7b1e63b
--- /dev/null
+++ b/rust/rox/src/tokenizer/interface.rs
@@ -0,0 +1,13 @@
+use std::{iter::Peekable, str::CharIndices};
+
+use crate::token::Token;
+
+pub trait Tokenizer: Send + Sync {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)>;
+}
diff --git a/rust/rox/src/tokenizer/lookahead.rs b/rust/rox/src/tokenizer/lookahead.rs
new file mode 100644
index 0000000..6e0294e
--- /dev/null
+++ b/rust/rox/src/tokenizer/lookahead.rs
@@ -0,0 +1,91 @@
+use super::interface::Tokenizer;
+use crate::token::TokenType::*;
+use crate::token::{Token, TokenType};
+use lazy_static::lazy_static;
+use std::{collections::HashMap, iter::Peekable, str::CharIndices};
+
+struct LookaheadEntry {
+    default_token: TokenType,
+    lookahead_map: HashMap<char, TokenType>,
+}
+
+lazy_static! {
+    static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
+        let mut m = HashMap::new();
+
+        let mut bang_map = HashMap::new();
+        bang_map.insert('=', BangEqual);
+        m.insert(
+            '!',
+            LookaheadEntry {
+                default_token: Bang,
+                lookahead_map: bang_map,
+            },
+        );
+
+        let mut equal_map = HashMap::new();
+        equal_map.insert('=', EqualEqual);
+        m.insert(
+            '=',
+            LookaheadEntry {
+                default_token: Equal,
+                lookahead_map: equal_map,
+            },
+        );
+
+        let mut less_map = HashMap::new();
+        less_map.insert('=', LessEqual);
+        m.insert(
+            '<',
+            LookaheadEntry {
+                default_token: Less,
+                lookahead_map: less_map,
+            },
+        );
+
+        let mut greater_map = HashMap::new();
+        greater_map.insert('=', GreaterEqual);
+        m.insert(
+            '>',
+            LookaheadEntry {
+                default_token: Greater,
+                lookahead_map: greater_map,
+            },
+        );
+
+        m
+    };
+}
+
+pub struct Lookahead;
+impl Tokenizer for Lookahead {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        LOOKAHEAD_TOKENS.get(&c.1).map(|entry| {
+            let (lexeme, token_type) = if let Some(&(_, peeked)) = chars.peek() {
+                if let Some(&token_type) = entry.lookahead_map.get(&peeked) {
+                    chars.next();
+                    (source[c.0..=c.0 + 1].to_string(), token_type)
+                } else {
+                    (source[c.0..=c.0].to_string(), entry.default_token)
+                }
+            } else {
+                (source[c.0..=c.0].to_string(), entry.default_token)
+            };
+            (
+                0,
+                Some(Token {
+                    token_type,
+                    lexeme,
+                    literal: None,
+                    line,
+                }),
+            )
+        })
+    }
+}
diff --git a/rust/rox/src/tokenizer/newline.rs b/rust/rox/src/tokenizer/newline.rs
new file mode 100644
index 0000000..74827d3
--- /dev/null
+++ b/rust/rox/src/tokenizer/newline.rs
@@ -0,0 +1,19 @@
+use super::interface::Tokenizer;
+use crate::token::Token;
+use std::{iter::Peekable, str::CharIndices};
+
+pub struct Newline;
+impl Tokenizer for Newline {
+    fn run(
+        &self,
+        c: (usize, char),
+        _chars: &mut Peekable<CharIndices>,
+        _source: &str,
+        _line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        match c.1 {
+            '\n' => Some((1, None)),
+            _ => None,
+        }
+    }
+}
diff --git a/rust/rox/src/tokenizer/number.rs b/rust/rox/src/tokenizer/number.rs
new file mode 100644
index 0000000..b6766cd
--- /dev/null
+++ b/rust/rox/src/tokenizer/number.rs
@@ -0,0 +1,52 @@
+use crate::token::{Literal, Token, TokenType};
+use std::{iter::Peekable, str::CharIndices};
+use tracing::error;
+
+use super::interface::Tokenizer;
+
+pub struct Number;
+impl Tokenizer for Number {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        if c.1.is_ascii_digit() {
+            let mut end_idx = c.0;
+            while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_ascii_digit()) {
+                end_idx = idx;
+            }
+            // only consume the '.' when a digit follows it, so a trailing
+            // dot is not swallowed into the number
+            let mut lookahead = chars.clone();
+            if matches!(lookahead.next(), Some((_, '.')))
+                && matches!(lookahead.next(), Some((_, x)) if x.is_ascii_digit())
+            {
+                chars.next();
+                while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_ascii_digit()) {
+                    end_idx = idx;
+                }
+            }
+
+            let lexeme = source[c.0..=end_idx].to_string();
+            let token = match lexeme.parse::<f64>() {
+                Ok(literal) => Some(Token {
+                    token_type: TokenType::Number,
+                    lexeme,
+                    literal: Some(Literal::Number(literal)),
+                    line,
+                }),
+                Err(e) => {
+                    error!("failed to parse number: {e}");
+                    None
+                }
+            };
+
+            Some((0, token))
+        } else {
+            None
+        }
+    }
+}
diff --git a/rust/rox/src/tokenizer/single_char.rs b/rust/rox/src/tokenizer/single_char.rs
new file mode 100644
index 0000000..18e04b3
--- /dev/null
+++ b/rust/rox/src/tokenizer/single_char.rs
@@ -0,0 +1,50 @@
+use super::interface::Tokenizer;
+use crate::token::{
+    Token,
+    TokenType::{self, *},
+};
+use lazy_static::lazy_static;
+use std::{collections::HashMap, iter::Peekable, str::CharIndices};
+
+lazy_static! {
+    static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
+        let mut m = HashMap::new();
+        m.insert('(', LeftParen);
+        m.insert(')', RightParen);
+        m.insert('{', LeftBrace);
+        m.insert('}', RightBrace);
+        m.insert(',', Comma);
+        m.insert('.', Dot);
+        m.insert('-', Minus);
+        m.insert('+', Plus);
+        m.insert(';', Semicolon);
+        m.insert('*', Star);
+        m
+    };
+}
+
+pub struct SingleChar;
+impl Tokenizer for SingleChar {
+    fn run(
+        &self,
+        c: (usize, char),
+        _chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        SINGLE_CHAR_TOKENS.get(&c.1).map(|token_type| {
+            // slice only once the character is known to be in the map, so a
+            // multi-byte character never hits a non-char-boundary index
+            let lexeme = source[c.0..=c.0].to_string();
+            (
+                0,
+                Some(Token {
+                    token_type: *token_type,
+                    lexeme,
+                    literal: None,
+                    line,
+                }),
+            )
+        })
+    }
+}
diff --git a/rust/rox/src/tokenizer/string.rs b/rust/rox/src/tokenizer/string.rs
new file mode 100644
index 0000000..07d6309
--- /dev/null
+++ b/rust/rox/src/tokenizer/string.rs
@@ -0,0 +1,48 @@
+use super::interface::Tokenizer;
+use crate::token::{Literal, Token, TokenType};
+use std::{iter::Peekable, str::CharIndices};
+use tracing::error;
+
+pub struct String;
+impl Tokenizer for String {
+    fn run(
+        &self,
+        c: (usize, char),
+        chars: &mut Peekable<CharIndices>,
+        source: &str,
+        line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        match c.1 {
+            '"' => {
+                let mut lines = 0;
+                while let Some((_, c)) = chars.next_if(|(_, peek)| *peek != '"') {
+                    if c == '\n' {
+                        lines += 1;
+                    }
+                }
+
+                match chars.next_if(|(_, x)| *x == '"') {
+                    Some((end_idx, _)) => {
+                        let lexeme = source[c.0..=end_idx].to_string();
+                        let literal = source[c.0 + 1..end_idx].to_string();
+
+                        Some((
+                            lines,
+                            Some(Token {
+                                token_type: TokenType::String,
+                                lexeme,
+                                literal: Some(Literal::String(literal)),
+                                line,
+                            }),
+                        ))
+                    }
+                    None => {
+                        error!("unterminated string");
+                        None
+                    }
+                }
+            }
+            _ => None,
+        }
+    }
+}
diff --git a/rust/rox/src/tokenizer/whitespace.rs b/rust/rox/src/tokenizer/whitespace.rs
new file mode 100644
index 0000000..a47f5c2
--- /dev/null
+++ b/rust/rox/src/tokenizer/whitespace.rs
@@ -0,0 +1,19 @@
+use super::interface::Tokenizer;
+use crate::token::Token;
+use std::{iter::Peekable, str::CharIndices};
+
+pub struct Whitespace;
+impl Tokenizer for Whitespace {
+    fn run(
+        &self,
+        c: (usize, char),
+        _chars: &mut Peekable<CharIndices>,
+        _source: &str,
+        _line: usize,
+    ) -> Option<(usize, Option<Token>)> {
+        match c.1 {
+            ' ' | '\r' | '\t' => Some((0, None)),
+            _ => None,
+        }
+    }
+}
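
Not part of the commit above: a quick sketch of how the new scanner entry point could be exercised, for example as a test module at the bottom of src/scanner.rs. The sample source and the assertions are illustrative, not taken from the diff; they only rely on the public Token/TokenType/Literal types and the tokenizer order defined above.

#[cfg(test)]
mod tests {
    use super::tokenize;
    use crate::token::{Literal, TokenType};

    #[test]
    fn tokenizes_a_simple_statement() {
        // each character is claimed by the first matching tokenizer;
        // an Eof token is always appended at the end
        let tokens = tokenize("var answer = 42.5;\n// a comment\n");

        let types: Vec<TokenType> = tokens.iter().map(|t| t.token_type).collect();
        assert_eq!(
            types,
            vec![
                TokenType::Var,
                TokenType::Identifier,
                TokenType::Equal,
                TokenType::Number,
                TokenType::Semicolon,
                TokenType::Eof,
            ]
        );
        // the Number tokenizer stores the parsed f64 as the token's literal
        assert_eq!(tokens[3].literal, Some(Literal::Number(42.5)));
        // the newline and comment tokenizers advanced the line counter
        assert_eq!(tokens.last().unwrap().line, 3);
    }
}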