From ae959f7768ab08fc324624ab64067a2f793705cc Mon Sep 17 00:00:00 2001 From: Sebastian Hugentobler Date: Mon, 10 Feb 2025 14:36:55 +0100 Subject: [PATCH] parser in rust --- rust/rox/src/expression.rs | 20 +++ rust/rox/src/lib.rs | 16 +- rust/rox/src/parser.rs | 299 +++++++++++++++++++++++++++++++++++++ rust/rox/src/scanner.rs | 3 +- rust/rox/src/token.rs | 4 + 5 files changed, 337 insertions(+), 5 deletions(-) create mode 100644 rust/rox/src/expression.rs create mode 100644 rust/rox/src/parser.rs diff --git a/rust/rox/src/expression.rs b/rust/rox/src/expression.rs new file mode 100644 index 0000000..79f0082 --- /dev/null +++ b/rust/rox/src/expression.rs @@ -0,0 +1,20 @@ +use crate::token::{self, Token}; + +#[derive(Debug, Clone, PartialEq)] +pub enum Expression { + Binary { + left: Box, + operator: Token, + right: Box, + }, + Grouping { + expression: Box, + }, + Literal { + value: token::Literal, + }, + Unary { + operator: Token, + right: Box, + }, +} diff --git a/rust/rox/src/lib.rs b/rust/rox/src/lib.rs index 9b2becd..975ba85 100644 --- a/rust/rox/src/lib.rs +++ b/rust/rox/src/lib.rs @@ -7,10 +7,12 @@ use std::{ path::Path, }; -use tracing::error; +use tracing::{error, info}; pub mod cli; +pub mod expression; pub mod keywords; +pub mod parser; pub mod scanner; pub mod token; pub mod tokenizer { @@ -28,7 +30,11 @@ pub mod tokenizer { /// Read the source code in a file and scan it to tokens. pub fn compile(source: &Path) -> Result<(), io::Error> { let input = fs::read_to_string(source)?; - let _tokens = scanner::tokenize(&input); + let tokens = scanner::tokenize(&input); + match parser::generate_ast(tokens) { + Ok(ast) => info!("{ast:?}"), + Err(e) => error!("{e}"), + } Ok(()) } @@ -45,6 +51,10 @@ pub fn repl() { Err(e) => error!("{}", e), } let input = input.trim().to_string(); - let _tokens = scanner::tokenize(&input); + let tokens = scanner::tokenize(&input); + match parser::generate_ast(tokens) { + Ok(ast) => info!("{ast:?}"), + Err(e) => error!("{e}"), + } } } diff --git a/rust/rox/src/parser.rs b/rust/rox/src/parser.rs new file mode 100644 index 0000000..dbd8dae --- /dev/null +++ b/rust/rox/src/parser.rs @@ -0,0 +1,299 @@ +use thiserror::Error; +use tracing::error; + +use crate::{ + expression::Expression, + token::{ + self, Token, + TokenType::{self, *}, + }, +}; + +#[derive(Error, Debug)] +pub enum ParserError { + #[error("empty token stream")] + NoTokens, + #[error("line {0}: expected expression")] + ExpressionExpected(usize), + #[error("line {0}: expected ')' after expression.")] + ParenAfterExpression(usize), + #[error("Out of bounds access at index {0}.")] + OutOfBoundsAccess(usize), + #[error("line {0}: literal expected.")] + LiteralExpected(usize), +} + +/// Parse the Lox language. +#[derive(Debug, Clone)] +struct Parser { + current: usize, + current_token: Token, + tokens: Vec, +} + +impl Parser { + /// Create a new parser instance, fail if the tokens vector is empty. + fn new(tokens: Vec) -> Result { + let current_token = tokens.first().ok_or(ParserError::NoTokens)?.clone(); + + Ok(Self { + current: 0, + current_token, + tokens, + }) + } + + /// Check if any of the provided types match the type of the current token. + /// + /// If so, advance the current token. + fn matches(&mut self, types: &[TokenType]) -> bool { + let matches = types.iter().any(|x| self.check(x)); + matches.then(|| self.advance()); + matches + } + + /// Return true if the current token type matches the match_type, false otherwise. + fn check(&self, match_type: &TokenType) -> bool { + self.current_token.token_type == *match_type + } + + /// Advance the current token if we have not hit Eof yet. + /// + /// Return the token before the advancement. + fn advance(&mut self) -> Result<&Token, ParserError> { + if !self.is_at_end() { + self.current += 1; + self.current_token = self + .tokens + .get(self.current) + .ok_or(ParserError::OutOfBoundsAccess(self.current))? + .clone(); + } + + self.previous() + } + + /// Return true if the current token is Eof, false otherwise. + fn is_at_end(&self) -> bool { + self.current_token.token_type == Eof + } + + /// Return the token before the current one or an error if there is none. + fn previous(&self) -> Result<&Token, ParserError> { + self.tokens + .get(self.current - 1) + .ok_or_else(|| ParserError::OutOfBoundsAccess(self.current - 1)) + } + + /// Consume the current token if its token type matches the provided token_type and advance the + /// current token. Otherwise return None.. + fn consume(&mut self, token_type: &TokenType) -> Option<&Token> { + if self.check(token_type) { + self.advance().ok() + } else { + None + } + } + + /// Parse a binary expression using the next_precedence function and operators to match. + fn binary_expr( + &mut self, + next_precedence: impl Fn(&mut Self) -> Result, + operators: &[TokenType], + ) -> Result { + let mut expr = next_precedence(self)?; + + while self.matches(operators) { + let operator = self.previous()?.clone(); + let right = next_precedence(self)?; + expr = Expression::Binary { + left: Box::new(expr.clone()), + operator, + right: Box::new(right), + }; + } + Ok(expr) + } + + /// expression -> equality ; + fn expression(&mut self) -> Result { + self.equality() + } + + /// equality -> comparison ( ( "!=" | "==" ) comparison )* ; + fn equality(&mut self) -> Result { + self.binary_expr(Self::comparison, &[BangEqual, EqualEqual]) + } + + /// comparison -> term ( ( ">" | ">=" | "<" | "<=" ) term )* ; + fn comparison(&mut self) -> Result { + self.binary_expr(Self::term, &[Greater, GreaterEqual, Less, LessEqual]) + } + + /// term -> factor ( ( "-" | "+" ) factor )* ; + fn term(&mut self) -> Result { + self.binary_expr(Self::factor, &[Minus, Plus]) + } + + /// factor -> unary ( ( "/" | "*" ) unary )* ; + fn factor(&mut self) -> Result { + self.binary_expr(Self::unary, &[Slash, Star]) + } + + /// unary -> ( "!" | "-" ) unary | primary ; + fn unary(&mut self) -> Result { + if self.matches(&[Bang, Minus]) { + let operator = self.previous()?.clone(); + let right = self.unary()?; + + Ok(Expression::Unary { + operator, + right: Box::new(right), + }) + } else { + self.primary() + } + } + + /// primary -> NUMBER | STRING | "true" | "false" | "nil" | "(" expression ")" ; + fn primary(&mut self) -> Result { + if self.matches(&[False]) { + Ok(Expression::Literal { + value: token::Literal::Boolean(false), + }) + } else if self.matches(&[True]) { + Ok(Expression::Literal { + value: token::Literal::Boolean(true), + }) + } else if self.matches(&[Nil]) { + Ok(Expression::Literal { + value: token::Literal::Nil, + }) + } else if self.matches(&[Number, String]) { + let prev = self.previous()?; + let value = prev + .literal + .clone() + .ok_or(ParserError::LiteralExpected(prev.line))?; + + Ok(Expression::Literal { value }) + } else if self.matches(&[LeftParen]) { + let expr = self.expression()?; + let line = self.current_token.line; + self.consume(&RightParen) + .ok_or(ParserError::ParenAfterExpression(line))?; + + Ok(Expression::Grouping { + expression: Box::new(expr), + }) + } else { + let prev = self.previous()?; + Err(ParserError::ExpressionExpected(prev.line)) + } + } +} + +/// Try to parse the provided tokens into an AST. +pub fn generate_ast(tokens: Vec) -> Result { + let mut parser = Parser::new(tokens)?; + parser.expression() +} + +#[cfg(test)] +mod tests { + use crate::{ + expression::Expression, + token::{Literal, Token, TokenType}, + }; + + use super::generate_ast; + + #[test] + fn simple_expression() { + let ast = generate_ast(vec![ + Token { + token_type: TokenType::Number, + lexeme: "3".into(), + literal: Some(Literal::Number(3.0)), + line: 1, + }, + Token { + token_type: TokenType::Star, + lexeme: "*".into(), + literal: None, + line: 1, + }, + Token { + token_type: TokenType::Number, + lexeme: "4".into(), + literal: Some(Literal::Number(4.0)), + line: 1, + }, + Token { + token_type: TokenType::Plus, + lexeme: "+".into(), + literal: None, + line: 1, + }, + Token { + token_type: TokenType::Number, + lexeme: "2".into(), + literal: Some(Literal::Number(2.0)), + line: 1, + }, + Token { + token_type: TokenType::Star, + lexeme: "*".into(), + literal: None, + line: 1, + }, + Token { + token_type: TokenType::Number, + lexeme: "6".into(), + literal: Some(Literal::Number(6.0)), + line: 1, + }, + ]) + .unwrap(); + + assert_eq!( + ast, + Expression::Binary { + left: Box::new(Expression::Binary { + left: Box::new(Expression::Literal { + value: Literal::Number(3.0) + }), + operator: Token { + token_type: TokenType::Star, + lexeme: "*".into(), + literal: None, + line: 1 + }, + right: Box::new(Expression::Literal { + value: Literal::Number(4.0) + }) + }), + operator: Token { + token_type: TokenType::Plus, + lexeme: "+".into(), + literal: None, + line: 1 + }, + right: Box::new(Expression::Binary { + left: Box::new(Expression::Literal { + value: Literal::Number(2.0) + }), + operator: Token { + token_type: TokenType::Star, + lexeme: "*".into(), + literal: None, + line: 1 + }, + right: Box::new(Expression::Literal { + value: Literal::Number(6.0) + }) + }) + } + ) + } +} diff --git a/rust/rox/src/scanner.rs b/rust/rox/src/scanner.rs index f9655c3..01bd42c 100644 --- a/rust/rox/src/scanner.rs +++ b/rust/rox/src/scanner.rs @@ -9,7 +9,7 @@ use crate::{ }, }; use lazy_static::lazy_static; -use tracing::{debug, error}; +use tracing::error; lazy_static! { /// Tokenizers to use in scanning. They are tried in the exact order in which they appear in @@ -58,7 +58,6 @@ pub fn tokenize(source: &str) -> Vec { literal: None, line, }); - debug!("{:?}", tokens); tokens } diff --git a/rust/rox/src/token.rs b/rust/rox/src/token.rs index a286bb3..47b8cf8 100644 --- a/rust/rox/src/token.rs +++ b/rust/rox/src/token.rs @@ -61,6 +61,10 @@ pub enum Literal { String(String), // Number literal, represented as f64 (thus it can be decimal). Number(f64), + /// Boolean literal. + Boolean(bool), + /// Null literal. + Nil, } /// Consumed token.