initial rust scanner

Sebastian Hugentobler 2025-02-06 18:55:54 +01:00
parent 818816d16d
commit 3cf8ef02d1
16 changed files with 610 additions and 4 deletions

rust/rox/Cargo.lock generated

@@ -247,6 +247,8 @@ name = "rox"
version = "0.1.0"
dependencies = [
"clap",
"lazy_static",
"thiserror",
"tracing",
"tracing-subscriber",
]
@@ -283,6 +285,26 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.8"

rust/rox/Cargo.toml

@@ -7,5 +7,7 @@ authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
[dependencies]
clap = { version = "4.5.28", features = ["derive"] }
lazy_static = "1.5.0"
thiserror = "2.0.11"
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }

rust/rox/src/keywords.rs Normal file

@@ -0,0 +1,25 @@
use crate::token::TokenType::{self, *};
use lazy_static::lazy_static;
use std::collections::HashMap;
lazy_static! {
pub static ref KEYWORDS: HashMap<String, TokenType> = {
let mut m = HashMap::new();
m.insert("and".into(), And);
m.insert("class".into(), Class);
m.insert("else".into(), Else);
m.insert("false".into(), False);
m.insert("for".into(), For);
m.insert("fun".into(), Fun);
m.insert("if".into(), If);
m.insert("nil".into(), Nil);
m.insert("or".into(), Or);
m.insert("print".into(), Print);
m.insert("return".into(), Return);
m.insert("super".into(), Super);
m.insert("true".into(), True);
m.insert("var".into(), Var);
m.insert("while".into(), While);
m
};
}

rust/rox/src/lib.rs

@@ -1,6 +1,45 @@
-use std::path::Path;
use std::{
fs::{self},
io::{self, Write},
path::Path,
};
use tracing::error;
pub mod cli;
pub mod keywords;
pub mod scanner;
pub mod token;
pub mod tokenizer {
pub mod comment;
pub mod identifier;
pub mod interface;
pub mod lookahead;
pub mod newline;
pub mod number;
pub mod single_char;
pub mod string;
pub mod whitespace;
}
pub fn compile(source: &Path) {}
pub fn repl() {}
pub fn compile(source: &Path) -> Result<(), io::Error> {
let input = fs::read_to_string(source)?;
let _tokens = scanner::tokenize(&input);
Ok(())
}
pub fn repl() {
loop {
print!("> ");
let _ = io::stdout().flush();
let mut input = String::new();
match io::stdin().read_line(&mut input) {
Ok(_) => {}
Err(e) => error!("{}", e),
}
let input = input.trim().to_string();
let _tokens = scanner::tokenize(&input);
}
}

rust/rox/src/main.rs

@@ -1,5 +1,8 @@
use std::process::exit;
use clap::Parser;
use rox::cli::{Cli, Commands};
use tracing::error;
fn main() {
if std::env::var_os("RUST_LOG").is_none() {
@@ -11,7 +14,21 @@ fn main() {
match &cli.command {
Commands::Compile(compile_config) => {
rox::compile(&compile_config.source);
if !&compile_config.source.exists() {
error!(
"{} does not exist",
&compile_config.source.to_string_lossy()
);
exit(1);
}
if let Err(e) = rox::compile(&compile_config.source) {
error!(
"failed to compile {}: {}",
&compile_config.source.to_string_lossy(),
e
);
}
}
Commands::Repl => {
rox::repl();

rust/rox/src/scanner.rs Normal file

@@ -0,0 +1,60 @@
use crate::{
token::{Token, TokenType::Eof},
tokenizer::{
comment::Comment, identifier::Identifier, interface::Tokenizer, lookahead::Lookahead,
newline::Newline, number::Number, single_char::SingleChar, string::String,
whitespace::Whitespace,
},
};
use lazy_static::lazy_static;
use tracing::{debug, error};
lazy_static! {
static ref TOKENIZERS: Vec<Box<dyn Tokenizer>> = vec![
Box::new(SingleChar),
Box::new(Whitespace),
Box::new(Newline),
Box::new(Lookahead),
Box::new(Comment),
Box::new(String),
Box::new(Number),
Box::new(Identifier),
];
}
pub fn tokenize(source: &str) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new();
let mut source_chars = source.char_indices().peekable();
let mut line = 1;
while let Some(c) = source_chars.next() {
let mut tokenizer_idx = 0;
let mut tokenizer_result = None;
while tokenizer_idx < TOKENIZERS.len() && tokenizer_result.is_none() {
tokenizer_result = TOKENIZERS[tokenizer_idx].run(c, &mut source_chars, source, line);
tokenizer_idx += 1;
}
match tokenizer_result {
Some((line_advance, token)) => {
line += line_advance;
if let Some(token) = token {
tokens.push(token);
}
}
None => {
error!("unexpected character: {}", c.1)
}
}
}
tokens.push(Token {
token_type: Eof,
lexeme: "".to_string(),
literal: None,
line,
});
debug!("{:?}", tokens);
tokens
}
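
For orientation, a sketch of how a caller can drive the scanner; only rox::scanner::tokenize and the token types come from this commit, the driver itself is hypothetical:

fn main() {
    // "var x = 1;" scans to Var, Identifier, Equal, Number, Semicolon, Eof
    let tokens = rox::scanner::tokenize("var x = 1;");
    for token in &tokens {
        println!("{}", token); // Display impl from token.rs
    }
}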

rust/rox/src/token.rs Normal file

@@ -0,0 +1,76 @@
use std::fmt::Display;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TokenType {
// Single-character tokens.
LeftParen,
RightParen,
LeftBrace,
RightBrace,
Comma,
Dot,
Minus,
Plus,
Semicolon,
Slash,
Star,
// One or two character tokens.
Bang,
BangEqual,
Equal,
EqualEqual,
Greater,
GreaterEqual,
Less,
LessEqual,
// Literals.
Identifier,
String,
Number,
// Keywords.
And,
Class,
Else,
False,
Fun,
For,
If,
Nil,
Or,
Print,
Return,
Super,
This,
True,
Var,
While,
Eof,
}
#[derive(Clone, Debug, PartialEq)]
pub enum Literal {
String(String),
Number(f64),
}
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
pub token_type: TokenType,
pub lexeme: String,
pub literal: Option<Literal>,
pub line: usize,
}
impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}: {:?} {} {:?}",
self.line, self.token_type, self.lexeme, self.literal
)
}
}
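
Reading the format string above, a number token scanned from `123` on line 1 would display as (illustrative):

1: Number 123 Some(Number(123.0))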

rust/rox/src/tokenizer/comment.rs Normal file

@@ -0,0 +1,39 @@
use crate::token::TokenType::Slash;
use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;
use super::interface::Tokenizer;
pub struct Comment;
impl Tokenizer for Comment {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
match c.1 {
'/' => {
let (line_advance, token) = if chars.next_if(|(_, peek)| *peek == '/').is_some() {
while chars.next_if(|(_, peek)| *peek != '\n').is_some() {}
chars.next();
(1, None)
} else {
(
0,
Some(Token {
token_type: Slash,
lexeme: source[c.0..=c.0].to_string(),
literal: None,
line,
}),
)
};
Some((line_advance, token))
}
_ => None,
}
}
}

rust/rox/src/tokenizer/identifier.rs Normal file

@@ -0,0 +1,42 @@
use crate::{
keywords::KEYWORDS,
token::{Token, TokenType},
};
use std::{iter::Peekable, str::CharIndices};
use super::interface::Tokenizer;
pub struct Identifier;
impl Tokenizer for Identifier {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
if c.1.is_alphabetic() || c.1 == '_' {
let mut end_idx = c.0;
while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_alphanumeric() || *x == '_') {
end_idx = idx;
}
let lexeme = source[c.0..=end_idx].to_string();
let token_type = match KEYWORDS.get(&lexeme) {
Some(token_type) => *token_type,
None => TokenType::Identifier,
};
Some((
0,
Some(Token {
token_type,
lexeme,
literal: None,
line,
}),
))
} else {
None
}
}
}

rust/rox/src/tokenizer/interface.rs Normal file

@@ -0,0 +1,13 @@
use std::{iter::Peekable, str::CharIndices};
use crate::token::Token;
pub trait Tokenizer: Send + Sync {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)>;
}
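
The contract: run returns None when the character is not this tokenizer's to handle, and Some((line_advance, maybe_token)) once it has consumed input. A minimal conforming implementation as a sketch; the HashComment tokenizer is hypothetical and not part of this commit:

use std::{iter::Peekable, str::CharIndices};

use crate::token::Token;
use crate::tokenizer::interface::Tokenizer;

/// Hypothetical: treats `#` as a comment running to end of line.
pub struct HashComment;

impl Tokenizer for HashComment {
    fn run(
        &self,
        c: (usize, char),
        chars: &mut Peekable<CharIndices<'_>>,
        _source: &str,
        _line: usize,
    ) -> Option<(usize, Option<Token>)> {
        if c.1 != '#' {
            return None; // not ours, let the next tokenizer try
        }
        // stop before the newline so the Newline tokenizer counts it
        while chars.next_if(|(_, peek)| *peek != '\n').is_some() {}
        Some((0, None)) // input consumed, no token produced
    }
}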

rust/rox/src/tokenizer/lookahead.rs Normal file

@@ -0,0 +1,91 @@
use super::interface::Tokenizer;
use crate::token::TokenType::*;
use crate::token::{Token, TokenType};
use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
struct LookaheadEntry {
default_token: TokenType,
lookahead_map: HashMap<char, TokenType>,
}
lazy_static! {
static ref LOOKAHEAD_TOKENS: HashMap<char, LookaheadEntry> = {
let mut m = HashMap::new();
let mut bang_map = HashMap::new();
bang_map.insert('=', BangEqual);
m.insert(
'!',
LookaheadEntry {
default_token: Bang,
lookahead_map: bang_map,
},
);
let mut equal_map = HashMap::new();
equal_map.insert('=', EqualEqual);
m.insert(
'=',
LookaheadEntry {
default_token: Equal,
lookahead_map: equal_map,
},
);
let mut less_map = HashMap::new();
less_map.insert('=', LessEqual);
m.insert(
'<',
LookaheadEntry {
default_token: Less,
lookahead_map: less_map,
},
);
let mut greater_map = HashMap::new();
greater_map.insert('=', GreaterEqual);
m.insert(
'>',
LookaheadEntry {
default_token: Greater,
lookahead_map: greater_map,
},
);
m
};
}
pub struct Lookahead;
impl Tokenizer for Lookahead {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
LOOKAHEAD_TOKENS.get(&c.1).map(|entry| {
let (lexeme, token_type) = if let Some(&(_, peeked)) = chars.peek() {
if let Some(&token_type) = entry.lookahead_map.get(&peeked) {
chars.next();
(source[c.0..=c.0 + 1].to_string(), token_type)
} else {
(source[c.0..=c.0].to_string(), entry.default_token)
}
} else {
(source[c.0..=c.0].to_string(), entry.default_token)
};
(
0,
Some(Token {
token_type,
lexeme,
literal: None,
line,
}),
)
})
}
}
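
A hypothetical test (not in this commit) pinning down the one-or-two-character behavior:

#[test]
fn scans_one_or_two_char_operators() {
    use rox::token::TokenType::*;
    let tokens = rox::scanner::tokenize("<= < == !=");
    let types: Vec<_> = tokens.iter().map(|t| t.token_type).collect();
    assert_eq!(types, vec![LessEqual, Less, EqualEqual, BangEqual, Eof]);
}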

rust/rox/src/tokenizer/newline.rs Normal file

@@ -0,0 +1,19 @@
use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
pub struct Newline;
impl Tokenizer for Newline {
fn run(
&self,
c: (usize, char),
_chars: &mut Peekable<CharIndices<'_>>,
_source: &str,
_line: usize,
) -> Option<(usize, Option<Token>)> {
match c.1 {
'\n' => Some((1, None)),
_ => None,
}
}
}

rust/rox/src/tokenizer/number.rs Normal file

@@ -0,0 +1,45 @@
use crate::token::{Literal, Token, TokenType};
use std::{iter::Peekable, str::CharIndices};
use tracing::error;
use super::interface::Tokenizer;
pub struct Number;
impl Tokenizer for Number {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
if c.1.is_ascii_digit() {
let mut end_idx = c.0;
while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_ascii_digit()) {
end_idx = idx;
}
// consume the '.' only when a digit follows, so `12.` scans as a
// Number and a Dot; Peekable cannot look two ahead, so probe a clone
let mut ahead = chars.clone();
if matches!(ahead.next(), Some((_, '.')))
&& matches!(ahead.peek(), Some((_, x)) if x.is_ascii_digit())
{
chars.next();
while let Some((idx, _)) = chars.next_if(|(_, x)| x.is_ascii_digit()) {
end_idx = idx;
}
}
let lexeme = source[c.0..=end_idx].to_string();
let token = match lexeme.parse::<f64>() {
Ok(literal) => Some(Token {
token_type: TokenType::Number,
lexeme,
literal: Some(Literal::Number(literal)),
line,
}),
Err(e) => {
error!("failed to parse number: {e}");
None
}
};
Some((0, token))
} else {
None
}
}
}
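
Expected behavior of the fractional-part probe, for illustration:

// "12.5"  -> Number(12.5)
// "12."   -> Number(12.0), then Dot from the single-char tokenizer
// "12.ab" -> Number(12.0), Dot, Identifier("ab")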

rust/rox/src/tokenizer/single_char.rs Normal file

@@ -0,0 +1,49 @@
use super::interface::Tokenizer;
use crate::token::{
Token,
TokenType::{self, *},
};
use lazy_static::lazy_static;
use std::{collections::HashMap, iter::Peekable, str::CharIndices};
lazy_static! {
static ref SINGLE_CHAR_TOKENS: HashMap<char, TokenType> = {
let mut m = HashMap::new();
m.insert('(', LeftParen);
m.insert(')', RightParen);
m.insert('{', LeftBrace);
m.insert('}', RightBrace);
m.insert(',', Comma);
m.insert('.', Dot);
m.insert('-', Minus);
m.insert('+', Plus);
m.insert(';', Semicolon);
m.insert('*', Star);
m
};
}
pub struct SingleChar;
impl Tokenizer for SingleChar {
fn run(
&self,
c: (usize, char),
_chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
SINGLE_CHAR_TOKENS.get(&c.1).map(|token_type| {
// build the lexeme from the char itself: slicing source[c.0..=c.0]
// unconditionally would panic when a multi-byte character starts at c.0
let lexeme = c.1.to_string();
(
0,
Some(Token {
token_type: *token_type,
lexeme,
literal: None,
line,
}),
)
})
}
}

rust/rox/src/tokenizer/string.rs Normal file

@@ -0,0 +1,48 @@
use super::interface::Tokenizer;
use crate::token::{Literal, Token, TokenType};
use std::{iter::Peekable, str::CharIndices};
use tracing::error;
pub struct String;
impl Tokenizer for String {
fn run(
&self,
c: (usize, char),
chars: &mut Peekable<CharIndices<'_>>,
source: &str,
line: usize,
) -> Option<(usize, Option<Token>)> {
match c.1 {
'"' => {
let mut lines = 0;
while let Some((_, c)) = chars.next_if(|(_, peek)| *peek != '"') {
if c == '\n' {
lines += 1;
}
}
match chars.next_if(|(_, x)| *x == '"') {
Some((end_idx, _)) => {
let lexeme = source[c.0..=end_idx].to_string();
let literal = source[c.0 + 1..end_idx].to_string();
Some((
lines,
Some(Token {
token_type: TokenType::String,
lexeme,
literal: Some(Literal::String(literal)),
line,
}),
))
}
None => {
error!("unterminated string");
None
}
}
}
_ => None,
}
}
}

rust/rox/src/tokenizer/whitespace.rs Normal file

@@ -0,0 +1,19 @@
use super::interface::Tokenizer;
use crate::token::Token;
use std::{iter::Peekable, str::CharIndices};
pub struct Whitespace;
impl Tokenizer for Whitespace {
fn run(
&self,
c: (usize, char),
_chars: &mut Peekable<CharIndices<'_>>,
_source: &str,
_line: usize,
) -> Option<(usize, Option<Token>)> {
match c.1 {
' ' | '\r' | '\t' => Some((0, None)),
_ => None,
}
}
}