use std::{fmt::{Debug, Display}, error::Error, cmp::min, str::FromStr}; use lazy_static::lazy_static; use regex::Regex; // ex. UnknownToken: ..., at [char-14] static MAX_LEN_OF_ERROR_STRING_SHOWN: usize = 10; lazy_static! { static ref NUMBER_RE: Regex = Regex::new(r"^\d+").unwrap(); static ref STRING_RE: Regex = Regex::new(r#"^"[^"]*""#).unwrap(); static ref NONE_RE: Regex = Regex::new(r#"^none"#).unwrap(); static ref BOOL_RE: Regex = Regex::new(r#"^(yes|no)"#).unwrap(); static ref IDENT_RE: Regex = Regex::new(r#"^[a-zA-z][a-zA-z\d]+"#).unwrap(); static ref MATH_OP_RE: Regex = Regex::new(r#"^(\+|\-|\*|/)"#).unwrap(); static ref BITW_OP_RE: Regex = Regex::new(r#"^(\^|&|\||~)"#).unwrap(); static ref KEYW_RE: Regex = Regex::new(r#"^(not|is not|is|also|or|do|if|while|job|with info|sayhello)"#).unwrap(); static ref STMT_BLK_OPN_RE: Regex = Regex::new(r#"^#"#).unwrap(); static ref STMT_BLK_CLS_RE: Regex = Regex::new(r#"^>#"#).unwrap(); static ref PAREN_OPN_RE: Regex = Regex::new(r#"^\("#).unwrap(); static ref PAREN_CLS_RE: Regex = Regex::new(r#"^\)"#).unwrap(); static ref COMMENT_RE: Regex = Regex::new(r#"^\-\-"#).unwrap(); static ref WHTSPCE_RE: Regex = Regex::new(r#"^\s+"#).unwrap(); } macro_rules! token_match_regex { ($re:expr, $token:expr, $t_buf:expr) => { if let Some(m) = $re.find($t_buf) { return Some(( $token, m.end() - m.start() )); } }; } #[derive(Debug)] pub enum Token<'a> { None, NumericLiteral { value: i64 }, StringLiteral { value: &'a str }, Bool { value: bool }, Identifier { value: &'a str }, MathOperator { value: &'a str }, BitwiseOperator{ value: &'a str }, Keyword { value: &'a str }, Whitespace { value: &'a str }, Comment, StmtBlockOpen, StmtBlockClose, ParenOpen, ParenClose, } pub struct TokenIterator<'a> { source: &'a str, cursor: usize, } impl<'a> TokenIterator<'a> { pub fn new(source: &'a str) -> Self { Self { source: source.trim(), cursor: 0 } } fn match_token(&self, t_buf: &'a str) -> Option<(Token<'a>, usize)> { if t_buf.is_empty() { return None }; if let Some(m) = WHTSPCE_RE.find(t_buf) { return Some(( Token::Whitespace { value: m.as_str() }, m.end() - m.start() )); }; token_match_regex!(COMMENT_RE, Token::Comment, t_buf); // if let Some(m) = COMMENT_RE.find(t_buf) { // return Some(( // Token::Comment, // m.end() - m.start() // )); // }; if let Some(m) = KEYW_RE.find(t_buf) { return Some(( Token::Keyword { value: m.as_str() }, m.end() - m.start() )); }; if let Some(m) = NUMBER_RE.find(t_buf) { return Some(( Token::NumericLiteral { value: m.as_str().parse().ok()? }, m.end() - m.start() )); }; if let Some(m) = STRING_RE.find(t_buf) { return Some(( Token::StringLiteral { value: &m.as_str()[m.start()+1..m.end()-1] }, m.end() - m.start() )); }; if let Some(m) = NONE_RE.find(t_buf) { return Some(( Token::None, m.end() - m.start() )); }; if let Some(m) = BOOL_RE.find(t_buf) { return Some(( Token::Bool { value: match m.as_str() {"yes"=>true,"no"=>false,_=>None?} }, m.end() - m.start() )); }; if let Some(m) = IDENT_RE.find(t_buf) { return Some(( Token::Identifier { value: m.as_str() }, m.end() - m.start() )); }; if let Some(m) = MATH_OP_RE.find(t_buf) { return Some(( Token::MathOperator { value: m.as_str() }, m.end() - m.start() )); }; if let Some(m) = BITW_OP_RE.find(t_buf) { return Some(( Token::BitwiseOperator { value: m.as_str() }, m.end() - m.start() )); }; if let Some(m) = STMT_BLK_OPN_RE.find(t_buf) { return Some(( Token::StmtBlockOpen, m.end() - m.start() )); }; if let Some(m) = STMT_BLK_CLS_RE.find(t_buf) { return Some(( Token::StmtBlockClose, m.end() - m.start() )); }; if let Some(m) = PAREN_OPN_RE.find(t_buf) { return Some(( Token::ParenOpen, m.end() - m.start() )); }; if let Some(m) = PAREN_CLS_RE.find(t_buf) { return Some(( Token::ParenClose, m.end() - m.start() )); }; None } } impl<'a> Iterator for TokenIterator<'a> { type Item = Result, LexerError>; fn next(&mut self) -> Option { if self.cursor >= self.source.len() { return None; } if let Some((tok, tok_len)) = self.match_token( &self.source[self.cursor..] ) { self.cursor += tok_len; return Some(Ok(tok)); } return Some(Err( LexerError::UnknownToken { value: self.source[ self.cursor.. self.cursor + min( self.source.len()-self.cursor, MAX_LEN_OF_ERROR_STRING_SHOWN ) ] .to_string(), loc: self.cursor } )); } } pub struct Lexer<'a> { pub token_generator: TokenIterator<'a> } pub enum LexerError { UnknownToken { value: String, loc: usize, } } impl Display for LexerError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { LexerError::UnknownToken{value, loc} => { write!(f, "UnknownToken: {{{}}} -> at [char-{}]", value.escape_debug(), loc) } } } } impl Debug for LexerError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f,"{}",self) } } impl Error for LexerError {} impl<'a> Lexer<'a> { pub fn new(code: &'a str) -> Self { Self { token_generator: TokenIterator::new(code) } } pub fn get_token(&mut self) -> Option, LexerError>> { self.token_generator.next() } // fn match_bool() {} }