|
@@ -0,0 +1,210 @@
|
|
|
+use std::str::Chars;
|
|
|
+use crate::lexer::LexerState::{Decimal, Space, Word};
|
|
|
+use crate::token::{INSTRUCTIONS, Token, TokenInfo};
|
|
|
+
|
|
|
/// Possible errors the `Lexer` may encounter.
///
/// Every variant carries the 1-based `line` and `column` at which the offending token
/// started. The type implements [`std::fmt::Display`] and [`std::error::Error`], so it can
/// be boxed as `Box<dyn Error>` or propagated with `?` into generic error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerError {
    /// An unknown instruction was found.
    InvalidInstruction {
        /// The line in the input.
        line: usize,

        /// The column in the input where the instruction started.
        column: usize,

        /// The invalid instruction.
        instruction: String,
    },

    /// A decimal literal was too large to fit in a `u64`.
    TooLargeDecimalLiteral {
        /// The line in the input.
        line: usize,

        /// The column in the input where the literal started.
        column: usize,
    },
}

impl std::fmt::Display for LexerError {
    /// Formats the error as `line:column: message` for user-facing diagnostics.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LexerError::InvalidInstruction {
                line,
                column,
                instruction,
            } => write!(f, "{}:{}: invalid instruction `{}`", line, column, instruction),
            LexerError::TooLargeDecimalLiteral { line, column } => write!(
                f,
                "{}:{}: decimal literal does not fit in a `u64`",
                line, column
            ),
        }
    }
}

impl std::error::Error for LexerError {}
|
|
|
+
|
|
|
/// The states of the lexer's state machine.
#[doc(hidden)]
enum LexerState {
    /// No token is currently being built.
    Space,

    /// A word (candidate instruction) is being accumulated.
    Word {
        /// Column at which the word started.
        start: usize,
        /// Characters collected so far.
        value: String,
    },

    /// A decimal literal is being accumulated.
    Decimal {
        /// Column at which the literal started.
        start: usize,
        /// Numeric value of the digits collected so far.
        value: u64,
    },
}
|
|
|
+
|
|
|
+/// Class turning characters into `Token`s.
|
|
|
+pub struct Lexer {
|
|
|
+ /// 1-based line number.
|
|
|
+ line: usize,
|
|
|
+
|
|
|
+ /// 1 base column.
|
|
|
+ column: usize,
|
|
|
+
|
|
|
+ /// The lexer is a state machine and this is the current state.
|
|
|
+ state: LexerState,
|
|
|
+}
|
|
|
+
|
|
|
+impl Lexer {
|
|
|
+ /// Create a new lexer.
|
|
|
+ pub fn new() -> Self {
|
|
|
+ Self {
|
|
|
+ line: 1,
|
|
|
+ column: 1,
|
|
|
+ state: Space,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Tokenize the given characters.
|
|
|
+ ///
|
|
|
+ /// Every `TokenInfo` contains additional information about the source location of the `Token`.
|
|
|
+ pub fn tokenize(&mut self, input: Chars) -> Result<Vec<TokenInfo>, LexerError> {
|
|
|
+ let mut tokens = Vec::new();
|
|
|
+
|
|
|
+ for c in input {
|
|
|
+ self.handle_character(c, &mut tokens)?;
|
|
|
+ }
|
|
|
+ if let Space = &self.state {} else {
|
|
|
+ self.handle_character('\n', &mut tokens)?;
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(tokens)
|
|
|
+ }
|
|
|
+
|
|
|
+ fn handle_character(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
|
|
|
+ match &self.state {
|
|
|
+ Space => self.state_space(c, tokens),
|
|
|
+ Word { .. } => self.state_word(c, tokens),
|
|
|
+ Decimal { .. } => self.state_decimal(c, tokens),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn state_space(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
|
|
|
+ if c.is_ascii_alphabetic() {
|
|
|
+ self.state = Word { start: self.column, value: String::new() };
|
|
|
+ self.state_word(c, tokens)
|
|
|
+ } else if c.is_ascii_digit() {
|
|
|
+ self.state = Decimal { start: self.column, value: 0 };
|
|
|
+ self.state_decimal(c, tokens)
|
|
|
+ } else if c == ',' {
|
|
|
+ tokens.push(TokenInfo {
|
|
|
+ token: Token::Comma,
|
|
|
+ line: self.line,
|
|
|
+ column: self.column,
|
|
|
+ });
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn state_word(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
|
|
|
+ if let Word { start, value } = &self.state {
|
|
|
+ if c.is_ascii_alphanumeric() {
|
|
|
+ let mut v = value.clone();
|
|
|
+ v.push(c);
|
|
|
+ self.state = Word { start: *start, value: v };
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ let inst = value.clone();
|
|
|
+ let column = *start;
|
|
|
+
|
|
|
+ if !INSTRUCTIONS.contains(&inst) {
|
|
|
+ return Err(LexerError::InvalidInstruction {
|
|
|
+ line: self.line,
|
|
|
+ column,
|
|
|
+ instruction: inst,
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ tokens.push(TokenInfo {
|
|
|
+ token: Token::Instruction(inst),
|
|
|
+ line: self.line,
|
|
|
+ column,
|
|
|
+ });
|
|
|
+ self.state = Space;
|
|
|
+ self.state_space(c, tokens)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ unreachable!("state_word is only called when state is a Word.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn state_decimal(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
|
|
|
+ if let Decimal { start, value } = &self.state {
|
|
|
+ let s = *start;
|
|
|
+ let v = *value;
|
|
|
+ if c.is_ascii_digit() {
|
|
|
+ let digit = unsafe { c.to_digit(10).unwrap_unchecked() } as u64;
|
|
|
+ let new_v = v
|
|
|
+ .checked_mul(10u64)
|
|
|
+ .and_then(|x| x.checked_add(digit))
|
|
|
+ .ok_or(LexerError::TooLargeDecimalLiteral { line: self.line, column: s })?;
|
|
|
+ self.state = Decimal {
|
|
|
+ start: s,
|
|
|
+ value: new_v,
|
|
|
+ };
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ tokens.push(TokenInfo {
|
|
|
+ token: Token::DecimalLiteral(v),
|
|
|
+ line: self.line,
|
|
|
+ column: s,
|
|
|
+ });
|
|
|
+ self.state = Space;
|
|
|
+ self.state_space(c, tokens)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ unreachable!("state_word is only called when state is a Word.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
mod tests {
    use crate::lexer::{Lexer, LexerError};
    use crate::token::{INSTRUCTIONS, Token, TokenInfo};

    /// Every known instruction lexes to a single `Instruction` token at 1:1.
    #[test]
    fn instructions() -> Result<(), LexerError> {
        for &inst in INSTRUCTIONS.iter() {
            let mut lexer = Lexer::new();
            let tokens = lexer.tokenize(inst.chars())?;
            let expected = vec![
                TokenInfo {
                    token: Token::Instruction(inst.to_string()),
                    line: 1,
                    column: 1,
                }
            ];
            assert_eq!(tokens, expected);
        }
        Ok(())
    }

    /// Decimal literals, including the largest representable one, lex to a single
    /// `DecimalLiteral` token at 1:1.
    #[test]
    fn decimals() -> Result<(), LexerError> {
        for n in [0, 23, 42, 1337, 500000, u64::MAX] {
            let mut lexer = Lexer::new();
            let tokens = lexer.tokenize(n.to_string().chars())?;
            let expected = vec![
                TokenInfo {
                    token: Token::DecimalLiteral(n),
                    line: 1,
                    column: 1,
                }
            ];
            assert_eq!(tokens, expected);
        }
        Ok(())
    }

    /// A lone comma lexes to a single `Comma` token at 1:1.
    #[test]
    fn comma() -> Result<(), LexerError> {
        let tokens = Lexer::new().tokenize(",".chars())?;
        let expected = vec![
            TokenInfo {
                token: Token::Comma,
                line: 1,
                column: 1,
            }
        ];
        assert_eq!(tokens, expected);
        Ok(())
    }

    /// A literal one digit longer than `u64::MAX` is rejected with its start position.
    ///
    /// The error is matched structurally instead of through `#[should_panic]` on its
    /// `Debug` output, so the test does not depend on formatting details.
    #[test]
    fn too_large_decimal_literal() {
        let result = Lexer::new().tokenize((u64::MAX.to_string() + "0").chars());
        assert!(matches!(
            result,
            Err(LexerError::TooLargeDecimalLiteral { line: 1, column: 1 })
        ));
    }

    /// A word that is not a known instruction is rejected and reported verbatim.
    // NOTE(review): assumes "zzzinvalid" is not listed in `INSTRUCTIONS` — confirm.
    #[test]
    fn invalid_instruction() {
        let result = Lexer::new().tokenize("zzzinvalid".chars());
        assert!(matches!(
            result,
            Err(LexerError::InvalidInstruction { line: 1, column: 1, ref instruction })
                if instruction == "zzzinvalid"
        ));
    }
}
|