Selaa lähdekoodia

:tada: first commit, can tokenize db, dw, dd and dq, as well as comma and decimal literals

Felix Bytow 2 vuotta sitten
sitoutus
874e938802

+ 10 - 0
.gitignore

@@ -0,0 +1,10 @@
+# Rust
+/target
+Cargo.lock
+
+# CLion
+*.iml
+.idea/
+
+# MacOS
+.DS_Store

+ 19 - 0
.run/Documentation.run.xml

@@ -0,0 +1,19 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Documentation" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
+    <option name="command" value="doc --no-deps" />
+    <option name="workingDirectory" value="file://$PROJECT_DIR$" />
+    <option name="channel" value="STABLE" />
+    <option name="requiredFeatures" value="true" />
+    <option name="allFeatures" value="false" />
+    <option name="emulateTerminal" value="false" />
+    <option name="withSudo" value="false" />
+    <option name="buildTarget" value="REMOTE" />
+    <option name="backtrace" value="SHORT" />
+    <envs />
+    <option name="isRedirectInput" value="false" />
+    <option name="redirectInputPath" value="" />
+    <method v="2">
+      <option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
+    </method>
+  </configuration>
+</component>

+ 19 - 0
.run/Test assembly.run.xml

@@ -0,0 +1,19 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Test assembly" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
+    <option name="command" value="test --package assembly" />
+    <option name="workingDirectory" value="file://$PROJECT_DIR$" />
+    <option name="channel" value="STABLE" />
+    <option name="requiredFeatures" value="true" />
+    <option name="allFeatures" value="true" />
+    <option name="emulateTerminal" value="false" />
+    <option name="withSudo" value="false" />
+    <option name="buildTarget" value="REMOTE" />
+    <option name="backtrace" value="SHORT" />
+    <envs />
+    <option name="isRedirectInput" value="false" />
+    <option name="redirectInputPath" value="" />
+    <method v="2">
+      <option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
+    </method>
+  </configuration>
+</component>

+ 5 - 0
Cargo.toml

@@ -0,0 +1,5 @@
+[workspace]
+members = [
+    "assembly",
+    "assembler"
+]

+ 12 - 0
assembler/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+name = "assembler"
+description = "A simple assembler"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+authors = ["Felix Bytow <drako@drako.guru>"]
+publish = ["crates-drako-guru"]
+repository = "https://git.drako.guru/drako/asm-rs"
+
+[dependencies]
+assembly = { path = "../assembly" }

+ 3 - 0
assembler/src/main.rs

@@ -0,0 +1,3 @@
+/// Entry point of the `assembler` binary; currently a placeholder.
+fn main() {
+    let greeting = "Hello, world!";
+    println!("{}", greeting);
+}

+ 15 - 0
assembly/Cargo.toml

@@ -0,0 +1,15 @@
+[package]
+name = "assembly"
+description = "An embeddable assembler"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+authors = ["Felix Bytow"]
+publish = ["crates-drako-guru"]
+repository = "https://git.drako.guru/drako/asm-rs"
+
+[lib]
+doctest = true
+
+[dependencies]
+phf = { version = "0.11.1", features = ["macros"] }

+ 210 - 0
assembly/src/lexer.rs

@@ -0,0 +1,210 @@
+use std::str::Chars;
+use crate::lexer::LexerState::{Decimal, Space, Word};
+use crate::token::{INSTRUCTIONS, Token, TokenInfo};
+
+/// Possible errors the `Lexer` may encounter.
+#[derive(Debug)]
+pub enum LexerError {
+    /// An unknown instruction was found.
+    InvalidInstruction {
+        /// The line in the input.
+        line: usize,
+
+        /// The column in the input where the instruction started.
+        column: usize,
+
+        /// The invalid instruction.
+        instruction: String,
+    },
+
+    /// A decimal literal was too large to fit in a `u64`.
+    TooLargeDecimalLiteral {
+        /// The line in the input.
+        line: usize,
+
+        /// The column in the input where the literal started.
+        column: usize,
+    },
+}
+
+impl std::fmt::Display for LexerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::InvalidInstruction { line, column, instruction } => {
+                write!(f, "invalid instruction '{}' at {}:{}", instruction, line, column)
+            }
+            Self::TooLargeDecimalLiteral { line, column } => {
+                write!(f, "decimal literal at {}:{} does not fit in a u64", line, column)
+            }
+        }
+    }
+}
+
+// A public error type of a library should integrate with the standard
+// error trait so callers can box it or propagate it via `?` into
+// `Box<dyn Error>`.
+impl std::error::Error for LexerError {}
+
+#[doc(hidden)]
+// Internal states of the lexer's state machine.
+enum LexerState {
+    // Between tokens: skipping separators, waiting for a token start.
+    Space,
+    // Accumulating an alphanumeric word; `start` is the column where it began.
+    Word { start: usize, value: String },
+    // Accumulating a base-10 literal; `start` is the column of its first digit.
+    Decimal { start: usize, value: u64 },
+}
+
+/// Class turning characters into `Token`s.
+pub struct Lexer {
+    /// 1-based line number.
+    line: usize,
+
+    /// 1-based column number.
+    column: usize,
+
+    /// The lexer is a state machine and this is the current state.
+    state: LexerState,
+}
+
+impl Lexer {
+    /// Create a new lexer starting at line 1, column 1.
+    pub fn new() -> Self {
+        Self {
+            line: 1,
+            column: 1,
+            state: Space,
+        }
+    }
+
+    /// Tokenize the given characters.
+    ///
+    /// Every `TokenInfo` contains additional information about the source location of the `Token`.
+    pub fn tokenize(&mut self, input: Chars) -> Result<Vec<TokenInfo>, LexerError> {
+        let mut tokens = Vec::new();
+
+        for c in input {
+            self.handle_character(c, &mut tokens)?;
+        }
+        // Flush a token that is still being accumulated by feeding a
+        // synthetic newline (a separator that terminates any token).
+        if !matches!(self.state, Space) {
+            self.handle_character('\n', &mut tokens)?;
+        }
+
+        Ok(tokens)
+    }
+
+    /// Dispatch one input character to the handler of the current state,
+    /// then advance the source position.
+    ///
+    /// The position is advanced *after* dispatching so the handlers see
+    /// the position of `c` itself.
+    fn handle_character(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
+        let result = match &self.state {
+            Space => self.state_space(c, tokens),
+            Word { .. } => self.state_word(c, tokens),
+            Decimal { .. } => self.state_decimal(c, tokens),
+        };
+        // BUGFIX: `line` and `column` were never updated, so every token
+        // was reported at 1:1 regardless of its actual location.
+        if c == '\n' {
+            self.line += 1;
+            self.column = 1;
+        } else {
+            self.column += 1;
+        }
+        result
+    }
+
+    /// `Space` state: skip separators and decide what the next token is.
+    fn state_space(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
+        if c.is_ascii_alphabetic() {
+            self.state = Word { start: self.column, value: String::new() };
+            self.state_word(c, tokens)
+        } else if c.is_ascii_digit() {
+            self.state = Decimal { start: self.column, value: 0 };
+            self.state_decimal(c, tokens)
+        } else if c == ',' {
+            tokens.push(TokenInfo {
+                token: Token::Comma,
+                line: self.line,
+                column: self.column,
+            });
+            Ok(())
+        } else {
+            // Any other character is treated as insignificant whitespace.
+            Ok(())
+        }
+    }
+
+    /// `Word` state: accumulate an instruction mnemonic.
+    fn state_word(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
+        if let Word { start, value } = &mut self.state {
+            if c.is_ascii_alphanumeric() {
+                // Extend the word in place instead of cloning the whole
+                // string for every character (was O(n^2)).
+                value.push(c);
+                return Ok(());
+            }
+            let column = *start;
+            let inst = std::mem::take(value);
+
+            if !INSTRUCTIONS.contains(&inst) {
+                return Err(LexerError::InvalidInstruction {
+                    line: self.line,
+                    column,
+                    instruction: inst,
+                });
+            }
+
+            tokens.push(TokenInfo {
+                token: Token::Instruction(inst),
+                line: self.line,
+                column,
+            });
+            self.state = Space;
+            // The terminating character may itself start a new token.
+            self.state_space(c, tokens)
+        } else {
+            unreachable!("state_word is only called when state is a Word.")
+        }
+    }
+
+    /// `Decimal` state: accumulate a base-10 `u64` literal, rejecting overflow.
+    fn state_decimal(&mut self, c: char, tokens: &mut Vec<TokenInfo>) -> Result<(), LexerError> {
+        if let Decimal { start, value } = &self.state {
+            let column = *start;
+            let v = *value;
+            if c.is_ascii_digit() {
+                // `is_ascii_digit` guarantees `to_digit(10)` succeeds,
+                // so no `unsafe` is required here.
+                let digit = u64::from(c.to_digit(10).expect("ASCII digit"));
+                let value = v
+                    .checked_mul(10)
+                    .and_then(|x| x.checked_add(digit))
+                    .ok_or_else(|| LexerError::TooLargeDecimalLiteral { line: self.line, column })?;
+                self.state = Decimal { start: column, value };
+                Ok(())
+            } else {
+                tokens.push(TokenInfo {
+                    token: Token::DecimalLiteral(v),
+                    line: self.line,
+                    column,
+                });
+                self.state = Space;
+                // The terminating character may itself start a new token.
+                self.state_space(c, tokens)
+            }
+        } else {
+            // BUGFIX: the original message wrongly referred to state_word.
+            unreachable!("state_decimal is only called when state is a Decimal.")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer::{Lexer, LexerError};
+    use crate::token::{INSTRUCTIONS, Token, TokenInfo};
+
+    /// Lexing `source` must yield exactly one `token` at line 1, column 1.
+    fn assert_single_token(source: &str, token: Token) -> Result<(), LexerError> {
+        let tokens = Lexer::new().tokenize(source.chars())?;
+        let expected = vec![TokenInfo { token, line: 1, column: 1 }];
+        assert_eq!(tokens, expected);
+        Ok(())
+    }
+
+    #[test]
+    fn instructions() -> Result<(), LexerError> {
+        for &inst in INSTRUCTIONS.iter() {
+            assert_single_token(inst, Token::Instruction(inst.to_string()))?;
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn decimals() -> Result<(), LexerError> {
+        for n in [0u64, 23, 42, 1337, 500000] {
+            assert_single_token(&n.to_string(), Token::DecimalLiteral(n))?;
+        }
+        Ok(())
+    }
+
+    #[test]
+    #[should_panic(expected = "TooLargeDecimalLiteral { line: 1, column: 1 }")]
+    fn too_large_decimal_literal() {
+        // One digit more than u64::MAX can hold must overflow.
+        let input = format!("{}0", u64::MAX);
+        Lexer::new().tokenize(input.chars()).unwrap();
+    }
+}

+ 8 - 0
assembly/src/lib.rs

@@ -0,0 +1,8 @@
+//! An embeddable assembler.
+
+#![warn(missing_docs)]
+
+/// The `Lexer`, turning characters into `Token`s.
+pub mod lexer;
+
+/// `Token`s and `TokenInfo`s.
+pub mod token;

+ 35 - 0
assembly/src/token.rs

@@ -0,0 +1,35 @@
+use phf::phf_set;
+
+/// An assembly token.
+///
+/// `Eq` and `Hash` are derived so tokens can be used as keys in sets
+/// and maps; all payload types (`String`, `u64`) support both.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Token {
+    /// An instruction in the given instruction set.
+    Instruction(String),
+
+    /// The `,` character.
+    Comma,
+
+    /// A base-10 unsigned integer literal.
+    DecimalLiteral(u64),
+}
+
+/// A `Token` with source position information.
+///
+/// `Eq` and `Hash` are derived to match `Token` so annotated tokens are
+/// just as usable in collections.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct TokenInfo {
+    /// The `Token`.
+    pub token: Token,
+
+    /// The line in the source.
+    pub line: usize,
+
+    /// The column in the source where the token started.
+    pub column: usize,
+}
+
+/// All currently supported instructions.
+///
+/// Declared `static` rather than `const`: a `const` value is inlined at
+/// every use site, duplicating the lookup table, while a `static` gives
+/// one shared table — the form the phf documentation recommends.
+pub static INSTRUCTIONS: phf::Set<&'static str> = phf_set! {
+    "db",
+    "dw",
+    "dd",
+    "dq"
+};