aoc24: day03 tokenizer

2024-12-03 16:43:38 +01:00
parent 616f6bd610
commit c685dcddfb
13 changed files with 328 additions and 155 deletions
--- a/sol/24/03/entry.cpp
+++ b/sol/24/03/entry.cpp
@@ -0,0 +1,186 @@
+#include <fstream>
+#include <expected>
+#include <string>
+
+#include "aoc/aoc.hpp"
+#include "fmt/format.h"
+#include "ctre.hpp"
+
+// X-macro table listing every token kind together with its category.
+// Each row has the form ENUMERATOR_AOC_TOKEN(<token name>, <category name>).
+// Usage: #define ENUMERATOR_AOC_TOKEN(type, category) to the wanted expansion,
+// expand ENUMERATOR_AOC_TOKENS, then #undef ENUMERATOR_AOC_TOKEN again.
+// Row order is significant wherever the expansion produces enumerators.
+#define ENUMERATOR_AOC_TOKENS                          \
+    ENUMERATOR_AOC_TOKEN(mul,             operator_)   \
+    ENUMERATOR_AOC_TOKEN(invalid,         invalid)     \
+    ENUMERATOR_AOC_TOKEN(numeric_literal, number)      \
+    ENUMERATOR_AOC_TOKEN(newline,         punctuation) \
+    ENUMERATOR_AOC_TOKEN(paren_open,      punctuation) \
+    ENUMERATOR_AOC_TOKEN(paren_close,     punctuation) \
+    ENUMERATOR_AOC_TOKEN(comma,           punctuation) \
+    ENUMERATOR_AOC_TOKEN(identifier,      identifier)
+
+// Kind of a lexed token. The enumerators are generated from the first column
+// of ENUMERATOR_AOC_TOKENS, in table order; `_count` follows the last
+// generated enumerator and therefore equals the number of table rows.
+enum class token_type : std::uint32_t {
+#define ENUMERATOR_AOC_TOKEN(name, group) name,
+    ENUMERATOR_AOC_TOKENS
+#undef ENUMERATOR_AOC_TOKEN
+    _count
+};
+
+// Coarse grouping of token kinds; these names appear in the second column of
+// the ENUMERATOR_AOC_TOKENS table.
+enum class token_category : std::uint32_t {
+    operator_,    // trailing underscore because `operator` is a C++ keyword
+    invalid,
+    number,
+    punctuation,
+    identifier,
+    _count        // number of categories; not a category itself
+};
+
+// Returns the enumerator name of `type` as a string literal (e.g. "comma").
+// Any value that is not one of the table-generated enumerators, including
+// `token_type::_count`, yields "invalid".
+auto token_type_str(token_type type) -> char const* {
+    // Names in table order, so the underlying enumerator value is the index.
+    static constexpr char const* names[] = {
+#define ENUMERATOR_AOC_TOKEN(name, group) #name,
+        ENUMERATOR_AOC_TOKENS
+#undef ENUMERATOR_AOC_TOKEN
+    };
+
+    auto const index = static_cast<std::uint32_t>(type);
+    if (index >= static_cast<std::uint32_t>(token_type::_count)) {
+        return "invalid";
+    }
+    return names[index];
+}
+
+// Maps a token kind to the category paired with it in ENUMERATOR_AOC_TOKENS.
+// Any value that is not one of the table-generated enumerators, including
+// `token_type::_count`, maps to `token_category::invalid`.
+auto token_type_category(token_type type) -> token_category {
+    // Categories in table order, so the underlying enumerator value is the index.
+    static constexpr token_category categories[] = {
+#define ENUMERATOR_AOC_TOKEN(name, group) token_category::group,
+        ENUMERATOR_AOC_TOKENS
+#undef ENUMERATOR_AOC_TOKEN
+    };
+
+    auto const index = static_cast<std::uint32_t>(type);
+    if (index >= static_cast<std::uint32_t>(token_type::_count)) {
+        return token_category::invalid;
+    }
+    return categories[index];
+}
+
+// A single lexed token: its kind, category, source text and the row/column
+// position that the lexer recorded for it.
+class token {
+public:
+    // Stores a copy of `str` as the token text together with the given
+    // kind, category and position.
+    token(std::string const& str, token_type type, token_category category, std::size_t row, std::size_t col)
+        : m_type(type)
+        , m_category(category)
+        , m_value(str)
+        , m_row(row)
+        , m_column(col) { }
+
+    // Plain accessors for the stored fields.
+    auto type() const -> token_type { return m_type; }
+    auto category() const -> token_category { return m_category; }
+    auto value() const -> std::string const& { return m_value; }
+
+    auto row() const -> std::size_t { return m_row; }
+    auto col() const -> std::size_t { return m_column; }
+
+    // Renders the token for debugging output, in the form
+    //   token { type: <name>, value: "<text>", row: <row>, col: <col> }
+    // The category is not part of the rendering.
+    auto str() const -> std::string {
+        std::string rendered{"token {"};
+        rendered.append(" type: ").append(token_type_str(m_type)).append(",");
+        rendered.append(" value: \"").append(m_value).append("\",");
+        rendered.append(" row: ").append(std::to_string(m_row)).append(",");
+        rendered.append(" col: ").append(std::to_string(m_column));
+        rendered.append(" }");
+        return rendered;
+    }
+
+    // True when `str` is non-empty and made up only of the letters a-z.
+    static auto is_identifier(std::string_view const& str) -> bool {
+        return static_cast<bool>(ctre::match<"^[a-z]+$">(str));
+    }
+
+private:
+    token_type     m_type;
+    token_category m_category;
+    std::string    m_value;
+    std::size_t    m_row;
+    std::size_t    m_column;
+};
+
+enum class lexer_error {
+    eof,
+    unknown
+};
+
+// Character-level lexer that turns the contents of a file into `token`s.
+//
+// Recognised input:
+//   * a run of lowercase letters starting with 'm'  -> identifier
+//   * '(' , ')' , ','                               -> paren_open / paren_close / comma
+//   * '\n'                                          -> skipped; advances the row counter
+//   * any other single character                    -> invalid
+// Rows and columns are 1-based; the column counts characters consumed on the
+// current row.
+class lexer {
+public:
+    // Opens `source` for binary reading. A file that cannot be opened is not
+    // reported here; tokenize() then simply returns an empty vector.
+    lexer(std::filesystem::path const& source)
+        : m_strm(source, std::ios::in | std::ios::binary)
+        , m_line(1), m_col(1) {
+    }
+
+    // Consumes the remaining input and returns every token in source order.
+    auto tokenize() -> std::vector<token> {
+        std::vector<token> tokens{};
+        while (auto tk = next_token()) {
+            tokens.emplace_back(std::move(*tk));
+        }
+        return tokens;
+    }
+
+private:
+    // Produces the next token, or an empty optional once the input is
+    // exhausted or the stream is unusable.
+    auto next_token() -> std::optional<token> {
+        // Skip every consecutive line break so each one advances the row
+        // counter; the column restarts at 1 to match the first row.
+        while (has_next() && peek() == '\n') {
+            peek_consume();
+            ++m_line;
+            m_col = 1;
+        }
+        // Re-check after skipping: the input may end with a line break.
+        if (!has_next()) return std::nullopt;
+
+        // Column of the token's first character, captured before consuming.
+        auto const col = m_col;
+
+        if (peek() == 'm') {
+            std::string word{};
+            while (has_next() && is_identifier_char(peek())) word += peek_consume();
+            auto const type = token::is_identifier(word) ? token_type::identifier : token_type::invalid;
+            return make_token(word, type, col);
+        }
+
+        // Everything else is a single-character token.
+        auto const c = peek_consume();
+        return make_token(std::string(1, c), single_char_type(c), col);
+    }
+
+    // Classifies a single-character token; unknown characters are invalid.
+    static auto single_char_type(char c) -> token_type {
+        switch (c) {
+            case '(': return token_type::paren_open;
+            case ')': return token_type::paren_close;
+            case ',': return token_type::comma;
+            default:  return token_type::invalid;
+        }
+    }
+
+    // True for the characters that may continue an identifier (a-z).
+    static auto is_identifier_char(char c) -> bool {
+        return c >= 'a' && c <= 'z';
+    }
+
+    // Builds a token on the current row, deriving the category from the type.
+    auto make_token(std::string const& text, token_type type, std::size_t col) const -> token {
+        return token(text, type, token_type_category(type), m_line, col);
+    }
+
+    // Returns the next character without consuming it. Only meaningful while
+    // has_next() is true.
+    auto peek() -> char {
+        return static_cast<char>(m_strm.peek());
+    }
+    // Consumes and returns the next character, advancing the column.
+    auto peek_consume() -> char {
+        ++m_col;
+        return static_cast<char>(m_strm.get());
+    }
+    // True while another character can be read. eof() alone is not enough:
+    // it only becomes true after a read has already hit the end, and it never
+    // becomes true for a stream that failed to open.
+    auto has_next() -> bool {
+        return m_strm.good() && m_strm.peek() != std::fstream::traits_type::eof();
+    }
+
+private:
+    std::fstream m_strm;
+    std::size_t  m_line;
+    std::size_t  m_col;
+};
+
+// Entry point for day 03: tokenizes the example input file and prints one
+// line per token. `args` is not used.
+auto aoc::entry([[maybe_unused]]std::vector<std::string_view> const& args) -> void {
+    lexer input_lexer{"./dat/24/ex/03.txt"};
+
+    for (auto const& current : input_lexer.tokenize()) {
+        fmt::print("{}\n", current.str());
+    }
+}
+