feat(ic2c): further ic2c implementation

YakshaLang · Jun 18, 2023 · 3da613c · 3da613c
1 parent 0b4938a
commit 3da613c
Show file tree

Hide file tree

Showing 20 changed files with 634 additions and 89 deletions.
diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt
@@ -16,7 +16,7 @@ include_directories(tests)
 include_directories(${UTF8_DIR})
 
 set(YAKSHA_SOURCE_FILES
-        src/ast/ast.h src/ast/ast_printer.h src/ast/ast_vis.h src/ast/codefiles.h src/ast/environment.h src/ast/environment_stack.h src/ast/parser.h src/builtins/builtin.h src/builtins/builtins.h src/compiler/compiler.h src/compiler/compiler_utils.h src/compiler/datatype_compiler.h src/compiler/datatype_parser.h src/compiler/def_class_visitor.h src/compiler/delete_stack.h src/compiler/delete_stack_stack.h src/compiler/desugaring_compiler.h src/compiler/entry_struct_func_compiler.h src/compiler/function_datatype_extractor.h src/compiler/multifile_compiler.h src/compiler/slot_matcher.h src/compiler/statement_writer.h src/compiler/type_checker.h src/file_formats/ic_tokens_file.h src/file_formats/tokens_file.h src/ic2c/ic2c.h src/ic2c/ic_ast.h src/ic2c/ic_compiler.h src/ic2c/ic_level2_parser.h src/ic2c/ic_level2_tokenizer.h src/ic2c/ic_optimizer.h src/ic2c/ic_parser.h src/ic2c/ic_preprocessor.h src/ic2c/ic_token.h src/ic2c/ic_tokenizer.h src/tokenizer/block_analyzer.h src/tokenizer/string_utils.h src/tokenizer/token.h src/tokenizer/tokenizer.h src/utilities/annotation.h src/utilities/annotations.h src/utilities/defer_stack.h src/utilities/defer_stack_stack.h src/utilities/error_printer.h src/utilities/ykdatatype.h src/utilities/ykdt_pool.h src/utilities/ykobject.h src/ast/ast.cpp src/ast/ast_printer.cpp src/ast/ast_vis.cpp src/ast/codefiles.cpp src/ast/environment.cpp src/ast/environment_stack.cpp src/ast/parser.cpp src/builtins/builtins.cpp src/compiler/compiler.cpp src/compiler/compiler_utils.cpp src/compiler/def_class_visitor.cpp src/compiler/delete_stack.cpp src/compiler/delete_stack_stack.cpp src/compiler/desugaring_compiler.cpp src/compiler/entry_struct_func_compiler.cpp src/compiler/multifile_compiler.cpp src/compiler/type_checker.cpp src/file_formats/ic_tokens_file.cpp src/file_formats/tokens_file.cpp src/ic2c/ic2c.cpp src/ic2c/ic_ast.cpp src/ic2c/ic_compiler.cpp src/ic2c/ic_level2_parser.cpp src/ic2c/ic_level2_tokenizer.cpp src/ic2c/ic_optimizer.cpp 
src/ic2c/ic_parser.cpp src/ic2c/ic_preprocessor.cpp src/ic2c/ic_tokenizer.cpp src/tokenizer/block_analyzer.cpp src/tokenizer/string_utils.cpp src/tokenizer/tokenizer.cpp src/utilities/annotation.cpp src/utilities/annotations.cpp src/utilities/defer_stack.cpp src/utilities/defer_stack_stack.cpp src/utilities/error_printer.cpp src/utilities/ykdatatype.cpp src/utilities/ykdt_pool.cpp src/utilities/ykobject.cpp) # update_makefile.py SRC
+        src/ast/ast.h src/ast/ast_printer.h src/ast/ast_vis.h src/ast/codefiles.h src/ast/environment.h src/ast/environment_stack.h src/ast/parser.h src/builtins/builtin.h src/builtins/builtins.h src/compiler/compiler.h src/compiler/compiler_utils.h src/compiler/datatype_compiler.h src/compiler/datatype_parser.h src/compiler/def_class_visitor.h src/compiler/delete_stack.h src/compiler/delete_stack_stack.h src/compiler/desugaring_compiler.h src/compiler/entry_struct_func_compiler.h src/compiler/function_datatype_extractor.h src/compiler/multifile_compiler.h src/compiler/slot_matcher.h src/compiler/statement_writer.h src/compiler/type_checker.h src/file_formats/ic_tokens_file.h src/file_formats/tokens_file.h src/ic2c/ic2c.h src/ic2c/ic_ast.h src/ic2c/ic_compiler.h src/ic2c/ic_level2_parser.h src/ic2c/ic_level2_tokenizer.h src/ic2c/ic_line_splicer.h src/ic2c/ic_optimizer.h src/ic2c/ic_parser.h src/ic2c/ic_peek_ahead_iter.h src/ic2c/ic_preprocessor.h src/ic2c/ic_simple_character_iter.h src/ic2c/ic_token.h src/ic2c/ic_tokenizer.h src/ic2c/ic_trigraph_translater.h src/tokenizer/block_analyzer.h src/tokenizer/string_utils.h src/tokenizer/token.h src/tokenizer/tokenizer.h src/utilities/annotation.h src/utilities/annotations.h src/utilities/defer_stack.h src/utilities/defer_stack_stack.h src/utilities/error_printer.h src/utilities/ykdatatype.h src/utilities/ykdt_pool.h src/utilities/ykobject.h src/ast/ast.cpp src/ast/ast_printer.cpp src/ast/ast_vis.cpp src/ast/codefiles.cpp src/ast/environment.cpp src/ast/environment_stack.cpp src/ast/parser.cpp src/builtins/builtins.cpp src/compiler/compiler.cpp src/compiler/compiler_utils.cpp src/compiler/def_class_visitor.cpp src/compiler/delete_stack.cpp src/compiler/delete_stack_stack.cpp src/compiler/desugaring_compiler.cpp src/compiler/entry_struct_func_compiler.cpp src/compiler/multifile_compiler.cpp src/compiler/type_checker.cpp src/file_formats/ic_tokens_file.cpp src/file_formats/tokens_file.cpp src/ic2c/ic2c.cpp 
src/ic2c/ic_ast.cpp src/ic2c/ic_compiler.cpp src/ic2c/ic_level2_parser.cpp src/ic2c/ic_level2_tokenizer.cpp src/ic2c/ic_line_splicer.cpp src/ic2c/ic_optimizer.cpp src/ic2c/ic_parser.cpp src/ic2c/ic_peek_ahead_iter.cpp src/ic2c/ic_preprocessor.cpp src/ic2c/ic_tokenizer.cpp src/ic2c/ic_trigraph_translater.cpp src/tokenizer/block_analyzer.cpp src/tokenizer/string_utils.cpp src/tokenizer/tokenizer.cpp src/utilities/annotation.cpp src/utilities/annotations.cpp src/utilities/defer_stack.cpp src/utilities/defer_stack_stack.cpp src/utilities/error_printer.cpp src/utilities/ykdatatype.cpp src/utilities/ykdt_pool.cpp src/utilities/ykobject.cpp) # update_makefile.py SRC
 
 set(YAKSHA_TEST_FILES
         tests/btest.h tests/test_block_analyzer.cpp tests/test_compiler.cpp tests/test_ic_tokenizer.cpp tests/test_parser.cpp tests/test_string_utils.cpp tests/test_tokenizer.cpp tests/test_type_checker.cpp) # update_makefile.py TESTS

diff --git a/compiler/hammer.toml b/compiler/hammer.toml
@@ -21,7 +21,7 @@ args_c=["-std=c99"]
 include_paths=["src", "3rd/utfcpp/source", "runtime"]
 # .c or .cpp files that get compiled to .o files
 # so this is all except the .c/.cpp file with main()
-sources=["src/ast/ast.cpp", "src/ast/ast_printer.cpp", "src/ast/ast_vis.cpp", "src/ast/codefiles.cpp", "src/ast/environment.cpp", "src/ast/environment_stack.cpp", "src/ast/parser.cpp", "src/builtins/builtins.cpp", "src/compiler/compiler.cpp", "src/compiler/compiler_utils.cpp", "src/compiler/def_class_visitor.cpp", "src/compiler/delete_stack.cpp", "src/compiler/delete_stack_stack.cpp", "src/compiler/desugaring_compiler.cpp", "src/compiler/entry_struct_func_compiler.cpp", "src/compiler/multifile_compiler.cpp", "src/compiler/type_checker.cpp", "src/file_formats/ic_tokens_file.cpp", "src/file_formats/tokens_file.cpp", "src/ic2c/ic2c.cpp", "src/ic2c/ic_ast.cpp", "src/ic2c/ic_compiler.cpp", "src/ic2c/ic_level2_parser.cpp", "src/ic2c/ic_level2_tokenizer.cpp", "src/ic2c/ic_optimizer.cpp", "src/ic2c/ic_parser.cpp", "src/ic2c/ic_preprocessor.cpp", "src/ic2c/ic_tokenizer.cpp", "src/tokenizer/block_analyzer.cpp", "src/tokenizer/string_utils.cpp", "src/tokenizer/tokenizer.cpp", "src/utilities/annotation.cpp", "src/utilities/annotations.cpp", "src/utilities/defer_stack.cpp", "src/utilities/defer_stack_stack.cpp", "src/utilities/error_printer.cpp", "src/utilities/ykdatatype.cpp", "src/utilities/ykdt_pool.cpp", "src/utilities/ykobject.cpp"] # update_makefile.py HAMMER_CPP
+sources=["src/ast/ast.cpp", "src/ast/ast_printer.cpp", "src/ast/ast_vis.cpp", "src/ast/codefiles.cpp", "src/ast/environment.cpp", "src/ast/environment_stack.cpp", "src/ast/parser.cpp", "src/builtins/builtins.cpp", "src/compiler/compiler.cpp", "src/compiler/compiler_utils.cpp", "src/compiler/def_class_visitor.cpp", "src/compiler/delete_stack.cpp", "src/compiler/delete_stack_stack.cpp", "src/compiler/desugaring_compiler.cpp", "src/compiler/entry_struct_func_compiler.cpp", "src/compiler/multifile_compiler.cpp", "src/compiler/type_checker.cpp", "src/file_formats/ic_tokens_file.cpp", "src/file_formats/tokens_file.cpp", "src/ic2c/ic2c.cpp", "src/ic2c/ic_ast.cpp", "src/ic2c/ic_compiler.cpp", "src/ic2c/ic_level2_parser.cpp", "src/ic2c/ic_level2_tokenizer.cpp", "src/ic2c/ic_line_splicer.cpp", "src/ic2c/ic_optimizer.cpp", "src/ic2c/ic_parser.cpp", "src/ic2c/ic_peek_ahead_iter.cpp", "src/ic2c/ic_preprocessor.cpp", "src/ic2c/ic_tokenizer.cpp", "src/ic2c/ic_trigraph_translater.cpp", "src/tokenizer/block_analyzer.cpp", "src/tokenizer/string_utils.cpp", "src/tokenizer/tokenizer.cpp", "src/utilities/annotation.cpp", "src/utilities/annotations.cpp", "src/utilities/defer_stack.cpp", "src/utilities/defer_stack_stack.cpp", "src/utilities/error_printer.cpp", "src/utilities/ykdatatype.cpp", "src/utilities/ykdt_pool.cpp", "src/utilities/ykobject.cpp"] # update_makefile.py HAMMER_CPP
 temp_out_dir="bin/hammer"
 binaries=["yaksha"]
 disable_parallel=false

diff --git a/compiler/scripts/output_tester.py b/compiler/scripts/output_tester.py
@@ -0,0 +1,73 @@
+import subprocess
+import tkinter as tk
+import tkinter.messagebox
+from tkinter.scrolledtext import ScrolledText
+import os
+
+# Timeout, in seconds, handed to proc.communicate() inside execute()
+MAX_EXECUTION_TIME_SEC = 30
+# Absolute path of the directory that contains this script
+ROOT = os.path.dirname(os.path.abspath(__file__))
+
+
+def execute(arg: str) -> (str, str, int):
+    proc = subprocess.Popen(
+        arg,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        encoding="utf-8",
+        universal_newlines=True,
+        shell=True)
+    try:
+        so, se = proc.communicate(timeout=MAX_EXECUTION_TIME_SEC)
+        return_value = proc.returncode
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        proc.communicate()
+        return "", "", -1
+    return so.strip(), se.strip(), return_value
+
+
+def get_text(w):
+    """Return what ``w.get`` yields for the range from index "1.0" to ``tkinter.END``."""
+    return w.get("1.0", tkinter.END)
+
+
+def set_text(w, txt):
+    """Replace the content of widget ``w`` with ``txt``."""
+    # Remove everything from index '1.0' to the end, then insert the new text
+    w.delete('1.0', tkinter.END)
+    w.insert(tk.INSERT, txt)
+
+
+class OutputTester:
+
+    def __init__(self):
+        self.out = None
+        self.inp = None
+        self.run_pp = None
+        self.root = None
+
+    def _run_pp(self):
+        self._check_output("temp_file_please_delete.c", "zig cc -E %TEMP%")
+
+    def _check_output(self, temp_file_name, command):
+        text = get_text(self.inp)
+        i_file = os.path.join(ROOT, temp_file_name)
+        command_f = command.replace("%TEMP%", i_file)
+        with open(i_file, "w+", encoding="utf-8") as h:
+            h.write(text)
+        so, se, r = execute(command_f)
+        set_text(self.out, so + se)
+        os.unlink(i_file)
+
+    def start(self):
+        self.root = tk.Tk()
+        self.root.title("Output tester")
+        self.root.resizable(False, False)
+        self.run_pp = tk.Button(self.root, text="Run real C preprocessor", command=self._run_pp)
+        self.inp = ScrolledText(self.root)
+        self.out = ScrolledText(self.root)
+        self.run_pp.pack()
+        self.inp.pack()
+        self.out.pack()
+        self.root.mainloop()
+
+
+# Open the tester window only when this file is executed as a script
+if __name__ == "__main__":
+    OutputTester().start()
diff --git a/compiler/src/ic2c/ic_ast.h b/compiler/src/ic2c/ic_ast.h
@@ -303,8 +303,8 @@ namespace yaksha {
     std::vector<ic_stmt *> cleanup_stmt_;
   };
   /**
-* Parameter for a user defined function declaration
-*/
+  * Parameter for a user defined function declaration
+  */
   struct ic_parameter {
     ic_token *name_;
     std::vector<ic_token *> data_type_;

diff --git a/compiler/src/ic2c/ic_line_splicer.cpp b/compiler/src/ic2c/ic_line_splicer.cpp
@@ -0,0 +1,49 @@
+// ic_line_splicer.cpp
+#include "ic_line_splicer.h"
+using namespace yaksha;
+// Wrap the trigraph translater `tt` and immediately load the first character
+// through read(), which fills current_ and consumed_extra_.
+ic_line_splicer::ic_line_splicer(ic_trigraph_translater &tt)
+    : tt_(tt), current_(0), consumed_extra_(false) {
+  read();
+}
+// Step to the next character of the spliced stream.
+void ic_line_splicer::next() {
+  // If the previous read() already pulled one extra character out of tt_,
+  // tt_ is sitting on the character to deliver now and must not move again.
+  const bool must_advance = !consumed_extra_ && !reached_end();
+  if (must_advance) { tt_.next(); }
+  read();
+}
+// End-of-input is reported exactly as the wrapped translater reports it.
+[[nodiscard]] bool ic_line_splicer::reached_end() { return tt_.reached_end(); }
+// Character selected by the most recent read().
+[[nodiscard]] uint32_t ic_line_splicer::get_current() { return current_; }
+// Column of the current character. When read() has looked one character past
+// a backslash, tt_ is one position ahead, so its column is reduced by one.
+[[nodiscard]] int ic_line_splicer::get_column() {
+  const auto underlying_column = tt_.get_column();
+  if (consumed_extra_) { return underlying_column - 1; }
+  return underlying_column;
+}
+// Line number as reported by the wrapped translater (no adjustment is applied).
+[[nodiscard]] int ic_line_splicer::get_line() { return tt_.get_line(); }
+// Defaulted destructor; tt_ is a reference and is not destroyed here.
+ic_line_splicer::~ic_line_splicer() = default;
+// Load current_ from tt_, removing a backslash that is directly followed by
+// a line ending ("\r\n", "\n", or a lone "\r").
+// Sets consumed_extra_ when a backslash was NOT followed by a line ending: in
+// that case current_ stays '\\' while tt_ already points at the character after it.
+// NOTE(review): the character exposed after a splice is not checked again, so a
+//  second backslash-newline that directly follows a splice looks like it is not
+//  spliced - confirm whether this is intended.
+void ic_line_splicer::read() {
+  consumed_extra_ = false;
+  current_ = tt_.get_current();
+  // anything other than a backslash is passed through unchanged
+  if (current_ != '\\') return;
+  // tt_ reports end of input: keep the backslash as it is
+  if (tt_.reached_end()) return;
+  // look at the character after the backslash
+  tt_.next();
+  if (tt_.get_current() == '\r' && !tt_.reached_end()) {
+    tt_.next();
+    if (tt_.get_current() == '\n') {
+      // backslash + "\r\n"
+      if (tt_.reached_end()) {
+        // tt_ reports end of input at the '\n': expose 0
+        current_ = 0;
+      } else {
+        tt_.next();
+        current_ = tt_.get_current();
+      }
+    } else {
+      // backslash + lone '\r': the character after the '\r' becomes current
+      current_ = tt_.get_current();
+    }
+  } else if (tt_.get_current() == '\n' && !tt_.reached_end()) {
+    // backslash + '\n': the character after the newline becomes current
+    tt_.next();
+    current_ = tt_.get_current();
+  } else if (tt_.reached_end()) {
+    // tt_ reports end of input right after the backslash: expose 0
+    // NOTE(review): this branch does not inspect which character follows the
+    //  backslash - verify against the reached_end() contract of the translater
+    current_ = 0;
+  } else {
+    // ordinary character after the backslash: keep '\\' as current and
+    // remember that tt_ is already one character ahead
+    consumed_extra_ = true;
+  }
+}
diff --git a/compiler/src/ic2c/ic_line_splicer.h b/compiler/src/ic2c/ic_line_splicer.h
@@ -0,0 +1,23 @@
+// ic_line_splicer.h
+#ifndef IC_LINE_SPLICER_H
+#define IC_LINE_SPLICER_H
+#include "ic_simple_character_iter.h"
+#include "ic_trigraph_translater.h"
+namespace yaksha {
+  /**
+  * Character iterator layered on top of ic_trigraph_translater.
+  * Removes a backslash that is directly followed by a line ending.
+  */
+  struct ic_line_splicer : public ic_simple_character_iter {
+    // `tt` is stored by reference, so it has to stay alive as long as this object
+    explicit ic_line_splicer(ic_trigraph_translater &tt);
+    void next() override;
+    [[nodiscard]] bool reached_end() override;
+    [[nodiscard]] uint32_t get_current() override;
+    [[nodiscard]] int get_column() override;
+    [[nodiscard]] int get_line() override;
+    ~ic_line_splicer() override;
+
+private:
+    // Loads current_ from tt_ and performs the splicing
+    void read();
+    // Underlying character source
+    ic_trigraph_translater &tt_;
+    // Character returned by get_current()
+    utf8::uint32_t current_;
+    // True when read() advanced tt_ one character past the one in current_
+    bool consumed_extra_;
+  };
+}// namespace yaksha
+#endif
diff --git a/compiler/src/ic2c/ic_parser.cpp b/compiler/src/ic2c/ic_parser.cpp
@@ -1,5 +1,150 @@
 // ic_parser.cpp
 #include "ic_parser.h"
+#include <algorithm>
+#include <cassert>
 using namespace yaksha;
-ic_parser::ic_parser() = default;
+// TODO add a dummy token for comments, so that each comment is effectively replaced by a single space.
+//  We can skip this if it is not really required; I do not think it is required, as line_splicer takes care of this!
+//  The newline after a comment is also preserved.
+// Defaulted destructor.
 ic_parser::~ic_parser() = default;
+// Set up a parser over `tokens`, using `ast_pool` to create statement nodes.
+// Parsing starts at token index 0 with an empty error list.
+ic_parser::ic_parser(std::vector<ic_token> &tokens, ic_ast_pool *ast_pool)
+    : tokens_(tokens), ast_pool_(ast_pool), errors_(), current_(0) {}
+// Parse every token up to TC_EOF, appending one entry to statements_ per
+// parsed statement. When a statement fails with ic_parsing_error the rest of
+// that line is skipped and parsing resumes on the following line.
+void ic_parser::parse() {
+  while (!is_at_end()) {
+    try {
+      statements_.emplace_back(preprocessor_statement());
+    } catch (const ic_parsing_error &) {
+      // synchronize: drop the remaining tokens of the broken line
+      while (peek()->type_ != ic_token_type::NEWLINE && !is_at_end()) {
+        advance();
+      }
+      // Drop the NEWLINE as well. The directive parsers consume their own
+      // trailing NEWLINE, so the next statement has to start after it.
+      if (check(ic_token_type::NEWLINE)) { advance(); }
+    }
+  }
+}
+// Step over the current token (unless it is TC_EOF) and return previous().
+// NOTE(review): at TC_EOF with current_ == 0 nothing is consumed and previous()
+//  hits its assert - callers appear to rely on this not happening.
+ic_token *ic_parser::advance() {
+  if (!is_at_end()) { current_++; }
+  return previous();
+}
+// Step back one token (not below index 0) and return the token now current.
+ic_token *ic_parser::recede() {
+  if (current_ != 0) { current_--; }
+  return peek();
+}
+// True when the current token is TC_EOF.
+bool ic_parser::is_at_end() { return peek()->type_ == ic_token_type::TC_EOF; }
+// Current token, without consuming it. No bounds check is performed here.
+ic_token *ic_parser::peek() { return &tokens_[current_]; }
+// Token just before the current one; asserts that at least one token precedes it.
+ic_token *ic_parser::previous() {
+  assert(current_ > 0);
+  return &tokens_[current_ - 1];
+}
+// If the current token has one of the given types, consume it and return
+// true. Types are tried in the order given; otherwise nothing is consumed.
+bool ic_parser::match(std::initializer_list<ic_token_type> types) {
+  for (const ic_token_type candidate : types) {
+    if (!check(candidate)) { continue; }
+    advance();
+    return true;
+  }
+  return false;
+}
+// True when the current token has type `t`; always false at TC_EOF.
+bool ic_parser::check(ic_token_type t) {
+  return !is_at_end() && peek()->type_ == t;
+}
+// Consume and return the current token when it has type `t`.
+// Otherwise record an error for the current token and throw it.
+ic_token *ic_parser::consume(ic_token_type t, const std::string &message) {
+  if (!check(t)) { throw error(peek(), message); }
+  return advance();
+}
+// Build an ic_parsing_error for `tok`, store a copy in errors_ and return it
+// so that the caller can throw it.
+ic_parsing_error ic_parser::error(ic_token *tok, const std::string &message) {
+  auto err = ic_parsing_error{message, tok};
+  errors_.push_back(err);
+  return err;
+}
+// Same as consume(), except that reaching TC_EOF is also accepted: in that
+// case the TC_EOF token is returned and nothing is consumed.
+ic_token *ic_parser::consume_or_eof(ic_token_type t,
+                                    const std::string &message) {
+  if (check(t)) { return advance(); }
+  if (!is_at_end()) { throw error(peek(), message); }
+  return peek();
+}
+// Parse one statement. A line that starts with '#' is dispatched on the
+// identifier that follows it; any other line is handed to code_line().
+// The identifier itself is left for the directive parser to consume.
+// Throws ic_parsing_error when '#' is not followed by an identifier or the
+// directive name is unknown. Errors go through error() so they also end up in
+// errors_; parse() catches the exception, so that list is the only place
+// where they stay visible.
+ic_stmt *ic_parser::preprocessor_statement() {
+  if (peek()->type_ != ic_token_type::HASH) { return code_line(); }
+  auto hash_t = advance();
+  if (peek()->type_ != ic_token_type::IDENTIFIER) {
+    throw error(hash_t, "Invalid token after #");
+  }
+  // bind by reference: the token outlives this call, no copy needed
+  const auto &identifier = peek()->token_;
+  if (identifier == "define") { return define_st(hash_t); }
+  if (identifier == "include") { return include_st(hash_t); }
+  if (identifier == "line") { return line_st(hash_t); }
+  if (identifier == "undef") { return undef_st(hash_t); }
+  if (identifier == "error") { return error_st(hash_t); }
+  if (identifier == "warning") { return warning_st(hash_t); }
+  if (identifier == "pragma") { return pragma_st(hash_t); }
+  if (identifier == "if" || identifier == "ifdef" || identifier == "ifndef") {
+    return if_st(hash_t);
+  }
+  throw error(hash_t, "Unknown preprocessor statement");
+}
+// Parse "#define NAME tokens..." or "#define NAME(args...) tokens...".
+// `hash_t` is the already consumed '#'; the current token is "define".
+// Returns the statement node created by ast_pool_.
+// Throws ic_parsing_error when the name, the ')' or the final NEWLINE is missing.
+// NOTE(review): macro parameters are taken as-is, their token type is not
+//  validated - confirm whether that is intended (e.g. for "...").
+ic_stmt *ic_parser::define_st(ic_token *hash_t) {
+  auto define_tok = advance();
+  if (!check(ic_token_type::IDENTIFIER)) {
+    // go through error() so that the problem is recorded in errors_
+    throw error(hash_t, "Expected a valid identifier after #define");
+  }
+  auto identifier_tok = advance();
+  if (check(ic_token_type::OPEN_PAREN)) {
+    // parse macro #define
+    auto paren_open = advance();
+    std::vector<ic_token *> args{};
+    if (!check(ic_token_type::CLOSE_PAREN)) {
+      do {
+        args.emplace_back(advance());
+      } while (match({ic_token_type::COMMA}));
+    }
+    auto paren_close = consume(ic_token_type::CLOSE_PAREN,
+                               "#define macro must have a valid ')'");
+    std::vector<ic_token *> tok_string = token_string();
+    consume(ic_token_type::NEWLINE, "new line is expected after #define");
+    return ast_pool_->ic_c_pp_define_function_stmt(
+        hash_t, define_tok, identifier_tok,
+        paren_open, args, paren_close,
+        tok_string);
+  }
+  // parse simple #define
+  std::vector<ic_token *> tok_string = token_string();
+  consume(ic_token_type::NEWLINE, "new line is expected after #define");
+  return ast_pool_->ic_c_pp_define_normal_stmt(hash_t, define_tok,
+                                               identifier_tok, tok_string);
+}
+// Parse "#include <string literal>" followed by a NEWLINE.
+// `hash_t` is the already consumed '#'; the current token is "include".
+// Throws ic_parsing_error when the path or the NEWLINE is missing.
+ic_stmt *ic_parser::include_st(ic_token *hash_t) {
+  auto include_tok = advance();
+  // the message used to mention #line (copied from line_st)
+  auto path_spec = consume(ic_token_type::STRING_LITERAL,
+                           "Expected a valid path token after #include");
+  consume(ic_token_type::NEWLINE, "New line expected after #include");
+  return ast_pool_->ic_c_pp_include_stmt(hash_t, include_tok, path_spec);
+}
+// Parse a "#line" directive: two tokens followed by a NEWLINE.
+// `hash_t` is the already consumed '#'; the current token is "line".
+// Throws ic_parsing_error when one of the expected tokens is missing.
+ic_stmt *ic_parser::line_st(ic_token *hash_t) {
+  auto line_tok = advance();
+  // NOTE(review): the message talks about an integer constant, yet the token
+  //  type required here is STRING_LITERAL - looks like a copy/paste slip, confirm
+  auto integer_const_tok = consume(ic_token_type::STRING_LITERAL,
+                                   "Expected an integer constant after #line");
+  auto path_tok = consume(ic_token_type::STRING_LITERAL,
+                          "Expected a valid path token after #line");
+  consume(ic_token_type::NEWLINE, "New line expected after #line");
+  return ast_pool_->ic_c_pp_line_stmt(hash_t, line_tok, integer_const_tok,
+                                      path_tok);
+}
+// Parse "#undef NAME" followed by a NEWLINE.
+// `hash_t` is the already consumed '#'; the current token is "undef".
+// Throws ic_parsing_error when the identifier or the NEWLINE is missing.
+ic_stmt *ic_parser::undef_st(ic_token *hash_t) {
+  ic_token *directive = advance();
+  ic_token *macro_name =
+      consume(ic_token_type::IDENTIFIER, "Expected an identifier after #undef");
+  consume(ic_token_type::NEWLINE, "New line expected after #undef");
+  return ast_pool_->ic_c_pp_undef_stmt(hash_t, directive, macro_name);
+}
+// Not implemented yet: returns nullptr and consumes no tokens.
+ic_stmt *ic_parser::error_st(ic_token *hash_t) { return nullptr; }
+// Not implemented yet: returns nullptr and consumes no tokens.
+ic_stmt *ic_parser::warning_st(ic_token *hash_t) { return nullptr; }
+// Not implemented yet: returns nullptr and consumes no tokens.
+ic_stmt *ic_parser::pragma_st(ic_token *hash_t) { return nullptr; }
+// Not implemented yet: returns nullptr and consumes no tokens.
+// NOTE(review): parse() only stops at TC_EOF, so a line that does not start
+//  with '#' makes no progress while this is a stub.
+ic_stmt *ic_parser::code_line() { return nullptr; }
+// Not implemented yet: returns nullptr and consumes no tokens.
+ic_stmt *ic_parser::block(ic_token *hash_t) { return nullptr; }
+// Not implemented yet: returns nullptr and consumes no tokens.
+ic_stmt *ic_parser::if_st(ic_token *hash_t) { return nullptr; }
+// Not implemented yet: always returns an empty vector and consumes no tokens.
+std::vector<ic_token *> ic_parser::token_string() {
+  return std::vector<ic_token *>{};
+}