Implement translation phase 2 #7

Open · wants to merge 2 commits into develop
85 changes: 74 additions & 11 deletions src/lexer/mod.rs
@@ -24,6 +24,11 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
// we had an error or not after lexing is complete
let mut had_error = false;

// This keeps track of whether we are in a multi-line comment, which will have to be removed at this
// stage because it becomes a burden if it must be removed at a later stage, as it does not
// affect any of the actual code

Review comment: nit: long sentence

let mut multi_comment_start: Option<PToken> = None;

while let Some(kind) = lexer.next() {
// Gets the slice of the source code that the current token is from
let slice = lexer.slice();
@@ -36,20 +41,53 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
end: index + slice.len(),
};

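// Track block comment delimiters by hand: a `/*` inside an open block comment
// only warns (block comments do not nest), a `*/` with no open comment is an
// error, and everything inside a block comment is discarded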
if token.kind == PTokenKind::CommentMultiStart {
if let Some(comment_start) = multi_comment_start {
session
.struct_span_warn(comment_start.into(), "`/*` within block comment")
.note("block comments cannot be nested")
.emit();
} else {
multi_comment_start = Some(token);
}
} else if token.kind == PTokenKind::CommentMultiEnd {
if multi_comment_start.is_some() {
multi_comment_start = None;
} else {
session
.struct_error("unexpected token `*/`")
.span_label(token.into(), "lone block comment terminator")
.emit();

had_error = true;
}
} else if multi_comment_start.is_none() {
if token.kind == PTokenKind::ErrorGeneric {
let text = session.span_to_string(&token.into()).unwrap();

session
.struct_error(format!("error lexing token `{}`", text))
.span_label(token.into(), "invalid token found")
.emit();

had_error = true;
}

tokens.push(token);
}

index += slice.len();
}

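// If we run out of tokens while still inside a block comment, report where it began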
if let Some(comment_start) = multi_comment_start {
session
.struct_span_error(
comment_start.into(),
"Unterminated block comment begins here",
)
.emit();

had_error = true;
}

if !had_error {
@@ -212,7 +250,7 @@ mod tests {
#[test]
fn lex_punctuators() {
let (sess, src) = dummy_sess(
r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%:"#,
r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%: \"#,
);

let input = super::lex(&sess, src.clone()).unwrap();
@@ -272,6 +310,31 @@ mod tests {
(PTokenKind::Punctuator, "%>"),
(PTokenKind::Punctuator, "%:"),
(PTokenKind::Punctuator, "%:%:"),
(PTokenKind::Backslash, "\\"),
];

check_matches(src, input, reference);
}

#[test]
fn lex_comments() {
let (sess, src) = dummy_sess(
r#"// This is a single line comment
/*
* This is a multi-line comment
*/"#,
);

let input = super::lex(&sess, src.clone()).unwrap();

// NOTE: Multi-line comments are stripped during lexing, and therefore should not show up
// here
let reference = vec![
(
PTokenKind::CommentSingle,
r#"// This is a single line comment"#,
),
(PTokenKind::Newline, "\n"),
];

check_matches(src, input, reference);
10 changes: 8 additions & 2 deletions src/lexer/token.rs
@@ -66,8 +66,14 @@ pub enum PTokenKind {
CommentSingle,

/// The start of a multi-line comment
#[regex(r"/\*")]
CommentMultiStart,

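/// The end of a multi-line comment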
#[regex(r"\*/")]
CommentMultiEnd,

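/// A backslash, kept as its own token so that phase 2 can splice lines at backslash-newline pairs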
#[token("\\")]
Backslash,

/// Any non-newline whitespace, which we can't skip for the single reason that preprocessor
/// operations
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,6 +1,7 @@
#![allow(clippy::result_unit_err)]
pub mod diagnostic;
pub mod lexer;
pub mod preprocessor;

#[cfg(test)]
mod tests {
9 changes: 7 additions & 2 deletions src/main.rs
@@ -1,6 +1,7 @@
use sacc::{
diagnostic::{session::Session, Handler, HandlerFlags, SourceManager},
lexer::lex,
preprocessor::phase2::phase2,
};
use std::{path::Path, process::exit, rc::Rc};

@@ -21,9 +22,13 @@ fn main() {

match session.load_file(path) {
Ok(root_src) => {
// Lex tokens from our main source
if let Ok(tokens) = lex(&session, root_src) {
for token in tokens.iter() {
println!("{:?}", token);
// Run phase 2 of translation, which strips comments and backslash-newline pairs
if let Ok(tokens) = phase2(tokens, &session) {
for token in tokens.iter() {
println!("{:?}", token);
}
}
}
}
1 change: 1 addition & 0 deletions src/preprocessor/mod.rs
@@ -0,0 +1 @@
pub mod phase2;
73 changes: 73 additions & 0 deletions src/preprocessor/phase2.rs
@@ -0,0 +1,73 @@
use crate::{
diagnostic::session::Session,
lexer::{PToken, PTokenKind},
};

/// Phase 1 according to the C specification is the replacement of trigraph sequences. Because of
/// the nature of preprocessing tokens, and a distaste for looping through every character before
/// the input reaches the lexer, that phase is postponed, as it validly can be. Therefore phase 2
/// comes first.
///
/// According to the C specification, phase 2 consists of:
///
/// Each instance of a backslash character ( \) immediately followed by a new-line
/// character is deleted, splicing physical source lines to form logical source lines.
/// Only the last backslash on any physical source line shall be eligible for being part
/// of such a splice. A source file that is not empty shall end in a new-line character,
/// which shall not be immediately preceded by a backslash character before any such
/// splicing takes place.
///
/// Therefore this function deletes each backslash-newline pair, splicing physical lines into
/// logical ones. Because comments have no effect on the code generated from C, they are also
/// stripped here.
///
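/// For example, the two physical source lines
///
///     return \
///     0;
///
/// are spliced into the single logical line `return 0;`.
///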
pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()> {
let mut new_tokens = Vec::with_capacity(tokens.capacity());

let mut backslash: Option<PToken> = None;
let mut has_error = false;

for token in tokens {
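// A backslash must be followed by a newline to form a splice; intervening
// whitespace only warns, while any other token is an error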
if backslash.is_some() {
if token.kind == PTokenKind::Newline {
backslash = None;
} else if token.kind == PTokenKind::Whitespace {
session
.struct_span_warn(token.into(), "whitespace before newline after `\\`")
.emit();
} else {
// At this point we don't have to worry about other files being included in the
// token stream
let s = session.span_to_string(&token.into()).unwrap();

session
.struct_error(format!("found unexpected token `{}`", s))
.span_label(token.into(), "expected newline after `\\`, found this")
.emit();

// We can keep going, in case the same mistake is made again later
has_error = true;
backslash = None;
}
} else if token.kind != PTokenKind::CommentSingle {
if token.kind == PTokenKind::Backslash {
backslash = Some(token);
} else {
new_tokens.push(token);
}
}
}

if let Some(backslash) = backslash {
session
.struct_error("unexpected end of file")
.span_label(backslash.into(), "after backslash")
.emit();
has_error = true;
}

if has_error {
Err(())
} else {
Ok(new_tokens)
}
}
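
A minimal sketch of how phase 2 could be exercised in a unit test, in the style of the lexer tests above. The wiring is hypothetical: it assumes the `dummy_sess` and `check_matches` helpers from the lexer test module are reachable here.

```rust
#[cfg(test)]
mod tests {
    use super::phase2;
    use crate::lexer::{lex, PTokenKind};

    // Hypothetical sketch, not part of the diff: dummy_sess and check_matches
    // are assumed to be shared with the lexer tests
    #[test]
    fn splices_backslash_newline() {
        let (sess, src) = dummy_sess("; \\\n;");

        let tokens = lex(&sess, src.clone()).unwrap();
        let tokens = phase2(tokens, &sess).unwrap();

        // The backslash and the newline after it are deleted, leaving a
        // single logical line of two punctuators
        let reference = vec![
            (PTokenKind::Punctuator, ";"),
            (PTokenKind::Whitespace, " "),
            (PTokenKind::Punctuator, ";"),
        ];

        check_matches(src, tokens, reference);
    }
}
```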
9 changes: 8 additions & 1 deletion test.c
@@ -3,5 +3,12 @@
int main() {
printf("Hello world");

// Single-line comment

/*
* $$$ test $$$
*/

return \
0;
}