Implement translation phase 2 #7

Open · wants to merge 2 commits into develop
85 changes: 74 additions & 11 deletions src/lexer/mod.rs
@@ -24,6 +24,11 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
// we had an error or not after lexing is complete
let mut had_error = false;

// This keeps track of whether we are in a multi-line comment, which will have to be removed at this
// stage because it becomes a burden if it must be removed at a later stage, as it does not
// affect any of the actual code

Review comment: nit: long sentence

let mut multi_comment_start: Option<PToken> = None;

while let Some(kind) = lexer.next() {
// Gets the slice of the source code that the current token is from
let slice = lexer.slice();
@@ -36,20 +41,53 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
end: index + slice.len(),
};

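// Track block comment delimiters by hand: a `/*` inside an open block comment
// only warns (block comments do not nest), a `*/` with no open comment is an
// error, and everything inside a block comment is discarded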
if token.kind == PTokenKind::CommentMultiStart {
if let Some(comment_start) = multi_comment_start {
session
.struct_span_warn(comment_start.into(), "`/*` within block comment")
.note("block comments cannot be nested")
.emit();
} else {
multi_comment_start = Some(token);
}
} else if token.kind == PTokenKind::CommentMultiEnd {
if multi_comment_start.is_some() {
multi_comment_start = None;
} else {
session
.struct_error("unexpected token `*/`")
.span_label(token.into(), "lone block comment terminator")
.emit();

had_error = true;
}
} else if multi_comment_start.is_none() {
if token.kind == PTokenKind::ErrorGeneric {
let text = session.span_to_string(&token.into()).unwrap();

session
.struct_error(format!("error lexing token `{}`", text))
.span_label(token.into(), "invalid token found")
.emit();

had_error = true;
}

tokens.push(token);
}

index += slice.len();
}

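// If we run out of tokens while still inside a block comment, report where it began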
if let Some(comment_start) = multi_comment_start {
session
.struct_span_error(
comment_start.into(),
"Unterminated block comment begins here",
)
.emit();

had_error = true;
}

if !had_error {
@@ -212,7 +250,7 @@ mod tests {
#[test]
fn lex_punctuators() {
let (sess, src) = dummy_sess(
r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%:"#,
r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%: \"#,
);

let input = super::lex(&sess, src.clone()).unwrap();
@@ -272,6 +310,31 @@ mod tests {
(PTokenKind::Punctuator, "%>"),
(PTokenKind::Punctuator, "%:"),
(PTokenKind::Punctuator, "%:%:"),
(PTokenKind::Backslash, "\\"),
];

check_matches(src, input, reference);
}

#[test]
fn lex_comments() {
let (sess, src) = dummy_sess(
r#"// This is a single line comment
/*
* This is a multi-line comment
*/"#,
);

let input = super::lex(&sess, src.clone()).unwrap();

// NOTE: Multi-line comments are stripped during lexing, and therefore should not show up
// here
let reference = vec![
(
PTokenKind::CommentSingle,
r#"// This is a single line comment"#,
),
(PTokenKind::Newline, "\n"),
];

check_matches(src, input, reference);
10 changes: 8 additions & 2 deletions src/lexer/token.rs
@@ -66,8 +66,14 @@ pub enum PTokenKind {
CommentSingle,

/// The start of a multi-line comment
#[regex(r"/\*")]
CommentMultiStart,

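/// The end of a multi-line comment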
#[regex(r"\*/")]
CommentMultiEnd,

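/// A backslash, kept as its own token so that phase 2 can splice lines at backslash-newline pairs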
#[token("\\")]
Backslash,

/// Any non-newline whitespace, which we can't skip for the single reason that preprocessor
/// operations
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,6 +1,7 @@
#![allow(clippy::result_unit_err)]
pub mod diagnostic;
pub mod lexer;
pub mod preprocessor;

#[cfg(test)]
mod tests {
9 changes: 7 additions & 2 deletions src/main.rs
@@ -1,6 +1,7 @@
use sacc::{
diagnostic::{session::Session, Handler, HandlerFlags, SourceManager},
lexer::lex,
preprocessor::phase2::phase2,
};
use std::{path::Path, process::exit, rc::Rc};

@@ -21,9 +22,13 @@ fn main() {

match session.load_file(path) {
Ok(root_src) => {
// Lex tokens from our main source
if let Ok(tokens) = lex(&session, root_src) {
for token in tokens.iter() {
println!("{:?}", token);
// Run phase 2 of translation, which strips comments and backslash-newline pairs
if let Ok(tokens) = phase2(tokens, &session) {
for token in tokens.iter() {
println!("{:?}", token);
}
}
}
}
1 change: 1 addition & 0 deletions src/preprocessor/mod.rs
@@ -0,0 +1 @@
pub mod phase2;
73 changes: 73 additions & 0 deletions src/preprocessor/phase2.rs
@@ -0,0 +1,73 @@
use crate::{
diagnostic::session::Session,
lexer::{PToken, PTokenKind},
};

/// Phase 1 according to the C specification is the replacement of trigraph sequences. Because of
/// the nature of preprocessing tokens, and a distaste for looping through every character before
/// the input reaches the lexer, that phase is postponed, as it validly can be. Therefore phase 2
/// comes first.
///
/// According to the C specification, phase 2 consists of:
///
/// Each instance of a backslash character ( \) immediately followed by a new-line
/// character is deleted, splicing physical source lines to form logical source lines.
/// Only the last backslash on any physical source line shall be eligible for being part
/// of such a splice. A source file that is not empty shall end in a new-line character,
/// which shall not be immediately preceded by a backslash character before any such
/// splicing takes place.
///
/// Therefore this function deletes each backslash-newline pair, splicing physical lines into
/// logical ones. Because comments have no effect on the code generated from C, they are also
/// stripped here.
///
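/// For example, the two physical source lines
///
///     return \
///     0;
///
/// are spliced into the single logical line `return 0;`.
///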
pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()> {
let mut new_tokens = Vec::with_capacity(tokens.capacity());

let mut backslash: Option<PToken> = None;
let mut has_error = false;

for token in tokens {
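// A backslash must be followed by a newline to form a splice; intervening
// whitespace only warns, while any other token is an error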
if backslash.is_some() {
if token.kind == PTokenKind::Newline {
backslash = None;
} else if token.kind == PTokenKind::Whitespace {
session
.struct_span_warn(token.into(), "whitespace before newline after `\\`")
.emit();
} else {
// At this point we don't have to worry about other files being included in the
// token stream
let s = session.span_to_string(&token.into()).unwrap();

session
.struct_error(format!("found unexpected token `{}`", s))
.span_label(token.into(), "expected newline after `\\`, found this")
.emit();

// We can keep going, in case the same mistake is made again later
has_error = true;
backslash = None;
}
} else if token.kind != PTokenKind::CommentSingle {
if token.kind == PTokenKind::Backslash {
backslash = Some(token);
} else {
new_tokens.push(token);
}
}
}

if let Some(backslash) = backslash {
session
.struct_error("unexpected end of file")
.span_label(backslash.into(), "after backslash")
.emit();
has_error = true;
}

if has_error {
Err(())
} else {
Ok(new_tokens)
}
}
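
A minimal sketch of how phase 2 could be exercised in a unit test, in the style of the lexer tests above. The wiring is hypothetical: it assumes the `dummy_sess` and `check_matches` helpers from the lexer test module are reachable here.

```rust
#[cfg(test)]
mod tests {
    use super::phase2;
    use crate::lexer::{lex, PTokenKind};

    // Hypothetical sketch, not part of the diff: dummy_sess and check_matches
    // are assumed to be shared with the lexer tests
    #[test]
    fn splices_backslash_newline() {
        let (sess, src) = dummy_sess("; \\\n;");

        let tokens = lex(&sess, src.clone()).unwrap();
        let tokens = phase2(tokens, &sess).unwrap();

        // The backslash and the newline after it are deleted, leaving a
        // single logical line of two punctuators
        let reference = vec![
            (PTokenKind::Punctuator, ";"),
            (PTokenKind::Whitespace, " "),
            (PTokenKind::Punctuator, ";"),
        ];

        check_matches(src, tokens, reference);
    }
}
```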
9 changes: 8 additions & 1 deletion test.c
@@ -3,5 +3,12 @@
int main() {
printf("Hello world");

// Single-line comment

/*
* $$$ test $$$
*/

return \
0;
}