Skip to content

Commit

Permalink
feat: parse email using dfa
Browse files Browse the repository at this point in the history
  • Loading branch information
bingxueshuang committed Sep 22, 2023
1 parent a366bf0 commit 55b31ff
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 39 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ authors = ["Manojna <[email protected]>"]
license = "MIT"

[dependencies]
thiserror = "1.0.48"
204 changes: 204 additions & 0 deletions src/fsm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
//!
//! --- Transition diagram or transition table ---

use std::str::Chars;

/// FSM is an abstraction over the behavior of a deterministic finite automaton (DFA). A DFA has
/// a set of states (generic type `S`) and an alphabet (all possible input symbols). One of the
/// states is the start state (see [`FSM::start`]). A transition takes the DFA from one state to
/// another by consuming a symbol. If the input is completely consumed and the DFA is in a final
/// (accepting) state, then the input belongs to the language accepted by the DFA.
pub trait FSM<S> {
/// Type of a single input symbol consumed by one transition.
type Symbol;
/// Returns the state reached from `state` after consuming `symbol`.
fn transition(state: S, symbol: Self::Symbol) -> S;
/// Returns `true` if `state` is a final (accepting) state.
fn is_final(state: &S) -> bool;
/// Returns the start state of the automaton.
fn start() -> S;
}

/// The set of possible states in a DFA that represents a language accepting all valid email
/// addresses. [State::Error] is a dead state (or trap state).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum State {
    /// Start state; no input has been consumed yet.
    AddrSpec,
    /// At least one `atext` character of the current local-part atom has been consumed.
    LocalAtom,
    /// Inside a quoted local part (after the opening quote, quoted text or an escaped character).
    LocalQText,
    /// A `.` was consumed after a local-part atom; an `atext` character must follow.
    LocalDot,
    /// A backslash was consumed inside a quoted local part; an escapable character must follow.
    LocalEscape,
    /// The closing quote of a quoted local part was consumed; only `@` may follow.
    LocalQString,
    /// The `@` separator was consumed; the domain must follow.
    LocalPart,
    /// At least one `atext` character of the current domain atom has been consumed (accepting).
    DomainAtom,
    /// Inside a bracketed domain literal (after `[` or some `dtext`).
    DomainDText,
    /// A `.` was consumed after a domain atom; an `atext` character must follow.
    DomainDot,
    /// The closing `]` of a domain literal was consumed (accepting); no further input is allowed.
    DomainLiteral,
    /// Trap state entered on any invalid symbol; it is never left.
    Error,
}

/// Certain symbols and groups of symbols useful for determining transition rules.
impl State {
    const DQUOTE: char = '"';
    const DOT: char = '.';
    const BACKSLASH: char = '\\';
    const AT: char = '@';
    const OPEN_BRACKET: char = '[';
    const CLOSE_BRACKET: char = ']';

    /// Returns `true` if `c` is an `atext` character: an ASCII letter or digit, or one of
    /// ``! # $ % & ' * + - / = ? ^ _ ` { | } ~``.
    fn is_atext(c: char) -> bool {
        c.is_ascii_alphanumeric()
            || matches!(
                c,
                '!' | '#'
                    | '$'
                    | '%'
                    | '&'
                    | '\''
                    | '*'
                    | '+'
                    | '-'
                    | '/'
                    | '='
                    | '?'
                    | '^'
                    | '_'
                    | '`'
                    | '{'
                    | '|' // part of RFC 5322 atext; previously missing from the list
                    | '}'
                    | '~'
            )
    }

    /// Returns `true` if `c` is a `qtext` character: printable ASCII except `"` and `\`
    /// (code points 33, 35-91 and 93-126).
    fn is_qtext(c: char) -> bool {
        matches!(c, '!' | '#'..='[' | ']'..='~')
    }

    /// Returns `true` if `c` is a `dtext` character: printable ASCII except `[`, `\` and `]`
    /// (code points 33-90 and 94-126).
    fn is_dtext(c: char) -> bool {
        // The two ranges are alternatives. Requiring both at once (as the previous `&&` did)
        // can never hold, which rejected every character inside a domain literal.
        matches!(c, '!'..='Z' | '^'..='~')
    }

    /// Returns `true` if `c` may follow a backslash in a quoted string: any visible ASCII
    /// character (0x21-0x7E), a space or a horizontal tab.
    fn is_escape(c: char) -> bool {
        matches!(c, '!'..='~' | ' ' | '\t')
    }
}

/// [State] doubles as its own DFA definition: the enum variants are the states and this impl
/// supplies the transition rules, the accepting states and the start state ([State::AddrSpec])
/// of an automaton that accepts valid email addresses.
impl FSM<State> for State {
    type Symbol = char;

    /// Looks up the successor of `state` for the input character `c`. Every `(state, symbol)`
    /// pair that is not listed below leads to the trap state [State::Error].
    fn transition(state: Self, c: char) -> State {
        match (state, c) {
            // Start of the address: a quote opens a quoted local part, atext starts an atom.
            (Self::AddrSpec, Self::DQUOTE) => Self::LocalQText,
            (Self::AddrSpec, sym) if Self::is_atext(sym) => Self::LocalAtom,

            // Dot-atom local part.
            (Self::LocalAtom, Self::DOT) => Self::LocalDot,
            (Self::LocalAtom, Self::AT) => Self::LocalPart,
            (Self::LocalAtom, sym) if Self::is_atext(sym) => Self::LocalAtom,
            (Self::LocalDot, sym) if Self::is_atext(sym) => Self::LocalAtom,

            // Quoted-string local part.
            (Self::LocalQText, Self::BACKSLASH) => Self::LocalEscape,
            (Self::LocalQText, Self::DQUOTE) => Self::LocalQString,
            (Self::LocalQText, sym) if Self::is_qtext(sym) => Self::LocalQText,
            (Self::LocalEscape, sym) if Self::is_escape(sym) => Self::LocalQText,
            (Self::LocalQString, Self::AT) => Self::LocalPart,

            // Directly after '@': a bracket opens a domain literal, atext starts a domain atom.
            (Self::LocalPart, Self::OPEN_BRACKET) => Self::DomainDText,
            (Self::LocalPart, sym) if Self::is_atext(sym) => Self::DomainAtom,

            // Dot-atom domain.
            (Self::DomainAtom, Self::DOT) => Self::DomainDot,
            (Self::DomainAtom, sym) if Self::is_atext(sym) => Self::DomainAtom,
            (Self::DomainDot, sym) if Self::is_atext(sym) => Self::DomainAtom,

            // Domain literal.
            (Self::DomainDText, Self::CLOSE_BRACKET) => Self::DomainLiteral,
            (Self::DomainDText, sym) if Self::is_dtext(sym) => Self::DomainDText,

            // Anything else, including any input after a closed domain literal and any input
            // once the trap state has been reached.
            _ => Self::Error,
        }
    }

    /// Only a completed domain atom or a closed domain literal is accepting.
    fn is_final(state: &Self) -> bool {
        matches!(state, Self::DomainLiteral | Self::DomainAtom)
    }

    /// The automaton starts in [State::AddrSpec].
    fn start() -> Self {
        Self::AddrSpec
    }
}

/// Iterator over the states visited by the DFA for the input held by a [Machine]. Each step
/// consumes one input symbol (from the input iterator) and moves the DFA to the corresponding
/// state. The iterator is exhausted when the input iterator is exhausted. If an invalid character
/// or invalid email address syntax is encountered, the transition is to [State::Error]; every
/// further symbol is still consumed but the DFA remains in [State::Error]. Thus, parse errors
/// can be detected just by inspecting the state of the DFA once the input is exhausted.
///
/// The last state of the DFA can be obtained via the [Iterator::last] method. If it is [None],
/// the input was empty and parsing failed. Otherwise there was at least one transition and hence
/// some last state; parsing succeeded if and only if that state is an accepting state.
pub struct MachineIterator<'a> {
/// Remaining input characters that have not been consumed yet.
input: Chars<'a>,
/// Current state of the DFA (the state reached by the most recent transition).
state: State,
}

/// Drives the DFA one input character at a time. The iterator remembers the current state, so
/// each yielded state is a function of the previous state and the character just consumed,
/// according to the transition rules of [State].
impl<'a> Iterator for MachineIterator<'a> {
    type Item = State;

    /// Consumes the next input character, advances the stored state and yields it. Returns
    /// [None] once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.input.next().map(|symbol| {
            self.state = State::transition(self.state, symbol);
            self.state
        })
    }
}

/// Machine is the core export of the module. It is an [IntoIterator]; consuming the resulting
/// iterator and checking its last state determines whether the given string is a valid email
/// address or not.
pub struct Machine<'a> {
/// The string to be run through the DFA.
input: &'a str,
}

impl<'a> Machine<'a> {
pub fn new(s: &'a str) -> Self {
Machine { input: s }
}
}

impl<'a> IntoIterator for Machine<'a> {
type Item = State;
type IntoIter = MachineIterator<'a>;
fn into_iter(self) -> Self::IntoIter {
MachineIterator {
state: State::AddrSpec,
input: self.input.chars(),
}
}
}
95 changes: 56 additions & 39 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! ## Grammar
//! Parser for email address (`addr-spec`) as defined in Section 3.4.1 of [`RFC5322`].
//! This crate implements only a subset of the grammar and does not support folding white space
//! and comments in email address. Also, the grammar rules that are defined to preserve backwards
Expand Down Expand Up @@ -58,48 +59,64 @@
//! ```
//!
//! [`RFC5322`]: https://datatracker.ietf.org/doc/html/rfc5322#section-3.4.1
//!
//! ## Finite State Machine
//!
//! The above grammar defines a regular language, so we do not need to construct a separate lexer
//! and parser. An email address as defined above can be recognized by a finite automaton (a
//! regular expression would work as well). In this crate, we construct a finite state machine
//! (module `fsm`) and either parse the given string into an email address or fail with an error.
//!
//! ```
//! use email_parser::Email;
//! let email: Email = "[email protected]".parse().unwrap();
//! ```

use crate::fsm::{State, FSM};
use std::fmt::{Display, Formatter};
use std::str::FromStr;
use thiserror::Error;

mod terminal {
pub struct At;
pub struct OpenBracket;
pub struct CloseBracket;
pub struct DText(char);
pub struct AText(char);
pub struct Specials(char);
pub struct Backslash;
pub struct Dot;
pub struct Escape(char);
pub struct QText(char);
pub struct DQuote;
/// Email parsing errors.
#[derive(Error, Debug, Clone)]
pub enum Error {
/// The input string was empty, so the state machine made no transition at all.
#[error("cannot parse empty email id")]
EmptyEmail,
/// The input was consumed but the state machine did not end in an accepting state.
#[error("invalid RFC5322 formatted email id")]
InvalidEmail,
}

mod non_terminal {
use super::terminal;
pub struct AddrSpec(pub LocalPart, pub terminal::At, pub Domain);
pub enum LocalPart {
DotAtom(DotAtom),
QuotedString(QuotedString),
}
pub enum Domain {
DotAtom(DotAtom),
DomainLiteral(DomainLiteral),
/// Email parsing is accomplished using a finite state machine. FSM is defined in this module.
/// Finite automaton has several states and transitions. When iterator is completely consumed, if
/// the state is a final state, then given string is valid email address.
mod fsm;

/// This is the core of the crate. Defines the email address type, which can be constructed by
/// parsing a string. As long as it is constructed through parsing, the email address is valid.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived so that parse results can be inspected,
/// compared in assertions and used with `Result::unwrap_err` / `Result::expect_err`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Email {
    /// The part of the address before the separating `@`.
    local: String,
    /// The part of the address after the separating `@`.
    domain: String,
}

/// Support parsing from string literal.
impl FromStr for Email {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let m = fsm::Machine::new(s);
let ref state = m.into_iter().last().ok_or(Error::EmptyEmail)?;
let (one, two) = State::is_final(state)
.then(|| s.split_once('@').unwrap())
.ok_or(Error::InvalidEmail)?;
Ok(Self {
local: one.to_owned(),
domain: two.to_owned(),
})
}
pub struct DomainLiteral(
terminal::OpenBracket,
Vec<terminal::DText>,
terminal::CloseBracket,
);
pub struct Atom(pub Vec<terminal::AText>);
pub struct DotAtomLiteral(pub terminal::Dot, pub Atom);
pub struct DotAtom(pub Atom, pub Vec<DotAtomLiteral>);
pub struct QuotedPair(pub terminal::Backslash, pub terminal::Escape);
pub enum QContent {
QText(terminal::QText),
QuotedPair(QuotedPair),
}

/// Support formatted output.
impl Display for Email {
/// Writes the address as `local@domain`.
///
/// No trailing newline is emitted, so the output of `to_string()` can be embedded in other
/// text and parsed back into an `Email`.
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
    write!(f, "{}@{}", self.local, self.domain)
}
pub struct QuotedString(
pub terminal::DQuote,
pub Vec<QContent>,
pub terminal::DQuote,
);
}

0 comments on commit 55b31ff

Please sign in to comment.