-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a366bf0
commit 55b31ff
Showing
3 changed files
with
261 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,3 +7,4 @@ authors = ["Manojna <[email protected]>"] | |
license = "MIT" | ||
|
||
[dependencies] | ||
thiserror = "1.0.48" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
//! | ||
//! --- Transition diagram or transition table --- | ||
|
||
use std::str::Chars; | ||
|
||
/// Behaviour shared by deterministic finite automata (DFA).
///
/// A DFA is described by a set of states (the generic type `S`) and an alphabet of input symbols
/// ([`FSM::Symbol`]). It begins in the state returned by [`FSM::start`]; every call to
/// [`FSM::transition`] consumes one symbol and moves the automaton to its next state. Once the
/// whole input has been consumed, the input belongs to the language accepted by the DFA exactly
/// when the automaton rests in a state for which [`FSM::is_final`] holds (an accepting state).
pub trait FSM<S> {
    /// A single symbol of the input alphabet.
    type Symbol;

    /// Returns the state in which the automaton begins.
    fn start() -> S;

    /// Consumes `symbol` while in `state` and returns the resulting state.
    fn transition(state: S, symbol: Self::Symbol) -> S;

    /// Reports whether `state` is a final (accepting) state.
    fn is_final(state: &S) -> bool;
}
|
||
/// The set of possible states in a DFA that represents a language accepting all valid email
/// addresses. [State::Error] is a dead state (or trap state).
#[derive(Clone, Debug, Copy, PartialEq, Eq)]
pub enum State {
    /// Start state: no input has been consumed yet.
    AddrSpec,
    /// Inside an unquoted atom of the local part.
    LocalAtom,
    /// Inside a quoted local part, after the opening `"` (or after an escaped character).
    LocalQText,
    /// A `.` was just consumed in an unquoted local part; an `atext` symbol must follow.
    LocalDot,
    /// A `\` was just consumed inside a quoted local part; the escaped symbol must follow.
    LocalEscape,
    /// The closing `"` of a quoted local part was consumed; only `@` may follow.
    LocalQString,
    /// The `@` separator was consumed; the local part is complete.
    LocalPart,
    /// Inside an atom of the domain. This is an accepting state.
    DomainAtom,
    /// Inside a bracketed domain literal, after the opening `[`.
    DomainDText,
    /// A `.` was just consumed in the domain; an `atext` symbol must follow.
    DomainDot,
    /// The closing `]` of a domain literal was consumed. This is an accepting state.
    DomainLiteral,
    /// Dead state entered on any symbol that has no valid transition.
    Error,
}

/// Certain symbols and groups of symbols useful for determining transition rules.
impl State {
    const DQUOTE: char = '"';
    const DOT: char = '.';
    const BACKSLASH: char = '\\';
    const AT: char = '@';
    const OPEN_BRACKET: char = '[';
    const CLOSE_BRACKET: char = ']';

    /// Returns `true` if `c` is an `atext` symbol (RFC 5322, section 3.2.3): an ASCII letter or
    /// digit, or one of the printable symbols ``! # $ % & ' * + - / = ? ^ _ ` { | } ~``.
    fn is_atext(c: char) -> bool {
        c.is_ascii_alphanumeric()
            || matches!(
                c,
                '!' | '#'
                    | '$'
                    | '%'
                    | '&'
                    | '\''
                    | '*'
                    | '+'
                    | '-'
                    | '/'
                    | '='
                    | '?'
                    | '^'
                    | '_'
                    | '`'
                    | '{'
                    | '|' // listed by RFC 5322 but previously missing here
                    | '}'
                    | '~'
            )
    }

    /// Returns `true` if `c` is a `qtext` symbol: printable US-ASCII (33..=126) except `"` (34)
    /// and `\` (92).
    fn is_qtext(c: char) -> bool {
        matches!(c, '!' | '#'..='[' | ']'..='~')
    }

    /// Returns `true` if `c` is a `dtext` symbol: printable US-ASCII in 33..=90 or 94..=126,
    /// i.e. everything printable except `[`, `\` and `]`.
    fn is_dtext(c: char) -> bool {
        // The two ranges are disjoint, so they must be alternatives. Requiring both at once
        // rejected every symbol and made any non-empty domain literal unparseable.
        matches!(c, '!'..='Z' | '^'..='~')
    }

    /// Returns `true` if `c` may follow a backslash in a quoted pair: any visible US-ASCII
    /// character (VCHAR, 0x21..=0x7E), a space or a horizontal tab.
    fn is_escape(c: char) -> bool {
        matches!(c, '!'..='~' | ' ' | '\t')
    }
}
|
||
/// State implements FSM and defines a DFA for language accepting all valid email addresses. | ||
/// The set of states in DFA is itself. All transitions are defined in the implementation itself. | ||
/// Start state of DFA is [State::AddrSpec]. | ||
impl FSM<State> for State { | ||
type Symbol = char; | ||
fn transition(state: Self, c: char) -> State { | ||
match state { | ||
Self::AddrSpec => match c { | ||
Self::DQUOTE => State::LocalQText, | ||
c if Self::is_atext(c) => State::LocalAtom, | ||
_ => State::Error, | ||
}, | ||
Self::LocalAtom => match c { | ||
Self::DOT => State::LocalDot, | ||
Self::AT => State::LocalPart, | ||
c if Self::is_atext(c) => State::LocalAtom, | ||
_ => State::Error, | ||
}, | ||
Self::LocalQText => match c { | ||
Self::BACKSLASH => State::LocalEscape, | ||
Self::DQUOTE => State::LocalQString, | ||
c if Self::is_qtext(c) => State::LocalQText, | ||
_ => State::Error, | ||
}, | ||
Self::LocalDot => match c { | ||
c if Self::is_atext(c) => State::LocalAtom, | ||
_ => State::Error, | ||
}, | ||
Self::LocalEscape => match c { | ||
c if Self::is_escape(c) => State::LocalQText, | ||
_ => State::Error, | ||
}, | ||
Self::LocalQString => match c { | ||
Self::AT => State::LocalPart, | ||
_ => State::Error, | ||
}, | ||
Self::LocalPart => match c { | ||
Self::OPEN_BRACKET => State::DomainDText, | ||
c if Self::is_atext(c) => State::DomainAtom, | ||
_ => Self::Error, | ||
}, | ||
Self::DomainAtom => match c { | ||
Self::DOT => State::DomainDot, | ||
c if Self::is_atext(c) => State::DomainAtom, | ||
_ => Self::Error, | ||
}, | ||
Self::DomainDText => match c { | ||
Self::CLOSE_BRACKET => State::DomainLiteral, | ||
c if Self::is_dtext(c) => State::DomainDText, | ||
_ => Self::Error, | ||
}, | ||
Self::DomainDot => match c { | ||
c if Self::is_atext(c) => State::DomainAtom, | ||
_ => Self::Error, | ||
}, | ||
Self::DomainLiteral => match c { | ||
_ => Self::Error, | ||
}, | ||
Self::Error => Self::Error, | ||
} | ||
} | ||
fn is_final(state: &Self) -> bool { | ||
match state { | ||
Self::DomainLiteral | Self::DomainAtom => true, | ||
_ => false, | ||
} | ||
} | ||
fn start() -> Self { | ||
State::AddrSpec | ||
} | ||
} | ||
|
||
/// Iterator for the DFA implemented by [Machine]. Each step through the iterator consumes | ||
/// an input symbol (from input iterator) and transitions the DFA through the corresponding state. | ||
/// The iterator is exhausted when the input iterator gets exhausted. If any invalid character or | ||
/// invalid email address syntax is encountered, then transition is to [State::Error]. For every | ||
/// input symbol, this state consumes the symbol and remains in [State::Error]. Thus, parse errors | ||
/// can be checked just by investigating the state of DFA when input gets exhausted. | ||
/// | ||
/// Examining the last state of DFA can be done via [Iterator::last] method. If it is [None], then | ||
/// empty input and parse failed. Otherwise, there will be at least one transition and hence we get | ||
/// some last state. By checking if it is an accepting state, parsing success can be determined. | ||
pub struct MachineIterator<'a> { | ||
input: Chars<'a>, | ||
state: State, | ||
} | ||
|
||
/// MachineIterator just wraps over input iterator and performs transitions at every step. | ||
/// It keeps track of current state as well. Thus, next state is determined using current state as | ||
/// well as the input symbol based on the transition rules defined. | ||
impl<'a> Iterator for MachineIterator<'a> { | ||
type Item = State; | ||
fn next(&mut self) -> Option<Self::Item> { | ||
let c = self.input.next()?; | ||
self.state = State::transition(self.state, c); | ||
Some(self.state) | ||
} | ||
} | ||
|
||
/// The core export of this module. A `Machine` is an [IntoIterator]; consuming the resulting
/// iterator determines whether the given string is a valid email address.
pub struct Machine<'a> {
    /// The text to be checked.
    input: &'a str,
}

impl<'a> Machine<'a> {
    /// Creates a machine that will run over `s`.
    pub fn new(s: &'a str) -> Self {
        Self { input: s }
    }
}
|
||
impl<'a> IntoIterator for Machine<'a> { | ||
type Item = State; | ||
type IntoIter = MachineIterator<'a>; | ||
fn into_iter(self) -> Self::IntoIter { | ||
MachineIterator { | ||
state: State::AddrSpec, | ||
input: self.input.chars(), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
//! ## Grammar | ||
//! Parser for email address (`addr-spec`) as defined in Section 3.4.1 of [`RFC5322`]. | ||
//! This crate implements only a subset of the grammar and does not support folding white space | ||
//! and comments in email address. Also, the grammar rules that are defined to preserve backwards | ||
|
@@ -58,48 +59,64 @@ | |
//! ``` | ||
//! | ||
//! [`RFC5322`]: https://datatracker.ietf.org/doc/html/rfc5322#section-3.4.1 | ||
//! | ||
//! ## Finite State Machine | ||
//! | ||
//! The above grammar defines a Regular language. So, we do not need to construct a lexer and | ||
//! a parser. Email address as defined above can be parsed using finite automaton (or regular | ||
//! expressions also will do). In this crate, we construct a finite state machine (module fsm) | ||
//! and parse the given string into email address or fail and emit errors. | ||
//! | ||
//! ``` | ||
//! use email_parser::Email; | ||
//! let email: Email = "user@example.com".parse().unwrap(); | ||
//! ``` | ||
|
||
use crate::fsm::{State, FSM}; | ||
use std::fmt::{Display, Formatter}; | ||
use std::str::FromStr; | ||
use thiserror::Error; | ||
|
||
mod terminal { | ||
pub struct At; | ||
pub struct OpenBracket; | ||
pub struct CloseBracket; | ||
pub struct DText(char); | ||
pub struct AText(char); | ||
pub struct Specials(char); | ||
pub struct Backslash; | ||
pub struct Dot; | ||
pub struct Escape(char); | ||
pub struct QText(char); | ||
pub struct DQuote; | ||
/// Email parsing errors. | ||
#[derive(Error, Debug, Clone)] | ||
pub enum Error { | ||
#[error("cannot parse empty email id")] | ||
EmptyEmail, | ||
#[error("invalid RFC5322 formatted email id")] | ||
InvalidEmail, | ||
} | ||
|
||
mod non_terminal { | ||
use super::terminal; | ||
pub struct AddrSpec(pub LocalPart, pub terminal::At, pub Domain); | ||
pub enum LocalPart { | ||
DotAtom(DotAtom), | ||
QuotedString(QuotedString), | ||
} | ||
pub enum Domain { | ||
DotAtom(DotAtom), | ||
DomainLiteral(DomainLiteral), | ||
/// Email parsing is accomplished using a finite state machine. FSM is defined in this module. | ||
/// Finite automaton has several states and transitions. When iterator is completely consumed, if | ||
/// the state is a final state, then given string is valid email address. | ||
mod fsm; | ||
|
||
/// This is the core of the crate. Defines email address type which can be constructed by parsing a | ||
/// string literal. As long as it is constructed properly, then it means the email address is valid. | ||
pub struct Email { | ||
local: String, | ||
domain: String, | ||
} | ||
|
||
/// Support parsing from string literal. | ||
impl FromStr for Email { | ||
type Err = Error; | ||
fn from_str(s: &str) -> Result<Self, Self::Err> { | ||
let m = fsm::Machine::new(s); | ||
let ref state = m.into_iter().last().ok_or(Error::EmptyEmail)?; | ||
let (one, two) = State::is_final(state) | ||
.then(|| s.split_once('@').unwrap()) | ||
.ok_or(Error::InvalidEmail)?; | ||
Ok(Self { | ||
local: one.to_owned(), | ||
domain: two.to_owned(), | ||
}) | ||
} | ||
pub struct DomainLiteral( | ||
terminal::OpenBracket, | ||
Vec<terminal::DText>, | ||
terminal::CloseBracket, | ||
); | ||
pub struct Atom(pub Vec<terminal::AText>); | ||
pub struct DotAtomLiteral(pub terminal::Dot, pub Atom); | ||
pub struct DotAtom(pub Atom, pub Vec<DotAtomLiteral>); | ||
pub struct QuotedPair(pub terminal::Backslash, pub terminal::Escape); | ||
pub enum QContent { | ||
QText(terminal::QText), | ||
QuotedPair(QuotedPair), | ||
} | ||
|
||
/// Support formatted output. | ||
impl Display for Email { | ||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { | ||
writeln!(f, "{}@{}", self.local, self.domain) | ||
} | ||
pub struct QuotedString( | ||
pub terminal::DQuote, | ||
pub Vec<QContent>, | ||
pub terminal::DQuote, | ||
); | ||
} |