// Reconstructed from a flattened `git show` dump of the crate's first commit:
//
//   commit e58a701b40b08ce29ea3e7758b0bf82d260f7a77
//   Author: idylls
//   Date:   Tue Oct 11 16:13:56 2022 -0400
//   initial commit
//
// Files carried by that commit besides this module:
//
//   .gitignore    /target
//                 /Cargo.lock
//   Cargo.toml    [package] name = "mup", version = "0.1.0",
//                 edition = "2021"; empty [dependencies]
//   rustfmt.toml  unstable_features = true, hard_tabs = true,
//                 max_width = 80, imports_granularity = 'Crate',
//                 group_imports = 'StdExternalCrate',
//                 format_strings = true, wrap_comments = true,
//                 blank_lines_lower_bound = 0,
//                 blank_lines_upper_bound = 2
//   src/lib.rs    pub mod parse;
//
// Everything below is the content of src/parse.rs: a small markup parser
// that splits a document into `#` headers and paragraphs and records the
// byte span of every node.

/// Error for a byte offset that cannot be used to slice a string: it is
/// either past the end or not on a UTF-8 character boundary.
#[derive(Debug, Copy, Clone)]
struct InvalidCharBoundary;

/// Character lookup by byte offset for string slices.
trait StrExt {
	/// Returns the character starting at byte `offset`.
	///
	/// `Ok(None)` means `offset` is exactly the end of the string;
	/// `Err(InvalidCharBoundary)` means the string cannot be sliced there.
	fn char_at_byte_offset<T>(
		&self,
		offset: T,
	) -> core::result::Result<Option<char>, InvalidCharBoundary>
	where
		T: Into<usize>;
}

impl StrExt for str {
	fn char_at_byte_offset<T>(
		&self,
		offset: T,
	) -> core::result::Result<Option<char>, InvalidCharBoundary>
	where
		T: Into<usize>,
	{
		// `str::get` refuses out-of-range and mid-character offsets.
		self.get(offset.into()..)
			.ok_or(InvalidCharBoundary)
			.map(|s| s.chars().next())
	}
}

/// A position in the source text, counted in bytes from its start.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct ByteOffset(pub usize);
impl From<usize> for ByteOffset {
	fn from(bo: usize) -> Self {
		Self(bo)
	}
}
impl From<ByteOffset> for usize {
	fn from(bo: ByteOffset) -> Self {
		bo.0
	}
}

/// A half-open byte range `[start, end)` of the source text.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Span {
	start: ByteOffset,
	/// exclusive
	end: ByteOffset,
}

/// Builds a [`Span`] from anything convertible to a [`ByteOffset`].
fn span<A, B>(start: A, end: B) -> Span
where
	A: Into<ByteOffset>,
	B: Into<ByteOffset>,
{
	Span {
		start: start.into(),
		end: end.into(),
	}
}

/// A value together with the span of source text it was parsed from.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Spanned<T> {
	span: Span,
	value: T,
}

/// Wraps `value` in a [`Spanned`] covering `[start, end)`.
fn spanned<A, B, T>(start: A, end: B, value: T) -> Spanned<T>
where
	A: Into<ByteOffset>,
	B: Into<ByteOffset>,
{
	let span = span(start, end);

	Spanned { value, span }
}

/// Parse errors. No variant exists yet, so parsing cannot currently fail.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Error {}

/// A header line: a run of `#` characters followed by the header text.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Header<S> {
	/// Number of leading `#` characters (saturates at `u8::MAX`).
	level: u8,
	/// Text after the marker and the whitespace following it, without the
	/// terminating newline.
	content: S,
}

/// An inline element of a paragraph.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Inline<S> {
	/// A run of plain text.
	Text(S),
}

/// A paragraph: a sequence of inline pieces, each with its own span.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Paragraph<S> {
	pieces: Vec<Spanned<Inline<S>>>,
}

/// A top-level element of a document.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Block<S> {
	Header(Header<S>),
	Paragraph(Paragraph<S>),
}

/// Result type of the parser; errors carry the span they refer to.
pub type Result<T> = core::result::Result<T, Spanned<Error>>;

/// Cursor over the text being parsed.
///
/// Invariant: `current_offset` always rests on a character boundary of
/// `corpus` (or at its end), because the cursor only ever moves forward by
/// the encoded width of the character it just read.
struct State<'a> {
	corpus: &'a str,
	current_offset: ByteOffset,
}
impl<'a> State<'a> {
	/// True once the cursor has reached the end of the corpus.
	fn is_eof(&self) -> bool {
		self.current_offset.0 >= self.corpus.len()
	}

	/// Returns the character under the cursor and its offset without
	/// consuming it, or `None` at end of input.
	fn peek(&self) -> Option<(char, ByteOffset)> {
		if self.is_eof() {
			return None;
		}

		let ch = self
			.corpus
			.char_at_byte_offset(self.current_offset)
			.expect("cursor always rests on a char boundary")?;

		Some((ch, self.current_offset))
	}

	/// Consumes and returns the character under the cursor together with
	/// the offset it started at, or `None` at end of input.
	fn next(&mut self) -> Option<(char, ByteOffset)> {
		let (ch, at) = self.peek()?;

		// Step over the whole character. Advancing by a single byte would
		// strand the cursor inside any multi-byte character.
		self.current_offset.0 += ch.len_utf8();

		Some((ch, at))
	}

	/// Consumes one character, if any, and discards it.
	fn forward(&mut self) {
		self.next();
	}

	/// Consumes characters for as long as they are whitespace.
	fn skip_whitespace(&mut self) {
		while let Some((ch, _)) = self.peek() {
			if !ch.is_whitespace() {
				break;
			}

			self.forward();
		}
	}
}

/// Parses a header. The cursor must be on the first `#` of the marker.
///
/// Consumes the marker, the whitespace after it and the rest of the line
/// including its newline; the newline is not part of the content.
fn header<'a, S>(state: &mut State<'a>) -> Result<Header<S>>
where
	S: From<&'a str>,
{
	let corpus = state.corpus;

	// The caller dispatched on the first '#': consume it and count it.
	let mut level: u8 = 1;
	state.forward();

	while let Some(('#', _)) = state.peek() {
		// Saturate so an absurdly long marker cannot overflow the counter.
		level = level.saturating_add(1);
		state.forward();
	}

	// NOTE(review): this also skips newlines, so a bare "#" line takes the
	// following line as its content. Confirm that this is intended.
	state.skip_whitespace();

	let content_start = state.current_offset;
	let mut content_end = None;
	while let Some((ch, bo)) = state.next() {
		if ch == '\n' {
			content_end = Some(bo);
			break;
		}
	}
	// Without a newline the content runs to the end of the input.
	let content_end = content_end.unwrap_or(state.current_offset);

	Ok(Header {
		level,
		content: corpus[content_start.0..content_end.0].into(),
	})
}

/// Parses a run of plain text that ends at a blank line (two consecutive
/// newlines) or at the end of the input.
///
/// Both newlines of the blank line are consumed but excluded from the
/// text, as is a single trailing newline at end of input. The returned
/// flag tells the caller whether the paragraph is finished; it is `true`
/// on every path of the current implementation.
fn inline_text<'a, S>(state: &mut State<'a>) -> Result<(S, bool)>
where
	S: From<&'a str>,
{
	let corpus = state.corpus;
	let start = state.current_offset;

	// Offset of a newline that was the most recently consumed character.
	let mut pending_newline: Option<ByteOffset> = None;
	while let Some((ch, bo)) = state.next() {
		match (ch, pending_newline) {
			('\n', Some(end)) => {
				return Ok((corpus[start.0..end.0].into(), true));
			}
			('\n', None) => pending_newline = Some(bo),
			_ => pending_newline = None,
		}
	}

	let end = pending_newline.map_or(corpus.len(), |e| e.0);

	Ok((corpus[start.0..end].into(), true))
}

/// Parses one inline element. Plain text is the only kind so far, so no
/// dispatch on the upcoming character is needed yet.
fn inline<'a, S>(state: &mut State<'a>) -> Result<(Inline<S>, bool)>
where
	S: From<&'a str>,
{
	let (text, done) = inline_text(state)?;

	Ok((Inline::Text(text), done))
}

/// Parses a paragraph: inline pieces are collected until one of them
/// reports that the paragraph is finished.
fn paragraph<'a, S>(state: &mut State<'a>) -> Result<Paragraph<S>>
where
	S: From<&'a str>,
{
	let mut pieces = Vec::new();

	loop {
		let start = state.current_offset;
		let (piece, done) = inline::<S>(state)?;
		let end = state.current_offset;

		pieces.push(spanned(start, end, piece));

		if done {
			break;
		}
	}

	Ok(Paragraph { pieces })
}

/// Parses every block up to the end of the input.
///
/// Whitespace between blocks is skipped and belongs to no span.
fn blocks<'a, S>(state: &mut State<'a>) -> Result<Vec<Spanned<Block<S>>>>
where
	S: From<&'a str>,
{
	let mut out = Vec::new();

	loop {
		// Skip first, then look: the dispatch below must see the character
		// the block really starts with.
		state.skip_whitespace();
		let ch = match state.peek() {
			Some((ch, _)) => ch,
			None => break,
		};

		let start = state.current_offset;
		let block = match ch {
			'#' => Block::Header(header(state)?),
			_ => Block::Paragraph(paragraph(state)?),
		};
		let end = state.current_offset;

		out.push(spanned(start, end, block));
	}

	Ok(out)
}

/// Parses `corpus` into a list of blocks with their byte spans.
///
/// `S` is the string type stored in the tree; anything constructible from
/// a `&str` borrowed from `corpus` works, including `&str` itself.
///
/// # Errors
///
/// None at present: [`Error`] has no variants.
pub fn parse<'a, S>(corpus: &'a str) -> Result<Vec<Spanned<Block<S>>>>
where
	S: From<&'a str>,
{
	let mut state = State {
		corpus,
		current_offset: 0.into(),
	};

	blocks(&mut state)
}

#[cfg(test)]
mod test {
	use super::*;

	#[test]
	fn header_1() {
		let corpus = "# Header";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![spanned(
				0usize,
				corpus.len(),
				Block::Header(Header {
					level: 1,
					content: "Header",
				})
			)]
		)
	}

	#[test]
	fn header_2() {
		let corpus = "# Header\n## Header 2\n";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![
				spanned(
					0usize,
					9usize,
					Block::Header(Header {
						level: 1,
						content: "Header",
					})
				),
				spanned(
					9usize,
					corpus.len(),
					Block::Header(Header {
						level: 2,
						content: "Header 2",
					})
				),
			]
		)
	}

	#[test]
	fn paragraph_1() {
		let corpus = "Hello, world";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![spanned(
				0usize,
				corpus.len(),
				Block::Paragraph(Paragraph {
					pieces: vec![spanned(
						0usize,
						corpus.len(),
						Inline::Text("Hello, world")
					)]
				})
			)],
		)
	}

	#[test]
	fn paragraph_2() {
		let corpus = "Hello, world\n\nGoodbye, world\n";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![
				spanned(
					0usize,
					14usize,
					Block::Paragraph(Paragraph {
						pieces: vec![spanned(
							0usize,
							14usize,
							Inline::Text("Hello, world")
						)]
					})
				),
				spanned(
					14usize,
					corpus.len(),
					Block::Paragraph(Paragraph {
						pieces: vec![spanned(
							14usize,
							corpus.len(),
							Inline::Text("Goodbye, world")
						)]
					})
				)
			],
		)
	}

	#[test]
	fn non_ascii_text() {
		let corpus = "# Héllo\nwörld";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![
				spanned(
					0usize,
					9usize,
					Block::Header(Header {
						level: 1,
						content: "Héllo",
					})
				),
				spanned(
					9usize,
					corpus.len(),
					Block::Paragraph(Paragraph {
						pieces: vec![spanned(
							9usize,
							corpus.len(),
							Inline::Text("wörld")
						)]
					})
				),
			]
		)
	}

	#[test]
	fn whitespace_before_block() {
		let corpus = "\n\n# Title";
		let output = parse::<&str>(corpus).unwrap();

		assert_eq!(
			output,
			vec![spanned(
				2usize,
				corpus.len(),
				Block::Header(Header {
					level: 1,
					content: "Title",
				})
			)]
		)
	}
}