You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

599 lines
10 KiB
Rust

/// Strategy for turning the raw text of an extension into a value.
///
/// The `extension` parser in this file calls [`ParseExtensions::parse`] with the
/// text found between `@{` and `}@`, together with the span of that text.
pub trait ParseExtensions<'a> {
/// Value produced when an extension is parsed successfully.
type Output;
/// Error produced when an extension is rejected.
type Error;
/// Parses `content`, the extension text located at `span` in the input.
fn parse(
&self,
content: &'a str,
span: Span,
) -> core::result::Result<Self::Output, Self::Error>;
}
/// Lets any `Fn(&str, Span) -> Result<T, E>` closure or function be used as a
/// [`ParseExtensions`] implementation: `parse` simply calls the callable.
impl<'a, T, E, F> ParseExtensions<'a> for F
where
    F: Fn(&'a str, Span) -> core::result::Result<T, E>,
{
    type Output = T;
    type Error = E;

    fn parse(&self, content: &'a str, span: Span) -> core::result::Result<T, E> {
        // `Output`/`Error` are exactly `T`/`E`, so the callable's result is returned as-is.
        let outcome = self(content, span);
        outcome
    }
}
// impl<'a, T, E> ParseExtension<'a>
// for &fn(&'a str, Span) -> core::result::Result<T, E>
// {
// type Output = T;
// type Error = E;
// fn parse(
// &self,
// content: &'a str,
// span: Span,
// ) -> core::result::Result<Self::Output, Self::Error> {
// self(content, span)
// }
// }
/// Error returned when a byte offset does not address a valid position in a string.
#[derive(Debug, Copy, Clone)]
struct InvalidCharBoundary;

/// Byte-offset based character lookup for string slices.
trait StrExt {
    /// Returns the character that starts at byte `offset`.
    ///
    /// Yields `Ok(None)` when `offset` is exactly the end of the string, and
    /// `Err(InvalidCharBoundary)` when `offset` lies past the end or inside a
    /// multi-byte character.
    fn char_at_byte_offset<T>(
        &self,
        offset: T,
    ) -> core::result::Result<Option<char>, InvalidCharBoundary>
    where
        T: Into<usize>;
}

impl StrExt for str {
    fn char_at_byte_offset<T>(
        &self,
        offset: T,
    ) -> core::result::Result<Option<char>, InvalidCharBoundary>
    where
        T: Into<usize>,
    {
        let from: usize = offset.into();
        // `str::get` refuses ranges that are out of bounds or split a character.
        match self.get(from..) {
            Some(tail) => Ok(tail.chars().next()),
            None => Err(InvalidCharBoundary),
        }
    }
}
/// A position in the parsed text, counted in bytes.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct ByteOffset(pub usize);

impl From<usize> for ByteOffset {
    /// Wraps a raw byte count.
    fn from(raw: usize) -> Self {
        ByteOffset(raw)
    }
}

impl From<ByteOffset> for usize {
    /// Unwraps the raw byte count.
    fn from(offset: ByteOffset) -> Self {
        let ByteOffset(raw) = offset;
        raw
    }
}
/// A half-open byte range `[start, end)` into the parsed text.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Span {
    /// Offset of the first byte covered by the span.
    pub start: ByteOffset,
    /// exclusive
    pub end: ByteOffset,
}

impl Span {
    /// Pairs this span with `t`, producing a [`Spanned`] value.
    pub fn enclose<T>(self, t: T) -> Spanned<T> {
        let Span { start, end } = self;
        spanned(start, end, t)
    }
}
/// Builds a [`Span`] from anything convertible into [`ByteOffset`]s.
///
/// `end` is exclusive.
pub fn span<A, B>(start: A, end: B) -> Span
where
    A: Into<ByteOffset>,
    B: Into<ByteOffset>,
{
    let start = start.into();
    let end = end.into();
    Span { start, end }
}
/// A value paired with the span of source text it was parsed from.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Spanned<T> {
/// Location of `value` in the source text.
pub span: Span,
/// The parsed value itself.
pub value: T,
}
/// Wraps `value` in a [`Spanned`] covering the byte range `start..end` (`end` exclusive).
pub fn spanned<A, B, T>(start: A, end: B, value: T) -> Spanned<T>
where
    A: Into<ByteOffset>,
    B: Into<ByteOffset>,
{
    Spanned {
        span: span(start, end),
        value,
    }
}
/// Failure kinds reported by the parser; `ExtError` is the extension parser's own error type.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Error<ExtError> {
/// The input ended before the closing `}@` of an extension was found.
UnclosedExtension,
/// The extension parser rejected the content of an extension.
Extension(ExtError),
}
/// A header line such as `## Title`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Header<S> {
/// Number of consecutive `#` characters that introduced the header.
pub level: u8,
/// Header text following the `#` markers and whitespace, without the line's newline.
pub content: S,
}
/// A run of plain text inside a paragraph.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Text<S>(pub S);
/// The uninterpreted content of an extension, as produced by `ParseExtensionsRaw`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Extension<S>(pub S);
/// One element of a paragraph: either plain text or a parsed extension.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum ParagraphPiece<StrRepr, ExtRepr> {
/// A run of plain text.
Text(Text<StrRepr>),
/// The value the extension parser returned for an `@{...}@` extension.
Extension(ExtRepr),
}
/// A paragraph: a sequence of text runs and extensions.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Paragraph<StrRepr, ExtRepr> {
/// The pieces in source order, each with the span it was parsed from.
pub pieces: Vec<Spanned<ParagraphPiece<StrRepr, ExtRepr>>>,
}
/// A top-level element of a parsed document.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Block<StrRepr, ExtRepr> {
/// A header line introduced by one or more `#` characters.
Header(Header<StrRepr>),
/// A paragraph of text and extensions.
Paragraph(Paragraph<StrRepr, ExtRepr>),
}
/// Result type of the parsing functions: failures are an [`Error`] wrapped in a
/// [`Spanned`], so every error carries the span it refers to.
pub type Result<T, ExtError> =
core::result::Result<T, Spanned<Error<ExtError>>>;
/// Cursor over the text being parsed.
#[derive(Copy, Clone)]
struct State<'a> {
    /// The complete input.
    corpus: &'a str,
    /// Byte offset of the next character to be read.
    current_offset: ByteOffset,
}

impl<'a> State<'a> {
    /// Returns `true` once the cursor has reached (or passed) the end of the input.
    fn is_eof(&self) -> bool {
        self.corpus.len() <= self.current_offset.0
    }

    /// Returns the next character and its offset without consuming it.
    fn peek(&self) -> Option<(char, ByteOffset)> {
        // `State` is `Copy`: advancing a scratch copy leaves `self` untouched.
        let mut lookahead = *self;
        lookahead.next()
    }

    /// Fills `buf` with the upcoming characters without consuming them.
    ///
    /// Slots beyond the end of the input are set to `None`.
    fn peek_n(&self, buf: &mut [Option<(char, ByteOffset)>]) {
        let mut lookahead = *self;
        buf.iter_mut().for_each(|slot| *slot = lookahead.next());
    }

    /// Consumes and returns the next character together with the offset it started at.
    fn next(&mut self) -> Option<(char, ByteOffset)> {
        if self.is_eof() {
            return None;
        }
        let at = self.current_offset;
        // The cursor only ever advances by whole characters, so `at` is a valid
        // boundary, and the EOF check above guarantees a character is present.
        let ch = self.corpus.char_at_byte_offset(at).unwrap().unwrap();
        self.current_offset.0 += ch.len_utf8();
        Some((ch, at))
    }

    /// Advances past one character; does nothing at the end of the input.
    fn forward(&mut self) {
        let _ = self.next();
    }

    /// Advances past up to `n` characters.
    fn forward_n(&mut self, n: usize) {
        (0..n).for_each(|_| self.forward());
    }

    /// Advances past any run of whitespace (including newlines) at the cursor.
    fn skip_whitespace(&mut self) {
        while matches!(self.peek(), Some((ch, _)) if ch.is_whitespace()) {
            self.forward();
        }
    }
}
/// Parses a header line; the cursor must be positioned on its first `#`.
///
/// The level is the number of consecutive leading `#` characters (saturating at
/// `u8::MAX`). The content is the remainder of the line with leading whitespace
/// removed; the terminating newline, if present, is consumed but not included.
/// An empty header (`#` directly followed by a newline) yields empty content and
/// does not swallow the following line.
fn header<'a, S, E>(state: &mut State<'a>) -> Result<Header<S>, E::Error>
where
    S: From<&'a str>,
    E: ParseExtensions<'a>,
{
    // The caller dispatched on the first `#`; step over it.
    state.forward();
    let mut level: u8 = 1;
    while let Some(('#', _)) = state.peek() {
        // Saturate rather than overflow on absurdly long runs of `#`.
        level = level.saturating_add(1);
        state.forward();
    }

    // Skip blanks between the markers and the content, but never cross the end
    // of the line: otherwise an empty header would take the next line as content.
    while let Some((ch, _)) = state.peek() {
        if ch == '\n' || !ch.is_whitespace() {
            break;
        }
        state.forward();
    }

    let content_start = state.current_offset;
    let mut content_end = None;
    while let Some((ch, bo)) = state.next() {
        if ch == '\n' {
            // Exclude the newline from the content; it has already been consumed.
            content_end = Some(bo);
            break;
        }
    }
    // Without a newline the header runs to the end of the input.
    let content_end = content_end.unwrap_or(state.current_offset);

    Ok(Header {
        level,
        content: state.corpus[content_start.0..content_end.0].into(),
    })
}
fn extension<'a, S, E>(
state: &mut State<'a>,
pex: &E,
) -> Result<E::Output, E::Error>
where
S: From<&'a str>,
E: ParseExtensions<'a>,
{
debug_assert!({
let mut peek_buf = [None; 2];
state.peek_n(&mut peek_buf);
matches!(peek_buf, [Some(('@', _)), Some(('{', _))])
});
state.forward();
state.forward();
let start = state.current_offset;
loop {
let mut peek_buf = [None; 2];
state.peek_n(&mut peek_buf);
match peek_buf {
[None, _] => {
return Err(spanned(
start,
state.current_offset,
Error::UnclosedExtension,
))
}
[Some(('}', bo)), Some(('@', _))] => {
state.forward();
state.forward();
let content = &state.corpus[start.0..bo.0];
return pex
.parse(content, span(start.0, bo.0))
.map_err(|e| spanned(start.0, bo.0, Error::Extension(e)));
}
_ => {
state.forward();
}
}
}
}
/// Consumes a run of plain text starting at the cursor.
///
/// The run stops (without consuming the terminator) at the end of the input, at a
/// blank line (`\n\n`), at a final newline, or at the start of an extension (`@{`).
fn text<'a, S, E>(state: &mut State<'a>) -> Result<Text<S>, E::Error>
where
    S: From<&'a str>,
    E: ParseExtensions<'a>,
{
    let start = state.current_offset;
    // `peek_n` overwrites every slot, so one buffer can be reused across iterations.
    let mut lookahead = [None; 2];
    loop {
        state.peek_n(&mut lookahead);
        let at_boundary = matches!(
            lookahead,
            [None, _]
                | [Some(('\n', _)), Some(('\n', _))]
                | [Some(('\n', _)), None]
                | [Some(('@', _)), Some(('{', _))]
        );
        if at_boundary {
            break;
        }
        state.forward();
    }
    let end = state.current_offset;
    Ok(Text(state.corpus[start.0..end.0].into()))
}
/// Parses a paragraph: alternating text runs and extensions.
///
/// The paragraph ends at the end of the input, at a blank line (both newlines are
/// consumed), or at a final newline (which is consumed). Each piece is recorded
/// with the span it occupies.
fn paragraph<'a, S, E>(
    state: &mut State<'a>,
    pex: &E,
) -> Result<Paragraph<S, E::Output>, E::Error>
where
    S: From<&'a str>,
    E: ParseExtensions<'a>,
{
    let mut pieces = Vec::new();
    loop {
        let piece_start = state.current_offset;
        // Two characters of lookahead are enough to classify what comes next.
        let mut lookahead = [None; 2];
        state.peek_n(&mut lookahead);
        let piece = match lookahead {
            // End of input.
            [None, _] => break,
            // A single trailing newline closes the paragraph.
            [Some(('\n', _)), None] => {
                state.forward();
                break;
            }
            // A blank line closes the paragraph.
            [Some(('\n', _)), Some(('\n', _))] => {
                state.forward_n(2);
                break;
            }
            // `@{` opens an extension.
            [Some(('@', _)), Some(('{', _))] => {
                let parsed = extension::<S, E>(state, pex)?;
                ParagraphPiece::Extension(parsed)
            }
            // Anything else is plain text.
            _ => {
                let run = text::<S, E>(state)?;
                ParagraphPiece::Text(run)
            }
        };
        pieces.push(spanned(piece_start, state.current_offset, piece));
    }
    Ok(Paragraph { pieces })
}
/// Parses the whole input into a sequence of blocks.
///
/// Whitespace between blocks is skipped. A block starting with `#` is parsed as a
/// header and anything else as a paragraph. Each block is recorded with the span
/// from its first character to where its parser stopped.
fn blocks<'a, S, E>(
    state: &mut State<'a>,
    pex: &E,
) -> Result<Vec<Spanned<Block<S, E::Output>>>, E::Error>
where
    S: From<&'a str>,
    E: ParseExtensions<'a>,
{
    let mut out = Vec::new();
    loop {
        // Skip separators BEFORE looking at the dispatch character. Peeking first
        // would dispatch on the whitespace itself, so an indented or
        // newline-preceded `#` would be parsed as a paragraph, and trailing
        // whitespace would produce an empty paragraph block.
        state.skip_whitespace();
        let (ch, start) = match state.peek() {
            Some(next) => next,
            None => break,
        };
        let block = match ch {
            '#' => Block::Header(header::<S, E>(state)?),
            _ => Block::Paragraph(paragraph(state, pex)?),
        };
        let end = state.current_offset;
        out.push(spanned(start, end, block));
    }
    Ok(out)
}
/// Extension parser that performs no interpretation: it returns the raw extension
/// text wrapped in [`Extension`].
pub struct ParseExtensionsRaw<S>(core::marker::PhantomData<S>);

impl<S> ParseExtensionsRaw<S> {
    /// Creates the raw extension parser; it carries no data.
    fn new() -> Self {
        Self(core::marker::PhantomData)
    }
}

impl<'a, S> ParseExtensions<'a> for ParseExtensionsRaw<S>
where
    S: From<&'a str>,
{
    type Output = Extension<S>;
    type Error = kor::Never;

    /// Converts `content` into `S` unchanged; the span is not used and an error
    /// is never returned.
    fn parse(
        &self,
        content: &'a str,
        _span: Span,
    ) -> core::result::Result<Self::Output, Self::Error> {
        let raw = S::from(content);
        Ok(Extension(raw))
    }
}
/// Parses `corpus` into blocks, leaving extensions uninterpreted.
///
/// Every `@{...}@` extension is returned as an [`Extension`] holding its raw text.
/// `S` is the string representation used in the output (anything constructible
/// from `&str`).
pub fn parse<'a, S>(
    corpus: &'a str,
) -> Result<Vec<Spanned<Block<S, Extension<S>>>>, kor::Never>
where
    S: From<&'a str>,
{
    let raw_extensions = ParseExtensionsRaw::<S>::new();
    parse_extended(corpus, raw_extensions)
}
/// Parses `corpus` into blocks, using `pex` to interpret every `@{...}@` extension.
///
/// Errors are reported together with the span they refer to.
pub fn parse_extended<'a, S, E>(
    corpus: &'a str,
    pex: E,
) -> Result<Vec<Spanned<Block<S, E::Output>>>, E::Error>
where
    S: From<&'a str>,
    E: ParseExtensions<'a>,
{
    // Parsing always starts at the first byte of the input.
    let start_of_input = 0usize;
    let mut cursor = State {
        corpus,
        current_offset: start_of_input.into(),
    };
    blocks(&mut cursor, &pex)
}
// Unit tests for the block parser; spans are byte ranges with an exclusive end.
#[cfg(test)]
mod test {
    use super::*;

    // A single header without a trailing newline spans the whole input.
    #[test]
    fn header_1() {
        let corpus = "# Header";
        let output = parse(corpus).unwrap();
        assert_eq!(
            &output,
            &[spanned(
                0,
                corpus.len(),
                Block::Header(Header {
                    level: 1,
                    content: "Header",
                })
            )]
        )
    }

    // Two consecutive headers; each span includes the header's newline.
    #[test]
    fn header_2() {
        let corpus = "# Header\n## Header 2\n";
        let output = parse(corpus).unwrap();
        assert_eq!(
            &output,
            &[
                spanned(
                    0,
                    9,
                    Block::Header(Header {
                        level: 1,
                        content: "Header",
                    })
                ),
                spanned(
                    9,
                    // The corpus is 21 bytes long, so the second header ends at
                    // the end of the input (the previous expectation of 25 lay
                    // past the end of the corpus).
                    corpus.len(),
                    Block::Header(Header {
                        level: 2,
                        content: "Header 2",
                    })
                ),
            ]
        )
    }

    // A single paragraph consisting of one text run.
    #[test]
    fn paragraph_1() {
        let corpus = "Hello, world";
        let output = parse(corpus).unwrap();
        assert_eq!(
            &output,
            &[spanned(
                0,
                corpus.len(),
                Block::Paragraph(Paragraph {
                    pieces: vec![spanned(
                        0,
                        corpus.len(),
                        ParagraphPiece::Text(Text("Hello, world"))
                    )]
                })
            )],
        )
    }

    // Two paragraphs separated by a blank line; the separators belong to the
    // paragraph spans but not to the text pieces.
    #[test]
    fn paragraph_2() {
        let corpus = "Hello, world\n\nGoodbye, world\n";
        let output = parse(corpus).unwrap();
        assert_eq!(
            &output,
            &[
                spanned(
                    0,
                    14,
                    Block::Paragraph(Paragraph {
                        pieces: vec![spanned(
                            0,
                            12,
                            ParagraphPiece::Text(Text("Hello, world"))
                        )]
                    })
                ),
                spanned(
                    14,
                    corpus.len(),
                    Block::Paragraph(Paragraph {
                        pieces: vec![spanned(
                            14,
                            corpus.len() - 1,
                            ParagraphPiece::Text(Text("Goodbye, world"))
                        )]
                    })
                )
            ],
        )
    }

    // Text followed by an extension; the extension piece spans its delimiters.
    #[test]
    fn paragraph_extension_1() {
        let corpus = "Hello @{world}@";
        let output = parse(corpus).unwrap();
        assert_eq!(
            &output,
            &[spanned(
                0,
                corpus.len(),
                Block::Paragraph(Paragraph {
                    pieces: vec![
                        spanned(0, 6, ParagraphPiece::Text(Text("Hello "))),
                        spanned(
                            6,
                            corpus.len(),
                            ParagraphPiece::Extension(Extension("world"))
                        )
                    ]
                })
            )]
        );
    }

    // Parses the fixture file and compares the result against an empty list.
    #[test]
    fn complex_test_1() {
        let corpus = include_str!("../test/hello_mup.mup");
        let output = parse::<&str>(corpus).unwrap();
        assert_eq!(output, &[]);
    }
}