From d7670e17b4609008b0069a182b08c12a00f91393 Mon Sep 17 00:00:00 2001
From: Lily Ballard <lily@sb.org>
Date: Wed, 16 Dec 2020 17:43:13 -0800
Subject: [PATCH] Add line, column to KdlError (#3)

* Add line and column numbers to KdlError

These numbers are calculated using our `newline` rule, as I'm not aware
of any standard for how to calculate line numbers in error messages.

The error retains the original `offset` field as well, but it's now
expressed in characters rather than bytes.

* Clean up error handling in parse_document()

We don't need to worry about `Incomplete` errors, the `complete` family
of parsers (which we're using) doesn't ever produce those. We can use
the `.finish()` function to unwrap the error instead, so we don't have
to worry about the underlying error types.
---
 src/error.rs  |  9 +++--
 src/lib.rs    | 36 ++++++++++++--------
 src/parser.rs | 91 ++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 115 insertions(+), 21 deletions(-)
diff --git a/src/error.rs b/src/error.rs
index 7fd6345..d217570 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -4,10 +4,15 @@ use nom::error::{ContextError, ErrorKind, FromExternalError, ParseError};
 
 use thiserror::Error;
 #[derive(Debug, Clone, Eq, PartialEq, Error)]
-#[error("Error parsing document. {kind}")]
+#[error("Error parsing document at line {line} column {column}. {kind}")]
 pub struct KdlError {
     pub input: String,
+    /// Offset in chars of the error.
     pub offset: usize,
+    /// 1-based line number of the error.
+    pub line: usize,
+    /// 1-based column number (in chars) of the error.
+    pub column: usize,
     pub kind: KdlErrorKind,
 }
 
@@ -19,8 +24,6 @@ pub enum KdlErrorKind {
     ParseFloatError(ParseFloatError),
     #[error("Failed to parse {0} component of semver string.")]
     Context(&'static str),
-    #[error("Incomplete input to semver parser.")]
-    IncompleteInput,
     #[error("An unspecified error occurred.")]
     Other,
 }
diff --git a/src/lib.rs b/src/lib.rs
index 40063a7..83952fb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
 use nom::combinator::all_consuming;
-use nom::Err;
+use nom::Finish;
 
 pub use crate::error::{KdlError, KdlErrorKind};
 pub use crate::node::{KdlNode, KdlNodeValue};
@@ -13,12 +13,17 @@ where
     I: AsRef<str>,
 {
     let input = &input.as_ref()[..];
-    match all_consuming(parser::nodes)(input) {
-        Ok((_, arg)) => Ok(arg),
-        Err(err) => Err(match err {
-            Err::Error(e) | Err::Failure(e) => KdlError {
+    all_consuming(parser::nodes)(input)
+        .finish()
+        .map(|(_, arg)| arg)
+        .map_err(|e| {
+            let prefix = &input[..(input.len() - e.input.len())];
+            let (line, column) = calculate_line_column(prefix);
+            KdlError {
                 input: input.into(),
-                offset: e.input.as_ptr() as usize - input.as_ptr() as usize,
+                offset: prefix.chars().count(),
+                line,
+                column,
                 kind: if let Some(kind) = e.kind {
                     kind
                 } else if let Some(ctx) = e.context {
@@ -26,12 +31,15 @@ where
                 } else {
                     KdlErrorKind::Other
                 },
-            },
-            Err::Incomplete(_) => KdlError {
-                input: input.into(),
-                offset: input.len() - 1,
-                kind: KdlErrorKind::IncompleteInput,
-            },
-        }),
-    }
+            }
+        })
+}
+
+/// Calculates the line and column of the end of a `&str`.
+///
+/// If the line ends on a newline, the (line, column) pair is placed on the previous line instead.
+fn calculate_line_column(input: &str) -> (usize, usize) {
+    let (input, skipped_lines) = parser::count_leading_lines(input);
+    let input = parser::strip_trailing_newline(input);
+    (skipped_lines + 1, input.len() + 1) // +1 as we're 1-based
 }
diff --git a/src/parser.rs b/src/parser.rs
index 61e9987..43dd15a 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,12 +1,15 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, iter::from_fn};
 
+use nom::branch::alt;
 use nom::bytes::complete::{tag, take_until, take_while_m_n};
 use nom::character::complete::{alpha1, alphanumeric1, anychar, char, none_of, one_of};
-use nom::combinator::{eof, map, map_opt, map_res, opt, recognize, value};
-use nom::multi::{fold_many0, many0, many1};
+use nom::combinator::{
+    all_consuming, eof, iterator, map, map_opt, map_res, not, opt, recognize, value,
+};
+use nom::multi::{fold_many0, many0, many1, many_till};
 use nom::sequence::{delimited, pair, preceded, terminated, tuple};
+use nom::Finish;
 use nom::IResult;
-use nom::{branch::alt, multi::many_till};
 
 use crate::error::KdlParseError;
 use crate::node::{KdlNode, KdlNodeValue};
@@ -16,6 +19,44 @@ pub(crate) fn nodes(input: &str) -> IResult<&str, Vec<KdlNode>, KdlParseError<&s
     many0(delimited(many0(linespace), node, newline))(input)
 }
 
+// The following two functions exist for the purposes of translating offsets into line/column pairs
+// for error reporting. We're doing this here so we can make use of our `newline` definition, to
+// ensure line/column information is reported accurately based on our definition of newlines, even
+// if we update our definition of newlines later.
+
+/// Counts all lines in the input up to the final line.
+///
+/// This counts and skips past all lines terminated in `newline` with the exception of the final
+/// line, regardless of whether it's newline-terminated. If the input only contains a single line,
+/// the input will be returned unmodified with a count of `0`.
+pub(crate) fn count_leading_lines(input: &str) -> (&str, usize) {
+    let mut iter = iterator(
+        input,
+        terminated(many_till(value((), anychar), newline), not(eof)),
+    );
+    let count = (&mut iter).count();
+    match iter.finish().finish() {
+        Ok((input, _)) => (input, count),
+        // I don't believe this particular parser can error, but we need to handle it anyway
+        Err(e) => (e.input, count),
+    }
+}
+
+/// Strips a single trailing `newline`, if present, from the input.
+pub(crate) fn strip_trailing_newline(input: &str) -> &str {
+    // Nom doesn't support parsing in reverse, but we want to reuse our newline definition. The
+    // longest newline sequence is 2 characters, so we can just test the last char, and the
+    // second-to-last char, and validate that the parser consumes the full input.
+    let mut idx_iter = input.char_indices().map(|(idx, _)| idx);
+    let mut last = idx_iter.next_back();
+    let mut second_last = idx_iter.next_back();
+    // Start with the second-to-last, otherwise \r\n will be parsed as just the \n.
+    from_fn(|| second_last.take().or_else(|| last.take()))
+        .find(|&idx| all_consuming(newline)(&input[idx..]).is_ok())
+        .map(|idx| &input[..idx])
+        .unwrap_or(input)
+}
+
 #[derive(Clone)]
 enum NodeArg {
     Value(KdlNodeValue),
@@ -473,4 +514,46 @@ mod tests {
         assert!(newline("\r").is_err());
         assert!(newline("blah").is_err());
     }
+
+    #[test]
+    fn test_count_leading_lines() {
+        assert_eq!(count_leading_lines(""), ("", 0));
+        assert_eq!(count_leading_lines("foo"), ("foo", 0));
+        assert_eq!(count_leading_lines("foo\n"), ("foo\n", 0));
+        assert_eq!(count_leading_lines("foo\nbar"), ("bar", 1));
+        assert_eq!(count_leading_lines("foo\nbar\n"), ("bar\n", 1));
+        assert_eq!(count_leading_lines("\nfoo\n\nbar\n"), ("bar\n", 3));
+        assert_eq!(count_leading_lines("foo\r\nbar\r\n"), ("bar\r\n", 1));
+        assert_eq!(count_leading_lines("foo\nbar\rbaz"), ("bar\rbaz", 1));
+        assert_eq!(count_leading_lines("foo\nbar\n\n"), ("\n", 2));
+
+        assert_eq!(
+            count_leading_lines(
+                r#"// This example is a GitHub Action if it used KDL syntax.
+// See .github/workflows/ci.yml for the file this was based on.
+name "CI"
+
+on "push" "pull_request"
+
+env {
+  RUSTFLAGS "-Dwarnings"
+"#
+            ),
+            ("  RUSTFLAGS \"-Dwarnings\"\n", 7)
+        );
+    }
+
+    #[test]
+    fn test_strip_trailing_newline() {
+        assert_eq!(strip_trailing_newline(""), "");
+        assert_eq!(strip_trailing_newline("foo"), "foo");
+        assert_eq!(strip_trailing_newline("foo\n"), "foo");
+        assert_eq!(strip_trailing_newline("foo\n\n"), "foo\n");
+        assert_eq!(strip_trailing_newline("foo\nbar"), "foo\nbar");
+        assert_eq!(strip_trailing_newline("foo\nbar\n"), "foo\nbar");
+        assert_eq!(strip_trailing_newline("foo\r\n"), "foo");
+        assert_eq!(strip_trailing_newline("\n"), "");
+        assert_eq!(strip_trailing_newline("foo\r\n\r"), "foo\r\n\r");
+        assert_eq!(strip_trailing_newline("foo\nx"), "foo\nx");
+    }
 }