From 31e7e9d285f2e62b923b3b8e311bf7919c9eccab Mon Sep 17 00:00:00 2001
From: Mikolaj Wielgus
Date: Thu, 21 Nov 2024 22:05:18 +0100
Subject: [PATCH] feat(specctra): sanitize DSN input by safelisting chars
 where possible

I have implemented sanitization for every `.next_char()` and
`.peek_char()` that I could without causing an error. It turned out
that there are only two minor places where I can do that for
`.next_char()`, and none for `.peek_char()`.

This is for demonstrative purposes. I don't expect to merge this.
---
 src/specctra/read.rs | 53 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/src/specctra/read.rs b/src/specctra/read.rs
index 067439e..81d2662 100644
--- a/src/specctra/read.rs
+++ b/src/specctra/read.rs
@@ -7,6 +7,8 @@ use utf8_chars::BufReadCharsExt;
 pub enum ParseError {
     #[error("unexpected end of file")]
     Eof,
+    #[error("forbidden character")]
+    ForbiddenChar(char),
     #[error(transparent)]
     Io(#[from] std::io::Error),
     #[error("expected {0}")]
@@ -201,7 +203,12 @@ impl ListTokenizer {
     }
 
     fn next_char(&mut self) -> Result<char, ParseErrorContext> {
-        let return_chr = self.peek_char()?;
+        let chr = self.unsanitized_next_char()?;
+        self.sanitize(chr)
+    }
+
+    fn unsanitized_next_char(&mut self) -> Result<char, ParseErrorContext> {
+        let return_chr = self.unsanitized_peek_char()?;
         self.peeked_char = None;
 
         if return_chr == '\n' {
@@ -215,6 +222,11 @@ impl ListTokenizer {
     }
 
     fn peek_char(&mut self) -> Result<char, ParseErrorContext> {
+        let chr = self.unsanitized_peek_char()?;
+        self.sanitize(chr)
+    }
+
+    fn unsanitized_peek_char(&mut self) -> Result<char, ParseErrorContext> {
         Ok(if let Some(chr) = self.peeked_char {
             chr
         } else {
@@ -229,20 +241,35 @@ impl ListTokenizer {
         })
     }
 
+    fn sanitize(&mut self, chr: char) -> Result<char, ParseErrorContext> {
+        if !chr.is_ascii_alphanumeric()
+            && !self.is_recognized_whitespace(chr)
+            && !(chr == '(' || chr == ')')
+        {
+            return Err(self.add_context(ParseError::ForbiddenChar(chr)));
+        }
+
+        Ok(chr)
+    }
+
     fn skip_whitespace(&mut self) -> Result<(), ParseErrorContext> {
         loop {
-            let chr = self.peek_char()?;
-            if chr == ' ' || chr == '\r' || chr == '\n' {
-                self.next_char().unwrap();
+            let chr = self.unsanitized_peek_char()?;
+            if self.is_recognized_whitespace(chr) {
+                self.unsanitized_next_char().unwrap();
             } else {
                 return Ok(());
             }
         }
     }
 
+    fn is_recognized_whitespace(&mut self, chr: char) -> bool {
+        chr == ' ' || chr == '\r' || chr == '\n'
+    }
+
     fn read_string(&mut self) -> Result<String, ParseErrorContext> {
         if let Some(chr) = self.quote_char {
-            if chr == self.peek_char()? {
+            if chr == self.unsanitized_peek_char()? {
                 return self.read_quoted();
             }
         }
@@ -253,11 +280,11 @@ impl ListTokenizer {
         let mut string = String::new();
 
         loop {
-            let chr = self.peek_char()?;
-            if chr == ' ' || chr == '(' || chr == ')' || chr == '\r' || chr == '\n' {
+            let chr = self.unsanitized_peek_char()?;
+            if self.is_recognized_whitespace(chr) || chr == '(' || chr == ')' {
                 break;
             }
-            string.push(self.next_char().unwrap());
+            string.push(self.unsanitized_next_char().unwrap());
         }
 
         if string.is_empty() {
@@ -270,20 +297,20 @@ impl ListTokenizer {
     fn read_quoted(&mut self) -> Result<String, ParseErrorContext> {
         let mut string = String::new();
 
-        if self.next_char().unwrap() != self.quote_char.unwrap() {
+        if self.unsanitized_next_char().unwrap() != self.quote_char.unwrap() {
             panic!();
        }
 
         loop {
-            let chr = self.peek_char()?;
+            let chr = self.unsanitized_peek_char()?;
             if !self.space_in_quoted && chr == ' ' {
                 panic!("found a space inside a quoted string, but file didn't declare this possibility");
             }
             if chr == self.quote_char.unwrap() {
-                self.next_char().unwrap();
+                self.unsanitized_next_char().unwrap();
                 break;
             }
-            string.push(self.next_char().unwrap());
+            string.push(self.unsanitized_next_char().unwrap());
         }
 
         Ok(string)
@@ -312,7 +339,7 @@ impl ListTokenizer {
         self.skip_whitespace()?;
         let context = self.context();
 
-        let chr = self.peek_char()?;
+        let chr = self.unsanitized_peek_char()?;
         Ok(InputToken::new(
             if chr == '(' {
                 self.next_char().unwrap();
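
Not part of the patch: below is a minimal standalone sketch of the safelist check that `sanitize` applies, in case it helps review. `is_allowed` is a hypothetical helper written only for this note (it does not exist in src/specctra/read.rs); the accepted set is copied from the patch: ASCII alphanumerics, the recognized whitespace (' ', '\r', '\n'), and parentheses.

// Hypothetical, note-only helper mirroring the safelist in `sanitize`.
fn is_allowed(chr: char) -> bool {
    chr.is_ascii_alphanumeric()
        || chr == ' '
        || chr == '\r'
        || chr == '\n'
        || chr == '('
        || chr == ')'
}

fn main() {
    // Characters that make up ordinary DSN structure pass the check.
    assert!(is_allowed('('));
    assert!(is_allowed('A'));
    assert!(is_allowed('\n'));
    // Anything outside the safelist is what the tokenizer would report
    // as ParseError::ForbiddenChar.
    assert!(!is_allowed('\0'));
    assert!(!is_allowed('\u{7}')); // an ASCII control character
}

As the diff shows, the check runs only where `next_char()`/`peek_char()` are still called; the string-reading paths go through the `unsanitized_*` variants, so characters inside tokens and quoted strings are not filtered by it.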