Fix formatter handling of blank lines (#870)

khieta · shaobo-he-aws · commit 44e01df7b785 · 2024-05-21T13:39:06.000-07:00
diff --git a/cedar-policy-formatter/src/pprint/doc.rs b/cedar-policy-formatter/src/pprint/doc.rs
@@ -74,6 +74,7 @@ impl Doc for Node<Option<VariableDef>> {
                     var_doc
                         .append(get_trailing_comment_doc_from_str(
                             &start_comment.trailing_comment,
+                            RcDoc::nil(),
                         ))
                         .append(is_doc)
                         .append(RcDoc::line())
@@ -120,21 +121,23 @@ impl Doc for Node<Option<Cond>> {
                     cond_doc
                         .append(get_trailing_comment_doc_from_str(
                             &cond_comment.trailing_comment,
+                            RcDoc::line(),
                         ))
-                        .append(RcDoc::line())
                         .append(
                             get_leading_comment_doc_from_str(&lb_comment.leading_comment).append(
                                 RcDoc::text("{").append(
-                                    get_trailing_comment_doc_from_str(&lb_comment.trailing_comment)
-                                        .append(RcDoc::line())
-                                        .append(
-                                            get_leading_comment_doc_from_str(&expr_leading_comment)
-                                                .append(expr_doc.group()),
-                                        )
-                                        .nest(context.config.indent_width)
-                                        .append(RcDoc::line())
-                                        .append(rb_doc)
-                                        .group(),
+                                    get_trailing_comment_doc_from_str(
+                                        &lb_comment.trailing_comment,
+                                        RcDoc::line(),
+                                    )
+                                    .append(
+                                        get_leading_comment_doc_from_str(&expr_leading_comment)
+                                            .append(expr_doc.group()),
+                                    )
+                                    .nest(context.config.indent_width)
+                                    .append(RcDoc::line())
+                                    .append(rb_doc)
+                                    .group(),
                                 ),
                             ),
                         )
@@ -145,15 +148,15 @@ impl Doc for Node<Option<Cond>> {
                 cond_doc
                     .append(get_trailing_comment_doc_from_str(
                         &cond_comment.trailing_comment,
+                        RcDoc::line(),
                     ))
-                    .append(RcDoc::line())
                     .append(
                         get_leading_comment_doc_from_str(&lb_comment.leading_comment).append(
                             RcDoc::text("{")
                                 .append(get_trailing_comment_doc_from_str(
                                     &lb_comment.trailing_comment,
+                                    RcDoc::line(),
                                 ))
-                                .append(RcDoc::line())
                                 .append(rb_doc)
                                 .group(),
                         ),
@@ -169,12 +172,12 @@ impl Doc for Node<Option<Expr>> {
         match self.as_inner()?.expr.as_ref() {
             ExprData::If(c, t, e) => {
                 fn pp_group<'n>(
-                    s: &str,
+                    s: &'n str,
                     c: Comment,
                     e: &'n Node<Option<Expr>>,
                     context: &mut Context<'_>,
                 ) -> RcDoc<'n> {
-                    add_comment(RcDoc::as_string(s), c, RcDoc::nil()).append(
+                    add_comment(RcDoc::text(s), c, RcDoc::nil()).append(
                         RcDoc::line()
                             .append(e.to_doc(context))
                             .nest(context.config.indent_width),
@@ -330,7 +333,7 @@ impl Doc for Node<Option<Relation>> {
 
 impl Doc for AddOp {
     fn to_doc(&self, _: &mut Context<'_>) -> Option<RcDoc<'_>> {
-        Some(RcDoc::text(self.to_string()))
+        Some(RcDoc::as_string(self))
     }
 }
 
@@ -366,7 +369,7 @@ impl Doc for Node<Option<Add>> {
 
 impl Doc for MultOp {
     fn to_doc(&self, _: &mut Context<'_>) -> Option<RcDoc<'_>> {
-        Some(RcDoc::text(self.to_string()))
+        Some(RcDoc::as_string(self))
     }
 }
 
@@ -420,9 +423,9 @@ impl Doc for Node<Option<Unary>> {
                                 .map(|i| {
                                     Some(add_comment(
                                         if matches!(op, NegOp::Bang(_)) {
-                                            RcDoc::as_string("!")
+                                            RcDoc::text("!")
                                         } else {
-                                            RcDoc::as_string("-")
+                                            RcDoc::text("-")
                                         },
                                         comment.get(i as usize)?.clone(),
                                         RcDoc::nil(),
@@ -467,7 +470,7 @@ impl Doc for Node<Option<RecInit>> {
             key_doc
                 .append(RcDoc::line_())
                 .append(add_comment(
-                    RcDoc::as_string(":"),
+                    RcDoc::text(":"),
                     get_comment_after_end(e.0.loc.span, &mut context.tokens)?,
                     RcDoc::nil(),
                 ))
@@ -493,7 +496,7 @@ impl Doc for Node<Option<Name>> {
                             let (d, e) = pair;
                             Some((
                                 d.append(add_comment(
-                                    RcDoc::as_string("::"),
+                                    RcDoc::text("::"),
                                     get_comment_after_end(e.loc.span, &mut context.tokens)?,
                                     RcDoc::nil(),
                                 ))
@@ -504,7 +507,7 @@ impl Doc for Node<Option<Name>> {
                     )?
                     .0
                     .append(add_comment(
-                        RcDoc::as_string("::"),
+                        RcDoc::text("::"),
                         get_comment_after_end(path.last()?.loc.span, &mut context.tokens)?,
                         RcDoc::nil(),
                     ))
@@ -517,6 +520,9 @@ impl Doc for Node<Option<Name>> {
 impl Doc for Node<Option<Str>> {
     fn to_doc(&self, context: &mut Context<'_>) -> Option<RcDoc<'_>> {
         let e = self.as_inner()?;
+        // Note: the input string may contain newlines, but `utils::create_multiline_doc`
+        // _cannot_ be used here because that function changes the indentation
+        // after newlines, which may alter the string content.
         Some(add_comment(
             RcDoc::as_string(e),
             get_comment_at_start(self.loc.span, &mut context.tokens)?,
@@ -595,7 +601,7 @@ impl Doc for Node<Option<Primary>> {
                             let (d, e) = pair;
                             Some((
                                 d.append(add_comment(
-                                    RcDoc::as_string(","),
+                                    RcDoc::text(","),
                                     get_comment_after_end(e.loc.span, &mut context.tokens)?,
                                     RcDoc::nil(),
                                 ))
@@ -627,7 +633,7 @@ impl Doc for Node<Option<Primary>> {
                             let (d, e) = pair;
                             Some((
                                 d.append(add_comment(
-                                    RcDoc::as_string(","),
+                                    RcDoc::text(","),
                                     get_comment_after_end(e.loc.span, &mut context.tokens)?,
                                     RcDoc::nil(),
                                 ))
@@ -684,7 +690,7 @@ impl Doc for Node<Option<MemAccess>> {
                                 let (d, e) = pair;
                                 Some((
                                     d.append(add_comment(
-                                        RcDoc::as_string(","),
+                                        RcDoc::text(","),
                                         get_comment_after_end(e.loc.span, &mut context.tokens)?,
                                         RcDoc::nil(),
                                     ))
diff --git a/cedar-policy-formatter/src/pprint/fmt.rs b/cedar-policy-formatter/src/pprint/fmt.rs
@@ -120,7 +120,7 @@ pub fn policies_str_to_pretty(ps: &str, config: &Config) -> Result<String> {
         .ok_or(miette!("fail to get input policy CST"))?
         .0
         .iter()
-        .map(|p| Ok(remove_empty_lines(tree_to_pretty(p, &mut context)?.trim())))
+        .map(|p| Ok(remove_empty_lines(&tree_to_pretty(p, &mut context)?)))
         .collect::<Result<Vec<String>>>()?
         .join("\n\n");
     // handle comment at the end of a policyset
diff --git a/cedar-policy-formatter/src/pprint/utils.rs b/cedar-policy-formatter/src/pprint/utils.rs
@@ -16,6 +16,7 @@
 
 use itertools::Itertools;
 use pretty::RcDoc;
+use regex::Regex;
 
 use super::token::{Comment, WrappedToken};
 
@@ -24,24 +25,37 @@ pub fn add_brackets<'a>(d: RcDoc<'a>, leftp: RcDoc<'a>, rightp: RcDoc<'a>) -> Rc
     leftp.append(d.nest(1)).append(rightp)
 }
 
+/// Convert a leading comment to an `RcDoc`, adding leading and trailing newlines.
 pub fn get_leading_comment_doc_from_str<'a>(leading_comment: &str) -> RcDoc<'a> {
     if leading_comment.is_empty() {
         RcDoc::nil()
     } else {
-        let cs: RcDoc<'_> = RcDoc::intersperse(
-            leading_comment
-                .trim()
-                .split('\n')
-                .map(|c| RcDoc::text(c.to_owned())),
-            RcDoc::hardline(),
-        );
-        RcDoc::hardline().append(cs).append(RcDoc::hardline())
+        RcDoc::hardline()
+            .append(create_multiline_doc(leading_comment))
+            .append(RcDoc::hardline())
     }
 }
 
-pub fn get_trailing_comment_doc_from_str<'a>(trailing_comment: &str) -> RcDoc<'a> {
+/// Convert multiline text into an `RcDoc`. Both `RcDoc::as_string` and
+/// `RcDoc::text` allow newlines in the text (although the official
+/// documentation says they don't), but the resulting text will maintain its
+/// original indentation instead of the new "pretty" indentation.
+fn create_multiline_doc<'a>(str: &str) -> RcDoc<'a> {
+    RcDoc::intersperse(
+        str.trim().split('\n').map(|c| RcDoc::text(c.to_owned())),
+        RcDoc::hardline(),
+    )
+}
+
+/// Convert a trailing comment to an `RcDoc`, adding a trailing newline. If the
+/// comment is empty, `next_doc` is returned instead. There is no need to use
+/// `create_multiline_doc` because a trailing comment cannot contain newlines.
+pub fn get_trailing_comment_doc_from_str<'a>(
+    trailing_comment: &str,
+    next_doc: RcDoc<'a>,
+) -> RcDoc<'a> {
     if trailing_comment.is_empty() {
-        RcDoc::nil()
+        next_doc
     } else {
         RcDoc::space()
             .append(RcDoc::text(trailing_comment.trim().to_owned()))
@@ -112,26 +126,83 @@ pub fn get_comment_in_range(span: miette::SourceSpan, tokens: &mut [WrappedToken
         .collect()
 }
 
-// Wrap doc with comment
+/// Wrap an `RcDoc` with comments. If there is a leading comment, then this
+/// will introduce a newline at the start of the `RcDoc`. If there is a
+/// trailing comment, then it will introduce a newline at the end.
 pub fn add_comment<'a>(d: RcDoc<'a>, comment: Comment, next_doc: RcDoc<'a>) -> RcDoc<'a> {
     let leading_comment = comment.leading_comment;
     let trailing_comment = comment.trailing_comment;
     let leading_comment_doc = get_leading_comment_doc_from_str(&leading_comment);
-    let trailing_comment_doc: RcDoc<'_> = if trailing_comment.is_empty() {
-        d.append(next_doc)
-    } else {
-        d.append(RcDoc::space())
-            .append(RcDoc::text(trailing_comment.trim().to_owned()))
-            .append(RcDoc::hardline())
-    };
+    let trailing_comment_doc = get_trailing_comment_doc_from_str(&trailing_comment, next_doc);
+    leading_comment_doc.append(d).append(trailing_comment_doc)
+}
 
-    leading_comment_doc.append(trailing_comment_doc.clone())
+/// Remove empty lines from the input string, ignoring the first and last lines.
+/// (Because of how this function is used in `remove_empty_lines`, the first and
+/// last lines may include important spacing information.) This will remove empty
+/// lines _everywhere_, including in places where that may not be desired
+/// (e.g., in string literals).
+fn remove_empty_interior_lines(s: &str) -> String {
+    let mut new_s = String::new();
+    if s.starts_with('\n') {
+        new_s.push_str("\n");
+    }
+    new_s.push_str(
+        s.split_inclusive('\n')
+            // in the case where `s` does not end in a newline, `!ss.contains('\n')`
+            // preserves whitespace on the last line
+            .filter(|ss| !ss.trim().is_empty() || !ss.contains('\n'))
+            .collect::<Vec<_>>()
+            .join("")
+            .as_str(),
+    );
+    new_s
 }
 
-pub fn remove_empty_lines(s: &str) -> String {
-    s.lines()
-        .filter(|ss| !ss.trim().is_empty())
-        .map(|s| s.to_owned())
-        .collect::<Vec<String>>()
-        .join("\n")
+/// Remove empty lines, safely handling newlines that occur in quotations.
+pub fn remove_empty_lines(text: &str) -> String {
+    // PANIC SAFETY: this regex pattern is valid
+    #[allow(clippy::unwrap_used)]
+    let comment_regex = Regex::new(r"//[^\n]*").unwrap();
+    // PANIC SAFETY: this regex pattern is valid
+    #[allow(clippy::unwrap_used)]
+    let string_regex = Regex::new(r#""(\\.|[^"\\])*"[^\n]*"#).unwrap();
+
+    let mut index = 0;
+    let mut final_text = String::new();
+
+    while index < text.len() {
+        // Check for the next comment and string. The general strategy is to
+        // call `remove_empty_interior_lines` on all the text _outside_ of
+        // strings. Comments should be skipped to avoid interpreting a quote in
+        // a comment as a string.
+        let comment_match = comment_regex.find_at(text, index);
+        let string_match = string_regex.find_at(text, index);
+        match (comment_match, string_match) {
+            (Some(m1), Some(m2)) => {
+                // Handle the earlier match
+                let m = std::cmp::min_by_key(m1, m2, |m| m.start());
+                // PANIC SAFETY: Slicing `text` is safe since `index <= m.start()` and both are within the bounds of `text`.
+                #[allow(clippy::indexing_slicing)]
+                final_text.push_str(&remove_empty_interior_lines(&text[index..m.start()]));
+                final_text.push_str(m.as_str());
+                index = m.end();
+            }
+            (Some(m), None) | (None, Some(m)) => {
+                // PANIC SAFETY: Slicing `text` is safe since `index <= m.start()` and both are within the bounds of `text`.
+                #[allow(clippy::indexing_slicing)]
+                final_text.push_str(&remove_empty_interior_lines(&text[index..m.start()]));
+                final_text.push_str(m.as_str());
+                index = m.end();
+            }
+            (None, None) => {
+                // PANIC SAFETY: Slicing `text` is safe since `index` is within the bounds of `text`.
+                #[allow(clippy::indexing_slicing)]
+                final_text.push_str(&remove_empty_interior_lines(&text[index..]));
+                break;
+            }
+        }
+    }
+    // Trim the final result to account for dangling newlines
+    final_text.trim().to_string()
 }
diff --git a/cedar-policy-formatter/tests/blank_lines.cedar b/cedar-policy-formatter/tests/blank_lines.cedar
@@ -0,0 +1,34 @@
+// Test fix for #862 where blank lines in strings were removed.
+
+// The output of the formatter should change string or eid content (including
+// removing blank lines) because this will change the policy's semantics. It is
+// ok to remove blank lines everywhere else.
+
+permit(principal == User
+
+::
+
+"alice", action, resource 
+
+in Folder::"Name
+
+	
+with a newline") when // trailing comment
+
+{
+
+    context.foo == "string
+
+	with
+  
+  newlines and other	strange characters🐈👍\"
+
+// even something that looks like a comment
+  
+"
+
+// Quotes in comments "
+
+// shouldn't matter "
+
+};
diff --git a/cedar-policy-formatter/tests/snapshots/cedar_policy_formatter__pprint__fmt__tests__format_files@blank_lines.cedar.snap b/cedar-policy-formatter/tests/snapshots/cedar_policy_formatter__pprint__fmt__tests__format_files@blank_lines.cedar.snap
diff --git a/cedar-policy/CHANGELOG.md b/cedar-policy/CHANGELOG.md