Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 91 additions & 5 deletions StyleCop.Analyzers/StyleCop.Analyzers/Helpers/XmlCommentHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ namespace StyleCop.Analyzers.Helpers
{
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp.Syntax;
Expand Down Expand Up @@ -268,22 +267,109 @@ internal static string GetText(XmlTextSyntax textElement, bool normalizeWhitespa
return null;
}

StringBuilder stringBuilder = StringBuilderPool.Allocate();
string result = string.Empty;

StringBuilder stringBuilder = null;

foreach (var item in textElement.TextTokens)
{
stringBuilder.Append(item);
if (result.Length == 0)
{
result = item.ToString();
}
else
{
if (stringBuilder == null)
{
stringBuilder = StringBuilderPool.Allocate();
stringBuilder.Append(result);
}

stringBuilder.Append(item.ToString());
}
}

if (stringBuilder != null)
{
result = StringBuilderPool.ReturnAndFree(stringBuilder);
}

string result = StringBuilderPool.ReturnAndFree(stringBuilder);
if (normalizeWhitespace)
{
result = Regex.Replace(result, @"\s+", " ");
result = result.NormalizeWhiteSpace();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just curious, @stephentoub on the latest .NET (of course this is broader targeted) would you expect regex replace using pre generated regex to have comparable performance to open coded?

I didn't measure, but fwiw

        [GeneratedRegex(@"[\s^ ]+")]
        private static partial Regex GetRegex();

generates code in part like

			private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)
			{
				int pos = runtextpos;
				if ((uint)pos < (uint)inputSpan.Length)
				{
					ReadOnlySpan<char> span = inputSpan.Slice(pos);
					for (int i = 0; i < span.Length; i++)
					{
						char ch;
						if (((ch = span[i]) < '\u0080') ? ((byte)("㸀\0\u0001\0\0䀀\0\0"[(int)ch >> 4] & (1 << (ch & 0xF))) != 0) : RegexRunner.CharInClass(ch, "\0\u0004\u0001 !^_d"))
						{
							runtextpos = pos + i;
							return true;
						}
					}
				}
				runtextpos = inputSpan.Length;
				return false;

the code below hides some of this inside char.IsWhiteSpace

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and will return the same string if there are no non-space whitespace chars.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After addressing the single line issue and regular expression issue, more optimizations could be having much less return.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just curious, @stephentoub on the latest .NET (of course this is broader targeted) would you expect regex replace using pre generated regex to have comparable performance to open coded?

Yes.

Note that in .NET 8, the generated code now looks like this:

                private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)
                {
                    int pos = base.runtextpos;
                    
                    // Empty matches aren't possible.
                    if ((uint)pos < (uint)inputSpan.Length)
                    {
                        // The pattern begins with a whitespace character.
                        // Find the next occurrence. If it can't be found, there's no match.
                        int i = inputSpan.Slice(pos).IndexOfAnyWhiteSpace();
                        if (i >= 0)
                        {
                            base.runtextpos = pos + i;
                            return true;
                        }
                    }
                    
                    // No match found.
                    base.runtextpos = inputSpan.Length;
                    return false;
                }

where that IndexOfAnyWhitespace is emitted as:

        /// <summary>Finds the next index of any character that matches a whitespace character.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static int IndexOfAnyWhiteSpace(this ReadOnlySpan<char> span)
        {
            int i = span.IndexOfAnyExcept(Utilities.s_asciiExceptWhiteSpace);
            if ((uint)i < (uint)span.Length)
            {
                if (char.IsAscii(span[i]))
                {
                    return i;
                }
        
                do
                {
                    if (char.IsWhiteSpace(span[i]))
                    {
                        return i;
                    }
                    i++;
                }
                while ((uint)i < (uint)span.Length);
            }
        
            return -1;
        }
        
        /// <summary>Supports searching for characters in or not in "\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f!\"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f".</summary>
        internal static readonly IndexOfAnyValues<char> s_asciiExceptWhiteSpace = IndexOfAnyValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f");

such that the search for the whitespace is vectorized. Then the matching logic for finding all the contiguous whitespace currently looks like:

                    // Match a whitespace character atomically at least once.
                    {
                        int iteration = 0;
                        while ((uint)iteration < (uint)slice.Length && char.IsWhiteSpace(slice[iteration]))
                        {
                            iteration++;
                        }
                        
                        if (iteration == 0)
                        {
                            return false; // The input didn't match.
                        }
                        
                        slice = slice.Slice(iteration);
                        pos += iteration;
                    }

(though we're flirting with a change that'll result in that loop also changing to similarly be vectorized with IndexOfAnyExcept).

Replace itself also gets better in .NET 8.

}

return result;
}

internal static string NormalizeWhiteSpace(this string text)
{
if (text == null)
{
return null;
}

int length = text.Length;

bool lastSpace = false;

bool diff = false;

foreach (char ch in text)
{
if (char.IsWhiteSpace(ch))
{
if (lastSpace)
{
length--;
}
else
{
if (ch != ' ')
{
diff = true;
}

lastSpace = true;
}
}
else
{
lastSpace = false;
}
}

if (diff || (length != text.Length))
{
char[] buffer = new char[length];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as you probably know, if this code was targeting .NET Core (or conditionally compiled thus) you could eliminate this char[] allocation by using string.Create( ...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but this project is targeting multiple runtimes.


lastSpace = false;

length = 0;

foreach (char ch in text)
{
if (char.IsWhiteSpace(ch))
{
if (!lastSpace)
{
buffer[length++] = ' ';
lastSpace = true;
}
}
else
{
buffer[length++] = ch;
lastSpace = false;
}
}

return new string(buffer, 0, length);
}

return text;
}

internal static T GetFirstAttributeOrDefault<T>(XmlNodeSyntax nodeSyntax)
where T : XmlAttributeSyntax
{
Expand Down