-
Notifications
You must be signed in to change notification settings - Fork 22
Add analyzer and fix to escape non-ASCII literal values #450
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,200 @@ | ||
| using System.Collections.Immutable; | ||
| using System.Text; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.CodeAnalysis; | ||
| using Microsoft.CodeAnalysis.CodeActions; | ||
| using Microsoft.CodeAnalysis.CodeFixes; | ||
| using Microsoft.CodeAnalysis.CSharp; | ||
| using Microsoft.CodeAnalysis.CSharp.Syntax; | ||
| using Microsoft.CodeAnalysis.Diagnostics; | ||
|
|
||
| namespace D2L.CodeStyle.Analyzers.Language { | ||
| // We are doing this because we (unfortunately) have a mix of encoding for | ||
| // our source code files, and the encoding of the file impacts how strings | ||
| // are interpreted. We really ought to clean that up, but until that | ||
| // happens we can be safer if we avoid non-ASCII characters in our | ||
| // literals. Automated refactoring has bit us before with this. | ||
| [DiagnosticAnalyzer( LanguageNames.CSharp )] | ||
| internal sealed class EscapeNonAsciiCharsInLiteralsAnalyzer : DiagnosticAnalyzer { | ||
| public override ImmutableArray<DiagnosticDescriptor> SupportedDiagnostics | ||
| => ImmutableArray.Create( Diagnostics.EscapeNonAsciiCharsInLiteral ); | ||
|
|
||
| public override void Initialize( AnalysisContext context ) { | ||
| context.EnableConcurrentExecution(); | ||
|
|
||
| context.RegisterSyntaxNodeAction( | ||
| CheckForUnicodeLiterals, | ||
| SyntaxKind.StringLiteralExpression | ||
| ); | ||
|
|
||
| context.RegisterSyntaxNodeAction( | ||
| CheckForUnicodeLiterals, | ||
| SyntaxKind.CharacterLiteralExpression | ||
| ); | ||
| } | ||
|
|
||
| private static void CheckForUnicodeLiterals( | ||
| SyntaxNodeAnalysisContext ctx | ||
| ) { | ||
| // the node could be something like: | ||
| // "foo" | ||
| // @"foo" | ||
| // 'x' | ||
| var literalExpr = (LiteralExpressionSyntax)ctx.Node; | ||
|
|
||
| // We can't handle verbatim strings for the same reason as these | ||
| // folks: https://github.com/dotnet/codeformatter/issues/39 (TODO) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found this quite interesting. |
||
| if ( literalExpr.Token.IsVerbatimStringLiteral() ) { | ||
| return; | ||
| } | ||
|
|
||
| string token = literalExpr.Token.Text; | ||
| string escapedToken = null; | ||
|
|
||
| if( StrictlyAscii( token, out escapedToken ) ) { | ||
| return; | ||
| } | ||
|
|
||
| bool isChar = | ||
| literalExpr.Kind() == SyntaxKind.CharacterLiteralExpression; | ||
|
|
||
| var fixProps = ImmutableDictionary.CreateBuilder<string, string>(); | ||
| fixProps[EscapeCharCodeFix.ESCAPED] = escapedToken; | ||
|
|
||
| ctx.ReportDiagnostic( | ||
| Diagnostic.Create( | ||
| Diagnostics.EscapeNonAsciiCharsInLiteral, | ||
| literalExpr.GetLocation(), | ||
| fixProps.ToImmutable(), | ||
| isChar ? "char" : "string", | ||
| escapedToken | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| private static bool StrictlyAscii( | ||
| string token, | ||
| out string escapedToken | ||
| ) { | ||
| var sb = new StringBuilder(); | ||
| var copyStartIdx = 0; | ||
|
|
||
| // invariant: copyStartIdx < idx | ||
| // Note: the enclosing quotes are included in val; don't bother looking at them. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, variable rename |
||
| for( int idx = 1; idx < token.Length - 1; idx++ ) { | ||
| if( token[idx] < 0x80 ) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lol - so simple |
||
| continue; | ||
| } | ||
|
|
||
| // copy all the ascii chars we've seen between the last copy | ||
| // and now (not inclusive) into sb | ||
| sb.Append( token, copyStartIdx, idx - copyStartIdx ); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems more complicated than copying char-at-a-time, and I'm not sure I see the advantage given that you're using
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's "basically a no-op" when the string is ASCII. Microsoft did it with two passes (one to detect nonASCIIness, one to do the copying) which did char-by-char copying in the second step. It's technically more work to do that (two passes, and char by char is still worse than chunks at a time) but its so neglible and only happens during errors anyway so really not interesting. Honestly I just wrote it this way because it came out of my head this way. I'll look at it again. |
||
|
|
||
| // next time we'll start copying from the char after us | ||
| // (unless this happens again next loop) | ||
| copyStartIdx = idx + 1; | ||
|
|
||
| if ( IsSurrogatePair( token, idx ) ) { | ||
| sb.AppendFormat( | ||
| @"\U{0:X8}", | ||
| char.ConvertToUtf32( token[idx], token[idx + 1] ) | ||
| ); | ||
| } else { | ||
| sb.AppendFormat( | ||
| @"\u{0:X4}", | ||
| (ushort)token[idx] | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| // if copyStartIdx never changed we never saw non-ascii chars and | ||
| // sb is empty. | ||
| if ( copyStartIdx == 0 ) { | ||
| escapedToken = null; | ||
| return true; | ||
| } | ||
|
|
||
| // copy trailing ascii into sb. This is never a no-op because we | ||
| // need to at least copy the ending quote. | ||
| sb.Append( token, copyStartIdx, token.Length - copyStartIdx ); | ||
|
|
||
| escapedToken = sb.ToString(); | ||
| return false; | ||
| } | ||
|
|
||
| private static bool IsSurrogatePair( string str, int idx ) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. D'oh! Thanks |
||
| return idx + 1 < str.Length | ||
| && char.IsHighSurrogate( str[idx] ) | ||
| && char.IsLowSurrogate( str[idx + 1] ); | ||
| } | ||
| } | ||
|
|
||
| [ExportCodeFixProvider( | ||
| LanguageNames.CSharp, | ||
| Name = nameof( EscapeCharCodeFix ) | ||
| )] | ||
| public sealed class EscapeCharCodeFix : CodeFixProvider { | ||
| public const string ESCAPED = "escaped"; | ||
|
|
||
| public override ImmutableArray<string> FixableDiagnosticIds | ||
| => ImmutableArray.Create( | ||
| Diagnostics.EscapeNonAsciiCharsInLiteral.Id | ||
| ); | ||
|
|
||
| public override FixAllProvider GetFixAllProvider() { | ||
| return WellKnownFixAllProviders.BatchFixer; | ||
| } | ||
|
|
||
| public override async Task RegisterCodeFixesAsync( CodeFixContext ctx ) { | ||
| var root = await ctx.Document | ||
| .GetSyntaxRootAsync( ctx.CancellationToken ) | ||
| .ConfigureAwait( false ); | ||
|
|
||
| foreach( var diagnostic in ctx.Diagnostics ) { | ||
| var span = diagnostic.Location.SourceSpan; | ||
|
|
||
| var literal = root.FindNode( span, getInnermostNodeForTie: true ) | ||
| as LiteralExpressionSyntax; | ||
|
|
||
| if ( literal == null ) { | ||
| continue; | ||
| } | ||
|
|
||
| var escapedToken = diagnostic.Properties[ESCAPED]; | ||
|
|
||
| ctx.RegisterCodeFix( | ||
| CodeAction.Create( | ||
| title: "Escape literal", | ||
| ct => Fix( ctx.Document, root, literal, escapedToken ), | ||
| equivalenceKey: nameof( EscapeCharCodeFix ) | ||
| ), | ||
| diagnostic | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| private static Task<Document> Fix( | ||
| Document doc, | ||
| SyntaxNode root, | ||
| LiteralExpressionSyntax literal, | ||
| string escapedToken | ||
| ) { | ||
| var comment = SyntaxFactory | ||
| .Comment( $"/* unencoded: {literal.Token.Text} */" ); | ||
|
|
||
| var newLiteral = literal.WithToken( | ||
| SyntaxFactory.ParseToken( escapedToken ) | ||
| ).WithLeadingTrivia( | ||
| literal.GetLeadingTrivia() | ||
| .Add( comment ) | ||
| .Add( SyntaxFactory.Whitespace( " " ) ) | ||
| ); | ||
|
|
||
| var newRoot = root.ReplaceNode( literal, newLiteral ); | ||
|
|
||
| var newDoc = doc.WithSyntaxRoot( newRoot ); | ||
|
|
||
| return Task.FromResult( newDoc ); | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. string foo = "αβγ";--> string foo = /* unencoded: "αβγ" */ "\u03B1\u03B2\u03B3";The comment could get out of sync. Options:
Thoughts?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no chance that I'll notice an out-of-sync comment in a review. I'd lean towards #1 unless there lots of instances where the specific value is user-facing (seems unlikely since user-facing values should be langTerms, not string constants). Alternately, is there a visualizer that would work here? E.g. it'd be handy if VS could overlay the "rendered" value of the string.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll just remove it. |
||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| // analyzer: D2L.CodeStyle.Analyzers.Language.EscapeNonAsciiCharsInLiteralsAnalyzer | ||
|
|
||
| namespace D2L.CodeStyle.Analyzers.Specs { | ||
| public static class Tests { | ||
| const string EmptyString = ""; | ||
| const string SingleCharString = "x"; | ||
| const string BigString = "this is a string literal\a with \"lots\" of ASCII \041\x21! It even has escaped unicode like \u03c0 = 3.14159..."; | ||
|
|
||
| const string StartsWithNonAscii = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"\u03C0 = 3.14159...") */ "π = 3.14159..." /**/; | ||
|
|
||
| const string NoAscii = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"\u03B1\u03B2\u03B3") */ "αβγ" /**/; | ||
|
|
||
| const string ANiceMix = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"alpha \u03B1 beta \u03B2 alpha-beta \u03B1\u03B2 gamma \u03B3 alpha-beta-gamma \u03B1\u03B2\u03B3.") */ "alpha α beta β alpha-beta αβ gamma γ alpha-beta-gamma αβγ." /**/; | ||
|
|
||
| const string TableFlip = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"(\u256F\u00B0\u25A1\u00B0\uFF09\u256F\uFE35 \u253B\u2501\u253B") */ "(╯°□°)╯︵ ┻━┻" /**/; | ||
|
|
||
| const string Brail = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"\u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C") */ "⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌" /**/; | ||
|
|
||
| // This one hits the branch for surrogate pairs | ||
| const string MormonTwinkleTwinkleLittleStar = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ha! |
||
| /* EscapeNonAsciiCharsInLiteral(string,"\U00010413\uDC13\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001043B\uDC3B\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001044A\uDC4A\U0001042E\uDC2E\U0001043B\uDC3B\U0001044A\uDC4A \U00010445\uDC45\U0001043B\uDC3B\U0001042A\uDC2A\U00010449\uDC49") */ "𐐓𐐶𐐮𐑍𐐿𐑊 𐐻𐐶𐐮𐑍𐐿𐑊 𐑊𐐮𐐻𐑊 𐑅𐐻𐐪𐑉" /**/; | ||
|
|
||
| const string Emojis = | ||
| /* EscapeNonAsciiCharsInLiteral(string,"\U0001F602\uDE02\U0001F60D\uDE0D\U0001F389\uDF89\U0001F44D\uDC4D") */ "😂😍🎉👍" /**/; | ||
|
|
||
| const char AsciiChar = 'x'; | ||
| const char OtherAsciiChar = '\x21'; | ||
| const char Japanese = /* EscapeNonAsciiCharsInLiteral(char,'\u3041') */ 'ぁ' /**/; | ||
| const char EscapedChar = '\u3041'; | ||
|
|
||
| // TODO: this is ignored. We need to implement this: | ||
| // https://github.com/dotnet/codeformatter/issues/39 | ||
| const string VerbatimString = @"Would you like to build a ☃?"; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider providing a description of (or link to) what can go wrong without this.