diff --git a/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj b/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj index a83d791c0..d431f3357 100644 --- a/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj +++ b/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj @@ -49,6 +49,7 @@ + diff --git a/src/D2L.CodeStyle.Analyzers/Diagnostics.cs b/src/D2L.CodeStyle.Analyzers/Diagnostics.cs index fa29b9c10..9af15cb18 100644 --- a/src/D2L.CodeStyle.Analyzers/Diagnostics.cs +++ b/src/D2L.CodeStyle.Analyzers/Diagnostics.cs @@ -434,5 +434,15 @@ public static class Diagnostics { isEnabledByDefault: true, description: "The parameter {0} has a default value of {1} here, but {2} in its original definition in {3}. This causes inconsistent behaviour. Please use the same defualt value everywhere." ); + + public static readonly DiagnosticDescriptor EscapeNonAsciiCharsInLiteral = new DiagnosticDescriptor( + id: "D2L0050", + title: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.", + messageFormat: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.", + category: "Language", + defaultSeverity: DiagnosticSeverity.Error, + isEnabledByDefault: true, + description: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in." + ); } } diff --git a/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs b/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs new file mode 100644 index 000000000..39b7f2c3a --- /dev/null +++ b/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs @@ -0,0 +1,200 @@ +using System.Collections.Immutable; +using System.Text; +using System.Threading.Tasks; +using Microsoft.CodeAnalysis; +using Microsoft.CodeAnalysis.CodeActions; +using Microsoft.CodeAnalysis.CodeFixes; +using Microsoft.CodeAnalysis.CSharp; +using Microsoft.CodeAnalysis.CSharp.Syntax; +using Microsoft.CodeAnalysis.Diagnostics; + +namespace D2L.CodeStyle.Analyzers.Language { + // We are doing this because we (unfortunately) have a mix of encoding for + // our source code files, and the encoding of the file impacts how strings + // are interpreted. We really ought to clean that up, but until that + // happens we can be safer if we avoid non-ASCII characters in our + // literals. Automated refactoring has bit us before with this. + [DiagnosticAnalyzer( LanguageNames.CSharp )] + internal sealed class EscapeNonAsciiCharsInLiteralsAnalyzer : DiagnosticAnalyzer { + public override ImmutableArray SupportedDiagnostics + => ImmutableArray.Create( Diagnostics.EscapeNonAsciiCharsInLiteral ); + + public override void Initialize( AnalysisContext context ) { + context.EnableConcurrentExecution(); + + context.RegisterSyntaxNodeAction( + CheckForUnicodeLiterals, + SyntaxKind.StringLiteralExpression + ); + + context.RegisterSyntaxNodeAction( + CheckForUnicodeLiterals, + SyntaxKind.CharacterLiteralExpression + ); + } + + private static void CheckForUnicodeLiterals( + SyntaxNodeAnalysisContext ctx + ) { + // the node could be something like: + // "foo" + // @"foo" + // 'x' + var literalExpr = (LiteralExpressionSyntax)ctx.Node; + + // We can't handle verbatim strings for the same reason as these + // folks: https://github.com/dotnet/codeformatter/issues/39 (TODO) + if ( literalExpr.Token.IsVerbatimStringLiteral() ) { + return; + } + + string token = literalExpr.Token.Text; + string escapedToken = null; + + if( StrictlyAscii( token, out escapedToken ) ) { + return; + } + + bool isChar = + literalExpr.Kind() == SyntaxKind.CharacterLiteralExpression; + + var fixProps = ImmutableDictionary.CreateBuilder(); + fixProps[EscapeCharCodeFix.ESCAPED] = escapedToken; + + ctx.ReportDiagnostic( + Diagnostic.Create( + Diagnostics.EscapeNonAsciiCharsInLiteral, + literalExpr.GetLocation(), + fixProps.ToImmutable(), + isChar ? "char" : "string", + escapedToken + ) + ); + } + + private static bool StrictlyAscii( + string token, + out string escapedToken + ) { + var sb = new StringBuilder(); + var copyStartIdx = 0; + + // invariant: copyStartIdx < idx + // Note: the enclosing quotes are included in val; don't bother looking at them. + for( int idx = 1; idx < token.Length - 1; idx++ ) { + if( token[idx] < 0x80 ) { + continue; + } + + // copy all the ascii chars we've seen between the last copy + // and now (not inclusive) into sb + sb.Append( token, copyStartIdx, idx - copyStartIdx ); + + // next time we'll start copying from the char after us + // (unless this happens again next loop) + copyStartIdx = idx + 1; + + if ( IsSurrogatePair( token, idx ) ) { + sb.AppendFormat( + @"\U{0:X8}", + char.ConvertToUtf32( token[idx], token[idx + 1] ) + ); + } else { + sb.AppendFormat( + @"\u{0:X4}", + (ushort)token[idx] + ); + } + } + + // if copyStartIdx never changed we never saw non-ascii chars and + // sb is empty. + if ( copyStartIdx == 0 ) { + escapedToken = null; + return true; + } + + // copy trailing ascii into sb. This is never a no-op because we + // need to at least copy the ending quote. + sb.Append( token, copyStartIdx, token.Length - copyStartIdx ); + + escapedToken = sb.ToString(); + return false; + } + + private static bool IsSurrogatePair( string str, int idx ) { + return idx + 1 < str.Length + && char.IsHighSurrogate( str[idx] ) + && char.IsLowSurrogate( str[idx + 1] ); + } + } + + [ExportCodeFixProvider( + LanguageNames.CSharp, + Name = nameof( EscapeCharCodeFix ) + )] + public sealed class EscapeCharCodeFix : CodeFixProvider { + public const string ESCAPED = "escaped"; + + public override ImmutableArray FixableDiagnosticIds + => ImmutableArray.Create( + Diagnostics.EscapeNonAsciiCharsInLiteral.Id + ); + + public override FixAllProvider GetFixAllProvider() { + return WellKnownFixAllProviders.BatchFixer; + } + + public override async Task RegisterCodeFixesAsync( CodeFixContext ctx ) { + var root = await ctx.Document + .GetSyntaxRootAsync( ctx.CancellationToken ) + .ConfigureAwait( false ); + + foreach( var diagnostic in ctx.Diagnostics ) { + var span = diagnostic.Location.SourceSpan; + + var literal = root.FindNode( span, getInnermostNodeForTie: true ) + as LiteralExpressionSyntax; + + if ( literal == null ) { + continue; + } + + var escapedToken = diagnostic.Properties[ESCAPED]; + + ctx.RegisterCodeFix( + CodeAction.Create( + title: "Escape literal", + ct => Fix( ctx.Document, root, literal, escapedToken ), + equivalenceKey: nameof( EscapeCharCodeFix ) + ), + diagnostic + ); + } + } + + private static Task Fix( + Document doc, + SyntaxNode root, + LiteralExpressionSyntax literal, + string escapedToken + ) { + var comment = SyntaxFactory + .Comment( $"/* unencoded: {literal.Token.Text} */" ); + + var newLiteral = literal.WithToken( + SyntaxFactory.ParseToken( escapedToken ) + ).WithLeadingTrivia( + literal.GetLeadingTrivia() + .Add( comment ) + .Add( SyntaxFactory.Whitespace( " " ) ) + ); + + var newRoot = root.ReplaceNode( literal, newLiteral ); + + var newDoc = doc.WithSyntaxRoot( newRoot ); + + return Task.FromResult( newDoc ); + } + } +} diff --git a/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj b/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj index 8b4b98fab..ff6c2eac2 100644 --- a/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj +++ b/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj @@ -59,6 +59,7 @@ + diff --git a/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs b/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs new file mode 100644 index 000000000..31e18a4f5 --- /dev/null +++ b/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs @@ -0,0 +1,40 @@ +// analyzer: D2L.CodeStyle.Analyzers.Language.EscapeNonAsciiCharsInLiteralsAnalyzer + +namespace D2L.CodeStyle.Analyzers.Specs { + public static class Tests { + const string EmptyString = ""; + const string SingleCharString = "x"; + const string BigString = "this is a string literal\a with \"lots\" of ASCII \041\x21! It even has escaped unicode like \u03c0 = 3.14159..."; + + const string StartsWithNonAscii = + /* EscapeNonAsciiCharsInLiteral(string,"\u03C0 = 3.14159...") */ "π = 3.14159..." /**/; + + const string NoAscii = + /* EscapeNonAsciiCharsInLiteral(string,"\u03B1\u03B2\u03B3") */ "αβγ" /**/; + + const string ANiceMix = + /* EscapeNonAsciiCharsInLiteral(string,"alpha \u03B1 beta \u03B2 alpha-beta \u03B1\u03B2 gamma \u03B3 alpha-beta-gamma \u03B1\u03B2\u03B3.") */ "alpha α beta β alpha-beta αβ gamma γ alpha-beta-gamma αβγ." /**/; + + const string TableFlip = + /* EscapeNonAsciiCharsInLiteral(string,"(\u256F\u00B0\u25A1\u00B0\uFF09\u256F\uFE35 \u253B\u2501\u253B") */ "(╯°□°)╯︵ ┻━┻" /**/; + + const string Brail = + /* EscapeNonAsciiCharsInLiteral(string,"\u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C") */ "⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌" /**/; + + // This one hits the branch for surrogate pairs + const string MormonTwinkleTwinkleLittleStar = + /* EscapeNonAsciiCharsInLiteral(string,"\U00010413\uDC13\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001043B\uDC3B\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001044A\uDC4A\U0001042E\uDC2E\U0001043B\uDC3B\U0001044A\uDC4A \U00010445\uDC45\U0001043B\uDC3B\U0001042A\uDC2A\U00010449\uDC49") */ "𐐓𐐶𐐮𐑍𐐿𐑊 𐐻𐐶𐐮𐑍𐐿𐑊 𐑊𐐮𐐻𐑊 𐑅𐐻𐐪𐑉" /**/; + + const string Emojis = + /* EscapeNonAsciiCharsInLiteral(string,"\U0001F602\uDE02\U0001F60D\uDE0D\U0001F389\uDF89\U0001F44D\uDC4D") */ "😂😍🎉👍" /**/; + + const char AsciiChar = 'x'; + const char OtherAsciiChar = '\x21'; + const char Japanese = /* EscapeNonAsciiCharsInLiteral(char,'\u3041') */ 'ぁ' /**/; + const char EscapedChar = '\u3041'; + + // TODO: this is ignored. We need to implement this: + // https://github.com/dotnet/codeformatter/issues/39 + const string VerbatimString = @"Would you like to build a ☃?"; + } +}