diff --git a/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj b/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj
index a83d791c0..d431f3357 100644
--- a/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj
+++ b/src/D2L.CodeStyle.Analyzers/D2L.CodeStyle.Analyzers.csproj
@@ -49,6 +49,7 @@
+
diff --git a/src/D2L.CodeStyle.Analyzers/Diagnostics.cs b/src/D2L.CodeStyle.Analyzers/Diagnostics.cs
index fa29b9c10..9af15cb18 100644
--- a/src/D2L.CodeStyle.Analyzers/Diagnostics.cs
+++ b/src/D2L.CodeStyle.Analyzers/Diagnostics.cs
@@ -434,5 +434,15 @@ public static class Diagnostics {
isEnabledByDefault: true,
description: "The parameter {0} has a default value of {1} here, but {2} in its original definition in {3}. This causes inconsistent behaviour. Please use the same defualt value everywhere."
);
+
+ // Reported for a string or char literal that contains raw non-ASCII
+ // characters. Message arguments: {0} is the literal kind, {1} is the
+ // escaped replacement token. Only messageFormat is formatted with those
+ // arguments, so the title and description must not contain placeholders.
+ public static readonly DiagnosticDescriptor EscapeNonAsciiCharsInLiteral = new DiagnosticDescriptor(
+     id: "D2L0050",
+     title: "Literals should escape non-ASCII characters",
+     messageFormat: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.",
+     category: "Language",
+     defaultSeverity: DiagnosticSeverity.Error,
+     isEnabledByDefault: true,
+     description: "Non-ASCII characters in string and char literals should be written as Unicode escape sequences to avoid a dependency on the encoding of the file the literal is contained in."
+ );
}
}
diff --git a/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs b/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs
new file mode 100644
index 000000000..39b7f2c3a
--- /dev/null
+++ b/src/D2L.CodeStyle.Analyzers/Language/EscapeNonAsciiCharsInLiteralsAnalyzer.cs
@@ -0,0 +1,200 @@
+using System.Collections.Immutable;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.CodeAnalysis;
+using Microsoft.CodeAnalysis.CodeActions;
+using Microsoft.CodeAnalysis.CodeFixes;
+using Microsoft.CodeAnalysis.CSharp;
+using Microsoft.CodeAnalysis.CSharp.Syntax;
+using Microsoft.CodeAnalysis.Diagnostics;
+
+namespace D2L.CodeStyle.Analyzers.Language {
+ // We (unfortunately) have a mix of encodings for our source code files,
+ // and the encoding of a file impacts how the strings in it are
+ // interpreted. We really ought to clean that up, but until that happens we
+ // are safer if we avoid raw non-ASCII characters in our literals and write
+ // them as escape sequences instead. Automated refactoring has bitten us
+ // before with this.
+ [DiagnosticAnalyzer( LanguageNames.CSharp )]
+ internal sealed class EscapeNonAsciiCharsInLiteralsAnalyzer : DiagnosticAnalyzer {
+     public override ImmutableArray<DiagnosticDescriptor> SupportedDiagnostics
+         => ImmutableArray.Create( Diagnostics.EscapeNonAsciiCharsInLiteral );
+
+     /// <summary>
+     /// Registers the literal check for regular string literals and for
+     /// character literals.
+     /// </summary>
+     public override void Initialize( AnalysisContext context ) {
+         context.EnableConcurrentExecution();
+
+         context.RegisterSyntaxNodeAction(
+             CheckForUnicodeLiterals,
+             SyntaxKind.StringLiteralExpression,
+             SyntaxKind.CharacterLiteralExpression
+         );
+     }
+
+     /// <summary>
+     /// Reports <c>EscapeNonAsciiCharsInLiteral</c> on a literal whose token
+     /// text contains a non-ASCII character. The fully escaped token is
+     /// passed both as a message argument and as a diagnostic property so
+     /// that the code fix can apply it without recomputing it.
+     /// </summary>
+     private static void CheckForUnicodeLiterals(
+         SyntaxNodeAnalysisContext ctx
+     ) {
+         // the node could be something like:
+         //   "foo"
+         //   @"foo"
+         //   'x'
+         var literalExpr = (LiteralExpressionSyntax)ctx.Node;
+
+         // We can't handle verbatim strings for the same reason as these
+         // folks: https://github.com/dotnet/codeformatter/issues/39 (TODO)
+         if( literalExpr.Token.IsVerbatimStringLiteral() ) {
+             return;
+         }
+
+         string escapedToken;
+         if( StrictlyAscii( literalExpr.Token.Text, out escapedToken ) ) {
+             return;
+         }
+
+         bool isChar =
+             literalExpr.Kind() == SyntaxKind.CharacterLiteralExpression;
+
+         var fixProps = ImmutableDictionary.CreateBuilder<string, string>();
+         fixProps[EscapeCharCodeFix.ESCAPED] = escapedToken;
+
+         ctx.ReportDiagnostic(
+             Diagnostic.Create(
+                 Diagnostics.EscapeNonAsciiCharsInLiteral,
+                 literalExpr.GetLocation(),
+                 fixProps.ToImmutable(),
+                 isChar ? "char" : "string",
+                 escapedToken
+             )
+         );
+     }
+
+     /// <summary>
+     /// Checks the source text of a literal token (enclosing quotes
+     /// included) for characters outside the ASCII range.
+     /// </summary>
+     /// <param name="token">The literal exactly as written in source.</param>
+     /// <param name="escapedToken">
+     /// <c>null</c> when the token is pure ASCII. Otherwise the token with
+     /// each non-ASCII UTF-16 code unit replaced by <c>\uXXXX</c> and each
+     /// surrogate pair replaced by a single <c>\UXXXXXXXX</c>.
+     /// </param>
+     /// <returns>
+     /// <c>true</c> if no character between the enclosing quotes is
+     /// non-ASCII; <c>false</c> otherwise.
+     /// </returns>
+     private static bool StrictlyAscii(
+         string token,
+         out string escapedToken
+     ) {
+         var sb = new StringBuilder();
+
+         // index of the first char that has not been copied into sb yet
+         // invariant: copyStartIdx <= idx
+         var copyStartIdx = 0;
+
+         // Note: the enclosing quotes are included in token; don't bother
+         // looking at them.
+         for( int idx = 1; idx < token.Length - 1; idx++ ) {
+             if( token[idx] < 0x80 ) {
+                 continue;
+             }
+
+             // copy all the ascii chars we've seen between the last copy
+             // and now (not inclusive) into sb
+             sb.Append( token, copyStartIdx, idx - copyStartIdx );
+
+             if( char.IsSurrogatePair( token, idx ) ) {
+                 sb.AppendFormat(
+                     @"\U{0:X8}",
+                     char.ConvertToUtf32( token[idx], token[idx + 1] )
+                 );
+
+                 // The \U escape covers both halves of the pair. Step over
+                 // the low surrogate so it is not escaped a second time.
+                 idx++;
+             } else {
+                 sb.AppendFormat(
+                     @"\u{0:X4}",
+                     (ushort)token[idx]
+                 );
+             }
+
+             // next time we'll start copying from the char after the one(s)
+             // we just escaped
+             copyStartIdx = idx + 1;
+         }
+
+         // if copyStartIdx never changed we never saw non-ascii chars and
+         // sb is empty.
+         if( copyStartIdx == 0 ) {
+             escapedToken = null;
+             return true;
+         }
+
+         // copy trailing ascii (normally at least the closing quote)
+         sb.Append( token, copyStartIdx, token.Length - copyStartIdx );
+
+         escapedToken = sb.ToString();
+         return false;
+     }
+ }
+
+ /// <summary>
+ /// Code fix for <c>EscapeNonAsciiCharsInLiteral</c>: replaces the literal
+ /// token with the escaped token computed by the analyzer and leaves the
+ /// original text behind in a leading comment.
+ /// </summary>
+ [ExportCodeFixProvider(
+     LanguageNames.CSharp,
+     Name = nameof( EscapeCharCodeFix )
+ )]
+ public sealed class EscapeCharCodeFix : CodeFixProvider {
+     // Key of the diagnostic property that holds the escaped token text.
+     public const string ESCAPED = "escaped";
+
+     public override ImmutableArray<string> FixableDiagnosticIds
+         => ImmutableArray.Create(
+             Diagnostics.EscapeNonAsciiCharsInLiteral.Id
+         );
+
+     public override FixAllProvider GetFixAllProvider() {
+         return WellKnownFixAllProviders.BatchFixer;
+     }
+
+     /// <summary>
+     /// Registers an "Escape literal" action for every diagnostic that
+     /// carries an escaped token and whose span resolves to a literal
+     /// expression. Other diagnostics are skipped.
+     /// </summary>
+     public override async Task RegisterCodeFixesAsync( CodeFixContext ctx ) {
+         var root = await ctx.Document
+             .GetSyntaxRootAsync( ctx.CancellationToken )
+             .ConfigureAwait( false );
+
+         foreach( var diagnostic in ctx.Diagnostics ) {
+             // Without the replacement text there is nothing we can apply.
+             string escapedToken;
+             if( !diagnostic.Properties.TryGetValue( ESCAPED, out escapedToken )
+                 || escapedToken == null
+             ) {
+                 continue;
+             }
+
+             var literal = root.FindNode(
+                 diagnostic.Location.SourceSpan,
+                 getInnermostNodeForTie: true
+             ) as LiteralExpressionSyntax;
+
+             if( literal == null ) {
+                 continue;
+             }
+
+             ctx.RegisterCodeFix(
+                 CodeAction.Create(
+                     title: "Escape literal",
+                     createChangedDocument: ct => Fix( ctx.Document, root, literal, escapedToken ),
+                     equivalenceKey: nameof( EscapeCharCodeFix )
+                 ),
+                 diagnostic
+             );
+         }
+     }
+
+     /// <summary>
+     /// Builds the document in which <paramref name="literal"/> is replaced
+     /// by the token parsed from <paramref name="escapedToken"/>, preceded
+     /// by a comment that records the original text.
+     /// </summary>
+     private static Task<Document> Fix(
+         Document doc,
+         SyntaxNode root,
+         LiteralExpressionSyntax literal,
+         string escapedToken
+     ) {
+         // A "*/" inside the literal would terminate the comment early and
+         // spill the rest of the literal into the code, so break it up.
+         var originalText = literal.Token.Text.Replace( "*/", "* /" );
+
+         var comment = SyntaxFactory
+             .Comment( $"/* unencoded: {originalText} */" );
+
+         var leadingTrivia = literal.GetLeadingTrivia()
+             .Add( comment )
+             .Add( SyntaxFactory.Whitespace( " " ) );
+
+         var newLiteral = literal
+             .WithToken( SyntaxFactory.ParseToken( escapedToken ) )
+             .WithLeadingTrivia( leadingTrivia );
+
+         var newRoot = root.ReplaceNode( literal, newLiteral );
+
+         return Task.FromResult( doc.WithSyntaxRoot( newRoot ) );
+     }
+ }
+}
diff --git a/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj b/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj
index 8b4b98fab..ff6c2eac2 100644
--- a/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj
+++ b/tests/D2L.CodeStyle.Analyzers.Test/D2L.CodeStyle.Analyzers.Tests.csproj
@@ -59,6 +59,7 @@
+
diff --git a/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs b/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs
new file mode 100644
index 000000000..31e18a4f5
--- /dev/null
+++ b/tests/D2L.CodeStyle.Analyzers.Test/Specs/EscapeNonAsciiCharsInLiteralsAnalyzer.cs
@@ -0,0 +1,40 @@
+// analyzer: D2L.CodeStyle.Analyzers.Language.EscapeNonAsciiCharsInLiteralsAnalyzer
+
+namespace D2L.CodeStyle.Analyzers.Specs {
+ // Literals used as spec input for the non-ASCII literal check.
+ // NOTE(review): a literal wrapped in a pair of block comments appears to
+ // carry its expected diagnostic name and message arguments in the leading
+ // comment; literals without such a wrapper are presumably expected to be
+ // clean. Confirm against the spec test harness.
+ public static class Tests {
+ // ASCII-only string literals, including ones that use escape sequences.
+ // None of these are annotated.
+ const string EmptyString = "";
+ const string SingleCharString = "x";
+ const string BigString = "this is a string literal\a with \"lots\" of ASCII \041\x21! It even has escaped unicode like \u03c0 = 3.14159...";
+
+ // String literals that contain raw non-ASCII characters. Each annotation
+ // gives the literal kind and the whole token in escaped form.
+ const string StartsWithNonAscii =
+ /* EscapeNonAsciiCharsInLiteral(string,"\u03C0 = 3.14159...") */ "π = 3.14159..." /**/;
+
+ const string NoAscii =
+ /* EscapeNonAsciiCharsInLiteral(string,"\u03B1\u03B2\u03B3") */ "αβγ" /**/;
+
+ const string ANiceMix =
+ /* EscapeNonAsciiCharsInLiteral(string,"alpha \u03B1 beta \u03B2 alpha-beta \u03B1\u03B2 gamma \u03B3 alpha-beta-gamma \u03B1\u03B2\u03B3.") */ "alpha α beta β alpha-beta αβ gamma γ alpha-beta-gamma αβγ." /**/;
+
+ const string TableFlip =
+ /* EscapeNonAsciiCharsInLiteral(string,"(\u256F\u00B0\u25A1\u00B0\uFF09\u256F\uFE35 \u253B\u2501\u253B") */ "(╯°□°)╯︵ ┻━┻" /**/;
+
+ const string Brail =
+ /* EscapeNonAsciiCharsInLiteral(string,"\u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C") */ "⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌" /**/;
+
+ // This one hits the branch for surrogate pairs
+ // NOTE(review): in this expectation and the Emojis one below, every
+ // 8-digit \U escape is followed by a 4-digit \u escape of the same pair's
+ // low surrogate. The \U escape already encodes the whole code point, so
+ // the annotated text is not equal to the original literal. Confirm
+ // whether that is intended.
+ const string MormonTwinkleTwinkleLittleStar =
+ /* EscapeNonAsciiCharsInLiteral(string,"\U00010413\uDC13\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001043B\uDC3B\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001044A\uDC4A\U0001042E\uDC2E\U0001043B\uDC3B\U0001044A\uDC4A \U00010445\uDC45\U0001043B\uDC3B\U0001042A\uDC2A\U00010449\uDC49") */ "𐐓𐐶𐐮𐑍𐐿𐑊 𐐻𐐶𐐮𐑍𐐿𐑊 𐑊𐐮𐐻𐑊 𐑅𐐻𐐪𐑉" /**/;
+
+ const string Emojis =
+ /* EscapeNonAsciiCharsInLiteral(string,"\U0001F602\uDE02\U0001F60D\uDE0D\U0001F389\uDF89\U0001F44D\uDC4D") */ "😂😍🎉👍" /**/;
+
+ // Char literals: only the one written with a raw non-ASCII character is
+ // annotated; the ASCII and already-escaped ones are not.
+ const char AsciiChar = 'x';
+ const char OtherAsciiChar = '\x21';
+ const char Japanese = /* EscapeNonAsciiCharsInLiteral(char,'\u3041') */ 'ぁ' /**/;
+ const char EscapedChar = '\u3041';
+
+ // TODO: this is ignored. We need to implement this:
+ // https://github.com/dotnet/codeformatter/issues/39
+ // (The literal contains a raw non-ASCII character but is not annotated.)
+ const string VerbatimString = @"Would you like to build a ☃?";
+ }
+}