Brightspace · j3parker · Nov 16, 2018 · cpacey · Nov 19, 2018 · cpacey
@@ -49,6 +49,7 @@
     <Compile Include="Immutability\ImmutableGenericArgumentAnalyzer.cs" />
     <Compile Include="Immutability\ImmutableGenericDeclarationAnalyzer.cs" />
     <Compile Include="Language\DefaultValueConsistencyAnalyzer.cs" />
+    <Compile Include="Language\EscapeNonAsciiCharsInLiteralsAnalyzer.cs" />
     <Compile Include="Language\UseNamedArgumentsCodeFix.cs" />
     <Compile Include="Language\RequireNamedArgumentsAnalyzer.cs" />
     <Compile Include="Visibility\CorsHeaderAppenderUsageAnalyzer.cs" />

@@ -434,5 +434,15 @@ public static class Diagnostics {
 			isEnabledByDefault: true,
 			description: "The parameter {0} has a default value of {1} here, but {2} in its original definition in {3}. This causes inconsistent behaviour. Please use the same defualt value everywhere."
 		);
+
+		public static readonly DiagnosticDescriptor EscapeNonAsciiCharsInLiteral = new DiagnosticDescriptor(
+			id: "D2L0050",
+			title: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.",
+			messageFormat: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.",
+			category: "Language",
+			defaultSeverity: DiagnosticSeverity.Error,
+			isEnabledByDefault: true,
+			description: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in."
+		);
 	}
 }
@@ -0,0 +1,200 @@
+using System.Collections.Immutable;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.CodeAnalysis;
+using Microsoft.CodeAnalysis.CodeActions;
+using Microsoft.CodeAnalysis.CodeFixes;
+using Microsoft.CodeAnalysis.CSharp;
+using Microsoft.CodeAnalysis.CSharp.Syntax;
+using Microsoft.CodeAnalysis.Diagnostics;
+
+namespace D2L.CodeStyle.Analyzers.Language {
+	// We are doing this because we (unfortunately) have a mix of encoding for
+	// our source code files, and the encoding of the file impacts how strings
+	// are interpreted. We really ought to clean that up, but until that
+	// happens we can be safer if we avoid non-ASCII characters in our
+	// literals. Automated refactoring has bit us before with this.
+	[DiagnosticAnalyzer( LanguageNames.CSharp )]
+	internal sealed class EscapeNonAsciiCharsInLiteralsAnalyzer : DiagnosticAnalyzer {
+		public override ImmutableArray<DiagnosticDescriptor> SupportedDiagnostics
+			=> ImmutableArray.Create( Diagnostics.EscapeNonAsciiCharsInLiteral );
+
+		public override void Initialize( AnalysisContext context ) {
+			context.EnableConcurrentExecution();
+
+			context.RegisterSyntaxNodeAction(
+				CheckForUnicodeLiterals,
+				SyntaxKind.StringLiteralExpression
+			);
+
+			context.RegisterSyntaxNodeAction(
+				CheckForUnicodeLiterals,
+				SyntaxKind.CharacterLiteralExpression
+			);
+		}
+
+		private static void CheckForUnicodeLiterals(
+			SyntaxNodeAnalysisContext ctx
+		) {
+			// the node could be something like:
+			//   "foo"
+			//   @"foo"
+			//   'x'
+			var literalExpr = (LiteralExpressionSyntax)ctx.Node;
+
+			// We can't handle verbatim strings for the same reason as these
+			// folks: https://github.com/dotnet/codeformatter/issues/39 (TODO)
+			if ( literalExpr.Token.IsVerbatimStringLiteral() ) {
+				return;
+			}
+
+			string token = literalExpr.Token.Text;
+			string escapedToken = null;
+
+			if( StrictlyAscii( token, out escapedToken ) ) {
+				return;
+			}
+
+			bool isChar =
+				literalExpr.Kind() == SyntaxKind.CharacterLiteralExpression;
+
+			var fixProps = ImmutableDictionary.CreateBuilder<string, string>();
+			fixProps[EscapeCharCodeFix.ESCAPED] = escapedToken;
+
+			ctx.ReportDiagnostic(
+				Diagnostic.Create(
+					Diagnostics.EscapeNonAsciiCharsInLiteral,
+					literalExpr.GetLocation(),
+					fixProps.ToImmutable(),
+					isChar ? "char" : "string",
+					escapedToken
+				)
+			);
+		}
+
+		private static bool StrictlyAscii(
+			string token,
+			out string escapedToken
+		) {
+			var sb = new StringBuilder();
+			var copyStartIdx = 0;
+
+			// invariant: copyStartIdx < idx
+			// Note: the enclosing quotes are included in val; don't bother looking at them.
+			for( int idx = 1; idx < token.Length - 1; idx++ ) {
+				if( token[idx] < 0x80 ) {
+					continue;
+				}
+
+				// copy all the ascii chars we've seen between the last copy
+				// and now (not inclusive) into sb
+				sb.Append( token, copyStartIdx, idx - copyStartIdx );
+
+				// next time we'll start copying from the char after us
+				// (unless this happens again next loop)
+				copyStartIdx = idx + 1;
+
+				if ( IsSurrogatePair( token, idx ) ) {
+					sb.AppendFormat(
+						@"\U{0:X8}",
+						char.ConvertToUtf32( token[idx], token[idx + 1] )
+					);
+				} else {
+					sb.AppendFormat(
+						@"\u{0:X4}",
+						(ushort)token[idx]
+					);
+				}
+			}
+
+			// if copyStartIdx never changed we never saw non-ascii chars and
+			// sb is empty.
+			if ( copyStartIdx == 0 ) {
+				escapedToken = null;
+				return true;
+			}
+
+			// copy trailing ascii into sb. This is never a no-op because we
+			// need to at least copy the ending quote.
+			sb.Append( token, copyStartIdx, token.Length - copyStartIdx );
+
+			escapedToken = sb.ToString();
+			return false;
+		}
+
+		private static bool IsSurrogatePair( string str, int idx ) {
+			return idx + 1 < str.Length
+				&& char.IsHighSurrogate( str[idx] )
+				&& char.IsLowSurrogate( str[idx + 1] );
+		}
+	}
+
+	[ExportCodeFixProvider(
+		LanguageNames.CSharp,
+		Name = nameof( EscapeCharCodeFix )
+	)]
+	public sealed class EscapeCharCodeFix : CodeFixProvider {
+		public const string ESCAPED = "escaped";
+
+		public override ImmutableArray<string> FixableDiagnosticIds
+			=> ImmutableArray.Create(
+				Diagnostics.EscapeNonAsciiCharsInLiteral.Id
+			);
+
+		public override FixAllProvider GetFixAllProvider() {
+			return WellKnownFixAllProviders.BatchFixer;
+		}
+
+		public override async Task RegisterCodeFixesAsync( CodeFixContext ctx ) {
+			var root = await ctx.Document
+				.GetSyntaxRootAsync( ctx.CancellationToken )
+				.ConfigureAwait( false );
+
+			foreach( var diagnostic in ctx.Diagnostics ) {
+				var span = diagnostic.Location.SourceSpan;
+
+				var literal = root.FindNode( span, getInnermostNodeForTie: true )
+					as LiteralExpressionSyntax;
+
+				if ( literal == null ) {
+					continue;
+				}
+
+				var escapedToken = diagnostic.Properties[ESCAPED];
+
+				ctx.RegisterCodeFix(
+					CodeAction.Create(
+						title: "Escape literal",
+						ct => Fix( ctx.Document, root, literal, escapedToken ),
+						equivalenceKey: nameof( EscapeCharCodeFix )
+					),
+					diagnostic
+				);
+			}
+		}
+
+		private static Task<Document> Fix(
+			Document doc,
+			SyntaxNode root,
+			LiteralExpressionSyntax literal,
+			string escapedToken
+		) {
+			var comment = SyntaxFactory
+				.Comment( $"/* unencoded: {literal.Token.Text} */" );
+
+			var newLiteral = literal.WithToken(
+				SyntaxFactory.ParseToken( escapedToken )
+			).WithLeadingTrivia(
+				literal.GetLeadingTrivia()
+					.Add( comment )
+					.Add( SyntaxFactory.Whitespace( " " ) )
+			);
+
+			var newRoot = root.ReplaceNode( literal, newLiteral );
+
+			var newDoc = doc.WithSyntaxRoot( newRoot );
+
+			return Task.FromResult( newDoc );
+		}
+	}
+}
@@ -59,6 +59,7 @@
     <EmbeddedResource Include="Specs\EventHandlerLoaderTypesAnalyzer.cs" />
     <EmbeddedResource Include="Specs\EventTypesAnalyzer.cs" />
     <EmbeddedResource Include="Specs\DefaultValueConsistencyAnalyzer.cs" />
+    <EmbeddedResource Include="Specs\EscapeNonAsciiCharsInLiteralsAnalyzer.cs" />
     <Compile Include="TestBase.cs" />
     <Compile Include="Extensions\TypeSymbolExtensionsTests.cs" />
     <Compile Include="ApiUsage\NotNullAnalyzerTests.cs" />

@@ -0,0 +1,40 @@
+// analyzer: D2L.CodeStyle.Analyzers.Language.EscapeNonAsciiCharsInLiteralsAnalyzer
+
+namespace D2L.CodeStyle.Analyzers.Specs {
+	public static class Tests {
+		const string EmptyString = "";
+		const string SingleCharString = "x";
+		const string BigString = "this is a string literal\a with \"lots\" of ASCII \041\x21! It even has escaped unicode like \u03c0 = 3.14159...";
+
+		const string StartsWithNonAscii =
+			/* EscapeNonAsciiCharsInLiteral(string,"\u03C0 = 3.14159...") */ "π = 3.14159..." /**/;
+
+		const string NoAscii =
+			/* EscapeNonAsciiCharsInLiteral(string,"\u03B1\u03B2\u03B3") */ "αβγ" /**/;
+
+		const string ANiceMix =
+			/* EscapeNonAsciiCharsInLiteral(string,"alpha \u03B1 beta \u03B2 alpha-beta \u03B1\u03B2 gamma \u03B3 alpha-beta-gamma \u03B1\u03B2\u03B3.") */ "alpha α beta β alpha-beta αβ gamma γ alpha-beta-gamma αβγ." /**/;
+
+		const string TableFlip =
+			/* EscapeNonAsciiCharsInLiteral(string,"(\u256F\u00B0\u25A1\u00B0\uFF09\u256F\uFE35 \u253B\u2501\u253B") */ "(╯°□°）╯︵ ┻━┻" /**/;
+
+		const string Brail =
+			/* EscapeNonAsciiCharsInLiteral(string,"\u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C") */ "⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌" /**/;
+
+		// This one hits the branch for surrogate pairs
+		const string MormonTwinkleTwinkleLittleStar =
+			/* EscapeNonAsciiCharsInLiteral(string,"\U00010413\uDC13\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001043B\uDC3B\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001044A\uDC4A\U0001042E\uDC2E\U0001043B\uDC3B\U0001044A\uDC4A \U00010445\uDC45\U0001043B\uDC3B\U0001042A\uDC2A\U00010449\uDC49") */ "𐐓𐐶𐐮𐑍𐐿𐑊 𐐻𐐶𐐮𐑍𐐿𐑊 𐑊𐐮𐐻𐑊 𐑅𐐻𐐪𐑉" /**/;
+
+		const string Emojis =
+			/* EscapeNonAsciiCharsInLiteral(string,"\U0001F602\uDE02\U0001F60D\uDE0D\U0001F389\uDF89\U0001F44D\uDC4D") */ "😂😍🎉👍" /**/;
+
+		const char AsciiChar = 'x';
+		const char OtherAsciiChar = '\x21';
+		const char Japanese = /* EscapeNonAsciiCharsInLiteral(char,'\u3041') */ 'ぁ' /**/;
+		const char EscapedChar = '\u3041';
+
+		// TODO: this is ignored. We need to implement this:
+		// https://github.com/dotnet/codeformatter/issues/39
+		const string VerbatimString = @"Would you like to build a ☃?";
+	}
+}