Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
<Compile Include="Immutability\ImmutableGenericArgumentAnalyzer.cs" />
<Compile Include="Immutability\ImmutableGenericDeclarationAnalyzer.cs" />
<Compile Include="Language\DefaultValueConsistencyAnalyzer.cs" />
<Compile Include="Language\EscapeNonAsciiCharsInLiteralsAnalyzer.cs" />
<Compile Include="Language\UseNamedArgumentsCodeFix.cs" />
<Compile Include="Language\RequireNamedArgumentsAnalyzer.cs" />
<Compile Include="Visibility\CorsHeaderAppenderUsageAnalyzer.cs" />
Expand Down
10 changes: 10 additions & 0 deletions src/D2L.CodeStyle.Analyzers/Diagnostics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -434,5 +434,15 @@ public static class Diagnostics {
isEnabledByDefault: true,
description: "The parameter {0} has a default value of {1} here, but {2} in its original definition in {3}. This causes inconsistent behaviour. Please use the same defualt value everywhere."
);

public static readonly DiagnosticDescriptor EscapeNonAsciiCharsInLiteral = new DiagnosticDescriptor(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider providing a description of (or link to) what can go wrong without this.

id: "D2L0050",
title: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.",
messageFormat: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in.",
category: "Language",
defaultSeverity: DiagnosticSeverity.Error,
isEnabledByDefault: true,
description: "This {0}-literal should be escaped to {1} to avoid a dependency on the encoding of the file it is contained in."
);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
using System.Collections.Immutable;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CodeActions;
using Microsoft.CodeAnalysis.CodeFixes;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.CodeAnalysis.Diagnostics;

namespace D2L.CodeStyle.Analyzers.Language {
// We are doing this because we (unfortunately) have a mix of encoding for
// our source code files, and the encoding of the file impacts how strings
// are interpreted. We really ought to clean that up, but until that
// happens we can be safer if we avoid non-ASCII characters in our
// literals. Automated refactoring has bit us before with this.
[DiagnosticAnalyzer( LanguageNames.CSharp )]
internal sealed class EscapeNonAsciiCharsInLiteralsAnalyzer : DiagnosticAnalyzer {
public override ImmutableArray<DiagnosticDescriptor> SupportedDiagnostics
=> ImmutableArray.Create( Diagnostics.EscapeNonAsciiCharsInLiteral );

public override void Initialize( AnalysisContext context ) {
context.EnableConcurrentExecution();

context.RegisterSyntaxNodeAction(
CheckForUnicodeLiterals,
SyntaxKind.StringLiteralExpression
);

context.RegisterSyntaxNodeAction(
CheckForUnicodeLiterals,
SyntaxKind.CharacterLiteralExpression
);
}

private static void CheckForUnicodeLiterals(
SyntaxNodeAnalysisContext ctx
) {
// the node could be something like:
// "foo"
// @"foo"
// 'x'
var literalExpr = (LiteralExpressionSyntax)ctx.Node;

// We can't handle verbatim strings for the same reason as these
// folks: https://github.com/dotnet/codeformatter/issues/39 (TODO)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this quite interesting.

if ( literalExpr.Token.IsVerbatimStringLiteral() ) {
return;
}

string token = literalExpr.Token.Text;
string escapedToken = null;

if( StrictlyAscii( token, out escapedToken ) ) {
return;
}

bool isChar =
literalExpr.Kind() == SyntaxKind.CharacterLiteralExpression;

var fixProps = ImmutableDictionary.CreateBuilder<string, string>();
fixProps[EscapeCharCodeFix.ESCAPED] = escapedToken;

ctx.ReportDiagnostic(
Diagnostic.Create(
Diagnostics.EscapeNonAsciiCharsInLiteral,
literalExpr.GetLocation(),
fixProps.ToImmutable(),
isChar ? "char" : "string",
escapedToken
)
);
}

private static bool StrictlyAscii(
string token,
out string escapedToken
) {
var sb = new StringBuilder();
var copyStartIdx = 0;

// invariant: copyStartIdx < idx
// Note: the enclosing quotes are included in val; don't bother looking at them.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is val?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, variable rename

for( int idx = 1; idx < token.Length - 1; idx++ ) {
if( token[idx] < 0x80 ) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol - so simple

continue;
}

// copy all the ascii chars we've seen between the last copy
// and now (not inclusive) into sb
sb.Append( token, copyStartIdx, idx - copyStartIdx );
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems more complicated than copying char-at-a-time, and I'm not sure I see the advantage given that you're using StringBuffer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's "basically a no-op" when the string is ASCII. Microsoft did it with two passes (one to detect nonASCIIness, one to do the copying) which did char-by-char copying in the second step. It's technically more work to do that (two passes, and char by char is still worse than chunks at a time) but its so neglible and only happens during errors anyway so really not interesting.

Honestly I just wrote it this way because it came out of my head this way. I'll look at it again.


// next time we'll start copying from the char after us
// (unless this happens again next loop)
copyStartIdx = idx + 1;

if ( IsSurrogatePair( token, idx ) ) {
sb.AppendFormat(
@"\U{0:X8}",
char.ConvertToUtf32( token[idx], token[idx + 1] )
);
} else {
sb.AppendFormat(
@"\u{0:X4}",
(ushort)token[idx]
);
}
}

// if copyStartIdx never changed we never saw non-ascii chars and
// sb is empty.
if ( copyStartIdx == 0 ) {
escapedToken = null;
return true;
}

// copy trailing ascii into sb. This is never a no-op because we
// need to at least copy the ending quote.
sb.Append( token, copyStartIdx, token.Length - copyStartIdx );

escapedToken = sb.ToString();
return false;
}

private static bool IsSurrogatePair( string str, int idx ) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use char.IsSurrogatePair?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

D'oh! Thanks

return idx + 1 < str.Length
&& char.IsHighSurrogate( str[idx] )
&& char.IsLowSurrogate( str[idx + 1] );
}
}

[ExportCodeFixProvider(
LanguageNames.CSharp,
Name = nameof( EscapeCharCodeFix )
)]
public sealed class EscapeCharCodeFix : CodeFixProvider {
public const string ESCAPED = "escaped";

public override ImmutableArray<string> FixableDiagnosticIds
=> ImmutableArray.Create(
Diagnostics.EscapeNonAsciiCharsInLiteral.Id
);

public override FixAllProvider GetFixAllProvider() {
return WellKnownFixAllProviders.BatchFixer;
}

public override async Task RegisterCodeFixesAsync( CodeFixContext ctx ) {
var root = await ctx.Document
.GetSyntaxRootAsync( ctx.CancellationToken )
.ConfigureAwait( false );

foreach( var diagnostic in ctx.Diagnostics ) {
var span = diagnostic.Location.SourceSpan;

var literal = root.FindNode( span, getInnermostNodeForTie: true )
as LiteralExpressionSyntax;

if ( literal == null ) {
continue;
}

var escapedToken = diagnostic.Properties[ESCAPED];

ctx.RegisterCodeFix(
CodeAction.Create(
title: "Escape literal",
ct => Fix( ctx.Document, root, literal, escapedToken ),
equivalenceKey: nameof( EscapeCharCodeFix )
),
diagnostic
);
}
}

private static Task<Document> Fix(
Document doc,
SyntaxNode root,
LiteralExpressionSyntax literal,
string escapedToken
) {
var comment = SyntaxFactory
.Comment( $"/* unencoded: {literal.Token.Text} */" );

var newLiteral = literal.WithToken(
SyntaxFactory.ParseToken( escapedToken )
).WithLeadingTrivia(
literal.GetLeadingTrivia()
.Add( comment )
.Add( SyntaxFactory.Whitespace( " " ) )
);

var newRoot = root.ReplaceNode( literal, newLiteral );

var newDoc = doc.WithSyntaxRoot( newRoot );

return Task.FromResult( newDoc );
Copy link
Member Author

@j3parker j3parker Nov 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

string foo = "αβγ";

-->

string foo = /* unencoded: "αβγ" */ "\u03B1\u03B2\u03B3";

The comment could get out of sync. Options:

  1. Don't do this
  2. Write an analyzer to verify they are in sync (not hard, but probably not worth it.)
  3. It's fine, it's not likely to get through code review

Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no chance that I'll notice an out-of-sync comment in a review. I'd lean towards #1 unless there lots of instances where the specific value is user-facing (seems unlikely since user-facing values should be langTerms, not string constants).

Alternately, is there a visualizer that would work here? E.g. it'd be handy if VS could overlay the "rendered" value of the string.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll just remove it.

}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
<EmbeddedResource Include="Specs\EventHandlerLoaderTypesAnalyzer.cs" />
<EmbeddedResource Include="Specs\EventTypesAnalyzer.cs" />
<EmbeddedResource Include="Specs\DefaultValueConsistencyAnalyzer.cs" />
<EmbeddedResource Include="Specs\EscapeNonAsciiCharsInLiteralsAnalyzer.cs" />
<Compile Include="TestBase.cs" />
<Compile Include="Extensions\TypeSymbolExtensionsTests.cs" />
<Compile Include="ApiUsage\NotNullAnalyzerTests.cs" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// analyzer: D2L.CodeStyle.Analyzers.Language.EscapeNonAsciiCharsInLiteralsAnalyzer

namespace D2L.CodeStyle.Analyzers.Specs {
public static class Tests {
const string EmptyString = "";
const string SingleCharString = "x";
const string BigString = "this is a string literal\a with \"lots\" of ASCII \041\x21! It even has escaped unicode like \u03c0 = 3.14159...";

const string StartsWithNonAscii =
/* EscapeNonAsciiCharsInLiteral(string,"\u03C0 = 3.14159...") */ "π = 3.14159..." /**/;

const string NoAscii =
/* EscapeNonAsciiCharsInLiteral(string,"\u03B1\u03B2\u03B3") */ "αβγ" /**/;

const string ANiceMix =
/* EscapeNonAsciiCharsInLiteral(string,"alpha \u03B1 beta \u03B2 alpha-beta \u03B1\u03B2 gamma \u03B3 alpha-beta-gamma \u03B1\u03B2\u03B3.") */ "alpha α beta β alpha-beta αβ gamma γ alpha-beta-gamma αβγ." /**/;

const string TableFlip =
/* EscapeNonAsciiCharsInLiteral(string,"(\u256F\u00B0\u25A1\u00B0\uFF09\u256F\uFE35 \u253B\u2501\u253B") */ "(╯°□°)╯︵ ┻━┻" /**/;

const string Brail =
/* EscapeNonAsciiCharsInLiteral(string,"\u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C") */ "⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌" /**/;

// This one hits the branch for surrogate pairs
const string MormonTwinkleTwinkleLittleStar =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ha!

/* EscapeNonAsciiCharsInLiteral(string,"\U00010413\uDC13\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001043B\uDC3B\U00010436\uDC36\U0001042E\uDC2E\U0001044D\uDC4D\U0001043F\uDC3F\U0001044A\uDC4A \U0001044A\uDC4A\U0001042E\uDC2E\U0001043B\uDC3B\U0001044A\uDC4A \U00010445\uDC45\U0001043B\uDC3B\U0001042A\uDC2A\U00010449\uDC49") */ "𐐓𐐶𐐮𐑍𐐿𐑊 𐐻𐐶𐐮𐑍𐐿𐑊 𐑊𐐮𐐻𐑊 𐑅𐐻𐐪𐑉" /**/;

const string Emojis =
/* EscapeNonAsciiCharsInLiteral(string,"\U0001F602\uDE02\U0001F60D\uDE0D\U0001F389\uDF89\U0001F44D\uDC4D") */ "😂😍🎉👍" /**/;

const char AsciiChar = 'x';
const char OtherAsciiChar = '\x21';
const char Japanese = /* EscapeNonAsciiCharsInLiteral(char,'\u3041') */ 'ぁ' /**/;
const char EscapedChar = '\u3041';

// TODO: this is ignored. We need to implement this:
// https://github.com/dotnet/codeformatter/issues/39
const string VerbatimString = @"Would you like to build a ☃?";
}
}