Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lexical colorization/classification for template strings. #2026

Merged
merged 10 commits into from
Feb 13, 2015
133 changes: 123 additions & 10 deletions src/services/services.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,9 @@ module ts {
InMultiLineCommentTrivia,
InSingleQuoteStringLiteral,
InDoubleQuoteStringLiteral,
InTemplateHeadOrNoSubstitutionTemplate,
InTemplateMiddleOrTail,
InTemplateSubstitutionPosition,
}

export enum TokenClass {
Expand All @@ -1168,7 +1171,26 @@ module ts {
}

export interface Classifier {
getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): ClassificationResult;
/**
* Gives lexical classifications of tokens on a line without any syntactic context.
* For instance, a token consisting of the text 'string' can be either an identifier
* named 'string' or the keyword 'string', however, because this classifier is not aware,
* it relies on certain heuristics to give acceptable results. For classifications where
* speed trumps accuracy, this function is preferable; however, for true accuracy, the
* syntactic classifier is ideal. In fact, in certain editing scenarios, combining the
* lexical, syntactic, and semantic classifiers may issue the best user experience.
*
* @param text The text of a line to classify.
* @param lexState The state of the lexical classifier at the end of the previous line.
* @param syntacticClassifierAbsent Whether the client is *not* using a syntactic classifier.
* If there is no syntactic classifier (syntacticClassifierAbsent=true),
* certain heuristics may be used in its place; however, if there is a
* syntactic classifier (syntacticClassifierAbsent=false), certain
* classifications which may be incorrectly categorized will be given
* back as Identifiers in order to allow the syntactic classifier to
* subsume the classification.
*/
getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent: boolean): ClassificationResult;
}

/**
Expand Down Expand Up @@ -5617,6 +5639,28 @@ module ts {
noRegexTable[SyntaxKind.TrueKeyword] = true;
noRegexTable[SyntaxKind.FalseKeyword] = true;

// Just a stack of TemplateHeads and OpenCurlyBraces, used to perform rudimentary (inexact)
// classification on template strings. Because of the context free nature of templates,
// the only precise way to classify a template portion would be by propagating the stack across
// lines, just as we do with the end-of-line state. However, this is a burden for implementers,
// and the behavior is entirely subsumed by the syntactic classifier anyway, so we instead
// flatten any nesting when the template stack is non-empty and encode it in the end-of-line state.
// Situations in which this fails are
// 1) When template strings are nested across different lines:
// `hello ${ `world
// ` }`
//
// Where on the second line, you will get the closing of a template,
// a closing curly, and a new template.
//
// 2) When substitution expressions have curly braces and the curly brace falls on the next line:
// `hello ${ () => {
// return "world" } } `
//
// Where on the second line, you will get the 'return' keyword,
// a string literal, and a template end consisting of '} } `'.
var templateStack: SyntaxKind[] = [];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explain that rudimentary means 'inexact' and give example of where template classification will be wrong.


function isAccessibilityModifier(kind: SyntaxKind) {
switch (kind) {
case SyntaxKind.PublicKeyword:
Expand Down Expand Up @@ -5650,13 +5694,19 @@ module ts {
// if there are more cases we want the classifier to be better at.
return true;
}

// 'classifyKeywordsInGenerics' should be 'true' when a syntactic classifier is not present.
function getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): ClassificationResult {

// If there is a syntactic classifier ('syntacticClassifierAbsent' is false),
// we will be more conservative in order to avoid conflicting with the syntactic classifier.
function getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent?: boolean): ClassificationResult {
var offset = 0;
var token = SyntaxKind.Unknown;
var lastNonTriviaToken = SyntaxKind.Unknown;

// Empty out the template stack for reuse.
while (templateStack.length > 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

templateStack.length = 0

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually faster to do it this way.

templateStack.pop();
}

// If we're in a string literal, then prepend: "\
// (and a newline). That way when we lex we'll think we're still in a string literal.
//
Expand All @@ -5675,6 +5725,17 @@ module ts {
text = "/*\n" + text;
offset = 3;
break;
case EndOfLineState.InTemplateHeadOrNoSubstitutionTemplate:
text = "`\n" + text;
offset = 2;
break;
case EndOfLineState.InTemplateMiddleOrTail:
text = "}\n" + text;
offset = 2;
// fallthrough
case EndOfLineState.InTemplateSubstitutionPosition:
templateStack.push(SyntaxKind.TemplateHead);
break;
}

scanner.setText(text);
Expand Down Expand Up @@ -5739,12 +5800,45 @@ module ts {
token === SyntaxKind.StringKeyword ||
token === SyntaxKind.NumberKeyword ||
token === SyntaxKind.BooleanKeyword) {
if (angleBracketStack > 0 && !classifyKeywordsInGenerics) {
// If it looks like we're could be in something generic, don't classify this
// as a keyword. We may just get overwritten by the syntactic classifier,
// causing a noisy experience for the user.
token = SyntaxKind.Identifier;
}
if (angleBracketStack > 0 && !syntacticClassifierAbsent) {
// If it looks like we're could be in something generic, don't classify this
// as a keyword. We may just get overwritten by the syntactic classifier,
// causing a noisy experience for the user.
token = SyntaxKind.Identifier;
}
}
else if (token === SyntaxKind.TemplateHead) {
templateStack.push(token);
}
else if (token === SyntaxKind.OpenBraceToken) {
// If we don't have anything on the template stack,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't understand your comment wrapping.

// then we aren't trying to keep track of a previously scanned template head.
if (templateStack.length > 0) {
templateStack.push(token);
}
}
else if (token === SyntaxKind.CloseBraceToken) {
// If we don't have anything on the template stack,
// then we aren't trying to keep track of a previously scanned template head.
if (templateStack.length > 0) {
var lastTemplateStackToken = lastOrUndefined(templateStack);

if (lastTemplateStackToken === SyntaxKind.TemplateHead) {
token = scanner.reScanTemplateToken();

// Only pop on a TemplateTail; a TemplateMiddle indicates there is more for us.
if (token === SyntaxKind.TemplateTail) {
templateStack.pop();
}
else {
Debug.assert(token === SyntaxKind.TemplateMiddle, "Should have been a template middle. Was " + token);
}
}
else {
Debug.assert(lastTemplateStackToken === SyntaxKind.OpenBraceToken, "Should have been an open brace. Was: " + token);
templateStack.pop();
}
}
}

lastNonTriviaToken = token;
Expand Down Expand Up @@ -5789,6 +5883,22 @@ module ts {
result.finalLexState = EndOfLineState.InMultiLineCommentTrivia;
}
}
else if (isTemplateLiteralKind(token)) {
if (scanner.isUnterminated()) {
if (token === SyntaxKind.TemplateTail) {
result.finalLexState = EndOfLineState.InTemplateMiddleOrTail;
}
else if (token === SyntaxKind.NoSubstitutionTemplateLiteral) {
result.finalLexState = EndOfLineState.InTemplateHeadOrNoSubstitutionTemplate;
}
else {
Debug.fail("Only 'NoSubstitutionTemplateLiteral's and 'TemplateTail's can be unterminated; got SyntaxKind #" + token);
}
}
}
else if (templateStack.length > 0 && lastOrUndefined(templateStack) === SyntaxKind.TemplateHead) {
result.finalLexState = EndOfLineState.InTemplateSubstitutionPosition;
}
}
}

Expand Down Expand Up @@ -5892,6 +6002,9 @@ module ts {
return TokenClass.Whitespace;
case SyntaxKind.Identifier:
default:
if (isTemplateLiteralKind(token)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return isTemplate(...)
    ? ...StringLiteral
    : ...Identifier

return TokenClass.StringLiteral;
}
return TokenClass.Identifier;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/services/shims.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ module ts {
}

export interface ClassifierShim extends Shim {
getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): string;
getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent?: boolean): string;
}

export interface CoreServicesShim extends Shim {
Expand Down
24 changes: 23 additions & 1 deletion tests/baselines/reference/APISample_compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -1741,6 +1741,9 @@ declare module "typescript" {
InMultiLineCommentTrivia = 1,
InSingleQuoteStringLiteral = 2,
InDoubleQuoteStringLiteral = 3,
InTemplateHeadOrNoSubstitutionTemplate = 4,
InTemplateMiddleOrTail = 5,
InTemplateSubstitutionPosition = 6,
}
enum TokenClass {
Punctuation = 0,
Expand All @@ -1762,7 +1765,26 @@ declare module "typescript" {
classification: TokenClass;
}
interface Classifier {
getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): ClassificationResult;
/**
* Gives lexical classifications of tokens on a line without any syntactic context.
* For instance, a token consisting of the text 'string' can be either an identifier
* named 'string' or the keyword 'string', however, because this classifier is not aware,
* it relies on certain heuristics to give acceptable results. For classifications where
* speed trumps accuracy, this function is preferable; however, for true accuracy, the
* syntactic classifier is ideal. In fact, in certain editing scenarios, combining the
* lexical, syntactic, and semantic classifiers may issue the best user experience.
*
* @param text The text of a line to classify.
* @param lexState The state of the lexical classifier at the end of the previous line.
* @param syntacticClassifierAbsent Whether the client is *not* using a syntactic classifier.
* If there is no syntactic classifier (syntacticClassifierAbsent=true),
* certain heuristics may be used in its place; however, if there is a
* syntactic classifier (syntacticClassifierAbsent=false), certain
* classifications which may be incorrectly categorized will be given
* back as Identifiers in order to allow the syntactic classifier to
* subsume the classification.
*/
getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent: boolean): ClassificationResult;
}
/**
* The document registry represents a store of SourceFile objects that can be shared between
Expand Down
34 changes: 31 additions & 3 deletions tests/baselines/reference/APISample_compile.types
Original file line number Diff line number Diff line change
Expand Up @@ -5539,6 +5539,15 @@ declare module "typescript" {

InDoubleQuoteStringLiteral = 3,
>InDoubleQuoteStringLiteral : EndOfLineState

InTemplateHeadOrNoSubstitutionTemplate = 4,
>InTemplateHeadOrNoSubstitutionTemplate : EndOfLineState

InTemplateMiddleOrTail = 5,
>InTemplateMiddleOrTail : EndOfLineState

InTemplateSubstitutionPosition = 6,
>InTemplateSubstitutionPosition : EndOfLineState
}
enum TokenClass {
>TokenClass : TokenClass
Expand Down Expand Up @@ -5594,12 +5603,31 @@ declare module "typescript" {
interface Classifier {
>Classifier : Classifier

getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): ClassificationResult;
>getClassificationsForLine : (text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean) => ClassificationResult
/**
* Gives lexical classifications of tokens on a line without any syntactic context.
* For instance, a token consisting of the text 'string' can be either an identifier
* named 'string' or the keyword 'string', however, because this classifier is not aware,
* it relies on certain heuristics to give acceptable results. For classifications where
* speed trumps accuracy, this function is preferable; however, for true accuracy, the
* syntactic classifier is ideal. In fact, in certain editing scenarios, combining the
* lexical, syntactic, and semantic classifiers may issue the best user experience.
*
* @param text The text of a line to classify.
* @param lexState The state of the lexical classifier at the end of the previous line.
* @param syntacticClassifierAbsent Whether the client is *not* using a syntactic classifier.
* If there is no syntactic classifier (syntacticClassifierAbsent=true),
* certain heuristics may be used in its place; however, if there is a
* syntactic classifier (syntacticClassifierAbsent=false), certain
* classifications which may be incorrectly categorized will be given
* back as Identifiers in order to allow the syntactic classifier to
* subsume the classification.
*/
getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent: boolean): ClassificationResult;
>getClassificationsForLine : (text: string, lexState: EndOfLineState, syntacticClassifierAbsent: boolean) => ClassificationResult
>text : string
>lexState : EndOfLineState
>EndOfLineState : EndOfLineState
>classifyKeywordsInGenerics : boolean
>syntacticClassifierAbsent : boolean
>ClassificationResult : ClassificationResult
}
/**
Expand Down
24 changes: 23 additions & 1 deletion tests/baselines/reference/APISample_linter.js
Original file line number Diff line number Diff line change
Expand Up @@ -1772,6 +1772,9 @@ declare module "typescript" {
InMultiLineCommentTrivia = 1,
InSingleQuoteStringLiteral = 2,
InDoubleQuoteStringLiteral = 3,
InTemplateHeadOrNoSubstitutionTemplate = 4,
InTemplateMiddleOrTail = 5,
InTemplateSubstitutionPosition = 6,
}
enum TokenClass {
Punctuation = 0,
Expand All @@ -1793,7 +1796,26 @@ declare module "typescript" {
classification: TokenClass;
}
interface Classifier {
getClassificationsForLine(text: string, lexState: EndOfLineState, classifyKeywordsInGenerics?: boolean): ClassificationResult;
/**
* Gives lexical classifications of tokens on a line without any syntactic context.
* For instance, a token consisting of the text 'string' can be either an identifier
* named 'string' or the keyword 'string', however, because this classifier is not aware,
* it relies on certain heuristics to give acceptable results. For classifications where
* speed trumps accuracy, this function is preferable; however, for true accuracy, the
* syntactic classifier is ideal. In fact, in certain editing scenarios, combining the
* lexical, syntactic, and semantic classifiers may issue the best user experience.
*
* @param text The text of a line to classify.
* @param lexState The state of the lexical classifier at the end of the previous line.
* @param syntacticClassifierAbsent Whether the client is *not* using a syntactic classifier.
* If there is no syntactic classifier (syntacticClassifierAbsent=true),
* certain heuristics may be used in its place; however, if there is a
* syntactic classifier (syntacticClassifierAbsent=false), certain
* classifications which may be incorrectly categorized will be given
* back as Identifiers in order to allow the syntactic classifier to
* subsume the classification.
*/
getClassificationsForLine(text: string, lexState: EndOfLineState, syntacticClassifierAbsent: boolean): ClassificationResult;
}
/**
* The document registry represents a store of SourceFile objects that can be shared between
Expand Down
Loading