From c00677733f39a474ef81cab77be57b2ec01d24df Mon Sep 17 00:00:00 2001
From: Mark Sujew
Date: Fri, 17 Jan 2025 10:14:30 +0100
Subject: [PATCH] Improve performance of hidden node parsing (#1790)

---
 .../langium/src/parser/cst-node-builder.ts    | 72 ++++++++++---------
 packages/langium/src/parser/langium-parser.ts | 25 ++++++-
 packages/langium/src/serializer/hydrator.ts   | 13 ++--
 packages/langium/src/syntax-tree.ts           |  4 +-
 .../test/parser/langium-parser.test.ts        | 27 ++++++-
 packages/langium/test/utils/cst-utils.test.ts | 33 +++++++++
 6 files changed, 126 insertions(+), 48 deletions(-)

diff --git a/packages/langium/src/parser/cst-node-builder.ts b/packages/langium/src/parser/cst-node-builder.ts
index 95a744ab6..72c89a160 100644
--- a/packages/langium/src/parser/cst-node-builder.ts
+++ b/packages/langium/src/parser/cst-node-builder.ts
@@ -9,7 +9,6 @@ import type { Range } from 'vscode-languageserver-types';
 import type { AbstractElement } from '../languages/generated/ast.js';
 import type { AstNode, CompositeCstNode, CstNode, LeafCstNode, RootCstNode } from '../syntax-tree.js';
 import { Position } from 'vscode-languageserver-types';
-import { isCompositeCstNode } from '../syntax-tree.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 
 export class CstNodeBuilder {
@@ -17,7 +16,7 @@ export class CstNodeBuilder {
     private rootNode!: RootCstNodeImpl;
     private nodeStack: CompositeCstNodeImpl[] = [];
 
-    private get current(): CompositeCstNodeImpl {
+    get current(): CompositeCstNodeImpl {
         return this.nodeStack[this.nodeStack.length - 1] ?? this.rootNode;
     }
 
@@ -37,8 +36,8 @@
         return compositeNode;
     }
 
-    buildLeafNode(token: IToken, feature: AbstractElement): LeafCstNode {
-        const leafNode = new LeafCstNodeImpl(token.startOffset, token.image.length, tokenToRange(token), token.tokenType, false);
+    buildLeafNode(token: IToken, feature?: AbstractElement): LeafCstNode {
+        const leafNode = new LeafCstNodeImpl(token.startOffset, token.image.length, tokenToRange(token), token.tokenType, !feature);
         leafNode.grammarSource = feature;
         leafNode.root = this.rootNode;
         this.current.content.push(leafNode);
@@ -55,6 +54,39 @@
         }
     }
 
+    addHiddenNodes(tokens: IToken[]): void {
+        const nodes: LeafCstNode[] = [];
+        for (const token of tokens) {
+            const leafNode = new LeafCstNodeImpl(token.startOffset, token.image.length, tokenToRange(token), token.tokenType, true);
+            leafNode.root = this.rootNode;
+            nodes.push(leafNode);
+        }
+        let current: CompositeCstNode = this.current;
+        let added = false;
+        // If we are within a composite node, we add the hidden nodes to the content
+        if (current.content.length > 0) {
+            current.content.push(...nodes);
+            return;
+        }
+        // Otherwise we are at a newly created node
+        // Instead of adding the hidden nodes here, we search for the first parent node with content
+        while (current.container) {
+            const index = current.container.content.indexOf(current);
+            if (index > 0) {
+                // Add the hidden nodes before the current node
+                current.container.content.splice(index, 0, ...nodes);
+                added = true;
+                break;
+            }
+            current = current.container;
+        }
+        // If we arrive at the root node, we add the hidden nodes at the beginning
+        // This is the case if the hidden nodes are the first nodes in the tree
+        if (!added) {
+            this.rootNode.content.unshift(...nodes);
+        }
+    }
+
     construct(item: { $type: string | symbol | undefined, $cstNode: CstNode }): void {
         const current: CstNode = this.current;
         // The specified item could be a datatype ($type is symbol) or a fragment ($type is undefined)
@@ -70,34 +102,6 @@
             this.removeNode(node);
         }
     }
-
-    addHiddenTokens(hiddenTokens: IToken[]): void {
-        for (const token of hiddenTokens) {
-            const hiddenNode = new LeafCstNodeImpl(token.startOffset, token.image.length, tokenToRange(token), token.tokenType, true);
-            hiddenNode.root = this.rootNode;
-            this.addHiddenToken(this.rootNode, hiddenNode);
-        }
-    }
-
-    private addHiddenToken(node: CompositeCstNode, token: LeafCstNode): void {
-        const { offset: tokenStart, end: tokenEnd } = token;
-
-        for (let i = 0; i < node.content.length; i++) {
-            const child = node.content[i];
-            const { offset: childStart, end: childEnd } = child;
-            if (isCompositeCstNode(child) && tokenStart > childStart && tokenEnd < childEnd) {
-                this.addHiddenToken(child, token);
-                return;
-            } else if (tokenEnd <= childStart) {
-                node.content.splice(i, 0, token);
-                return;
-            }
-        }
-
-        // We know that we haven't found a suited position for the token
-        // So we simply add it to the end of the current node
-        node.content.push(token);
-    }
 }
 
 export abstract class AbstractCstNode implements CstNode {
@@ -107,7 +111,7 @@
     abstract get range(): Range;
 
     container?: CompositeCstNode;
-    grammarSource: AbstractElement;
+    grammarSource?: AbstractElement;
     root: RootCstNode;
     private _astNode?: AstNode;
 
@@ -117,7 +121,7 @@
     }
 
     /** @deprecated use `grammarSource` instead. */
-    get feature(): AbstractElement {
+    get feature(): AbstractElement | undefined {
        return this.grammarSource;
    }
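Note: the core of this change is the new CstNodeBuilder.addHiddenNodes method above, which replaces the removed addHiddenTokens/addHiddenToken pair. The following self-contained sketch shows the same insertion strategy on a simplified node type; TreeNode and attachHiddenNodes are illustrative names for this note only, not part of the patch.

// Standalone sketch of the insertion strategy used by the new addHiddenNodes.
// TreeNode is a hypothetical simplification of Langium's CompositeCstNode.
interface TreeNode {
    container?: TreeNode;
    content: TreeNode[];
}

function attachHiddenNodes(root: TreeNode, current: TreeNode, hidden: TreeNode[]): void {
    // If the node under construction already has children, the hidden nodes simply follow them.
    if (current.content.length > 0) {
        current.content.push(...hidden);
        return;
    }
    // Otherwise climb towards the root and splice the hidden nodes in front of the
    // first ancestor that has a preceding sibling.
    let node = current;
    while (node.container) {
        const index = node.container.content.indexOf(node);
        if (index > 0) {
            node.container.content.splice(index, 0, ...hidden);
            return;
        }
        node = node.container;
    }
    // Hidden nodes that precede all other content end up at the beginning of the root.
    root.content.unshift(...hidden);
}

Because this search only walks up the container chain of the node currently being built, each batch of hidden tokens costs at most the depth of that branch, whereas the removed addHiddenToken descended the whole tree from the root once per hidden token after parsing had finished.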
diff --git a/packages/langium/src/parser/langium-parser.ts b/packages/langium/src/parser/langium-parser.ts
index be07feaf2..03a28b207 100644
--- a/packages/langium/src/parser/langium-parser.ts
+++ b/packages/langium/src/parser/langium-parser.ts
@@ -10,7 +10,7 @@ import type { AbstractElement, Action, Assignment, ParserRule } from '../languag
 import type { Linker } from '../references/linker.js';
 import type { LangiumCoreServices } from '../services.js';
 import type { AstNode, AstReflection, CompositeCstNode, CstNode } from '../syntax-tree.js';
-import type { Lexer } from './lexer.js';
+import type { Lexer, LexerResult } from './lexer.js';
 import type { IParserConfig } from './parser-config.js';
 import type { ValueConverter } from './value-converter.js';
 import { defaultParserErrorProvider, EmbeddedActionsParser, LLkLookaheadStrategy } from 'chevrotain';
@@ -195,6 +195,7 @@ export class LangiumParser extends AbstractLangiumParser {
     private readonly converter: ValueConverter;
     private readonly astReflection: AstReflection;
     private readonly nodeBuilder = new CstNodeBuilder();
+    private lexerResult?: LexerResult;
     private stack: any[] = [];
     private assignmentMap = new Map();
@@ -232,15 +233,16 @@
     parse(input: string, options: ParserOptions = {}): ParseResult {
         this.nodeBuilder.buildRootNode(input);
-        const lexerResult = this.lexer.tokenize(input);
+        const lexerResult = this.lexerResult = this.lexer.tokenize(input);
         this.wrapper.input = lexerResult.tokens;
         const ruleMethod = options.rule ? this.allRules.get(options.rule) : this.mainRule;
         if (!ruleMethod) {
             throw new Error(options.rule ? `No rule found with name '${options.rule}'` : 'No main rule available.');
         }
         const result = ruleMethod.call(this.wrapper, {});
-        this.nodeBuilder.addHiddenTokens(lexerResult.hidden);
+        this.nodeBuilder.addHiddenNodes(lexerResult.hidden);
         this.unorderedGroups.clear();
+        this.lexerResult = undefined;
         return {
             value: result,
             lexerErrors: lexerResult.errors,
@@ -273,9 +275,26 @@
         };
     }
 
+    private extractHiddenTokens(token: IToken): IToken[] {
+        const hiddenTokens = this.lexerResult!.hidden;
+        if (!hiddenTokens.length) {
+            return [];
+        }
+        const offset = token.startOffset;
+        for (let i = 0; i < hiddenTokens.length; i++) {
+            const token = hiddenTokens[i];
+            if (token.startOffset > offset) {
+                return hiddenTokens.splice(0, i);
+            }
+        }
+        return hiddenTokens.splice(0, hiddenTokens.length);
+    }
+
     consume(idx: number, tokenType: TokenType, feature: AbstractElement): void {
         const token = this.wrapper.wrapConsume(idx, tokenType);
         if (!this.isRecording() && this.isValidToken(token)) {
+            const hiddenTokens = this.extractHiddenTokens(token);
+            this.nodeBuilder.addHiddenNodes(hiddenTokens);
             const leafNode = this.nodeBuilder.buildLeafNode(token, feature);
             const { assignment, isCrossRef } = this.getAssignment(feature);
             const current = this.current;
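Note: on the parser side, consume now drains the hidden tokens that precede the consumed token from the stored lexer result and hands them to the CST builder immediately; parse only attaches whatever hidden tokens remain after the last consumed token. A minimal sketch of that draining step, assuming hidden tokens arrive sorted by offset; HiddenToken and takeHiddenTokensBefore are illustrative names for this note, not Langium API.

// Illustrative sketch of how extractHiddenTokens consumes the lexer's hidden-token buffer.
interface HiddenToken {
    startOffset: number;
    image: string;
}

function takeHiddenTokensBefore(hidden: HiddenToken[], offset: number): HiddenToken[] {
    // Hidden tokens are produced in document order, so everything that starts
    // before 'offset' belongs to the token that is about to be consumed.
    for (let i = 0; i < hidden.length; i++) {
        if (hidden[i].startOffset > offset) {
            // splice removes the returned tokens from the shared buffer,
            // so the next call only sees the remaining hidden tokens.
            return hidden.splice(0, i);
        }
    }
    return hidden.splice(0, hidden.length);
}

// Usage: the comments before offset 16 are drained once and never scanned again.
const hidden: HiddenToken[] = [
    { startOffset: 0, image: '/* A */' },
    { startOffset: 8, image: '/* B */' },
    { startOffset: 20, image: '/* C */' }
];
console.log(takeHiddenTokensBefore(hidden, 16).map(t => t.image)); // ['/* A */', '/* B */']
console.log(hidden.map(t => t.image)); // ['/* C */']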
diff --git a/packages/langium/src/serializer/hydrator.ts b/packages/langium/src/serializer/hydrator.ts
index c18592f36..0f9477c70 100644
--- a/packages/langium/src/serializer/hydrator.ts
+++ b/packages/langium/src/serializer/hydrator.ts
@@ -296,23 +296,22 @@
         return this.lexer.definition[name];
     }
 
-    protected getGrammarElementId(node: AbstractElement): number | undefined {
+    protected getGrammarElementId(node: AbstractElement | undefined): number | undefined {
+        if (!node) {
+            return undefined;
+        }
         if (this.grammarElementIdMap.size === 0) {
             this.createGrammarElementIdMap();
         }
         return this.grammarElementIdMap.get(node);
     }
 
-    protected getGrammarElement(id: number): AbstractElement {
+    protected getGrammarElement(id: number): AbstractElement | undefined {
         if (this.grammarElementIdMap.size === 0) {
             this.createGrammarElementIdMap();
         }
         const element = this.grammarElementIdMap.getKey(id);
-        if (element) {
-            return element;
-        } else {
-            throw new Error('Invalid grammar element id: ' + id);
-        }
+        return element;
     }
 
     protected createGrammarElementIdMap(): void {
diff --git a/packages/langium/src/syntax-tree.ts b/packages/langium/src/syntax-tree.ts
index 47a548715..cbdc8bbbd 100644
--- a/packages/langium/src/syntax-tree.ts
+++ b/packages/langium/src/syntax-tree.ts
@@ -234,9 +234,9 @@
     /** The root CST node */
     readonly root: RootCstNode;
     /** The grammar element from which this node was parsed */
-    readonly grammarSource: AbstractElement;
+    readonly grammarSource?: AbstractElement;
     /** @deprecated use `grammarSource` instead. */
-    readonly feature: AbstractElement;
+    readonly feature?: AbstractElement;
     /** The AST node created from this CST node */
     readonly astNode: AstNode;
     /** @deprecated use `astNode` instead. */
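Note: the two hunks above are the API fallout of building hidden leaf nodes without a grammar element: grammarSource and the deprecated feature are now optional on CstNode, the hydrator accepts an undefined grammar element, and getGrammarElement no longer throws on an unknown id. Downstream code should therefore guard before dereferencing the source. A minimal sketch of such a guard, assuming the public 'langium' entry point; describeSource is an illustrative helper, not part of the patch.

import { GrammarAST, type CstNode } from 'langium';

// Hidden leaf nodes (comments, whitespace) now carry no grammar source,
// so the property has to be checked before it is used.
function describeSource(node: CstNode): string {
    const source = node.grammarSource;
    if (!source) {
        // Typically a hidden token such as a comment.
        return node.hidden ? 'hidden token' : 'unknown source';
    }
    return GrammarAST.isKeyword(source) ? `keyword '${source.value}'` : source.$type;
}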
diff --git a/packages/langium/test/parser/langium-parser.test.ts b/packages/langium/test/parser/langium-parser.test.ts
index 7da85a974..efb6491c7 100644
--- a/packages/langium/test/parser/langium-parser.test.ts
+++ b/packages/langium/test/parser/langium-parser.test.ts
@@ -4,9 +4,9 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { AstNode, LangiumCoreServices } from 'langium';
+import { EmptyFileSystem, type AstNode, type LangiumCoreServices } from 'langium';
 import { describe, expect, test, beforeEach } from 'vitest';
-import { createServicesForGrammar } from 'langium/grammar';
+import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
 import { parseHelper } from 'langium/test';
 
 describe('Partial parsing', () => {
@@ -66,6 +66,29 @@
 });
 
+describe('hidden node parsing', () => {
+
+    test('finishes in expected time', async () => {
+        const parser = createLangiumGrammarServices(EmptyFileSystem).grammar.parser.LangiumParser;
+        let content = 'Rule:';
+        // Adding hidden nodes used to cause exponential parsing time behavior
+        for (let i = 0; i < 2500; i++) {
+            content += "'a' /* A */ /* B */ /* C */\n";
+        }
+        content += ';';
+        const start = Date.now();
+        // This roughly takes 100-300 ms on a modern machine
+        // If it takes longer, the hidden node parsing is likely to be exponential
+        // On an older version of the parser, this took ~5 seconds
+        const result = parser.parse(content);
+        expect(result.lexerErrors).toHaveLength(0);
+        expect(result.parserErrors).toHaveLength(0);
+        const end = Date.now();
+        expect(end - start).toBeLessThan(1000);
+    });
+
+});
+
 interface A extends AstNode {
     name: string
 }
diff --git a/packages/langium/test/utils/cst-utils.test.ts b/packages/langium/test/utils/cst-utils.test.ts
index 89b506140..9c232e65d 100644
--- a/packages/langium/test/utils/cst-utils.test.ts
+++ b/packages/langium/test/utils/cst-utils.test.ts
@@ -62,6 +62,39 @@
     }
 });
 
+describe('findCommentNode', () => {
+    test('Finds correct comment with multiple comments before and after', async () => {
+        const text = expandToString`
+            Main: value=AB;
+            terminal AB:
+            /** A */
+            /** B */
+            'A'
+            /** C */
+            /** D */;
+        `;
+        const grammar = await parser(text);
+        const offset = text.indexOf("'A'") + 1;
+        const leafNode = findLeafNodeAtOffset(grammar.parseResult.value.$cstNode!, offset);
+        const keyword = leafNode?.astNode;
+        expect(keyword).toBeDefined();
+        const comment = CstUtils.findCommentNode(keyword?.$cstNode, ['ML_COMMENT']);
+        expect(comment?.text).toBe('/** B */');
+    });
+
+    test('Finds correct comment at the start of the file', async () => {
+        const text = expandToString`
+            /** A */
+            /** B */
+            /** C */
+            grammar test
+        `;
+        const grammar = await parser(text);
+        const comment = CstUtils.findCommentNode(grammar.parseResult.value.$cstNode, ['ML_COMMENT']);
+        expect(comment?.text).toBe('/** C */');
+    });
+});
+
 describe('compareRange', () => {
     test.each([
         // Different lines