Skip to content

Commit

Permalink
feat(parser/html): handle script and style tags (#3970)
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 authored Sep 18, 2024
1 parent 0ca9b69 commit ad1a744
Show file tree
Hide file tree
Showing 11 changed files with 356 additions and 3 deletions.
31 changes: 30 additions & 1 deletion crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mod tests;

use crate::token_source::HtmlLexContext;
use crate::token_source::{HtmlEmbededLanguage, HtmlLexContext};
use biome_html_syntax::HtmlSyntaxKind::{
COMMENT, DOCTYPE_KW, EOF, ERROR_TOKEN, HTML_KW, HTML_LITERAL, HTML_STRING_LITERAL, NEWLINE,
TOMBSTONE, UNICODE_BOM, WHITESPACE,
Expand Down Expand Up @@ -111,6 +111,32 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Consume an embedded language in its entirety. Stops immediately before the closing tag.
///
/// The closing tag is matched ASCII case-insensitively against `lang.end_tag()`.
/// If the closing tag never appears, everything up to the end of the source is consumed.
///
/// Returns `HTML_LITERAL` when at least one byte of content was consumed. Otherwise the
/// lexer is already sitting on the closing tag and its leading `<` is consumed instead.
fn consume_token_embedded_language(
    &mut self,
    _current: u8,
    lang: HtmlEmbededLanguage,
) -> HtmlSyntaxKind {
    let start = self.text_position();
    let end_tag = lang.end_tag().as_bytes();
    while self.current_byte().is_some() {
        // Compare on bytes with a checked range: indexing the `str` directly panics when
        // fewer than `end_tag.len()` bytes remain (unterminated element) or when the
        // range boundaries fall inside a multi-byte UTF-8 character.
        let at_end_tag = self
            .source
            .as_bytes()
            .get(self.position..self.position + end_tag.len())
            .map_or(false, |candidate| candidate.eq_ignore_ascii_case(end_tag));
        if at_end_tag {
            break;
        }
        self.advance(1);
    }

    if self.text_position() != start {
        HTML_LITERAL
    } else {
        // if the element is empty, we will immediately hit the closing tag.
        // we HAVE to consume something, so we start consuming the closing tag.
        self.consume_byte(T![<])
    }
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -442,6 +468,9 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
HtmlLexContext::Doctype => self.consume_token_doctype(current),
HtmlLexContext::EmbeddedLanguage(lang) => {
self.consume_token_embedded_language(current, lang)
}
},
None => EOF,
}
Expand Down
21 changes: 19 additions & 2 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod parse_error;

use crate::parser::HtmlParser;
use crate::syntax::parse_error::*;
use crate::token_source::HtmlLexContext;
use crate::token_source::{HtmlEmbededLanguage, HtmlLexContext};
use biome_html_syntax::HtmlSyntaxKind::*;
use biome_html_syntax::{HtmlSyntaxKind, T};
use biome_parser::parse_lists::ParseNodeList;
Expand All @@ -20,6 +20,9 @@ static VOID_ELEMENTS: &[&str] = &[
"wbr",
];

/// Elements whose content is treated as raw text: no HTML parsing is done inside them,
/// so that their contents can be parsed by a different parser.
///
/// Tag names are matched against this list ASCII case-insensitively.
pub(crate) static EMBEDDED_LANGUAGE_ELEMENTS: &[&str] = &["script", "style"];

pub(crate) fn parse_root(p: &mut HtmlParser) {
let m = p.start();

Expand Down Expand Up @@ -76,6 +79,9 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
let should_be_self_closing = VOID_ELEMENTS
.iter()
.any(|tag| tag.eq_ignore_ascii_case(opening_tag_name.as_str()));
let is_embedded_language_tag = EMBEDDED_LANGUAGE_ELEMENTS
.iter()
.any(|tag| tag.eq_ignore_ascii_case(opening_tag_name.as_str()));
parse_literal(p).or_add_diagnostic(p, expected_element_name);

AttributeList.parse_list(p);
Expand All @@ -92,7 +98,18 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
return Present(m.complete(p, HTML_SELF_CLOSING_ELEMENT));
}
p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
p.expect_with_context(
T![>],
if is_embedded_language_tag {
HtmlLexContext::EmbeddedLanguage(match opening_tag_name.as_str() {
tag if tag.eq_ignore_ascii_case("script") => HtmlEmbededLanguage::Script,
tag if tag.eq_ignore_ascii_case("style") => HtmlEmbededLanguage::Style,
_ => unreachable!(),
})
} else {
HtmlLexContext::OutsideTag
},
);
let opening = m.complete(p, HTML_OPENING_ELEMENT);
loop {
ElementList.parse_list(p);
Expand Down
17 changes: 17 additions & 0 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,23 @@ pub(crate) enum HtmlLexContext {
///
/// When the parser encounters the sequence `<!DOCTYPE`, it switches to this context. It will remain in this context until the next `>` token is encountered.
Doctype,
/// Treat everything as text until the closing tag is encountered.
EmbeddedLanguage(HtmlEmbededLanguage),
}

/// The embedded languages whose element content is lexed as raw text.
#[derive(Debug, Clone, Copy)]
pub(crate) enum HtmlEmbededLanguage {
    /// Content of a `script` element.
    Script,
    /// Content of a `style` element.
    Style,
}

impl HtmlEmbededLanguage {
    /// Returns the closing tag that terminates this language's content.
    pub fn end_tag(&self) -> &'static str {
        match *self {
            HtmlEmbededLanguage::Script => "</script>",
            HtmlEmbededLanguage::Style => "</style>",
        }
    }
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<script>
console.log('Hello, world!');
</script>
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<script>
console.log('Hello, world!');
</script>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..7 "script" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@7..8 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@8..40 "\n\tconsole.log('Hello, world!');\n" [] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@40..41 "<" [] [],
slash_token: SLASH@41..42 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@42..48 "script" [] [],
},
r_angle_token: R_ANGLE@48..49 ">" [] [],
},
},
eof_token: EOF@49..50 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: (empty)
2: [email protected]
0: [email protected]
0: [email protected] "<" [] []
1: [email protected]
0: [email protected] "script" [] []
2: [email protected]
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected] "\n\tconsole.log('Hello, world!');\n" [] []
2: [email protected]
0: [email protected] "<" [] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "script" [] []
3: [email protected] ">" [] []
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<script></script>
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<script></script>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..7 "script" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@7..8 ">" [] [],
},
children: HtmlElementList [],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@8..9 "<" [] [],
slash_token: SLASH@9..10 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@10..16 "script" [] [],
},
r_angle_token: R_ANGLE@16..17 ">" [] [],
},
},
eof_token: EOF@17..18 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: (empty)
2: [email protected]
0: [email protected]
0: [email protected] "<" [] []
1: [email protected]
0: [email protected] "script" [] []
2: [email protected]
3: [email protected] ">" [] []
1: [email protected]
2: [email protected]
0: [email protected] "<" [] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "script" [] []
3: [email protected] ">" [] []
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<script>window.jQuery || document.write('<script src="js/vendor/jquery-3.3.1.min.js"><\/script>')</script>
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<script>window.jQuery || document.write('<script src="js/vendor/jquery-3.3.1.min.js"><\/script>')</script>
```
## AST
```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..7 "script" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@7..8 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@8..97 "window.jQuery || document.write('<script src=\"js/vendor/jquery-3.3.1.min.js\"><\\/script>')" [] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@97..98 "<" [] [],
slash_token: SLASH@98..99 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@99..105 "script" [] [],
},
r_angle_token: R_ANGLE@105..106 ">" [] [],
},
},
eof_token: EOF@106..107 "" [Newline("\n")] [],
}
```
## CST
```
0: [email protected]
0: (empty)
1: (empty)
2: [email protected]
0: [email protected]
0: [email protected] "<" [] []
1: [email protected]
0: [email protected] "script" [] []
2: [email protected]
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected] "window.jQuery || document.write('<script src=\"js/vendor/jquery-3.3.1.min.js\"><\\/script>')" [] []
2: [email protected]
0: [email protected] "<" [] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "script" [] []
3: [email protected] ">" [] []
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<script>
if (1 < 2) {
console.log('Hello, world!');
}
</script>
Loading

0 comments on commit ad1a744

Please sign in to comment.