Skip to content

Feat: Support <script> #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Support for parsing `<script>` tag

## [2.4.0] - 2020-08-07
### Added
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Html.Parser.run "<div><p>Hello, world!</p></div>"
```

## Limitations
* `<script>` tags are not fully supported.
* SVG is not supported.

Feel free to contribute!
Expand Down
108 changes: 108 additions & 0 deletions src/Html/Parser.elm
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,17 @@ element =
]
|. Parser.chompIf ((==) '>')

else if name == "script" then
-- <script> can contain JavaScript operator '<' which confuses `closingTag` parser so:
--
-- * look for `</script>` exactly to close the tag
-- * UNLESS it is contained in a JavaScript string or comment
--
-- In essence, we provide a partial JavaScript parser here
Parser.succeed (Element name attributes)
|. Parser.chompIf ((==) '>')
|= consumeJavaScriptUntilClosingTag

else
Parser.succeed (Element name attributes)
|. Parser.chompIf ((==) '>')
Expand All @@ -267,6 +278,103 @@ element =
)


{-| Parse the contents of a `<script>` element, stopping once
`closingScriptTag` matches, and yield the collected nodes in source order.

Alternatives are tried in this order at every position:

1.  an HTML comment (`comment`), kept as its own node -- such comments were
    historically used inside `<script>` to hide code from very old browsers;
2.  a JavaScript `//` line comment or `/* */` block comment;
3.  a string-like literal delimited by `"`, `'` or a backtick;
4.  the closing tag, which ends the loop;
5.  any single character.

Because comments and string-like literals are tried before the closing tag,
a `</script>` appearing inside them is consumed as text rather than ending
the element. Consecutive text chunks are merged into one `Text` node.

-}
consumeJavaScriptUntilClosingTag : Parser (List Node)
consumeJavaScriptUntilClosingTag =
    let
        -- Push a node onto the reversed accumulator, fusing it with the
        -- top-most node when both are text.
        push : List Node -> Node -> List Node
        push nodes node =
            case ( nodes, node ) of
                ( (Text previous) :: rest, Text next ) ->
                    Text (previous ++ next) :: rest

                _ ->
                    node :: nodes

        step : List Node -> Parser (Parser.Step (List Node) (List Node))
        step nodes =
            let
                keepNode =
                    Parser.map (\node -> Parser.Loop (push nodes node))

                keepText =
                    Parser.map (\chunk -> Parser.Loop (push nodes (Text chunk)))
            in
            Parser.oneOf
                [ keepNode comment
                , keepText (Parser.getChompedString (Parser.lineComment "//"))
                , keepText (Parser.getChompedString (Parser.multiComment "/*" "*/" Parser.NotNestable))
                , keepText (javaScriptStringLike '"')
                , keepText (javaScriptStringLike '\'')
                , keepText (javaScriptStringLike '`')
                , Parser.map (\() -> Parser.Done (List.reverse nodes)) closingScriptTag

                -- Fallback: take exactly one character of script text.
                , keepText (Parser.getChompedString (Parser.chompIf (always True)))
                ]
    in
    Parser.loop [] step


{-| Match a closing script tag: `</`, a tag name equal to `script`
(compared case-insensitively), optional space characters, then `>`.

The whole parser is wrapped in `Parser.backtrackable`. Without it, the
leading `</` token consumes input, so any `</` that does not turn out to be
a script closing tag (for example `</div>` inside a
`<script type="text/template">`) fails as a committed error. A committed
error makes `Parser.oneOf` give up instead of trying its remaining
alternatives. With backtracking, such input simply does not match here.

-}
closingScriptTag : Parser ()
closingScriptTag =
    let
        requireScriptName chunk =
            if String.toLower chunk == "script" then
                Parser.succeed ()

            else
                Parser.problem "not a </script>"
    in
    Parser.backtrackable
        (Parser.token "</"
            |. (Parser.chompWhile (\char -> char /= '>' && not (isSpaceCharacter char))
                    |> Parser.getChompedString
                    |> Parser.andThen requireScriptName
               )
            |. Parser.chompWhile isSpaceCharacter
            |. Parser.token ">"
        )


{-| Parse a JavaScript string-like literal delimited on both sides by
`terminatorChar`.

The opening delimiter is consumed first, then `stringHelp` is looped to read
the body up to the closing delimiter. The result is the body wrapped in the
delimiter again, so the literal is returned in its original shape.

-}
javaScriptStringLike : Char -> Parser String
javaScriptStringLike terminatorChar =
    let
        delimiter =
            String.fromChar terminatorChar

        -- Put the delimiters back around the parsed body.
        requote body =
            delimiter ++ body ++ delimiter
    in
    Parser.succeed requote
        |. Parser.token delimiter
        |= Parser.loop "" (stringHelp terminatorChar delimiter)


{-| One loop step for reading the body of a string-like literal, with `acc`
holding the body read so far.

  - A backslash followed by any one character is appended verbatim, backslash
    included, so an escaped terminator does not end the literal.
  - The terminator ends the loop; the result is `acc` without the terminator.
  - Otherwise a run of one or more characters that are neither a backslash
    nor the terminator is appended.

-}
stringHelp : Char -> String -> String -> Parser (Parser.Step String String)
stringHelp terminatorChar terminatorStr acc =
    let
        continueWith chunk =
            Parser.Loop (acc ++ chunk)

        isPlainChar char =
            not (char == '\\' || char == terminatorChar)
    in
    Parser.oneOf
        [ Parser.succeed (\escaped -> continueWith ("\\" ++ escaped))
            |. Parser.token "\\"
            |= justOneChar
        , Parser.map (always (Parser.Done acc)) (Parser.token terminatorStr)
        , Parser.map continueWith (Parser.getChompedString (chompOneOrMore isPlainChar))
        ]


{-| Consume exactly one character, whatever it is, and return it as a
one-character string. Fails when there is no input left.

`Parser.chompIf` already consumes a single character, so no `Parser.loop`
wrapper is needed around it.

-}
justOneChar : Parser String
justOneChar =
    Parser.getChompedString (Parser.chompIf (always True))
Comment on lines +369 to +375
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This "consume exactly one character, whatever it is (after a backslash escape)" helper is somewhat rough, but I couldn't come up with a better implementation.



tagName : Parser String
tagName =
Parser.getChompedString
Expand Down
115 changes: 112 additions & 3 deletions tests/Main.elm → tests/MainTests.elm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module Main exposing (suite)
module MainTests exposing (suite)

import Dict
import Expect exposing (Expectation)
Expand Down Expand Up @@ -164,9 +164,117 @@ documentTests =
[ test "minimal" (testParseDocument "<!DOCTYPE html><html></html>" (Html.Parser.Document [] "" [] ( [], [] ) []))
, test "example1" (testParseDocument "<!--Early!--><!DOCTYPE html LEGACY \"My legacy string stuff\"><!--Teehee!--><html><p>Got it.</p><br></html><!--Smelly feet-->" { doctype = "LEGACY \"My legacy string stuff\"", document = ( [], [ Element "p" [] [ Text "Got it." ], Element "br" [] [] ] ), postdocComments = [ "Smelly feet" ], preambleComments = [ "Early!" ], predocComments = [ "Teehee!" ] })
, test "recapitalized1" (testParseDocument "<!--EaRlY!--><!DoCtYpE HtMl lEgAcY \"mY LeGaCy StRiNg StUfF\"><!--tEeHeE!--><HtMl><P>gOt It.</P><bR></HtMl><!--sMeLlY fEeT-->" { doctype = "lEgAcY \"mY LeGaCy StRiNg StUfF\"", document = ( [], [ Element "p" [] [ Text "gOt It." ], Element "br" [] [] ] ), postdocComments = [ "sMeLlY fEeT" ], preambleComments = [ "EaRlY!" ], predocComments = [ "tEeHeE!" ] })
, test "realWorld1"
(testParseDocument realWorld1
{ preambleComments = []
, doctype = ""
, predocComments = []
, postdocComments = []
, document =
( []
, [ Text "\n "
, Element "head"
[]
[ Text "\n "
, Element "meta" [ ( "charset", "utf-8" ) ] []
, Text "\n "
, Element "title" [] [ Text "Title" ]
, Text "\n "
, Element "link" [ ( "rel", "stylesheet" ), ( "href", "/style.css" ) ] []
, Text "\n "
, Element "link" [ ( "rel", "canonical" ), ( "href", "https://example.com" ) ] []
, Text "\n "
, Element "script" [ ( "async", "" ), ( "type", "text/javascript" ), ( "src", "https://external.example.com/script.js" ) ] []
, Text "\n "
, Comment " Google Analytics "
, Text "\n "
, Element "script" [ ( "async", "" ), ( "src", "https://www.googletagmanager.com/gtag/js?id=xxxxxxxx" ) ] []
, Text "\n "
, Element "script" [] [ Text """
/**
Block comments
*/
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'xxxxxxxx');
""" ]
, Text "\n "
]
, Text "\n "
, Element "body"
[]
[ Text "\n "
, Element "div" [ ( "id", "root" ) ] []
, Text "\n "
, Element "script" [] [ Comment """
// Ancient Browser Workaround
// Hiding <script> contents
document.write('<script src="inline.js"></script>');
//""" ]
, Text "\n "
, Element "script" [] [ Text """
var dqStringWithScript = "<script></script> inside JavaScript double-quoted string must be ignored";
var sqStringWithScript = '<script></script> inside JavaScript single-quoted string must be ignored';
var templateWithScript = `<script></script> inside JavaScript template literal must be ignored; ${"even interpolated <script></script>"}`;
// <script></script> inside JavaScript line comment must be ignored
/*
<script></script> inside JavaScript multiline comment must be ignored
*/
""" ]
, Text "\n "
]
, Text "\n"
]
)
}
)
]


{-| HTML document fixture for the `realWorld1` document test.

It contains `<script>` elements with `src` attributes and empty bodies, an
inline script with a block comment, an inline script whose body is wrapped in
an HTML comment, and an inline script where `<script></script>` appears
inside JavaScript string literals, a template literal, a line comment and a
block comment.

-}
realWorld1 : String
realWorld1 =
"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Title</title>
<link rel="stylesheet" href="/style.css">
<link rel="canonical" href="https://example.com">
<script async type='text/javascript' src='https://external.example.com/script.js'></script>
<!-- Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=xxxxxxxx"></script>
<script>
/**
Block comments
*/
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'xxxxxxxx');
</script>
</head>
<body>
<div id="root"></div>
<script><!--
// Ancient Browser Workaround
// Hiding <script> contents
document.write('<script src="inline.js"></script>');
//--></script>
<script>
var dqStringWithScript = "<script></script> inside JavaScript double-quoted string must be ignored";
var sqStringWithScript = '<script></script> inside JavaScript single-quoted string must be ignored';
var templateWithScript = `<script></script> inside JavaScript template literal must be ignored; ${"even interpolated <script></script>"}`;
// <script></script> inside JavaScript line comment must be ignored
/*
<script></script> inside JavaScript multiline comment must be ignored
*/
</script>
</body>
</html>
"""


documentToStringTests : Test
documentToStringTests =
describe "documentToString"
Expand Down Expand Up @@ -240,6 +348,8 @@ errorTests =
, test "wrong DOCTYPE keyword" (testDocumentError "<!DOCTYRP html><html></html>")
, test "wrong DOCTYPE" (testDocumentError "<!DOCTYPE httl><html></html>")
, test "wrong html tag" (testDocumentError "<!DOCTYPE html><document></document>")
, test "incomplete script1" (testDocumentError "<script>")
, test "incomplete script2 (PR#18 comment)" (testDocumentError "<script>'")
]


Expand All @@ -254,6 +364,5 @@ suite =
, commentTests
, attributeTests
, errorTests

--, scriptTests
, scriptTests
]