hecrj · ymtszw · Feb 19, 2022 · Feb 19, 2022 · Feb 19, 2022 · Feb 19, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- Support for parsing `<script>` tag
 
 ## [2.4.0] - 2020-08-07
 ### Added

diff --git a/README.md b/README.md
@@ -10,7 +10,6 @@ Html.Parser.run "<div><p>Hello, world!</p></div>"
 ```
 
 ## Limitations
-  * `<script>` tags are not fully supported.
   * SVG is not supported.
 
 Feel free to contribute!

diff --git a/src/Html/Parser.elm b/src/Html/Parser.elm
@@ -259,6 +259,17 @@ element =
                             ]
                         |. Parser.chompIf ((==) '>')
 
+                else if name == "script" then
+                    -- <script> can contain JavaScript operator '<' which confuses `closingTag` parser so:
+                    --
+                    -- * look for `</script>` exactly to close the tag
+                    -- * UNLESS it is contained in a JavaScript string or comment
+                    --
+                    -- In essence, we provide a partial JavaScript parser here
+                    Parser.succeed (Element name attributes)
+                        |. Parser.chompIf ((==) '>')
+                        |= consumeJavaScriptUntilClosingTag
+
                 else
                     Parser.succeed (Element name attributes)
                         |. Parser.chompIf ((==) '>')
@@ -267,6 +278,103 @@ element =
             )
 
 
+-- Parses the body of a `<script>` element up to and including its `</script>`.
+-- JavaScript comments and string literals are consumed whole, so a `</script>`
+-- inside them does not end the element. Yields the child nodes in source order.
+consumeJavaScriptUntilClosingTag : Parser (List Node)
+consumeJavaScriptUntilClosingTag =
+    let
+        -- `gathered` is newest-first; adjacent text chunks merge into one `Text` node
+        push incoming gathered =
+            case ( gathered, incoming ) of
+                ( (Text earlier) :: remaining, Text later ) ->
+                    Text (earlier ++ later) :: remaining
+
+                _ ->
+                    incoming :: gathered
+
+        advance gathered =
+            let
+                emit =
+                    Parser.map (\incoming -> Parser.Loop (push incoming gathered))
+
+                emitText =
+                    Parser.map Text >> emit
+
+                emitChomped =
+                    Parser.getChompedString >> emitText
+            in
+            Parser.oneOf
+                [ -- HTML comments were once used to hide scripts from ancient browsers
+                  emit comment
+                , emitChomped (Parser.lineComment "//")
+                , emitChomped (Parser.multiComment "/*" "*/" Parser.NotNestable)
+                , emitText (javaScriptStringLike '"')
+                , emitText (javaScriptStringLike '\'')
+                , emitText (javaScriptStringLike '`')
+                , -- Backtrackable: `closingScriptTag` consumes `</` before checking the name, and a
+                  -- committed failure would abort `oneOf` on script text such as `</div>`
+                  Parser.map (\() -> Parser.Done (List.reverse gathered)) (Parser.backtrackable closingScriptTag)
+                , emitChomped (Parser.chompIf (always True))
+                ]
+    in
+    Parser.loop [] advance
+
+
+-- Matches `</script>`: name in any letter case, then `isSpaceCharacter` chars, then `>`.
+-- NOTE: `</` is consumed before the name is checked, so a mismatch is a committed failure.
+closingScriptTag : Parser ()
+closingScriptTag =
+    let
+        expectScript chunk =
+            if String.toLower chunk == "script" then
+                Parser.succeed ()
+
+            else
+                Parser.problem "not a </script>"
+    in
+    Parser.token "</"
+        |. Parser.andThen expectScript (Parser.getChompedString (Parser.chompWhile (\char -> not (char == '>' || isSpaceCharacter char))))
+        |. Parser.chompWhile isSpaceCharacter
+        |. Parser.token ">"
+
+
+-- Parses a JavaScript literal delimited by `terminatorChar` and returns its source
+-- text unchanged: the loop yields the inside, then both delimiters are re-attached.
+javaScriptStringLike : Char -> Parser String
+javaScriptStringLike terminatorChar =
+    let
+        delimiter =
+            String.fromChar terminatorChar
+    in
+    Parser.succeed (\inner -> delimiter ++ inner ++ delimiter)
+        |. Parser.token delimiter
+        |= Parser.loop "" (stringHelp terminatorChar delimiter)
+
+
+-- One step of the string-literal loop: keep a backslash plus the character after it,
+-- finish at the closing delimiter, or append a run of ordinary characters to `acc`.
+stringHelp : Char -> String -> String -> Parser (Parser.Step String String)
+stringHelp terminatorChar terminatorStr acc =
+    Parser.oneOf
+        [ Parser.succeed (\escaped -> Parser.Loop (acc ++ "\\" ++ escaped))
+            |. Parser.token "\\"
+            |= justOneChar
+        , Parser.map (always (Parser.Done acc)) (Parser.token terminatorStr)
+        , Parser.getChompedString (chompOneOrMore (\c -> not (c == '\\' || c == terminatorChar)))
+            |> Parser.map (\plain -> Parser.Loop (acc ++ plain))
+        ]
+
+
+-- Consumes exactly one character, whatever it is, and returns it as a string.
+-- Fails at end of input. `Parser.chompIf` already consumes a single character,
+-- so no `Parser.loop` wrapper is needed around it.
+justOneChar : Parser String
+justOneChar =
+    Parser.chompIf (always True)
+        |> Parser.getChompedString
+
+
 tagName : Parser String
 tagName =
     Parser.getChompedString

diff --git a/tests/Main.elm → tests/MainTests.elm b/tests/Main.elm → tests/MainTests.elm
@@ -1,4 +1,4 @@
-module Main exposing (suite)
+module MainTests exposing (suite)
 
 import Dict
 import Expect exposing (Expectation)
@@ -164,9 +164,117 @@ documentTests =
         [ test "minimal" (testParseDocument "<!DOCTYPE html><html></html>" (Html.Parser.Document [] "" [] ( [], [] ) []))
         , test "example1" (testParseDocument "<!--Early!--><!DOCTYPE html LEGACY \"My legacy string stuff\"><!--Teehee!--><html><p>Got it.</p><br></html><!--Smelly feet-->" { doctype = "LEGACY \"My legacy string stuff\"", document = ( [], [ Element "p" [] [ Text "Got it." ], Element "br" [] [] ] ), postdocComments = [ "Smelly feet" ], preambleComments = [ "Early!" ], predocComments = [ "Teehee!" ] })
         , test "recapitalized1" (testParseDocument "<!--EaRlY!--><!DoCtYpE HtMl lEgAcY \"mY LeGaCy StRiNg StUfF\"><!--tEeHeE!--><HtMl><P>gOt It.</P><bR></HtMl><!--sMeLlY fEeT-->" { doctype = "lEgAcY \"mY LeGaCy StRiNg StUfF\"", document = ( [], [ Element "p" [] [ Text "gOt It." ], Element "br" [] [] ] ), postdocComments = [ "sMeLlY fEeT" ], preambleComments = [ "EaRlY!" ], predocComments = [ "tEeHeE!" ] })
+        , test "realWorld1"
+            (testParseDocument realWorld1
+                { preambleComments = []
+                , doctype = ""
+                , predocComments = []
+                , postdocComments = []
+                , document =
+                    ( []
+                    , [ Text "\n  "
+                      , Element "head"
+                            []
+                            [ Text "\n    "
+                            , Element "meta" [ ( "charset", "utf-8" ) ] []
+                            , Text "\n    "
+                            , Element "title" [] [ Text "Title" ]
+                            , Text "\n    "
+                            , Element "link" [ ( "rel", "stylesheet" ), ( "href", "/style.css" ) ] []
+                            , Text "\n    "
+                            , Element "link" [ ( "rel", "canonical" ), ( "href", "https://example.com" ) ] []
+                            , Text "\n    "
+                            , Element "script" [ ( "async", "" ), ( "type", "text/javascript" ), ( "src", "https://external.example.com/script.js" ) ] []
+                            , Text "\n    "
+                            , Comment " Google Analytics "
+                            , Text "\n    "
+                            , Element "script" [ ( "async", "" ), ( "src", "https://www.googletagmanager.com/gtag/js?id=xxxxxxxx" ) ] []
+                            , Text "\n    "
+                            , Element "script" [] [ Text """
+        /**
+            Block comments
+        */
+        window.dataLayer = window.dataLayer || [];
+        function gtag(){dataLayer.push(arguments);}
+        gtag('js', new Date());
+        gtag('config', 'xxxxxxxx');
+    """ ]
+                            , Text "\n  "
+                            ]
+                      , Text "\n  "
+                      , Element "body"
+                            []
+                            [ Text "\n    "
+                            , Element "div" [ ( "id", "root" ) ] []
+                            , Text "\n    "
+                            , Element "script" [] [ Comment """
+    // Ancient Browser Workaround
+    // Hiding <script> contents
+    document.write('<script src="inline.js"></script>');
+    //""" ]
+                            , Text "\n    "
+                            , Element "script" [] [ Text """
+        var dqStringWithScript = "<script></script> inside JavaScript double-quoted string must be ignored";
+        var sqStringWithScript = '<script></script> inside JavaScript single-quoted string must be ignored';
+        var templateWithScript = `<script></script> inside JavaScript template literal must be ignored; ${"even interpolated <script></script>"}`;
+        // <script></script> inside JavaScript line comment must be ignored
+        /*
+            <script></script> inside JavaScript multiline comment must be ignored
+        */
+    """ ]
+                            , Text "\n  "
+                            ]
+                      , Text "\n"
+                      ]
+                    )
+                }
+            )
         ]
 
 
+realWorld1 : String
+realWorld1 =
+    """<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <title>Title</title>
+    <link rel="stylesheet" href="/style.css">
+    <link rel="canonical" href="https://example.com">
+    <script async type='text/javascript' src='https://external.example.com/script.js'></script>
+    <!-- Google Analytics -->
+    <script async src="https://www.googletagmanager.com/gtag/js?id=xxxxxxxx"></script>
+    <script>
+        /**
+            Block comments
+        */
+        window.dataLayer = window.dataLayer || [];
+        function gtag(){dataLayer.push(arguments);}
+        gtag('js', new Date());
+        gtag('config', 'xxxxxxxx');
+    </script>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script><!--
+    // Ancient Browser Workaround
+    // Hiding <script> contents
+    document.write('<script src="inline.js"></script>');
+    //--></script>
+    <script>
+        var dqStringWithScript = "<script></script> inside JavaScript double-quoted string must be ignored";
+        var sqStringWithScript = '<script></script> inside JavaScript single-quoted string must be ignored';
+        var templateWithScript = `<script></script> inside JavaScript template literal must be ignored; ${"even interpolated <script></script>"}`;
+        // <script></script> inside JavaScript line comment must be ignored
+        /*
+            <script></script> inside JavaScript multiline comment must be ignored
+        */
+    </script>
+  </body>
+</html>
+"""
+
+
 documentToStringTests : Test
 documentToStringTests =
     describe "documentToString"
@@ -240,6 +348,8 @@ errorTests =
         , test "wrong DOCTYPE keyword" (testDocumentError "<!DOCTYRP html><html></html>")
         , test "wrong DOCTYPE" (testDocumentError "<!DOCTYPE httl><html></html>")
         , test "wrong html tag" (testDocumentError "<!DOCTYPE html><document></document>")
+        , test "incomplete script1" (testDocumentError "<script>")
+        , test "incomplete script2 (PR#18 comment)" (testDocumentError "<script>'")
         ]
 
 
@@ -254,6 +364,5 @@ suite =
         , commentTests
         , attributeTests
         , errorTests
-
-        --, scriptTests
+        , scriptTests
         ]