executablebooks · chrisjsewell · Sep 27, 2022 · Sep 27, 2022 · Sep 27, 2022 · Sep 27, 2022
diff --git a/docs/index.md b/docs/index.md
@@ -85,6 +85,12 @@ html_string = md.render("some *Markdown*")
 .. autofunction:: mdit_py_plugins.container.container_plugin
 ```
 
+## Inline Attributes
+
+```{eval-rst}
+.. autofunction:: mdit_py_plugins.attrs.attrs_plugin
+```
+
 ## Math
 
 ```{eval-rst}

diff --git a/mdit_py_plugins/attrs/__init__.py b/mdit_py_plugins/attrs/__init__.py
@@ -0,0 +1 @@
+from .index import attrs_plugin  # noqa: F401
diff --git a/mdit_py_plugins/attrs/index.py b/mdit_py_plugins/attrs/index.py
@@ -0,0 +1,50 @@
+from markdown_it import MarkdownIt
+from markdown_it.rules_inline import StateInline
+
+from .parse import ParseError, parse
+
+
+def attrs_plugin(md: MarkdownIt, *, after=("image", "code_inline")):
+    """Parse inline attributes that immediately follow certain inline elements::
+
+        ![alt](https://image.com){#id .a b=c}
+
+    Inside the curly braces, the following syntax is possible:
+
+    - `.foo` specifies foo as a class.
+      Multiple classes may be given in this way; they will be combined.
+    - `#foo` specifies foo as an identifier.
+      An element may have only one identifier;
+      if multiple identifiers are given, the last one is used.
+    - `key="value"` or `key=value` specifies a key-value attribute.
+      Quotes are not needed when the value consists entirely of
+      ASCII alphanumeric characters or `_` or `:` or `-`.
+      Backslash escapes may be used inside quoted values.
+    - `%` begins a comment, which ends with the next `%` or the end of the attribute (`}`).
+
+    **Note:** This plugin is currently limited to "self-closing" elements,
+    such as images and code spans. It does not work with links or emphasis.
+
+    :param md: The MarkdownIt instance to modify.
+    :param after: The token types after which attributes may be specified.
+    """
+
+    def attr_rule(state: StateInline, silent: bool):
+        # Attributes must directly follow a finished token, with no text in between.
+        previous = None if state.pending or not state.tokens else state.tokens[-1]
+        if previous is None or previous.type not in after:
+            return False
+        try:
+            consumed, parsed = parse(state.src[state.pos :])
+        except ParseError:
+            return False
+        state.pos += consumed + 1  # step past the closing brace `parse` stopped on
+        if silent:
+            return True
+        if "class" in parsed and "class" in previous.attrs:
+            # Extend, rather than replace, classes already present on the token.
+            parsed["class"] = f"{previous.attrs['class']} {parsed['class']}"
+        previous.attrs.update(parsed)
+        return True
+
+    md.inline.ruler.push("attr", attr_rule)
diff --git a/mdit_py_plugins/attrs/parse.py b/mdit_py_plugins/attrs/parse.py
@@ -0,0 +1,265 @@
+"""Parser for attributes::
+
+    attributes { id = "foo", class = "bar baz",
+                key1 = "val1", key2 = "val2" }
+
+Adapted from:
+https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1
+
+syntax:
+
+attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}'
+attribute <- identifier | class | keyval
+identifier <- '#' name
+class <- '.' name
+name <- (nonspace, nonpunctuation other than ':', '_', '-')+
+keyval <- key '=' val
+key <- (ASCII_ALPHANUM | ':' | '_' | '-')+
+val <- bareval | quotedval
+bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+
+quotedval <- '"' ([^"] | '\"')* '"'
+"""
+from __future__ import annotations
+
+from enum import Enum
+import re
+from typing import Callable
+
+
+class State(Enum):
+    START = 0  # nothing consumed yet; the opening "{" is expected
+    SCANNING = 1  # between items inside the braces
+    SCANNING_ID = 2  # inside a "#name" identifier
+    SCANNING_CLASS = 3  # inside a ".name" class
+    SCANNING_KEY = 4  # inside a key, before its "="
+    SCANNING_VALUE = 5  # just after "=", before the value's first character
+    SCANNING_BARE_VALUE = 6  # inside an unquoted value
+    SCANNING_QUOTED_VALUE = 7  # inside a double-quoted value
+    SCANNING_COMMENT = 8  # inside a "%" comment
+    SCANNING_ESCAPED = 9  # just after a backslash within a quoted value
+    DONE = 10  # the closing "}" has been seen
+
+
+REGEX_SPACE = re.compile(r"\s")  # one whitespace character
+REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]")  # ends a name
+REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]")  # allowed in keys and bare values
+
+
+class TokenState:
+    """Collects ``(start, end, type)`` spans recorded while scanning attributes."""
+
+    def __init__(self):
+        self._tokens = []
+        self.start: int = 0  # offset at which the item being scanned began
+
+    def set_start(self, start: int) -> None:
+        self.start = start
+
+    def append(self, start: int, end: int, ttype: str):
+        self._tokens.append((start, end, ttype))
+
+    def compile(self, string: str) -> dict[str, str]:
+        """Resolve the recorded spans against ``string`` into an attribute dict."""
+        attributes = {}
+        classes = []
+        spans, cursor = self._tokens, 0
+        while cursor < len(spans):
+            start, end, ttype = spans[cursor]
+            cursor += 1
+            text = string[start:end]
+            if ttype == "id":
+                attributes["id"] = text
+            elif ttype == "class":
+                classes.append(text)
+            elif ttype == "key" and cursor < len(spans) and spans[cursor][2] == "value":
+                start, end, _ = spans[cursor]
+                cursor += 1  # the value span is consumed together with its key
+                if text == "class":
+                    classes.append(string[start:end])
+                else:
+                    attributes[text] = string[start:end]
+        if classes:
+            attributes["class"] = " ".join(classes)
+        return attributes
+
+    def __str__(self) -> str:
+        return str(self._tokens)
+
+    def __repr__(self) -> str:
+        return repr(self._tokens)
+
+
+class ParseError(Exception):
+    def __init__(self, msg: str, pos: int) -> None:
+        super().__init__(f"{msg} at position {pos}")
+        self.pos = pos  # offset in the parsed string at which parsing failed
+
+
+def parse(string: str) -> tuple[int, dict[str, str]]:
+    """Parse attributes from start of string.
+
+    :returns: (index of the closing ``}``, dict of attributes)
+    :raises ParseError: on invalid syntax, or if no closing ``}`` is found
+    """
+    state: State = State.START
+    tokens = TokenState()
+    for pos, char in enumerate(string):
+        state = HANDLERS[state](char, pos, tokens)
+        if state == State.DONE:
+            return pos, tokens.compile(string)
+    # Input ran out before the closing brace: reject it instead of reporting a
+    # partial match, which would let callers consume text past the real end.
+    raise ParseError("Attributes must end with '}'", len(string))
+
+
+def handle_start(char: str, pos: int, tokens: TokenState) -> State:
+    """Require the opening '{'; any other first character is a parse error."""
+    if char != "{":
+        raise ParseError("Attributes must start with '{'", pos)
+    return State.SCANNING
+
+
+def handle_scanning(char: str, pos: int, tokens: TokenState) -> State:
+    """Classify the character that opens the next item inside the braces."""
+    if char in (" ", "\t", "\n", "\r"):
+        return State.SCANNING
+    if char == "}":
+        return State.DONE
+
+    # Sigils that open an id, a comment or a class respectively.
+    follow_up = {
+        "#": State.SCANNING_ID,
+        "%": State.SCANNING_COMMENT,
+        ".": State.SCANNING_CLASS,
+    }.get(char)
+    if follow_up is None and REGEX_KEY_CHARACTERS.fullmatch(char):
+        follow_up = State.SCANNING_KEY
+    if follow_up is None:
+        raise ParseError(f"Unexpected character whilst scanning: {char}", pos)
+    # Record where the item starts so its span can be emitted when it ends.
+    tokens.set_start(pos)
+    return follow_up
+
+
+def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State:
+    """Skip comment text; the next '%' closes it and '}' closes the whole block."""
+    if char == "%":
+        return State.SCANNING
+    # An unclosed comment must not swallow the closing brace of the attributes.
+    return State.DONE if char == "}" else State.SCANNING_COMMENT
+
+
+def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State:
+    """Extend a ``#name`` identifier until whitespace or ``}`` ends it."""
+    if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
+        return State.SCANNING_ID
+
+    closes_block = char == "}"
+    # Any other punctuation can neither continue nor end the name.
+    if not (closes_block or REGEX_SPACE.fullmatch(char)):
+        raise ParseError(f"Unexpected character whilst scanning id: {char}", pos)
+
+    # `tokens.start` is the "#" itself; a lone "#" produces no token.
+    name_start = tokens.start + 1
+    if pos > name_start:
+        tokens.append(name_start, pos, "id")
+
+    return State.DONE if closes_block else State.SCANNING
+
+
+def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State:
+    """Extend a ``.name`` class until whitespace or ``}`` ends it."""
+    if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
+        return State.SCANNING_CLASS
+
+    closes_block = char == "}"
+    # Any other punctuation can neither continue nor end the name.
+    if not (closes_block or REGEX_SPACE.fullmatch(char)):
+        raise ParseError(f"Unexpected character whilst scanning class: {char}", pos)
+
+    # `tokens.start` is the "." itself; a lone "." produces no token.
+    name_start = tokens.start + 1
+    if pos > name_start:
+        tokens.append(name_start, pos, "class")
+
+    return State.DONE if closes_block else State.SCANNING
+
+
+def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State:
+    """Extend a key until '=' completes it; a key without '=' is an error."""
+    if REGEX_KEY_CHARACTERS.fullmatch(char):
+        return State.SCANNING_KEY
+
+    if char != "=":
+        raise ParseError(f"Unexpected character whilst scanning key: {char}", pos)
+
+    tokens.append(tokens.start, pos, "key")
+    return State.SCANNING_VALUE
+
+
+def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State:
+    """Pick quoted or bare scanning from the first character after '='."""
+    if char == '"':
+        following = State.SCANNING_QUOTED_VALUE
+    elif REGEX_KEY_CHARACTERS.fullmatch(char):
+        following = State.SCANNING_BARE_VALUE
+    else:
+        raise ParseError(f"Unexpected character whilst scanning value: {char}", pos)
+    # For a quoted value the recorded start is the quote itself, not the content.
+    tokens.set_start(pos)
+    return following
+
+
+def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State:
+    """Extend an unquoted value until whitespace or ``}`` ends it.
+
+    Anything that is neither a value character nor such a terminator is an error.
+    """
+    if REGEX_KEY_CHARACTERS.fullmatch(char):
+        return State.SCANNING_BARE_VALUE
+
+    if char == "}" or REGEX_SPACE.fullmatch(char):
+        # The terminating character at `pos` is not part of the value span.
+        tokens.append(tokens.start, pos, "value")
+        return State.DONE if char == "}" else State.SCANNING
+
+    raise ParseError(f"Unexpected character whilst scanning bare value: {char}", pos)
+
+
+def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State:
+    return State.SCANNING_QUOTED_VALUE  # the escaped character is skipped unexamined
+
+
+def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State:
+    """Extend a quoted value until its closing double quote; braces are rejected."""
+    if char == '"':
+        # `tokens.start` is the opening quote, so the content begins one past it.
+        tokens.append(tokens.start + 1, pos, "value")
+        return State.SCANNING
+
+    if char == "\\":
+        return State.SCANNING_ESCAPED
+
+    if char == "{" or char == "}":
+        raise ParseError(
+            f"Unexpected character whilst scanning quoted value: {char}", pos
+        )
+
+    # Newlines get no special treatment: emitting a span at each newline left
+    # `TokenState.compile` with a truncated value, because it only uses the
+    # first value span that follows a key.
+    return State.SCANNING_QUOTED_VALUE
+
+
+HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = {
+    State.START: handle_start,
+    State.SCANNING: handle_scanning,
+    State.SCANNING_COMMENT: handle_scanning_comment,
+    State.SCANNING_ID: handle_scanning_id,
+    State.SCANNING_CLASS: handle_scanning_class,
+    State.SCANNING_KEY: handle_scanning_key,
+    State.SCANNING_VALUE: handle_scanning_value,
+    State.SCANNING_BARE_VALUE: handle_scanning_bare_value,
+    State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value,
+    State.SCANNING_ESCAPED: handle_scanning_escaped,
+}  # no State.DONE entry: parse() returns as soon as that state is reached
diff --git a/tests/fixtures/attrs.md b/tests/fixtures/attrs.md
@@ -0,0 +1,46 @@
+simple image
+.
+![a](b){#id .a b=c}
+.
+<p><img src="b" alt="a" id="id" b="c" class="a"></p>
+.
+
+simple inline code
+.
+`a`{#id .a b=c}
+.
+<p><code id="id" b="c" class="a">a</code></p>
+.
+
+ignore if space
+.
+![a](b) {#id key="*"}
+.
+<p><img src="b" alt="a"> {#id key=&quot;*&quot;}</p>
+.
+
+ignore if text
+.
+![a](b)b{#id key="*"}
+.
+<p><img src="b" alt="a">b{#id key=&quot;*&quot;}</p>
+.
+
+multi-line
+.
+![a](b){
+    #id .a
+    b=c
+    }
+more
+.
+<p><img src="b" alt="a" id="id" b="c" class="a">
+more</p>
+.
+
+combined
+.
+![a](b){#a .a}{.b class=x other=h}{#x class="x g" other=a}
+.
+<p><img src="b" alt="a" id="x" class="a b x x g" other="a"></p>
+.
diff --git a/tests/test_attrs.py b/tests/test_attrs.py
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+from markdown_it import MarkdownIt
+from markdown_it.utils import read_fixture_file
+import pytest
+
+from mdit_py_plugins.attrs import attrs_plugin
+
+FIXTURE_PATH = Path(__file__).parent / "fixtures" / "attrs.md"
+
+
+@pytest.mark.parametrize("line,title,input,expected", read_fixture_file(FIXTURE_PATH))
+def test_fixture(line, title, input, expected):
+    parser = MarkdownIt("commonmark").use(attrs_plugin)
+    parser.options["xhtmlOut"] = False
+    rendered = parser.render(input)
+    print(rendered)  # captured by pytest; surfaces the actual HTML when a case fails
+    assert rendered.rstrip() == expected.rstrip()