# Source code for myst_parser.parsers.parse_html

"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.

The AST can also reproduce the HTML text.

Example::

    >> text = '<div class="note"><p>text</p></div>'
    >> ast = tokenize_html(text)
    >> list(ast.walk(include_self=True))
    [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')]
    >> str(ast)
    '<div class="note"><p>text</p></div>'
    >> str(ast[0][0])
    '<p>text</p>'

Note: optional tags are not accounted for
(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)

"""

from __future__ import annotations

import inspect
import itertools
from collections import abc, deque
from collections.abc import Callable, Iterable, Iterator
from html.parser import HTMLParser
from typing import Any



[文档]
class Attribute(dict):
    """Hold a tag's attributes, keyed by attribute name.

    Looking up a missing key returns ``""`` instead of raising ``KeyError``.
    An attribute written without a value (e.g. ``<input disabled>``) is
    stored with the value ``None`` and is rendered back as a bare name.
    """

    def __getitem__(self, key: str) -> str:
        """Return the stored value for ``key``, or ``''`` if the key is missing."""
        return self.get(key, "")

    @property
    def classes(self) -> list[str]:
        """Return the whitespace-separated 'class' attribute as a list.

        A missing, empty or valueless ``class`` attribute gives ``[]``.
        """
        # ``or ""`` covers a valueless ``class`` attribute, stored as None.
        return (self["class"] or "").split()

    def __str__(self) -> str:
        """Return the attributes formatted as they appear inside a start tag.

        Pairs are joined by single spaces as ``key="value"``; an attribute
        whose value is ``None`` is written as just ``key``.
        """
        return " ".join(
            key if value is None else f'{key}="{value}"'
            for key, value in self.items()
        )




[文档]
class Element(abc.MutableSequence):
    """An Element of the xml/html document.

    All xml/html entities inherit from this class.

    An element is a mutable sequence of its child elements and keeps a
    reference to its parent. Elements compare (and hash) by identity.
    """

    def __init__(self, name: str = "", attr: dict | None = None) -> None:
        """Initialise the element.

        :param name: the element (tag) name
        :param attr: the attributes, copied into a new ``Attribute`` mapping
        """
        self.name = name
        self.attrs: Attribute = Attribute(attr or {})
        self._parent: Element | None = None
        self._children: list[Element] = []

    @property
    def parent(self) -> Element | None:
        """Return the parent element, or None if no parent has been set."""
        return self._parent

    @property
    def children(self) -> list[Element]:
        """Return a copy of the list of children."""
        return self._children[:]

    def _check_can_adopt(self, item: Any) -> None:
        """Raise ``AssertionError`` unless ``item`` may become a child of self."""
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with ``-O``.
        if not isinstance(item, Element):
            raise AssertionError(f"item is not an Element: {item!r}")
        if item._parent is not None and item._parent is not self:
            raise AssertionError(f"different parent already set for: {item!r}")

    def reset_children(self, children: list[Element], deepcopy: bool = False):
        """Replace all children of this element.

        :param children: the new children, in order
        :param deepcopy: if True, adopt deep copies of the items
            instead of the items themselves
        :raises AssertionError: if an item is not an ``Element`` or already
            has a different parent; in that case nothing is modified
        """
        new_children = []
        for i, item in enumerate(children):
            if not isinstance(item, Element):
                raise AssertionError(f"item {i} is not an Element: {item!r}")
            if deepcopy:
                item = item.deepcopy()
            if item._parent is not None and item._parent is not self:
                raise AssertionError(f"different parent already set for item {i}")
            new_children.append(item)
        # Parents are assigned only after every item has been validated, so a
        # failure above leaves both this element and the items untouched.
        for item in new_children:
            item._parent = self
        self._children = new_children

    def __getitem__(self, index: int) -> Element:  # type: ignore[override]
        return self._children[index]

    def __setitem__(self, index: int, item: Element):  # type: ignore[override]
        self._check_can_adopt(item)
        # Store first, so that a bad index raises before the parent is changed.
        self._children[index] = item
        item._parent = self

    def __delitem__(self, index: int):  # type: ignore[override]
        del self._children[index]

    def __len__(self) -> int:
        return len(self._children)

    def __iter__(self) -> Iterator[Element]:
        yield from self._children

    def insert(self, index: int, item: Element):
        """Insert ``item`` as a child before position ``index``.

        :raises AssertionError: if ``item`` is not an ``Element`` or already
            has a different parent
        """
        self._check_can_adopt(item)
        item._parent = self
        self._children.insert(index, item)

    def deepcopy(self) -> Element:
        """Recursively copy and remove parent."""
        _copy = self.__class__(self.name, self.attrs)
        for child in self:
            _copy.append(child.deepcopy())
        return _copy

    def __repr__(self) -> str:
        text = f"{self.__class__.__name__}({self.name!r}"
        if self.attrs:
            text += f", {self.attrs!r}"
        return text + ")"

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Returns a HTML string representation of the element.

        :param tag_overrides: Provide a dictionary of render function
            for specific tag names, to override the normal render format

        :raises NotImplementedError: always; subclasses provide the rendering
        """
        raise NotImplementedError

    def __str__(self) -> str:
        return self.render()

    def __eq__(self, item: Any) -> bool:
        return item is self

    # Overriding ``__eq__`` implicitly sets ``__hash__`` to None; equality is
    # identity here, so the identity-based hash of ``object`` stays valid.
    __hash__ = object.__hash__

    def walk(self, include_self: bool = False) -> Iterator[Element]:
        """Walk through the xml/html AST.

        Yields each descendant, a parent always before its own children.

        :param include_self: if True, yield this element first
        """
        if include_self:
            yield self
        for child in self:
            yield child
            yield from child.walk()

    def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
        """Return copy with all `Data` tokens
        that only contain whitespace / newlines removed.

        :param inplace: if True, modify and return this element
            instead of a deep copy
        :param recurse: if True, also strip all descendants
        """
        element = self if inplace else self.deepcopy()
        element.reset_children(
            [
                e
                for e in element.children
                if not (isinstance(e, Data) and e.data.strip() == "")
            ]
        )
        if recurse:
            for child in element:
                child.strip(inplace=True, recurse=True)
        return element

    def find(
        self,
        identifier: str | type[Element],
        attrs: dict | None = None,
        classes: Iterable[str] | None = None,
        include_self: bool = False,
        recurse: bool = True,
    ) -> Iterator[Element]:
        """Find all elements that match name and specific attributes.

        :param identifier: an element name, or an ``Element`` class that
            matching elements must be instances of
        :param attrs: attribute values that must all be equal on a match
        :param classes: class names that must all be present on a match
        :param include_self: if True, this element is also a candidate
        :param recurse: if True search all descendants,
            otherwise only the direct children
        """
        candidates: Iterable[Element] = self.walk() if recurse else self
        if include_self:
            candidates = itertools.chain([self], candidates)

        if inspect.isclass(identifier):

            def matches(element: Element) -> bool:
                return isinstance(element, identifier)

        else:

            def matches(element: Element) -> bool:
                return element.name == identifier

        required = set(classes) if classes is not None else None
        expected = attrs or {}
        for candidate in candidates:
            if not matches(candidate):
                continue
            if required is not None and not required.issubset(
                candidate.attrs.classes
            ):
                continue
            if any(candidate.attrs[key] != value for key, value in expected.items()):
                continue
            yield candidate





[文档]
class Root(Element):
    """The root of the AST tree."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the rendered children concatenated in order.

        All keyword arguments are forwarded unchanged to each child's
        ``render`` method.
        """
        rendered = [child.render(**kwargs) for child in self]
        return "".join(rendered)





[文档]
class Tag(Element):
    """Represent xml/html tags under the form: <name key="value" ...> ... </name>."""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the start tag, the rendered children and the end tag.

        :param tag_overrides: render functions keyed by tag name; if this
            tag's name is a key, the function is called with this element and
            the overrides dictionary, and its result is returned instead
        """
        if tag_overrides and self.name in tag_overrides:
            override = tag_overrides[self.name]
            return override(self, tag_overrides)

        separator = " " if self.attrs else ""
        pieces = [f"<{self.name}{separator}{self.attrs}>"]
        pieces.extend(
            child.render(tag_overrides=tag_overrides, **kwargs) for child in self
        )
        pieces.append(f"</{self.name}>")
        return "".join(pieces)





[文档]
class XTag(Element):
    """Represent XHTML style tags with no children, like `<img src="t.gif" />`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the self-closing form of the tag, ``<name ... />``.

        :param tag_overrides: render functions keyed by tag name; if this
            tag's name is a key, the function is called with this element and
            the overrides dictionary, and its result is returned instead
        """
        if tag_overrides is not None and self.name in tag_overrides:
            override = tag_overrides[self.name]
            return override(self, tag_overrides)

        separator = " " if self.attrs else ""
        return f"<{self.name}{separator}{self.attrs}/>"





[文档]
class VoidTag(Element):
    """Represent tags with no children, only start tag, like `<img src="t.gif" >`"""

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Return the start tag only, ``<name ...>``.

        :param tag_overrides: render functions keyed by tag name; if this
            tag's name is a key, the function is called with this element and
            the overrides dictionary, and its result is returned instead
        """
        # Honour overrides the same way ``Tag`` and ``XTag`` do, so that an
        # override for e.g. "img" applies to `<img>` as well as to `<img/>`.
        if tag_overrides is not None and self.name in tag_overrides:
            return tag_overrides[self.name](self, tag_overrides)
        return f"<{self.name}{' ' if self.attrs else ''}{self.attrs}>"





[文档]
class TerminalElement(Element):
    """Base class for elements whose content is a single string, ``data``.

    The element name is always the empty string.
    """

    def __init__(self, data: str):
        """Initialise the element with its string content."""
        super().__init__("")
        self.data: str = data

    def __repr__(self) -> str:
        # Content longer than 20 characters is cut to 17 plus an ellipsis.
        shown = self.data if len(self.data) <= 20 else self.data[:17] + "..."
        return f"{type(self).__name__}({shown!r})"

    def deepcopy(self) -> TerminalElement:
        """Return a new instance of the same class with the same data.

        The copy has no parent; only ``data`` is carried over.
        """
        return type(self)(self.data)





[文档]
class Data(TerminalElement):
    """Represent data inside xml/html documents, like raw text."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the text unchanged; keyword arguments are ignored."""
        text = self.data
        return text





[文档]
class Declaration(TerminalElement):
    """Represent declarations, like `<!DOCTYPE html>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped in ``<!`` and ``>``; keyword arguments are ignored."""
        return "<!{}>".format(self.data)





[文档]
class Comment(TerminalElement):
    """Represent HTML comments"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped in ``<!--`` and ``-->``; keyword arguments are ignored."""
        return "<!--{}-->".format(self.data)





[文档]
class Pi(TerminalElement):
    """Represent processing instructions like `<?xml-stylesheet ?>`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped in ``<?`` and ``>``; keyword arguments are ignored."""
        return "<?{}>".format(self.data)





[文档]
class Char(TerminalElement):
    """Represent character codes like: `&#0`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped in ``&#`` and ``;``; keyword arguments are ignored."""
        return "&#{};".format(self.data)





[文档]
class Entity(TerminalElement):
    """Represent entities like `&amp`"""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the data wrapped in ``&`` and ``;``; keyword arguments are ignored."""
        return "&{};".format(self.data)





[文档]
class Tree:
    """The engine class to generate the AST tree.

    ``outmost`` is the ``Root`` of the tree being built. ``stack`` holds the
    currently open elements, with the root at the bottom and the innermost
    open element on top.
    """

    def __init__(self, name: str = ""):
        """Initialise Tree

        :param name: the name given to the ``Root`` element
        """
        self.name = name
        self.outmost = Root(name)
        self.stack: deque = deque([self.outmost])

    def clear(self):
        """Clear the outmost and stack for a new parsing."""
        self.outmost = Root(self.name)
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self) -> Element:
        """Return the last pointer which point to the actual tag scope."""
        return self.stack[-1]

    def nest_tag(self, name: str, attrs: dict):
        """Nest a given tag at the bottom of the tree using
        the last stack's pointer.

        The new tag becomes the current scope.
        """
        item = Tag(name, attrs)
        self.last().append(item)
        self.stack.append(item)

    def nest_xtag(self, name: str, attrs: dict):
        """Nest an XTag onto the tree."""
        self.last().append(XTag(name, attrs))

    def nest_vtag(self, name: str, attrs: dict):
        """Nest a VoidTag onto the tree."""
        self.last().append(VoidTag(name, attrs))

    def nest_terminal(self, klass: type[TerminalElement], data: str):
        """Nest the data onto the tree.

        :param klass: the class instantiated with ``data``
        :param data: the content passed to ``klass``
        """
        self.last().append(klass(data))

    def enclose(self, name: str):
        """When a closing tag is found, pop the pointer's scope from the stack,
        to then point to the earlier scope's tag.

        The innermost open element called ``name`` is closed together with
        everything opened inside it. A closing tag that matches no open
        element is ignored.
        """
        # The root at the bottom of the stack is not an opened tag: it is
        # excluded from the search so that it can never be popped, even when
        # its name equals ``name``.
        open_scopes = list(self.stack)[1:]
        for depth, scope in enumerate(reversed(open_scopes), start=1):
            if scope.name == name:
                # Pop the matching element and all elements nested within it.
                for _ in range(depth):
                    self.stack.pop()
                return





[文档]
class HtmlToAst(HTMLParser):
    """The tokenizer class.

    Each ``handle_*`` callback of ``HTMLParser`` adds the corresponding
    element to the ``Tree`` held in ``struct``.
    """

    # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
    void_elements = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }

    def __init__(self, name: str = "", convert_charrefs: bool = False):
        """Initialise the parser.

        :param name: the name given to the tree built by this parser
        :param convert_charrefs: passed on to ``HTMLParser``
        """
        super().__init__(convert_charrefs=convert_charrefs)
        self.struct = Tree(name)

    def feed(self, source: str) -> Root:  # type: ignore[override]
        """Parse the source string.

        ``source`` is treated as one complete document: every call starts
        from a clean parser state and a new tree, and returns the root of
        that tree.
        """
        # Drop anything a previous call may have left in the base parser, so
        # it cannot be prepended to this source.
        self.reset()
        self.struct.clear()
        super().feed(source)
        # The base parser keeps trailing input it considers incomplete in its
        # buffer; ``close`` forces it to be processed so it is not lost.
        self.close()
        return self.struct.outmost

    def handle_starttag(self, name: str, attr):
        """When found an opening tag then nest it onto the tree.

        Names listed in ``void_elements`` are added as void tags, which do
        not open a new scope.
        """
        if name in self.void_elements:
            self.struct.nest_vtag(name, attr)
        else:
            self.struct.nest_tag(name, attr)

    def handle_startendtag(self, name: str, attr):
        """When found a XHTML tag style then nest it up to the tree."""
        self.struct.nest_xtag(name, attr)

    def handle_endtag(self, name: str):
        """When found a closing tag then makes it point to the right scope.

        Closing tags of void elements are ignored, since those never open
        a scope.
        """
        if name not in self.void_elements:
            self.struct.enclose(name)

    def handle_data(self, data: str):
        """Nest data onto the tree."""
        self.struct.nest_terminal(Data, data)

    def handle_decl(self, decl: str):
        """Nest a declaration onto the tree."""
        self.struct.nest_terminal(Declaration, decl)

    def unknown_decl(self, decl: str):
        """Nest an unrecognised declaration onto the tree, as a declaration."""
        self.struct.nest_terminal(Declaration, decl)

    def handle_charref(self, data: str):
        """Nest a character reference onto the tree."""
        self.struct.nest_terminal(Char, data)

    def handle_entityref(self, data: str):
        """Nest an entity reference onto the tree."""
        self.struct.nest_terminal(Entity, data)

    def handle_pi(self, data: str):
        """Nest a processing instruction onto the tree."""
        self.struct.nest_terminal(Pi, data)

    def handle_comment(self, data: str):
        """Nest a comment onto the tree."""
        self.struct.nest_terminal(Comment, data)





[文档]
def tokenize_html(text: str, name: str = "", convert_charrefs: bool = False) -> Root:
    """Parse ``text`` with a new ``HtmlToAst`` parser and return the AST root.

    :param text: the HTML source to parse
    :param name: passed to the parser as the name of the tree
    :param convert_charrefs: passed to the parser unchanged
    """
    return HtmlToAst(name, convert_charrefs=convert_charrefs).feed(text)