Source code for markdown_it.tree

"""A tree representation of a linear markdown-it token stream.

This module is not part of upstream JavaScript markdown-it.
"""
from __future__ import annotations

from collections.abc import Generator, Sequence
import textwrap
from typing import Any, NamedTuple, TypeVar, overload

from .token import Token


class _NesterTokens(NamedTuple):
    opening: Token
    closing: Token


_NodeType = TypeVar("_NodeType", bound="SyntaxTreeNode")


[docs]class SyntaxTreeNode: """A Markdown syntax tree node. A class that can be used to construct a tree representation of a linear `markdown-it-py` token stream. Each node in the tree represents either: - root of the Markdown document - a single unnested `Token` - a `Token` "_open" and "_close" token pair, and the tokens nested in between """ def __init__( self, tokens: Sequence[Token] = (), *, create_root: bool = True ) -> None: """Initialize a `SyntaxTreeNode` from a token stream. If `create_root` is True, create a root node for the document. """ # Only nodes representing an unnested token have self.token self.token: Token | None = None # Only containers have nester tokens self.nester_tokens: _NesterTokens | None = None # Root node does not have self.parent self._parent: Any = None # Empty list unless a non-empty container, or unnested token that has # children (i.e. inline or img) self._children: list[Any] = [] if create_root: self._set_children_from_tokens(tokens) return if not tokens: raise ValueError( "Can only create root from empty token sequence." " Set `create_root=True`." ) elif len(tokens) == 1: inline_token = tokens[0] if inline_token.nesting: raise ValueError( "Unequal nesting level at the start and end of token stream." ) self.token = inline_token if inline_token.children: self._set_children_from_tokens(inline_token.children) else: self.nester_tokens = _NesterTokens(tokens[0], tokens[-1]) self._set_children_from_tokens(tokens[1:-1]) def __repr__(self) -> str: return f"{type(self).__name__}({self.type})" @overload def __getitem__(self: _NodeType, item: int) -> _NodeType: ... @overload def __getitem__(self: _NodeType, item: slice) -> list[_NodeType]: ... def __getitem__(self: _NodeType, item: int | slice) -> _NodeType | list[_NodeType]: return self.children[item]
[docs] def to_tokens(self: _NodeType) -> list[Token]: """Recover the linear token stream.""" def recursive_collect_tokens(node: _NodeType, token_list: list[Token]) -> None: if node.type == "root": for child in node.children: recursive_collect_tokens(child, token_list) elif node.token: token_list.append(node.token) else: assert node.nester_tokens token_list.append(node.nester_tokens.opening) for child in node.children: recursive_collect_tokens(child, token_list) token_list.append(node.nester_tokens.closing) tokens: list[Token] = [] recursive_collect_tokens(self, tokens) return tokens
@property def children(self: _NodeType) -> list[_NodeType]: return self._children @children.setter def children(self: _NodeType, value: list[_NodeType]) -> None: self._children = value @property def parent(self: _NodeType) -> _NodeType | None: return self._parent # type: ignore @parent.setter def parent(self: _NodeType, value: _NodeType | None) -> None: self._parent = value @property def is_root(self) -> bool: """Is the node a special root node?""" return not (self.token or self.nester_tokens) @property def is_nested(self) -> bool: """Is this node nested?. Returns `True` if the node represents a `Token` pair and tokens in the sequence between them, where `Token.nesting` of the first `Token` in the pair is 1 and nesting of the other `Token` is -1. """ return bool(self.nester_tokens) @property def siblings(self: _NodeType) -> Sequence[_NodeType]: """Get siblings of the node. Gets the whole group of siblings, including self. """ if not self.parent: return [self] return self.parent.children @property def type(self) -> str: """Get a string type of the represented syntax. - "root" for root nodes - `Token.type` if the node represents an unnested token - `Token.type` of the opening token, with "_open" suffix stripped, if the node represents a nester token pair """ if self.is_root: return "root" if self.token: return self.token.type assert self.nester_tokens return _removesuffix(self.nester_tokens.opening.type, "_open") @property def next_sibling(self: _NodeType) -> _NodeType | None: """Get the next node in the sequence of siblings. Returns `None` if this is the last sibling. """ self_index = self.siblings.index(self) if self_index + 1 < len(self.siblings): return self.siblings[self_index + 1] return None @property def previous_sibling(self: _NodeType) -> _NodeType | None: """Get the previous node in the sequence of siblings. Returns `None` if this is the first sibling. """ self_index = self.siblings.index(self) if self_index - 1 >= 0: return self.siblings[self_index - 1] return None def _add_child( self, tokens: Sequence[Token], ) -> None: """Make a child node for `self`.""" child = type(self)(tokens, create_root=False) child.parent = self self.children.append(child) def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None: """Convert the token stream to a tree structure and set the resulting nodes as children of `self`.""" reversed_tokens = list(reversed(tokens)) while reversed_tokens: token = reversed_tokens.pop() if not token.nesting: self._add_child([token]) continue if token.nesting != 1: raise ValueError("Invalid token nesting") nested_tokens = [token] nesting = 1 while reversed_tokens and nesting: token = reversed_tokens.pop() nested_tokens.append(token) nesting += token.nesting if nesting: raise ValueError(f"unclosed tokens starting {nested_tokens[0]}") self._add_child(nested_tokens)
[docs] def pretty( self, *, indent: int = 2, show_text: bool = False, _current: int = 0 ) -> str: """Create an XML style string of the tree.""" prefix = " " * _current text = prefix + f"<{self.type}" if not self.is_root and self.attrs: text += " " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items()) text += ">" if ( show_text and not self.is_root and self.type in ("text", "text_special") and self.content ): text += "\n" + textwrap.indent(self.content, prefix + " " * indent) for child in self.children: text += "\n" + child.pretty( indent=indent, show_text=show_text, _current=_current + indent ) return text
[docs] def walk( self: _NodeType, *, include_self: bool = True ) -> Generator[_NodeType, None, None]: """Recursively yield all descendant nodes in the tree starting at self. The order mimics the order of the underlying linear token stream (i.e. depth first). """ if include_self: yield self for child in self.children: yield from child.walk(include_self=True)
# NOTE: # The values of the properties defined below directly map to properties # of the underlying `Token`s. A root node does not translate to a `Token` # object, so calling these property getters on a root node will raise an # `AttributeError`. # # There is no mapping for `Token.nesting` because the `is_nested` property # provides that data, and can be called on any node type, including root. def _attribute_token(self) -> Token: """Return the `Token` that is used as the data source for the properties defined below.""" if self.token: return self.token if self.nester_tokens: return self.nester_tokens.opening raise AttributeError("Root node does not have the accessed attribute") @property def tag(self) -> str: """html tag name, e.g. \"p\" """ return self._attribute_token().tag @property def attrs(self) -> dict[str, str | int | float]: """Html attributes.""" return self._attribute_token().attrs
[docs] def attrGet(self, name: str) -> None | str | int | float: """Get the value of attribute `name`, or null if it does not exist.""" return self._attribute_token().attrGet(name)
@property def map(self) -> tuple[int, int] | None: """Source map info. Format: `tuple[ line_begin, line_end ]`""" map_ = self._attribute_token().map if map_: # Type ignore because `Token`s attribute types are not perfect return tuple(map_) # type: ignore return None @property def level(self) -> int: """nesting level, the same as `state.level`""" return self._attribute_token().level @property def content(self) -> str: """In a case of self-closing tag (code, html, fence, etc.), it has contents of this tag.""" return self._attribute_token().content @property def markup(self) -> str: """'*' or '_' for emphasis, fence string for fence, etc.""" return self._attribute_token().markup @property def info(self) -> str: """fence infostring""" return self._attribute_token().info @property def meta(self) -> dict[Any, Any]: """A place for plugins to store an arbitrary data.""" return self._attribute_token().meta @property def block(self) -> bool: """True for block-level tokens, false for inline tokens.""" return self._attribute_token().block @property def hidden(self) -> bool: """If it's true, ignore this element when rendering. Used for tight lists to hide paragraphs.""" return self._attribute_token().hidden
def _removesuffix(string: str, suffix: str) -> str: """Remove a suffix from a string. Replace this with str.removesuffix() from stdlib when minimum Python version is 3.9. """ if suffix and string.endswith(suffix): return string[: -len(suffix)] return string