"""Module for reading notebook formats from a string input."""
from __future__ import annotations
import dataclasses as dc
from functools import partial
import json
from pathlib import Path
from typing import Callable, Iterator
from docutils.parsers.rst import Directive
from markdown_it.renderer import RendererHTML
from myst_parser.config.main import MdParserConfig
from myst_parser.parsers.mdit import create_md_parser
import nbformat as nbf
import yaml
from myst_nb.core.config import NbParserConfig
from myst_nb.core.loggers import DocutilsDocLogger, SphinxDocLogger
NOTEBOOK_VERSION = 4
"""The notebook version that readers should return."""
[文档]
@dc.dataclass()
class NbReader:
"""A data class for reading a notebook format."""
read: Callable[[str], nbf.NotebookNode]
"""The function to read a notebook from a (utf8) string."""
md_config: MdParserConfig
"""The configuration for parsing markdown cells."""
read_fmt: dict | None = dc.field(default=None)
"""The type of the reader, if known."""
def standard_nb_read(text: str) -> nbf.NotebookNode:
"""Read a standard .ipynb notebook from a string."""
return nbf.reads(text, as_version=NOTEBOOK_VERSION)
[文档]
def create_nb_reader(
path: str,
md_config: MdParserConfig,
nb_config: NbParserConfig,
content: None | str | Iterator[str],
) -> NbReader | None:
"""Create a notebook reader, given a string, source path and configuration.
Note, we do not directly parse to a notebook, since jupyter-cache functionality
requires the reader.
:param path: Path to the input source being processed.
:param nb_config: The configuration for parsing Notebooks.
:param md_config: The default configuration for parsing Markown.
:param content: The input string (optionally used to check for text-based notebooks)
:returns: the notebook reader, and the (potentially modified) MdParserConfig,
or None if the input cannot be read as a notebook.
"""
try:
from sphinx.util._importer import import_object
except ImportError:
from sphinx.util import import_object
# get all possible readers
readers = nb_config.custom_formats.copy()
# add the default reader
readers.setdefault(".ipynb", (standard_nb_read, {}, False)) # type: ignore
# we check suffixes ordered by longest first, to ensure we get the "closest" match
iterator = sorted(readers.items(), key=lambda x: len(x[0]), reverse=True)
for suffix, (reader, reader_kwargs, commonmark_only) in iterator:
if str(Path(path)).endswith(suffix):
if isinstance(reader, str):
# attempt to load the reader as an object path
reader = import_object(reader)
if commonmark_only:
# Markdown cells should be read as Markdown only
md_config = dc.replace(md_config, commonmark_only=True)
return NbReader(partial(reader, **(reader_kwargs or {})), md_config) # type: ignore
# a Markdown file is a special case, since we only treat it as a notebook,
# if it starts with certain "top-matter"
if content is not None and is_myst_markdown_notebook(content):
return NbReader(
partial(
read_myst_markdown_notebook,
config=md_config,
add_source_map=True,
path=path,
),
md_config,
{"type": "plugin", "name": "myst_nb_md"},
)
# if we get here, we did not find a reader
return None
[文档]
def is_myst_markdown_notebook(text: str | Iterator[str]) -> bool:
"""Check if the input is a MyST Markdown notebook.
This is identified by the presence of a top-matter section, containing either::
---
file_format: mystnb
---
or::
---
jupytext:
text_representation:
format_name: myst
---
:param text: The input text.
:returns: True if the input is a markdown notebook.
"""
if isinstance(text, str):
if not text.startswith("---"): # skip creating the line list in memory
return False
text = (line for line in text.splitlines())
try:
if not next(text).startswith("---"):
return False
except StopIteration:
return False
top_matter = []
for line in text:
if line.startswith("---") or line.startswith("..."):
break
top_matter.append(line.rstrip() + "\n")
try:
metadata = yaml.safe_load("".join(top_matter))
assert isinstance(metadata, dict)
except Exception:
return False
if "file_format" in metadata and metadata["file_format"] == "mystnb":
return True
if (
metadata.get("jupytext", {})
.get("text_representation", {})
.get("format_name", None)
!= "myst"
):
return False
return True
# TODO move this to reader, since not strictly part of function objective
# or just allow nbformat/nbclient to handle the failure
# if "name" not in metadata.get("kernelspec", {}):
# raise IOError(
# "A myst notebook text-representation requires " "kernelspec/name metadata"
# )
# if "display_name" not in metadata.get("kernelspec", {}):
# raise IOError(
# "A myst notebook text-representation requires "
# "kernelspec/display_name metadata"
# )
def myst_nb_reader_plugin(uri: str) -> nbf.NotebookNode:
"""Read a myst notebook from a string.
Used as plugin for jupyter-cache.
"""
return read_myst_markdown_notebook(
Path(uri).read_text("utf8"), add_source_map=True, path=uri
)
[文档]
def read_myst_markdown_notebook(
text,
config: MdParserConfig | None = None,
code_directive="{code-cell}",
raw_directive="{raw-cell}",
add_source_map=False,
path: str | Path | None = None,
) -> nbf.NotebookNode:
"""Convert text written in the myst format to a notebook.
:param text: the file text
:param code_directive: the name of the directive to search for containing code cells
:param raw_directive: the name of the directive to search for containing raw cells
:param add_source_map: add a `source_map` key to the notebook metadata,
which is a list of the starting source line number for each cell.
:param path: path to notebook (required for :load:)
:raises MystMetadataParsingError if the metadata block is not valid JSON/YAML
NOTE: we assume here that all of these directives are at the top-level,
i.e. not nested in other directives.
"""
config = config or MdParserConfig()
# parse markdown file up to the block level (i.e. don't worry about inline text)
inline_config = dc.replace(
config, disable_syntax=(list(config.disable_syntax) + ["inline"])
)
parser = create_md_parser(inline_config, RendererHTML)
tokens = parser.parse(text + "\n")
lines = text.splitlines()
md_start_line = 0
# get the document metadata
metadata_nb = {}
if tokens[0].type == "front_matter":
metadata = tokens.pop(0)
md_start_line = metadata.map[1] if metadata.map else 0
try:
metadata_nb = yaml.safe_load(metadata.content)
except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
raise MystMetadataParsingError(f"Notebook metadata: {error}")
# add missing display name to the metadata, as required by the nbformat schema:
# https://github.com/jupyter/nbformat/blob/f712d60f13c5b168313222cbf4bee7face98a081/nbformat/v4/nbformat.v4.5.schema.json#L16
if (
"kernelspec" in metadata_nb
and "name" in metadata_nb["kernelspec"]
and "display_name" not in metadata_nb["kernelspec"]
):
metadata_nb["kernelspec"]["display_name"] = metadata_nb["kernelspec"]["name"]
# create an empty notebook
nbf_version = nbf.v4
kwargs = {"metadata": nbf.from_dict(metadata_nb)}
notebook = nbf_version.new_notebook(**kwargs)
source_map = [] # this is a list of the starting line number for each cell
def _flush_markdown(start_line, token, md_metadata):
"""When we find a cell we check if there is preceding text.o"""
endline = token.map[0] if token else len(lines)
md_source = _strip_blank_lines("\n".join(lines[start_line:endline]))
meta = nbf.from_dict(md_metadata)
if md_source:
source_map.append(start_line)
notebook.cells.append(
nbf_version.new_markdown_cell(source=md_source, metadata=meta)
)
# iterate through the tokens to identify notebook cells
nesting_level = 0
md_metadata: dict = {}
for token in tokens:
nesting_level += token.nesting
if nesting_level != 0:
# we ignore fenced block that are nested, e.g. as part of lists, etc
continue
token_map = token.map or [0, 0]
if token.type == "fence" and token.info.startswith(code_directive):
_flush_markdown(md_start_line, token, md_metadata)
options, body_lines = _read_fenced_cell(token, len(notebook.cells), "Code")
# Parse :load: or load: tags and populate body with contents of file
if "load" in options:
body_lines = _load_code_from_file(
path, options["load"], token, body_lines
)
meta = nbf.from_dict(options)
source_map.append(token_map[0] + 1)
notebook.cells.append(
nbf_version.new_code_cell(source="\n".join(body_lines), metadata=meta)
)
md_metadata = {}
md_start_line = token_map[1]
elif token.type == "fence" and token.info.startswith(raw_directive):
_flush_markdown(md_start_line, token, md_metadata)
options, body_lines = _read_fenced_cell(token, len(notebook.cells), "Raw")
meta = nbf.from_dict(options)
source_map.append(token_map[0] + 1)
notebook.cells.append(
nbf_version.new_raw_cell(source="\n".join(body_lines), metadata=meta)
)
md_metadata = {}
md_start_line = token_map[1]
elif token.type == "myst_block_break":
_flush_markdown(md_start_line, token, md_metadata)
md_metadata = _read_cell_metadata(token, len(notebook.cells))
md_start_line = token_map[1]
_flush_markdown(md_start_line, None, md_metadata)
if add_source_map:
notebook.metadata["source_map"] = source_map
return notebook
class MystMetadataParsingError(Exception):
"""Error when parsing metadata from myst formatted text"""
class _LoadFileParsingError(Exception):
"""Error when parsing files for code-blocks/code-cells"""
def _strip_blank_lines(text):
text = text.rstrip()
while text and text.startswith("\n"):
text = text[1:]
return text
class _MockDirective:
option_spec = {"options": True}
required_arguments = 0
optional_arguments = 1
has_content = True
def _read_fenced_cell(token, cell_index, cell_type):
from myst_parser.parsers.directives import parse_directive_text
result = parse_directive_text(
directive_class=_MockDirective,
first_line="",
content=token.content,
validate_options=False,
)
if result.warnings:
raise MystMetadataParsingError(
"{} cell {} at line {} could not be read: {}".format(
cell_type, cell_index, token.map[0] + 1, result.warnings[0]
)
)
return result.options, result.body
def _read_cell_metadata(token, cell_index):
metadata = {}
if token.content:
try:
metadata = json.loads(token.content.strip())
except Exception as err:
raise MystMetadataParsingError(
"Markdown cell {} at line {} could not be read: {}".format(
cell_index, token.map[0] + 1, err
)
)
if not isinstance(metadata, dict):
raise MystMetadataParsingError(
"Markdown cell {} at line {} is not a dict".format(
cell_index, token.map[0] + 1
)
)
return metadata
def _load_code_from_file(
nb_path: None | str | Path, file_name: str, token, body_lines: list[str]
):
"""load source code from a file."""
if nb_path is None:
raise _LoadFileParsingError("path to notebook not supplied for :load:")
file_path = Path(nb_path).parent.joinpath(file_name).resolve()
if len(body_lines):
pass # TODO this would make the reader dependent on sphinx
# line = token.map[0] if token.map else 0
# msg = (
# f"{nb_path}:{line} content of code-cell is being overwritten by "
# f":load: {file_name}"
# )
# LOGGER.warning(msg)
try:
body_lines = file_path.read_text().split("\n")
except Exception:
raise _LoadFileParsingError(f"Can't read file from :load: {file_path}")
return body_lines
class UnexpectedCellDirective(Directive):
"""The `{code-cell}`` and ``{raw-cell}`` directives, are special cases,
which are picked up by the MyST Markdown reader to convert them into notebooks.
If any are left in the parsed Markdown, it probably means that they were nested
inside another directive, which is not allowed.
Therefore, we log a warning if it is triggered, and discard it.
"""
optional_arguments = 1
final_argument_whitespace = True
has_content = True
def run(self):
"""Run the directive."""
message = (
"Found an unexpected `code-cell` or `raw-cell` directive. "
"Either this file was not converted to a notebook, "
"because Jupytext header content was missing, "
"or the `code-cell` was not converted, because it is nested. "
"See https://myst-nb.readthedocs.io/en/latest/use/markdown.html "
"for more information."
)
document = self.state.document
if hasattr(document.settings, "env"):
logger = SphinxDocLogger(document)
else:
logger = DocutilsDocLogger(document) # type: ignore
logger.warning(message, line=self.lineno, subtype="nbcell")
return []