Source code for hs_py.encoding.trio

"""Trio text format parser and encoder.

Trio is a line-oriented format for hand-authoring Haystack data records.
Each record contains tag name-value pairs separated by lines of dashes.
Values are encoded in Zinc scalar format with Trio-specific extensions
(unquoted strings, ``true``/``false`` booleans).

See: https://project-haystack.org/doc/docHaystack/Trio
"""

from __future__ import annotations

from typing import Any

from hs_py.encoding.scanner import IDENT_CHARS, scan_val
from hs_py.grid import Grid
from hs_py.kinds import MARKER, Marker

# Public names of this module (what ``import *`` exports).
__all__ = [
    "encode_trio",
    "parse_trio",
    "parse_zinc_val",
]


# ---------------------------------------------------------------------------
# Public API — Decode
# ---------------------------------------------------------------------------


# Upper bound on the ``_depth`` of nested ``Trio:`` values; ``parse_trio``
# raises ``ValueError`` once its ``_depth`` argument exceeds this.
_MAX_TRIO_DEPTH = 32



[docs]
def parse_trio(text: str, *, _depth: int = 0) -> list[dict[str, Any]]:
    """Parse Trio text into a list of tag dicts.

    Each dict represents one record (separated by lines of ``---``).
    Supports multi-line string, Zinc, and Trio values via indented
    continuation lines.  Both ``\\n`` and ``\\r\\n`` line endings are
    accepted; the carriage return is never part of a parsed value.

    A tag line of the form ``name`` yields a marker, ``name:`` starts a
    multi-line string, ``name: Zinc:`` / ``name: Trio:`` start a nested
    multi-line grid / record list, and any other ``name: value`` is parsed
    as a Trio scalar.  A multi-line value ends at the first non-blank line
    that is not indented with a space or tab.

    :param text: Trio-formatted text.
    :param _depth: Internal nesting level of ``Trio:`` values; callers
        should leave it at the default.
    :returns: List of tag dicts, one per record.  Records without any tag
        are not emitted.
    :raises ValueError: If nesting depth exceeds limit, or a tag line does
        not start with a tag name.
    """
    if _depth > _MAX_TRIO_DEPTH:
        msg = "Maximum Trio nesting depth exceeded"
        raise ValueError(msg)
    records: list[dict[str, Any]] = []
    current: dict[str, Any] = {}
    ml_tag: str | None = None
    ml_lines: list[str] = []
    ml_mode: str = "string"  # "string", "zinc", or "trio"

    for raw_line in text.split("\n"):
        # Splitting on "\n" leaves the "\r" of CRLF input at the end of each
        # line.  Drop it here so it cannot leak into collected multi-line
        # content (string values, nested Zinc/Trio text).
        #
        # NOTE(review): comments are stripped from continuation lines as
        # well, so an unquoted ``//`` inside a multi-line string truncates
        # that line -- confirm this is intended.
        line = _strip_comment(raw_line.rstrip("\r"))
        stripped = line.strip()

        # Multi-line continuation: blank lines or indented lines.
        # Must be checked BEFORE separator so indented "  ---" inside
        # Trio:/Zinc: multi-line content is collected, not treated as
        # a record boundary.
        if ml_tag is not None:
            if not stripped:
                ml_lines.append("")
                continue
            if line[0] in " \t":
                ml_lines.append(line)
                continue
            # Non-indented, non-blank line ends multi-line mode.
            # Fall through to separator / tag-line handling below.
            _flush_multiline(current, ml_tag, ml_lines, ml_mode, _depth=_depth)
            ml_tag = None
            ml_lines = []
            ml_mode = "string"

        # Record separator: any line of only dashes
        if _is_separator(line):
            if current:
                records.append(current)
                current = {}
            continue

        if not stripped:
            continue

        # Parse tag line
        name, val_str = _parse_tag_line(stripped)
        if val_str is None:
            # Bare name without a colon: marker tag.
            current[name] = MARKER
        elif val_str == "":
            # Start multi-line string
            ml_tag = name
            ml_mode = "string"
        elif val_str == "Zinc:":
            # Start multi-line Zinc data
            ml_tag = name
            ml_mode = "zinc"
        elif val_str == "Trio:":
            # Start multi-line Trio data
            ml_tag = name
            ml_mode = "trio"
        else:
            current[name] = _parse_trio_val(val_str)

    # Finalize last record (a multi-line value may still be open at EOF).
    _flush_multiline(current, ml_tag, ml_lines, ml_mode, _depth=_depth)
    if current:
        records.append(current)

    return records




[docs]
def parse_zinc_val(text: str) -> Any:
    """Decode a single scalar written in Zinc syntax.

    Only Zinc syntax is understood here; the Trio-only conveniences
    (unquoted strings, ``true``/``false``) are handled by
    :func:`parse_trio`.

    Surrounding whitespace is ignored.  The value is scanned from the start
    of the trimmed text and the end position reported by the scanner is
    discarded.

    :param text: Zinc value text.
    :returns: Parsed Haystack value, or ``None`` when *text* is empty or
        whitespace only.
    """
    trimmed = text.strip()
    if trimmed == "":
        return None
    parsed, _end = scan_val(trimmed, 0)
    return parsed



# ---------------------------------------------------------------------------
# Public API — Encode
# ---------------------------------------------------------------------------



[docs]
def encode_trio(records: list[dict[str, Any]]) -> str:
    """Encode a list of tag dicts as Trio text.

    Every record starts with a ``---`` separator line followed by one entry
    per tag:

    - :class:`~hs_py.kinds.Marker` values are written as the bare tag name.
    - Strings containing a newline are written as ``name:`` followed by the
      string's lines indented by two spaces (empty lines stay empty).
    - Nested :class:`~hs_py.grid.Grid` values are written as ``name: Zinc:``
      followed by the indented Zinc encoding of the grid.
    - Non-empty lists whose first item is a dict are written as
      ``name: Trio:`` followed by the indented Trio encoding of the list.
    - Everything else is written as ``name: <zinc scalar>``.

    :param records: List of tag dicts, one per record.
    :returns: Trio-formatted text with trailing newline.
    """
    from hs_py.encoding.zinc import encode_grid as _zinc_encode_grid
    from hs_py.encoding.zinc import encode_val as _zinc_encode_val

    parts: list[str] = []
    for rec in records:
        lines: list[str] = ["---"]
        for name, val in rec.items():
            if isinstance(val, Marker):
                lines.append(name)
            elif isinstance(val, str) and "\n" in val:
                # Multi-line string
                lines.append(f"{name}:")
                lines.extend(f"  {ml}" if ml else "" for ml in val.split("\n"))
            elif isinstance(val, Grid):
                # Nested grid via Zinc: multi-line
                zinc_text = _zinc_encode_grid(val)
                lines.append(f"{name}: Zinc:")
                lines.extend(f"  {ml}" for ml in zinc_text.split("\n"))
            elif isinstance(val, list) and val and isinstance(val[0], dict):
                # Nested records via Trio: multi-line.
                # Only the trailing newline of the nested text is dropped.
                # Interior empty lines belong to multi-line strings of the
                # nested records and must survive, otherwise those strings
                # lose their blank lines on the way back through parse_trio.
                trio_text = encode_trio(val).rstrip("\n")
                lines.append(f"{name}: Trio:")
                lines.extend(f"  {ml}" if ml else "" for ml in trio_text.split("\n"))
            else:
                lines.append(f"{name}: {_zinc_encode_val(val)}")
        parts.append("\n".join(lines))
    return "\n".join(parts) + "\n"



# ---------------------------------------------------------------------------
# Trio-specific value parsing
# ---------------------------------------------------------------------------


def _parse_trio_val(text: str) -> Any:
    """Parse a Trio value with Zinc syntax and unquoted string fallback.

    Extends Zinc parsing with:
    - ``true``/``false`` boolean keywords
    - Unquoted string fallback when Zinc parsing doesn't consume the full value
    """
    text = text.strip()
    if not text:
        return None

    # Trio-specific boolean keywords
    if text == "true":
        return True
    if text == "false":
        return False

    # Try Zinc parsing
    try:
        val, end = scan_val(text, 0)
        # If fully consumed, use the parsed value
        if not text[end:].strip():
            return val
    except ValueError:
        pass

    # Fall back to unquoted string
    return text


# ---------------------------------------------------------------------------
# Line-level helpers
# ---------------------------------------------------------------------------


def _is_separator(line: str) -> bool:
    """Check if a line is a record separator (one or more dashes)."""
    stripped = line.strip()
    return bool(stripped) and all(c == "-" for c in stripped)


def _strip_comment(line: str) -> str:
    """Strip ``//`` comment from a line, respecting quoted strings and URIs."""
    in_str = False
    in_uri = False
    i = 0
    while i < len(line):
        ch = line[i]
        if in_str:
            if ch == "\\":
                i += 2
                continue
            if ch == '"':
                in_str = False
        elif in_uri:
            if ch == "`":
                in_uri = False
        else:
            if ch == '"':
                in_str = True
            elif ch == "`":
                in_uri = True
            elif ch == "/" and i + 1 < len(line) and line[i + 1] == "/":
                return line[:i]
        i += 1
    return line


def _parse_tag_line(line: str) -> tuple[str, str | None]:
    """Parse a tag line into ``(name, value_str)`` or ``(name, None)``."""
    i = 0
    while i < len(line) and line[i] in IDENT_CHARS:
        i += 1
    name = line[:i]
    if not name:
        msg = f"Expected tag name: {line!r}"
        raise ValueError(msg)

    # Look for colon (skip optional whitespace between name and colon)
    j = i
    while j < len(line) and line[j] == " ":
        j += 1
    if j < len(line) and line[j] == ":":
        val_str = line[j + 1 :].strip()
        return name, val_str
    return name, None


def _flush_multiline(
    current: dict[str, Any],
    tag: str | None,
    lines: list[str],
    mode: str,
    *,
    _depth: int = 0,
) -> None:
    """Finalize a multi-line value and add it to the current record."""
    if tag is None:
        return
    text = _join_multiline(lines)
    if mode == "zinc":
        from hs_py.encoding.zinc import decode_grid

        current[tag] = decode_grid(text)
    elif mode == "trio":
        current[tag] = parse_trio(text, _depth=_depth + 1)
    else:
        current[tag] = text


def _join_multiline(lines: list[str]) -> str:
    """Join multi-line string continuation lines, stripping common indent."""
    if not lines:
        return ""
    indents = [len(ln) - len(ln.lstrip()) for ln in lines if ln.strip()]
    min_indent = min(indents) if indents else 0
    stripped = [ln[min_indent:] if ln.strip() else "" for ln in lines]
    while stripped and not stripped[-1]:
        stripped.pop()
    return "\n".join(stripped)