"""Trio text format parser and encoder.
Trio is a line-oriented format for hand-authoring Haystack data records.
Each record contains tag name-value pairs separated by lines of dashes.
Values are encoded in Zinc scalar format with Trio-specific extensions
(unquoted strings, ``true``/``false`` booleans).
See: https://project-haystack.org/doc/docHaystack/Trio
"""
from __future__ import annotations
from typing import Any
from hs_py.encoding.scanner import IDENT_CHARS, scan_val
from hs_py.grid import Grid
from hs_py.kinds import MARKER, Marker
# Public API of this module: the names exported by a star-import.
__all__ = [
"encode_trio",
"parse_trio",
"parse_zinc_val",
]
# ---------------------------------------------------------------------------
# Public API — Decode
# ---------------------------------------------------------------------------
# Deepest level of nested ``Trio:`` values that parse_trio will recurse into;
# going beyond it raises ValueError instead of recursing further.
_MAX_TRIO_DEPTH = 32
[docs]
def parse_trio(text: str, *, _depth: int = 0) -> list[dict[str, Any]]:
    """Decode Trio text into a list of tag dicts, one per record.

    Records are delimited by lines made up solely of dashes.  A tag whose
    value is empty, ``Zinc:`` or ``Trio:`` opens a multi-line value that is
    continued by the following blank or indented lines and is decoded as a
    plain string, a Zinc grid, or nested Trio records respectively.

    :param text: Trio-formatted text.
    :param _depth: Current nesting level of ``Trio:`` values (internal).
    :returns: List of tag dicts, one per record.
    :raises ValueError: If nesting depth exceeds limit.
    """
    if _depth > _MAX_TRIO_DEPTH:
        raise ValueError("Maximum Trio nesting depth exceeded")

    # Value text that opens a multi-line block, mapped to the block's mode.
    block_modes = {"": "string", "Zinc:": "zinc", "Trio:": "trio"}

    result: list[dict[str, Any]] = []
    record: dict[str, Any] = {}
    pending_tag: str | None = None
    pending_lines: list[str] = []
    pending_mode: str = "string"

    for raw_line in text.split("\n"):
        line = _strip_comment(raw_line)
        content = line.strip()

        # Continuation handling comes first so that an indented line of
        # dashes inside a multi-line value is collected as content rather
        # than being mistaken for a record boundary.
        if pending_tag is not None:
            if not content or line[0] in " \t":
                pending_lines.append(line if content else "")
                continue
            # A non-blank line at column zero closes the multi-line value;
            # the line itself is then processed normally below.
            _flush_multiline(
                record, pending_tag, pending_lines, pending_mode, _depth=_depth
            )
            pending_tag, pending_lines, pending_mode = None, [], "string"

        if _is_separator(line):
            # Empty records (e.g. a leading separator) are not emitted.
            if record:
                result.append(record)
            record = {}
            continue

        if not content:
            continue

        name, val_str = _parse_tag_line(content)
        if val_str is None:
            # Bare tag name with no colon.
            record[name] = MARKER
        elif val_str in block_modes:
            pending_tag = name
            pending_mode = block_modes[val_str]
        else:
            record[name] = _parse_trio_val(val_str)

    # Close whatever is still open when the input ends.
    _flush_multiline(record, pending_tag, pending_lines, pending_mode, _depth=_depth)
    if record:
        result.append(record)
    return result
[docs]
def parse_zinc_val(text: str) -> Any:
    """Decode a single scalar written in Zinc syntax.

    Only Zinc syntax is understood here; the Trio-only extensions
    (unquoted strings, ``true``/``false``) are handled by
    :func:`parse_trio`.

    :param text: Zinc value text.
    :returns: Parsed Haystack value, or ``None`` when *text* is blank.
    """
    trimmed = text.strip()
    if trimmed == "":
        return None
    # The scanner also reports where it stopped; that position is not used.
    parsed, _end = scan_val(trimmed, 0)
    return parsed
# ---------------------------------------------------------------------------
# Public API — Encode
# ---------------------------------------------------------------------------
[docs]
def encode_trio(records: list[dict[str, Any]]) -> str:
    """Encode a list of tag dicts as Trio text.

    Each record starts with a ``---`` separator line.  Markers are written
    as the bare tag name.  Strings containing a newline, nested
    :class:`~hs_py.grid.Grid` values (via Zinc), and non-empty lists whose
    first item is a dict (nested records, via Trio) are written as indented
    continuation lines.  Every other value is written on one line using the
    Zinc scalar encoder.

    Blank lines inside nested Trio text are kept (as empty lines), so a
    multi-line string inside a nested record keeps its blank lines.

    :param records: List of tag dicts, one per record.
    :returns: Trio-formatted text with trailing newline.
    """
    from hs_py.encoding.zinc import encode_grid as _zinc_encode_grid
    from hs_py.encoding.zinc import encode_val as _zinc_encode_val

    parts: list[str] = []
    for rec in records:
        lines: list[str] = ["---"]
        for name, val in rec.items():
            if isinstance(val, Marker):
                lines.append(name)
            elif isinstance(val, str) and "\n" in val:
                # Multi-line string: blank lines stay empty rather than
                # carrying trailing indentation.
                lines.append(f"{name}:")
                lines.extend(f" {ml}" if ml else "" for ml in val.split("\n"))
            elif isinstance(val, Grid):
                # Nested grid via Zinc: multi-line
                lines.append(f"{name}: Zinc:")
                lines.extend(f" {ml}" for ml in _zinc_encode_grid(val).split("\n"))
            elif isinstance(val, list) and val and isinstance(val[0], dict):
                # Nested records via Trio: multi-line.  Only the trailing
                # newline of the nested text is dropped; interior blank
                # lines are emitted as empty lines so they are not lost.
                lines.append(f"{name}: Trio:")
                nested = encode_trio(val).rstrip("\n").split("\n")
                lines.extend(f" {ml}" if ml else "" for ml in nested)
            else:
                lines.append(f"{name}: {_zinc_encode_val(val)}")
        parts.append("\n".join(lines))
    return "\n".join(parts) + "\n"
# ---------------------------------------------------------------------------
# Trio-specific value parsing
# ---------------------------------------------------------------------------
def _parse_trio_val(text: str) -> Any:
    """Decode one single-line Trio value.

    On top of Zinc scalar syntax, Trio accepts:

    - the keywords ``true`` and ``false`` as booleans
    - unquoted strings: if the Zinc scanner rejects the text, or stops
      before the end of it, the whole (trimmed) text is returned as a string

    Blank input yields ``None``.
    """
    token = text.strip()
    if token in ("true", "false"):
        return token == "true"
    if token == "":
        return None

    try:
        parsed, consumed = scan_val(token, 0)
    except ValueError:
        # Not valid Zinc: treat as an unquoted string.
        return token

    # Accept the Zinc value only when nothing but whitespace is left over.
    if token[consumed:].strip() == "":
        return parsed
    return token
# ---------------------------------------------------------------------------
# Line-level helpers
# ---------------------------------------------------------------------------
def _is_separator(line: str) -> bool:
    """Return True when *line*, ignoring surrounding whitespace, is only dashes."""
    # A non-empty all-dash string has exactly one distinct character, "-";
    # an empty string yields an empty set and is therefore rejected.
    return set(line.strip()) == {"-"}
def _strip_comment(line: str) -> str:
    """Strip a trailing ``//`` comment from a line.

    A ``//`` that appears inside a double-quoted string or a
    backtick-delimited URI is not a comment.  Inside either literal a
    backslash escapes the following character, so an escaped ``"`` or an
    escaped backtick does not end the literal.

    :param line: One line of Trio text.
    :returns: The text before the first ``//`` found outside a literal, or
        the whole line when there is none.
    """
    in_str = False
    in_uri = False
    i = 0
    length = len(line)
    while i < length:
        ch = line[i]
        if in_str or in_uri:
            if ch == "\\":
                # Skip the escaped character so it cannot close the literal.
                i += 2
                continue
            if ch == ('"' if in_str else "`"):
                in_str = in_uri = False
        elif ch == '"':
            in_str = True
        elif ch == "`":
            in_uri = True
        elif line.startswith("//", i):
            return line[:i]
        i += 1
    return line
def _parse_tag_line(line: str) -> tuple[str, str | None]:
    """Split a tag line into its name and raw value text.

    The name is the leading run of identifier characters.  If a colon
    follows it (spaces in between are allowed), the trimmed text after the
    colon is returned as the value; otherwise the value is ``None``.

    :param line: Tag line with surrounding whitespace already removed.
    :returns: ``(name, value_str)`` or ``(name, None)``.
    :raises ValueError: If the line does not start with an identifier character.
    """
    name_end = len(line)
    for pos, ch in enumerate(line):
        if ch not in IDENT_CHARS:
            name_end = pos
            break

    name = line[:name_end]
    if not name:
        raise ValueError(f"Expected tag name: {line!r}")

    # Only spaces may sit between the name and the colon.
    remainder = line[name_end:].lstrip(" ")
    if remainder.startswith(":"):
        return name, remainder[1:].strip()
    return name, None
def _flush_multiline(
    current: dict[str, Any],
    tag: str | None,
    lines: list[str],
    mode: str,
    *,
    _depth: int = 0,
) -> None:
    """Store a finished multi-line value in the current record.

    Does nothing when no multi-line value is open (``tag`` is ``None``).
    Otherwise the collected lines are joined and stored under ``tag``:
    decoded as a Zinc grid for mode ``"zinc"``, parsed as nested Trio
    records for mode ``"trio"``, or kept as plain text for any other mode.

    :param current: Record dict that receives the value (mutated in place).
    :param tag: Name of the open multi-line tag, or ``None``.
    :param lines: Collected continuation lines.
    :param mode: ``"string"``, ``"zinc"`` or ``"trio"``.
    :param _depth: Nesting level of the enclosing Trio parse.
    """
    if tag is None:
        return

    body = _join_multiline(lines)
    if mode == "trio":
        # Nested records are parsed one level deeper.
        value: Any = parse_trio(body, _depth=_depth + 1)
    elif mode == "zinc":
        from hs_py.encoding.zinc import decode_grid

        value = decode_grid(body)
    else:
        value = body
    current[tag] = value
def _join_multiline(lines: list[str]) -> str:
    """Combine continuation lines into one string with the shared indent removed.

    The smallest leading-whitespace width among the non-blank lines is cut
    from every non-blank line.  Blank lines become empty lines, and blank
    lines at the end are dropped.  If there are no non-blank lines the
    result is the empty string.
    """
    content_positions = [pos for pos, ln in enumerate(lines) if ln.strip()]
    if not content_positions:
        return ""

    margin = min(
        len(lines[pos]) - len(lines[pos].lstrip()) for pos in content_positions
    )
    # Anything after the last non-blank line is trailing blank space.
    kept = lines[: content_positions[-1] + 1]
    return "\n".join(ln[margin:] if ln.strip() else "" for ln in kept)