# tono/backend/importers/ergo_hdf5.py

"""
Importer for Asylum Research / Ergo HDF5 files (.h5, .hdf5, .he5).

Asylum Research instruments store scan metadata in a sidecar group rather
than as dataset attributes.  This importer reads physical dimensions from:

  Image/DataSetInfo/Global/Channels/<channel>/ImageDims
    DimScaling   - (2,2) array: [[Y_start, Y_end], [X_start, X_end]]
                   absolute physical coordinate ranges in DimUnits
    DimExtents   - pixel counts [yres, xres] (stored in a child group, not used for sizing)
    DimUnits     - lateral unit strings [Y_unit, X_unit]
    DataUnits    - Z unit string

If the sidecar group is absent (generic HDF5), standard dataset attributes
are used as a fallback:

  xreal / yreal   - physical scan size in metres  (fallback: 1e-6)
  xoff  / yoff    - position offset in metres      (fallback: 0)
  si_unit_xy      - lateral unit string            (fallback: "m")
  si_unit_z       - value unit string              (fallback: "m")

Requires:
    pip install h5py
"""

from __future__ import annotations

from pathlib import Path

import numpy as np

from backend.data_types import DataField


# File suffixes this importer handles (the HDF5 variants named in the module
# docstring).  Presumably consulted by the importer registry — not visible here.
extensions = frozenset({".h5", ".hdf5", ".he5"})
# Marks this importer as one that tries to supply physical scan metadata
# (sizes, offsets, units) rather than bare pixel arrays.
calibrated = True   # we attempt to read physical metadata


def _iter_2d_datasets(h5file):
    """
    Collect every 2-D numeric dataset reachable from *h5file*.

    The file is walked with ``visititems``; an object is kept when it is an
    ``h5py.Dataset``, has exactly two dimensions and a numeric dtype.

    Returns a list of ``(name, dataset)`` tuples in traversal order (a plain
    list, not a generator), where ``name`` is the path handed to the visitor.
    """
    import h5py

    found: list = []

    def _collect(path, node):
        # Reject anything that is not a dataset before touching ndim/dtype.
        if not isinstance(node, h5py.Dataset):
            return
        if node.ndim != 2:
            return
        if np.issubdtype(node.dtype, np.number):
            found.append((path, node))

    h5file.visititems(_collect)
    return found


def _attr_str(attrs, key: str, default: str) -> str:
    val = attrs.get(key)
    if val is None:
        return default
    if isinstance(val, bytes):
        return val.decode("utf-8", errors="replace").strip() or default
    return str(val).strip() or default


def _attr_float(attrs, key: str, default: float) -> float:
    val = attrs.get(key)
    if val is None:
        return default
    try:
        return float(val)
    except (TypeError, ValueError):
        return default


def _ar_image_dims(f, ds_name: str) -> dict | None:
    """
    Look up Asylum Research ImageDims metadata for a dataset.

    AR .h5 files store scan dimensions in a sibling group rather than as
    dataset attributes.  Given a dataset path like:
      "Image/DataSet/Resolution 0/Frame 0/Adhesion:Retrace/Image"
    the channel name is the second-to-last component ("Adhesion:Retrace"),
    and the metadata lives at:
      "Image/DataSetInfo/Global/Channels/<channel>/ImageDims"

    DimScaling is a (2, 2) array of *absolute physical coordinate ranges*
    (not per-pixel step sizes), stored Y-first:
      scaling[0, :] = [Y_start, Y_end]
      scaling[1, :] = [X_start, X_end]
    Both values are in the unit given by DimUnits.

    Unit attributes may be a scalar string, a bytes value, or an array of
    either; they are normalised to a flat list before indexing so a scalar
    unit such as "nm" is never indexed character by character.

    Parameters:
        f:       open HDF5 file (anything exposing ``get(path)``).
        ds_name: full path of the dataset inside the file.

    Returns a dict with xreal, yreal, xoff, yoff, si_unit_xy, si_unit_z,
    or None if the group isn't found or DimScaling is missing, not (2, 2),
    non-numeric, or contains NaN / infinite values.
    """
    import h5py

    parts = ds_name.split("/")
    if len(parts) < 2:
        return None
    channel = parts[-2]

    dims_path = f"Image/DataSetInfo/Global/Channels/{channel}/ImageDims"
    grp = f.get(dims_path)
    if not isinstance(grp, h5py.Group):
        return None

    scaling = grp.attrs.get("DimScaling")   # shape (2, 2): [[Y_start, Y_end], [X_start, X_end]]
    dim_units = grp.attrs.get("DimUnits")   # unit strings, e.g. ['m', 'm'] (Y then X)
    data_units = grp.attrs.get("DataUnits") # Z unit string, e.g. 'N'

    if scaling is None or np.asarray(scaling).shape != (2, 2):
        return None

    try:
        scaling = np.asarray(scaling, dtype=np.float64)
    except (TypeError, ValueError):
        # Non-numeric DimScaling: treat as absent so the caller can fall back.
        return None
    if not np.all(np.isfinite(scaling)):
        return None

    # Y axis first (row-major), then X — matching numpy's (rows, cols) convention.
    y_start, y_end = float(scaling[0, 0]), float(scaling[0, 1])
    x_start, x_end = float(scaling[1, 0]), float(scaling[1, 1])

    # A zero-width range falls back to 1e-6 so the extent is never zero.
    xreal = abs(x_end - x_start) or 1e-6
    yreal = abs(y_end - y_start) or 1e-6
    xoff = min(x_start, x_end)
    yoff = min(y_start, y_end)

    def _unit_items(raw) -> list:
        """Normalise a unit attribute to a flat list of scalar entries."""
        if raw is None:
            return []
        if isinstance(raw, (str, bytes)):
            # A scalar string is ONE unit, not a sequence of characters.
            return [raw]
        # Handles lists, N-d arrays and 0-d arrays (which have no len()).
        return list(np.atleast_1d(raw).ravel())

    def _unit_text(item, default: str = "m") -> str:
        """Decode one unit entry to a stripped string, or *default* if blank."""
        if isinstance(item, bytes):
            text = item.decode("utf-8", errors="replace")
        else:
            text = str(item)
        return text.strip() or default

    # DimUnits is [Y_unit, X_unit]; X unit is the canonical lateral unit.
    # With a single entry, that entry is used for both axes.
    lateral = _unit_items(dim_units)
    if len(lateral) >= 2:
        xy_unit = _unit_text(lateral[1])
    elif lateral:
        xy_unit = _unit_text(lateral[0])
    else:
        xy_unit = "m"

    z_items = _unit_items(data_units)
    z_unit = _unit_text(z_items[0]) if z_items else "m"

    return {
        "xreal": xreal,
        "yreal": yreal,
        "xoff": xoff,
        "yoff": yoff,
        "si_unit_xy": xy_unit,
        "si_unit_z": z_unit,
    }


def load(path: Path) -> list[DataField]:
    """
    Read every 2-D numeric dataset of an HDF5 file into a DataField.

    The file is opened read-only.  Each dataset is converted to a float64
    array; its physical metadata comes from the Asylum Research sidecar
    group when `_ar_image_dims` finds one, otherwise from the dataset's own
    attributes with these defaults: xreal/yreal 1e-6, xoff/yoff 0.0,
    si_unit_xy/si_unit_z "m".

    Raises:
        ImportError: h5py is not installed.
        ValueError:  the file contains no 2-D numeric dataset.

    NOTE(review): the result holds *all* matching datasets (thumbnails
    included) in traversal order, whereas channel_names() drops thumbnails
    and sorts its output — confirm whether callers pair the two lists.
    """
    try:
        import h5py
    except ImportError:
        raise ImportError("Install 'h5py' to load HDF5 files:  pip install h5py")

    with h5py.File(str(path), "r") as f:
        found = _iter_2d_datasets(f)
        if not found:
            raise ValueError(f"No 2-D numeric datasets found in {path.name}")

        fields = []
        for ds_path, dataset in found:
            pixels = np.asarray(dataset, dtype=np.float64)

            # Sidecar metadata wins; dataset attributes are the fallback.
            meta = _ar_image_dims(f, ds_path)
            if not meta:
                ds_attrs = dataset.attrs
                meta = {
                    "xreal": _attr_float(ds_attrs, "xreal", 1e-6),
                    "yreal": _attr_float(ds_attrs, "yreal", 1e-6),
                    "xoff": _attr_float(ds_attrs, "xoff", 0.0),
                    "yoff": _attr_float(ds_attrs, "yoff", 0.0),
                    "si_unit_xy": _attr_str(ds_attrs, "si_unit_xy", "m"),
                    "si_unit_z": _attr_str(ds_attrs, "si_unit_z", "m"),
                }

            fields.append(DataField(
                data=pixels,
                xreal=meta["xreal"],
                yreal=meta["yreal"],
                xoff=meta["xoff"],
                yoff=meta["yoff"],
                si_unit_xy=meta["si_unit_xy"],
                si_unit_z=meta["si_unit_z"],
            ))
    return fields


def _display_names(full_names: list[str]) -> list[str]:
    """
    Derive short display names from HDF5 dataset paths.

    Rules (all comparisons case-insensitive):
      1. All thumbnail datasets are filtered out.
      2. Display name = second-to-last path component (drops the leaf like
         "/image" or "/thumbnail").
      3. "global" channels sort to the front.
      4. If two kept datasets share the same second-to-last name, the leaf is
         appended to disambiguate.

    Returns a list in sorted order (not parallel to full_names).
    """
    from collections import Counter

    # Filter out all thumbnail datasets.
    kept: list[tuple[int, str]] = []   # (original index, full name)
    for i, name in enumerate(full_names):
        if name.split("/")[-1].lower() == "thumbnail":
            continue
        kept.append((i, name))

    # Sort: "global" second-to-last first, then alphabetical.
    def _sort_key(item: tuple[int, str]) -> tuple[int, str]:
        parts = item[1].split("/")
        second_last = parts[-2].lower() if len(parts) >= 2 else parts[-1].lower()
        return (0 if second_last == "global" else 1, second_last)

    kept.sort(key=_sort_key)

    # Build short names (second-to-last), disambiguate clashes.
    short = [
        (parts[-2] if len(parts := name.split("/")) >= 2 else parts[-1])
        for _, name in kept
    ]
    counts = Counter(short)
    disambiguated = [
        f"{s}/{name.split('/')[-1]}" if counts[s] > 1 else s
        for s, (_, name) in zip(short, kept)
    ]

    return disambiguated


def channel_names(path: Path) -> list[str]:
    """
    Return short display names for the 2-D numeric datasets in *path*.

    Best-effort: an empty list is returned when h5py is not installed or
    when anything goes wrong while opening or scanning the file.  Names are
    produced by `_display_names`, so thumbnails are omitted and the order is
    its sorted order.
    """
    try:
        import h5py
    except ImportError:
        return []

    try:
        with h5py.File(str(path), "r") as h5:
            # Only the paths are needed; dataset handles die with the file.
            paths = [ds_path for ds_path, _unused in _iter_2d_datasets(h5)]
        return list(_display_names(paths))
    except Exception:
        # Deliberately broad: listing channels must never raise to the caller.
        return []