"""
Importer for Asylum Research / Ergo HDF5 files (.h5, .hdf5, .he5).

Asylum Research instruments store scan metadata in a sidecar group rather
than as dataset attributes.  This importer reads physical dimensions from:

    Image/DataSetInfo/Global/Channels/{channel}/ImageDims

        DimScaling  - (2, 2) array: [[Y_start, Y_end], [X_start, X_end]]
                      absolute physical coordinate ranges in DimUnits
        DimExtents  - pixel counts [yres, xres]
                      (stored in a child group, not used for sizing)
        DimUnits    - lateral unit strings [Y_unit, X_unit]
        DataUnits   - Z unit string

If the sidecar group is absent (generic HDF5), standard dataset attributes
are used as a fallback:

    xreal / yreal  - physical scan size in metres  (fallback: 1e-6)
    xoff  / yoff   - position offset in metres     (fallback: 0)
    si_unit_xy     - lateral unit string           (fallback: "m")
    si_unit_z      - value unit string             (fallback: "m")

Requires: pip install h5py
"""
from __future__ import annotations

from pathlib import Path

import numpy as np

from backend.data_types import DataField

extensions = frozenset({".h5", ".hdf5", ".he5"})
calibrated = True  # we attempt to read physical metadata


def _iter_2d_datasets(h5file):
    """Return a list of (name, dataset) for every 2-D numeric dataset.

    The file is walked with ``visititems``; a dataset is kept when it has
    exactly two dimensions and a dtype that is a sub-dtype of ``np.number``.
    """
    import h5py

    found: list = []

    def _visit(name, obj):
        if not isinstance(obj, h5py.Dataset):
            return
        if obj.ndim == 2 and np.issubdtype(obj.dtype, np.number):
            found.append((name, obj))

    h5file.visititems(_visit)
    return found


def _attr_str(attrs, key: str, default: str) -> str:
    """Read attribute *key* as a stripped string.

    Returns *default* when the attribute is missing or strips to an empty
    string.  ``bytes`` values are decoded as UTF-8 with replacement.
    """
    val = attrs.get(key)
    if val is None:
        return default
    if isinstance(val, bytes):
        text = val.decode("utf-8", errors="replace")
    else:
        text = str(val)
    return text.strip() or default


def _attr_float(attrs, key: str, default: float) -> float:
    """Read attribute *key* as a float.

    Returns *default* when the attribute is missing or ``float()`` rejects
    it with ``TypeError`` / ``ValueError``.
    """
    val = attrs.get(key)
    if val is None:
        return default
    try:
        return float(val)
    except (TypeError, ValueError):
        return default


def _unit_str(raw, default: str = "m") -> str:
    """Normalise one unit attribute value to a stripped string.

    Accepts ``None``, ``str``, ``bytes`` or a (possibly nested / 0-d) array
    or sequence, in which case the first element is used.  Returns
    *default* for ``None``, empty containers and blank strings.
    """
    if raw is None:
        return default
    # str / bytes first: numpy string scalars subclass these, and they must
    # never be treated as containers of characters.
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="replace").strip() or default
    if isinstance(raw, str):
        return raw.strip() or default
    if isinstance(raw, np.ndarray):
        # ravel() also copes with 0-d arrays, where len() would raise.
        raw = raw.ravel().tolist()
    if isinstance(raw, (list, tuple)):
        return _unit_str(raw[0], default) if raw else default
    return str(raw).strip() or default


def _lateral_unit(dim_units) -> str:
    """Pick the lateral (X) unit out of a ``DimUnits`` attribute value.

    ``DimUnits`` is ``[Y_unit, X_unit]``; the X unit is the canonical
    lateral unit.  A one-element value falls back to that element, and a
    scalar string is taken as the unit itself rather than being indexed
    character by character.  Returns ``"m"`` when nothing usable is found.
    """
    if isinstance(dim_units, np.ndarray):
        units = dim_units.ravel().tolist()
    elif isinstance(dim_units, (list, tuple)):
        units = list(dim_units)
    else:
        # None, str, bytes or any other scalar.
        return _unit_str(dim_units)
    if len(units) >= 2:
        return _unit_str(units[1])
    if units:
        return _unit_str(units[0])
    return "m"


def _ar_image_dims(f, ds_name: str) -> dict | None:
    """
    Look up Asylum Research ImageDims metadata for a dataset.

    AR .h5 files store scan dimensions in a sibling group rather than as
    dataset attributes.  Given a dataset path like:
        "Image/DataSet/Resolution 0/Frame 0/Adhesion:Retrace/Image"
    the channel name is the second-to-last component ("Adhesion:Retrace"),
    and the metadata lives at:
        "Image/DataSetInfo/Global/Channels/{channel}/ImageDims"

    DimScaling is a (2, 2) array of *absolute physical coordinate ranges*
    (not per-pixel step sizes), stored Y-first:
        scaling[0, :] = [Y_start, Y_end]
        scaling[1, :] = [X_start, X_end]
    Both values are in the unit given by DimUnits.

    Returns a dict with xreal, yreal, xoff, yoff, si_unit_xy, si_unit_z,
    or None if the group isn't found or DimScaling is missing, not a
    finite numeric (2, 2) array.
    """
    import h5py

    parts = ds_name.split("/")
    if len(parts) < 2:
        return None
    channel = parts[-2]

    grp = f.get(f"Image/DataSetInfo/Global/Channels/{channel}/ImageDims")
    if not isinstance(grp, h5py.Group):
        return None

    raw_scaling = grp.attrs.get("DimScaling")
    if raw_scaling is None:
        return None
    try:
        scaling = np.asarray(raw_scaling, dtype=np.float64)
    except (TypeError, ValueError):
        # Non-numeric DimScaling: treat as unusable metadata.
        return None
    # NaN / inf ranges would propagate into sizes and offsets, so reject
    # them and let the caller fall back to dataset attributes.
    if scaling.shape != (2, 2) or not np.isfinite(scaling).all():
        return None

    # Y axis first (row-major), then X - matching numpy's (rows, cols)
    # convention.
    y_start, y_end = float(scaling[0, 0]), float(scaling[0, 1])
    x_start, x_end = float(scaling[1, 0]), float(scaling[1, 1])

    return {
        # A zero-width range falls back to 1e-6.
        "xreal": abs(x_end - x_start) or 1e-6,
        "yreal": abs(y_end - y_start) or 1e-6,
        "xoff": min(x_start, x_end),
        "yoff": min(y_start, y_end),
        "si_unit_xy": _lateral_unit(grp.attrs.get("DimUnits")),
        "si_unit_z": _unit_str(grp.attrs.get("DataUnits")),
    }


def load(path: Path) -> list[DataField]:
    """Load every 2-D numeric dataset of an HDF5 file as a DataField.

    For each dataset the Asylum Research sidecar metadata is tried first;
    when it is unavailable the dataset's own attributes (xreal, yreal,
    xoff, yoff, si_unit_xy, si_unit_z) are used with their fallbacks.

    Raises:
        ImportError: h5py is not installed.
        ValueError: the file contains no 2-D numeric dataset.
    """
    try:
        import h5py
    except ImportError as exc:
        raise ImportError(
            "Install 'h5py' to load HDF5 files: pip install h5py"
        ) from exc

    with h5py.File(str(path), "r") as f:
        datasets = _iter_2d_datasets(f)
        if not datasets:
            raise ValueError(f"No 2-D numeric datasets found in {path.name}")

        fields = []
        for name, ds in datasets:
            data = np.asarray(ds, dtype=np.float64)

            # Try Asylum Research sidecar metadata first, then dataset attrs.
            meta = _ar_image_dims(f, name)
            if not meta:
                attrs = ds.attrs
                meta = {
                    "xreal": _attr_float(attrs, "xreal", 1e-6),
                    "yreal": _attr_float(attrs, "yreal", 1e-6),
                    "xoff": _attr_float(attrs, "xoff", 0.0),
                    "yoff": _attr_float(attrs, "yoff", 0.0),
                    "si_unit_xy": _attr_str(attrs, "si_unit_xy", "m"),
                    "si_unit_z": _attr_str(attrs, "si_unit_z", "m"),
                }
            fields.append(DataField(data=data, **meta))

    return fields


def _display_names(full_names: list[str]) -> list[str]:
    """
    Derive short display names from HDF5 dataset paths.

    Rules (all comparisons case-insensitive):
    1. All thumbnail datasets are filtered out.
    2. Display name = second-to-last path component (drops the leaf like
       "/image" or "/thumbnail").  A path with a single component uses
       that component.
    3. "global" channels sort to the front.
    4. If two kept datasets share the same second-to-last name, the leaf
       is appended to disambiguate.

    Returns a list in sorted order (not parallel to full_names).
    """
    from collections import Counter

    def _channel(name: str) -> str:
        parts = name.split("/")
        return parts[-2] if len(parts) >= 2 else parts[-1]

    def _sort_key(name: str) -> tuple[int, str]:
        channel = _channel(name).lower()
        return (0 if channel == "global" else 1, channel)

    # Filter out all thumbnail datasets, then sort: "global" first, then
    # alphabetical (the sort is stable, so ties keep their input order).
    kept = sorted(
        (n for n in full_names if n.split("/")[-1].lower() != "thumbnail"),
        key=_sort_key,
    )

    # Build short names, appending the leaf only where names clash.
    short = [_channel(name) for name in kept]
    counts = Counter(short)
    return [
        f"{s}/{name.split('/')[-1]}" if counts[s] > 1 else s
        for s, name in zip(short, kept)
    ]


def channel_names(path: Path) -> list[str]:
    """Return display names for the 2-D numeric datasets in *path*.

    Best-effort: returns an empty list when h5py is not installed or the
    file cannot be opened or read.
    """
    try:
        import h5py
    except ImportError:
        return []
    try:
        with h5py.File(str(path), "r") as f:
            full_names = [name for name, _ in _iter_2d_datasets(f)]
        return _display_names(full_names)
    except Exception:
        # Deliberate best-effort: an unreadable file means "no channels".
        return []