Skip to content

API Reference

Auto-generated from source docstrings.


point_collocation

Top-level convenience imports:

import point_collocation as pc

pc.plan(...)     # build a matchup plan
pc.matchup(...)  # execute the plan

Core

plan

point_collocation.core.plan.plan

plan(points: PointsFrame, *, data_source: str = 'earthaccess', source_kwargs: dict[str, Any] | None = None, time_buffer: str | Timedelta | timedelta | int = '0h') -> Plan

Build a :class:Plan previewing which granules cover each point.

Parameters:

Name Type Description Default
points PointsFrame

DataFrame with at minimum lat, lon, and time (or date as an alias). If the column is named date and contains date-only values, the time-of-day is set to noon (12:00 UTC) for matching purposes.

An optional pc_id column may be included to supply custom point identifiers. If present, these values must be unique; duplicate pc_id values raise a :class:ValueError. Any additional columns beyond lat, lon, time, and pc_id are preserved and included in the output returned by :func:~point_collocation.matchup.

required
data_source str

Data source to search. Currently only "earthaccess" is supported.

'earthaccess'
source_kwargs dict[str, Any] | None

Keyword arguments forwarded to earthaccess.search_data(). Must contain at least one of "short_name", "concept_id", or "doi". The special keys "access" and "in_region" are not forwarded to search_data(); instead they are passed to result.data_links() on every returned granule to control which link type is used (e.g. "access": "direct" for S3). Granules whose data_links() returns an empty list for the given kwargs are silently excluded from the plan.

None
time_buffer str | Timedelta | timedelta | int

Extra temporal margin when matching a point to a granule. A point at time t matches a granule whose coverage is [begin, end] if begin - buffer ≤ t ≤ end + buffer. Accepts a :class:pandas.Timedelta, :class:datetime.timedelta, or a pandas-parseable string ("12H", "30min", …). Default is "0h" (exact overlap required).

'0h'

Returns:

Type Description
Plan

The planning object; inspect with :meth:Plan.summary and execute with :func:~point_collocation.matchup.

Raises:

Type Description
ValueError

If points is missing required columns, data_source is not recognised, source_kwargs does not contain at least one of "short_name", "concept_id", or "doi", or the pc_id column contains duplicate values.

ImportError

If the earthaccess package is not installed.

Source code in point_collocation/core/plan.py
def plan(
    points: PointsFrame,
    *,
    data_source: str = "earthaccess",
    source_kwargs: dict[str, Any] | None = None,
    time_buffer: str | pd.Timedelta | datetime.timedelta | int = "0h",
) -> Plan:
    """Plan a point-to-granule matchup without downloading any data.

    Searches the chosen data source for granules whose temporal coverage
    (optionally widened by *time_buffer*) overlaps each point's time, and
    records the resulting point→granule mapping in a :class:`Plan`.

    Parameters
    ----------
    points:
        DataFrame with at least ``lat``, ``lon``, and ``time`` columns
        (``date`` is accepted as an alias for ``time``; date-only values
        are matched at 12:00 UTC).  An optional ``pc_id`` column supplies
        custom point identifiers and must contain unique values.  Any
        further columns are carried through unchanged to the output of
        :func:`~point_collocation.matchup`.
    data_source:
        Which backend to search.  ``"earthaccess"`` is the only value
        currently accepted.
    source_kwargs:
        Forwarded to ``earthaccess.search_data()``; must include at least
        one of ``"short_name"``, ``"concept_id"``, or ``"doi"``.  The
        special keys ``"access"`` and ``"in_region"`` are *not* forwarded
        to ``search_data()``; they are routed to ``result.data_links()``
        on each returned granule to select the link type (e.g.
        ``"access": "direct"`` for S3).  Granules whose ``data_links()``
        comes back empty for those kwargs are silently dropped.
    time_buffer:
        Temporal slack applied when matching: a point at time *t* matches
        coverage ``[begin, end]`` when ``begin - buffer ≤ t ≤ end + buffer``.
        Accepts a :class:`pandas.Timedelta`, :class:`datetime.timedelta`,
        or a pandas-parseable string (``"12H"``, ``"30min"``, …).
        Defaults to ``"0h"`` (exact overlap).

    Returns
    -------
    Plan
        The planning object; inspect it with :meth:`Plan.summary` and
        execute it with :func:`~point_collocation.matchup`.

    Raises
    ------
    ValueError
        On missing required point columns, an unrecognised *data_source*,
        a *source_kwargs* lacking every accepted search identifier, or
        duplicate ``pc_id`` values.
    ImportError
        If the ``earthaccess`` package is not installed.
    """
    # Reject unsupported backends up front, before touching the points.
    if data_source != "earthaccess":
        raise ValueError(
            f"Unknown data_source {data_source!r}. "
            "Currently only 'earthaccess' is supported."
        )

    # Normalise user column names to the canonical lat/lon/time, remembering
    # the originals for transparent reporting later.
    normalised, y_col, x_col, t_col = _plan_normalise_columns(points)
    _plan_validate_points(normalised)

    parsed_buffer = _parse_time_buffer(time_buffer)
    search_results, metas = _search_earthaccess(
        normalised, source_kwargs=source_kwargs
    )
    mapping = _match_points_to_granules(normalised, metas, parsed_buffer)

    return Plan(
        points=normalised,
        results=search_results,
        granules=metas,
        point_granule_map=mapping,
        source_kwargs={} if source_kwargs is None else dict(source_kwargs),
        time_buffer=parsed_buffer,
        pts_y_col_original=y_col,
        pts_x_col_original=x_col,
        pts_time_col_original=t_col,
    )

matchup

point_collocation.core.engine.matchup

matchup(plan: 'Plan', *, open_method: str | dict | None = None, variables: list[str] | None = None, spatial_method: str | None = None, open_dataset_kwargs: dict | None = None, coord_spec: dict | None = None, silent: bool = True, batch_size: int | None = None, save_dir: str | PathLike | None = None, granule_range: tuple[int, int] | None = None) -> pd.DataFrame

Extract variables from cloud-hosted granules at the given points.

Parameters:

Name Type Description Default
plan 'Plan'

A :class:~point_collocation.core.plan.Plan object previously built with :func:~point_collocation.plan. Data source and search parameters are taken from the plan. One output row is produced per (point, granule) pair; points with zero matching granules produce a single NaN row.

required
open_method str | dict | None

How granules are opened. Accepts a string preset or a dict spec.

String presets:

  • "dataset" — open with xarray.open_dataset (fast path for typical flat NetCDF files).
  • "datatree" — open as a raw DataTree without merging groups.
  • "datatree-merge" — open as DataTree and merge all groups into a flat Dataset (for grouped/HDF5-ish files).
  • "auto" (default) — try the fast "dataset" path first; if lat/lon coordinates cannot be detected, fall back to "datatree-merge" automatically.

Dict spec (advanced):

.. code-block:: python

open_method = {
    "xarray_open":           "dataset" | "datatree",
    "open_kwargs":           {},
    "merge":                 None | "all" | "root" | ["/path/a"],
    "merge_kwargs":          {},
    "coords":                "auto" | ["Lat", "Lon"] | {"lat": "...", "lon": "..."},
    "set_coords":            True,
    "dim_renames":           None | {"node": {"old": "new"}},
    "auto_align_phony_dims": None | "safe",
}

All keys are optional; missing keys receive sensible defaults. Unknown keys raise :exc:ValueError.

Pre-defined profiles for common products are importable from :mod:point_collocation.profiles (e.g. pace_l3, pace_l2).

None
variables list[str] | None

Variable names to extract from each granule. When provided, overrides any variables stored on the plan. When omitted, falls back to plan.variables. If the resolved list is empty, the output will have no variable columns. Raises :exc:ValueError if a requested variable is not found in the opened dataset.

None
spatial_method str | None

Method used for spatial matching.

  • "auto" (default) — automatically selects the best method based on the dimensionality of the geolocation coordinates:

  • 1-D coordinates (regular/gridded data, both lat and lon are 1-D): uses "axis" (vectorised ds.sel(..., method="nearest") over all points at once). If "axis" fails for any reason, falls back to "kdtree" automatically.

  • 2-D coordinates (irregular/swath data, or either coordinate is 2-D): uses "kdtree".

xoak-kdtree and xoak-haversine are never selected automatically; set them explicitly if needed.

  • "axis" — Vectorised ds.sel(..., method="nearest") for all points in a single call. Requires 1-D (regular-grid) coordinate arrays for both latitude and longitude; raises :exc:ValueError with a suggestion to use "auto" or "kdtree" for 2-D coordinates.
  • "kdtree" — xarray's built-in :class:xarray.indexes.NDPointIndex with the default ScipyKDTreeAdapter. Works with both 1-D and 2-D coordinate arrays (requires scipy).
  • "xoak-kdtree" — the xoak package's SklearnKDTreeAdapter. Works with both 1-D and 2-D arrays (requires xoak and scikit-learn).
  • "xoak-haversine" — the xoak package's SklearnGeoBallTreeAdapter, which uses the haversine metric for accurate great-circle distance calculations. Recommended for data near the poles where the Euclidean k-d tree used by "xoak-kdtree" can return incorrect nearest neighbours due to coordinate distortion. Works with both 1-D and 2-D arrays (requires xoak and scikit-learn). Lat/lon values are passed in degrees; the adapter converts them to radians internally.
None
open_dataset_kwargs dict | None

Optional dictionary of keyword arguments forwarded to the xarray open function for every granule opened during the run. These override any "open_kwargs" in open_method; defaults are applied only for keys left unspecified (chunks → {}, engine → "h5netcdf", decode_timedelta → False).

None
coord_spec dict | None

Coordinate specification controlling how axis/coordinate names are interpreted for both the source dataset and the points DataFrame. Defaults to auto-detection of lat/lon/time from standard name candidates. Example usage with non-standard variable names and optional additional axes::

coord_spec = {
    "coordinate_system": "geographic",
    "y":    {"source": "grid_lat", "points": "lat"},
    "x":    {"source": "grid_lon", "points": "lon"},
    "time": {"source": "auto",      "points": "auto"},
    # optional additional axes:
    "depth":      {"source": "z",          "points": "depth"},
    "wavelength": {"source": "wavelength", "points": "wave"},
}

The source key is the variable/coordinate name in the source dataset; points is the column name in the points DataFrame. Set either to "auto" for standard-name auto-detection.

If coord_spec specifies source for y/x and open_method['coords'] also specifies explicit names, a :exc:ValueError is raised when they conflict. Set one side to "auto" to let the other take precedence.

Additional axes (beyond time) are optional; if the configured column is absent from the points DataFrame the axis is silently skipped.

None
silent bool

When True (default), all progress output is suppressed. Set to False to print a progress message to stdout after every batch_size granules.

True
batch_size int | None

Number of granules to process between progress reports (and between intermediate saves when save_dir is set). Defaults to None, which sets the batch size to one more than the total number of matched granules so that all granules are processed in a single batch.

None
save_dir str | PathLike | None

Directory in which intermediate results are saved as Parquet files after each batch of batch_size granules. The directory is created automatically if it does not exist. Each batch is saved as plan_<first>_<last>.parquet where first and last are the granule indices from the plan. When None (default), no intermediate files are written.

None
granule_range tuple[int, int] | None

Optional (start, end) tuple (both 1-based and inclusive) that restricts processing to a contiguous slice of the matched granules, ordered by granule index. For example, granule_range=(261, 620) resumes from granule 261 after a crash that completed granules 1–260. Progress messages continue to report absolute granule numbers (e.g. "granules 261-270 of 620 processed") so the output is directly comparable with messages from the original run. When None (default), all matched granules are processed.

None

Returns:

Type Description
DataFrame

One row per (point, granule) pair. In addition to the original point columns and one column per requested variable, the output always includes:

pc_id — Point identifier. If the input dataframe contains a pc_id column those values are preserved as-is; otherwise the row index from the input dataframe is used. Duplicate pc_id values in the input are not allowed and raise a ValueError during planning.

granule_id — Identifier of the granule that provided this row's values.

granule_lat — Latitude of the matched location inside the granule (i.e. the nearest-neighbour grid or swath position).

granule_lon — Longitude of the matched location inside the granule.

granule_time — Midpoint of the granule's temporal coverage, derived from the granule metadata (begin + (end - begin) / 2). For earthaccess granules, temporal information is stored in the search result metadata rather than in the dataset itself. For zero-match rows, this column is pandas.NaT.

Any extra columns present in the input dataframe are retained in the output. Points with zero matching granules contribute a single NaN row. The output is sorted to match the pc_id order from the input dataframe.

Raises:

Type Description
ValueError

If open_method is a string that is not a valid preset, or a dict with unknown keys or an invalid "xarray_open" value.

ValueError

If a requested variable is not present in an opened dataset.

ValueError

If geolocation variables cannot be detected unambiguously.

ValueError

If granule_range is not a 2-tuple of positive integers with start <= end, or if either bound exceeds the number of matched granules in the plan.

ImportError

If spatial_method="xoak-kdtree" and the xoak package is not installed.

ImportError

If spatial_method="xoak-haversine" and the xoak package is not installed.

ImportError

If spatial_method="kdtree" and scipy is not installed.

Source code in point_collocation/core/engine.py
def matchup(
    plan: "Plan",
    *,
    open_method: str | dict | None = None,
    variables: list[str] | None = None,
    spatial_method: str | None = None,
    open_dataset_kwargs: dict | None = None,
    coord_spec: dict | None = None,
    silent: bool = True,
    batch_size: int | None = None,
    save_dir: str | os.PathLike | None = None,
    granule_range: tuple[int, int] | None = None,
) -> pd.DataFrame:
    """Extract variables from cloud-hosted granules at the given points.

    Parameters
    ----------
    plan:
        A :class:`~point_collocation.core.plan.Plan` object previously
        built with :func:`~point_collocation.plan`.  Data source and
        search parameters are taken from the plan.  One output row is
        produced per (point, granule) pair; points with zero matching
        granules produce a single NaN row.
    open_method:
        How granules are opened.  Accepts a string preset or a dict spec.

        **String presets:**

        * ``"dataset"`` — open with ``xarray.open_dataset`` (fast path for
          typical flat NetCDF files).
        * ``"datatree"`` — open as a raw DataTree without merging groups.
        * ``"datatree-merge"`` — open as DataTree and merge all groups into
          a flat Dataset (for grouped/HDF5-ish files).
        * ``"auto"`` *(default)* — try the fast ``"dataset"`` path first; if
          lat/lon coordinates cannot be detected, fall back to
          ``"datatree-merge"`` automatically.

        **Dict spec** (advanced):

        .. code-block:: python

            open_method = {
                "xarray_open":           "dataset" | "datatree",
                "open_kwargs":           {},
                "merge":                 None | "all" | "root" | ["/path/a"],
                "merge_kwargs":          {},
                "coords":                "auto" | ["Lat", "Lon"] | {"lat": "...", "lon": "..."},
                "set_coords":            True,
                "dim_renames":           None | {"node": {"old": "new"}},
                "auto_align_phony_dims": None | "safe",
            }

        All keys are optional; missing keys receive sensible defaults.
        Unknown keys raise :exc:`ValueError`.

        Pre-defined profiles for common products are importable from
        :mod:`point_collocation.profiles` (e.g. ``pace_l3``, ``pace_l2``).
    variables:
        Variable names to extract from each granule.  When provided,
        overrides any variables stored on the plan.  When omitted,
        falls back to ``plan.variables``.  If the resolved list is
        empty, the output will have no variable columns.
        Raises :exc:`ValueError` if a requested variable is not found
        in the opened dataset.
    spatial_method:
        Method used for spatial matching.

        * ``"auto"`` *(default)* — automatically selects the best method
          based on the dimensionality of the geolocation coordinates:

          - **1-D coordinates** (regular/gridded data, both lat and lon
            are 1-D): uses ``"axis"`` (vectorised
            ``ds.sel(..., method="nearest")`` over all points at once).
            If ``"axis"`` fails for any reason, falls back to ``"kdtree"``
            automatically.
          - **2-D coordinates** (irregular/swath data, or either coordinate
            is 2-D): uses ``"kdtree"``.

          ``xoak-kdtree`` and ``xoak-haversine`` are never selected
          automatically; set them explicitly if needed.

        * ``"axis"`` — Vectorised ``ds.sel(..., method="nearest")`` for all
          points in a single call.  Requires 1-D (regular-grid) coordinate
          arrays for both latitude and longitude; raises :exc:`ValueError`
          with a suggestion to use ``"auto"`` or ``"kdtree"`` for 2-D
          coordinates.
        * ``"kdtree"`` — xarray's built-in
          :class:`xarray.indexes.NDPointIndex` with the default
          ``ScipyKDTreeAdapter``.  Works with both 1-D and 2-D coordinate
          arrays (requires ``scipy``).
        * ``"xoak-kdtree"`` — the ``xoak`` package's ``SklearnKDTreeAdapter``.
          Works with both 1-D and 2-D arrays (requires ``xoak`` and
          ``scikit-learn``).
        * ``"xoak-haversine"`` — the ``xoak`` package's
          ``SklearnGeoBallTreeAdapter``, which uses the haversine metric for
          accurate great-circle distance calculations.  Recommended for data
          near the poles where the Euclidean k-d tree used by
          ``"xoak-kdtree"`` can return incorrect nearest neighbours due to
          coordinate distortion.  Works with both 1-D and 2-D arrays
          (requires ``xoak`` and ``scikit-learn``).  Lat/lon values are
          passed in degrees; the adapter converts them to radians internally.
    open_dataset_kwargs:
        Optional dictionary of keyword arguments forwarded to the xarray
        open function for every granule opened during the run.  These
        override any ``"open_kwargs"`` in *open_method*; defaults are
        applied only for keys left unspecified
        (``chunks`` → ``{}``, ``engine`` → ``"h5netcdf"``,
        ``decode_timedelta`` → ``False``).
    coord_spec:
        Coordinate specification controlling how axis/coordinate names are
        interpreted for both the source dataset and the points DataFrame.
        Defaults to auto-detection of lat/lon/time from standard name
        candidates.  Example usage with non-standard variable names and
        optional additional axes::

            coord_spec = {
                "coordinate_system": "geographic",
                "y":    {"source": "grid_lat", "points": "lat"},
                "x":    {"source": "grid_lon", "points": "lon"},
                "time": {"source": "auto",      "points": "auto"},
                # optional additional axes:
                "depth":      {"source": "z",          "points": "depth"},
                "wavelength": {"source": "wavelength", "points": "wave"},
            }

        The ``source`` key is the variable/coordinate name in the source
        dataset; ``points`` is the column name in the points DataFrame.
        Set either to ``"auto"`` for standard-name auto-detection.

        If ``coord_spec`` specifies ``source`` for ``y``/``x`` and
        ``open_method['coords']`` also specifies explicit names, a
        :exc:`ValueError` is raised when they conflict.  Set one side to
        ``"auto"`` to let the other take precedence.

        Additional axes (beyond time) are optional; if the configured column
        is absent from the points DataFrame the axis is silently skipped.
    silent:
        When ``True`` (default), all progress output is suppressed.
        Set to ``False`` to print a progress message to stdout after
        every *batch_size* granules.
    batch_size:
        Number of granules to process between progress reports (and
        between intermediate saves when *save_dir* is set).  Defaults
        to ``None``, which sets the batch size to one more than the
        total number of matched granules so that all granules are
        processed in a single batch.
    save_dir:
        Directory in which intermediate results are saved as Parquet
        files after each batch of *batch_size* granules.  The directory
        is created automatically if it does not exist.  Each batch is
        saved as ``plan_<first>_<last>.parquet`` where *first* and
        *last* are the granule indices from the plan.  When ``None``
        (default), no intermediate files are written.
    granule_range:
        Optional ``(start, end)`` tuple (both **1-based and inclusive**)
        that restricts processing to a contiguous slice of the matched
        granules, ordered by granule index.  For example,
        ``granule_range=(261, 620)`` resumes from granule 261 after a
        crash that completed granules 1–260.  Progress messages continue
        to report absolute granule numbers (e.g.
        "granules 261-270 of 620 processed") so the output is directly
        comparable with messages from the original run.  When ``None``
        (default), all matched granules are processed.

    Returns
    -------
    pandas.DataFrame
        One row per (point, granule) pair.  In addition to the original
        point columns and one column per requested variable, the output
        always includes:

        ``pc_id``
            Point identifier.  If the input dataframe contains a ``pc_id``
            column those values are preserved as-is; otherwise the row
            index from the input dataframe is used.  Duplicate ``pc_id``
            values in the input are not allowed and raise a
            :class:`ValueError` during planning.
        ``granule_id``
            Identifier of the granule that provided this row's values.
        ``granule_lat``
            Latitude of the matched location inside the granule (i.e.
            the nearest-neighbour grid or swath position).
        ``granule_lon``
            Longitude of the matched location inside the granule.
        ``granule_time``
            Midpoint of the granule's temporal coverage, derived from
            the granule metadata (``begin + (end - begin) / 2``).  For
            earthaccess granules, temporal information is stored in the
            search result metadata rather than in the dataset itself.
            For zero-match rows, this column is ``pandas.NaT``.

        Any extra columns present in the input dataframe are retained in
        the output.  Points with zero matching granules contribute a
        single NaN row.  The output is sorted to match the ``pc_id``
        order from the input dataframe.

    Raises
    ------
    ValueError
        If *open_method* is a string that is not a valid preset, or a dict
        with unknown keys or an invalid ``"xarray_open"`` value.
    ValueError
        If a requested variable is not present in an opened dataset.
    ValueError
        If geolocation variables cannot be detected unambiguously.
    ValueError
        If ``granule_range`` is not a 2-tuple of positive integers with
        ``start <= end``, or if either bound exceeds the number of matched
        granules in the plan.
    ImportError
        If ``spatial_method="xoak-kdtree"`` and the ``xoak`` package is not
        installed.
    ImportError
        If ``spatial_method="xoak-haversine"`` and the ``xoak`` package is not
        installed.
    ImportError
        If ``spatial_method="kdtree"`` and ``scipy`` is not installed.
    """
    if granule_range is not None:
        # NOTE: bools are explicitly rejected — isinstance(True, int) is True,
        # so without the extra check granule_range=(True, 2) would validate.
        range_ok = (
            len(granule_range) == 2
            and all(
                isinstance(bound, int) and not isinstance(bound, bool)
                for bound in granule_range
            )
            and granule_range[0] >= 1
            and granule_range[1] >= granule_range[0]
        )
        if not range_ok:
            raise ValueError(
                f"granule_range={granule_range!r} is not valid. "
                "Must be a (start, end) tuple of positive integers with start <= end, "
                "both 1-based and inclusive (e.g. granule_range=(261, 620))."
            )

    if spatial_method is None:
        spatial_method = "auto"

    if spatial_method not in _VALID_SPATIAL_METHODS:
        raise ValueError(
            f"spatial_method={spatial_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_SPATIAL_METHODS)}."
        )

    # Fail fast on a missing optional dependency before any granules are
    # opened or processed.  The three previously copy-pasted try/except
    # blocks are consolidated into one table-driven check:
    # method -> (module to import, required attribute, error message).
    optional_deps: dict[str, tuple[str, str, str]] = {
        "xoak-kdtree": (
            "xoak.tree_adapters",
            "SklearnKDTreeAdapter",
            "The 'xoak' package (and scikit-learn) are required for spatial_method='xoak-kdtree'. "
            "Install them with: pip install xoak scikit-learn",
        ),
        "xoak-haversine": (
            "xoak.tree_adapters",
            "SklearnGeoBallTreeAdapter",
            "The 'xoak' package (and scikit-learn) are required for spatial_method='xoak-haversine'. "
            "Install them with: pip install xoak scikit-learn",
        ),
        "kdtree": (
            "scipy.spatial",
            "KDTree",
            "The 'scipy' package is required for spatial_method='kdtree'. "
            "Install it with: pip install scipy",
        ),
    }
    requirement = optional_deps.get(spatial_method)
    if requirement is not None:
        module_name, attr_name, message = requirement
        import importlib

        try:
            module = importlib.import_module(module_name)
            # Mirror `from module import attr`: an absent attribute must
            # also surface as a failed dependency check.
            getattr(module, attr_name)
        except (ImportError, AttributeError) as exc:
            raise ImportError(message) from exc

    # Normalize open_method to a full dict spec (raises ValueError on invalid input).
    effective_open_method = "auto" if open_method is None else open_method
    spec = _normalize_open_method(effective_open_method, open_dataset_kwargs)

    # Normalize coord_spec and bridge y/x sources into spec["coords"].
    # This ensures non-standard lat/lon variable names in coord_spec are used
    # when opening datasets, with conflict detection against open_method["coords"].
    resolved_coord_spec = _normalize_coord_spec(coord_spec)
    spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

    # Resolve additional axes (depth, wavelength, etc.) from the points DataFrame.
    # We use the plan's already-normalised points (lat/lon/time canonical names).
    # The y/x/time columns are always "lat"/"lon"/"time" after plan normalisation.
    additional_axes = _resolve_additional_axes(plan.points, resolved_coord_spec)

    if not silent:
        _print_coord_spec_summary(
            plan,
            resolved_coord_spec,
            additional_axes,
        )

    effective_vars: list[str] = variables if variables is not None else plan.variables
    return _execute_plan(
        plan,
        spec=spec,
        spatial_method=spatial_method,
        variables=effective_vars,
        silent=silent,
        batch_size=batch_size,
        save_dir=save_dir,
        granule_range=granule_range,
        additional_axes=additional_axes,
    )

Plan

point_collocation.core.plan.Plan dataclass

A planned matchup: stores the point→granule mapping and search results.

Attributes:

Name Type Description
points DataFrame

Normalised points DataFrame (time column).

results list[Any]

Original earthaccess result objects in search order. Passed directly to earthaccess.open() when executing the plan.

granules list[GranuleMeta]

:class:GranuleMeta for every unique granule returned by the search (parallel with results).

point_granule_map dict[Any, list[int]]

Maps each row index of points to a (possibly empty) list of indices into granules.

variables list[str]

Default variables to extract during :func:~point_collocation.matchup. Can be overridden by passing variables directly to :func:~point_collocation.matchup.

source_kwargs dict[str, Any]

earthaccess search kwargs used to build this plan.

time_buffer Timedelta

Temporal buffer that was applied when matching points to granules.

Source code in point_collocation/core/plan.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
@dataclass
class Plan:
    """A planned matchup: stores the point→granule mapping and search results.

    Attributes
    ----------
    points:
        Normalised points DataFrame (``time`` column).
    results:
        Original earthaccess result objects in search order.  Passed
        directly to ``earthaccess.open()`` when executing the plan.
    granules:
        :class:`GranuleMeta` for every unique granule returned by the
        search (parallel with *results*).
    point_granule_map:
        Maps each row index of *points* to a (possibly empty) list of
        indices into *granules*.
    variables:
        Default variables to extract during :func:`~point_collocation.matchup`.
        Can be overridden by passing ``variables`` directly to
        :func:`~point_collocation.matchup`.
    source_kwargs:
        earthaccess search kwargs used to build this plan.
    time_buffer:
        Temporal buffer that was applied when matching points to granules.
    """

    points: pd.DataFrame
    results: list[Any]
    granules: list[GranuleMeta]
    point_granule_map: dict[Any, list[int]]
    variables: list[str] = field(default_factory=list)
    source_kwargs: dict[str, Any] = field(default_factory=dict)
    time_buffer: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(0))

    # Original column names as detected in the user's points DataFrame, before
    # normalisation to the canonical "lat"/"lon"/"time" names.  Used for
    # transparent reporting in plan.open_dataset() and pc.matchup() output.
    pts_y_col_original: str = field(default=_CANONICAL_LAT)
    """Original latitude column name detected in the user's points DataFrame."""

    pts_x_col_original: str = field(default=_CANONICAL_LON)
    """Original longitude column name detected in the user's points DataFrame."""

    pts_time_col_original: str = field(default=_CANONICAL_TIME)
    """Original time column name detected in the user's points DataFrame."""

    # ------------------------------------------------------------------
    # Indexing — plan[0] returns a result object; plan[0:10] returns a
    # subset Plan restricted to the sliced points.
    # ------------------------------------------------------------------

    def __getitem__(self, idx: int | slice) -> "Plan | Any":
        """Return a subset :class:`Plan` or a single earthaccess result.

        Parameters
        ----------
        idx:
            * **Integer** — returns the earthaccess result object at that
              position (``self.results[idx]``), so that ``plan[0]`` can
              still be passed to :meth:`open_dataset`.
            * **Slice** — returns a new :class:`Plan` whose ``points``
              are the rows selected by the slice (``points.iloc[idx]``),
              with ``point_granule_map``, ``granules``, and ``results``
              filtered and re-indexed accordingly.  This allows users to
              test a subset of a large plan::

                  res = pc.matchup(plan[0:10], variables=["avw"])
        """
        if isinstance(idx, int):
            return self.results[idx]

        # --- Slice: subset by points ---
        subset_points = self.points.iloc[idx]
        subset_pt_indices = list(subset_points.index)

        # Collect granule indices (into self.granules) needed by the subset.
        needed_g_idx: list[int] = []
        seen_g: set[int] = set()
        for pt_idx in subset_pt_indices:
            for g_idx in self.point_granule_map.get(pt_idx, []):
                if g_idx not in seen_g:
                    needed_g_idx.append(g_idx)
                    seen_g.add(g_idx)
        needed_g_idx.sort()

        # Build re-index map: old granule index → new granule index.
        g_remap: dict[int, int] = {old: new for new, old in enumerate(needed_g_idx)}

        # New granules with corrected result_index (sequential from 0).
        new_granules = [
            GranuleMeta(
                granule_id=self.granules[old_g].granule_id,
                begin=self.granules[old_g].begin,
                end=self.granules[old_g].end,
                bbox=self.granules[old_g].bbox,
                result_index=new_g,
            )
            for new_g, old_g in enumerate(needed_g_idx)
        ]

        # New results list — only the results referenced by kept granules.
        new_results = [self.results[self.granules[old_g].result_index] for old_g in needed_g_idx]

        # New point_granule_map using re-indexed granule indices.
        new_pgm: dict[Any, list[int]] = {
            pt_idx: [g_remap[g] for g in self.point_granule_map.get(pt_idx, [])]
            for pt_idx in subset_pt_indices
        }

        return Plan(
            points=subset_points,
            results=new_results,
            granules=new_granules,
            point_granule_map=new_pgm,
            variables=list(self.variables),
            source_kwargs=dict(self.source_kwargs),
            time_buffer=self.time_buffer,
            pts_y_col_original=self.pts_y_col_original,
            pts_x_col_original=self.pts_x_col_original,
            pts_time_col_original=self.pts_time_col_original,
        )

    # ------------------------------------------------------------------
    # Dataset opening helpers
    # ------------------------------------------------------------------

    def open_dataset(
        self,
        result: "int | Any",
        open_method: "str | dict | None" = None,
        *,
        coord_spec: "dict | None" = None,
        silent: bool = False,
    ) -> "Any":
        """Open a single granule result as an :class:`xarray.Dataset` or DataTree.

        Parameters
        ----------
        result:
            An integer index into ``plan.results`` (e.g. ``0``), or a
            single earthaccess result object obtained via ``plan[n]``.
            Using an integer is preferred: ``plan.open_dataset(0)`` is
            equivalent to ``plan.open_dataset(plan[0])``.
        open_method:
            How to open the granule.  Accepts the same string presets or
            dict spec as :func:`~point_collocation.matchup`.  Defaults to
            ``"auto"`` (try dataset first, fall back to datatree merge).

            **String presets:**

            * ``"dataset"`` — open with ``xarray.open_dataset`` (flat NetCDF).
            * ``"datatree"`` — open as a DataTree with all groups; returns the
              raw :class:`xarray.DataTree` (or ``datatree.DataTree``) without
              merging groups.  Equivalent to ``xarray.open_datatree(f)``.
            * ``"datatree-merge"`` — open as DataTree and merge all groups into
              a flat Dataset.
            * ``"auto"`` *(default)* — probe the file first; if lat/lon can be
              detected via ``xr.open_dataset``, use that; otherwise fall back to
              ``"datatree-merge"``.  The printed spec shows the **resolved** mode.

            Pass open-function kwargs via the ``"open_kwargs"`` key of a
            dict spec, e.g.
            ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
        coord_spec:
            Coordinate specification controlling how axis/coordinate names are
            interpreted for both the source dataset and the points DataFrame.
            Defaults to auto-detection of lat/lon/time from standard name
            candidates.  See :data:`~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC`
            for the full structure.
        silent:
            When ``False`` (default), print the effective open_method spec
            actually used (after normalization and auto-resolution) and a
            geolocation summary line.
            Set to ``True`` to suppress this output.

        Returns
        -------
        xarray.Dataset or xarray.DataTree
            A flat :class:`xarray.Dataset` for all modes except
            ``open_method="datatree"`` (or a dict spec with
            ``xarray_open="datatree"`` and ``merge=None``), which returns the
            raw DataTree.
            The caller is responsible for closing the returned object when
            finished (e.g. ``ds.close()``).
        """
        if isinstance(result, int):
            n = len(self.results)
            if result < 0 or result >= n:
                raise IndexError(
                    f"result index {result} is out of range for a plan with {n} result(s). "
                    f"Valid indices are 0 to {n - 1}."
                )
            result = self.results[result]

        from point_collocation.core._open_method import (
            _apply_coord_spec_to_spec,
            _apply_coords,
            _build_effective_open_kwargs,
            _geoloc_description,
            _merge_datatree_with_spec,
            _normalize_open_method,
            _open_and_merge_dataset_groups,
            _open_datatree_fn,
            _resolve_auto_spec,
            _suppress_dask_progress,
        )
        from point_collocation.core._coord_spec import _normalize_coord_spec
        from point_collocation.core.engine import _find_time_dim

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        effective_open_method = "auto" if open_method is None else open_method
        spec = _normalize_open_method(effective_open_method)

        # Bridge coord_spec y/x sources into spec["coords"] with conflict detection.
        resolved_coord_spec = _normalize_coord_spec(coord_spec)
        spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

        xarray_open = spec.get("xarray_open", "dataset")
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
        if len(file_objs) != 1:
            raise RuntimeError(
                f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
            )
        file_obj = file_objs[0]

        # For "auto" mode, probe the file first so that the printed spec shows
        # the actual resolved mode (e.g. "dataset" or "datatree"), not "auto".
        # Any ValueError from _resolve_auto_spec (both probes failed) is
        # propagated to the caller rather than silently downgrading to an
        # empty-dataset fallback.
        if xarray_open == "auto":
            spec = _resolve_auto_spec(file_obj, spec)
            xarray_open = spec["xarray_open"]
            effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        if not silent:
            display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
            display_spec["open_kwargs"] = effective_kwargs
            display_spec.setdefault("merge", None)
            print(f"open_method: {display_spec!r}")
            reason = spec.get("_auto_switch_reason")
            if reason:
                print(f"open_method='auto' switched to 'datatree': {reason}")

        def _try_print_geoloc(
            ds: "xr.Dataset",
            spec: dict,
            *,
            silent: bool,
            plan: "Plan",
        ) -> None:
            """Print geolocation summary or a 'not found' note."""
            if silent:
                return
            try:
                ds_coord, lon_n, lat_n = _apply_coords(ds, spec)
                time_dim = _find_time_dim(ds_coord)
                print(
                    _geoloc_description(
                        ds_coord,
                        lon_n,
                        lat_n,
                        spec,
                        time_dim=time_dim,
                        pts_y_col=plan.pts_y_col_original,
                        pts_x_col=plan.pts_x_col_original,
                        pts_time_col=plan.pts_time_col_original,
                    )
                )
            except ValueError as exc:
                print(f"Geolocation: could not detect lat/lon in dataset — {exc}")

        def _promote_coords(ds: "xr.Dataset", spec: dict) -> "xr.Dataset":
            """Apply coord promotion from *spec* to *ds*, returning the updated dataset.

            Silently returns *ds* unchanged if geolocation detection fails —
            the user can still work with the dataset; they'll have seen the error
            from ``_try_print_geoloc`` if ``silent=False``.
            """
            try:
                ds_promoted, _, _ = _apply_coords(ds, spec)
                return ds_promoted
            except ValueError:
                return ds

        if xarray_open == "datatree":
            merge = spec.get("merge")
            if merge is None:
                # Return the raw DataTree without merging — like open_datatree(f).
                # Still try to detect geolocation from the DataTree's root node so
                # the user gets a summary even when groups are not merged.
                dt = _open_datatree_fn(file_obj, effective_kwargs)
                if not silent:
                    try:
                        root_ds = dt.to_dataset()  # type: ignore[union-attr]
                    except AttributeError:
                        root_ds = None
                    if root_ds is not None:
                        _try_print_geoloc(root_ds, spec, silent=False, plan=self)
                    else:
                        print("Geolocation: DataTree returned without merging — could not read root dataset.")
                return dt
            # merge is "all", "root", or a list: merge groups into a flat Dataset.
            dt = _open_datatree_fn(file_obj, effective_kwargs)
            try:
                ds = _merge_datatree_with_spec(dt, spec)
            finally:
                if hasattr(dt, "close"):
                    dt.close()
            _try_print_geoloc(ds, spec, silent=silent, plan=self)
            return _promote_coords(ds, spec)

        if xarray_open == "dataset":
            merge = spec.get("merge")
            if merge is not None:
                # Dataset-based group merge: open each group and merge.
                ds = _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
            else:
                with _suppress_dask_progress():
                    ds = xr.open_dataset(file_obj, **effective_kwargs)  # type: ignore[arg-type]
            _try_print_geoloc(ds, spec, silent=silent, plan=self)
            return _promote_coords(ds, spec)

        raise ValueError(
            f"open_method['xarray_open']={xarray_open!r} is not valid for open_dataset."
        )

    def open_mfdataset(
        self,
        results: "list[Any] | Plan",
        open_method: "str | dict | None" = None,
        *,
        silent: bool = False,
    ) -> "xr.Dataset":
        """Open multiple granule results as a single :class:`xarray.Dataset`.

        Parameters
        ----------
        results:
            A list of earthaccess result objects, or a :class:`Plan`
            (e.g. ``plan[0:2]``).  When a :class:`Plan` is passed its
            ``results`` attribute is used.
        open_method:
            How to open each granule.  ``"dataset"`` uses
            ``xarray.open_mfdataset`` across all file objects.
            ``"datatree-merge"`` opens each granule as a DataTree, merges
            its groups into a flat dataset, then concatenates all granules
            along a new ``granule`` dimension.  Defaults to ``"auto"``.
            Pass open-function kwargs via the ``"open_kwargs"`` key of a
            dict spec, e.g.
            ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
        silent:
            When ``False`` (default), print the effective open_method spec
            actually used (after normalization and defaults are applied).
            Set to ``True`` to suppress this output.

        Returns
        -------
        xarray.Dataset
        """
        # NOTE: unlike open_dataset, this previously imported
        # _open_as_flat_dataset without ever using it; the unused import
        # has been removed.
        from point_collocation.core._open_method import (
            _build_effective_open_kwargs,
            _merge_datatree_with_spec,
            _normalize_open_method,
            _open_and_merge_dataset_groups,
            _open_datatree_fn,
            _suppress_dask_progress,
        )

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        effective_open_method = "auto" if open_method is None else open_method
        spec = _normalize_open_method(effective_open_method)

        xarray_open = spec.get("xarray_open", "dataset")
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        if not silent:
            # Consistent with open_dataset: hide private ("_"-prefixed)
            # bookkeeping keys from the printed spec.
            display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
            display_spec["open_kwargs"] = effective_kwargs
            display_spec.setdefault("merge", None)
            print(f"open_method: {display_spec!r}")

        result_list = results.results if isinstance(results, Plan) else list(results)
        file_objs = earthaccess.open(result_list, pqdm_kwargs={"disable": True})

        if xarray_open == "datatree":
            # Open each granule as a DataTree, merge its groups, then
            # concatenate all granule datasets along a new "granule" dim.
            merged_datasets: list[xr.Dataset] = []
            for file_obj in file_objs:
                dt = _open_datatree_fn(file_obj, effective_kwargs)
                try:
                    merged_datasets.append(_merge_datatree_with_spec(dt, spec))
                finally:
                    if hasattr(dt, "close"):
                        dt.close()
            if not merged_datasets:
                return xr.Dataset()
            return xr.concat(merged_datasets, dim="granule")

        if xarray_open in ("dataset", "auto"):
            # For dataset mode with merge, open each granule's groups as
            # separate datasets and merge them, then concatenate all granules
            # along a new "granule" dimension.
            # Without merge, use xr.open_mfdataset for simplicity.
            merge = spec.get("merge")
            if merge is not None:
                merged_datasets = []
                for file_obj in file_objs:
                    merged_datasets.append(
                        _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
                    )
                if not merged_datasets:
                    return xr.Dataset()
                return xr.concat(merged_datasets, dim="granule")
            with _suppress_dask_progress():
                return xr.open_mfdataset(file_objs, **effective_kwargs)  # type: ignore[arg-type]

        raise ValueError(
            f"open_method['xarray_open']={xarray_open!r} is not valid for open_mfdataset."
        )

    # ------------------------------------------------------------------
    # Variable inspection (removed; use open_dataset(0) instead)
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------

    def summary(self, n: int | None = None) -> None:
        """Print a human-readable summary of the plan.

        Parameters
        ----------
        n:
            Number of points to show in the per-point section.
            Defaults to ``min(5, len(self.points))``.
            ``0`` or negative values suppress the per-point section.
        """
        if n is None:
            n = min(5, len(self.points))
        elif n < 0:
            n = 0

        zero_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) == 0
        )
        multi_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) > 1
        )

        matched_granule_count = len(
            {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
        )

        lines: list[str] = [
            f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
            f"  Points with 0 matches : {zero_match}",
            f"  Points with >1 matches: {multi_match}",
            f"  Time buffer: {self.time_buffer}",
        ]

        n_show = min(n, len(self.points))
        if n_show > 0:
            lines.append("")
            lines.append(f"First {n_show} point(s):")
            for pt_idx, row in self.points.head(n_show).iterrows():
                g_indices = self.point_granule_map.get(pt_idx, [])
                lines.append(
                    f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                    f"time={row['time']}: {len(g_indices)} match(es)"
                )
                for g_idx in g_indices:
                    lines.append(f"    → {self.granules[g_idx].granule_id}")

        print("\n".join(lines))

summary

summary(n: int | None = None) -> None

Print a human-readable summary of the plan.

Parameters:

Name Type Description Default
n int | None

Number of points to show in the per-point section. Defaults to min(5, len(self.points)). 0 or negative values suppress the per-point section.

None
Source code in point_collocation/core/plan.py
def summary(self, n: int | None = None) -> None:
    """Print a human-readable summary of the plan.

    Parameters
    ----------
    n:
        Number of points to show in the per-point section.
        Defaults to ``min(5, len(self.points))``.
        ``0`` or negative values suppress the per-point section.
    """
    if n is None:
        n = min(5, len(self.points))
    elif n < 0:
        n = 0

    zero_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) == 0
    )
    multi_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) > 1
    )

    matched_granule_count = len(
        {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
    )

    lines: list[str] = [
        f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
        f"  Points with 0 matches : {zero_match}",
        f"  Points with >1 matches: {multi_match}",
        f"  Time buffer: {self.time_buffer}",
    ]

    n_show = min(n, len(self.points))
    if n_show > 0:
        lines.append("")
        lines.append(f"First {n_show} point(s):")
        for pt_idx, row in self.points.head(n_show).iterrows():
            g_indices = self.point_granule_map.get(pt_idx, [])
            lines.append(
                f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                f"time={row['time']}: {len(g_indices)} match(es)"
            )
            for g_idx in g_indices:
                lines.append(f"    → {self.granules[g_idx].granule_id}")

    print("\n".join(lines))

open_dataset

open_dataset(result: 'int | Any', open_method: 'str | dict | None' = None, *, coord_spec: 'dict | None' = None, silent: bool = False) -> 'Any'

Open a single granule result as an :class:xarray.Dataset or DataTree.

Parameters:

Name Type Description Default
result 'int | Any'

An integer index into plan.results (e.g. 0), or a single earthaccess result object obtained via plan[n]. Using an integer is preferred: plan.open_dataset(0) is equivalent to plan.open_dataset(plan[0]).

required
open_method 'str | dict | None'

How to open the granule. Accepts the same string presets or dict spec as :func:~point_collocation.matchup. Defaults to "auto" (try dataset first, fall back to datatree merge).

String presets:

  • "dataset" — open with xarray.open_dataset (flat NetCDF).
  • "datatree" — open as a DataTree with all groups; returns the raw :class:xarray.DataTree (or datatree.DataTree) without merging groups. Equivalent to xarray.open_datatree(f).
  • "datatree-merge" — open as DataTree and merge all groups into a flat Dataset.
  • "auto" (default) — probe the file first; if lat/lon can be detected via xr.open_dataset, use that; otherwise fall back to "datatree-merge". The printed spec shows the resolved mode.

Pass open-function kwargs via the "open_kwargs" key of a dict spec, e.g. open_method={"open_kwargs": {"engine": "netcdf4"}}.

None
coord_spec 'dict | None'

Coordinate specification controlling how axis/coordinate names are interpreted for both the source dataset and the points DataFrame. Defaults to auto-detection of lat/lon/time from standard name candidates. See :data:~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC for the full structure.

None
silent bool

When False (default), print the effective open_method spec actually used (after normalization and auto-resolution) and a geolocation summary line. Set to True to suppress this output.

False

Returns:

Type Description
Dataset or DataTree

A flat :class:xarray.Dataset for all modes except open_method="datatree" (or a dict spec with xarray_open="datatree" and merge=None), which returns the raw DataTree. The caller is responsible for closing the returned object when finished (e.g. ds.close()).

Source code in point_collocation/core/plan.py
def open_dataset(
    self,
    result: "int | Any",
    open_method: "str | dict | None" = None,
    *,
    coord_spec: "dict | None" = None,
    silent: bool = False,
) -> "Any":
    """Open a single granule result as an :class:`xarray.Dataset` or DataTree.

    Parameters
    ----------
    result:
        An integer index into ``plan.results`` (e.g. ``0``), or a
        single earthaccess result object obtained via ``plan[n]``.
        Using an integer is preferred: ``plan.open_dataset(0)`` is
        equivalent to ``plan.open_dataset(plan[0])``.
    open_method:
        How to open the granule.  Accepts the same string presets or
        dict spec as :func:`~point_collocation.matchup`.  Defaults to
        ``"auto"`` (try dataset first, fall back to datatree merge).

        **String presets:**

        * ``"dataset"`` — open with ``xarray.open_dataset`` (flat NetCDF).
        * ``"datatree"`` — open as a DataTree with all groups; returns the
          raw :class:`xarray.DataTree` (or ``datatree.DataTree``) without
          merging groups.  Equivalent to ``xarray.open_datatree(f)``.
        * ``"datatree-merge"`` — open as DataTree and merge all groups into
          a flat Dataset.
        * ``"auto"`` *(default)* — probe the file first; if lat/lon can be
          detected via ``xr.open_dataset``, use that; otherwise fall back to
          ``"datatree-merge"``.  The printed spec shows the **resolved** mode.

        Pass open-function kwargs via the ``"open_kwargs"`` key of a
        dict spec, e.g.
        ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
    coord_spec:
        Coordinate specification controlling how axis/coordinate names are
        interpreted for both the source dataset and the points DataFrame.
        Defaults to auto-detection of lat/lon/time from standard name
        candidates.  See :data:`~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC`
        for the full structure.
    silent:
        When ``False`` (default), print the effective open_method spec
        actually used (after normalization and auto-resolution) and a
        geolocation summary line.
        Set to ``True`` to suppress this output.

    Returns
    -------
    xarray.Dataset or xarray.DataTree
        A flat :class:`xarray.Dataset` for all modes except
        ``open_method="datatree"`` (or a dict spec with
        ``xarray_open="datatree"`` and ``merge=None``), which returns the
        raw DataTree.
        The caller is responsible for closing the returned object when
        finished (e.g. ``ds.close()``).
    """
    # Resolve an integer index to its earthaccess result object up front.
    # Negative indices are rejected here (use plan[-1] for negative indexing).
    if isinstance(result, int):
        n = len(self.results)
        if result < 0 or result >= n:
            raise IndexError(
                f"result index {result} is out of range for a plan with {n} result(s). "
                f"Valid indices are 0 to {n - 1}."
            )
        result = self.results[result]

    # Imports are kept local to the method; earthaccess in particular is
    # treated as an optional dependency (see the ImportError below).
    from point_collocation.core._open_method import (
        _apply_coord_spec_to_spec,
        _apply_coords,
        _build_effective_open_kwargs,
        _geoloc_description,
        _merge_datatree_with_spec,
        _normalize_open_method,
        _open_and_merge_dataset_groups,
        _open_datatree_fn,
        _resolve_auto_spec,
        _suppress_dask_progress,
    )
    from point_collocation.core._coord_spec import _normalize_coord_spec
    from point_collocation.core.engine import _find_time_dim

    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    effective_open_method = "auto" if open_method is None else open_method
    spec = _normalize_open_method(effective_open_method)

    # Bridge coord_spec y/x sources into spec["coords"] with conflict detection.
    resolved_coord_spec = _normalize_coord_spec(coord_spec)
    spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

    xarray_open = spec.get("xarray_open", "dataset")
    effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    # Open exactly one file-like object for this granule; anything else is
    # a hard error rather than silently picking the first.
    file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
    if len(file_objs) != 1:
        raise RuntimeError(
            f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
        )
    file_obj = file_objs[0]

    # For "auto" mode, probe the file first so that the printed spec shows
    # the actual resolved mode (e.g. "dataset" or "datatree"), not "auto".
    # Any ValueError from _resolve_auto_spec (both probes failed) is
    # propagated to the caller rather than silently downgrading to an
    # empty-dataset fallback.
    if xarray_open == "auto":
        spec = _resolve_auto_spec(file_obj, spec)
        xarray_open = spec["xarray_open"]
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    if not silent:
        # Private ("_"-prefixed) bookkeeping keys are hidden from the
        # printed spec; the auto-switch reason is reported separately.
        display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
        display_spec["open_kwargs"] = effective_kwargs
        display_spec.setdefault("merge", None)
        print(f"open_method: {display_spec!r}")
        reason = spec.get("_auto_switch_reason")
        if reason:
            print(f"open_method='auto' switched to 'datatree': {reason}")

    def _try_print_geoloc(
        ds: "xr.Dataset",
        spec: dict,
        *,
        silent: bool,
        plan: "Plan",
    ) -> None:
        """Print geolocation summary or a 'not found' note."""
        if silent:
            return
        try:
            ds_coord, lon_n, lat_n = _apply_coords(ds, spec)
            time_dim = _find_time_dim(ds_coord)
            print(
                _geoloc_description(
                    ds_coord,
                    lon_n,
                    lat_n,
                    spec,
                    time_dim=time_dim,
                    pts_y_col=plan.pts_y_col_original,
                    pts_x_col=plan.pts_x_col_original,
                    pts_time_col=plan.pts_time_col_original,
                )
            )
        except ValueError as exc:
            print(f"Geolocation: could not detect lat/lon in dataset — {exc}")

    def _promote_coords(ds: "xr.Dataset", spec: dict) -> "xr.Dataset":
        """Apply coord promotion from *spec* to *ds*, returning the updated dataset.

        Silently returns *ds* unchanged if geolocation detection fails —
        the user can still work with the dataset; they'll have seen the error
        from ``_try_print_geoloc`` if ``silent=False``.
        """
        try:
            ds_promoted, _, _ = _apply_coords(ds, spec)
            return ds_promoted
        except ValueError:
            return ds

    if xarray_open == "datatree":
        merge = spec.get("merge")
        if merge is None:
            # Return the raw DataTree without merging — like open_datatree(f).
            # Still try to detect geolocation from the DataTree's root node so
            # the user gets a summary even when groups are not merged.
            dt = _open_datatree_fn(file_obj, effective_kwargs)
            if not silent:
                try:
                    root_ds = dt.to_dataset()  # type: ignore[union-attr]
                except AttributeError:
                    root_ds = None
                if root_ds is not None:
                    _try_print_geoloc(root_ds, spec, silent=False, plan=self)
                else:
                    print("Geolocation: DataTree returned without merging — could not read root dataset.")
            return dt
        # merge is "all", "root", or a list: merge groups into a flat Dataset.
        dt = _open_datatree_fn(file_obj, effective_kwargs)
        try:
            ds = _merge_datatree_with_spec(dt, spec)
        finally:
            # The DataTree is closed even when the merge fails.
            if hasattr(dt, "close"):
                dt.close()
        _try_print_geoloc(ds, spec, silent=silent, plan=self)
        return _promote_coords(ds, spec)

    if xarray_open == "dataset":
        merge = spec.get("merge")
        if merge is not None:
            # Dataset-based group merge: open each group and merge.
            ds = _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
        else:
            with _suppress_dask_progress():
                ds = xr.open_dataset(file_obj, **effective_kwargs)  # type: ignore[arg-type]
        _try_print_geoloc(ds, spec, silent=silent, plan=self)
        return _promote_coords(ds, spec)

    raise ValueError(
        f"open_method['xarray_open']={xarray_open!r} is not valid for open_dataset."
    )

open_mfdataset

open_mfdataset(results: 'list[Any] | Plan', open_method: 'str | dict | None' = None, *, silent: bool = False) -> 'xr.Dataset'

Open multiple granule results as a single :class:xarray.Dataset.

Parameters:

Name Type Description Default
results 'list[Any] | Plan'

A list of earthaccess result objects, or a :class:Plan (e.g. plan[0:2]). When a :class:Plan is passed its results attribute is used.

required
open_method 'str | dict | None'

How to open each granule. "dataset" uses xarray.open_mfdataset across all file objects. "datatree-merge" opens each granule as a DataTree, merges its groups into a flat dataset, then concatenates all granules along a new granule dimension. Defaults to "auto". Pass open-function kwargs via the "open_kwargs" key of a dict spec, e.g. open_method={"open_kwargs": {"engine": "netcdf4"}}.

None
silent bool

When False (default), print the effective open_method spec actually used (after normalization and defaults are applied). Set to True to suppress this output.

False

Returns:

Type Description
Dataset
Source code in point_collocation/core/plan.py
def open_mfdataset(
    self,
    results: "list[Any] | Plan",
    open_method: "str | dict | None" = None,
    *,
    silent: bool = False,
) -> "xr.Dataset":
    """Open several granule results together as one :class:`xarray.Dataset`.

    Parameters
    ----------
    results:
        Either a list of earthaccess result objects or a :class:`Plan`
        (e.g. ``plan[0:2]``).  A :class:`Plan` contributes its
        ``results`` attribute.
    open_method:
        Strategy for opening each granule.  ``"dataset"`` delegates to
        ``xarray.open_mfdataset`` over all file objects.
        ``"datatree-merge"`` opens every granule as a DataTree, flattens
        its groups into a dataset, and concatenates the granules along a
        new ``granule`` dimension.  ``None`` means ``"auto"``.
        Open-function kwargs go under the ``"open_kwargs"`` key of a
        dict spec, e.g.
        ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
    silent:
        ``False`` (default) prints the effective spec actually applied
        (after normalization and defaults); ``True`` suppresses it.

    Returns
    -------
    xarray.Dataset
    """
    from point_collocation.core._open_method import (
        _build_effective_open_kwargs,
        _merge_datatree_with_spec,
        _normalize_open_method,
        _open_and_merge_dataset_groups,
        _open_as_flat_dataset,
        _open_datatree_fn,
        _suppress_dask_progress,
    )

    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    # ``None`` is shorthand for "auto"; normalization yields a dict spec.
    spec = _normalize_open_method("auto" if open_method is None else open_method)

    open_mode = spec.get("xarray_open", "dataset")
    eff_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    if not silent:
        shown = {**spec, "open_kwargs": eff_kwargs}
        shown.setdefault("merge", None)
        print(f"open_method: {shown!r}")

    granules = results.results if isinstance(results, Plan) else list(results)
    handles = earthaccess.open(granules, pqdm_kwargs={"disable": True})

    if open_mode == "datatree":
        # One flattened dataset per granule, each DataTree closed as soon
        # as its merge completes; stack everything on a new "granule" dim.
        flattened: list[xr.Dataset] = []
        for handle in handles:
            tree = _open_datatree_fn(handle, eff_kwargs)
            try:
                flattened.append(_merge_datatree_with_spec(tree, spec))
            finally:
                if hasattr(tree, "close"):
                    tree.close()
        if not flattened:
            return xr.Dataset()
        return xr.concat(flattened, dim="granule")

    if open_mode in ("dataset", "auto"):
        # With a merge spec: open each granule's groups individually,
        # merge them, then concatenate all granules along "granule".
        # Without one, a single xr.open_mfdataset call handles everything.
        if spec.get("merge") is not None:
            flattened = [
                _open_and_merge_dataset_groups(handle, spec, eff_kwargs)
                for handle in handles
            ]
            if not flattened:
                return xr.Dataset()
            return xr.concat(flattened, dim="granule")
        with _suppress_dask_progress():
            return xr.open_mfdataset(handles, **eff_kwargs)  # type: ignore[arg-type]

    raise ValueError(
        f"open_method['xarray_open']={open_mode!r} is not valid for open_mfdataset."
    )

IO / Adapters

point_collocation.adapters

Source adapters that normalise heterogeneous inputs into the SourceProtocol.

Built-in adapters

earthaccess : wraps file-like objects returned by earthaccess.open()

Future adapters (not yet implemented)

stac : STAC item assets url : plain HTTPS URLs local : local file paths

SourceAdapter

Bases: ABC

Abstract base for source adapters.

Subclass this to add support for a new data source. The core engine only calls :meth:open_dataset; everything else is internal to the adapter.

Source code in point_collocation/adapters/base.py
class SourceAdapter(ABC):
    """Abstract base class that every source adapter derives from.

    To add support for a new data source, subclass this and implement
    :meth:`open_dataset` — the only entry point the core engine ever
    uses; any other machinery remains private to the adapter.
    """

    @abstractmethod
    def open_dataset(self, **kwargs: object) -> object:
        """Open this source and return it as an ``xarray.Dataset``.

        Parameters
        ----------
        **kwargs:
            Passed through unchanged to ``xarray.open_dataset``.
        """
        raise NotImplementedError  # pragma: no cover

open_dataset abstractmethod

open_dataset(**kwargs: object) -> object

Return an xarray.Dataset for this source.

Parameters:

Name Type Description Default
**kwargs object

Forwarded verbatim to xarray.open_dataset.

{}
Source code in point_collocation/adapters/base.py
@abstractmethod
def open_dataset(self, **kwargs: object) -> object:
    """Open this source and return it as an ``xarray.Dataset``.

    Parameters
    ----------
    **kwargs:
        Passed through unchanged to ``xarray.open_dataset``.
    """
    raise NotImplementedError  # pragma: no cover

point_collocation.core._granule

Helpers for working with individual granules (source files).

Responsibilities
  • Extract a human-readable identifier from an arbitrary source object.
  • Parse the temporal coverage (start/end date) from a NASA-style L3 granule filename.
Supported filename conventions

YYYYDOY — single day (DOY = day-of-year, 001–366) YYYYDOY_YYYYDOY — multi-day range (e.g., 8-day composites, monthly) YYYYMMDD — single day in calendar format YYYYMMDD_YYYYMMDD — multi-day range in calendar format

The period keyword embedded in the filename (.DAY., .8D., .MO.) is used to infer the end date when only a start date is present.

Examples of supported filenames
  • PACE_OCI_2024070.L3m.DAY.RRS.Rrs_412.4km.nc
  • PACE_OCI_2024049_2024056.L3m.8D.CHL.chlor_a.9km.nc
  • AQUA_MODIS.20230601.L3m.DAY.SST.sst.4km.nc
  • AQUA_MODIS.20230601_20230630.L3m.MO.CHL.chlor_a.9km.nc

get_source_id

get_source_id(source: object) -> str

Return a human-readable identifier (basename) for source.

Tries, in order:

  1. pathlib.Path → path.name
  2. Plain str → os.path.basename(source)
  3. Object with a .path or .name string attribute
  4. str(source) as last resort
Source code in point_collocation/core/_granule.py
def get_source_id(source: object) -> str:
    """Return a human-readable identifier (basename) for *source*.

    Resolution order:

    1. ``pathlib.Path`` → ``path.name``
    2. Plain ``str`` → ``os.path.basename(source)``
    3. Object exposing a non-empty ``.path`` or ``.name`` string attribute
    4. ``str(source)`` as last resort
    """
    if isinstance(source, pathlib.Path):
        return source.name
    if isinstance(source, str):
        return os.path.basename(source)
    # First non-empty string attribute wins, ``path`` before ``name``.
    found = next(
        (
            value
            for value in (getattr(source, attr, None) for attr in ("path", "name"))
            if isinstance(value, str) and value
        ),
        None,
    )
    if found is not None:
        return os.path.basename(found)
    return str(source)

parse_temporal_range

parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]

Return (start, end) timestamps for the granule named filename.

Only the basename of filename is examined.

Parameters:

Name Type Description Default
filename str

File path or basename.

required

Returns:

Type Description
tuple[Timestamp, Timestamp]

Inclusive start and end dates (time component is midnight UTC).

Raises:

Type Description
ValueError

If no recognisable date pattern is found in filename.

Source code in point_collocation/core/_granule.py
def parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]:
    """Return ``(start, end)`` timestamps for the granule named *filename*.

    Only the basename of *filename* is examined.

    Parameters
    ----------
    filename:
        File path or basename.

    Returns
    -------
    tuple[pandas.Timestamp, pandas.Timestamp]
        Inclusive start and end dates (time component is midnight UTC).

    Raises
    ------
    ValueError
        If no recognisable date pattern is found in *filename*.
    """
    basename = os.path.basename(filename)

    # (regex, strptime format) candidates, tried strictly in this order.
    # Two-group patterns carry an explicit start and end; one-group
    # patterns carry only a start whose end is inferred from the period
    # keyword (.DAY., .8D., .MO., ...) embedded in the filename.
    candidates = (
        (r"(?<!\d)(\d{7})_(\d{7})(?!\d)", "%Y%j"),        # DOY pair
        (r"(?<!\d)(20\d{6})_(20\d{6})(?!\d)", "%Y%m%d"),  # calendar pair
        (r"(?<!\d)(\d{7})(?!\d)", "%Y%j"),                # single DOY
        (r"(?<!\d)(20\d{6})(?!\d)", "%Y%m%d"),            # single calendar
    )
    for pattern, fmt in candidates:
        m = re.search(pattern, basename)
        if m is None:
            continue
        try:
            start = datetime.strptime(m.group(1), fmt)
            if len(m.groups()) == 2:
                end = datetime.strptime(m.group(2), fmt)
            else:
                end = _infer_end_date(start, basename)
            return pd.Timestamp(start), pd.Timestamp(end)
        except ValueError:
            # Digits matched but are not a valid date; fall through to
            # the next, less specific pattern.
            continue

    raise ValueError(
        f"Cannot parse temporal range from filename: {basename!r}"
    )