Skip to content

API Reference

Auto-generated from source docstrings.


point_collocation

Top-level convenience imports:

import point_collocation as pc

pc.plan(...)     # build a matchup plan
pc.matchup(...)  # execute the plan

Core

plan

point_collocation.core.plan.plan

plan(points: PointsFrame, *, data_source: str = 'earthaccess', source_kwargs: dict[str, Any] | None = None, time_buffer: str | Timedelta | timedelta | int = '0h') -> Plan

Build a :class:Plan previewing which granules cover each point.

Parameters:

Name Type Description Default
points PointsFrame

DataFrame with at minimum lat, lon, and time (or date as an alias). If the column is named date and contains date-only values, the time-of-day is set to noon (12:00 UTC) for matching purposes.

An optional pc_id column may be included to supply custom point identifiers. If present, these values must be unique; duplicate pc_id values raise a :class:ValueError. Any additional columns beyond lat, lon, time, and pc_id are preserved and included in the output returned by :func:~point_collocation.matchup.

required
data_source str

Data source to search. Currently only "earthaccess" is supported.

'earthaccess'
source_kwargs dict[str, Any] | None

Keyword arguments forwarded to earthaccess.search_data(). Must contain at least one of "short_name", "concept_id", or "doi". The special keys "access" and "in_region" are not forwarded to search_data(); instead they are passed to result.data_links() on every returned granule to control which link type is used (e.g. "access": "direct" for S3). Granules whose data_links() returns an empty list for the given kwargs are silently excluded from the plan.

None
time_buffer str | Timedelta | timedelta | int

Extra temporal margin when matching a point to a granule. A point at time t matches a granule whose coverage is [begin, end] if begin - buffer ≤ t ≤ end + buffer. Accepts a :class:pandas.Timedelta, :class:datetime.timedelta, or a pandas-parseable string ("12H", "30min", …). Default is "0h" (exact overlap required).

'0h'

Returns:

Type Description
Plan

The planning object; inspect with :meth:Plan.summary and execute with :func:~point_collocation.matchup.

Raises:

Type Description
ValueError

If points is missing required columns, data_source is not recognised, source_kwargs does not contain at least one of "short_name", "concept_id", or "doi", or the pc_id column contains duplicate values.

ImportError

If the earthaccess package is not installed.

Source code in point_collocation/core/plan.py
def plan(
    points: PointsFrame,
    *,
    data_source: str = "earthaccess",
    source_kwargs: dict[str, Any] | None = None,
    time_buffer: str | pd.Timedelta | datetime.timedelta | int = "0h",
) -> Plan:
    """Plan a point-to-granule matchup without downloading any data.

    Searches the chosen data source for granules whose temporal coverage
    (optionally widened by *time_buffer*) overlaps each point's time, and
    records the resulting point→granule mapping in a :class:`Plan`.

    Parameters
    ----------
    points:
        DataFrame with at least ``lat``, ``lon``, and ``time`` columns
        (``date`` is accepted as an alias for ``time``; date-only values
        are matched at 12:00 UTC).  An optional ``pc_id`` column supplies
        custom point identifiers and must contain unique values.  Any
        further columns are carried through unchanged to the output of
        :func:`~point_collocation.matchup`.
    data_source:
        Which backend to search.  ``"earthaccess"`` is the only value
        currently accepted.
    source_kwargs:
        Forwarded to ``earthaccess.search_data()``; must include at least
        one of ``"short_name"``, ``"concept_id"``, or ``"doi"``.  The
        special keys ``"access"`` and ``"in_region"`` are *not* forwarded
        to ``search_data()``; they are routed to ``result.data_links()``
        on each returned granule to select the link type (e.g.
        ``"access": "direct"`` for S3).  Granules whose ``data_links()``
        comes back empty for those kwargs are silently dropped.
    time_buffer:
        Temporal slack applied when matching: a point at time *t* matches
        coverage ``[begin, end]`` when ``begin - buffer ≤ t ≤ end + buffer``.
        Accepts a :class:`pandas.Timedelta`, :class:`datetime.timedelta`,
        or a pandas-parseable string (``"12H"``, ``"30min"``, …).
        Defaults to ``"0h"`` (exact overlap).

    Returns
    -------
    Plan
        The planning object; inspect it with :meth:`Plan.summary` and
        execute it with :func:`~point_collocation.matchup`.

    Raises
    ------
    ValueError
        On missing required point columns, an unrecognised *data_source*,
        a *source_kwargs* lacking every accepted search identifier, or
        duplicate ``pc_id`` values.
    ImportError
        If the ``earthaccess`` package is not installed.
    """
    # Reject unsupported backends up front, before touching the points.
    if data_source != "earthaccess":
        raise ValueError(
            f"Unknown data_source {data_source!r}. "
            "Currently only 'earthaccess' is supported."
        )

    # Normalise user column names to the canonical lat/lon/time, remembering
    # the originals for transparent reporting later.
    normalised, y_col, x_col, t_col = _plan_normalise_columns(points)
    _plan_validate_points(normalised)

    parsed_buffer = _parse_time_buffer(time_buffer)
    search_results, metas = _search_earthaccess(
        normalised, source_kwargs=source_kwargs
    )
    mapping = _match_points_to_granules(normalised, metas, parsed_buffer)

    return Plan(
        points=normalised,
        results=search_results,
        granules=metas,
        point_granule_map=mapping,
        source_kwargs={} if source_kwargs is None else dict(source_kwargs),
        time_buffer=parsed_buffer,
        pts_y_col_original=y_col,
        pts_x_col_original=x_col,
        pts_time_col_original=t_col,
    )

matchup

point_collocation.core.engine.matchup

matchup(plan: 'Plan', *, open_method: str | dict | None = None, variables: list[str] | None = None, spatial_method: str | None = None, open_dataset_kwargs: dict | None = None, coord_spec: dict | None = None, silent: bool = True, batch_size: int | None = None, save_dir: str | PathLike | None = None, granule_range: tuple[int, int] | None = None) -> pd.DataFrame

Extract variables from cloud-hosted granules at the given points.

Parameters:

Name Type Description Default
plan 'Plan'

A :class:~point_collocation.core.plan.Plan object previously built with :func:~point_collocation.plan. Data source and search parameters are taken from the plan. One output row is produced per (point, granule) pair; points with zero matching granules produce a single NaN row.

required
open_method str | dict | None

How granules are opened. Accepts a string preset or a dict spec.

String presets:

  • "dataset" — open with xarray.open_dataset (fast path for typical flat NetCDF files).
  • "datatree" — open as a raw DataTree without merging groups.
  • "datatree-merge" — open as DataTree and merge all groups into a flat Dataset (for grouped/HDF5-ish files).
  • "auto" (default) — try the fast "dataset" path first; if lat/lon coordinates cannot be detected, fall back to "datatree-merge" automatically.

Dict spec (advanced):

.. code-block:: python

open_method = {
    "xarray_open":           "dataset" | "datatree",
    "open_kwargs":           {},
    "merge":                 None | "all" | "root" | ["/path/a"],
    "merge_kwargs":          {},
    "coords":                "auto" | ["Lat", "Lon"] | {"lat": "...", "lon": "..."},
    "set_coords":            True,
    "dim_renames":           None | {"node": {"old": "new"}},
    "auto_align_phony_dims": None | "safe",
}

All keys are optional; missing keys receive sensible defaults. Unknown keys raise :exc:ValueError.

Pre-defined profiles for common products are importable from :mod:point_collocation.profiles (e.g. pace_l3, pace_l2).

None
variables list[str] | None

Variable names to extract from each granule. When provided, overrides any variables stored on the plan. When omitted, falls back to plan.variables. If the resolved list is empty, the output will have no variable columns. Raises :exc:ValueError if a requested variable is not found in the opened dataset.

None
spatial_method str | None

Method used for spatial matching.

  • "auto" (default) — automatically selects the best method based on the dimensionality of the geolocation coordinates:

  • 1-D coordinates (regular/gridded data, both lat and lon are 1-D): uses "axis" (vectorised ds.sel(..., method="nearest") over all points at once). If "axis" fails for any reason, falls back to "kdtree" automatically.

  • 2-D coordinates (irregular/swath data, or either coordinate is 2-D): uses "kdtree".

xoak-kdtree and xoak-haversine are never selected automatically; set them explicitly if needed.

  • "axis" — Vectorised ds.sel(..., method="nearest") for all points in a single call. Requires 1-D (regular-grid) coordinate arrays for both latitude and longitude; raises :exc:ValueError with a suggestion to use "auto" or "kdtree" for 2-D coordinates.
  • "kdtree" — xarray's built-in :class:xarray.indexes.NDPointIndex with the default ScipyKDTreeAdapter. Works with both 1-D and 2-D coordinate arrays (requires scipy).
  • "xoak-kdtree" — the xoak package's SklearnKDTreeAdapter. Works with both 1-D and 2-D arrays (requires xoak and scikit-learn).
  • "xoak-haversine" — the xoak package's SklearnGeoBallTreeAdapter, which uses the haversine metric for accurate great-circle distance calculations. Recommended for data near the poles where the Euclidean k-d tree used by "xoak-kdtree" can return incorrect nearest neighbours due to coordinate distortion. Works with both 1-D and 2-D arrays (requires xoak and scikit-learn). Lat/lon values are passed in degrees; the adapter converts them to radians internally.
None
open_dataset_kwargs dict | None

Optional dictionary of keyword arguments forwarded to the xarray open function for every granule opened during the run. These override any "open_kwargs" in open_method; defaults are applied only for keys left unspecified (chunks → {}, engine → "h5netcdf", decode_timedelta → False).

None
coord_spec dict | None

Coordinate specification controlling how axis/coordinate names are interpreted for both the source dataset and the points DataFrame. Defaults to auto-detection of lat/lon/time from standard name candidates. Example usage with non-standard variable names and optional additional axes::

coord_spec = {
    "coordinate_system": "geographic",
    "y":    {"source": "grid_lat", "points": "lat"},
    "x":    {"source": "grid_lon", "points": "lon"},
    "time": {"source": "auto",      "points": "auto"},
    # optional additional axes:
    "depth":      {"source": "z",          "points": "depth"},
    "wavelength": {"source": "wavelength", "points": "wave"},
}

The source key is the variable/coordinate name in the source dataset; points is the column name in the points DataFrame. Set either to "auto" for standard-name auto-detection.

If coord_spec specifies source for y/x and open_method['coords'] also specifies explicit names, a :exc:ValueError is raised when they conflict. Set one side to "auto" to let the other take precedence.

Additional axes (beyond time) are optional; if the configured column is absent from the points DataFrame the axis is silently skipped.

None
silent bool

When True (default), all progress output is suppressed. Set to False to print a progress message to stdout after every batch_size granules.

True
batch_size int | None

Number of granules to process between progress reports (and between intermediate saves when save_dir is set). Defaults to None, which sets the batch size to one more than the total number of matched granules so that all granules are processed in a single batch.

None
save_dir str | PathLike | None

Directory in which intermediate results are saved as Parquet files after each batch of batch_size granules. The directory is created automatically if it does not exist. Each batch is saved as plan_<first>_<last>.parquet where first and last are the granule indices from the plan. When None (default), no intermediate files are written.

None
granule_range tuple[int, int] | None

Optional (start, end) tuple (both 1-based and inclusive) that restricts processing to a contiguous slice of the matched granules, ordered by granule index. For example, granule_range=(261, 620) resumes from granule 261 after a crash that completed granules 1–260. Progress messages continue to report absolute granule numbers (e.g. "granules 261-270 of 620 processed") so the output is directly comparable with messages from the original run. When None (default), all matched granules are processed.

None

Returns:

Type Description
DataFrame

One row per (point, granule) pair. In addition to the original point columns and one column per requested variable, the output always includes:

pc_id — Point identifier. If the input dataframe contains a pc_id column those values are preserved as-is; otherwise the row index from the input dataframe is used. Duplicate pc_id values in the input are not allowed and raise a ValueError during planning.

granule_id — Identifier of the granule that provided this row's values.

granule_lat — Latitude of the matched location inside the granule (i.e. the nearest-neighbour grid or swath position).

granule_lon — Longitude of the matched location inside the granule.

granule_time — Midpoint of the granule's temporal coverage, derived from the granule metadata (begin + (end - begin) / 2). For earthaccess granules, temporal information is stored in the search result metadata rather than in the dataset itself. For zero-match rows, this column is pandas.NaT.

Any extra columns present in the input dataframe are retained in the output. Points with zero matching granules contribute a single NaN row. The output is sorted to match the pc_id order from the input dataframe.

Raises:

Type Description
ValueError

If open_method is a string that is not a valid preset, or a dict with unknown keys or an invalid "xarray_open" value.

ValueError

If a requested variable is not present in an opened dataset.

ValueError

If geolocation variables cannot be detected unambiguously.

ValueError

If granule_range is not a 2-tuple of positive integers with start <= end, or if either bound exceeds the number of matched granules in the plan.

ImportError

If spatial_method="xoak-kdtree" and the xoak package is not installed.

ImportError

If spatial_method="xoak-haversine" and the xoak package is not installed.

ImportError

If spatial_method="kdtree" and scipy is not installed.

Source code in point_collocation/core/engine.py
def matchup(
    plan: "Plan",
    *,
    open_method: str | dict | None = None,
    variables: list[str] | None = None,
    spatial_method: str | None = None,
    open_dataset_kwargs: dict | None = None,
    coord_spec: dict | None = None,
    silent: bool = True,
    batch_size: int | None = None,
    save_dir: str | os.PathLike | None = None,
    granule_range: tuple[int, int] | None = None,
) -> pd.DataFrame:
    """Extract variables from cloud-hosted granules at the given points.

    Parameters
    ----------
    plan:
        A :class:`~point_collocation.core.plan.Plan` object previously
        built with :func:`~point_collocation.plan`.  Data source and
        search parameters are taken from the plan.  One output row is
        produced per (point, granule) pair; points with zero matching
        granules produce a single NaN row.
    open_method:
        How granules are opened.  Accepts a string preset or a dict spec.

        **String presets:**

        * ``"dataset"`` — open with ``xarray.open_dataset`` (fast path for
          typical flat NetCDF files).
        * ``"datatree"`` — open as a raw DataTree without merging groups.
        * ``"datatree-merge"`` — open as DataTree and merge all groups into
          a flat Dataset (for grouped/HDF5-ish files).
        * ``"auto"`` *(default)* — try the fast ``"dataset"`` path first; if
          lat/lon coordinates cannot be detected, fall back to
          ``"datatree-merge"`` automatically.

        **Dict spec** (advanced):

        .. code-block:: python

            open_method = {
                "xarray_open":           "dataset" | "datatree",
                "open_kwargs":           {},
                "merge":                 None | "all" | "root" | ["/path/a"],
                "merge_kwargs":          {},
                "coords":                "auto" | ["Lat", "Lon"] | {"lat": "...", "lon": "..."},
                "set_coords":            True,
                "dim_renames":           None | {"node": {"old": "new"}},
                "auto_align_phony_dims": None | "safe",
            }

        All keys are optional; missing keys receive sensible defaults.
        Unknown keys raise :exc:`ValueError`.

        Pre-defined profiles for common products are importable from
        :mod:`point_collocation.profiles` (e.g. ``pace_l3``, ``pace_l2``).
    variables:
        Variable names to extract from each granule.  When provided,
        overrides any variables stored on the plan.  When omitted,
        falls back to ``plan.variables``.  If the resolved list is
        empty, the output will have no variable columns.
        Raises :exc:`ValueError` if a requested variable is not found
        in the opened dataset.
    spatial_method:
        Method used for spatial matching.

        * ``"auto"`` *(default)* — automatically selects the best method
          based on the dimensionality of the geolocation coordinates:

          - **1-D coordinates** (regular/gridded data, both lat and lon
            are 1-D): uses ``"axis"`` (vectorised
            ``ds.sel(..., method="nearest")`` over all points at once).
            If ``"axis"`` fails for any reason, falls back to ``"kdtree"``
            automatically.
          - **2-D coordinates** (irregular/swath data, or either coordinate
            is 2-D): uses ``"kdtree"``.

          ``xoak-kdtree`` and ``xoak-haversine`` are never selected
          automatically; set them explicitly if needed.

        * ``"axis"`` — Vectorised ``ds.sel(..., method="nearest")`` for all
          points in a single call.  Requires 1-D (regular-grid) coordinate
          arrays for both latitude and longitude; raises :exc:`ValueError`
          with a suggestion to use ``"auto"`` or ``"kdtree"`` for 2-D
          coordinates.
        * ``"kdtree"`` — xarray's built-in
          :class:`xarray.indexes.NDPointIndex` with the default
          ``ScipyKDTreeAdapter``.  Works with both 1-D and 2-D coordinate
          arrays (requires ``scipy``).
        * ``"xoak-kdtree"`` — the ``xoak`` package's ``SklearnKDTreeAdapter``.
          Works with both 1-D and 2-D arrays (requires ``xoak`` and
          ``scikit-learn``).
        * ``"xoak-haversine"`` — the ``xoak`` package's
          ``SklearnGeoBallTreeAdapter``, which uses the haversine metric for
          accurate great-circle distance calculations.  Recommended for data
          near the poles where the Euclidean k-d tree used by
          ``"xoak-kdtree"`` can return incorrect nearest neighbours due to
          coordinate distortion.  Works with both 1-D and 2-D arrays
          (requires ``xoak`` and ``scikit-learn``).  Lat/lon values are
          passed in degrees; the adapter converts them to radians internally.
    open_dataset_kwargs:
        Optional dictionary of keyword arguments forwarded to the xarray
        open function for every granule opened during the run.  These
        override any ``"open_kwargs"`` in *open_method*; defaults are
        applied only for keys left unspecified
        (``chunks`` → ``{}``, ``engine`` → ``"h5netcdf"``,
        ``decode_timedelta`` → ``False``).
    coord_spec:
        Coordinate specification controlling how axis/coordinate names are
        interpreted for both the source dataset and the points DataFrame.
        Defaults to auto-detection of lat/lon/time from standard name
        candidates.  Example usage with non-standard variable names and
        optional additional axes::

            coord_spec = {
                "coordinate_system": "geographic",
                "y":    {"source": "grid_lat", "points": "lat"},
                "x":    {"source": "grid_lon", "points": "lon"},
                "time": {"source": "auto",      "points": "auto"},
                # optional additional axes:
                "depth":      {"source": "z",          "points": "depth"},
                "wavelength": {"source": "wavelength", "points": "wave"},
            }

        The ``source`` key is the variable/coordinate name in the source
        dataset; ``points`` is the column name in the points DataFrame.
        Set either to ``"auto"`` for standard-name auto-detection.

        If ``coord_spec`` specifies ``source`` for ``y``/``x`` and
        ``open_method['coords']`` also specifies explicit names, a
        :exc:`ValueError` is raised when they conflict.  Set one side to
        ``"auto"`` to let the other take precedence.

        Additional axes (beyond time) are optional; if the configured column
        is absent from the points DataFrame the axis is silently skipped.
    silent:
        When ``True`` (default), all progress output is suppressed.
        Set to ``False`` to print a progress message to stdout after
        every *batch_size* granules.
    batch_size:
        Number of granules to process between progress reports (and
        between intermediate saves when *save_dir* is set).  Defaults
        to ``None``, which sets the batch size to one more than the
        total number of matched granules so that all granules are
        processed in a single batch.
    save_dir:
        Directory in which intermediate results are saved as Parquet
        files after each batch of *batch_size* granules.  The directory
        is created automatically if it does not exist.  Each batch is
        saved as ``plan_<first>_<last>.parquet`` where *first* and
        *last* are the granule indices from the plan.  When ``None``
        (default), no intermediate files are written.
    granule_range:
        Optional ``(start, end)`` tuple (both **1-based and inclusive**)
        that restricts processing to a contiguous slice of the matched
        granules, ordered by granule index.  For example,
        ``granule_range=(261, 620)`` resumes from granule 261 after a
        crash that completed granules 1–260.  Progress messages continue
        to report absolute granule numbers (e.g.
        "granules 261-270 of 620 processed") so the output is directly
        comparable with messages from the original run.  When ``None``
        (default), all matched granules are processed.

    Returns
    -------
    pandas.DataFrame
        One row per (point, granule) pair.  In addition to the original
        point columns and one column per requested variable, the output
        always includes:

        ``pc_id``
            Point identifier.  If the input dataframe contains a ``pc_id``
            column those values are preserved as-is; otherwise the row
            index from the input dataframe is used.  Duplicate ``pc_id``
            values in the input are not allowed and raise a
            :class:`ValueError` during planning.
        ``granule_id``
            Identifier of the granule that provided this row's values.
        ``granule_lat``
            Latitude of the matched location inside the granule (i.e.
            the nearest-neighbour grid or swath position).
        ``granule_lon``
            Longitude of the matched location inside the granule.
        ``granule_time``
            Midpoint of the granule's temporal coverage, derived from
            the granule metadata (``begin + (end - begin) / 2``).  For
            earthaccess granules, temporal information is stored in the
            search result metadata rather than in the dataset itself.
            For zero-match rows, this column is ``pandas.NaT``.

        Any extra columns present in the input dataframe are retained in
        the output.  Points with zero matching granules contribute a
        single NaN row.  The output is sorted to match the ``pc_id``
        order from the input dataframe.

    Raises
    ------
    ValueError
        If *open_method* is a string that is not a valid preset, or a dict
        with unknown keys or an invalid ``"xarray_open"`` value.
    ValueError
        If a requested variable is not present in an opened dataset.
    ValueError
        If geolocation variables cannot be detected unambiguously.
    ValueError
        If ``granule_range`` is not a 2-tuple of positive integers with
        ``start <= end``, or if either bound exceeds the number of matched
        granules in the plan.
    ImportError
        If ``spatial_method="xoak-kdtree"`` and the ``xoak`` package is not
        installed.
    ImportError
        If ``spatial_method="xoak-haversine"`` and the ``xoak`` package is not
        installed.
    ImportError
        If ``spatial_method="kdtree"`` and ``scipy`` is not installed.
    """
    if granule_range is not None:
        # NOTE: bools are explicitly rejected — isinstance(True, int) is True,
        # so without the extra check granule_range=(True, 2) would validate.
        range_ok = (
            len(granule_range) == 2
            and all(
                isinstance(bound, int) and not isinstance(bound, bool)
                for bound in granule_range
            )
            and granule_range[0] >= 1
            and granule_range[1] >= granule_range[0]
        )
        if not range_ok:
            raise ValueError(
                f"granule_range={granule_range!r} is not valid. "
                "Must be a (start, end) tuple of positive integers with start <= end, "
                "both 1-based and inclusive (e.g. granule_range=(261, 620))."
            )

    if spatial_method is None:
        spatial_method = "auto"

    if spatial_method not in _VALID_SPATIAL_METHODS:
        raise ValueError(
            f"spatial_method={spatial_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_SPATIAL_METHODS)}."
        )

    # Fail fast on a missing optional dependency before any granules are
    # opened or processed.  The three previously copy-pasted try/except
    # blocks are consolidated into one table-driven check:
    # method -> (module to import, required attribute, error message).
    optional_deps: dict[str, tuple[str, str, str]] = {
        "xoak-kdtree": (
            "xoak.tree_adapters",
            "SklearnKDTreeAdapter",
            "The 'xoak' package (and scikit-learn) are required for spatial_method='xoak-kdtree'. "
            "Install them with: pip install xoak scikit-learn",
        ),
        "xoak-haversine": (
            "xoak.tree_adapters",
            "SklearnGeoBallTreeAdapter",
            "The 'xoak' package (and scikit-learn) are required for spatial_method='xoak-haversine'. "
            "Install them with: pip install xoak scikit-learn",
        ),
        "kdtree": (
            "scipy.spatial",
            "KDTree",
            "The 'scipy' package is required for spatial_method='kdtree'. "
            "Install it with: pip install scipy",
        ),
    }
    requirement = optional_deps.get(spatial_method)
    if requirement is not None:
        module_name, attr_name, message = requirement
        import importlib

        try:
            module = importlib.import_module(module_name)
            # Mirror `from module import attr`: an absent attribute must
            # also surface as a failed dependency check.
            getattr(module, attr_name)
        except (ImportError, AttributeError) as exc:
            raise ImportError(message) from exc

    # Normalize open_method to a full dict spec (raises ValueError on invalid input).
    effective_open_method = "auto" if open_method is None else open_method
    spec = _normalize_open_method(effective_open_method, open_dataset_kwargs)

    # Normalize coord_spec and bridge y/x sources into spec["coords"].
    # This ensures non-standard lat/lon variable names in coord_spec are used
    # when opening datasets, with conflict detection against open_method["coords"].
    resolved_coord_spec = _normalize_coord_spec(coord_spec)
    spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

    # Resolve additional axes (depth, wavelength, etc.) from the points DataFrame.
    # We use the plan's already-normalised points (lat/lon/time canonical names).
    # The y/x/time columns are always "lat"/"lon"/"time" after plan normalisation.
    additional_axes = _resolve_additional_axes(plan.points, resolved_coord_spec)

    if not silent:
        _print_coord_spec_summary(
            plan,
            resolved_coord_spec,
            additional_axes,
        )

    effective_vars: list[str] = variables if variables is not None else plan.variables
    return _execute_plan(
        plan,
        spec=spec,
        spatial_method=spatial_method,
        variables=effective_vars,
        silent=silent,
        batch_size=batch_size,
        save_dir=save_dir,
        granule_range=granule_range,
        additional_axes=additional_axes,
    )

Plan

point_collocation.core.plan.Plan dataclass

A planned matchup: stores the point→granule mapping and search results.

Attributes:

Name Type Description
points DataFrame

Normalised points DataFrame (time column).

results list[Any]

Original earthaccess result objects in search order. Passed directly to earthaccess.open() when executing the plan.

granules list[GranuleMeta]

:class:GranuleMeta for every unique granule returned by the search (parallel with results).

point_granule_map dict[Any, list[int]]

Maps each row index of points to a (possibly empty) list of indices into granules.

variables list[str]

Default variables to extract during :func:~point_collocation.matchup. Can be overridden by passing variables directly to :func:~point_collocation.matchup.

source_kwargs dict[str, Any]

earthaccess search kwargs used to build this plan.

time_buffer Timedelta

Temporal buffer that was applied when matching points to granules.

Source code in point_collocation/core/plan.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
@dataclass
class Plan:
    """A planned matchup: stores the point→granule mapping and search results.

    Attributes
    ----------
    points:
        Normalised points DataFrame (``time`` column).
    results:
        Original earthaccess result objects in search order.  Passed
        directly to ``earthaccess.open()`` when executing the plan.
    granules:
        :class:`GranuleMeta` for every unique granule returned by the
        search (parallel with *results*).
    point_granule_map:
        Maps each row index of *points* to a (possibly empty) list of
        indices into *granules*.
    variables:
        Default variables to extract during :func:`~point_collocation.matchup`.
        Can be overridden by passing ``variables`` directly to
        :func:`~point_collocation.matchup`.
    source_kwargs:
        earthaccess search kwargs used to build this plan.
    time_buffer:
        Temporal buffer that was applied when matching points to granules.
    """

    points: pd.DataFrame
    results: list[Any]
    granules: list[GranuleMeta]
    point_granule_map: dict[Any, list[int]]
    variables: list[str] = field(default_factory=list)
    source_kwargs: dict[str, Any] = field(default_factory=dict)
    time_buffer: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(0))

    # Original column names as detected in the user's points DataFrame, before
    # normalisation to the canonical "lat"/"lon"/"time" names.  Used for
    # transparent reporting in plan.open_dataset() and pc.matchup() output.
    pts_y_col_original: str = field(default=_CANONICAL_LAT)
    """Original latitude column name detected in the user's points DataFrame."""

    pts_x_col_original: str = field(default=_CANONICAL_LON)
    """Original longitude column name detected in the user's points DataFrame."""

    pts_time_col_original: str = field(default=_CANONICAL_TIME)
    """Original time column name detected in the user's points DataFrame."""

    # ------------------------------------------------------------------
    # Indexing — plan[0] returns a result object; plan[0:10] returns a
    # subset Plan restricted to the sliced points.
    # ------------------------------------------------------------------

    def __getitem__(self, idx: int | slice) -> "Plan | Any":
        """Return a subset :class:`Plan` or a single earthaccess result.

        Parameters
        ----------
        idx:
            * **Integer** — returns the earthaccess result object at that
              position (``self.results[idx]``), so that ``plan[0]`` can
              still be passed to :meth:`open_dataset`.
            * **Slice** — returns a new :class:`Plan` whose ``points``
              are the rows selected by the slice (``points.iloc[idx]``),
              with ``point_granule_map``, ``granules``, and ``results``
              filtered and re-indexed accordingly.  This allows users to
              test a subset of a large plan::

                  res = pc.matchup(plan[0:10], variables=["avw"])
        """
        if isinstance(idx, int):
            return self.results[idx]

        # --- Slice: subset by points ---
        subset_points = self.points.iloc[idx]
        subset_pt_indices = list(subset_points.index)

        # Collect granule indices (into self.granules) needed by the subset.
        needed_g_idx: list[int] = []
        seen_g: set[int] = set()
        for pt_idx in subset_pt_indices:
            for g_idx in self.point_granule_map.get(pt_idx, []):
                if g_idx not in seen_g:
                    needed_g_idx.append(g_idx)
                    seen_g.add(g_idx)
        needed_g_idx.sort()

        # Build re-index map: old granule index → new granule index.
        g_remap: dict[int, int] = {old: new for new, old in enumerate(needed_g_idx)}

        # New granules with corrected result_index (sequential from 0).
        new_granules = [
            GranuleMeta(
                granule_id=self.granules[old_g].granule_id,
                begin=self.granules[old_g].begin,
                end=self.granules[old_g].end,
                bbox=self.granules[old_g].bbox,
                result_index=new_g,
            )
            for new_g, old_g in enumerate(needed_g_idx)
        ]

        # New results list — only the results referenced by kept granules.
        new_results = [self.results[self.granules[old_g].result_index] for old_g in needed_g_idx]

        # New point_granule_map using re-indexed granule indices.
        new_pgm: dict[Any, list[int]] = {
            pt_idx: [g_remap[g] for g in self.point_granule_map.get(pt_idx, [])]
            for pt_idx in subset_pt_indices
        }

        return Plan(
            points=subset_points,
            results=new_results,
            granules=new_granules,
            point_granule_map=new_pgm,
            variables=list(self.variables),
            source_kwargs=dict(self.source_kwargs),
            time_buffer=self.time_buffer,
            pts_y_col_original=self.pts_y_col_original,
            pts_x_col_original=self.pts_x_col_original,
            pts_time_col_original=self.pts_time_col_original,
        )

    # ------------------------------------------------------------------
    # Dataset opening helpers
    # ------------------------------------------------------------------

    def open_dataset(
        self,
        result: "int | Any",
        open_method: "str | dict | None" = None,
        *,
        coord_spec: "dict | None" = None,
        silent: bool = False,
    ) -> "Any":
        """Open a single granule result as an :class:`xarray.Dataset` or DataTree.

        Parameters
        ----------
        result:
            An integer index into ``plan.results`` (e.g. ``0``), or a
            single earthaccess result object obtained via ``plan[n]``.
            Using an integer is preferred: ``plan.open_dataset(0)`` is
            equivalent to ``plan.open_dataset(plan[0])``.
        open_method:
            How to open the granule.  Accepts the same string presets or
            dict spec as :func:`~point_collocation.matchup`.  Defaults to
            ``"auto"`` (try dataset first, fall back to datatree merge).

            **String presets:**

            * ``"dataset"`` — open with ``xarray.open_dataset`` (flat NetCDF).
            * ``"datatree"`` — open as a DataTree with all groups; returns the
              raw :class:`xarray.DataTree` (or ``datatree.DataTree``) without
              merging groups.  Equivalent to ``xarray.open_datatree(f)``.
            * ``"datatree-merge"`` — open as DataTree and merge all groups into
              a flat Dataset.
            * ``"auto"`` *(default)* — probe the file first; if lat/lon can be
              detected via ``xr.open_dataset``, use that; otherwise fall back to
              ``"datatree-merge"``.  The printed spec shows the **resolved** mode.

            Pass open-function kwargs via the ``"open_kwargs"`` key of a
            dict spec, e.g.
            ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
        coord_spec:
            Coordinate specification controlling how axis/coordinate names are
            interpreted for both the source dataset and the points DataFrame.
            Defaults to auto-detection of lat/lon/time from standard name
            candidates.  See :data:`~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC`
            for the full structure.
        silent:
            When ``False`` (default), print the effective open_method spec
            actually used (after normalization and auto-resolution) and a
            geolocation summary line.
            Set to ``True`` to suppress this output.

        Returns
        -------
        xarray.Dataset or xarray.DataTree
            A flat :class:`xarray.Dataset` for all modes except
            ``open_method="datatree"`` (or a dict spec with
            ``xarray_open="datatree"`` and ``merge=None``), which returns the
            raw DataTree.
            The caller is responsible for closing the returned object when
            finished (e.g. ``ds.close()``).
        """
        if isinstance(result, int):
            n = len(self.results)
            if result < 0 or result >= n:
                raise IndexError(
                    f"result index {result} is out of range for a plan with {n} result(s). "
                    f"Valid indices are 0 to {n - 1}."
                )
            result = self.results[result]

        from point_collocation.core._open_method import (
            _apply_coord_spec_to_spec,
            _apply_coords,
            _build_effective_open_kwargs,
            _geoloc_description,
            _merge_datatree_with_spec,
            _normalize_open_method,
            _open_and_merge_dataset_groups,
            _open_datatree_fn,
            _resolve_auto_spec,
            _suppress_dask_progress,
        )
        from point_collocation.core._coord_spec import _normalize_coord_spec
        from point_collocation.core.engine import _find_time_dim

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        effective_open_method = "auto" if open_method is None else open_method
        spec = _normalize_open_method(effective_open_method)

        # Bridge coord_spec y/x sources into spec["coords"] with conflict detection.
        resolved_coord_spec = _normalize_coord_spec(coord_spec)
        spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

        xarray_open = spec.get("xarray_open", "dataset")
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
        if len(file_objs) != 1:
            raise RuntimeError(
                f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
            )
        file_obj = file_objs[0]

        # For "auto" mode, probe the file first so that the printed spec shows
        # the actual resolved mode (e.g. "dataset" or "datatree"), not "auto".
        # Any ValueError from _resolve_auto_spec (both probes failed) is
        # propagated to the caller rather than silently downgrading to an
        # empty-dataset fallback.
        if xarray_open == "auto":
            spec = _resolve_auto_spec(file_obj, spec)
            xarray_open = spec["xarray_open"]
            effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        if not silent:
            display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
            display_spec["open_kwargs"] = effective_kwargs
            display_spec.setdefault("merge", None)
            print(f"open_method: {display_spec!r}")
            reason = spec.get("_auto_switch_reason")
            if reason:
                print(f"open_method='auto' switched to 'datatree': {reason}")

        def _try_print_geoloc(
            ds: "xr.Dataset",
            spec: dict,
            *,
            silent: bool,
            plan: "Plan",
        ) -> None:
            """Print geolocation summary or a 'not found' note."""
            if silent:
                return
            try:
                ds_coord, lon_n, lat_n = _apply_coords(ds, spec)
                time_dim = _find_time_dim(ds_coord)
                print(
                    _geoloc_description(
                        ds_coord,
                        lon_n,
                        lat_n,
                        spec,
                        time_dim=time_dim,
                        pts_y_col=plan.pts_y_col_original,
                        pts_x_col=plan.pts_x_col_original,
                        pts_time_col=plan.pts_time_col_original,
                    )
                )
            except ValueError as exc:
                print(f"Geolocation: could not detect lat/lon in dataset — {exc}")

        def _promote_coords(ds: "xr.Dataset", spec: dict) -> "xr.Dataset":
            """Apply coord promotion from *spec* to *ds*, returning the updated dataset.

            Silently returns *ds* unchanged if geolocation detection fails —
            the user can still work with the dataset; they'll have seen the error
            from ``_try_print_geoloc`` if ``silent=False``.
            """
            try:
                ds_promoted, _, _ = _apply_coords(ds, spec)
                return ds_promoted
            except ValueError:
                return ds

        if xarray_open == "datatree":
            merge = spec.get("merge")
            if merge is None:
                # Return the raw DataTree without merging — like open_datatree(f).
                # Still try to detect geolocation from the DataTree's root node so
                # the user gets a summary even when groups are not merged.
                dt = _open_datatree_fn(file_obj, effective_kwargs)
                if not silent:
                    try:
                        root_ds = dt.to_dataset()  # type: ignore[union-attr]
                    except AttributeError:
                        root_ds = None
                    if root_ds is not None:
                        _try_print_geoloc(root_ds, spec, silent=False, plan=self)
                    else:
                        print("Geolocation: DataTree returned without merging — could not read root dataset.")
                return dt
            # merge is "all", "root", or a list: merge groups into a flat Dataset.
            dt = _open_datatree_fn(file_obj, effective_kwargs)
            try:
                ds = _merge_datatree_with_spec(dt, spec)
            finally:
                if hasattr(dt, "close"):
                    dt.close()
            _try_print_geoloc(ds, spec, silent=silent, plan=self)
            return _promote_coords(ds, spec)

        if xarray_open == "dataset":
            merge = spec.get("merge")
            if merge is not None:
                # Dataset-based group merge: open each group and merge.
                ds = _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
            else:
                with _suppress_dask_progress():
                    ds = xr.open_dataset(file_obj, **effective_kwargs)  # type: ignore[arg-type]
            _try_print_geoloc(ds, spec, silent=silent, plan=self)
            return _promote_coords(ds, spec)

        raise ValueError(
            f"open_method['xarray_open']={xarray_open!r} is not valid for open_dataset."
        )

    def open_mfdataset(
        self,
        results: "list[Any] | Plan",
        open_method: "str | dict | None" = None,
        *,
        silent: bool = False,
    ) -> "xr.Dataset":
        """Open multiple granule results as a single :class:`xarray.Dataset`.

        Parameters
        ----------
        results:
            A list of earthaccess result objects, or a :class:`Plan`
            (e.g. ``plan[0:2]``).  When a :class:`Plan` is passed its
            ``results`` attribute is used.
        open_method:
            How to open each granule.  ``"dataset"`` uses
            ``xarray.open_mfdataset`` across all file objects.
            ``"datatree-merge"`` opens each granule as a DataTree, merges
            its groups into a flat dataset, then concatenates all granules
            along a new ``granule`` dimension.  Defaults to ``"auto"``.
            Pass open-function kwargs via the ``"open_kwargs"`` key of a
            dict spec, e.g.
            ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
        silent:
            When ``False`` (default), print the effective open_method spec
            actually used (after normalization and defaults are applied).
            Set to ``True`` to suppress this output.

        Returns
        -------
        xarray.Dataset
        """
        # NOTE: unlike open_dataset, this previously imported
        # _open_as_flat_dataset without ever using it; the unused import
        # has been removed.
        from point_collocation.core._open_method import (
            _build_effective_open_kwargs,
            _merge_datatree_with_spec,
            _normalize_open_method,
            _open_and_merge_dataset_groups,
            _open_datatree_fn,
            _suppress_dask_progress,
        )

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        effective_open_method = "auto" if open_method is None else open_method
        spec = _normalize_open_method(effective_open_method)

        xarray_open = spec.get("xarray_open", "dataset")
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

        if not silent:
            # Consistent with open_dataset: hide private ("_"-prefixed)
            # bookkeeping keys from the printed spec.
            display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
            display_spec["open_kwargs"] = effective_kwargs
            display_spec.setdefault("merge", None)
            print(f"open_method: {display_spec!r}")

        result_list = results.results if isinstance(results, Plan) else list(results)
        file_objs = earthaccess.open(result_list, pqdm_kwargs={"disable": True})

        if xarray_open == "datatree":
            # Open each granule as a DataTree, merge its groups, then
            # concatenate all granule datasets along a new "granule" dim.
            merged_datasets: list[xr.Dataset] = []
            for file_obj in file_objs:
                dt = _open_datatree_fn(file_obj, effective_kwargs)
                try:
                    merged_datasets.append(_merge_datatree_with_spec(dt, spec))
                finally:
                    if hasattr(dt, "close"):
                        dt.close()
            if not merged_datasets:
                return xr.Dataset()
            return xr.concat(merged_datasets, dim="granule")

        if xarray_open in ("dataset", "auto"):
            # For dataset mode with merge, open each granule's groups as
            # separate datasets and merge them, then concatenate all granules
            # along a new "granule" dimension.
            # Without merge, use xr.open_mfdataset for simplicity.
            merge = spec.get("merge")
            if merge is not None:
                merged_datasets = []
                for file_obj in file_objs:
                    merged_datasets.append(
                        _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
                    )
                if not merged_datasets:
                    return xr.Dataset()
                return xr.concat(merged_datasets, dim="granule")
            with _suppress_dask_progress():
                return xr.open_mfdataset(file_objs, **effective_kwargs)  # type: ignore[arg-type]

        raise ValueError(
            f"open_method['xarray_open']={xarray_open!r} is not valid for open_mfdataset."
        )

    # ------------------------------------------------------------------
    # Variable inspection (removed; use open_dataset(0) instead)
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------

    def summary(self, n: int | None = None) -> None:
        """Print a human-readable summary of the plan.

        Parameters
        ----------
        n:
            Number of points to show in the per-point section.
            Defaults to ``min(5, len(self.points))``.
            ``0`` or negative values suppress the per-point section.
        """
        if n is None:
            n = min(5, len(self.points))
        elif n < 0:
            n = 0

        zero_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) == 0
        )
        multi_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) > 1
        )

        matched_granule_count = len(
            {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
        )

        lines: list[str] = [
            f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
            f"  Points with 0 matches : {zero_match}",
            f"  Points with >1 matches: {multi_match}",
            f"  Time buffer: {self.time_buffer}",
        ]

        n_show = min(n, len(self.points))
        if n_show > 0:
            lines.append("")
            lines.append(f"First {n_show} point(s):")
            for pt_idx, row in self.points.head(n_show).iterrows():
                g_indices = self.point_granule_map.get(pt_idx, [])
                lines.append(
                    f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                    f"time={row['time']}: {len(g_indices)} match(es)"
                )
                for g_idx in g_indices:
                    lines.append(f"    → {self.granules[g_idx].granule_id}")

        print("\n".join(lines))

summary

summary(n: int | None = None) -> None

Print a human-readable summary of the plan.

Parameters:

Name Type Description Default
n int | None

Number of points to show in the per-point section. Defaults to min(5, len(self.points)). 0 or negative values suppress the per-point section.

None
Source code in point_collocation/core/plan.py
def summary(self, n: int | None = None) -> None:
    """Print a human-readable summary of the plan.

    Parameters
    ----------
    n:
        Number of points to show in the per-point section.
        Defaults to ``min(5, len(self.points))``.
        ``0`` or negative values suppress the per-point section.
    """
    if n is None:
        n = min(5, len(self.points))
    elif n < 0:
        n = 0

    zero_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) == 0
    )
    multi_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) > 1
    )

    matched_granule_count = len(
        {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
    )

    lines: list[str] = [
        f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
        f"  Points with 0 matches : {zero_match}",
        f"  Points with >1 matches: {multi_match}",
        f"  Time buffer: {self.time_buffer}",
    ]

    n_show = min(n, len(self.points))
    if n_show > 0:
        lines.append("")
        lines.append(f"First {n_show} point(s):")
        for pt_idx, row in self.points.head(n_show).iterrows():
            g_indices = self.point_granule_map.get(pt_idx, [])
            lines.append(
                f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                f"time={row['time']}: {len(g_indices)} match(es)"
            )
            for g_idx in g_indices:
                lines.append(f"    → {self.granules[g_idx].granule_id}")

    print("\n".join(lines))

open_dataset

open_dataset(result: 'int | Any', open_method: 'str | dict | None' = None, *, coord_spec: 'dict | None' = None, silent: bool = False) -> 'Any'

Open a single granule result as an :class:xarray.Dataset or DataTree.

Parameters:

Name Type Description Default
result 'int | Any'

An integer index into plan.results (e.g. 0), or a single earthaccess result object obtained via plan[n]. Using an integer is preferred: plan.open_dataset(0) is equivalent to plan.open_dataset(plan[0]).

required
open_method 'str | dict | None'

How to open the granule. Accepts the same string presets or dict spec as :func:~point_collocation.matchup. Defaults to "auto" (try dataset first, fall back to datatree merge).

String presets:

  • "dataset" — open with xarray.open_dataset (flat NetCDF).
  • "datatree" — open as a DataTree with all groups; returns the raw :class:xarray.DataTree (or datatree.DataTree) without merging groups. Equivalent to xarray.open_datatree(f).
  • "datatree-merge" — open as DataTree and merge all groups into a flat Dataset.
  • "auto" (default) — probe the file first; if lat/lon can be detected via xr.open_dataset, use that; otherwise fall back to "datatree-merge". The printed spec shows the resolved mode.

Pass open-function kwargs via the "open_kwargs" key of a dict spec, e.g. open_method={"open_kwargs": {"engine": "netcdf4"}}.

None
coord_spec 'dict | None'

Coordinate specification controlling how axis/coordinate names are interpreted for both the source dataset and the points DataFrame. Defaults to auto-detection of lat/lon/time from standard name candidates. See :data:~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC for the full structure.

None
silent bool

When False (default), print the effective open_method spec actually used (after normalization and auto-resolution) and a geolocation summary line. Set to True to suppress this output.

False

Returns:

Type Description
Dataset or DataTree

A flat :class:xarray.Dataset for all modes except open_method="datatree" (or a dict spec with xarray_open="datatree" and merge=None), which returns the raw DataTree. The caller is responsible for closing the returned object when finished (e.g. ds.close()).

Source code in point_collocation/core/plan.py
def open_dataset(
    self,
    result: "int | Any",
    open_method: "str | dict | None" = None,
    *,
    coord_spec: "dict | None" = None,
    silent: bool = False,
) -> "Any":
    """Open a single granule result as an :class:`xarray.Dataset` or DataTree.

    Parameters
    ----------
    result:
        An integer index into ``plan.results`` (e.g. ``0``), or a
        single earthaccess result object obtained via ``plan[n]``.
        Using an integer is preferred: ``plan.open_dataset(0)`` is
        equivalent to ``plan.open_dataset(plan[0])``.
    open_method:
        How to open the granule.  Accepts the same string presets or
        dict spec as :func:`~point_collocation.matchup`.  Defaults to
        ``"auto"`` (try dataset first, fall back to datatree merge).

        **String presets:**

        * ``"dataset"`` — open with ``xarray.open_dataset`` (flat NetCDF).
        * ``"datatree"`` — open as a DataTree with all groups; returns the
          raw :class:`xarray.DataTree` (or ``datatree.DataTree``) without
          merging groups.  Equivalent to ``xarray.open_datatree(f)``.
        * ``"datatree-merge"`` — open as DataTree and merge all groups into
          a flat Dataset.
        * ``"auto"`` *(default)* — probe the file first; if lat/lon can be
          detected via ``xr.open_dataset``, use that; otherwise fall back to
          ``"datatree-merge"``.  The printed spec shows the **resolved** mode.

        Pass open-function kwargs via the ``"open_kwargs"`` key of a
        dict spec, e.g.
        ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
    coord_spec:
        Coordinate specification controlling how axis/coordinate names are
        interpreted for both the source dataset and the points DataFrame.
        Defaults to auto-detection of lat/lon/time from standard name
        candidates.  See :data:`~point_collocation.core._coord_spec.DEFAULT_COORD_SPEC`
        for the full structure.
    silent:
        When ``False`` (default), print the effective open_method spec
        actually used (after normalization and auto-resolution) and a
        geolocation summary line.
        Set to ``True`` to suppress this output.

    Returns
    -------
    xarray.Dataset or xarray.DataTree
        A flat :class:`xarray.Dataset` for all modes except
        ``open_method="datatree"`` (or a dict spec with
        ``xarray_open="datatree"`` and ``merge=None``), which returns the
        raw DataTree.
        The caller is responsible for closing the returned object when
        finished (e.g. ``ds.close()``).
    """
    # Resolve an integer index to its earthaccess result object up front.
    # Negative indices are rejected here (use plan[-1] for negative indexing).
    if isinstance(result, int):
        n = len(self.results)
        if result < 0 or result >= n:
            raise IndexError(
                f"result index {result} is out of range for a plan with {n} result(s). "
                f"Valid indices are 0 to {n - 1}."
            )
        result = self.results[result]

    # Imports are kept local to the method; earthaccess in particular is
    # treated as an optional dependency (see the ImportError below).
    from point_collocation.core._open_method import (
        _apply_coord_spec_to_spec,
        _apply_coords,
        _build_effective_open_kwargs,
        _geoloc_description,
        _merge_datatree_with_spec,
        _normalize_open_method,
        _open_and_merge_dataset_groups,
        _open_datatree_fn,
        _resolve_auto_spec,
        _suppress_dask_progress,
    )
    from point_collocation.core._coord_spec import _normalize_coord_spec
    from point_collocation.core.engine import _find_time_dim

    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    effective_open_method = "auto" if open_method is None else open_method
    spec = _normalize_open_method(effective_open_method)

    # Bridge coord_spec y/x sources into spec["coords"] with conflict detection.
    resolved_coord_spec = _normalize_coord_spec(coord_spec)
    spec = _apply_coord_spec_to_spec(spec, resolved_coord_spec)

    xarray_open = spec.get("xarray_open", "dataset")
    effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    # Open exactly one file-like object for this granule; anything else is
    # a hard error rather than silently picking the first.
    file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
    if len(file_objs) != 1:
        raise RuntimeError(
            f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
        )
    file_obj = file_objs[0]

    # For "auto" mode, probe the file first so that the printed spec shows
    # the actual resolved mode (e.g. "dataset" or "datatree"), not "auto".
    # Any ValueError from _resolve_auto_spec (both probes failed) is
    # propagated to the caller rather than silently downgrading to an
    # empty-dataset fallback.
    if xarray_open == "auto":
        spec = _resolve_auto_spec(file_obj, spec)
        xarray_open = spec["xarray_open"]
        effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    if not silent:
        # Private ("_"-prefixed) bookkeeping keys are hidden from the
        # printed spec; the auto-switch reason is reported separately.
        display_spec = {k: v for k, v in spec.items() if not k.startswith("_")}
        display_spec["open_kwargs"] = effective_kwargs
        display_spec.setdefault("merge", None)
        print(f"open_method: {display_spec!r}")
        reason = spec.get("_auto_switch_reason")
        if reason:
            print(f"open_method='auto' switched to 'datatree': {reason}")

    def _try_print_geoloc(
        ds: "xr.Dataset",
        spec: dict,
        *,
        silent: bool,
        plan: "Plan",
    ) -> None:
        """Print geolocation summary or a 'not found' note."""
        if silent:
            return
        try:
            ds_coord, lon_n, lat_n = _apply_coords(ds, spec)
            time_dim = _find_time_dim(ds_coord)
            print(
                _geoloc_description(
                    ds_coord,
                    lon_n,
                    lat_n,
                    spec,
                    time_dim=time_dim,
                    pts_y_col=plan.pts_y_col_original,
                    pts_x_col=plan.pts_x_col_original,
                    pts_time_col=plan.pts_time_col_original,
                )
            )
        except ValueError as exc:
            print(f"Geolocation: could not detect lat/lon in dataset — {exc}")

    def _promote_coords(ds: "xr.Dataset", spec: dict) -> "xr.Dataset":
        """Apply coord promotion from *spec* to *ds*, returning the updated dataset.

        Silently returns *ds* unchanged if geolocation detection fails —
        the user can still work with the dataset; they'll have seen the error
        from ``_try_print_geoloc`` if ``silent=False``.
        """
        try:
            ds_promoted, _, _ = _apply_coords(ds, spec)
            return ds_promoted
        except ValueError:
            return ds

    if xarray_open == "datatree":
        merge = spec.get("merge")
        if merge is None:
            # Return the raw DataTree without merging — like open_datatree(f).
            # Still try to detect geolocation from the DataTree's root node so
            # the user gets a summary even when groups are not merged.
            dt = _open_datatree_fn(file_obj, effective_kwargs)
            if not silent:
                try:
                    root_ds = dt.to_dataset()  # type: ignore[union-attr]
                except AttributeError:
                    root_ds = None
                if root_ds is not None:
                    _try_print_geoloc(root_ds, spec, silent=False, plan=self)
                else:
                    print("Geolocation: DataTree returned without merging — could not read root dataset.")
            return dt
        # merge is "all", "root", or a list: merge groups into a flat Dataset.
        dt = _open_datatree_fn(file_obj, effective_kwargs)
        try:
            ds = _merge_datatree_with_spec(dt, spec)
        finally:
            # The DataTree is closed even when the merge fails.
            if hasattr(dt, "close"):
                dt.close()
        _try_print_geoloc(ds, spec, silent=silent, plan=self)
        return _promote_coords(ds, spec)

    if xarray_open == "dataset":
        merge = spec.get("merge")
        if merge is not None:
            # Dataset-based group merge: open each group and merge.
            ds = _open_and_merge_dataset_groups(file_obj, spec, effective_kwargs)
        else:
            with _suppress_dask_progress():
                ds = xr.open_dataset(file_obj, **effective_kwargs)  # type: ignore[arg-type]
        _try_print_geoloc(ds, spec, silent=silent, plan=self)
        return _promote_coords(ds, spec)

    raise ValueError(
        f"open_method['xarray_open']={xarray_open!r} is not valid for open_dataset."
    )

open_mfdataset

open_mfdataset(results: 'list[Any] | Plan', open_method: 'str | dict | None' = None, *, silent: bool = False) -> 'xr.Dataset'

Open multiple granule results as a single :class:xarray.Dataset.

Parameters:

Name Type Description Default
results 'list[Any] | Plan'

A list of earthaccess result objects, or a :class:Plan (e.g. plan[0:2]). When a :class:Plan is passed its results attribute is used.

required
open_method 'str | dict | None'

How to open each granule. "dataset" uses xarray.open_mfdataset across all file objects. "datatree-merge" opens each granule as a DataTree, merges its groups into a flat dataset, then concatenates all granules along a new granule dimension. Defaults to "auto". Pass open-function kwargs via the "open_kwargs" key of a dict spec, e.g. open_method={"open_kwargs": {"engine": "netcdf4"}}.

None
silent bool

When False (default), print the effective open_method spec actually used (after normalization and defaults are applied). Set to True to suppress this output.

False

Returns:

Type Description
Dataset
Source code in point_collocation/core/plan.py
def open_mfdataset(
    self,
    results: "list[Any] | Plan",
    open_method: "str | dict | None" = None,
    *,
    silent: bool = False,
) -> "xr.Dataset":
    """Open several granule results together as one :class:`xarray.Dataset`.

    Parameters
    ----------
    results:
        Either a list of earthaccess result objects or a :class:`Plan`
        (e.g. ``plan[0:2]``).  A :class:`Plan` contributes its
        ``results`` attribute.
    open_method:
        Strategy for opening each granule.  ``"dataset"`` delegates to
        ``xarray.open_mfdataset`` over all file objects.
        ``"datatree-merge"`` opens every granule as a DataTree, flattens
        its groups into a dataset, and concatenates the granules along a
        new ``granule`` dimension.  ``None`` means ``"auto"``.
        Open-function kwargs go under the ``"open_kwargs"`` key of a
        dict spec, e.g.
        ``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
    silent:
        ``False`` (default) prints the effective spec actually applied
        (after normalization and defaults); ``True`` suppresses it.

    Returns
    -------
    xarray.Dataset
    """
    from point_collocation.core._open_method import (
        _build_effective_open_kwargs,
        _merge_datatree_with_spec,
        _normalize_open_method,
        _open_and_merge_dataset_groups,
        _open_as_flat_dataset,
        _open_datatree_fn,
        _suppress_dask_progress,
    )

    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    # ``None`` is shorthand for "auto"; normalization yields a dict spec.
    spec = _normalize_open_method("auto" if open_method is None else open_method)

    open_mode = spec.get("xarray_open", "dataset")
    eff_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))

    if not silent:
        shown = {**spec, "open_kwargs": eff_kwargs}
        shown.setdefault("merge", None)
        print(f"open_method: {shown!r}")

    granules = results.results if isinstance(results, Plan) else list(results)
    handles = earthaccess.open(granules, pqdm_kwargs={"disable": True})

    if open_mode == "datatree":
        # One flattened dataset per granule, each DataTree closed as soon
        # as its merge completes; stack everything on a new "granule" dim.
        flattened: list[xr.Dataset] = []
        for handle in handles:
            tree = _open_datatree_fn(handle, eff_kwargs)
            try:
                flattened.append(_merge_datatree_with_spec(tree, spec))
            finally:
                if hasattr(tree, "close"):
                    tree.close()
        if not flattened:
            return xr.Dataset()
        return xr.concat(flattened, dim="granule")

    if open_mode in ("dataset", "auto"):
        # With a merge spec: open each granule's groups individually,
        # merge them, then concatenate all granules along "granule".
        # Without one, a single xr.open_mfdataset call handles everything.
        if spec.get("merge") is not None:
            flattened = [
                _open_and_merge_dataset_groups(handle, spec, eff_kwargs)
                for handle in handles
            ]
            if not flattened:
                return xr.Dataset()
            return xr.concat(flattened, dim="granule")
        with _suppress_dask_progress():
            return xr.open_mfdataset(handles, **eff_kwargs)  # type: ignore[arg-type]

    raise ValueError(
        f"open_method['xarray_open']={open_mode!r} is not valid for open_mfdataset."
    )

IO / Adapters

point_collocation.adapters

Source adapters that normalise heterogeneous inputs into the SourceProtocol.

Built-in adapters

earthaccess : wraps file-like objects returned by earthaccess.open()

Future adapters (not yet implemented)

stac : STAC item assets url : plain HTTPS URLs local : local file paths

SourceAdapter

Bases: ABC

Abstract base for source adapters.

Subclass this to add support for a new data source. The core engine only calls :meth:open_dataset; everything else is internal to the adapter.

Source code in point_collocation/adapters/base.py
class SourceAdapter(ABC):
    """Abstract base class that every source adapter derives from.

    To add support for a new data source, subclass this and implement
    :meth:`open_dataset` — the only entry point the core engine ever
    uses; any other machinery remains private to the adapter.
    """

    @abstractmethod
    def open_dataset(self, **kwargs: object) -> object:
        """Open this source and return it as an ``xarray.Dataset``.

        Parameters
        ----------
        **kwargs:
            Passed through unchanged to ``xarray.open_dataset``.
        """
        raise NotImplementedError  # pragma: no cover

open_dataset abstractmethod

open_dataset(**kwargs: object) -> object

Return an xarray.Dataset for this source.

Parameters:

Name Type Description Default
**kwargs object

Forwarded verbatim to xarray.open_dataset.

{}
Source code in point_collocation/adapters/base.py
@abstractmethod
def open_dataset(self, **kwargs: object) -> object:
    """Open this source and return it as an ``xarray.Dataset``.

    Parameters
    ----------
    **kwargs:
        Passed through unchanged to ``xarray.open_dataset``.
    """
    raise NotImplementedError  # pragma: no cover

point_collocation.core._granule

Helpers for working with individual granules (source files).

Responsibilities
  • Extract a human-readable identifier from an arbitrary source object.
  • Parse the temporal coverage (start/end date) from a NASA-style L3 granule filename.
Supported filename conventions

YYYYDOY — single day (DOY = day-of-year, 001–366) YYYYDOY_YYYYDOY — multi-day range (e.g., 8-day composites, monthly) YYYYMMDD — single day in calendar format YYYYMMDD_YYYYMMDD — multi-day range in calendar format

The period keyword embedded in the filename (.DAY., .8D., .MO.) is used to infer the end date when only a start date is present.

Examples of supported filenames
  • PACE_OCI_2024070.L3m.DAY.RRS.Rrs_412.4km.nc
  • PACE_OCI_2024049_2024056.L3m.8D.CHL.chlor_a.9km.nc
  • AQUA_MODIS.20230601.L3m.DAY.SST.sst.4km.nc
  • AQUA_MODIS.20230601_20230630.L3m.MO.CHL.chlor_a.9km.nc

get_source_id

get_source_id(source: object) -> str

Return a human-readable identifier (basename) for source.

Tries, in order:

  1. pathlib.Path → path.name
  2. Plain str → os.path.basename(source)
  3. Object with a .path or .name string attribute
  4. str(source) as last resort
Source code in point_collocation/core/_granule.py
def get_source_id(source: object) -> str:
    """Return a human-readable identifier (basename) for *source*.

    Resolution order:

    1. ``pathlib.Path`` → ``path.name``
    2. Plain ``str`` → ``os.path.basename(source)``
    3. Object exposing a non-empty ``.path`` or ``.name`` string attribute
    4. ``str(source)`` as last resort
    """
    if isinstance(source, pathlib.Path):
        return source.name
    if isinstance(source, str):
        return os.path.basename(source)
    # First non-empty string attribute wins, ``path`` before ``name``.
    found = next(
        (
            value
            for value in (getattr(source, attr, None) for attr in ("path", "name"))
            if isinstance(value, str) and value
        ),
        None,
    )
    if found is not None:
        return os.path.basename(found)
    return str(source)

parse_temporal_range

parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]

Return (start, end) timestamps for the granule named filename.

Only the basename of filename is examined.

Parameters:

Name Type Description Default
filename str

File path or basename.

required

Returns:

Type Description
tuple[Timestamp, Timestamp]

Inclusive start and end dates (time component is midnight UTC).

Raises:

Type Description
ValueError

If no recognisable date pattern is found in filename.

Source code in point_collocation/core/_granule.py
def parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]:
    """Return ``(start, end)`` timestamps for the granule named *filename*.

    Only the basename of *filename* is examined.

    Parameters
    ----------
    filename:
        File path or basename.

    Returns
    -------
    tuple[pandas.Timestamp, pandas.Timestamp]
        Inclusive start and end dates (time component is midnight UTC).

    Raises
    ------
    ValueError
        If no recognisable date pattern is found in *filename*.
    """
    basename = os.path.basename(filename)

    # (regex, strptime format) candidates, tried strictly in this order.
    # Two-group patterns carry an explicit start and end; one-group
    # patterns carry only a start whose end is inferred from the period
    # keyword (.DAY., .8D., .MO., ...) embedded in the filename.
    candidates = (
        (r"(?<!\d)(\d{7})_(\d{7})(?!\d)", "%Y%j"),        # DOY pair
        (r"(?<!\d)(20\d{6})_(20\d{6})(?!\d)", "%Y%m%d"),  # calendar pair
        (r"(?<!\d)(\d{7})(?!\d)", "%Y%j"),                # single DOY
        (r"(?<!\d)(20\d{6})(?!\d)", "%Y%m%d"),            # single calendar
    )
    for pattern, fmt in candidates:
        m = re.search(pattern, basename)
        if m is None:
            continue
        try:
            start = datetime.strptime(m.group(1), fmt)
            if len(m.groups()) == 2:
                end = datetime.strptime(m.group(2), fmt)
            else:
                end = _infer_end_date(start, basename)
            return pd.Timestamp(start), pd.Timestamp(end)
        except ValueError:
            # Digits matched but are not a valid date; fall through to
            # the next, less specific pattern.
            continue

    raise ValueError(
        f"Cannot parse temporal range from filename: {basename!r}"
    )