Skip to content

API Reference

Auto-generated from source docstrings.


point_collocation

Top-level convenience imports:

import point_collocation as pc

pc.plan(...)     # build a matchup plan
pc.matchup(...)  # execute the plan

Core

plan

point_collocation.core.plan.plan

plan(points: PointsFrame, *, data_source: str = 'earthaccess', source_kwargs: dict[str, Any] | None = None, time_buffer: str | Timedelta | timedelta | int = '0h') -> Plan

Build a :class:Plan previewing which granules cover each point.

Parameters:

Name Type Description Default
points PointsFrame

DataFrame with at minimum lat, lon, and time (or date as an alias). If the column is named date and contains date-only values, the time-of-day is set to noon (12:00 UTC) for matching purposes.

required
data_source str

Data source to search. Currently only "earthaccess" is supported.

'earthaccess'
source_kwargs dict[str, Any] | None

Keyword arguments forwarded to earthaccess.search_data(). Must contain at least "short_name".

None
time_buffer str | Timedelta | timedelta | int

Extra temporal margin when matching a point to a granule. A point at time t matches a granule whose coverage is [begin, end] if begin - buffer ≤ t ≤ end + buffer. Accepts a :class:pandas.Timedelta, :class:datetime.timedelta, or a pandas-parseable string ("12H", "30min", …). Default is "0h" (exact overlap required).

'0h'

Returns:

Type Description
Plan

The planning object; inspect with :meth:Plan.summary and execute with :func:~point_collocation.matchup.

Raises:

Type Description
ValueError

If points is missing required columns, data_source is not recognised, or source_kwargs does not contain "short_name".

ImportError

If the earthaccess package is not installed.

Source code in point_collocation/core/plan.py
def plan(
    points: PointsFrame,
    *,
    data_source: str = "earthaccess",
    source_kwargs: dict[str, Any] | None = None,
    time_buffer: str | pd.Timedelta | datetime.timedelta | int = "0h",
) -> Plan:
    """Build a :class:`Plan` previewing which granules cover each point.

    Parameters
    ----------
    points:
        DataFrame with at minimum ``lat``, ``lon``, and ``time`` (or
        ``date`` as an alias).  If the column is named ``date`` and
        contains date-only values, the time-of-day is set to noon
        (12:00 UTC) for matching purposes.
    data_source:
        Data source to search.  Currently only ``"earthaccess"`` is
        supported.
    source_kwargs:
        Keyword arguments forwarded to ``earthaccess.search_data()``.
        Must contain at least ``"short_name"``.
    time_buffer:
        Extra temporal margin when matching a point to a granule.  A
        point at time *t* matches a granule whose coverage is
        ``[begin, end]`` if ``begin - buffer ≤ t ≤ end + buffer``.
        Accepts a :class:`pandas.Timedelta`, :class:`datetime.timedelta`,
        or a pandas-parseable string (``"12H"``, ``"30min"``, …).
        Default is ``"0h"`` (exact overlap required).

    Returns
    -------
    Plan
        The planning object; inspect with :meth:`Plan.summary` and
        execute with :func:`~point_collocation.matchup`.

    Raises
    ------
    ValueError
        If *points* is missing required columns, *data_source* is not
        recognised, or ``source_kwargs`` does not contain ``"short_name"``.
    ImportError
        If the ``earthaccess`` package is not installed.
    """
    if data_source != "earthaccess":
        raise ValueError(
            f"Unknown data_source {data_source!r}. "
            "Currently only 'earthaccess' is supported."
        )

    # Fail fast on the documented contract: the search cannot run without
    # a collection short_name, so reject before normalising any points.
    if not source_kwargs or "short_name" not in source_kwargs:
        raise ValueError(
            "source_kwargs must be a dict containing at least 'short_name', "
            "e.g. source_kwargs={'short_name': '<collection>'}."
        )

    points = _plan_normalise_time(points)
    _plan_validate_points(points)

    buffer = _parse_time_buffer(time_buffer)
    results, granule_metas = _search_earthaccess(points, source_kwargs=source_kwargs)
    point_granule_map = _match_points_to_granules(points, granule_metas, buffer)

    return Plan(
        points=points,
        results=results,
        granules=granule_metas,
        point_granule_map=point_granule_map,
        source_kwargs=dict(source_kwargs),
        time_buffer=buffer,
    )

matchup

point_collocation.core.engine.matchup

matchup(plan: 'Plan', *, geometry: str, variables: list[str] | None = None, open_method: str | None = None, spatial_method: str | None = None, open_dataset_kwargs: dict | None = None, silent: bool = False, batch_size: int = 10, save_dir: str | PathLike | None = None, granule_range: tuple[int, int] | None = None) -> pd.DataFrame

Extract variables from cloud-hosted granules at the given points.

Parameters:

Name Type Description Default
plan 'Plan'

A :class:~point_collocation.core.plan.Plan object previously built with :func:~point_collocation.plan. Data source and search parameters are taken from the plan. One output row is produced per (point, granule) pair; points with zero matching granules produce a single NaN row.

required
geometry str

Data geometry type. Must be "grid" (L3/gridded, 1-D lat/lon coordinates) or "swath" (L2/swath, 2-D lat/lon arrays). This is a required argument — no default is provided.

required
variables list[str] | None

Variable names to extract from each granule. When provided, overrides any variables stored on the plan. When omitted, falls back to plan.variables. If the resolved list is empty, the output will have no variable columns. Raises :exc:ValueError if a requested variable is not found in the opened dataset.

None
open_method str | None

How granules are opened. "dataset" opens each granule with xarray.open_dataset; "datatree-merge" opens with DataTree and merges groups into a flat dataset. Defaults to "dataset" when geometry="grid" and "datatree-merge" when geometry="swath".

None
spatial_method str | None

Method used for spatial matching. "nearest" uses ds.sel(..., method="nearest") and requires 1-D coordinates (gridded data). "xoak" uses the xoak package for nearest-neighbour matching on 2-D (irregular/swath) grids. Defaults to "nearest" when geometry="grid" and "xoak" when geometry="swath".

None
open_dataset_kwargs dict | None

Optional dictionary of keyword arguments forwarded to xarray.open_dataset for every granule opened during the run. chunks defaults to {} (lazy/dask loading) unless explicitly overridden. engine defaults to "h5netcdf" when no engine key is present in the dict.

None
silent bool

When False (default), a progress message is printed to stdout after every batch_size granules. Set to True to suppress all progress output.

False
batch_size int

Number of granules to process between progress reports (and between intermediate saves when save_dir is set). Defaults to 10.

10
save_dir str | PathLike | None

Directory in which intermediate results are saved as Parquet files after each batch of batch_size granules. The directory is created automatically if it does not exist. Each batch is saved as plan_<first>_<last>.parquet where first and last are the granule indices from the plan. When None (default), no intermediate files are written.

None
granule_range tuple[int, int] | None

Optional (start, end) tuple (both 1-based and inclusive) that restricts processing to a contiguous slice of the matched granules, ordered by granule index. For example, granule_range=(261, 620) resumes from granule 261 after a crash that completed granules 1–260. Progress messages continue to report absolute granule numbers (e.g. "granules 261-270 of 620 processed") so the output is directly comparable with messages from the original run. When None (default), all matched granules are processed.

None

Returns:

Type Description
DataFrame

One row per (point, granule) pair, including a granule_id column and one column per variable. Points with zero matching granules contribute a single NaN row.

Raises:

Type Description
ValueError

If geometry is not "grid" or "swath".

ValueError

If a requested variable is not present in an opened dataset.

ValueError

If geolocation variables cannot be detected unambiguously.

ValueError

If the geolocation array dimensionality does not match geometry.

ValueError

If granule_range is not a 2-tuple of positive integers with start <= end, or if either bound exceeds the number of matched granules in the plan.

ImportError

If spatial_method="xoak" and the xoak package is not installed.

Source code in point_collocation/core/engine.py
def _is_valid_granule_range(granule_range: object) -> bool:
    """Return True when *granule_range* is a (start, end) pair of positive ints with start <= end."""
    try:
        # Tuple-unpack rejects non-iterables and wrong lengths in one step,
        # without the TypeError that len()/indexing would raise on e.g. an int.
        start, end = granule_range  # type: ignore[misc]
    except (TypeError, ValueError):
        return False
    # bool is a subclass of int; reject it so (True, 2) is not silently accepted.
    if isinstance(start, bool) or isinstance(end, bool):
        return False
    return isinstance(start, int) and isinstance(end, int) and 1 <= start <= end


def matchup(
    plan: "Plan",
    *,
    geometry: str,
    variables: list[str] | None = None,
    open_method: str | None = None,
    spatial_method: str | None = None,
    open_dataset_kwargs: dict | None = None,
    silent: bool = False,
    batch_size: int = 10,
    save_dir: str | os.PathLike | None = None,
    granule_range: tuple[int, int] | None = None,
) -> pd.DataFrame:
    """Extract variables from cloud-hosted granules at the given points.

    Parameters
    ----------
    plan:
        A :class:`~point_collocation.core.plan.Plan` object previously
        built with :func:`~point_collocation.plan`.  Data source and
        search parameters are taken from the plan.  One output row is
        produced per (point, granule) pair; points with zero matching
        granules produce a single NaN row.
    geometry:
        Data geometry type.  Must be ``"grid"`` (L3/gridded, 1-D lat/lon
        coordinates) or ``"swath"`` (L2/swath, 2-D lat/lon arrays).
        This is a required argument — no default is provided.
    variables:
        Variable names to extract from each granule.  When provided,
        overrides any variables stored on the plan.  When omitted,
        falls back to ``plan.variables``.  If the resolved list is
        empty, the output will have no variable columns.
        Raises :exc:`ValueError` if a requested variable is not found
        in the opened dataset.
    open_method:
        How granules are opened.  ``"dataset"`` opens each granule with
        ``xarray.open_dataset``; ``"datatree-merge"`` opens with
        DataTree and merges groups into a flat dataset.  Defaults to
        ``"dataset"`` when ``geometry="grid"`` and ``"datatree-merge"``
        when ``geometry="swath"``.
    spatial_method:
        Method used for spatial matching.  ``"nearest"`` uses
        ``ds.sel(..., method="nearest")`` and requires 1-D coordinates
        (gridded data).  ``"xoak"`` uses the ``xoak`` package for
        nearest-neighbour matching on 2-D (irregular/swath) grids.
        Defaults to ``"nearest"`` when ``geometry="grid"`` and
        ``"xoak"`` when ``geometry="swath"``.
    open_dataset_kwargs:
        Optional dictionary of keyword arguments forwarded to
        ``xarray.open_dataset`` for every granule opened during the run.
        ``chunks`` defaults to ``{}`` (lazy/dask loading) unless
        explicitly overridden.  ``engine`` defaults to ``"h5netcdf"``
        when no ``engine`` key is present in the dict.
    silent:
        When ``False`` (default), a progress message is printed to
        stdout after every *batch_size* granules.  Set to ``True`` to
        suppress all progress output.
    batch_size:
        Number of granules to process between progress reports (and
        between intermediate saves when *save_dir* is set).  Defaults
        to ``10``.
    save_dir:
        Directory in which intermediate results are saved as Parquet
        files after each batch of *batch_size* granules.  The directory
        is created automatically if it does not exist.  Each batch is
        saved as ``plan_<first>_<last>.parquet`` where *first* and
        *last* are the granule indices from the plan.  When ``None``
        (default), no intermediate files are written.
    granule_range:
        Optional ``(start, end)`` tuple (both **1-based and inclusive**)
        that restricts processing to a contiguous slice of the matched
        granules, ordered by granule index.  For example,
        ``granule_range=(261, 620)`` resumes from granule 261 after a
        crash that completed granules 1–260.  Progress messages continue
        to report absolute granule numbers (e.g.
        "granules 261-270 of 620 processed") so the output is directly
        comparable with messages from the original run.  When ``None``
        (default), all matched granules are processed.

    Returns
    -------
    pandas.DataFrame
        One row per (point, granule) pair, including a ``granule_id``
        column and one column per variable.  Points with zero matching
        granules contribute a single NaN row.

    Raises
    ------
    ValueError
        If ``geometry`` is not ``"grid"`` or ``"swath"``.
    ValueError
        If a requested variable is not present in an opened dataset.
    ValueError
        If geolocation variables cannot be detected unambiguously.
    ValueError
        If the geolocation array dimensionality does not match *geometry*.
    ValueError
        If ``granule_range`` is not a 2-tuple of positive integers with
        ``start <= end``, or if either bound exceeds the number of matched
        granules in the plan.
    ImportError
        If ``spatial_method="xoak"`` and the ``xoak`` package is not
        installed.
    """
    if geometry not in _VALID_GEOMETRIES:
        raise ValueError(
            f"geometry={geometry!r} is not valid. "
            f"Must be one of {sorted(_VALID_GEOMETRIES)}."
        )

    # Validated via a helper so that malformed inputs (including
    # non-sequences) raise the documented ValueError, never a TypeError.
    if granule_range is not None and not _is_valid_granule_range(granule_range):
        raise ValueError(
            f"granule_range={granule_range!r} is not valid. "
            "Must be a (start, end) tuple of positive integers with start <= end, "
            "both 1-based and inclusive (e.g. granule_range=(261, 620))."
        )

    # Apply geometry-based defaults.
    if open_method is None:
        open_method = "dataset" if geometry == "grid" else "datatree-merge"
    if spatial_method is None:
        spatial_method = "nearest" if geometry == "grid" else "xoak"

    if open_method not in _VALID_OPEN_METHODS:
        raise ValueError(
            f"open_method={open_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
        )
    if spatial_method not in _VALID_SPATIAL_METHODS:
        raise ValueError(
            f"spatial_method={spatial_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_SPATIAL_METHODS)}."
        )

    # Validate xoak is importable before we start processing granules.
    if spatial_method == "xoak":
        try:
            from xoak.tree_adapters import SklearnKDTreeAdapter  # type: ignore[import-untyped]  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "The 'xoak' package (and scikit-learn) are required for spatial_method='xoak'. "
                "Install them with: pip install xoak scikit-learn"
            ) from exc

    # Explicit `variables` wins over the plan's stored defaults.
    effective_vars: list[str] = variables if variables is not None else plan.variables
    # `chunks={}` keeps granule loading lazy unless the caller overrides it.
    effective_kwargs = {"chunks": {}, **(open_dataset_kwargs or {})}
    return _execute_plan(
        plan,
        geometry=geometry,
        open_method=open_method,
        spatial_method=spatial_method,
        variables=effective_vars,
        silent=silent,
        batch_size=batch_size,
        save_dir=save_dir,
        granule_range=granule_range,
        **effective_kwargs,
    )

Plan

point_collocation.core.plan.Plan dataclass

A planned matchup: stores the point→granule mapping and search results.

Attributes:

Name Type Description
points DataFrame

Normalised points DataFrame (always has a time column).

results list[Any]

Original earthaccess result objects in search order. Passed directly to earthaccess.open() when executing the plan.

granules list[GranuleMeta]

:class:GranuleMeta for every unique granule returned by the search (parallel with results).

point_granule_map dict[Any, list[int]]

Maps each row index of points to a (possibly empty) list of indices into granules.

variables list[str]

Default variables to extract during :func:~point_collocation.matchup. Can be overridden by passing variables directly to :func:~point_collocation.matchup.

source_kwargs dict[str, Any]

earthaccess search kwargs used to build this plan.

time_buffer Timedelta

Temporal buffer that was applied when matching points to granules.

Source code in point_collocation/core/plan.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
@dataclass
class Plan:
    """A planned matchup: stores the point→granule mapping and search results.

    Attributes
    ----------
    points:
        Normalised points DataFrame (always has a ``time`` column).
    results:
        Original earthaccess result objects in search order.  Passed
        directly to ``earthaccess.open()`` when executing the plan.
    granules:
        :class:`GranuleMeta` for every unique granule returned by the
        search (parallel with *results*).
    point_granule_map:
        Maps each row index of *points* to a (possibly empty) list of
        indices into *granules*.
    variables:
        Default variables to extract during :func:`~point_collocation.matchup`.
        Can be overridden by passing ``variables`` directly to
        :func:`~point_collocation.matchup`.
    source_kwargs:
        earthaccess search kwargs used to build this plan.
    time_buffer:
        Temporal buffer that was applied when matching points to granules.
    """

    points: pd.DataFrame  # normalised points DataFrame; always has a "time" column
    results: list[Any]  # earthaccess result objects, in search order
    granules: list[GranuleMeta]  # per-granule metadata, parallel with `results`
    point_granule_map: dict[Any, list[int]]  # points row index -> indices into `granules`
    variables: list[str] = field(default_factory=list)  # default variables for matchup()
    source_kwargs: dict[str, Any] = field(default_factory=dict)  # earthaccess search kwargs used to build this plan
    time_buffer: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(0))  # temporal buffer used when matching

    # ------------------------------------------------------------------
    # Indexing — plan[0] returns a result object; plan[0:10] returns a
    # subset Plan restricted to the sliced points.
    # ------------------------------------------------------------------

    def __getitem__(self, idx: int | slice) -> "Plan | Any":
        """Index into the plan.

        Parameters
        ----------
        idx:
            * **Integer** — returns the earthaccess result object at that
              position (``self.results[idx]``), so ``plan[0]`` remains
              usable with :meth:`open_dataset`.
            * **Slice** — returns a new :class:`Plan` restricted to the
              sliced points (``points.iloc[idx]``); ``granules``,
              ``results``, and ``point_granule_map`` are filtered and
              re-indexed to match.  Handy for dry-running part of a large
              plan::

                  res = pc.matchup(plan[0:10], geometry="grid", variables=["avw"])
        """
        if isinstance(idx, int):
            return self.results[idx]

        # --- Slice: build a reduced Plan around the selected points ---
        sliced_points = self.points.iloc[idx]
        kept_pt_indices = list(sliced_points.index)

        # Unique granule indices referenced by the kept points, ascending.
        kept_g_indices = sorted(
            {
                g_idx
                for pt_idx in kept_pt_indices
                for g_idx in self.point_granule_map.get(pt_idx, [])
            }
        )

        # Old granule index -> position in the reduced lists.
        remap = {old: new for new, old in enumerate(kept_g_indices)}

        # Rebuild granule metadata (result_index renumbered from 0) and
        # gather only the result objects those granules reference.
        reduced_granules: list[GranuleMeta] = []
        reduced_results: list[Any] = []
        for new_idx, old_idx in enumerate(kept_g_indices):
            meta = self.granules[old_idx]
            reduced_granules.append(
                GranuleMeta(
                    granule_id=meta.granule_id,
                    begin=meta.begin,
                    end=meta.end,
                    bbox=meta.bbox,
                    result_index=new_idx,
                )
            )
            reduced_results.append(self.results[meta.result_index])

        reduced_map: dict[Any, list[int]] = {
            pt_idx: [remap[g] for g in self.point_granule_map.get(pt_idx, [])]
            for pt_idx in kept_pt_indices
        }

        return Plan(
            points=sliced_points,
            results=reduced_results,
            granules=reduced_granules,
            point_granule_map=reduced_map,
            variables=list(self.variables),
            source_kwargs=dict(self.source_kwargs),
            time_buffer=self.time_buffer,
        )

    # ------------------------------------------------------------------
    # Dataset opening helpers
    # ------------------------------------------------------------------

    def open_dataset(
        self,
        result: Any,
        geometry: str | None = None,
        open_method: str | None = None,
        open_dataset_kwargs: dict[str, Any] | None = None,
    ) -> "xr.Dataset":
        """Open one granule result as an :class:`xarray.Dataset`.

        Parameters
        ----------
        result:
            A single earthaccess result object, typically ``plan[n]``.
        geometry:
            ``"grid"`` (L3/gridded) or ``"swath"`` (L2/swath).  Only used
            to pick the default *open_method* when that is not given.
        open_method:
            ``"dataset"`` opens with a plain ``xarray.open_dataset`` call
            (default for ``None``/``"grid"`` geometry).
            ``"datatree-merge"`` opens as a DataTree and flattens all
            groups into one dataset (default for ``"swath"``).
        open_dataset_kwargs:
            Extra keyword arguments for ``xarray.open_dataset`` /
            ``xarray.open_datatree``.  ``chunks`` defaults to ``{}``
            (lazy/dask loading) and ``engine`` to ``"h5netcdf"`` unless
            supplied.

        Returns
        -------
        xarray.Dataset
        """
        from point_collocation.core.engine import (
            _VALID_GEOMETRIES,
            _VALID_OPEN_METHODS,
            _merge_datatree,
            _open_datatree,
        )

        if geometry is not None and geometry not in _VALID_GEOMETRIES:
            raise ValueError(
                f"geometry={geometry!r} is not valid. "
                f"Must be one of {sorted(_VALID_GEOMETRIES)}."
            )

        if open_method is None:
            # Swath granules are grouped files; everything else opens flat.
            open_method = "datatree-merge" if geometry == "swath" else "dataset"
        if open_method not in _VALID_OPEN_METHODS:
            raise ValueError(
                f"open_method={open_method!r} is not valid. "
                f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
            )

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        # Caller-supplied kwargs win; fill in the lazy-loading defaults.
        resolved_kwargs = dict(open_dataset_kwargs or {})
        resolved_kwargs.setdefault("chunks", {})
        resolved_kwargs.setdefault("engine", "h5netcdf")

        file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
        if len(file_objs) != 1:
            raise RuntimeError(
                f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
            )
        file_obj = file_objs[0]

        if open_method != "datatree-merge":
            return xr.open_dataset(file_obj, **resolved_kwargs)  # type: ignore[arg-type]

        tree = _open_datatree(file_obj, resolved_kwargs)
        try:
            return _merge_datatree(tree)
        finally:
            # Close the tree even if merging raises.
            if hasattr(tree, "close"):
                tree.close()

    def open_mfdataset(
        self,
        results: "list[Any] | Plan",
        geometry: str | None = None,
        open_method: str | None = None,
        open_dataset_kwargs: dict[str, Any] | None = None,
    ) -> "xr.Dataset":
        """Open several granule results as one :class:`xarray.Dataset`.

        Parameters
        ----------
        results:
            Earthaccess result objects to open — a plain list, or a
            :class:`Plan` (e.g. ``plan[0:2]``), in which case its
            ``results`` attribute is used.
        geometry:
            ``"grid"`` (L3/gridded) or ``"swath"`` (L2/swath).  Only used
            to pick the default *open_method* when that is not given.
        open_method:
            ``"dataset"`` combines all files via ``xarray.open_mfdataset``
            (default for ``None``/``"grid"`` geometry).
            ``"datatree-merge"`` flattens each granule's DataTree into a
            dataset and concatenates the granules along a new ``granule``
            dimension (default for ``"swath"``).
        open_dataset_kwargs:
            Extra keyword arguments for ``xarray.open_mfdataset`` /
            ``xarray.open_datatree``.  ``chunks`` defaults to ``{}``
            (lazy/dask loading) and ``engine`` to ``"h5netcdf"`` unless
            supplied.

        Returns
        -------
        xarray.Dataset
        """
        from point_collocation.core.engine import (
            _VALID_GEOMETRIES,
            _VALID_OPEN_METHODS,
            _merge_datatree,
            _open_datatree,
        )

        if geometry is not None and geometry not in _VALID_GEOMETRIES:
            raise ValueError(
                f"geometry={geometry!r} is not valid. "
                f"Must be one of {sorted(_VALID_GEOMETRIES)}."
            )

        if open_method is None:
            # Swath granules are grouped files; everything else opens flat.
            open_method = "datatree-merge" if geometry == "swath" else "dataset"
        if open_method not in _VALID_OPEN_METHODS:
            raise ValueError(
                f"open_method={open_method!r} is not valid. "
                f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
            )

        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        import xarray as xr

        # Caller-supplied kwargs win; fill in the lazy-loading defaults.
        resolved_kwargs = dict(open_dataset_kwargs or {})
        resolved_kwargs.setdefault("chunks", {})
        resolved_kwargs.setdefault("engine", "h5netcdf")

        to_open = results.results if isinstance(results, Plan) else list(results)
        file_objs = earthaccess.open(to_open, pqdm_kwargs={"disable": True})

        if open_method != "datatree-merge":
            return xr.open_mfdataset(file_objs, **resolved_kwargs)  # type: ignore[arg-type]

        # Flatten each granule's DataTree, then stack everything along a
        # fresh "granule" dimension.
        flattened: list[xr.Dataset] = []
        for file_obj in file_objs:
            tree = _open_datatree(file_obj, resolved_kwargs)
            try:
                flattened.append(_merge_datatree(tree))
            finally:
                # Close the tree even if merging raises.
                if hasattr(tree, "close"):
                    tree.close()
        if not flattened:
            return xr.Dataset()
        return xr.concat(flattened, dim="granule")

    # ------------------------------------------------------------------
    # Variable inspection
    # ------------------------------------------------------------------

    def show_variables(
        self,
        geometry: str,
        open_method: str | None = None,
        open_dataset_kwargs: dict[str, Any] | None = None,
    ) -> None:
        """Open the first granule and print its dimensions and variables.

        Uses :meth:`open_dataset` (or a DataTree for
        ``open_method="datatree-merge"``) to load the first result in the
        plan, then prints the dataset dimensions, data variable names, and
        geolocation detection results.  This lets users discover available
        variable names before running a full :func:`~point_collocation.matchup`.

        Parameters
        ----------
        geometry:
            Data geometry type.  Must be ``"grid"`` (L3/gridded, 1-D
            lat/lon coordinates) or ``"swath"`` (L2/swath, 2-D lat/lon
            arrays).  This is a required argument — no default is provided.
        open_method:
            How to open the granule.  ``"dataset"`` uses a plain
            ``xarray.open_dataset`` call.  ``"datatree-merge"`` opens as a
            DataTree, merges into a flat dataset, then prints the merged
            summary followed by group details at the end.  Defaults to
            ``"dataset"`` when ``geometry="grid"`` and ``"datatree-merge"``
            when ``geometry="swath"``.
        open_dataset_kwargs:
            Keyword arguments forwarded to ``xarray.open_dataset`` when
            opening the first granule.  Passed unchanged to
            :meth:`open_dataset`.

        Raises
        ------
        ValueError
            If *geometry* or *open_method* is not a recognised value, or
            the plan contains no granules.
        ImportError
            If the ``earthaccess`` package is not installed.
        RuntimeError
            If ``earthaccess.open`` does not return exactly one file object.
        """
        # Imported lazily so the engine module is only required when this
        # inspection method is actually used.
        from point_collocation.core.engine import (
            _GEOLOC_PAIRS,
            _VALID_GEOMETRIES,
            _VALID_OPEN_METHODS,
            _merge_datatree,
            _open_datatree,
        )

        if geometry not in _VALID_GEOMETRIES:
            raise ValueError(
                f"geometry={geometry!r} is not valid. "
                f"Must be one of {sorted(_VALID_GEOMETRIES)}."
            )

        # Default open method follows geometry (see docstring).
        if open_method is None:
            open_method = "dataset" if geometry == "grid" else "datatree-merge"

        if open_method not in _VALID_OPEN_METHODS:
            raise ValueError(
                f"open_method={open_method!r} is not valid. "
                f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
            )

        if not self.results:
            raise ValueError("No granules in plan — cannot show variables.")

        import xarray as xr

        # chunks={} requests lazy (dask-backed) loading unless the caller
        # explicitly overrides it; the backend engine defaults to "h5netcdf".
        kwargs: dict[str, Any] = {"chunks": {}, **(open_dataset_kwargs or {})}
        if "engine" not in kwargs:
            kwargs["engine"] = "h5netcdf"

        # earthaccess is an optional dependency; import only when needed.
        try:
            import earthaccess  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'earthaccess' package is required. "
                "Install it with: pip install earthaccess"
            ) from exc

        # Only the first granule in the plan is opened for inspection.
        file_objs = earthaccess.open([self.results[0]], pqdm_kwargs={"disable": True})
        if len(file_objs) != 1:
            raise RuntimeError(
                f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
            )
        file_obj = file_objs[0]

        print(f"geometry     : {geometry!r}")
        print(f"open_method  : {open_method!r}")

        if open_method == "datatree-merge":
            # Open as DataTree and merge for the summary view.
            dt = _open_datatree(file_obj, kwargs)
            ds_flat = _merge_datatree(dt)

            # Print merged summary first.
            print(f"Dimensions : {dict(ds_flat.sizes)}")
            print(f"Variables  : {list(ds_flat.data_vars)}")
        else:
            ds_flat = xr.open_dataset(file_obj, **kwargs)  # type: ignore[arg-type]
            print(f"Dimensions : {dict(ds_flat.sizes)}")
            print(f"Variables  : {list(ds_flat.data_vars)}")

        # Geolocation detection results.  A pair counts as present whether
        # the names appear as coordinates or as plain data variables.
        found_pairs: list[tuple[str, str]] = []
        for lon_name, lat_name in _GEOLOC_PAIRS:
            has_lon = lon_name in ds_flat.coords or lon_name in ds_flat.data_vars
            has_lat = lat_name in ds_flat.coords or lat_name in ds_flat.data_vars
            if has_lon and has_lat:
                found_pairs.append((lon_name, lat_name))

        if len(found_pairs) == 0:
            # Suggest the other open method — it may surface different
            # (group-nested or flattened) variables.
            alt_open_method = "datatree-merge" if open_method == "dataset" else "dataset"
            alt = f"plan.show_variables(geometry={geometry!r}, open_method={alt_open_method!r})"
            print(
                f"\nGeolocation: NONE detected with open_method={open_method!r}. "
                f"Try {alt}."
            )
        elif len(found_pairs) == 1:
            lon_n, lat_n = found_pairs[0]
            lon_var = ds_flat.coords[lon_n] if lon_n in ds_flat.coords else ds_flat[lon_n]
            lat_var = ds_flat.coords[lat_n] if lat_n in ds_flat.coords else ds_flat[lat_n]
            print(
                f"\nGeolocation: ({lon_n!r}, {lat_n!r}) — "
                f"lon dims={tuple(lon_var.dims)}, lat dims={tuple(lat_var.dims)}"
            )
        else:
            print(f"\nGeolocation: ambiguous — detected pairs: {found_pairs}")

        # For datatree-merge, print group details at the end.
        if open_method == "datatree-merge":
            print("\nDataTree groups (detail):")
            try:
                # xarray DataTree API (>= 2024.x).
                for node in dt.subtree:  # type: ignore[union-attr]
                    path = node.path if hasattr(node, "path") else str(node.name)
                    ds_node = node.ds
                    if ds_node is not None:
                        dims_str = dict(ds_node.sizes)
                        vars_list = list(ds_node.data_vars)
                        print(f"  {path or '/'}")
                        print(f"    Dimensions : {dims_str}")
                        print(f"    Variables  : {vars_list}")
            except AttributeError:
                # datatree package API.
                for path, node in dt.items():  # type: ignore[union-attr]
                    ds_node = node.ds
                    if ds_node is not None:
                        dims_str = dict(ds_node.sizes)
                        vars_list = list(ds_node.data_vars)
                        print(f"  {path or '/'}")
                        print(f"    Dimensions : {dims_str}")
                        print(f"    Variables  : {vars_list}")

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------

    def summary(self, n: int | None = None) -> None:
        """Print a human-readable summary of the plan.

        Parameters
        ----------
        n:
            Number of points to show in the per-point section.
            Defaults to ``min(5, len(self.points))``.
            ``0`` or negative values suppress the per-point section.
        """
        if n is None:
            n = min(5, len(self.points))
        elif n < 0:
            n = 0

        zero_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) == 0
        )
        multi_match = sum(
            1 for g_list in self.point_granule_map.values() if len(g_list) > 1
        )

        matched_granule_count = len(
            {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
        )

        lines: list[str] = [
            f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
            f"  Points with 0 matches : {zero_match}",
            f"  Points with >1 matches: {multi_match}",
            f"  Time buffer: {self.time_buffer}",
        ]

        n_show = min(n, len(self.points))
        if n_show > 0:
            lines.append("")
            lines.append(f"First {n_show} point(s):")
            for pt_idx, row in self.points.head(n_show).iterrows():
                g_indices = self.point_granule_map.get(pt_idx, [])
                lines.append(
                    f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                    f"time={row['time']}: {len(g_indices)} match(es)"
                )
                for g_idx in g_indices:
                    lines.append(f"    → {self.granules[g_idx].granule_id}")

        print("\n".join(lines))

summary

summary(n: int | None = None) -> None

Print a human-readable summary of the plan.

Parameters:

Name Type Description Default
n int | None

Number of points to show in the per-point section. Defaults to min(5, len(self.points)). 0 or negative values suppress the per-point section.

None
Source code in point_collocation/core/plan.py
def summary(self, n: int | None = None) -> None:
    """Print a human-readable summary of the plan.

    Parameters
    ----------
    n:
        Number of points to show in the per-point section.
        Defaults to ``min(5, len(self.points))``.
        ``0`` or negative values suppress the per-point section.
    """
    if n is None:
        n = min(5, len(self.points))
    elif n < 0:
        n = 0

    zero_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) == 0
    )
    multi_match = sum(
        1 for g_list in self.point_granule_map.values() if len(g_list) > 1
    )

    matched_granule_count = len(
        {g_idx for g_list in self.point_granule_map.values() for g_idx in g_list}
    )

    lines: list[str] = [
        f"Plan: {len(self.points)} points → {matched_granule_count} unique granule(s)",
        f"  Points with 0 matches : {zero_match}",
        f"  Points with >1 matches: {multi_match}",
        f"  Time buffer: {self.time_buffer}",
    ]

    n_show = min(n, len(self.points))
    if n_show > 0:
        lines.append("")
        lines.append(f"First {n_show} point(s):")
        for pt_idx, row in self.points.head(n_show).iterrows():
            g_indices = self.point_granule_map.get(pt_idx, [])
            lines.append(
                f"  [{pt_idx}] lat={row['lat']:.4f}, lon={row['lon']:.4f}, "
                f"time={row['time']}: {len(g_indices)} match(es)"
            )
            for g_idx in g_indices:
                lines.append(f"    → {self.granules[g_idx].granule_id}")

    print("\n".join(lines))

show_variables

show_variables(geometry: str, open_method: str | None = None, open_dataset_kwargs: dict[str, Any] | None = None) -> None

Open the first granule and print its dimensions and variables.

Uses :meth:open_dataset (or a DataTree for open_method="datatree-merge") to load the first result in the plan, then prints the dataset dimensions, data variable names, and geolocation detection results. This lets users discover available variable names before running a full :func:~point_collocation.matchup.

Parameters:

Name Type Description Default
geometry str

Data geometry type. Must be "grid" (L3/gridded, 1-D lat/lon coordinates) or "swath" (L2/swath, 2-D lat/lon arrays). This is a required argument — no default is provided.

required
open_method str | None

How to open the granule. "dataset" uses a plain xarray.open_dataset call. "datatree-merge" opens as a DataTree, merges into a flat dataset, then prints the merged summary followed by group details at the end. Defaults to "dataset" when geometry="grid" and "datatree-merge" when geometry="swath".

None
open_dataset_kwargs dict[str, Any] | None

Keyword arguments forwarded to xarray.open_dataset when opening the first granule. Passed unchanged to :meth:open_dataset.

None

Raises:

Type Description
ValueError

If the plan contains no granules.

Source code in point_collocation/core/plan.py
def show_variables(
    self,
    geometry: str,
    open_method: str | None = None,
    open_dataset_kwargs: dict[str, Any] | None = None,
) -> None:
    """Open the first granule and print its dimensions and variables.

    Uses :meth:`open_dataset` (or a DataTree for
    ``open_method="datatree-merge"``) to load the first result in the
    plan, then prints the dataset dimensions, data variable names, and
    geolocation detection results.  This lets users discover available
    variable names before running a full :func:`~point_collocation.matchup`.

    Parameters
    ----------
    geometry:
        Data geometry type.  Must be ``"grid"`` (L3/gridded, 1-D
        lat/lon coordinates) or ``"swath"`` (L2/swath, 2-D lat/lon
        arrays).  This is a required argument — no default is provided.
    open_method:
        How to open the granule.  ``"dataset"`` uses a plain
        ``xarray.open_dataset`` call.  ``"datatree-merge"`` opens as a
        DataTree, merges into a flat dataset, then prints the merged
        summary followed by group details at the end.  Defaults to
        ``"dataset"`` when ``geometry="grid"`` and ``"datatree-merge"``
        when ``geometry="swath"``.
    open_dataset_kwargs:
        Keyword arguments forwarded to ``xarray.open_dataset`` when
        opening the first granule.  Passed unchanged to
        :meth:`open_dataset`.

    Raises
    ------
    ValueError
        If *geometry* or *open_method* is not a recognised value, or
        the plan contains no granules.
    ImportError
        If the ``earthaccess`` package is not installed.
    RuntimeError
        If ``earthaccess.open`` does not return exactly one file object.
    """
    # Imported lazily so the engine module is only required when this
    # inspection method is actually used.
    from point_collocation.core.engine import (
        _GEOLOC_PAIRS,
        _VALID_GEOMETRIES,
        _VALID_OPEN_METHODS,
        _merge_datatree,
        _open_datatree,
    )

    if geometry not in _VALID_GEOMETRIES:
        raise ValueError(
            f"geometry={geometry!r} is not valid. "
            f"Must be one of {sorted(_VALID_GEOMETRIES)}."
        )

    # Default open method follows geometry (see docstring).
    if open_method is None:
        open_method = "dataset" if geometry == "grid" else "datatree-merge"

    if open_method not in _VALID_OPEN_METHODS:
        raise ValueError(
            f"open_method={open_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
        )

    if not self.results:
        raise ValueError("No granules in plan — cannot show variables.")

    import xarray as xr

    # chunks={} requests lazy (dask-backed) loading unless the caller
    # explicitly overrides it; the backend engine defaults to "h5netcdf".
    kwargs: dict[str, Any] = {"chunks": {}, **(open_dataset_kwargs or {})}
    if "engine" not in kwargs:
        kwargs["engine"] = "h5netcdf"

    # earthaccess is an optional dependency; import only when needed.
    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    # Only the first granule in the plan is opened for inspection.
    file_objs = earthaccess.open([self.results[0]], pqdm_kwargs={"disable": True})
    if len(file_objs) != 1:
        raise RuntimeError(
            f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
        )
    file_obj = file_objs[0]

    print(f"geometry     : {geometry!r}")
    print(f"open_method  : {open_method!r}")

    if open_method == "datatree-merge":
        # Open as DataTree and merge for the summary view.
        dt = _open_datatree(file_obj, kwargs)
        ds_flat = _merge_datatree(dt)

        # Print merged summary first.
        print(f"Dimensions : {dict(ds_flat.sizes)}")
        print(f"Variables  : {list(ds_flat.data_vars)}")
    else:
        ds_flat = xr.open_dataset(file_obj, **kwargs)  # type: ignore[arg-type]
        print(f"Dimensions : {dict(ds_flat.sizes)}")
        print(f"Variables  : {list(ds_flat.data_vars)}")

    # Geolocation detection results.  A pair counts as present whether
    # the names appear as coordinates or as plain data variables.
    found_pairs: list[tuple[str, str]] = []
    for lon_name, lat_name in _GEOLOC_PAIRS:
        has_lon = lon_name in ds_flat.coords or lon_name in ds_flat.data_vars
        has_lat = lat_name in ds_flat.coords or lat_name in ds_flat.data_vars
        if has_lon and has_lat:
            found_pairs.append((lon_name, lat_name))

    if len(found_pairs) == 0:
        # Suggest the other open method — it may surface different
        # (group-nested or flattened) variables.
        alt_open_method = "datatree-merge" if open_method == "dataset" else "dataset"
        alt = f"plan.show_variables(geometry={geometry!r}, open_method={alt_open_method!r})"
        print(
            f"\nGeolocation: NONE detected with open_method={open_method!r}. "
            f"Try {alt}."
        )
    elif len(found_pairs) == 1:
        lon_n, lat_n = found_pairs[0]
        lon_var = ds_flat.coords[lon_n] if lon_n in ds_flat.coords else ds_flat[lon_n]
        lat_var = ds_flat.coords[lat_n] if lat_n in ds_flat.coords else ds_flat[lat_n]
        print(
            f"\nGeolocation: ({lon_n!r}, {lat_n!r}) — "
            f"lon dims={tuple(lon_var.dims)}, lat dims={tuple(lat_var.dims)}"
        )
    else:
        print(f"\nGeolocation: ambiguous — detected pairs: {found_pairs}")

    # For datatree-merge, print group details at the end.
    if open_method == "datatree-merge":
        print("\nDataTree groups (detail):")
        try:
            # xarray DataTree API (>= 2024.x).
            for node in dt.subtree:  # type: ignore[union-attr]
                path = node.path if hasattr(node, "path") else str(node.name)
                ds_node = node.ds
                if ds_node is not None:
                    dims_str = dict(ds_node.sizes)
                    vars_list = list(ds_node.data_vars)
                    print(f"  {path or '/'}")
                    print(f"    Dimensions : {dims_str}")
                    print(f"    Variables  : {vars_list}")
        except AttributeError:
            # datatree package API.
            for path, node in dt.items():  # type: ignore[union-attr]
                ds_node = node.ds
                if ds_node is not None:
                    dims_str = dict(ds_node.sizes)
                    vars_list = list(ds_node.data_vars)
                    print(f"  {path or '/'}")
                    print(f"    Dimensions : {dims_str}")
                    print(f"    Variables  : {vars_list}")

open_dataset

open_dataset(result: Any, geometry: str | None = None, open_method: str | None = None, open_dataset_kwargs: dict[str, Any] | None = None) -> 'xr.Dataset'

Open a single granule result as an :class:xarray.Dataset.

Parameters:

Name Type Description Default
result Any

A single earthaccess result object, typically obtained via plan[n].

required
geometry str | None

Data geometry type. "grid" (L3/gridded) or "swath" (L2/swath). When provided, determines the default open_method if open_method is not given explicitly.

None
open_method str | None

How to open the granule. "dataset" uses a plain xarray.open_dataset call (the default when geometry is None or "grid"). "datatree-merge" opens as a DataTree and merges all groups into a flat dataset (the default when geometry is "swath").

None
open_dataset_kwargs dict[str, Any] | None

Keyword arguments forwarded to xarray.open_dataset or xarray.open_datatree. chunks defaults to {} (lazy/dask loading) unless explicitly overridden. engine defaults to "h5netcdf" when not specified.

None

Returns:

Type Description
Dataset
Source code in point_collocation/core/plan.py
def open_dataset(
    self,
    result: Any,
    geometry: str | None = None,
    open_method: str | None = None,
    open_dataset_kwargs: dict[str, Any] | None = None,
) -> "xr.Dataset":
    """Open a single granule result as an :class:`xarray.Dataset`.

    Parameters
    ----------
    result:
        A single earthaccess result object, typically obtained via
        ``plan[n]``.
    geometry:
        Data geometry type.  ``"grid"`` (L3/gridded) or ``"swath"``
        (L2/swath).  When provided, determines the default
        ``open_method`` if *open_method* is not given explicitly.
    open_method:
        How to open the granule.  ``"dataset"`` uses a plain
        ``xarray.open_dataset`` call (the default when *geometry* is
        ``None`` or ``"grid"``).  ``"datatree-merge"`` opens as a
        DataTree and merges all groups into a flat dataset (the
        default when *geometry* is ``"swath"``).
    open_dataset_kwargs:
        Keyword arguments forwarded to ``xarray.open_dataset`` or
        ``xarray.open_datatree``.  ``chunks`` defaults to ``{}``
        (lazy/dask loading) unless explicitly overridden.  ``engine``
        defaults to ``"h5netcdf"`` when not specified.

    Returns
    -------
    xarray.Dataset

    Raises
    ------
    ValueError
        If *geometry* or *open_method* is not a recognised value.
    ImportError
        If the ``earthaccess`` package is not installed.
    RuntimeError
        If ``earthaccess.open`` does not return exactly one file object.
    """
    from point_collocation.core.engine import (
        _VALID_GEOMETRIES,
        _VALID_OPEN_METHODS,
        _merge_datatree,
        _open_datatree,
    )

    if geometry is not None and geometry not in _VALID_GEOMETRIES:
        raise ValueError(
            f"geometry={geometry!r} is not valid. "
            f"Must be one of {sorted(_VALID_GEOMETRIES)}."
        )

    # Resolve open_method default from geometry.
    if open_method is None:
        open_method = "datatree-merge" if geometry == "swath" else "dataset"

    if open_method not in _VALID_OPEN_METHODS:
        raise ValueError(
            f"open_method={open_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
        )

    # earthaccess is an optional dependency; import only when needed.
    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    # Lazy loading by default; caller-supplied kwargs win over defaults.
    kwargs = {"chunks": {}, **(open_dataset_kwargs or {})}
    if "engine" not in kwargs:
        kwargs["engine"] = "h5netcdf"

    file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
    if len(file_objs) != 1:
        raise RuntimeError(
            f"Expected 1 file object from earthaccess.open, got {len(file_objs)}."
        )

    if open_method == "datatree-merge":
        dt = _open_datatree(file_objs[0], kwargs)
        try:
            return _merge_datatree(dt)
        finally:
            # NOTE(review): closing the DataTree right after merging assumes
            # the merged dataset no longer needs the underlying file handles —
            # confirm this holds for lazily-loaded (dask-backed) variables.
            if hasattr(dt, "close"):
                dt.close()

    return xr.open_dataset(file_objs[0], **kwargs)  # type: ignore[arg-type]

open_mfdataset

open_mfdataset(results: 'list[Any] | Plan', geometry: str | None = None, open_method: str | None = None, open_dataset_kwargs: dict[str, Any] | None = None) -> 'xr.Dataset'

Open multiple granule results as a single :class:xarray.Dataset.

Parameters:

Name Type Description Default
results 'list[Any] | Plan'

A list of earthaccess result objects, or a :class:Plan (e.g. plan[0:2]). When a :class:Plan is passed its results attribute is used.

required
geometry str | None

Data geometry type. "grid" (L3/gridded) or "swath" (L2/swath). When provided, determines the default open_method if open_method is not given explicitly.

None
open_method str | None

How to open each granule. "dataset" uses xarray.open_mfdataset across all file objects (the default when geometry is None or "grid"). "datatree-merge" opens each granule as a DataTree, merges its groups into a flat dataset, then concatenates all granules along a new granule dimension (the default when geometry is "swath").

None
open_dataset_kwargs dict[str, Any] | None

Keyword arguments forwarded to xarray.open_mfdataset or xarray.open_datatree. chunks defaults to {} (lazy/dask loading) unless explicitly overridden. engine defaults to "h5netcdf" when not specified.

None

Returns:

Type Description
Dataset
Source code in point_collocation/core/plan.py
def open_mfdataset(
    self,
    results: "list[Any] | Plan",
    geometry: str | None = None,
    open_method: str | None = None,
    open_dataset_kwargs: dict[str, Any] | None = None,
) -> "xr.Dataset":
    """Open multiple granule results as a single :class:`xarray.Dataset`.

    Parameters
    ----------
    results:
        A list of earthaccess result objects, or a :class:`Plan`
        (e.g. ``plan[0:2]``).  When a :class:`Plan` is passed its
        ``results`` attribute is used.
    geometry:
        Data geometry type.  ``"grid"`` (L3/gridded) or ``"swath"``
        (L2/swath).  When provided, determines the default
        ``open_method`` if *open_method* is not given explicitly.
    open_method:
        How to open each granule.  ``"dataset"`` uses
        ``xarray.open_mfdataset`` across all file objects (the default
        when *geometry* is ``None`` or ``"grid"``).
        ``"datatree-merge"`` opens each granule as a DataTree, merges
        its groups into a flat dataset, then concatenates all granules
        along a new ``granule`` dimension (the default when *geometry*
        is ``"swath"``).
    open_dataset_kwargs:
        Keyword arguments forwarded to ``xarray.open_mfdataset`` or
        ``xarray.open_datatree``.  ``chunks`` defaults to ``{}``
        (lazy/dask loading) unless explicitly overridden.  ``engine``
        defaults to ``"h5netcdf"`` when not specified.

    Returns
    -------
    xarray.Dataset

    Raises
    ------
    ValueError
        If *geometry* or *open_method* is not a recognised value.
    ImportError
        If the ``earthaccess`` package is not installed.
    """
    from point_collocation.core.engine import (
        _VALID_GEOMETRIES,
        _VALID_OPEN_METHODS,
        _merge_datatree,
        _open_datatree,
    )

    if geometry is not None and geometry not in _VALID_GEOMETRIES:
        raise ValueError(
            f"geometry={geometry!r} is not valid. "
            f"Must be one of {sorted(_VALID_GEOMETRIES)}."
        )

    # Resolve open_method default from geometry.
    if open_method is None:
        open_method = "datatree-merge" if geometry == "swath" else "dataset"

    if open_method not in _VALID_OPEN_METHODS:
        raise ValueError(
            f"open_method={open_method!r} is not valid. "
            f"Must be one of {sorted(_VALID_OPEN_METHODS)}."
        )

    # earthaccess is an optional dependency; import only when needed.
    try:
        import earthaccess  # type: ignore[import-untyped]
    except ImportError as exc:
        raise ImportError(
            "The 'earthaccess' package is required. "
            "Install it with: pip install earthaccess"
        ) from exc

    import xarray as xr

    # Lazy loading by default; caller-supplied kwargs win over defaults.
    kwargs = {"chunks": {}, **(open_dataset_kwargs or {})}
    if "engine" not in kwargs:
        kwargs["engine"] = "h5netcdf"

    # Accept either a Plan (use its results) or a plain list of results.
    result_list = results.results if isinstance(results, Plan) else list(results)
    file_objs = earthaccess.open(result_list, pqdm_kwargs={"disable": True})

    if open_method == "datatree-merge":
        # Open each granule as a DataTree, merge its groups, then
        # concatenate all granule datasets along a new "granule" dim.
        # NOTE(review): xr.concat assumes the per-granule merged datasets
        # have compatible dims/variables — confirm for mixed collections.
        merged_datasets: list[xr.Dataset] = []
        for file_obj in file_objs:
            dt = _open_datatree(file_obj, kwargs)
            try:
                merged_datasets.append(_merge_datatree(dt))
            finally:
                if hasattr(dt, "close"):
                    dt.close()
        if not merged_datasets:
            # No inputs → empty dataset rather than a concat error.
            return xr.Dataset()
        return xr.concat(merged_datasets, dim="granule")

    return xr.open_mfdataset(file_objs, **kwargs)  # type: ignore[arg-type]

IO / Adapters

point_collocation.adapters

Source adapters that normalise heterogeneous inputs into the SourceProtocol.

Built-in adapters

earthaccess : wraps file-like objects returned by earthaccess.open()

Future adapters (not yet implemented)

- stac : STAC item assets
- url : plain HTTPS URLs
- local : local file paths

SourceAdapter

Bases: ABC

Abstract base for source adapters.

Subclass this to add support for a new data source. The core engine only calls :meth:open_dataset; everything else is internal to the adapter.

Source code in point_collocation/adapters/base.py
class SourceAdapter(ABC):
    """Abstract base class for data-source adapters.

    Support for a new data source is added by subclassing.  The core
    engine invokes only :meth:`open_dataset`; any further machinery is
    an internal concern of the concrete adapter.
    """

    @abstractmethod
    def open_dataset(self, **kwargs: object) -> object:
        """Produce an ``xarray.Dataset`` for this source.

        Parameters
        ----------
        **kwargs:
            Forwarded verbatim to ``xarray.open_dataset``.
        """
        raise NotImplementedError  # pragma: no cover

open_dataset abstractmethod

open_dataset(**kwargs: object) -> object

Return an xarray.Dataset for this source.

Parameters:

Name Type Description Default
**kwargs object

Forwarded verbatim to xarray.open_dataset.

{}
Source code in point_collocation/adapters/base.py
@abstractmethod
def open_dataset(self, **kwargs: object) -> object:
    """Return an ``xarray.Dataset`` for this source.

    Concrete adapters must override this; it is the only hook the
    core engine calls on an adapter.

    Parameters
    ----------
    **kwargs:
        Forwarded verbatim to ``xarray.open_dataset``.
    """
    raise NotImplementedError  # pragma: no cover

point_collocation.core._granule

Helpers for working with individual granules (source files).

Responsibilities
  • Extract a human-readable identifier from an arbitrary source object.
  • Parse the temporal coverage (start/end date) from a NASA-style L3 granule filename.
Supported filename conventions

- YYYYDOY — single day (DOY = day-of-year, 001–366)
- YYYYDOY_YYYYDOY — multi-day range (e.g., 8-day composites, monthly)
- YYYYMMDD — single day in calendar format
- YYYYMMDD_YYYYMMDD — multi-day range in calendar format

The period keyword embedded in the filename (.DAY., .8D., .MO.) is used to infer the end date when only a start date is present.

Examples of supported filenames
  • PACE_OCI_2024070.L3m.DAY.RRS.Rrs_412.4km.nc
  • PACE_OCI_2024049_2024056.L3m.8D.CHL.chlor_a.9km.nc
  • AQUA_MODIS.20230601.L3m.DAY.SST.sst.4km.nc
  • AQUA_MODIS.20230601_20230630.L3m.MO.CHL.chlor_a.9km.nc

get_source_id

get_source_id(source: object) -> str

Return a human-readable identifier (basename) for source.

Tries, in order:

  1. pathlib.Path → path.name
  2. Plain str → os.path.basename(source)
  3. Object with a .path or .name string attribute
  4. str(source) as last resort
Source code in point_collocation/core/_granule.py
def get_source_id(source: object) -> str:
    """Best-effort human-readable identifier (basename) for *source*.

    Resolution order:

    1. ``pathlib.Path`` → its ``.name``
    2. Plain ``str`` → ``os.path.basename(source)``
    3. An object exposing a non-empty string ``.path`` or ``.name``
    4. ``str(source)`` as the final fallback
    """
    if isinstance(source, pathlib.Path):
        return source.name
    if isinstance(source, str):
        return os.path.basename(source)
    # Duck-typed file-like objects (e.g. fsspec handles) usually carry
    # a .path or .name string; use whichever appears first and is set.
    for candidate in (getattr(source, attr, None) for attr in ("path", "name")):
        if isinstance(candidate, str) and candidate:
            return os.path.basename(candidate)
    return str(source)

parse_temporal_range

parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]

Return (start, end) timestamps for the granule named filename.

Only the basename of filename is examined.

Parameters:

Name Type Description Default
filename str

File path or basename.

required

Returns:

Type Description
tuple[Timestamp, Timestamp]

Inclusive start and end dates (time component is midnight UTC).

Raises:

Type Description
ValueError

If no recognisable date pattern is found in filename.

Source code in point_collocation/core/_granule.py
def parse_temporal_range(filename: str) -> tuple[pd.Timestamp, pd.Timestamp]:
    """Parse the inclusive ``(start, end)`` date range encoded in *filename*.

    Only the basename of *filename* is examined.  Candidate patterns are
    tried in priority order: explicit date pairs first (day-of-year form,
    then calendar form), then single dates whose end date is inferred
    from the period keyword in the filename.

    Parameters
    ----------
    filename:
        File path or basename.

    Returns
    -------
    tuple[pandas.Timestamp, pandas.Timestamp]
        Inclusive start and end dates (time component is midnight UTC).

    Raises
    ------
    ValueError
        If no recognisable date pattern is found in *filename*.
    """
    basename = os.path.basename(filename)

    # (regex, strptime format, is-a-date-pair) tried in priority order.
    # Lookarounds keep a 7-digit DOY token from matching inside an
    # 8-digit calendar token and vice versa.
    attempts = (
        (r"(?<!\d)(\d{7})_(\d{7})(?!\d)", "%Y%j", True),
        (r"(?<!\d)(20\d{6})_(20\d{6})(?!\d)", "%Y%m%d", True),
        (r"(?<!\d)(\d{7})(?!\d)", "%Y%j", False),
        (r"(?<!\d)(20\d{6})(?!\d)", "%Y%m%d", False),
    )

    for pattern, fmt, is_pair in attempts:
        match = re.search(pattern, basename)
        if match is None:
            continue
        try:
            begin = datetime.strptime(match.group(1), fmt)
            finish = (
                datetime.strptime(match.group(2), fmt)
                if is_pair
                else _infer_end_date(begin, basename)
            )
            # Conversion stays inside the try: an unparseable or
            # out-of-range date falls through to the next pattern.
            return pd.Timestamp(begin), pd.Timestamp(finish)
        except ValueError:
            continue

    raise ValueError(
        f"Cannot parse temporal range from filename: {basename!r}"
    )