# Source code for opacity.dataset
import os
import xarray as xr
import zarr
from zarr.experimental.cache_store import CacheStore
from opacity.filesystem import s3_filesystem
from opacity.resolve_species import find_matching_maestro_zarr_prefix
# Names exported by ``from opacity.dataset import *``.
__all__ = ["open_dataset"]

# Root directory for local caches: an "opacities" folder inside the
# current user's home directory.
default_cache_path = os.path.join(os.path.expanduser("~"), "opacities")
# [docs]
def open_dataset(
    species,
    cache=True,
    cache_path=None,
    fsspec_filesystem=None,
    local_cache_store=None,
    max_cache_size_gb=20.0,
):
    """Open an `~xarray.Dataset` for ``species``, and optionally cache
    downloads.

    Parameters
    ----------
    species : str
        Name of the atom or molecule.
    cache : bool, optional
        On retrieving (all, or parts of) a remote array, cache the
        result locally. Default is True.
    cache_path : str or None, optional
        If specified and ``cache=True``, the dataset will be cached
        in a zarr array located at ``cache_path``. If None, a directory
        named after the remote zarr array is used inside
        ``default_cache_path``. Ignored when ``local_cache_store`` is
        given. Default is None.
    fsspec_filesystem : `~fsspec.filesystem`, optional
        Remote filesystem, with or without signed requests. If None,
        the filesystem returned by ``s3_filesystem()`` is used.
        Default is None.
    local_cache_store : `~zarr.storage.LocalStore`, optional
        User-defined local Zarr cache store. When given, it is used
        as-is and ``cache_path`` is not consulted. Default is None.
    max_cache_size_gb : float, optional
        Upper limit on the size of the local cache. It is handed to
        the cache store as ``max_cache_size_gb * 1024 ** 3``.
        Default is 20.

    Returns
    -------
    `~xarray.Dataset`
        Opacity Dataset.

    Raises
    ------
    ValueError
        If ``species`` is None.
    """
    if species is None:
        raise ValueError(
            f'opacity.open_dataset requires a species, got {species}.'
        )

    zarr_s3_prefix = find_matching_maestro_zarr_prefix(species)

    # access to a remote S3 bucket as a file system
    if fsspec_filesystem is None:
        fsspec_filesystem = s3_filesystem()

    remote_store = zarr.storage.FsspecStore(
        fs=fsspec_filesystem,
        read_only=True,
        path=zarr_s3_prefix,
    )

    if cache:
        # create local cache for reads from the remote array
        if local_cache_store is None:
            if cache_path is None:
                # give the local zarr array the same name as the remote
                # zarr array. Trailing slashes are stripped first: with
                # a prefix ending in '/', a bare split('/')[-1] is ''
                # and every species would share the root cache
                # directory.
                zarr_name = zarr_s3_prefix.rstrip('/').split('/')[-1]
                cache_path = os.path.join(default_cache_path, zarr_name)
            local_cache_store = zarr.storage.LocalStore(cache_path)

        # prepare a cache store that links the remote data to the local
        # cache, capped at max_cache_size_gb
        load_store = CacheStore(
            store=remote_store,
            cache_store=local_cache_store,
            max_size=max_cache_size_gb * 1024 ** 3,
        )
    else:
        # no caching requested: read straight from the remote store
        load_store = remote_store

    # open the dataset with zarr:
    ds = xr.open_dataset(load_store, engine='zarr')
    return ds