From 0598eacb8c59b26a275983e8924a83711d3b6eec Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 10 Apr 2024 12:45:55 +0200 Subject: [PATCH 01/14] adds GDAL retries as the default to method ItemCollection.to_xarray --- simplestac/utils.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/simplestac/utils.py b/simplestac/utils.py index 9ed7813..8f0c39f 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -27,6 +27,34 @@ logger = logging.getLogger(__name__) #### Generic functions and classes #### +# Adds GDAL_HTTP_MAX_RETRY and GDAL_HTTP_RETRY_DELAY to +# stackstac.rio_reader.DEFAULT_GDAL_ENV +# https://github.com/microsoft/PlanetaryComputerExamples/issues/279 +# while waiting for a PR to be merged: https://github.com/gjoseph92/stackstac/pull/232 +# See also https://github.com/gjoseph92/stackstac/issues/18 +DEFAULT_GDAL_ENV = stackstac.rio_reader.LayeredEnv( + always=dict( + GDAL_HTTP_MULTIRANGE="YES", # unclear if this actually works + GDAL_HTTP_MERGE_CONSECUTIVE_RANGES="YES", + # ^ unclear if this works either. won't do much when our dask chunks are aligned to the dataset's chunks. + GDAL_HTTP_MAX_RETRY="5", + GDAL_HTTP_RETRY_DELAY="1", + ), + open=dict( + GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", + # ^ stop GDAL from requesting `.aux` and `.msk` files from the bucket (speeds up `open` time a lot) + VSI_CACHE=True + # ^ cache HTTP requests for opening datasets. This is critical for `ThreadLocalRioDataset`, + # which re-opens the same URL many times---having the request cached makes subsequent `open`s + # in different threads snappy. + ), + read=dict( + VSI_CACHE=False + # ^ *don't* cache HTTP requests for actual data. We don't expect to re-request data, + # so this would just blow out the HTTP cache that we rely on to make repeated `open`s fast + # (see above) + ), +) S2_THEIA_BANDS = [f"B{i+1}" for i in range(12)]+["B8A"] S2_SEN2COR_BANDS = [f"B{i+1:02}" for i in range(12)]+["B8A"] @@ -58,7 +86,7 @@ class ExtendPystacClasses: if not inplace: return x - def to_xarray(self, xy_coords="center", bbox=None, geometry=None, **kwargs): + def to_xarray(self, xy_coords="center", bbox=None, geometry=None, gdal_env=DEFAULT_GDAL_ENV, **kwargs): """Returns a DASK xarray() This is a proxy to stackstac.stac @@ -92,7 +120,7 @@ class ExtendPystacClasses: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) try: - arr = stackstac.stack(self, xy_coords=xy_coords, **kwargs) + arr = stackstac.stack(self, xy_coords=xy_coords, gdal_env=gdal_env, **kwargs) except ValueError as e: if "Cannot automatically compute the resolution" in str(e): raise ValueError(str(e)+"\nOr drop non-raster assets from collection with ItemCollection.drop_non_raster()") -- GitLab From 247870a1e1b72c80a79974b1eb59fb181a548487 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 10 Apr 2024 16:55:57 +0200 Subject: [PATCH 02/14] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6387fd9..4720be6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - function `harmonize_sen2cor_offset`: adds an `offset` property to the assets so it is taken into account by `to_xarray`. - method `ItemCollection.drop_duplicates`: drop duplicated ID returned by pgstac. - method `ItemCollection.drop_non_raster`: drop non raster assets. +- method `ItemCollection.to_xarray` default arg `gdal_env`: now it inlcudes GDAL retries in case of failure while reading url - function `extract_points` and method `ItemCollection.extract_points` to extract points time series. - `writer_args` to `ItemCollection.apply_...` methods and function in order to specify the outputs format, e.g. the encoding. - in local.py, `start_datetime` and `end_datetime` can now be used instead of `datetime` in the template used to build a local ItemCollection. -- GitLab From e0ae19cfdb6409721e3fa2f70518e9a5a472a91a Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Tue, 23 Apr 2024 17:00:30 +0200 Subject: [PATCH 03/14] fix bug in harmonize_sen2cor_offset --- simplestac/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplestac/utils.py b/simplestac/utils.py index 8f0c39f..1a86bee 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -985,7 +985,7 @@ def harmonize_sen2cor_offset(x, bands=S2_SEN2COR_BANDS, inplace=False): item.assets[asset].extra_fields["raster:bands"] = [dict(offset=-1000)] else: item.assets[asset].extra_fields["raster:bands"] = [dict(offset=0)] - if inplace: + if not inplace: return x def extract_points(x, points, method=None, tolerance=None, drop=False): -- GitLab From 19bfc91ab77ad01516ae6edae9e80d658264e840 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Thu, 2 May 2024 19:06:06 +0200 Subject: [PATCH 04/14] update some docstring --- simplestac/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/simplestac/utils.py b/simplestac/utils.py index 1a86bee..cb9d04f 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -365,6 +365,8 @@ class ExtendPystacClasses: inplace : bool, optional Whether to modify the collection in place. Defaults to False. In that case, a cloned collection is returned. + datetime : datetime, optional + A datetime to filter the items with. Defaults to None. bbox : tuple, optional A bounding box to clip_box the items with. Defaults to None. geometry : shapely.geometry, optional @@ -614,7 +616,8 @@ def write_assets(x: Union[ItemCollection, pystac.Item], output_dir : str The directory to write the assets to. bbox : Optional - The bounding box to clip the assets to. + Argument forwarded to ItemCollection.to_xarray. + The bounding box (in the CRS of the items) to clip the assets to. remove_item_props : list of str List of regex patterns to remove from item properties. If None, no properties are removed. -- GitLab From e22fe52bd5d507fe17291d18a7433e19a265f636 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Tue, 7 May 2024 20:08:09 +0200 Subject: [PATCH 05/14] fix stac_raster_info which had wrong format for spatial_resolution in raster:bands --- simplestac/local.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/simplestac/local.py b/simplestac/local.py index dc5c091..dfe946b 100644 --- a/simplestac/local.py +++ b/simplestac/local.py @@ -153,7 +153,7 @@ def stac_proj_info(bbox, gsd, meta): return proj_info -def stac_raster_info(meta, tags, scales, offsets): +def stac_raster_info(meta, tags, gsd, scales, offsets): """Raster information returned in the STAC format. Parameters @@ -162,6 +162,8 @@ def stac_raster_info(meta, tags, scales, offsets): Metadata dict returned by rasterio. tags : dict Tags returned by rasterio. + gsd : float + Ground sample distance. scales : list Scales returned by rasterio. offsets : list @@ -170,8 +172,15 @@ def stac_raster_info(meta, tags, scales, offsets): Returns ------- dict - bands - with prefix `raster:` + STAC extension raster information, with prefix `raster:bands` + + See also + -------- + simplestac.local.get_rio_info + + Notes + ----- + See https://github.com/stac-extensions/raster """ bands = [{}] if "nodata" in meta and meta["nodata"] is not None: @@ -180,8 +189,10 @@ def stac_raster_info(meta, tags, scales, offsets): bands[0]["sampling"] = tags["AREA_OR_POINT"].lower() if "dtype" in meta: bands[0]["datatype"] = meta["dtype"] - if "resolution" in tags: - bands[0]["spatial_resolution"] = tags["resolution"] + + # 'resolution' is not always in tags, thus gsd is used instead. + bands[0]["spatial_resolution"] = gsd + if scales is not None: bands[0]["scale"] = scales[0] if offsets is not None: @@ -347,7 +358,7 @@ def stac_asset_info_from_raster(band_file, band_fmt=None): # It could be set at the item level otherwise. proj_info = stac_proj_info(bbox, gsd, meta) stac_fields.update(proj_info) - raster_info = stac_raster_info(meta, tags, scales, offsets) + raster_info = stac_raster_info(meta, tags, gsd, scales, offsets) stac_fields.update(raster_info) return stac_fields -- GitLab From 97afc90e9b9d32e0ca59fa30b2b0b61209aa309b Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Tue, 7 May 2024 20:09:04 +0200 Subject: [PATCH 06/14] add print when downloading data in conftest --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index f4ae733..39b8819 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ logging.basicConfig(level=logging.INFO) here = Path(__file__).parent download_script = here / "download_data.py" +print("Downloading the test data...") exec(open(download_script).read()) @pytest.fixture(scope="session") -- GitLab From e30287faba334cbabab315016c9b558b4138dd0a Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 11:35:08 +0200 Subject: [PATCH 07/14] add test for raster:bands spatial_resolution --- tests/test_local.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_local.py b/tests/test_local.py index 3471050..49e7263 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -14,6 +14,10 @@ def test_build(s2scene_dir): col = build_item_collection(s2scene_dir, collection_format()) assert len(col) == len(s2scene_dir.dirs()) assert len(col[0].assets) == 11 + extra_fields = col[0].assets["B02"].extra_fields + raster_bands = extra_fields["raster:bands"][0] + assert raster_bands["spatial_resolution"] == 10 + def test_datetime(s2scene_dir): fmt = collection_format() -- GitLab From 9db41b62c6120a4348f5fddcda0f6f4b948d6496 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 11:43:31 +0200 Subject: [PATCH 08/14] change CI --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 41bc8c8..f6182b3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -37,7 +37,7 @@ pages: artifacts: paths: - public - only: - - main - - pages + rules: + - if: $CI_COMMIT_TAG =~ /v\.[0-9]+\.[0-9]+\.[0-9]+$/ + - if: $CI_COMMIT_BRANCH == pages -- GitLab From 5014ced23fc57e1ba16cd47cec77397fb1a1c10b Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 12:40:58 +0200 Subject: [PATCH 09/14] add support for direct GeoDataFrame in to_xarray() --- simplestac/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplestac/utils.py b/simplestac/utils.py index cb9d04f..865f239 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -130,6 +130,8 @@ class ExtendPystacClasses: if bbox is not None: arr = arr.rio.clip_box(*bbox) if geometry is not None: + if isinstance(geometry, gpd.GeoDataFrame): + geometry = geometry.geometry if hasattr(geometry, 'crs') and not geometry.crs.equals(arr.rio.crs): logger.debug(f"Reprojecting geometry from {geometry.crs} to {arr.rio.crs}") geometry = geometry.to_crs(arr.rio.crs) -- GitLab From deacb1ea20a85e1aa08d9d22d4efe8baae33132f Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 12:44:57 +0200 Subject: [PATCH 10/14] fix bug in write_assets: it was implicitly modifying input collection --- simplestac/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/simplestac/utils.py b/simplestac/utils.py index 865f239..0bb6a4b 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -604,6 +604,7 @@ def write_assets(x: Union[ItemCollection, pystac.Item], overwrite=False, progress=True, writer_args=None, + inplace=False, **kwargs): """ Writes item(s) assets to the specified output directory. @@ -637,7 +638,10 @@ def write_assets(x: Union[ItemCollection, pystac.Item], The item collection with the metadata updated with local asset paths. """ if isinstance(x, pystac.Item): - x = [x] + x = ItemCollection([x]) + + if not inplace: + x = x.clone() output_dir = Path(output_dir).expand() items = [] @@ -682,7 +686,8 @@ def write_assets(x: Union[ItemCollection, pystac.Item], logger.info(f'Item "{item.id}" is empty, skipping it.') item_dir.rmtree_p() - return ItemCollection(items, clone_items=False) + if not inplace: + return x def update_item_properties(x: pystac.Item, remove_item_props=DEFAULT_REMOVE_PROPS): """Update item bbox, geometry and proj:epsg introspecting assets. -- GitLab From dfab03dd2200166e5699007817416bd5e7e07c28 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 13:22:25 +0200 Subject: [PATCH 11/14] add geometry arg to write_assets --- simplestac/utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- tests/test_remote.py | 7 +++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/simplestac/utils.py b/simplestac/utils.py index 0bb6a4b..9291296 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -598,7 +598,10 @@ class ItemCollection(pystac.ItemCollection, ExtendPystacClasses): DEFAULT_REMOVE_PROPS = ['.*percentage', 'eo:cloud_cover', '.*mean_solar.*'] def write_assets(x: Union[ItemCollection, pystac.Item], - output_dir: str, bbox=None, update=True, + output_dir: str, + bbox=None, + geometry=None, + update=True, xy_coords='center', remove_item_props=DEFAULT_REMOVE_PROPS, overwrite=False, @@ -621,6 +624,15 @@ def write_assets(x: Union[ItemCollection, pystac.Item], bbox : Optional Argument forwarded to ItemCollection.to_xarray. The bounding box (in the CRS of the items) to clip the assets to. + geometry : Optional + Argument forwarded to ItemCollection.to_xarray to rioxarray.clip the assets to. + Usually a GeoDataFrame or GeoSeries. + See notes. + update : bool, optional + Whether to update the item properties with the new asset paths. + Defaults to True. + xy_coords : str, optional + The coordinate system to use for the x and y coordinates of the remove_item_props : list of str List of regex patterns to remove from item properties. If None, no properties are removed. @@ -629,6 +641,8 @@ def write_assets(x: Union[ItemCollection, pystac.Item], writer_args : dict, optional Arguments to pass to write_raster for each asset. Defaults to `None`. See Notes for an example. + inplace : bool, optional + Whether to modify the input collection in place or clone it. Defaults to False (i.e. clone). **kwargs Additional keyword arguments passed to write_raster. @@ -636,6 +650,32 @@ def write_assets(x: Union[ItemCollection, pystac.Item], ------- ItemCollection The item collection with the metadata updated with local asset paths. + + Notes + ----- + Arguments `bbox` and `geometry` are to ways to clip the assets before writing. + Although they look similar, they may lead to different results. + First, `bbox` does not have CRS, thus it is to the user to know + in which CRS x.to_xarray() will be before being clipped. If geometry is used instead, + it is automatically converted to the collection xarray CRS. + Second, as we use the default arguments for rio.clip and rio.clip_box, + the clip_box with bbox will contain all touched pixels while the clip with geometry will + contain only pixels whose center is within the polygon (all_touched=False). + Adding a buffer of resolution/2 could be a workaround to avoid that, + i.e. keep all touched pixels while clipping with a geometry. + + The `writer_args` argument can be used to specify the writing arguments (e.g. encoding) for specific assets. + Thus, it must be a dictionary with the keys corresponding to asset keys. + If the asset key is not in `writer_args`, the `kwargs` are passed to `write_raster`. + The following example would encode the B02 band as int16, and the rest of the assets as float: + writer_args = { + "B02": { + "encoding": { + "dtype": "int16", + } + } + } + """ if isinstance(x, pystac.Item): x = ItemCollection([x]) @@ -647,7 +687,7 @@ def write_assets(x: Union[ItemCollection, pystac.Item], items = [] for item in tqdm(x, disable=not progress): ic = ItemCollection([item], clone_items=True) - arr = ic.to_xarray(bbox=bbox, xy_coords=xy_coords, ).squeeze("time") + arr = ic.to_xarray(bbox=bbox, geometry=geometry,xy_coords=xy_coords, ).squeeze("time") item_dir = (output_dir / item.id).mkdir_p() for b in arr.band.values: filename = '_'.join([item.id, b+'.tif']) diff --git a/tests/test_remote.py b/tests/test_remote.py index 3333982..5a293ac 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -1,6 +1,8 @@ from simplestac.utils import write_assets, ItemCollection, harmonize_sen2cor_offset import planetary_computer as pc import pystac_client +from tempfile import TemporaryDirectory +import numpy as np URL = "https://planetarycomputer.microsoft.com/api/stac/v1" @@ -80,6 +82,11 @@ def test_write_assets(roi, s2scene_pc_dir): assert len(item.assets) == len(s2scene_pc_dir.dirs()[0].files("*.tif")) assert item.assets["B08"].href.startswith(s2scene_pc_dir) assert new_col[0].assets["B08"].extra_fields["raster:bands"][0]["scale"] == 0.001 + + with TemporaryDirectory(prefix="simplestac-tests_") as tempdir: + new_col2 = write_assets(col, tempdir, geometry=roi.buffer(5), encoding=encoding) + assert len(new_col2) == len(new_col) + assert new_col2[0].bbox == new_col[0].bbox -- GitLab From dab28c44d8f3423f0927e297c08cdefd25752d22 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 13:32:05 +0200 Subject: [PATCH 12/14] update CI --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f6182b3..779d2c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -38,6 +38,6 @@ pages: paths: - public rules: - - if: $CI_COMMIT_TAG =~ /v\.[0-9]+\.[0-9]+\.[0-9]+$/ + - if: $CI_COMMIT_TAG =~ /^v[0-9]+\.[0-9]+\.[0-9]+$/ - if: $CI_COMMIT_BRANCH == pages -- GitLab From ae1bb76d9dd8576f3e097816a1ff703cf3cd9a9f Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 13:35:11 +0200 Subject: [PATCH 13/14] fix CI --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 779d2c5..09ac858 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,5 +39,5 @@ pages: - public rules: - if: $CI_COMMIT_TAG =~ /^v[0-9]+\.[0-9]+\.[0-9]+$/ - - if: $CI_COMMIT_BRANCH == pages + - if: $CI_COMMIT_BRANCH == "pages" -- GitLab From e49120decba6256a00271f4316394def02e55370 Mon Sep 17 00:00:00 2001 From: Florian de Boissieu <fdeboiss@gmail.com> Date: Wed, 8 May 2024 15:57:14 +0200 Subject: [PATCH 14/14] add version badge to CI --- .gitlab-ci.yml | 7 +++++-- README.md | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09ac858..1170ee0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -30,6 +30,9 @@ pages: - micromamba activate simplestac - micromamba env list - pip install -e . + - version=`fordead --version| sed -E 's/.*([0-9]+\.[0-9]+\.[0-9]+).*/\1/'` + - echo $version + - echo "{\"version\":\"$version\"}" > badges.json - cp -r examples docs/ - jupytext --to markdown docs/examples/*.py - portray as_html -c pyproject_doc.toml @@ -37,7 +40,7 @@ pages: artifacts: paths: - public + - version rules: - - if: $CI_COMMIT_TAG =~ /^v[0-9]+\.[0-9]+\.[0-9]+$/ - - if: $CI_COMMIT_BRANCH == "pages" + - if: $CI_COMMIT_BRANCH == "pages" || $CI_COMMIT_BRANCH == "main" diff --git a/README.md b/README.md index 50e1d07..b802333 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ __[Documentation](https://umr-tetis.pages.mia.inra.fr/stac/simplestac)__ [](https://www.r-project.org/Licenses/GPL-3) [](https://www.python.org) [](https://forgemia.inra.fr/umr-tetis/stac/simplestac/pipelines/main/latest) +[](https://forgemia.inra.fr/umr-tetis/stac/simplestac) # Features -- GitLab