Create a fsspec ReferenceFileSystem for a large set of remote GeoTIFF files¶
Updated on May 22, 2024
This Python script uses the tifffile and imagecodecs packages to create a fsspec ReferenceFileSystem file in JSON format for the Earthbigdata set, which consists of 1,033,422 GeoTIFF files stored on AWS. The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.
Refer to the discussion at kerchunk/issues/78.
import base64
import os
import fsspec
import imagecodecs.numcodecs
import matplotlib.pyplot
import numcodecs
import tifffile
import xarray
import zarr
Get a list of all remote TIFF files¶
Call the AWS command line app to recursively list all files in the Earthbigdata set. Cache the output in a local file. Filter the list for TIFF files and remove the common path.
if not os.path.exists('earthbigdata.txt'):
os.system(
'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
' --recursive > earthbigdata.txt'
)
with open('earthbigdata.txt', encoding='utf-8') as fh:
tiff_files = [
line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
]
print('Number of TIFF files:', len(tiff_files))
Number of TIFF files: 1033422
Define metadata to describe the dataset¶
Define labels, coordinate arrays, file name regular expression patterns, and categories for all dimensions in the Earthbigdata set.
baseurl = (
'https://'
'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
'/data/tiles/'
)
chunkshape = (1200, 1200)
fillvalue = 0
latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
(j * -0.00083333333 - 0.000416666665 + i)
for i in range(82, -79, -1)
for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
latitude_category[f'N{j:-02}'] = i
i += 1
for j in range(1, 79):
latitude_category[f'S{j:-02}'] = i
i += 1
longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
(j * 0.00083333333 + 0.000416666665 + i)
for i in range(-180, 180)
for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
longitude_category[f'W{j:-03}'] = i
i += 1
for j in range(180):
longitude_category[f'E{j:-03}'] = i
i += 1
season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'
polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)
coherence_label = 'coherence'
coherence_category = {
'06': 0,
'12': 1,
'18': 2,
'24': 3,
'36': 4,
'48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)
orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'
flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)
Open a file for writing the fsspec ReferenceFileSystem in JSON format¶
jsonfile = open('earthbigdata.json', 'w', encoding='utf-8', newline='\n')
Write the coordinate arrays¶
Add the coordinate arrays to a Zarr group, convert it to a fsspec ReferenceFileSystem JSON string, and write it to the open file.
coordinates = {} # type: ignore
zarrgroup = zarr.open_group(coordinates)
zarrgroup.array(
longitude_label,
data=longitude_coordinates,
dtype='float32',
# compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [longitude_label]
zarrgroup.array(
latitude_label,
data=latitude_coordinates,
dtype='float32',
# compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [latitude_label]
zarrgroup.array(
season_label,
data=season_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [season_label]
zarrgroup.array(
polarization_label,
data=polarization_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [polarization_label]
zarrgroup.array(
coherence_label,
data=coherence_coordinates,
dtype='uint8',
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [coherence_label]
zarrgroup.array(orbit_label, data=orbit_coordinates, dtype='int32').attrs[
'_ARRAY_DIMENSIONS'
] = [orbit_label]
zarrgroup.array(
flightdirection_label,
data=flightdirection_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]
# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
try:
coordinates[k] = v.decode()
except UnicodeDecodeError:
coordinates[k] = 'base64:' + base64.b64encode(v).decode()
coordinates_json = tifffile.ZarrStore._json(coordinates).decode()
jsonfile.write(coordinates_json[:-2]) # skip the last newline and brace
69839
Create a TiffSequence from a list of file names¶
Filter the list of GeoTIFF files for files containing coherence 'COH' data. The regular expression pattern and categories are used to parse the file names for chunk indices.
Note: the created TiffSequence cannot be used to access any files. The file
names do not refer to existing files. The baseurl
is later used to get
the real location of the files.
mode = 'COH'
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
+ coherence_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
coherence_label: coherence_category,
},
)
assert len(fileseq) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.dims == (
'latitude',
'longitude',
'season',
'polarization',
'coherence',
)
print(fileseq)
TiffSequence N00E005_fall_vv_COH12.tif files: 444821 (5119339 missing) shape: 161, 360, 4, 4, 6 dims: latitude, longitude, season, polarization, coherence
Create a ZarrTiffStore from the TiffSequence¶
Define axestiled
to tile the latitude and longitude dimensions of the
TiffSequence with the first and second image/chunk dimensions.
Define extra zattrs
to create a Xarray compatible store.
store = fileseq.aszarr(
chunkdtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
coherence_label,
latitude_label,
longitude_label,
]
},
)
print(store)
ZarrFileSequenceStore shape: 4, 4, 6, 193200, 432000 chunks: 1, 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
Append the ZarrTiffStore to the open ReferenceFileSystem file¶
Use the mode name to create a Zarr subgroup.
Use the imagecodecs_tiff
Numcodecs compatible codec for decoding TIFF files.
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
Repeat for the other modes¶
Repeat the TiffSequence->aszarr->write_fsspec
workflow for the other modes.
for mode in (
'AMP',
'tau',
'rmse',
'rho',
):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
},
)
print(fileseq)
with fileseq.aszarr(
chunkdtype='uint16',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
for mode in ('inc', 'lsmap'):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ orbit_pattern
+ flightdirection_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
# orbit has no category
flightdirection_label: flightdirection_category,
},
)
print(fileseq)
with fileseq.aszarr(
chunkdtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
orbit_label,
flightdirection_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=mode == 'lsmap', # close after last store
)
TiffSequence N00E005_fall_vh_AMP.tif files: 187693 (739667 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0
TiffSequence N00E005_fall_vv_tau.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0
TiffSequence N00E005_fall_vv_rmse.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0
TiffSequence N00E005_fall_vv_rho.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0
TiffSequence N00E005_124D_inc.tif files: 52956 (20233044 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
TiffSequence N00E005_124D_lsmap.tif files: 52959 (20233041 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
Close the JSON file¶
jsonfile.close()
Use the fsspec ReferenceFileSystem file to create a Xarray dataset¶
Register imagecodecs.numcodecs
before using the ReferenceFileSystem.
imagecodecs.numcodecs.register_codecs()
Create a fsspec mapper instance from the ReferenceFileSystem file¶
Specify the target_protocol
to load a local file.
mapper = fsspec.get_mapper(
'reference://',
fo='earthbigdata.json',
target_protocol='file',
remote_protocol='https',
)
Create a Xarray dataset from the mapper¶
Use mask_and_scale
to disable conversion to floating point.
dataset = xarray.open_dataset(
mapper,
engine='zarr',
mask_and_scale=False,
backend_kwargs={'consolidated': False},
)
print(dataset)
<xarray.Dataset> Size: 77TB Dimensions: (season: 4, polarization: 4, latitude: 193200, longitude: 432000, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6B 6 12 18 24 36 48 * flightdirection (flightdirection) object 16B 'A' 'D' * latitude (latitude) float32 773kB 82.0 82.0 82.0 ... -79.0 -79.0 * longitude (longitude) float32 2MB -180.0 -180.0 ... 180.0 180.0 * orbit (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175 * polarization (polarization) object 32B 'vv' 'vh' 'hv' 'hh' * season (season) object 32B 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 3TB ... COH (season, polarization, coherence, latitude, longitude) uint8 8TB ... inc (orbit, flightdirection, latitude, longitude) uint8 29TB ... lsmap (orbit, flightdirection, latitude, longitude) uint8 29TB ... rho (season, polarization, latitude, longitude) uint16 3TB ... rmse (season, polarization, latitude, longitude) uint16 3TB ... tau (season, polarization, latitude, longitude) uint16 3TB ...
Select the Southern California region in the dataset¶
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)
<xarray.Dataset> Size: 28GB Dimensions: (season: 4, polarization: 4, latitude: 4200, longitude: 7200, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6B 6 12 18 24 36 48 * flightdirection (flightdirection) object 16B 'A' 'D' * latitude (latitude) float32 17kB 36.0 36.0 36.0 ... 32.5 32.5 32.5 * longitude (longitude) float32 29kB -121.0 -121.0 ... -115.0 -115.0 * orbit (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175 * polarization (polarization) object 32B 'vv' 'vh' 'hv' 'hh' * season (season) object 32B 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 968MB ... COH (season, polarization, coherence, latitude, longitude) uint8 3GB ... inc (orbit, flightdirection, latitude, longitude) uint8 11GB ... lsmap (orbit, flightdirection, latitude, longitude) uint8 11GB ... rho (season, polarization, latitude, longitude) uint16 968MB ... rmse (season, polarization, latitude, longitude) uint16 968MB ... tau (season, polarization, latitude, longitude) uint16 968MB ...
Plot a selection of the dataset¶
The few GeoTIFF files comprising the selection are transparently downloaded, decoded, and stitched to an in-memory NumPy array and plotted using Matplotlib.
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()
System information¶
Print information about the software used to run this script.
def system_info():
import datetime
import sys
import fsspec
import imagecodecs
import matplotlib
import numcodecs
import numpy
import tifffile
import xarray
import zarr
return '\n'.join(
(
sys.executable,
f'Python {sys.version}',
'',
f'numpy {numpy.__version__}',
f'matplotlib {matplotlib.__version__}',
f'tifffile {tifffile.__version__}',
f'imagecodecs {imagecodecs.__version__}',
f'numcodecs {numcodecs.__version__}',
f'fsspec {fsspec.__version__}',
f'xarray {xarray.__version__}',
f'zarr {zarr.__version__}',
'',
str(datetime.datetime.now()),
)
)
print(system_info())
X:\Python312\python.exe Python 3.12.3 (tags/v3.12.3:f6650f9, Apr 9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)] numpy 2.0.0rc2 matplotlib 3.9.0 tifffile 2024.5.22 imagecodecs 2024.6.1 numcodecs 0.12.1 fsspec 2024.5.0 xarray 2024.5.0 zarr 2.18.2 2024-06-02 17:33:30.515403