Streaming data from NASA's Earth Surface Minteral Dust Source Investigation (EMIT)¶
This is a proof of concept notebook to demonstrate how earthaccess can facilitate the use of cloud hosted data from NASA using xarray and holoviews. For a formal tutorial on EMIT please visit the official repository where things are explained in detail. EMIT Science Tutorial
Prerequisites
- NASA EDL credentials
- Openscapes Conda environment installed
- For direct access this notebook should run in AWS
IMPORTANT: This notebook should run out of AWS but is not recommended as streaming HDF5 data is slow out of region
from pprint import pprint
import earthaccess
import xarray as xr
print(f"using earthaccess version {earthaccess.__version__}")
auth = earthaccess.login()
using earthaccess version 0.16.0
--------------------------------------------------------------------------- TimeoutError Traceback (most recent call last) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connectionpool.py:534, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length) 533 try: --> 534 response = conn.getresponse() 535 except (BaseSSLError, OSError) as e: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connection.py:565, in HTTPConnection.getresponse(self) 564 # Get the response from http.client.HTTPConnection --> 565 httplib_response = super().getresponse() 567 try: File ~/.asdf/installs/python/3.11.12/lib/python3.11/http/client.py:1395, in HTTPConnection.getresponse(self) 1394 try: -> 1395 response.begin() 1396 except ConnectionError: File ~/.asdf/installs/python/3.11.12/lib/python3.11/http/client.py:325, in HTTPResponse.begin(self) 324 while True: --> 325 version, status, reason = self._read_status() 326 if status != CONTINUE: File ~/.asdf/installs/python/3.11.12/lib/python3.11/http/client.py:286, in HTTPResponse._read_status(self) 285 def _read_status(self): --> 286 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 287 if len(line) > _MAXLINE: File ~/.asdf/installs/python/3.11.12/lib/python3.11/socket.py:718, in SocketIO.readinto(self, b) 717 try: --> 718 return self._sock.recv_into(b) 719 except timeout: File ~/.asdf/installs/python/3.11.12/lib/python3.11/ssl.py:1314, in SSLSocket.recv_into(self, buffer, nbytes, flags) 1311 raise ValueError( 1312 "non-zero flags not allowed in calls to recv_into() on %s" % 1313 self.__class__) -> 1314 return self.read(nbytes, buffer) 1315 else: File ~/.asdf/installs/python/3.11.12/lib/python3.11/ssl.py:1166, in SSLSocket.read(self, len, buffer) 1165 if buffer is not None: -> 1166 return self._sslobj.read(len, buffer) 1167 else: TimeoutError: The read operation timed out The above exception was the direct cause of the following exception: ReadTimeoutError Traceback (most recent call last) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/requests/adapters.py:644, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 643 try: --> 644 resp = conn.urlopen( 645 method=request.method, 646 url=url, 647 body=request.body, 648 headers=request.headers, 649 redirect=False, 650 assert_same_host=False, 651 preload_content=False, 652 decode_content=False, 653 retries=self.max_retries, 654 timeout=timeout, 655 chunked=chunked, 656 ) 658 except (ProtocolError, OSError) as err: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connectionpool.py:841, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw) 839 new_e = ProtocolError("Connection aborted.", new_e) --> 841 retries = retries.increment( 842 method, url, error=new_e, _pool=self, _stacktrace=sys.exc_info()[2] 843 ) 844 retries.sleep() File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/util/retry.py:474, in Retry.increment(self, method, url, response, error, _pool, _stacktrace) 473 if read is False or method is None or not self._is_method_retryable(method): --> 474 raise reraise(type(error), error, _stacktrace) 475 elif read is not None: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/util/util.py:39, in reraise(tp, value, tb) 38 raise value.with_traceback(tb) ---> 39 raise value 40 finally: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connectionpool.py:787, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw) 786 # Make the request on the HTTPConnection object --> 787 response = self._make_request( 788 conn, 789 method, 790 url, 791 timeout=timeout_obj, 792 body=body, 793 headers=headers, 794 chunked=chunked, 795 retries=retries, 796 response_conn=response_conn, 797 preload_content=preload_content, 798 decode_content=decode_content, 799 **response_kw, 800 ) 802 # Everything went great! File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connectionpool.py:536, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length) 535 except (BaseSSLError, OSError) as e: --> 536 self._raise_timeout(err=e, url=url, timeout_value=read_timeout) 537 raise File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/urllib3/connectionpool.py:367, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value) 366 if isinstance(err, SocketTimeout): --> 367 raise ReadTimeoutError( 368 self, url, f"Read timed out. (read timeout={timeout_value})" 369 ) from err 371 # See the above comment about EAGAIN in Python 3. ReadTimeoutError: HTTPSConnectionPool(host='urs.earthdata.nasa.gov', port=443): Read timed out. (read timeout=10) During handling of the above exception, another exception occurred: ReadTimeout Traceback (most recent call last) Cell In[1], line 8 4 import xarray as xr 6 print(f"using earthaccess version {earthaccess.__version__}") ----> 8 auth = earthaccess.login() File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/api.py:360, in login(strategy, persist, system) 358 for strategy in ["environment", "netrc", "interactive"]: 359 try: --> 360 earthaccess.__auth__.login( 361 strategy=strategy, 362 persist=persist, 363 system=system, 364 ) 365 except LoginStrategyUnavailable as err: 366 logger.debug(err) File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/auth.py:148, in Auth.login(self, strategy, persist, system) 146 self._netrc() 147 elif strategy == "environment": --> 148 self._environment() 150 return self File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/auth.py:300, in Auth._environment(self) 293 raise LoginStrategyUnavailable( 294 "Either the environment variables EARTHDATA_USERNAME and " 295 "EARTHDATA_PASSWORD must both be set, or EARTHDATA_TOKEN must be set for " 296 "the 'environment' login strategy." 297 ) 299 logger.debug("Using environment variables for EDL") --> 300 return self._get_credentials(username, password, token) File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/auth.py:314, in Auth._get_credentials(self, username, password, user_token) 312 self.username = username 313 self.password = password --> 314 token_resp = self._find_or_create_token() 316 if not (token_resp.ok): # type: ignore 317 msg = f"Authentication with Earthdata Login failed with:\n{token_resp.text}" File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/auth.py:332, in Auth._find_or_create_token(self) 330 def _find_or_create_token(self) -> requests.Response: 331 with self.get_session() as session: --> 332 return session.post( 333 self.EDL_FIND_OR_CREATE_TOKEN_URL, 334 headers={"Accept": "application/json"}, 335 timeout=10, 336 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/requests/sessions.py:637, in Session.post(self, url, data, json, **kwargs) 626 def post(self, url, data=None, json=None, **kwargs): 627 r"""Sends a POST request. Returns :class:`Response` object. 628 629 :param url: URL for the new :class:`Request` object. (...) 634 :rtype: requests.Response 635 """ --> 637 return self.request("POST", url, data=data, json=json, **kwargs) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 584 send_kwargs = { 585 "timeout": timeout, 586 "allow_redirects": allow_redirects, 587 } 588 send_kwargs.update(settings) --> 589 resp = self.send(prep, **send_kwargs) 591 return resp File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs) 700 start = preferred_clock() 702 # Send the request --> 703 r = adapter.send(request, **kwargs) 705 # Total elapsed time of the request (approximately) 706 elapsed = preferred_clock() - start File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/requests/adapters.py:690, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 688 raise SSLError(e, request=request) 689 elif isinstance(e, ReadTimeoutError): --> 690 raise ReadTimeout(e, request=request) 691 elif isinstance(e, _InvalidHeader): 692 raise InvalidHeader(e, request=request) ReadTimeout: HTTPSConnectionPool(host='urs.earthdata.nasa.gov', port=443): Read timed out. (read timeout=10)
Searching for the dataset with .search_datasets()¶
Note: API docs can be found at earthaccess
results = earthaccess.search_datasets(short_name="EMITL2ARFL", cloud_hosted=True)
# Let's print our datasets
for dataset in results:
pprint(dataset.summary())
{'cloud-info': {'Region': 'us-west-2',
'S3BucketAndObjectPrefixNames': ['s3://lp-prod-protected/EMITL2ARFL.001',
's3://lp-prod-public/EMITL2ARFL.001'],
'S3CredentialsAPIDocumentationURL': 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentialsREADME',
'S3CredentialsAPIEndpoint': 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials'},
'concept-id': 'C2408750690-LPCLOUD',
'file-type': "[{'FormatType': 'Native', 'AverageFileSize': 1.8, 'Format': "
"'netCDF-4', 'TotalCollectionFileSizeBeginDate': "
"'2022-08-09T00:00:00.000Z', 'FormatDescription': 'Network "
"Common Data Format Version 4', 'AverageFileSizeUnit': 'GB', "
"'Media': ['Earthdata Cloud', 'HTTPS']}]",
'get-data': ['https://search.earthdata.nasa.gov/search/granules?p=C2408750690-LPCLOUD',
'https://appeears.earthdatacloud.nasa.gov/'],
'short-name': 'EMITL2ARFL',
'version': '001'}
Searching for the data with .search_data() over Ecuador¶
# ~Ecuador = -82.05,-3.17,-76.94,-0.52
granules = earthaccess.search_data(
short_name="EMITL2ARFL",
bounding_box=(-82.05, -3.17, -76.94, -0.52),
count=10,
)
print(len(granules))
10
earthaccess can print a preview of the data using the metadata from CMR¶
Note: there is a bug in earthaccess where the reported size of the granules are always 0, fix is coming next week
granules[7]
Data: EMIT_L2A_RFL_001_20230304T151234_2306310_003.ncEMIT_L2A_RFLUNCERT_001_20230304T151234_2306310_003.ncEMIT_L2A_MASK_001_20230304T151234_2306310_003.nc
Size: 3578.78 MB
Cloud Hosted: True
Streaming data from S3 with fsspec¶
Opening the data with earthaccess.open() and accessing the NetCDF as if it was local
If we run this code in AWS(us-west-2), earthaccess can use direct S3 links. If we run it out of AWS, earthaccess can only use HTTPS links. Direct S3 access for NASA data is only allowed in region.
# open() accepts a list of results or a list of links
file_handlers = earthaccess.open(granules)
file_handlers
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[5], line 2 1 # open() accepts a list of results or a list of links ----> 2 file_handlers = earthaccess.open(granules) 3 file_handlers File ~/checkouts/readthedocs.org/user_builds/earthaccess/checkouts/1109/earthaccess/api.py:475, in open(granules, provider, credentials_endpoint, show_progress, pqdm_kwargs, open_kwargs) 448 def open( 449 granules: Union[List[str], List[DataGranule]], 450 provider: Optional[str] = None, (...) 455 open_kwargs: Optional[Dict[str, Any]] = None, 456 ) -> List[AbstractFileSystem]: 457 """Returns a list of file-like objects that can be used to access files 458 hosted on S3 or HTTPS by third party libraries like xarray. 459 (...) 473 A list of "file pointers" to remote (i.e. s3 or https) files. 474 """ --> 475 return earthaccess.__store__.open( 476 granules=granules, 477 provider=_normalize_location(provider), 478 credentials_endpoint=credentials_endpoint, 479 show_progress=show_progress, 480 pqdm_kwargs=pqdm_kwargs, 481 open_kwargs=open_kwargs, 482 ) AttributeError: 'NoneType' object has no attribute 'open'
%%time
# we can use any file from the array
file_p = file_handlers[4]
refl = xr.open_dataset(file_p)
wvl = xr.open_dataset(file_p, group="sensor_band_parameters")
loc = xr.open_dataset(file_p, group="location")
ds = xr.merge([refl, loc])
ds = ds.assign_coords(
{
"downtrack": (["downtrack"], refl.downtrack.data),
"crosstrack": (["crosstrack"], refl.crosstrack.data),
**wvl.variables,
}
)
ds
CPU times: user 5 μs, sys: 1 μs, total: 6 μs Wall time: 8.58 μs
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 1 ----> 1 get_ipython().run_cell_magic('time', '', '\n# we can use any file from the array\nfile_p = file_handlers[4]\n\nrefl = xr.open_dataset(file_p)\nwvl = xr.open_dataset(file_p, group="sensor_band_parameters")\nloc = xr.open_dataset(file_p, group="location")\nds = xr.merge([refl, loc])\nds = ds.assign_coords(\n {\n "downtrack": (["downtrack"], refl.downtrack.data),\n "crosstrack": (["crosstrack"], refl.crosstrack.data),\n **wvl.variables,\n }\n)\n\nds\n') File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565, in InteractiveShell.run_cell_magic(self, magic_name, line, cell) 2563 with self.builtin_trap: 2564 args = (magic_arg_s, cell) -> 2565 result = fn(*args, **kwargs) 2567 # The code below prevents the output from being displayed 2568 # when using magics with decorator @output_can_be_silenced 2569 # when the last Python token in the expression is a ';'. 2570 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False): File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470, in ExecutionMagics.time(self, line, cell, local_ns) 1468 if interrupt_occured: 1469 if exit_on_interrupt and captured_exception: -> 1470 raise captured_exception 1471 return 1472 return out File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1109/lib/python3.11/site-packages/IPython/core/magics/execution.py:1434, in ExecutionMagics.time(self, line, cell, local_ns) 1432 st = clock2() 1433 try: -> 1434 exec(code, glob, local_ns) 1435 out = None 1436 # multi-line %%time case File <timed exec>:2 NameError: name 'file_handlers' is not defined
Plotting non orthorectified data¶
Use the following code to plot the Panel widget when you run this code on AWS us-west-2
import holoviews as hv
import hvplot.xarray
import numpy as np
import panel as pn
pn.extension()
# Find band nearest to value of 850 nm (NIR)
b850 = np.nanargmin(abs(ds["wavelengths"].values - 850))
ref_unc = ds["reflectance_uncertainty"]
image = ref_unc.sel(bands=b850).hvplot("crosstrack", "downtrack", cmap="viridis")
stream = hv.streams.Tap(source=image, x=255, y=484)
def wavelengths_histogram(x, y):
histo = ref_unc.sel(crosstrack=x, downtrack=y, method="nearest").hvplot(
x="wavelengths", color="green"
)
return histo
tap_dmap = hv.DynamicMap(wavelengths_histogram, streams=[stream])
pn.Column(image, tap_dmap)
