This is an automated email from the ASF dual-hosted git repository. skperez pushed a commit to branch SDAP-481a in repository https://gitbox.apache.org/repos/asf/incubator-sdap-nexus.git
commit 187c4722688ba78f1445aa2178418640c02b04c6 Author: skorper <[email protected]> AuthorDate: Wed Aug 16 14:38:48 2023 -0700 Add support for netcdf compression --- CHANGELOG.md | 1 + analysis/conda-requirements.txt | 2 +- .../webservice/algorithms/doms/BaseDomsHandler.py | 40 +++++++++++++++++----- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c857124..bc7581d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-467: Added pagination to cdmsresults endpoint - SDAP-461: Added 4 remaining Saildrone insitu datasets. - SDAP-473: Added support for matchup job prioritization +- SDAP-481: Added support for NetCDF compression ### Changed - SDAP-453: Updated results storage and retrieval to support output JSON from `/cdmsresults` that matches output from `/match_spark`. - **NOTE:** Deploying these changes to an existing SDAP deployment will require modifying the Cassandra database with stored results. There is a script to do so at `/tools/update-doms-data-schema/update.py` diff --git a/analysis/conda-requirements.txt b/analysis/conda-requirements.txt index e27bdea..c092350 100644 --- a/analysis/conda-requirements.txt +++ b/analysis/conda-requirements.txt @@ -14,7 +14,7 @@ # limitations under the License. 
-netcdf4==1.5.5.1 +netcdf4==1.6.4 basemap==1.2.2 scipy==1.6.0 pyspark==3.2.1 diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 4140684..1ed398f 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -297,6 +297,8 @@ class DomsCSVFormatter: class DomsNetCDFFormatter: + compression = 'zlib' + comp_level = 5 @staticmethod def create(executionId, results, params, details): @@ -362,17 +364,20 @@ class DomsNetCDFFormatter: #Create Satellite group, variables, and attributes satelliteGroup = dataset.createGroup(satellite_group_name) - satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, params["parameter"]) + satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level) # Create InSitu group, variables, and attributes insituGroup = dataset.createGroup(insitu_group_name) - insituWriter = DomsNetCDFValueWriter(insituGroup, params["parameter"]) + insituWriter = DomsNetCDFValueWriter(insituGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level) # Add data to Insitu and Satellite groups, generate array of match ID pairs matches = DomsNetCDFFormatter.__writeResults(results, satelliteWriter, insituWriter) dataset.createDimension("MatchedRecords", size=None) dataset.createDimension("MatchedGroups", size=2) - matchArray = dataset.createVariable("matchIDs", "f4", ("MatchedRecords", "MatchedGroups")) + matchArray = dataset.createVariable( + 'matchIDs', 'f4', ('MatchedRecords', 'MatchedGroups'), + compression=DomsNetCDFFormatter.compression, complevel=DomsNetCDFFormatter.comp_level + ) matchArray[:] = matches dataset.close() @@ -441,7 +446,7 @@ class DomsNetCDFFormatter: class DomsNetCDFValueWriter: - def __init__(self, group, matchup_parameter): + def __init__(self, group, compression=None, comp_level=None): group.createDimension("dim", size=None) 
self.group = group @@ -454,6 +459,9 @@ class DomsNetCDFValueWriter: self.secondary_group_name = "SecondaryData" self.data_map = defaultdict(list) + self.compression = compression + self.comp_level = comp_level + def addData(self, result_item): """ Populate DomsNetCDFValueWriter fields from matchup results dict @@ -491,9 +499,18 @@ class DomsNetCDFValueWriter: # # Create variables, enrich with attributes, and add data # - lonVar = self.group.createVariable('lon', 'f4', ('dim',), fill_value=-32767.0) - latVar = self.group.createVariable('lat', 'f4', ('dim',), fill_value=-32767.0) - timeVar = self.group.createVariable('time', 'f4', ('dim',), fill_value=-32767.0) + lonVar = self.group.createVariable( + 'lon', 'f4', ('dim',), fill_value=-32767.0, + compression=self.compression, complevel=self.comp_level + ) + latVar = self.group.createVariable( + 'lat', 'f4', ('dim',), fill_value=-32767.0, + compression=self.compression, complevel=self.comp_level + ) + timeVar = self.group.createVariable( + 'time', 'f4', ('dim',), fill_value=-32767.0, + compression=self.compression, complevel=self.comp_level + ) self.__enrichLon(lonVar, min(self.lon), max(self.lon)) self.__enrichLat(latVar, min(self.lat), max(self.lat)) @@ -505,7 +522,10 @@ class DomsNetCDFValueWriter: # Add depth variable, if present if self.depth and any(self.depth): - depthVar = self.group.createVariable('depth', 'f4', ('dim',), fill_value=-32767.0) + depthVar = self.group.createVariable( + 'depth', 'f4', ('dim',), fill_value=-32767.0, + compression=self.compression, complevel=self.comp_level + ) self.__enrichDepth(depthVar, self.__calcMin(self.depth), max(self.depth)) depthVar[:] = self.depth @@ -533,7 +553,9 @@ class DomsNetCDFValueWriter: cf_name = variable[1] data_variable = self.group.createVariable( - cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',), fill_value=-32767.0) + cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',), + fill_value=-32767.0, 
compression=self.compression, complevel=self.comp_level + ) # Find min/max for data variables. It is possible for 'None' to # be in this list, so filter those out when doing the calculation. min_data = np.nanmin(variables[variable])
