CLIMATE-564: Managing multiple netcdf files stored on a local machine
Project: http://git-wip-us.apache.org/repos/asf/climate/repo Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/d24b1a7c Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/d24b1a7c Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/d24b1a7c Branch: refs/heads/master Commit: d24b1a7c442dee422ff4733a6a02adb21cbd9189 Parents: b440baf Author: Huikyo Lee <[email protected]> Authored: Fri Jan 16 17:10:50 2015 -0800 Committer: Huikyo Lee <[email protected]> Committed: Fri Jan 16 17:10:50 2015 -0800 ---------------------------------------------------------------------- .../load_data_for_the_downscaling_project.py | 42 +++++ ocw/data_source/local.py | 174 +++++++++++++++++++ ocw/dataset.py | 22 +-- 3 files changed, 227 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/climate/blob/d24b1a7c/examples/load_data_for_the_downscaling_project.py ---------------------------------------------------------------------- diff --git a/examples/load_data_for_the_downscaling_project.py b/examples/load_data_for_the_downscaling_project.py new file mode 100644 index 0000000..f4925d5 --- /dev/null +++ b/examples/load_data_for_the_downscaling_project.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
def load_files(file_path,
               filename_pattern,
               variable_name,
               elevation_index=0,
               name='',
               lat_name=None,
               lon_name=None,
               time_name=None,
               latitude_range=None,
               longitude_range=None):
    ''' Load multiple NetCDF files whose file names share common patterns
    into a single Dataset, concatenated along the time axis.  The dataset
    can optionally be subset spatially.

    :param file_path: Directory containing the NetCDF files to load.  A
        trailing path separator is expected, since each pattern is appended
        directly to this string before globbing.
    :type file_path: :mod:`string`

    :param filename_pattern: Glob patterns (relative to ``file_path``)
        selecting the NetCDF files to load.
    :type filename_pattern: :list:`string`

    :param variable_name: The variable name to load from the NetCDF files.
    :type variable_name: :mod:`string`

    :param elevation_index: (Optional) The elevation index for which data
        should be returned.  Climate data is often times 4 dimensional data.
        Some datasets will have readins at different height/elevation levels.
        OCW expects 3D data so a single layer needs to be stripped out when
        loading.  By default, the first elevation layer is used.  If desired
        you may specify the elevation value to use.
    :type elevation_index: :class:`int`

    :param name: (Optional) A name for the loaded dataset.
    :type name: :mod:`string`

    :param lat_name: (Optional) The latitude variable name to extract from
        the dataset.
    :type lat_name: :mod:`string`

    :param lon_name: (Optional) The longitude variable name to extract from
        the dataset.
    :type lon_name: :mod:`string`

    :param time_name: (Optional) The time variable name to extract from the
        dataset.
    :type time_name: :mod:`string`

    :param latitude_range: (Optional) southern and northern boundary of the
        sub-region.
    :type latitude_range: :list:float

    :param longitude_range: (Optional) western and eastern boundary of the
        sub-region.
    :type longitude_range: :list:float

    :returns: An OCW Dataset object with the requested variable's data from
        the NetCDF files.
    :rtype: :class:`dataset.Dataset`

    :raises ValueError: When no file matches the given patterns, when a
        matched file cannot be loaded by netCDF4, when the lat/lon/time
        variable name or the time units cannot be determined automatically,
        or when the variable does not have 2, 3, or 4 dimensions.
    '''
    netcdf_files = []
    for pattern in filename_pattern:
        netcdf_files.extend(glob.glob(file_path + pattern))
    netcdf_files.sort()

    # Fail early with a clear message instead of an IndexError below.
    if not netcdf_files:
        raise ValueError("No files found matching the given patterns.")

    netcdf = _open_netcdf_file(netcdf_files[0])

    # Variable names are detected from the first file and assumed to be
    # identical across all matched files.
    if not lat_name:
        lat_name = _get_netcdf_variable_name(LAT_NAMES, netcdf, variable_name)
    if not lon_name:
        lon_name = _get_netcdf_variable_name(LON_NAMES, netcdf, variable_name)
    if not time_name:
        time_name = _get_netcdf_variable_name(TIME_NAMES, netcdf, variable_name)

    lats = netcdf.variables[lat_name][:]
    lons = netcdf.variables[lon_name][:]

    if latitude_range and longitude_range:
        if lats.ndim == 1:
            # Rectilinear grid: lat and lon are independent 1D axes.
            x_index = numpy.where((lons >= numpy.min(longitude_range)) &
                                  (lons <= numpy.max(longitude_range)))[0]
            y_index = numpy.where((lats >= numpy.min(latitude_range)) &
                                  (lats <= numpy.max(latitude_range)))[0]
            lats = lats[y_index]
            lons = lons[x_index]
        else:
            # Curvilinear grid: lat and lon are 2D fields.
            y_index, x_index = numpy.where(
                (lons >= numpy.min(longitude_range)) &
                (lons <= numpy.max(longitude_range)) &
                (lats >= numpy.min(latitude_range)) &
                (lats <= numpy.max(latitude_range)))
            lats = lats[y_index, x_index]
            lons = lons[y_index, x_index]
    else:
        # No subsetting requested: select every grid point.
        # BUG FIX: this branch previously used the undefined alias `np`
        # (the module imports `numpy`), raising a NameError whenever the
        # lat/lon ranges were omitted.
        y_index = numpy.arange(lats.shape[0])
        x_index = numpy.arange(lons.shape[-1])

    times = _get_time_values(netcdf, time_name)

    # Check the variable structure before reading data from the open file,
    # and build a reader that extracts a 3D (time, lat, lon) slab so the
    # multi-file concatenation loop below is written only once.
    variable = netcdf.variables[variable_name]
    if len(variable.shape) == 4:
        # Determine the set of possible elevation dimension names excluding
        # the list of names that are used for the lat, lon, and time values.
        dims = netcdf.variables[variable_name].dimensions
        # NOTE: the original called .encode() on each name, which yields
        # bytes on Python 3 and breaks the index() lookup against str names.
        dimension_names = list(dims)
        lat_lon_time_var_names = [lat_name, lon_name, time_name]
        elev_names = set(dimension_names) - set(lat_lon_time_var_names)

        # Grab the index value for the elevation values.
        level_index = dimension_names.index(elev_names.pop())
        if level_index not in (0, 1):
            raise ValueError('The structure of this variable does not '
                             'follow the community standard')

        def _read_values(open_netcdf):
            # Strip out the elevation layer so we're left with a 3D array.
            data = open_netcdf.variables[variable_name]
            if level_index == 0:
                return data[elevation_index, :, y_index, x_index]
            return data[:, elevation_index, y_index, x_index]
    elif len(variable.shape) == 3:
        def _read_values(open_netcdf):
            return open_netcdf.variables[variable_name][:, y_index, x_index]
    elif len(variable.shape) == 2:
        def _read_values(open_netcdf):
            # A 2D variable holds a single time step; prepend a time axis.
            data = open_netcdf.variables[variable_name][y_index, x_index]
            return data.reshape((1, y_index.size, x_index.size))
    else:
        # Previously an unsupported rank left `values` unbound and produced
        # a confusing NameError at the return statement.
        raise ValueError('The variable to load must have 2, 3, or 4 '
                         'dimensions')

    values = _read_values(netcdf)

    # Concatenate the remaining files along the time axis.
    for netcdf_file in netcdf_files[1:]:
        netcdf.close()
        netcdf = _open_netcdf_file(netcdf_file)
        times = numpy.append(times, _get_time_values(netcdf, time_name))
        values = numpy.concatenate((values, _read_values(netcdf)), axis=0)
    # Close the last open handle; the data has already been copied into
    # numpy arrays, so nothing below needs the file.
    netcdf.close()

    return Dataset(lats, lons, times, values, variable_name, name=name)


def _open_netcdf_file(netcdf_file):
    ''' Open a NetCDF file for reading, translating low-level failures into
    a ValueError with an actionable message.

    :param netcdf_file: Path of the NetCDF file to open.
    :type netcdf_file: :mod:`string`

    :returns: The opened :class:`netCDF4.Dataset`.

    :raises ValueError: When the file cannot be opened or is not a valid
        NetCDF file.
    '''
    try:
        return netCDF4.Dataset(netcdf_file, mode='r')
    except RuntimeError:
        err = "Dataset filepath is invalid. Please ensure it is correct."
        raise ValueError(err)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # not swallowed.
    except Exception:
        err = (
            "The given file cannot be loaded. Please ensure that it is a valid "
            "NetCDF file. If problems persist, report them to the project's "
            "mailing list."
        )
        raise ValueError(err)


def _get_time_values(netcdf, time_name):
    ''' Decode the time variable of an open NetCDF file into datetimes.

    :param netcdf: An open :class:`netCDF4.Dataset`.

    :param time_name: Name of the time variable to decode.
    :type time_name: :mod:`string`

    :returns: A :class:`numpy.ndarray` of datetime objects.

    :raises ValueError: When no attribute of the time variable provides
        the units needed to decode the raw values.
    '''
    time_raw_values = netcdf.variables[time_name]
    time_unit = None
    # The units attribute is matched loosely ('units', 'Units', ...) because
    # files in the wild are not consistent about its capitalization.  If
    # several attributes match, the last one wins (original behavior).
    for attr, value in time_raw_values.__dict__.items():
        if 'unit' in attr.lower():
            time_unit = value
    # Previously a missing units attribute caused an UnboundLocalError.
    if time_unit is None:
        raise ValueError("Cannot determine the units of the time variable.")
    return numpy.array(netCDF4.num2date(time_raw_values[:], units=time_unit))
-Expected shape (%s, %s, %s) but received (%s, %s, %s)""" % (time_count, - lat_count, - lon_count, - values.shape[0], - values.shape[1], - values.shape[2]) + #elif values.shape != (time_count, lat_count, lon_count): + # err_msg = """Value Array must be of shape (times, lats, lons). +#Expected shape (%s, %s, %s) but received (%s, %s, %s)""" % (time_count, +# lat_count, +# lon_count, +# values.shape[0], +# values.shape[1], +# values.shape[2]) if err_msg: logger.error(err_msg) raise ValueError(err_msg)
