Repository: climate
Updated Branches:
  refs/heads/master 7ab014106 -> 1e0763865
Updates to DatasetLoader

Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/b2f0ad4b
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/b2f0ad4b
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/b2f0ad4b

Branch: refs/heads/master
Commit: b2f0ad4b1516b8824acb4321a463b8d7e0c7c90c
Parents: 7ab0141
Author: Alex Goodman <ago...@users.noreply.github.com>
Authored: Mon Aug 1 14:11:27 2016 -0700
Committer: Alex Goodman <ago...@users.noreply.github.com>
Committed: Mon Aug 1 14:11:27 2016 -0700

----------------------------------------------------------------------
 RCMES/run_RCMES.py               |  25 +++---
 ocw/data_source/local.py         |   2 +-
 ocw/dataset_loader.py            | 156 ++++++++++++++--------------
 ocw/tests/test_dataset_loader.py |  72 ++++------------
 4 files changed, 94 insertions(+), 161 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/RCMES/run_RCMES.py
----------------------------------------------------------------------
diff --git a/RCMES/run_RCMES.py b/RCMES/run_RCMES.py
index cd69bc4..3cc2245 100644
--- a/RCMES/run_RCMES.py
+++ b/RCMES/run_RCMES.py
@@ -39,7 +39,7 @@ from getpass import getpass
 
 from metrics_and_plots import *
 
-import ssl 
+import ssl
 if hasattr(ssl, '_create_unverified_context'):
     ssl._create_default_https_context = ssl._create_unverified_context
 
@@ -107,12 +107,13 @@ if 'longitude_name' in model_data_info.keys():
     boundary_check_model = True
 if 'GCM_data' in model_data_info.keys():
     if model_data_info['GCM_data']:
-        boundary_check_model = False 
+        boundary_check_model = False
 print 'Loading model datasets:\n',model_data_info
 if model_data_info['data_source'] == 'local':
-    model_datasets, model_names = local.load_multiple_files(file_path = model_data_info['path'],
-                                                            variable_name =model_data_info['variable'],
-                                                            lat_name=model_lat_name, lon_name=model_lon_name)
+    model_datasets = local.load_multiple_files(file_path=model_data_info['path'],
+                                               variable_name =model_data_info['variable'],
+                                               lat_name=model_lat_name, lon_name=model_lon_name)
+    model_names = [dataset.name for dataset in model_datasets]
 elif model_data_info['data_source'] == 'ESGF':
     md = esgf.load_dataset(dataset_id=model_data_info['dataset_id'],
                            variable=model_data_info['variable'],
@@ -166,7 +167,7 @@ for idata,dataset in enumerate(model_datasets):
 # generate grid points for regridding
 if config['regrid']['regrid_on_reference']:
     new_lat = ref_dataset.lats
-    new_lon = ref_dataset.lons 
+    new_lon = ref_dataset.lons
 else:
     delta_lat = config['regrid']['regrid_dlat']
     delta_lon = config['regrid']['regrid_dlon']
@@ -178,7 +179,7 @@ else:
 # number of models
 nmodel = len(model_datasets)
 print 'Dataset loading completed'
-print 'Observation data:', ref_name 
+print 'Observation data:', ref_name
 print 'Number of model datasets:',nmodel
 for model_name in model_names:
     print model_name
@@ -200,7 +201,7 @@ print 'Checking and converting variable units'
 ref_dataset = dsp.variable_unit_conversion(ref_dataset)
 for idata,dataset in enumerate(model_datasets):
     model_datasets[idata] = dsp.variable_unit_conversion(dataset)
-    
+
 print 'Generating multi-model ensemble'
 if len(model_datasets) >= 2.:
 
@@ -217,8 +218,8 @@ if config['use_subregions']:
     print 'Calculating spatial averages and standard deviations of ',str(nsubregion),' subregions'
 
-    ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions) 
-    model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions) 
+    ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
+    model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
 
 """ Step 7: Write a netCDF file """
 workdir = config['workdir']
@@ -231,7 +232,7 @@ if not os.path.exists(workdir):
 if config['use_subregions']:
     dsp.write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name, model_datasets, model_names,
                                                        path=workdir+config['output_netcdf_filename'],
-                                                       subregions=subregions, subregion_array = subregion_array, 
+                                                       subregions=subregions, subregion_array = subregion_array,
                                                        ref_subregion_mean=ref_subregion_mean, ref_subregion_std=ref_subregion_std,
                                                        model_subregion_mean=model_subregion_mean, model_subregion_std=model_subregion_std)
 else:
@@ -279,5 +280,3 @@ if nmetrics > 0:
                             file_name)
 else:
     print 'please check the currently supported metrics'
-
-


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/data_source/local.py
----------------------------------------------------------------------
diff --git a/ocw/data_source/local.py b/ocw/data_source/local.py
index 35041ac..98de937 100644
--- a/ocw/data_source/local.py
+++ b/ocw/data_source/local.py
@@ -350,7 +350,7 @@ def load_multiple_files(file_path,
         datasets.append(load_file(filename, variable_name, variable_unit,
                                   name=data_name[ifile], lat_name=lat_name,
                                   lon_name=lon_name, time_name=time_name))
-    return datasets, data_name
+    return datasets
 
 def load_WRF_2d_files_RAIN(file_path=None,
                            filename_pattern=None,


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/dataset_loader.py
----------------------------------------------------------------------
diff --git a/ocw/dataset_loader.py b/ocw/dataset_loader.py
index 8ee1b93..3c8b95f 100644
--- a/ocw/dataset_loader.py
+++ b/ocw/dataset_loader.py
@@ -27,20 +27,19 @@ import ocw.data_source.dap as dap
 
 
 class DatasetLoader:
-    '''Generate OCW Dataset objects from a variety of sources.'''
+    '''Generate a list of OCW Dataset objects from a variety of sources.'''
 
-    def __init__(self, reference, targets):
-        '''Generate OCW Dataset objects from a variety of sources.
+    def __init__(self, *loader_opts):
+        '''Generate a list of OCW Dataset objects from a variety of sources.
 
         Each keyword argument can be information for a dataset in dictionary
         form. For example:
         ``
-        >>> reference = {'data_source':'rcmed', 'name':'cru', 'dataset_id':10,
-                         'parameter_id':34}
-        >>> targets = {'data_source':'local_multiple',
-                       'path':'./data/CORDEX-Africa_data/AFRICA*pr.nc',
-                       'variable':'pr'}
-        >>> loader = DatasetLoader(reference, targets)
+        >>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
+                           'dataset_id': 10, 'parameter_id': 34}
+        >>> loader_opt2 = {'path': './data/TRMM_v7_3B43_1980-2010.nc',
+                           'variable': 'pcp'}
+        >>> loader = DatasetLoader(loader_opt1, loader_opt2)
         ``
 
         Or more conveniently if the loader configuration is defined in a
@@ -48,67 +47,57 @@ class DatasetLoader:
         ``
         >>> import yaml
         >>> config = yaml.load(open(config_file))
-        >>> loader = DatasetLoader(**config['datasets'])
+        >>> obs_loader_config = config['datasets']['reference']
+        >>> loader = DatasetLoader(*obs_loader_config)
         ``
 
-        As shown in the first example, the dictionary for each keyword argument
-        should contain a data source and parameters specific to the loader for
-        that data source. Once the configuration is entered, the datasets may be
-        loaded using:
+        As shown in the first example, the dictionary for each argument should
+        contain a loader name and parameters specific to the particular loader.
+        Once the configuration is entered, the datasets may be loaded using:
         ``
         >>> loader.load_datasets()
-        >>> target_datasets = loader.target_datasets
+        >>> obs_datasets = loader.datasets
         ``
 
-        If ``reference`` is entered as a keyword argument, then it may be
-        accesed from:
-        ``
-        >>> reference_dataset = loader.reference_dataset
-        ``
-
-        Additionally, each dataset must have a ``data_source`` keyword. This may
+        Additionally, each dataset must have a ``loader_name`` keyword. This may
         be one of the following:
-        * ``'local'`` - A single dataset file in a local directory
+        * ``'local'`` - One or multiple dataset files in a local directory
         * ``'local_split'`` - A single dataset split accross multiple files in a
                               local directory
-        * ``'local_multiple'`` - Multiple datasets in a local directory
         * ``'esgf'`` - Download the dataset from the Earth System Grid
                        Federation
        * ``'rcmed'`` - Download the dataset from the Regional Climate Model
                        Evaluation System Database
        * ``'dap'`` - Download the dataset from an OPeNDAP URL
 
-        Users who wish to download datasets from sources not described above
+        Users who wish to download datasets from loaders not described above
         may define their own custom dataset loader function and incorporate it
         as follows:
-        >>> loader.add_source_loader('my_source_name', my_loader_func)
-
-        :param reference: The reference dataset loader configuration.
-        :type reference: :mod:`dict`
+        >>> loader.add_source_loader('my_loader_name', my_loader_func)
 
-        :param targets: The target dataset loader configurations.
-        :type targets: :mod:`dict` or list of mod:`dict`
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :class:`dict`
 
         :raises KeyError: If an invalid argument is passed to a data source
                           loader function.
         '''
         # Reference dataset config
-        self.set_reference(**reference)
-
-        # Target dataset(s) config
-        self.set_targets(targets)
+        self.set_loader_opts(*loader_opts)
 
         # Default loaders
         self._source_loaders = {
-            'local': local.load_file,
+            'local': local.load_multiple_files,
             'local_split': local.load_dataset_from_multiple_netcdf_files,
-            'local_multiple': local.load_multiple_files,
             'esgf': esgf.load_dataset,
             'rcmed': rcmed.parameter_dataset,
             'dap': dap.load
         }
 
-    def add_source_loader(self, source_name, loader_func):
+    def add_source_loader(self, loader_name, loader_func):
         '''
         Add a custom source loader.
 
@@ -119,89 +108,72 @@ class DatasetLoader:
                             return an OCW Dataset object.
         :type loader_func: :class:`callable`
         '''
-        self._source_loaders[source_name] = loader_func
+        self._source_loaders[loader_name] = loader_func
 
-    def add_target(self, **kwargs):
+    def add_loader_opts(self, *loader_opts):
         '''
-        A convenient means of adding a target dataset to the loader.
-        :raises KeyError: If data_source is not specified.
+        A convenient means of adding loader options for each dataset to the
+        loader. If 'loader_name' is not entered as a keyword argument, then
+        'local' is used by default.
+
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :mod:`dict`
         '''
-        if 'data_source' not in kwargs:
-            raise KeyError('Dataset configuration must contain a data_source.')
-        self._target_config.append(kwargs)
+        for opt in loader_opts:
+            if 'loader_name' not in opt:
+                opt['loader_name'] = 'local'
+        self._config.extend(loader_opts)
 
-    def add_targets(self, targets):
+    def set_loader_opts(self, *loader_opts):
         '''
-        A convenient means of adding multiple target datasets to the loader.
-
-        :param targets: List of loader configurations for each target
-        :type targets: List of :mod:`dict`
-
-        :raises KeyError: If data_source is not specified.
+        Reset the dataset loader config.
+
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :mod:`dict`
         '''
-        for target_config in targets:
-            self.add_target(**target_config)
-
-    def set_targets(self, targets):
-        '''
-        Reset the target dataset config.
-
-        :param targets: List of loader configurations for each target
-        :type targets: List of :mod:`dict`
-
-        :raises KeyError: If data_source is not specified.
-        '''
-        # This check allows for the user to enter targets as one block or
-        # as a list of separate blocks in their config files
-        if not isinstance(targets, list):
-            targets = [targets]
-        self._target_config = []
-        self.add_targets(targets)
-
-    def set_reference(self, **kwargs):
-        '''
-        Reset the reference dataset config.
-        :raises KeyError: If data_source is not specified.
-        '''
-        if 'data_source' not in kwargs:
-            raise KeyError('Dataset configuration must contain a data_source.')
-        self._reference_config = kwargs
+        self._config = []
+        self.add_loader_opts(*loader_opts)
 
     def load_datasets(self):
         '''
         Loads the datasets from the given loader configurations.
         '''
-        # Load the reference dataset
-        self.reference_dataset = self._load(**self._reference_config)
-
         # Ensure output is clear if loading is performed more than once to
         # prevent duplicates.
-        self.target_datasets = []
+        self.datasets = []
 
         # Load the target datasets
-        for loader_params in self._target_config:
-            output = self._load(**loader_params)
+        for loader_opt in self._config:
+            output = self._load(**loader_opt)
 
             # Need to account for the fact that some loaders return lists
             # of OCW Dataset objects instead of just one
             if isinstance(output, list):
-                self.target_datasets.extend(output)
+                self.datasets.extend(output)
             else:
-                self.target_datasets.append(output)
+                self.datasets.append(output)
 
     def _load(self, **kwargs):
        '''
        Generic dataset loading method.
        '''
-        # Extract the data source
-        data_source = kwargs.pop('data_source')
+        # Extract the loader name
+        loader_name = kwargs.pop('loader_name')
 
         # Find the correct loader function for the given data source
-        loader_func = self._source_loaders[data_source]
+        loader_func = self._source_loaders[loader_name]
 
         # The remaining kwargs should be specific to the loader
         output = loader_func(**kwargs)
 
-        # Preserve data_source info for later use
-        kwargs['data_source'] = data_source
+        # Preserve loader_name info for later use
+        kwargs['loader_name'] = loader_name
 
         return output


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/tests/test_dataset_loader.py
----------------------------------------------------------------------
diff --git a/ocw/tests/test_dataset_loader.py b/ocw/tests/test_dataset_loader.py
index 2d192c1..b3c613b 100644
--- a/ocw/tests/test_dataset_loader.py
+++ b/ocw/tests/test_dataset_loader.py
@@ -17,7 +17,6 @@
 import unittest
 import os
-import copy
 import netCDF4
 import numpy as np
 from ocw.dataset import Dataset
@@ -37,13 +36,8 @@ class TestDatasetLoader(unittest.TestCase):
         self.values2 = self.values + 1
 
         # Set up config
-        self.reference_config = {'data_source': 'local',
-                                 'file_path': self.file_path,
-                                 'variable_name': 'value'}
-        self.target_config = copy.deepcopy(self.reference_config)
-        self.no_data_source_config = {'file_path': self.file_path,
-                                      'variable_name': 'value'}
-        self.new_data_source_config = {'data_source': 'foo',
+        self.config = {'file_path': self.file_path, 'variable_name': 'value'}
+        self.new_data_source_config = {'loader_name': 'foo',
                                        'lats': self.latitudes,
                                        'lons': self.longitudes,
                                        'times': self.times,
@@ -53,77 +47,45 @@ class TestDatasetLoader(unittest.TestCase):
     def tearDown(self):
         os.remove(self.file_path)
 
-    def testInputHasDataSource(self):
-        '''
-        Make sure input data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.no_data_source_config)
-
-    def testReferenceHasDataSource(self):
-        '''
-        Make sure ref data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.target_config)
-            self.loader.set_reference(**self.no_data_source_config)
-
-    def testTargetHasDataSource(self):
-        '''
-        Make sure target data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.target_config)
-            self.loader.add_target(**self.no_data_source_config)
-
     def testNewDataSource(self):
         '''
         Ensures that custom data source loaders can be added
         '''
-        self.loader = DatasetLoader(self.new_data_source_config,
-                                    self.target_config)
+        self.loader = DatasetLoader(self.new_data_source_config)
 
-        # Here the the data_source "foo" represents the Dataset constructor
+        # Here the data_source "foo" represents the Dataset constructor
         self.loader.add_source_loader('foo', build_dataset)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.reference_dataset.origin['source'],
-                         'foo')
-        np.testing.assert_array_equal(self.loader.reference_dataset.values,
+        self.assertEqual(self.loader.datasets[0].origin['source'], 'foo')
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values2)
 
     def testExistingDataSource(self):
         '''
         Ensures that existing data source loaders can be added
         '''
-        self.loader = DatasetLoader(self.reference_config,
-                                    self.target_config)
+        self.loader = DatasetLoader(self.config)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.reference_dataset.origin['source'],
-                         'local')
-        np.testing.assert_array_equal(self.loader.reference_dataset.values,
+        self.assertEqual(self.loader.datasets[0].origin['source'], 'local')
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values)
 
-    def testMultipleTargets(self):
+    def testMultipleDataSources(self):
         '''
-        Test for when multiple target dataset configs are specified
+        Test for when multiple dataset configs are specified
         '''
-        self.loader = DatasetLoader(self.reference_config,
-                                    [self.target_config,
-                                     self.new_data_source_config])
+        self.loader = DatasetLoader(self.config, self.new_data_source_config)
 
-        # Here the the data_source "foo" represents the Dataset constructor
+        # Here the data_source "foo" represents the Dataset constructor
         self.loader.add_source_loader('foo', build_dataset)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.target_datasets[0].origin['source'],
+        self.assertEqual(self.loader.datasets[0].origin['source'],
                          'local')
-        self.assertEqual(self.loader.target_datasets[1].origin['source'],
+        self.assertEqual(self.loader.datasets[1].origin['source'],
                          'foo')
-        np.testing.assert_array_equal(self.loader.target_datasets[0].values,
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values)
-        np.testing.assert_array_equal(self.loader.target_datasets[1].values,
+        np.testing.assert_array_equal(self.loader.datasets[1].values,
                                       self.values2)
 
 
 def build_dataset(*args, **kwargs):
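
----------------------------------------------------------------------

For readers skimming the diff, the refactored workflow reduces to the
pattern below. This is a minimal sketch assembled from the docstrings and
tests in this commit; the file path, the RCMED dataset_id/parameter_id
values, and the my_custom_loader helper are illustrative assumptions, not
part of the commit.

    from ocw.dataset import Dataset
    from ocw.dataset_loader import DatasetLoader

    # One dict per dataset. 'loader_name' selects the loader function and,
    # when omitted, defaults to 'local' (now local.load_multiple_files).
    obs_opts = {'loader_name': 'rcmed', 'name': 'cru',
                'dataset_id': 10, 'parameter_id': 34}
    model_opts = {'file_path': './data/model_pr.nc',  # hypothetical path
                  'variable_name': 'pr'}
    loader = DatasetLoader(obs_opts, model_opts)

    # Custom loaders plug in by name; the callable must return an OCW
    # Dataset (or a list of Datasets, which load_datasets() flattens).
    def my_custom_loader(**kwargs):  # hypothetical custom loader
        return Dataset(kwargs['lats'], kwargs['lons'], kwargs['times'],
                       kwargs['values'], name='custom')

    loader.add_source_loader('my_loader_name', my_custom_loader)

    # Every loaded dataset now lands in the single flat list
    # loader.datasets; the old reference_dataset / target_datasets
    # attributes are gone.
    loader.load_datasets()
    for dataset in loader.datasets:
        print dataset.name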