Repository: climate
Updated Branches:
  refs/heads/master 7ab014106 -> 1e0763865
Updates to DatasetLoader

Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/b2f0ad4b
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/b2f0ad4b
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/b2f0ad4b

Branch: refs/heads/master
Commit: b2f0ad4b1516b8824acb4321a463b8d7e0c7c90c
Parents: 7ab0141
Author: Alex Goodman <ago...@users.noreply.github.com>
Authored: Mon Aug 1 14:11:27 2016 -0700
Committer: Alex Goodman <ago...@users.noreply.github.com>
Committed: Mon Aug 1 14:11:27 2016 -0700

----------------------------------------------------------------------
 RCMES/run_RCMES.py               |  25 +++---
 ocw/data_source/local.py         |   2 +-
 ocw/dataset_loader.py            | 156 ++++++++++++++--------------
 ocw/tests/test_dataset_loader.py |  72 ++++------------
 4 files changed, 94 insertions(+), 161 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/RCMES/run_RCMES.py
----------------------------------------------------------------------
diff --git a/RCMES/run_RCMES.py b/RCMES/run_RCMES.py
index cd69bc4..3cc2245 100644
--- a/RCMES/run_RCMES.py
+++ b/RCMES/run_RCMES.py
@@ -39,7 +39,7 @@ from getpass import getpass
 
 from metrics_and_plots import *
 
-import ssl 
+import ssl
 if hasattr(ssl, '_create_unverified_context'):
     ssl._create_default_https_context = ssl._create_unverified_context
 
@@ -107,12 +107,13 @@ if 'longitude_name' in model_data_info.keys():
     boundary_check_model = True
 if 'GCM_data' in model_data_info.keys():
     if model_data_info['GCM_data']:
-        boundary_check_model = False 
+        boundary_check_model = False
 print 'Loading model datasets:\n',model_data_info
 if model_data_info['data_source'] == 'local':
-    model_datasets, model_names = local.load_multiple_files(file_path = model_data_info['path'],
-                                                            variable_name =model_data_info['variable'],
-                                                            lat_name=model_lat_name, lon_name=model_lon_name)
+    model_datasets = local.load_multiple_files(file_path=model_data_info['path'],
+                                               variable_name =model_data_info['variable'],
+                                               lat_name=model_lat_name, lon_name=model_lon_name)
+    model_names = [dataset.name for dataset in model_datasets]
 elif model_data_info['data_source'] == 'ESGF':
     md = esgf.load_dataset(dataset_id=model_data_info['dataset_id'],
                            variable=model_data_info['variable'],
@@ -166,7 +167,7 @@ for idata,dataset in enumerate(model_datasets):
 # generate grid points for regridding
 if config['regrid']['regrid_on_reference']:
     new_lat = ref_dataset.lats
-    new_lon = ref_dataset.lons 
+    new_lon = ref_dataset.lons
 else:
     delta_lat = config['regrid']['regrid_dlat']
     delta_lon = config['regrid']['regrid_dlon']
@@ -178,7 +179,7 @@ else:
 # number of models
 nmodel = len(model_datasets)
 print 'Dataset loading completed'
-print 'Observation data:', ref_name 
+print 'Observation data:', ref_name
 print 'Number of model datasets:',nmodel
 for model_name in model_names:
     print model_name
@@ -200,7 +201,7 @@ print 'Checking and converting variable units'
 ref_dataset = dsp.variable_unit_conversion(ref_dataset)
 for idata,dataset in enumerate(model_datasets):
     model_datasets[idata] = dsp.variable_unit_conversion(dataset)
-    
+
 print 'Generating multi-model ensemble'
 if len(model_datasets) >= 2.:
 
@@ -217,8 +218,8 @@ if config['use_subregions']:
     print 'Calculating spatial averages and standard deviations of ',str(nsubregion),' subregions'
 
-    ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions) 
-    model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions) 
+    ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
+    model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
 
 """ Step 7: Write a netCDF file """
 workdir = config['workdir']
@@ -231,7 +232,7 @@ if not os.path.exists(workdir):
 if config['use_subregions']:
     dsp.write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name, model_datasets, model_names,
                                                        path=workdir+config['output_netcdf_filename'],
-                                                       subregions=subregions, subregion_array = subregion_array, 
+                                                       subregions=subregions, subregion_array = subregion_array,
                                                        ref_subregion_mean=ref_subregion_mean, ref_subregion_std=ref_subregion_std,
                                                        model_subregion_mean=model_subregion_mean, model_subregion_std=model_subregion_std)
 else:
@@ -279,5 +280,3 @@ if nmetrics > 0:
                             file_name)
 else:
     print 'please check the currently supported metrics'
-
-


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/data_source/local.py
----------------------------------------------------------------------
diff --git a/ocw/data_source/local.py b/ocw/data_source/local.py
index 35041ac..98de937 100644
--- a/ocw/data_source/local.py
+++ b/ocw/data_source/local.py
@@ -350,7 +350,7 @@ def load_multiple_files(file_path,
         datasets.append(load_file(filename, variable_name, variable_unit,
                                   name=data_name[ifile], lat_name=lat_name,
                                   lon_name=lon_name, time_name=time_name))
-    return datasets, data_name
+    return datasets
 
 def load_WRF_2d_files_RAIN(file_path=None,
                            filename_pattern=None,


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/dataset_loader.py
----------------------------------------------------------------------
diff --git a/ocw/dataset_loader.py b/ocw/dataset_loader.py
index 8ee1b93..3c8b95f 100644
--- a/ocw/dataset_loader.py
+++ b/ocw/dataset_loader.py
@@ -27,20 +27,19 @@ import ocw.data_source.dap as dap
 
 
 class DatasetLoader:
-    '''Generate OCW Dataset objects from a variety of sources.'''
+    '''Generate a list of OCW Dataset objects from a variety of sources.'''
 
-    def __init__(self, reference, targets):
-        '''Generate OCW Dataset objects from a variety of sources.
+    def __init__(self, *loader_opts):
+        '''Generate a list of OCW Dataset objects from a variety of sources.
 
         Each keyword argument can be information for a dataset in dictionary
         form. For example:
         ``
-        >>> reference = {'data_source':'rcmed', 'name':'cru', 'dataset_id':10,
-                         'parameter_id':34}
-        >>> targets = {'data_source':'local_multiple',
-                       'path':'./data/CORDEX-Africa_data/AFRICA*pr.nc',
-                       'variable':'pr'}
-        >>> loader = DatasetLoader(reference, targets)
+        >>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
+                           'dataset_id': 10, 'parameter_id': 34}
+        >>> loader_opt2 = {'path': './data/TRMM_v7_3B43_1980-2010.nc',
+                           'variable': 'pcp'}
+        >>> loader = DatasetLoader(loader_opt1, loader_opt2)
         ``
 
         Or more conveniently if the loader configuration is defined in a
@@ -48,67 +47,57 @@ class DatasetLoader:
         ``
         >>> import yaml
         >>> config = yaml.load(open(config_file))
-        >>> loader = DatasetLoader(**config['datasets'])
+        >>> obs_loader_config = config['datasets']['reference']
+        >>> loader = DatasetLoader(*obs_loader_config)
         ``
 
-        As shown in the first example, the dictionary for each keyword argument
-        should contain a data source and parameters specific to the loader for
-        that data source. Once the configuration is entered, the datasets may be
-        loaded using:
+        As shown in the first example, the dictionary for each argument should
+        contain a loader name and parameters specific to the particular loader.
+        Once the configuration is entered, the datasets may be loaded using:
         ``
         >>> loader.load_datasets()
-        >>> target_datasets = loader.target_datasets
+        >>> obs_datasets = loader.datasets
         ``
 
-        If ``reference`` is entered as a keyword argument, then it may be
-        accesed from:
-        ``
-        >>> reference_dataset = loader.reference_dataset
-        ``
-
-        Additionally, each dataset must have a ``data_source`` keyword. This may
+        Additionally, each dataset must have a ``loader_name`` keyword. This may
         be one of the following:
-        * ``'local'`` - A single dataset file in a local directory
+        * ``'local'`` - One or multiple dataset files in a local directory
         * ``'local_split'`` - A single dataset split accross multiple files in a
                               local directory
-        * ``'local_multiple'`` - Multiple datasets in a local directory
         * ``'esgf'`` - Download the dataset from the Earth System Grid
                        Federation
        * ``'rcmed'`` - Download the dataset from the Regional Climate Model
                        Evaluation System Database
        * ``'dap'`` - Download the dataset from an OPeNDAP URL
 
-        Users who wish to download datasets from sources not described above
+        Users who wish to download datasets from loaders not described above
         may define their own custom dataset loader function and incorporate it
         as follows:
-        >>> loader.add_source_loader('my_source_name', my_loader_func)
-
-        :param reference: The reference dataset loader configuration.
-        :type reference: :mod:`dict`
+        >>> loader.add_source_loader('my_loader_name', my_loader_func)
 
-        :param targets: The target dataset loader configurations.
-        :type targets: :mod:`dict` or list of mod:`dict`
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :class:`dict`
 
         :raises KeyError: If an invalid argument is passed to a data source
                           loader function.
         '''
         # Reference dataset config
-        self.set_reference(**reference)
-
-        # Target dataset(s) config
-        self.set_targets(targets)
+        self.set_loader_opts(*loader_opts)
 
         # Default loaders
         self._source_loaders = {
-            'local': local.load_file,
+            'local': local.load_multiple_files,
             'local_split': local.load_dataset_from_multiple_netcdf_files,
-            'local_multiple': local.load_multiple_files,
             'esgf': esgf.load_dataset,
             'rcmed': rcmed.parameter_dataset,
             'dap': dap.load
         }
 
-    def add_source_loader(self, source_name, loader_func):
+    def add_source_loader(self, loader_name, loader_func):
         '''
         Add a custom source loader.
 
@@ -119,89 +108,72 @@ class DatasetLoader:
                             return an OCW Dataset object.
         :type loader_func: :class:`callable`
         '''
-        self._source_loaders[source_name] = loader_func
+        self._source_loaders[loader_name] = loader_func
 
-    def add_target(self, **kwargs):
+    def add_loader_opts(self, *loader_opts):
         '''
-        A convenient means of adding a target dataset to the loader.
-        :raises KeyError: If data_source is not specified.
+        A convenient means of adding loader options for each dataset to the
+        loader. If 'loader_name' is not entered as a keyword argument, then
+        'local' is used by default.
+
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :mod:`dict`
         '''
-        if 'data_source' not in kwargs:
-            raise KeyError('Dataset configuration must contain a data_source.')
-        self._target_config.append(kwargs)
+        for opt in loader_opts:
+            if 'loader_name' not in opt:
+                opt['loader_name'] = 'local'
+        self._config.extend(loader_opts)
 
-    def add_targets(self, targets):
+    def set_loader_opts(self, *loader_opts):
         '''
-        A convenient means of adding multiple target datasets to the loader.
-
-        :param targets: List of loader configurations for each target
-        :type targets: List of :mod:`dict`
-
-        :raises KeyError: If data_source is not specified.
+        Reset the dataset loader config.
+
+        :param loader_opts: Dictionaries containing each dataset loader
+                            configuration, representing the keyword arguments of
+                            the loader function specified by an additional key
+                            called 'loader_name'. If not specified by the user,
+                            this defaults to local.
+        :type loader_opts: :mod:`dict`
         '''
-        for target_config in targets:
-            self.add_target(**target_config)
-
-    def set_targets(self, targets):
-        '''
-        Reset the target dataset config.
-
-        :param targets: List of loader configurations for each target
-        :type targets: List of :mod:`dict`
-
-        :raises KeyError: If data_source is not specified.
-        '''
-        # This check allows for the user to enter targets as one block or
-        # as a list of separate blocks in their config files
-        if not isinstance(targets, list):
-            targets = [targets]
-        self._target_config = []
-        self.add_targets(targets)
-
-    def set_reference(self, **kwargs):
-        '''
-        Reset the reference dataset config.
-        :raises KeyError: If data_source is not specified.
-        '''
-        if 'data_source' not in kwargs:
-            raise KeyError('Dataset configuration must contain a data_source.')
-        self._reference_config = kwargs
+        self._config = []
+        self.add_loader_opts(*loader_opts)
 
     def load_datasets(self):
         '''
         Loads the datasets from the given loader configurations.
         '''
-        # Load the reference dataset
-        self.reference_dataset = self._load(**self._reference_config)
-
         # Ensure output is clear if loading is performed more than once to
         # prevent duplicates.
-        self.target_datasets = []
+        self.datasets = []
 
         # Load the target datasets
-        for loader_params in self._target_config:
-            output = self._load(**loader_params)
+        for loader_opt in self._config:
+            output = self._load(**loader_opt)
 
             # Need to account for the fact that some loaders return lists
             # of OCW Dataset objects instead of just one
             if isinstance(output, list):
-                self.target_datasets.extend(output)
+                self.datasets.extend(output)
             else:
-                self.target_datasets.append(output)
+                self.datasets.append(output)
 
     def _load(self, **kwargs):
        '''
        Generic dataset loading method.
        '''
-        # Extract the data source
-        data_source = kwargs.pop('data_source')
+        # Extract the loader name
+        loader_name = kwargs.pop('loader_name')
 
         # Find the correct loader function for the given data source
-        loader_func = self._source_loaders[data_source]
+        loader_func = self._source_loaders[loader_name]
 
         # The remaining kwargs should be specific to the loader
         output = loader_func(**kwargs)
 
-        # Preserve data_source info for later use
-        kwargs['data_source'] = data_source
+        # Preserve loader_name info for later use
+        kwargs['loader_name'] = loader_name
 
         return output


http://git-wip-us.apache.org/repos/asf/climate/blob/b2f0ad4b/ocw/tests/test_dataset_loader.py
----------------------------------------------------------------------
diff --git a/ocw/tests/test_dataset_loader.py b/ocw/tests/test_dataset_loader.py
index 2d192c1..b3c613b 100644
--- a/ocw/tests/test_dataset_loader.py
+++ b/ocw/tests/test_dataset_loader.py
@@ -17,7 +17,6 @@
 import unittest
 import os
-import copy
 import netCDF4
 import numpy as np
 from ocw.dataset import Dataset
@@ -37,13 +36,8 @@ class TestDatasetLoader(unittest.TestCase):
         self.values2 = self.values + 1
 
         # Set up config
-        self.reference_config = {'data_source': 'local',
-                                 'file_path': self.file_path,
-                                 'variable_name': 'value'}
-        self.target_config = copy.deepcopy(self.reference_config)
-        self.no_data_source_config = {'file_path': self.file_path,
-                                      'variable_name': 'value'}
-        self.new_data_source_config = {'data_source': 'foo',
+        self.config = {'file_path': self.file_path, 'variable_name': 'value'}
+        self.new_data_source_config = {'loader_name': 'foo',
                                        'lats': self.latitudes,
                                        'lons': self.longitudes,
                                        'times': self.times,
@@ -53,77 +47,45 @@ class TestDatasetLoader(unittest.TestCase):
     def tearDown(self):
         os.remove(self.file_path)
 
-    def testInputHasDataSource(self):
-        '''
-        Make sure input data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.no_data_source_config)
-
-    def testReferenceHasDataSource(self):
-        '''
-        Make sure ref data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.target_config)
-            self.loader.set_reference(**self.no_data_source_config)
-
-    def testTargetHasDataSource(self):
-        '''
-        Make sure target data source is specified for each dataset to be loaded
-        '''
-        with self.assertRaises(KeyError):
-            self.loader = DatasetLoader(self.reference_config,
-                                        self.target_config)
-            self.loader.add_target(**self.no_data_source_config)
-
     def testNewDataSource(self):
         '''
         Ensures that custom data source loaders can be added
         '''
-        self.loader = DatasetLoader(self.new_data_source_config,
-                                    self.target_config)
+        self.loader = DatasetLoader(self.new_data_source_config)
 
-        # Here the the data_source "foo" represents the Dataset constructor
+        # Here the data_source "foo" represents the Dataset constructor
         self.loader.add_source_loader('foo', build_dataset)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.reference_dataset.origin['source'],
-                         'foo')
-        np.testing.assert_array_equal(self.loader.reference_dataset.values,
+        self.assertEqual(self.loader.datasets[0].origin['source'], 'foo')
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values2)
 
     def testExistingDataSource(self):
         '''
         Ensures that existing data source loaders can be added
         '''
-        self.loader = DatasetLoader(self.reference_config,
-                                    self.target_config)
+        self.loader = DatasetLoader(self.config)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.reference_dataset.origin['source'],
-                         'local')
-        np.testing.assert_array_equal(self.loader.reference_dataset.values,
+        self.assertEqual(self.loader.datasets[0].origin['source'], 'local')
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values)
 
-    def testMultipleTargets(self):
+    def testMultipleDataSources(self):
         '''
-        Test for when multiple target dataset configs are specified
+        Test for when multiple dataset configs are specified
         '''
-        self.loader = DatasetLoader(self.reference_config,
-                                    [self.target_config,
-                                     self.new_data_source_config])
+        self.loader = DatasetLoader(self.config, self.new_data_source_config)
 
-        # Here the the data_source "foo" represents the Dataset constructor
+        # Here the data_source "foo" represents the Dataset constructor
         self.loader.add_source_loader('foo', build_dataset)
         self.loader.load_datasets()
-        self.assertEqual(self.loader.target_datasets[0].origin['source'],
+        self.assertEqual(self.loader.datasets[0].origin['source'],
                          'local')
-        self.assertEqual(self.loader.target_datasets[1].origin['source'],
+        self.assertEqual(self.loader.datasets[1].origin['source'],
                          'foo')
-        np.testing.assert_array_equal(self.loader.target_datasets[0].values,
+        np.testing.assert_array_equal(self.loader.datasets[0].values,
                                       self.values)
-        np.testing.assert_array_equal(self.loader.target_datasets[1].values,
+        np.testing.assert_array_equal(self.loader.datasets[1].values,
                                       self.values2)
 
 
 def build_dataset(*args, **kwargs):
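
----------------------------------------------------------------------

For readers skimming the diff, the refactored workflow reduces to the
pattern below. This is a minimal sketch assembled from the docstrings and
tests in this commit; the file path, the RCMED dataset_id/parameter_id
values, and the my_custom_loader helper are illustrative assumptions, not
part of the commit.

    from ocw.dataset import Dataset
    from ocw.dataset_loader import DatasetLoader

    # One dict per dataset. 'loader_name' selects the loader function and,
    # when omitted, defaults to 'local' (now local.load_multiple_files).
    obs_opts = {'loader_name': 'rcmed', 'name': 'cru',
                'dataset_id': 10, 'parameter_id': 34}
    model_opts = {'file_path': './data/model_pr.nc',  # hypothetical path
                  'variable_name': 'pr'}
    loader = DatasetLoader(obs_opts, model_opts)

    # Custom loaders plug in by name; the callable must return an OCW
    # Dataset (or a list of Datasets, which load_datasets() flattens).
    def my_custom_loader(**kwargs):  # hypothetical custom loader
        return Dataset(kwargs['lats'], kwargs['lons'], kwargs['times'],
                       kwargs['values'], name='custom')

    loader.add_source_loader('my_loader_name', my_custom_loader)

    # Every loaded dataset now lands in the single flat list
    # loader.datasets; the old reference_dataset / target_datasets
    # attributes are gone.
    loader.load_datasets()
    for dataset in loader.datasets:
        print dataset.name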