CLIMATE-518 - Add multi-dataset handling - Add multi-dataset handling to the ESGF data source. Loaded dataset(s) are returned as a list now. - Add support for elevation layer selection when loading a dataset.
Project: http://git-wip-us.apache.org/repos/asf/climate/repo Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/08b780d7 Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/08b780d7 Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/08b780d7 Branch: refs/heads/master Commit: 08b780d7004dc0ef254c5da595fd2152960485f9 Parents: 1094b18 Author: Michael Joyce <[email protected]> Authored: Fri Oct 24 08:15:28 2014 -0700 Committer: Michael Joyce <[email protected]> Committed: Fri Oct 24 08:15:28 2014 -0700 ---------------------------------------------------------------------- ocw/data_source/esgf.py | 81 +++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/climate/blob/08b780d7/ocw/data_source/esgf.py ---------------------------------------------------------------------- diff --git a/ocw/data_source/esgf.py b/ocw/data_source/esgf.py index a072b21..f138367 100644 --- a/ocw/data_source/esgf.py +++ b/ocw/data_source/esgf.py @@ -19,24 +19,24 @@ import urllib2 -from ocw.esgf.constants import JPL_SEARCH_SERVICE_URL +from ocw.esgf.constants import DEFAULT_ESGF_SEARCH from ocw.esgf.download import download from ocw.esgf.logon2 import logon2 from ocw.esgf.search import SearchClient import ocw.data_source.local as local +from bs4 import BeautifulSoup +import requests + def load_dataset(dataset_id, variable, esgf_username, esgf_password, - search_url=JPL_SEARCH_SERVICE_URL, + search_url=DEFAULT_ESGF_SEARCH, + elevation_index=0, **additional_constraints): ''' Load an ESGF dataset. - .. note: - Currently, multi-file datasets aren't supported. This functionality - will be added soon! - :param dataset_id: The ESGF ID of the dataset to load. :type dataset_id: String :param variable: The variable to load. @@ -48,6 +48,8 @@ def load_dataset(dataset_id, :param search_url: (Optional) The ESGF node to use for searching. Defaults to the Jet Propulsion Laboratory node. :type search_url: String + :param elevation_index: (Optional) The elevation level to strip out when + loading the dataset using ocw.data_source.local. :param additional_constraints: (Optional) Additional key,value pairs to pass as constraints to the search wrapper. These can be anything found on the ESGF metadata page for a dataset. @@ -57,41 +59,44 @@ def load_dataset(dataset_id, :raises ValueError: If no dataset can be found for the supplied ID and variable, or if the requested dataset is a multi-file dataset. ''' - urls = _get_file_urls(url=search_url, - id=dataset_id, - variable=variable, - **additional_constraints) - - if len(urls) > 1: - err = ( - "esgf.load_dataset: Unable to handle multi-file datasets. " - "Feature coming soon ..." - ) - raise ValueError(err) - elif len(urls) == 0: - err = ( - "esgf.load_dataset: No files found for specified dataset." - ) - raise ValueError(err) + download_data = _get_file_download_data(url=search_url, + dataset_id=dataset_id, + variable=variable) + + datasets = [] + for url, var in download_data: + _download_files([url], esgf_username, esgf_password) + datasets.append(local.load_file('/tmp/' + url.split('/')[-1], + var, + elevation_index=elevation_index)) - # TODO: In the future, we need to combine multi-file datasets into a single - # file and then load it. For now we're only handling a single file use case. - _download_files(urls, esgf_username, esgf_password) - return local.load_file('/tmp/' + urls[0].split('/')[-1], variable) + return datasets -def _get_file_urls(**constraints): +def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH): '''''' - # Allow the user to optionally specify the URL for a search node to use. - # Default to the Jet Propulsion Laboratory node if nothing is specified. - if 'url' in constraints: - url = constraints['url'] - constraints.pop('url', None) - else: - url = ocw.esgf.constants.JPL_SEARCH_SERVICE_URL - - sc = SearchClient(searchServiceUrl=url, distrib=False) - sc.setConstraint(**constraints) - return sc.getFiles() + url += '?distrib=false&type=File&dataset_id={}&variable={}' + url = url.format(dataset_id, variable) + + r = requests.get(url) + xml = BeautifulSoup(r.content) + + dont_have_results = not bool(xml.response.result['numfound']) + + if dont_have_results: + err = "esgf.load_dataset: No files found for specified dataset." + raise ValueError(err) + + # Split out URLs for dataset download along with variable names for each + # of those files. + url_groups = xml.response.result.findAll('arr', {'name': 'url'}) + variable_groups = xml.response.result.findAll('arr', {'name': 'variable'}) + + urls = [group.findAll('str')[0].string.split('|')[0] + for group in url_groups] + variables = [group.findAll('str')[0].string + for group in variable_groups] + + return zip(urls, variables) def _download_files(file_urls, username, password, download_directory='/tmp'): ''''''
