[1/3] climate git commit: CLIMATE-926 - Metadata Extractors

huikyole Mon, 25 Sep 2017 11:13:04 -0700

Repository: climate
Updated Branches:
  refs/heads/master cf4fb57fd -> fce720570



CLIMATE-926 - Metadata Extractors


Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/8217d12f
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/8217d12f
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/8217d12f

Branch: refs/heads/master
Commit: 8217d12f06987d852f9294da94a5af243116e751
Parents: cf4fb57
Author: Alex Goodman <ago...@users.noreply.github.com>
Authored: Mon Sep 25 10:35:20 2017 -0700
Committer: Alex Goodman <ago...@users.noreply.github.com>
Committed: Mon Sep 25 10:35:20 2017 -0700

----------------------------------------------------------------------
 RCMES/CORDEX/metadata_extractor.py | 222 ++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/climate/blob/8217d12f/RCMES/CORDEX/metadata_extractor.py
----------------------------------------------------------------------
diff --git a/RCMES/CORDEX/metadata_extractor.py b/RCMES/CORDEX/metadata_extractor.py
new file mode 100644
index 0000000..7351cf4
--- /dev/null
+++ b/RCMES/CORDEX/metadata_extractor.py
@@ -0,0 +1,222 @@
+import glob
+import os
+
+
+class MetadataExtractor(object):
+    def __init__(self, *paths):
+        """Extracts metadata from data filenames.
+
+        Instances of MetadataExtractor are used to extract metadata from
+        filenames in bulk. Example usage:
+        >>> extractor = MetadataExtractor('/path/to/data')
+
+        Suppose the data in this directory had the following files:
+        pr_*.nc, uas_*.nc, vas_*.nc
+
+        All of the metadata lies in the data attribute:
+        >>> extractor.data
+        [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'},
+         {'filename': '/path/to/data/vas_*.nc', 'variable': 'vas'},
+         {'filename': '/path/to/data/uas_*.nc', 'variable': 'uas'}]
+
+        Results can be narrowed down by specifying values for a field:
+        >>> extractor.query(variable='pr')
+        [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'}]
+
+        Finally, metadata from two sets of extractors can be grouped together
+        based on common field name as follows:
+        >>> extractor.group(extractor2, 'variable')
+
+        This class should only be used as a starting point. We recommend using
+        the included obs4MIPSMetadataExtractor and CORDEXMetadataExtractor
+        subclasses or creating your own subclass for your use case.
+        """
+        self.paths = paths
+
+    @property
+    def data(self):
+        """
+        The extracted metadata for each file, with all fields listed in
+        the fields attribute included.
+        """
+        return self._data
+
+    @property
+    def paths(self):
+        """
+        Search paths containing the dataset files.
+        """
+        return self._paths
+
+    @paths.setter
+    def paths(self, paths):
+        """
+        Extracts the metadata from scratch when paths are reset.
+        """
+        self._paths = paths
+        self._extract()
+
+    @property
+    def fields(self):
+        """
+        The names of the fields in the filename, assuming the fully filtered
+        filename conforms to the following convention:
+        filename = <field[0]>_<field[1]>_..._<field[n]>.nc. Using fewer fields
+        than the filename defines is allowed.
+        """
+        fields = ['variable']
+        return fields
+
+    @property
+    def files(self):
+        """
+        List of files (or regular expressions) for each dataset.
+        """
+        files = []
+        for path in self.paths:
+            files.extend(glob.glob(os.path.join(path, '*.nc')))
+        return list(set(self.get_pattern(fname) for fname in files))
+
+    @property
+    def variables(self):
+        """
+        Get the list of variables included across all the datasets.
+        """
+        return self.get_field('variable')
+
+    def query(self, **kwargs):
+        """
+        Narrow down the list of files by field names.
+        """
+        fields = kwargs.keys()
+        if not set(fields).issubset(set(self.fields)):
+            raise ValueError("Invalid fields: {}. Must be subset of: {}"
+                             .format(fields, self.fields))
+        data = self.data
+        for field, value in kwargs.items():
+            value = value if isinstance(value, list) else [value]
+            data = [meta for meta in data if meta[field] in value]
+        return data
+
+    def group(self, extractor, field):
+        """
+        Compare the data of this extractor with another extractor instance
+        and group each of their metadata together by given field.
+        """
+        # First we only want to consider values of field which are contained
+        # in both extractors
+        subset = self.get_field(field)
+        other_subset = extractor.get_field(field)
+        intersection = list(subset.intersection(other_subset))
+
+        # Next we will group the datasets in each extractor together by common
+        # field values
+        kwargs = {field: intersection}
+        results = self.query(**kwargs)
+
+        groups = []
+        for meta in results:
+            val = meta[field]
+            kwargs.update({field: val})
+            match = extractor.query(**kwargs)
+            groups.append((meta, match))
+
+        return groups
+
+    def get_field(self, field):
+        """
+        Returns only the selected field of the extracted data.
+        """
+        if field not in self.fields:
+            raise ValueError("Invalid field: {}. Must be one of: {}"
+                             .format(field, self.fields))
+        sub = set(meta[field] for meta in self.data)
+        return sub
+
+    def filter_filename(self, fname):
+        """
+        Applies a filter to each individual filename contained in the files
+        attribute, which is useful if some files within a data set are known
+        to not follow conventions, and "fix" them so that they do.
+        """
+        return os.path.basename(fname)
+
+    def get_pattern(self, fname):
+        """
+        Used to group multiple file datasets together via regular expressions.
+        The most common convention is to split files by time periods, which
+        are generally the last field in a filename.
+        """
+        base = fname.split('_')
+        pattern = '_'.join(base[:len(self.fields)] + ['*.nc'])
+        return pattern
+
+    def _extract(self):
+        """
+        Do the actual metadata extraction from the list of filenames given
+        by the files property. Additionally, filenames can also be filtered
+        via filter_filename() to remove unwanted characters from the extraction.
+        """
+        self._data = []
+        for fname in self.files:
+            meta = dict(filename=fname)
+
+            # Perform the actual metadata extraction
+            fname = self.filter_filename(fname)
+            meta.update(dict(zip(self.fields, fname.split('_')[:-1])))
+            self._data.append(meta)
+
+
+class obs4MIPSMetadataExtractor(MetadataExtractor):
+    @property
+    def instruments(self):
+        """
+        Get the list of instruments across all the datasets.
+        """
+        return self.get_field('instrument')
+
+    @property
+    def fields(self):
+        """
+        obs4MIPs fields
+        """
+        fields = ['variable', 'instrument', 'processing_level', 'version']
+        return fields
+
+    def filter_filename(self, fname):
+        """
+        CALIPSO files have odd naming conventions, so we will use
+        a modified version to conform to standard obs4MIPs conventions.
+        """
+        fname = os.path.basename(fname)
+        fname = fname.replace('_obs4MIPs_', '_')
+        fname = fname.replace('calipso', '')
+        fname = fname.replace('Lidarsr532', '')
+        return fname
+
+    def get_pattern(self, fname):
+        """
+        Overridden to deal with CALIPSO filenames
+        """
+        base = fname.split('_')
+        offset = -2 if len(base) != 5 else -1
+        pattern = '_'.join(base[:offset] + ['*.nc'])
+        return pattern
+        
+        
+class CORDEXMetadataExtractor(MetadataExtractor):
+    @property
+    def models(self):
+        """
+        Get the list of models across all the datasets.
+        """
+        return self.get_field('models')
+
+    @property
+    def fields(self):
+        """
+        CORDEX fields
+        """
+        fields = ['variable', 'domain', 'driving_model', 'experiment',
+                  'ensemble', 'model', 'version', 'time_step']
+        return fields

[1/3] climate git commit: CLIMATE-926 - Metadata Extractors

Reply via email to