http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/db.py ---------------------------------------------------------------------- diff --git a/rcmet/src/main/python/rcmes/storage/db.py b/rcmet/src/main/python/rcmes/storage/db.py deleted file mode 100644 index 22a1dfd..0000000 --- a/rcmet/src/main/python/rcmes/storage/db.py +++ /dev/null @@ -1,359 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Collection of functions used to interface with the database and to create netCDF file -""" -import os -import urllib2 -import re -import numpy as np -import numpy.ma as ma -import json -import netCDF4 - -from classes import RCMED -from toolkit import process -from datetime import timedelta ,datetime -from calendar import monthrange - -def reorderXYT(lons, lats, times, values): - # Re-order values in values array such that when reshaped everywhere is where it should be - # (as DB doesn't necessarily return everything in order) - order = np.lexsort((lons, lats, times)) - counter = 0 - sortedValues = np.zeros_like(values) - sortedLats = np.zeros_like(lats) - sortedLons = np.zeros_like(lons) - for i in order: - sortedValues[counter] = values[i] - sortedLats[counter] = lats[i] - sortedLons[counter] = lons[i] - counter += 1 - - return sortedValues, sortedLats, sortedLons - -def findUnique(seq, idfun=None): - """ - Function to find unique values (used in construction of unique datetime list) - NB. order preserving - Input: seq - a list of randomly ordered values - Output: result - list of ordered values - """ - if idfun is None: - def idfun(x): - return x - - seen = {}; - result = [] - - for item in seq: - marker = idfun(item) - # in old Python versions: - # if seen.has_key(marker) - # but in new ones: - if marker in seen: continue - seen[marker] = 1 - result.append(item) - return result - -def get_param_info(url): - - ''' - This function will get the general information by given URL from the parameter table. - ''' - url = url + "&info=yes" - result = urllib2.urlopen(url) - datastring = result.read() - datastring=json.loads(datastring) - database=datastring["database"] - timestep=datastring["timestep"] - realm=datastring["realm"] - instrument=datastring["instrument"] - start_date=datastring["start_date"] - end_date=datastring["end_date"] - unit=datastring["units"] - - return database, timestep, realm, instrument, start_date, end_date, unit - -def get_data(url): - - ''' - This function will get the url, query from database and will return datapoints' latitude, longitude, level, time and value. 
- ''' - - result = urllib2.urlopen(url) - datastring = result.read() - d = re.search('data: \r\n', datastring) - data = datastring[d.end():len(datastring)] - - # To create a list of all datapoints - data=data.split('\r\n') - - latitudes = [] - longitudes = [] - levels = [] - values = [] - timestamps = [] - - # To make a series of lists from datapoints - for i in range(len(data)-1): # Because the last row is empty, "len(data)-1" is used. - row=data[i].split(',') - latitudes.append(np.float32(row[0])) - longitudes.append(np.float32(row[1])) - levels.append(np.float32(row[2])) - # timestamps are strings so we will leave them alone for now - timestamps.append(row[3]) - values.append(np.float32(row[4])) - - return latitudes, longitudes, levels, values, timestamps - - -def create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName): - - ''' - This function will generate netCDF files. - ''' - - # To generate netCDF file from database - netcdf = netCDF4.Dataset(netCD_fileName,mode='w') - string="The netCDF file for parameter: " + database + ", latMin: " + str(latMin) + ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) + " startTime: " + str(startTime) + " and endTime: " + str(endTime) + "." - netcdf.globalAttName = str(string) - netcdf.createDimension('dim', len(latitudes)) - latitude = netcdf.createVariable('lat', 'd', ('dim',)) - longitude = netcdf.createVariable('lon', 'd', ('dim',)) - level = netcdf.createVariable('lev', 'd', ('dim',)) - time = netcdf.createVariable('time', 'd', ('dim',)) - value = netcdf.createVariable('value', 'd', ('dim',)) - - netcdf.variables['lat'].varAttName = 'latitude' - netcdf.variables['lat'].units = 'degrees_north' - netcdf.variables['lon'].varAttName = 'longitude' - netcdf.variables['lon'].units = 'degrees_east' - netcdf.variables['time'].varAttName = 'time' - netcdf.variables['time'].units = 'hours since ' + str(startTime) - netcdf.variables['value'].varAttName = 'value' - netcdf.variables['value'].units = str(unit) - netcdf.variables['lev'].varAttName = 'level' - netcdf.variables['lev'].units = 'hPa' - - hours=[] - timeFormat = "%Y-%m-%d %H:%M:%S" - base_date=startTime - # To convert the date to hours - for t in timestamps: - date=datetime.strptime(t, timeFormat) - diff=date-base_date - hours.append(diff.days*24) - - latitude[:]=latitudes[:] - longitude[:]=longitudes[:] - level[:]=levels[:] - time[:]=hours[:] - value[:]=values[:] - netcdf.close() - -def read_netcdf(netCD_fileName): - - ''' - This function will read the existed netCDF file, convert the hours from netCDF time variable - and return latitudes, longitudes, levels, times and values. 
- ''' - # To use the created netCDF file - netcdf = netCDF4.Dataset(netCD_fileName , mode='r') - # To get all data from netCDF file - latitudes = netcdf.variables['lat'][:] - longitudes = netcdf.variables['lon'][:] - levels = netcdf.variables['lev'][:] - hours = netcdf.variables['time'][:] - values = ma.array(netcdf.variables['value'][:]) - - # To get the base date - time_unit=netcdf.variables['time'].units.encode() - time_unit=time_unit.split(' ') - base_date=time_unit[2] + " " + time_unit[3] - - netcdf.close() - - timeFormat = "%Y-%m-%d %H:%M:%S" - - # Because time in netCDF file is based on hours since a specific date, it needs to be converted to date format - times=[] - # To convert the base date to the python datetime format - base_date = datetime.strptime(base_date, timeFormat) - for each in range(len(hours)): - hour=timedelta(hours[each]/24) - eachTime=base_date + hour - times.append(str(eachTime.year) + '-' + str("%02d" % (eachTime.month)) + '-' + str("%02d" % (eachTime.day)) + ' ' + str("%02d" % (eachTime.hour)) + ':' + str("%02d" % (eachTime.minute)) + ':' + str("%02d" % (eachTime.second))) - - return latitudes, longitudes, levels, times, values - - -def improve_data(latitudes, longitudes, levels, times, values, timestep): - - # Make arrays of unique latitudes, longitudes, levels and times - uniqueLatitudes = np.unique(latitudes) - uniqueLongitudes = np.unique(longitudes) - uniqueLevels = np.unique(levels) - uniqueTimestamps = np.unique(times) - - # Calculate nx and ny - uniqueLongitudeCount = len(uniqueLongitudes) - uniqueLatitudeCount = len(uniqueLatitudes) - uniqueLevelCount = len(uniqueLevels) - uniqueTimeCount = len(uniqueTimestamps) - - values, latitudes, longitudes = reorderXYT(longitudes, latitudes, times, values) - - # Convert each unique time from strings into list of Python datetime objects - # TODO - LIST COMPS! 
- timeFormat = "%Y-%m-%d %H:%M:%S" - timesUnique = [datetime.strptime(t, timeFormat) for t in uniqueTimestamps] - timesUnique.sort() - timesUnique = process.normalizeDatetimes(timesUnique, timestep) - - # Reshape arrays - latitudes = latitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount) - longitudes = longitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount) - levels = np.array(levels).reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount) - values = values.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount) - - # Flatten dimension if only single level - if uniqueLevelCount == 1: - values = values[:, :, :, 0] - latitudes = latitudes[0, :, :, 0] - longitudes = longitudes[0, :, :, 0] - - # Created masked array to deal with missing values - # -these make functions like values.mean(), values.max() etc ignore missing values - mdi = -9999 # TODO: extract this value from the DB retrieval metadata - mdata = ma.masked_array(values, mask=(values == mdi)) - - - return latitudes, longitudes, uniqueLevels, timesUnique, mdata - - -def extractData ( datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep ): - - """ - Main function to extract data from DB into numpy masked arrays, and also to create monthly netCDF file as cache - - Input:: - datasetID, paramID: required identifiers of data in database - latMin, latMax, lonMin, lonMax: location range to extract data for - startTime, endTime: python datetime objects describing required time range to extract - cachedir: directory path used to store temporary cache files - timestep: "daily" | "monthly" so we can be sure to query the RCMED properly - Output: - uniqueLatitudes,uniqueLongitudes: 1d-numpy array of latitude and longitude grid values - uniqueLevels: 1d-numpy array of vertical level values - timesUnique: list of python datetime objects describing times of returned data - mdata: masked numpy arrays of data values - """ - - url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep) - - # To get the parameter's information from parameter table - database, timestep, realm, instrument, dbStartDate, dbEndDate, unit = get_param_info(url) - - # Create a directory inside the cache directory - name = [] - # activity is a fix value - activity = "obs4cmip5" - name.append(activity) - # product is a fix value - product = "observations" - name.append(product) - # realm, variable,frequency and instrument will be get from parameter table - realm = realm - name.append(realm) - variable = database - name.append(variable) - frequency = timestep - name.append(frequency) - data_structure = "grid" - name.append(data_structure) - institution = "NASA" - name.append(institution) - project = "RCMES" - name.append(project) - instrument = instrument - name.append(instrument) - version = "v1" - name.append(version) - - # Check to see whether the folder is already created for netCDF or not, then it will be created - temp_path = cachedir - for n in name: - path = os.path.join(temp_path, n) - if os.path.exists(path): - temp_path = path - pass - else: - os.mkdir(path) - temp_path = path - - processing_level = 'L3' - processing_version = "processing_version" # the processing version is still unknown and can be added later - - timeFormat = "%Y-%m-%d %H:%M:%S" - - date_list, lats, longs, uniqueLevls, uniqueTimes, vals = [], [], [], [], [], 
[] - - # To make a list (date_list) of all months available based on user time request - while userStartTime <= userEndTime: - #To get the beginning of month - beginningOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + "01 00:00:00" - #To get the end of month - endOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + str(monthrange(userStartTime.year,userStartTime.month)[1]) + " 00:00:00" - #To convert both beginning and end of month from string to Python datetime format - beginningOfMonth = datetime.strptime(beginningOfMonth, timeFormat) - endOfMonth = datetime.strptime(endOfMonth, timeFormat) - #To add beginning and end of month as a list to the date_list list - date_list.append([beginningOfMonth, endOfMonth]) - #To get the beginning of next month - userStartTime= endOfMonth + timedelta(days=1) - - - # To loop over all months and return data - for i, date in enumerate(date_list): - netCDF_name = variable + '_' + project + '_' + processing_level + '_' + processing_version + '_' + str(latMin) + '_' + str(latMax) + '_' + str(lonMin) + '_' + str(lonMax) + '_' + str("%04d" % date[0].year) + str("%02d" % date[0].month) + '.nc' - - # To check if netCDF file exists, then use it - if os.path.exists(path+"/"+ netCDF_name): - latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name) - - # If the netCDF file does not exist, then create one and read it. - else: - # To just query for one year of data - print "%s of %s Database Download(s) Complete" % (i, len(date_list)) - url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, date[0], date[1], cachedir, timestep) - - # To get data from DB - latitudes, longitudes, levels, values, timestamps = get_data(url) - create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, date[0], date[1], unit, path + "/" + netCDF_name) - - # To read from netCDF files - latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name) - - lats=np.append(lats,latitudes) - longs=np.append(longs,longitudes) - uniqueLevls=np.append(uniqueLevls,levels) - uniqueTimes=np.append(uniqueTimes,times) - vals=np.append(vals,values) - - latitudes, longitudes, uniqueLevels, timesUnique, mdata = improve_data(lats, longs, uniqueLevls, uniqueTimes, vals, timestep) - - return latitudes, longitudes, uniqueLevels, timesUnique, mdata
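A note on the deleted db.py above: reorderXYT re-sorts the flat arrays returned by the database with np.lexsort, whose LAST key is the primary sort key (times), followed by lats and then lons, before improve_data reshapes the arrays into (time, lat, lon, level) grids. The element-by-element copy loop it used can be replaced with NumPy fancy indexing. Below is a minimal standalone sketch of that ordering, with invented toy data rather than real RCMED output:

    # Sketch of the reorderXYT ordering using NumPy fancy indexing.
    # Variable names mirror the deleted db.py; the data points are invented.
    import numpy as np

    lons   = np.array([10.0,  0.0, 10.0,  0.0])
    lats   = np.array([40.0, 40.0, 50.0, 50.0])
    times  = np.array([ 1,    1,    1,    1  ])
    values = np.array([ 1.0,  2.0,  3.0,  4.0])

    # lexsort treats the last key as primary: times, then lats, then lons,
    # matching the (lons, lats, times) key order used in reorderXYT.
    order = np.lexsort((lons, lats, times))

    # Fancy indexing replaces the counter loop in reorderXYT.
    sortedValues = values[order]
    sortedLats   = lats[order]
    sortedLons   = lons[order]

    print sortedLons    # [  0.  10.   0.  10.]
    print sortedLats    # [ 40.  40.  50.  50.]
    print sortedValues  # [ 2.  1.  4.  3.]

The result is the point order that the reshape in improve_data expects: time varying slowest, then latitude, then longitude.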
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/files.py ---------------------------------------------------------------------- diff --git a/rcmet/src/main/python/rcmes/storage/files.py b/rcmet/src/main/python/rcmes/storage/files.py deleted file mode 100644 index b238754..0000000 --- a/rcmet/src/main/python/rcmes/storage/files.py +++ /dev/null @@ -1,783 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -Module for handling data input files. Requires netCDF and Numpy be -installed. - -This module can easily open NetCDF, HDF and Grib files. Search the netCDF4 -documentation for a complete list of supported formats. -""" - -from os import path -import netCDF4 -import numpy as np -import numpy.ma as ma -import sys - -from toolkit import process -from utils import fortranfile -from utils import misc - - -VARIABLE_NAMES = {'time': ['time', 'times', 'date', 'dates', 'julian'], - 'latitude': ['latitude', 'lat', 'lats', 'latitudes'], - 'longitude': ['longitude', 'lon', 'lons', 'longitudes'] - } - - -def findunique(seq): - keys = {} - for e in seq: - keys[e] = 1 - return keys.keys() - -def getVariableByType(filename, variableType): - """ - Function that will try to return the variable from a file based on a provided - parameter type. - - Input:: - filename - the file to inspect - variableType - time | latitude | longitude - - Output:: - variable name OR list of all variables in the file if a single variable - name match cannot be found. - """ - try: - f = netCDF4.Dataset(filename, mode='r') - except: - print "netCDF4Error:", sys.exc_info()[0] - raise - - variableKeys = f.variables.keys() - f.close() - variableKeys = [variable.encode().lower() for variable in variableKeys] - variableMatch = VARIABLE_NAMES[variableType] - - commonVariables = list(set(variableKeys).intersection(variableMatch)) - - if len(commonVariables) == 1: - return str(commonVariables[0]) - - else: - return variableKeys - -def getVariableRange(filename, variableName): - """ - Function to return the min and max values of the given variable in - the supplied filename. 
- - Input:: - filename - absolute path to a file - variableName - variable whose min and max values should be returned - - Output:: - variableRange - tuple of order (variableMin, variableMax) - """ - try: - f = netCDF4.Dataset(filename, mode='r') - except: - print "netCDF4Error:", sys.exc_info()[0] - raise - - varArray = f.variables[variableName][:] - return (varArray.min(), varArray.max()) - - -def read_data_from_file_list(filelist, myvar, timeVarName, latVarName, lonVarName): - ''' - Read in data from a list of model files into a single data structure - - Input: - filelist - list of filenames (including path) - myvar - string containing name of variable to load in (as it appears in file) - Output: - lat, lon - 2D array of latitude and longitude values - timestore - list of times - t2store - numpy array containing data from all files - - NB. originally written specific for WRF netCDF output files - modified to make more general (Feb 2011) - - Peter Lean July 2010 - ''' - - filelist.sort() - filename = filelist[0] - # Crash nicely if 'filelist' is zero length - """TODO: Throw Error instead via try Except""" - if len(filelist) == 0: - print 'Error: no files have been passed to read_data_from_file_list()' - sys.exit() - - # Open the first file in the list to: - # i) read in lats, lons - # ii) find out how many timesteps in the file - # (assume same ntimes in each file in list) - # -allows you to create an empty array to store variable data for all times - tmp = netCDF4.Dataset(filename, mode='r') - latsraw = tmp.variables[latVarName][:] - lonsraw = tmp.variables[lonVarName][:] - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - if(latsraw.ndim == 2): - lon = lonsraw - lat = latsraw - - timesraw = tmp.variables[timeVarName] - ntimes = len(timesraw) - - print 'Lats and lons read in for first file in filelist' - - # Create a single empty masked array to store model data from all files - t2store = ma.zeros((ntimes * len(filelist), len(lat[:, 0]), len(lon[0, :]))) - timestore = ma.zeros((ntimes * len(filelist))) - - # Now load in the data for real - # NB. no need to reload in the latitudes and longitudes -assume invariant - i = 0 - timesaccu = 0 # a counter for number of times stored so far in t2store - # NB. this method allows for missing times in data files - # as no assumption made that same number of times in each file... - - - for i, ifile in enumerate(filelist): - - #print 'Loading data from file: ',filelist[i] - f = netCDF4.Dataset(ifile, mode='r') - t2raw = ma.array(f.variables[myvar][:]) - timesraw = f.variables[timeVarName] - time = timesraw[:] - ntimes = len(time) - print 'file= ', i, 'ntimes= ', ntimes, filelist[i] - print 't2raw shape: ', t2raw.shape - - # Flatten dimensions which needn't exist, i.e. level - # e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do. - # Code requires data to have dimensions, (time,lat,lon) - # i.e. remove level dimensions - # Remove 1d axis from the t2raw array - # Example: t2raw.shape == (365, 180, 360 1) <maps to (time, lat, lon, height)> - # After the squeeze you will be left with (365, 180, 360) instead - t2tmp = t2raw.squeeze() - # Nb. if this happens to be data for a single time only, then we just flattened it by accident - # lets put it back... 
- if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - - t2store[timesaccu + np.arange(ntimes), :, :] = t2tmp[:, :, :] - timestore[timesaccu + np.arange(ntimes)] = time - timesaccu += ntimes - f.close() - - print 'Data read in successfully with dimensions: ', t2store.shape - - # TODO: search for duplicated entries (same time) and remove duplicates. - # Check to see if number of unique times == number of times, if so then no problem - - if(len(np.unique(timestore)) != len(np.where(timestore != 0)[0].view())): - print 'WARNING: Possible duplicated times' - - # Decode model times into python datetime objects. Note: timestore becomes a list (no more an array) here - timestore, _ = process.getModelTimes(filename, timeVarName) - - # Make sure latlon grid is monotonically increasing and that the domains - # are correct - lat, lon, t2store = checkLatLon(lat, lon, t2store) - data_dict = {'lats': lat, 'lons': lon, 'times': timestore, 'data': t2store} - return data_dict - -def select_var_from_file(myfile, fmt='not set'): - ''' - Routine to act as user interface to allow users to select variable of interest from a file. - - Input: - myfile - filename - fmt - (optional) specify fileformat for netCDF4 if filename suffix is non-standard - - Output: - myvar - variable name in file - - Peter Lean September 2010 - ''' - - print fmt - - if fmt == 'not set': - f = netCDF4.Dataset(myfile, mode='r') - - if fmt != 'not set': - f = netCDF4.Dataset(myfile, mode='r', format=fmt) - - keylist = [key.encode().lower() for key in f.variables.keys()] - - i = 0 - for v in keylist: - print '[', i, '] ', f.variables[v].long_name, ' (', v, ')' - i += 1 - - user_selection = raw_input('Please select variable : [0 -' + str(i - 1) + '] ') - - myvar = keylist[int(user_selection)] - - return myvar - -def select_var_from_wrf_file(myfile): - ''' - Routine to act as user interface to allow users to select variable of interest from a wrf netCDF file. 
- - Input: - myfile - filename - - Output: - mywrfvar - variable name in wrf file - - Peter Lean September 2010 - ''' - - f = netCDF4.Dataset(myfile, mode='r', format='NETCDF4') - keylist = [key.encode().lower() for key in f.variables.keys()] - - i = 0 - for v in keylist: - try: - print '[', i, '] ', f.variables[v].description, ' (', v, ')' - except: - print '' - - i += 1 - - user_selection = raw_input('Please select WRF variable : [0 -' + str(i - 1) + '] ') - - mywrfvar = keylist[int(user_selection)] - - return mywrfvar - -def read_data_from_one_file(ifile, myvar, latVarName, lonVarName, timeVarName, file_type): - """ - Purpose:: - Read in data from one file at a time - - Input:: - filelist - list of filenames (including path) - myvar - string containing name of variable to load in (as it appears in file)s - lonVarName - name of the Longitude Variable - timeVarName - name of the Time Variable - fileType - type of file we are trying to parse - - Output:: - lat, lon - 2D arrays of latitude and longitude values - times - list of times - t2store - numpy array containing data from the file for the requested variable - varUnit - units for the variable given by t2store - """ - f = netCDF4.Dataset(ifile, mode='r') - try: - varUnit = f.variables[myvar].units.encode().upper() - except: - varUnit = raw_input('Enter the model variable unit: \n> ').upper() - t2raw = ma.array(f.variables[myvar][:]) - t2tmp = t2raw.squeeze() - if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - - lonsraw = f.variables[lonVarName][:] - latsraw = f.variables[latVarName][:] - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - if(latsraw.ndim == 2): - lon = lonsraw - lat = latsraw - - f.close() - print ' success read_data_from_one_file: VarName=', myvar, ' Shape(Full)= ', t2tmp.shape, ' Unit= ', varUnit - timestore = process.decode_model_timesK(ifile, timeVarName, file_type) - - # Make sure latlon grid is monotonically increasing and that the domains - # are correct - lat, lon, t2store = checkLatLon(lat, lon, t2tmp) - return lat, lon, timestore, t2store, varUnit - -def findTimeVariable(filename): - """ - Function to find what the time variable is called in a model file. - Input:: - filename - file to crack open and check for a time variable - Output:: - timeName - name of the input file's time variable - variableNameList - list of variable names from the input filename - """ - try: - f = netCDF4.Dataset(filename, mode='r') - except: - print("Unable to open '%s' to try and read the Time variable" % filename) - raise - - variableNameList = [variable.encode() for variable in f.variables.keys()] - # convert all variable names into lower case - varNameListLowerCase = [x.lower() for x in variableNameList] - - # Use "set" types for finding common variable name from in the file and from the list of possibilities - possibleTimeNames = set(['time', 'times', 'date', 'dates', 'julian']) - - # Use the sets to find the intersection where variable names are in possibleNames - timeNameSet = possibleTimeNames.intersection(varNameListLowerCase) - - if len(timeNameSet) == 0: - print("Unable to autodetect the Time Variable Name in the '%s'" % filename) - timeName = misc.askUserForVariableName(variableNameList, targetName ="Time") - - else: - timeName = timeNameSet.pop() - - return timeName, variableNameList - - -def findLatLonVarFromFile(filename): - """ - Function to find what the latitude and longitude variables are called in a model file. 
- - Input:: - -filename - Output:: - -latVarName - -lonVarName - -latMin - -latMax - -lonMin - -lonMax - """ - try: - f = netCDF4.Dataset(filename, mode='r') - except: - print("Unable to open '%s' to try and read the Latitude and Longitude variables" % filename) - raise - - variableNameList = [variable.encode() for variable in f.variables.keys()] - # convert all variable names into lower case - varNameListLowerCase = [x.lower() for x in variableNameList] - - # Use "set" types for finding common variable name from in the file and from the list of possibilities - possibleLatNames = set(['latitude', 'lat', 'lats', 'latitudes']) - possibleLonNames = set(['longitude', 'lon', 'lons', 'longitudes']) - - # Use the sets to find the intersection where variable names are in possibleNames - latNameSet = possibleLatNames.intersection(varNameListLowerCase) - lonNameSet = possibleLonNames.intersection(varNameListLowerCase) - - if len(latNameSet) == 0 or len(lonNameSet) == 0: - print("Unable to autodetect Latitude and/or Longitude values in the file") - latName = misc.askUserForVariableName(variableNameList, targetName ="Latitude") - lonName = misc.askUserForVariableName(variableNameList, targetName ="Longitude") - - else: - latName = latNameSet.pop() - lonName = lonNameSet.pop() - - lats = np.array(f.variables[latName][:]) - lons = np.array(f.variables[lonName][:]) - - latMin = lats.min() - latMax = lats.max() - - # Convert the lons from 0:360 into -180:180 - lons[lons > 180] = lons[lons > 180] - 360. - lonMin = lons.min() - lonMax = lons.max() - - return latName, lonName, latMin, latMax, lonMin, lonMax - - -def read_data_from_file_list_K(filelist, myvar, timeVarName, latVarName, lonVarName, file_type): - ################################################################################## - # Read in data from a list of model files into a single data structure - # Input: filelist - list of filenames (including path) - # myvar - string containing name of variable to load in (as it appears in file) - # Output: lat, lon - 2D array of latitude and longitude values - # times - list of times - # t2store - numpy array containing data from all files - # Modified from read_data_from_file_list to read data from multiple models into a 4-D array - # 1. The code now processes model data that completely covers the 20-yr period. Thus, - # all model data must have the same time levels (ntimes). Unlike in the oroginal, ntimes - # is fixed here. - # 2. Because one of the model data exceeds 240 mos (243 mos), the model data must be - # truncated to the 240 mons using the ntimes determined from the first file. 
- ################################################################################## - filelist.sort() - nfiles = len(filelist) - # Crash nicely if 'filelist' is zero length - if nfiles == 0: - print 'Error: no files have been passed to read_data_from_file_list(): Exit' - sys.exit() - - # Open the first file in the list to: - # i) read in lats, lons - # ii) find out how many timesteps in the file (assume same ntimes in each file in list) - # -allows you to create an empty array to store variable data for all times - tmp = netCDF4.Dataset(filelist[0], mode='r', format=file_type) - latsraw = tmp.variables[latVarName][:] - lonsraw = tmp.variables[lonVarName][:] - timesraw = tmp.variables[timeVarName] - - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - - elif(latsraw.ndim == 2): - lon = lonsraw - lat = latsraw - ntimes = len(timesraw); nygrd = len(lat[:, 0]); nxgrd = len(lon[0, :]) - - print 'Lats and lons read in for first file in filelist' - - # Create a single empty masked array to store model data from all files - #t2store = ma.zeros((ntimes*nfiles,nygrd,nxgrd)) - t2store = ma.zeros((nfiles, ntimes, nygrd, nxgrd)) - #timestore=ma.zeros((ntimes*nfiles)) - - ## Now load in the data for real - ## NB. no need to reload in the latitudes and longitudes -assume invariant - #timesaccu=0 # a counter for number of times stored so far in t2store - # NB. this method allows for missing times in data files - # as no assumption made that same number of times in each file... - - for i, ifile in enumerate(filelist): - #print 'Loading data from file: ',filelist[i] - f = netCDF4.Dataset(ifile, mode='r') - t2raw = ma.array(f.variables[myvar][:]) - timesraw = f.variables[timeVarName] - #ntimes=len(time) - #print 'file= ',i,'ntimes= ',ntimes,filelist[i] - ## Flatten dimensions which needn't exist, i.e. level - ## e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do. - ## Code requires data to have dimensions, (time,lat,lon) - ## i.e. remove level dimensions - t2tmp = t2raw.squeeze() - ## Nb. if data happen to be for a single time, we flattened it by accident; lets put it back... - if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - #t2store[timesaccu+np.arange(ntimes),:,:]=t2tmp[0:ntimes,:,:] - t2store[i, 0:ntimes, :, :] = t2tmp[0:ntimes, :, :] - #timestore[timesaccu+np.arange(ntimes)]=time - #timesaccu=timesaccu+ntimes - f.close() - - print 'Data read in successfully with dimensions: ', t2store.shape - - # Decode model times into python datetime objects. Note: timestore becomes a list (no more an array) here - ifile = filelist[0] - timestore, _ = process.getModelTimes(ifile, timeVarName) - - # Make sure latlon grid is monotonically increasing and that the domains - # are correct - lat, lon, t2store = checkLatLon(lat, lon, t2store) - return lat, lon, timestore, t2store - -def find_latlon_ranges(filelist, lat_var_name, lon_var_name): - # Function to return the latitude and longitude ranges of the data in a file, - # given the identifying variable names. 
- # - # Input: - # filelist - list of filenames (data is read in from first file only) - # lat_var_name - variable name of the 'latitude' variable - # lon_var_name - variable name of the 'longitude' variable - # - # Output: - # latMin, latMax, lonMin, lonMax - self explanatory - # - # Peter Lean March 2011 - - filename = filelist[0] - - try: - f = netCDF4.Dataset(filename, mode='r') - - lats = f.variables[lat_var_name][:] - latMin = lats.min() - latMax = lats.max() - - lons = f.variables[lon_var_name][:] - lons[lons > 180] = lons[lons > 180] - 360. - lonMin = lons.min() - lonMax = lons.max() - - return latMin, latMax, lonMin, lonMax - - except: - print 'Error: there was a problem with finding the latitude and longitude ranges in the file' - print ' Please check that you specified the filename, and variable names correctly.' - - sys.exit() - -def writeBN_lola(fileName, lons, lats): - # write a binary data file that include longitude (1-d) and latitude (1-d) values - - F = fortranfile.FortranFile(fileName, mode='w') - ngrdY = lons.shape[0]; ngrdX = lons.shape[1] - tmpDat = ma.zeros(ngrdX); tmpDat[:] = lons[0, :]; F.writeReals(tmpDat) - tmpDat = ma.zeros(ngrdY); tmpDat[:] = lats[:, 0]; F.writeReals(tmpDat) - # release temporary arrays - tmpDat = 0 - F.close() - -def writeBNdata(fileName, numOBSs, numMDLs, nT, ngrdX, ngrdY, numSubRgn, obsData, mdlData, obsRgnAvg, mdlRgnAvg): - #(fileName,maskOption,numOBSs,numMDLs,nT,ngrdX,ngrdY,numSubRgn,obsData,mdlData,obsRgnAvg,mdlRgnAvg): - # write spatially- and regionally regridded data into a binary data file - missing = -1.e26 - F = fortranfile.FortranFile(fileName, mode='w') - # construct a data array to replace mask flag with a missing value (missing=-1.e12) for printing - data = ma.zeros((nT, ngrdY, ngrdX)) - for m in np.arange(numOBSs): - data[:, :, :] = obsData[m, :, :, :]; msk = data.mask - for n in np.arange(nT): - for j in np.arange(ngrdY): - for i in np.arange(ngrdX): - if msk[n, j, i]: data[n, j, i] = missing - - # write observed data. allowed to write only one row at a time - tmpDat = ma.zeros(ngrdX) - for n in np.arange(nT): - for j in np.arange(ngrdY): - tmpDat[:] = data[n, j, :] - F.writeReals(tmpDat) - - # write model data (dep. on the number of models). 
- for m in np.arange(numMDLs): - data[:, :, :] = mdlData[m, :, :, :] - msk = data.mask - for n in np.arange(nT): - for j in np.arange(ngrdY): - for i in np.arange(ngrdX): - if msk[n, j, i]: - data[n, j, i] = missing - - for n in np.arange(nT): - for j in np.arange(ngrdY): - tmpDat[:] = data[n, j, :] - F.writeReals(tmpDat) - - data = 0 # release the array allocated for data - # write data in subregions - if numSubRgn > 0: - print 'Also included are the time series of the means over ', numSubRgn, ' areas from obs and model data' - tmpDat = ma.zeros(nT); print numSubRgn - for m in np.arange(numOBSs): - for n in np.arange(numSubRgn): - tmpDat[:] = obsRgnAvg[m, n, :] - F.writeReals(tmpDat) - for m in np.arange(numMDLs): - for n in np.arange(numSubRgn): - tmpDat[:] = mdlRgnAvg[m, n, :] - F.writeReals(tmpDat) - tmpDat = 0 # release the array allocated for tmpDat - F.close() - -def writeNCfile(fileName, numSubRgn, lons, lats, obsData, mdlData, obsRgnAvg, mdlRgnAvg, obsList, mdlList, subRegions): - # write an output file of variables up to 3 dimensions - # fileName: the name of the output data file - # numSubRgn : the number of subregions - # lons[ngrdX]: longitude - # lats[ngrdY]: latitudes - # obsData[nT,ngrdY,ngrdX]: the obs time series of the entire model domain - # mdlData[numMDLs,nT,ngrdY,ngrdX]: the mdltime series of the entire model domain - # obsRgnAvg[numSubRgn,nT]: the obs time series for the all subregions - # mdlRgnAvg[numMDLs,numSubRgn,nT]: the mdl time series for the all subregions - dimO = obsData.shape[0] # the number of obs data - dimM = mdlData.shape[0] # the number of mdl data - dimT = mdlData.shape[1] # the number of time levels - dimY = mdlData.shape[2] # y-dimension - dimX = mdlData.shape[3] # x-dimension - dimR = obsRgnAvg.shape[1] # the number of subregions - f = netCDF4.Dataset(fileName, mode='w', format='NETCDF4') - print mdlRgnAvg.shape, dimM, dimR, dimT - #create global attributes - f.description = '' - # create dimensions - print 'Creating Dimensions within the NetCDF Object...' - f.createDimension('unity', 1) - f.createDimension('time', dimT) - f.createDimension('west_east', dimX) - f.createDimension('south_north', dimY) - f.createDimension('obs', dimO) - f.createDimension('models', dimM) - - # create the variable (real*4) to be written in the file - print 'Creating Variables...' - f.createVariable('lon', 'd', ('south_north', 'west_east')) - f.createVariable('lat', 'd', ('south_north', 'west_east')) - f.createVariable('oDat', 'd', ('obs', 'time', 'south_north', 'west_east')) - f.createVariable('mDat', 'd', ('models', 'time', 'south_north', 'west_east')) - - if subRegions: - f.createDimension('regions', dimR) - f.createVariable('oRgn', 'd', ('obs', 'regions', 'time')) - f.createVariable('mRgn', 'd', ('models', 'regions', 'time')) - f.variables['oRgn'].varAttName = 'Observation time series: Subregions' - f.variables['mRgn'].varAttName = 'Model time series: Subregions' - - loadDataIntoNetCDF(f, obsList, obsData, 'Observation') - print 'Loaded the Observations into the NetCDF' - - loadDataIntoNetCDF(f, mdlList, mdlData, 'Model') - - # create attributes and units for the variable - print 'Creating Attributes and Units...' 
- f.variables['lon'].varAttName = 'Longitudes' - f.variables['lon'].varUnit = 'degrees East' - f.variables['lat'].varAttName = 'Latitudes' - f.variables['lat'].varUnit = 'degrees North' - f.variables['oDat'].varAttName = 'Observation time series: entire domain' - f.variables['mDat'].varAttName = 'Model time series: entire domain' - - # assign the values to the variable and write it - f.variables['lon'][:] = lons[:] - f.variables['lat'][:] = lats[:] - if subRegions: - f.variables['oRgn'][:] = obsRgnAvg[:] - f.variables['mRgn'][:] = mdlRgnAvg[:] - - f.close() - -def loadDataIntoNetCDF(fileObject, datasets, dataArray, dataType): - """ - Input:: - fileObject - netCDF4 file object data will be loaded into - datasets - List of dataset names - dataArray - Multi-dimensional array of data to be loaded into the NetCDF file - dataType - String with value of either 'Model' or 'Observation' - Output:: - No return value. netCDF4 file object is updated in place - """ - datasetCount = 0 - for datasetCount, dataset in enumerate(datasets): - if dataType.lower() == 'observation': - datasetName = dataset.replace(' ','') - elif dataType.lower() == 'model': - datasetName = path.splitext(path.basename(dataset))[0] - print "Creating variable %s" % datasetName - fileObject.createVariable(datasetName, 'd', ('time', 'south_north', 'west_east')) - fileObject.variables[datasetName].varAttName = 'Obseration time series: entire domain' - print 'Loading values into %s' % datasetName - fileObject.variables[datasetName][:] = dataArray[datasetCount,:,:,:] - -def checkLatLon(latsin, lonsin, datain): - """ - Purpose:: - Checks whether latitudes and longitudes are monotonically increasing - within the domains [-90, 90) and [-180, 180) respectively, and rearranges the input data - accordingly if they are not. - - Input:: - latsin - Array of latitudes read from a raw netcdf file - lonsin - Array of longitudes read from a raw netcdf file - datain - Array of data values read from a raw netcdf file. - The shape is assumed to be (..., nLat, nLon). 
- - Output:: - latsout - 2D array of (rearranged) latitudes - lonsout - 2D array of (rearranged) longitudes - dataout - Array of (rearranged) data - """ - # Avoid unnecessary shifting if all lons are higher than 180 - if lonsin.min() > 180: - lonsin -= 360 - - # Make sure lats and lons are monotonically increasing - latsDecreasing = np.diff(latsin[:, 0]) < 0 - lonsDecreasing = np.diff(lonsin[0]) < 0 - - # If all values are decreasing then they just need to be reversed - latsReversed, lonsReversed = latsDecreasing.all(), lonsDecreasing.all() - - # If the lat values are unsorted then raise an exception - if not latsReversed and latsDecreasing.any(): - raise ValueError('Latitudes must be monotonically increasing.') - - # Perform same checks now for lons - if not lonsReversed and lonsDecreasing.any(): - raise ValueError('Longitudes must be monotonically increasing.') - - # Also check if lons go from [0, 360), and convert to [-180, 180) - # if necessary - lonsShifted = lonsin.max() > 180 - latsout, lonsout, dataout = latsin[:], lonsin[:], datain[:] - # Now correct data if latlon grid needs to be shifted - if latsReversed: - latsout = latsout[::-1] - dataout = dataout[..., ::-1, :] - - if lonsReversed: - lonsout = lonsout[..., ::-1] - dataout = dataout[..., ::-1] - - if lonsShifted: - lat1d = latsout[:, 0] - dataout, lon1d = shiftgrid(180, dataout, lonsout[0], start=False) - lonsout, latsout = np.meshgrid(lon1d, lat1d) - - return latsout, lonsout, dataout - -def shiftgrid(lon0, datain, lonsin, start= True, cyclic=360.0): - """ - Purpose:: - Shift global lat/lon grid east or west. This function is taken directly - from the (unreleased) basemap 1.0.7 source code as version 1.0.6 does not - currently support arrays with more than two dimensions. - https://github.com/matplotlib/basemap - - Input:: - lon0 - starting longitude for shifted grid (ending longitude if start=False). - lon0 must be on input grid (within the range of lonsin). - datain - original data with longitude the right-most dimension. - lonsin - original longitudes. - start - if True, lon0 represents the starting longitude of the new grid. - if False, lon0 is the ending longitude. Default True. 
- cyclic - width of periodic domain (default 360) - - Output:: - dataout - data on shifted grid - lonsout - lons on shifted grid - """ - if np.fabs(lonsin[-1]-lonsin[0]-cyclic) > 1.e-4: - # Use all data instead of raise ValueError, 'cyclic point not included' - start_idx = 0 - else: - # If cyclic, remove the duplicate point - start_idx = 1 - if lon0 < lonsin[0] or lon0 > lonsin[-1]: - raise ValueError('lon0 outside of range of lonsin') - i0 = np.argmin(np.fabs(lonsin-lon0)) - i0_shift = len(lonsin)-i0 - if ma.isMA(datain): - dataout = ma.zeros(datain.shape,datain.dtype) - else: - dataout = np.zeros(datain.shape,datain.dtype) - if ma.isMA(lonsin): - lonsout = ma.zeros(lonsin.shape,lonsin.dtype) - else: - lonsout = np.zeros(lonsin.shape,lonsin.dtype) - if start: - lonsout[0:i0_shift] = lonsin[i0:] - else: - lonsout[0:i0_shift] = lonsin[i0:]-cyclic - dataout[...,0:i0_shift] = datain[...,i0:] - if start: - lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]+cyclic - else: - lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx] - dataout[...,i0_shift:] = datain[...,start_idx:i0+start_idx] - return dataout,lonsout \ No newline at end of file http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/rcmed.py ---------------------------------------------------------------------- diff --git a/rcmet/src/main/python/rcmes/storage/rcmed.py b/rcmet/src/main/python/rcmes/storage/rcmed.py deleted file mode 100644 index c99334c..0000000 --- a/rcmet/src/main/python/rcmes/storage/rcmed.py +++ /dev/null @@ -1,129 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -'''This is a collection of functions that provide the single interface to the -rcmed. Initial design will includes several functions to interact with the -available parameters within rcmed and their metadata. - -Future work includes rolling the rcmed querying code into this module as well. -''' - -import requests, json - -paramUri = 'http://rcmes.jpl.nasa.gov/bottle/rcmed/param.json' - -def getParams(uri=paramUri): - '''This will return all of the parameters from the database as - a list of dictionaries. - - If the database is not available, then the method will return None''' - # Use a get request to call the Web Service - try: - httpRequest = requests.get(uri) - except: - print "HTTPRequest failed. 
Bottle WebServer is offline" - raise - # TODO Check the request status code if it is 400 or 500 range then - # return None - # if the status code is 200 then return the request.text's param list - # http_request.status_code is an int we can inspect - paramDict = json.loads(httpRequest.text) - paramList = paramDict['param'] - - filteredParams = [] - # Filter list to remove missing data values - for param in paramList: - paramGood = True - for key, value in param.iteritems(): - if value == None: - paramGood = False - - if paramGood: - filteredParams.append(param) - else: - filteredParams.append(param) - - - return filteredParams - - - -#class Parameter(object): -# -# def __init__(self): -# self.param_query_uri = 'http://some.url' -# self.param_list = self.param_metadata() -# -# def param_metadata(self): -# '''This method will return a list of python dict's. Each dict will -# contain a complete record for each parameter from rcmed''' -# # 1. Query the Parameters Metadata Endpoint using param_query_uri -# # 2. Parse the returned data and re-format into a dict -# # 3. define self.para_met_dict -# test_list = [{"id":12, -# "description":"ERA Dataset 2 Metre Temperature", -# "type":'temp' -# }, -# {"id":13, -# "description":"ERA Dataset 2 Metre Dewpoint Temperature", -# 'type':'temp' -# }, -# {"id":14, -# "description":"TRMM Dataset HRF parameter", -# 'type':'hrf' -# } -# ] -# print "self.param_met_dict has been created" -# return test_list -# -# def get_param_by_id(self, id): -# '''This will take in a parameter id and return a single dict. Can we -# safely assume we will always hold a unique parameter id? - Currently -# this is True''' -# for p in self.param_list: -# if p['id'] == id: -# return p -# else: -# pass -# -# def get_params_by_type(self, type): -# '''This will take in a parameter type like precip, temp, pressure, etc. -# and will return a list of all the params that are of the given type.''' -# param_list = [] #empty list to collect the param dicts -# for p in self.param_list: -# if p['type'] == type: -# param_list.append(p) -# else: -# pass -# return param_list -# -# -#class ObsData(object): -# -# def __init__(self): -# self.query_url = 'http://rcmes/rcmed....' #can we merely insert the query criteria into the url attribute? -# self.param_id = 6 -# self.dataset_id = 1 -# self.lat_range = [25.4,55.0] -# self.lon_range = [0.0,10.7] -# self.time_range = [start,end] -# -# def set_param(self, param_dict): -# self.param_id = param_dict['id'] -# self.dataset_id = null -# # look up the dataset id using the parameter id and set it -# p = Parameter.get_param_by_id(id) - \ No newline at end of file http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/toolkit/__init__.py ---------------------------------------------------------------------- diff --git a/rcmet/src/main/python/rcmes/toolkit/__init__.py b/rcmet/src/main/python/rcmes/toolkit/__init__.py deleted file mode 100644 index c980999..0000000 --- a/rcmet/src/main/python/rcmes/toolkit/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""The toolkit Package is a collection of modules that provide a set of tools -that can be used to process, analyze and plot the analysis results.""" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py ---------------------------------------------------------------------- diff --git a/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py b/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py deleted file mode 100644 index 0b910c4..0000000 --- a/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py +++ /dev/null @@ -1,366 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Prepare Datasets both model and observations for analysis using metrics""" - - -import numpy as np -import numpy.ma as ma -import sys, os - -from storage import db, files -import process -from utils import misc - -# TODO: swap gridBox for Domain -def prep_data(settings, obsDatasetList, gridBox, modelList): - """ - - returns numOBSs,numMDLs,nT,ngrdY,ngrdX,Times,lons,lats,obsData,modelData,obsList - - numOBSs - number of Observational Datasets. Really need to look at using len(obsDatasetList) instead - numMDLs - number of Models used. Again should use the len(modelList) instead - nT - Time value count after temporal regridding. Should use length of the time axis for a given dataset - ngrdY - size of the Y-Axis grid after spatial regridding - ngrdX - size of the X-Axis grid after spatial regridding - Times - list of python datetime objects the represent the list of time to be used in further calculations - lons - - lats - - obsData - - modelData - - obsList - - - - """ - - # TODO: Stop the object Deserialization and work on refactoring the core code here - cachedir = settings.cacheDir - workdir = settings.workDir - - # Use list comprehensions to deconstruct obsDatasetList - # ['TRMM_pr_mon', 'CRU3.1_pr'] Basically a list of Dataset NAME +'_' + parameter name - THE 'CRU*' one triggers unit conversion issues later - # the plan here is to use the obsDatasetList which contains a longName key we can use. 
- obsList = [str(x['longname']) for x in obsDatasetList] - # Also using the obsDatasetList with a key of ['dataset_id'] - obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList] - # obsDatasetList ['paramter_id'] list - obsParameterId = [str(x['parameter_id']) for x in obsDatasetList] - obsTimestep = [str(x['timestep']) for x in obsDatasetList] - mdlList = [model.filename for model in modelList] - - # Since all of the model objects in the modelList have the same Varnames and Precip Flag, I am going to merely - # pull this from modelList[0] for now - modelVarName = modelList[0].varName - precipFlag = modelList[0].precipFlag - modelTimeVarName = modelList[0].timeVariable - modelLatVarName = modelList[0].latVariable - modelLonVarName = modelList[0].lonVariable - regridOption = settings.spatialGrid - timeRegridOption = settings.temporalGrid - - """ - Routine to read-in and re-grid both obs and mdl datasets. - Processes both single and multiple files of obs and mdl or combinations in a general way. - i) retrieve observations from the database - ii) load in model data - iii) spatial regridding - iv) temporal regridding - v) area-averaging - Input: - cachedir - string describing directory path - workdir - string describing directory path - obsList - string describing the observation data files - obsDatasetId - int, db dataset id - obsParameterId - int, db parameter id - latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system - latMin - float - latMax - float - lonMin - float - lonMax - float - dLat - float - dLon - float - naLats - integer - naLons - integer - mdlList - string describing model file name + path - modelVarName - string describing name of variable to evaluate (as written in model file) - precipFlag - bool (is this precipitation data? True/False) - modelTimeVarName - string describing name of time variable in model file - modelLatVarName - string describing name of latitude variable in model file - modelLonVarName - string describing name of longitude variable in model file - regridOption - string: 'obs'|'model'|'user' - timeRegridOption -string: 'full'|'annual'|'monthly'|'daily' - maskOption - Boolean - - # TODO: This isn't true in the current codebase. - Instead the SubRegion's are being used. You can see that these values are not - being used in the code, at least they are not being passed in from the function - - maskLatMin - float (only used if maskOption=1) - maskLatMax - float (only used if maskOption=1) - maskLonMin - float (only used if maskOption=1) - maskLonMax - float (only used if maskOption=1) - Output: image files of plots + possibly data - Jinwon Kim, 7/11/2012 - """ - - - # check the number of obs & model data files - numOBSs = len(obsList) - numMDLs = len(mdlList) - - # assign parameters that must be preserved throughout the process - - print 'start & end time = ', settings.startDate, settings.endDate - yymm0 = settings.startDate.strftime("%Y%m") - yymm1 = settings.endDate.strftime("%Y%m") - print 'start & end eval period = ', yymm0, yymm1 - - - - #TODO: Wrap in try except blocks instead - if numMDLs < 1: - print 'No input model data file. EXIT' - sys.exit() - if numOBSs < 1: - print 'No input observation data file. 
EXIT' - sys.exit() - - ## Part 0: Setup the regrid variables based on the regridOption given - - # preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY) - print 'regridOption in prep_data= ', regridOption - if regridOption == 'model': - ifile = mdlList[0] - typeF = 'nc' - lats, lons, mTimes = files.read_data_from_one_file(ifile, modelVarName, - modelLatVarName, - modelLonVarName, - modelTimeVarName, - typeF)[:3] - modelObject = modelList[0] - latMin = modelObject.latMin - latMax = modelObject.latMax - lonMin = modelObject.lonMin - lonMax = modelObject.lonMax - elif regridOption == 'user': - # Use the GridBox Object - latMin = gridBox.latMin - latMax = gridBox.latMax - lonMin = gridBox.lonMin - lonMax = gridBox.lonMax - naLats = gridBox.latCount - naLons = gridBox.lonCount - dLat = gridBox.latStep - dLon = gridBox.lonStep - lat = np.arange(naLats) * dLat + latMin - lon = np.arange(naLons) * dLon + lonMin - lons, lats = np.meshgrid(lon, lat) - lon = 0. - lat = 0. - else: - print "INVALID REGRID OPTION USED" - sys.exit() - - ngrdY = lats.shape[0] - ngrdX = lats.shape[1] - - regObsData = [] - - - - ## Part 1: retrieve observation data from the database and regrid them - ## NB. automatically uses local cache if already retrieved. - - for n in np.arange(numOBSs): - # spatial regridding - oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n], - obsParameterId[n], - latMin, latMax, - lonMin, lonMax, - settings.startDate, settings.endDate, - cachedir, obsTimestep[n]) - - # TODO: modify this if block with new metadata usage. - if precipFlag == True and obsList[n][0:3] == 'CRU': - oData = 86400.0 * oData - - nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly) - print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs - print 'For dataset: %s' % obsList[n] - - tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX)) - - print 'tmpOBS shape = ', tmpOBS.shape - - # OBS SPATIAL REGRIDING - for t in np.arange(nstOBSs): - tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons) - - # TODO: Not sure this is needed with Python Garbage Collector - # The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look - # at using generators where possible. - oLats = 0.0 - oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons. - - # OBS TEMPORAL REGRIDING - oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption) - - tmpOBS = 0.0 - - # check the consistency of temporally regridded obs data - if n == 0: - oldObsTimes = newObsTimes - else: - if oldObsTimes != newObsTimes: - print 'temporally regridded obs data time levels do not match at ', n - 1, n - print '%s Time through Loop' % (n + 1) - print 'oldObsTimes Count: %s' % len(oldObsTimes) - print 'newObsTimes Count: %s' % len(newObsTimes) - # TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate - sys.exit() - else: - oldObsTimes = newObsTimes - # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData) - regObsData.append(oData) - - - """ all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array' - also finalize 'obsTimes', the time coordinate values of the regridded obs data. 
- NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data. - this problem disappears by using 'ma.array'.""" - - obsData = ma.array(regObsData) - obsTimes = newObsTimes - - # compute the simple multi-obs ensemble if multiple obs are used - if numOBSs > 1: - ensemble = np.mean(regObsData, axis=0) - regObsData.append(ensemble) - numOBSs = len(regObsData) - obsList.append('ENS-OBS') - obsData = ma.array(regObsData) # Cast to a masked array - - - ## Part 2: load in and regrid model data from file(s) - ## NOTE: the two parameters, numMDLs and numMOmx are defined to represent the number of models (w/ all 240 mos) & - ## the total number of months, respectively, in later multi-model calculations. - - typeF = 'nc' - regridMdlData = [] - # extract the model names and store them in the list 'mdlList' - mdlName = [] - mdlListReversed=[] - if len(mdlList) >1: - for element in mdlList: - mdlListReversed.append(element[::-1]) - prefix=os.path.commonprefix(mdlList) - postfix=os.path.commonprefix(mdlListReversed)[::-1] - for element in mdlList: - mdlName.append(element.replace(prefix,'').replace(postfix,'')) - else: - mdlName.append('model') - - - for n in np.arange(numMDLs): - # read model grid info, then model data - ifile = mdlList[n] - print 'ifile= ', ifile - modelLats, modelLons, mTimes, mdlDat, mvUnit = files.read_data_from_one_file(ifile, - modelVarName, - modelLatVarName, - modelLonVarName, - modelTimeVarName, - typeF) - mdlT = [] - mStep = len(mTimes) - - for i in np.arange(mStep): - mdlT.append(mTimes[i].strftime("%Y%m")) - - wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1) - modelTimes = list(np.array(mTimes)[wh]) - mData = mdlDat[wh, :, :] - - # determine the dimension size from the model time and latitude data. - nT = len(modelTimes) - - print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes) - # spatial regridding of the modl data - tmpMDL = ma.zeros((nT, ngrdY, ngrdX)) - - if regridOption != 'model': - for t in np.arange(nT): - tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons) - else: - tmpMDL = mData - - # temporally regrid the model data - mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption) - tmpMDL = 0.0 - - # check data consistency for all models - if n == 0: - oldMdlTimes = newMdlTimes - else: - if oldMdlTimes != newMdlTimes: - print 'temporally regridded mdl data time levels do not match at ', n - 1, n - print len(oldMdlTimes), len(newMdlTimes) - sys.exit() - else: - oldMdlTimes = newMdlTimes - - # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData) - regridMdlData.append(mData) - - modelData = ma.array(regridMdlData) - modelTimes = newMdlTimes - - if (precipFlag == True) & (mvUnit == 'KG M-2 S-1'): - print 'convert model variable unit from mm/s to mm/day' - modelData = 86400.*modelData - - # check consistency between the time levels of the model and obs data - # this is the final update of time levels: 'Times' and 'nT' - if obsTimes != modelTimes: - print 'time levels of the obs and model data are not consistent. EXIT' - print 'obsTimes' - print obsTimes - print 'modelTimes' - print modelTimes - sys.exit() - # 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. 
(de-allocated) - Times = modelTimes - nT = len(modelTimes) - modelTimes = 0 - obsTimes = 0 - - print 'Reading and regridding model data are completed' - print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape - - # TODO: Commented out until I can talk with Jinwon about this - # compute the simple multi-model ensemble if multiple models are evaluated - if numMDLs > 1: - model_ensemble = np.mean(regridMdlData, axis=0) - regridMdlData.append(model_ensemble) - numMDLs = len(regridMdlData) - modelData = ma.array(regridMdlData) - mdlName.append('ENS-MODEL') - print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape - # GOODALE: This ensemble code should be refactored into process.py module since it's purpose is data processing - - # Processing complete - print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid' - return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
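One detail of the deleted prep_data worth noting: when several model files are evaluated, their short names are derived by stripping the longest common prefix and the longest common suffix from the file paths, the suffix being found by running os.path.commonprefix over the reversed strings. A standalone sketch of the same trick, with made-up file names (not taken from the original run):

    # Sketch of the model-name derivation in prep_data (paths invented).
    import os

    mdlList = ['/data/rcmes/wrf_prec_1990-2009.nc',
               '/data/rcmes/crcm_prec_1990-2009.nc',
               '/data/rcmes/hrm3_prec_1990-2009.nc']

    # Longest common prefix of the paths as-is...
    prefix = os.path.commonprefix(mdlList)
    # ...and longest common suffix, via reversing each path first.
    postfix = os.path.commonprefix([p[::-1] for p in mdlList])[::-1]

    mdlName = [p.replace(prefix, '').replace(postfix, '') for p in mdlList]
    print mdlName  # ['wrf', 'crcm', 'hrm3']

Note that str.replace removes every occurrence, not just the leading prefix and trailing suffix, so the derived names are only reliable when the common parts do not also appear in the distinguishing middle of a path; the original code shares this caveat.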