# ---------------------------------------------------------------------------
# climatology/clim/util/stats.py  (new file, mode 100755)
#
# Robust-statistics helpers (median absolute deviation, robust standard
# deviation, outlier filtering) plus a one-pass Stats accumulator that
# computes n, mean, stddev, min, max, rms, skewness & kurtosis using a
# numerically stable online update of running central moments.
#
# Author: Brian Wilson  (Stats class 2003; higher-moment rewrite M. Butala 2010)
#
# Methods of Stats:
#   add      -- add a data point to the accumulating stats
#   calc     -- compute the statistics: n, mean, std dev, min, max, rms, skew, kurtosis
#   label    -- set the label for printing
#   format   -- set the float format for printing
#   __repr__ -- generates one-line string version of statistics for easy printing
#   reset    -- zero the accumulators
#   addm     -- add an array of data points to the accumulators (add multiple)
# ---------------------------------------------------------------------------

import sys
from math import sqrt
from collections import namedtuple

import numpy as NP
import scipy.stats


def mad(l):
    """Compute the median absolute deviation (a robust measure of spread) of the
    list of values *l*."""
    median = NP.median(l)
    return NP.median([abs(x - median) for x in l])


def robust_std(l, alpha=1/scipy.stats.norm.ppf(0.75)):
    """Compute a robust estimate of the standard deviation of *l*.

    The default scale factor alpha = 1/Phi^-1(0.75) ~= 1.4826 makes the MAD a
    consistent estimator of sigma for normally distributed samples.
    """
    return alpha * mad(l)


def filter_outliers(time_series, n_std=6, indices=False):
    """Filter outliers (those samples a distance of *n_std* robust standard
    deviations from the median) and return the kept samples.

    Returns a tuple of kept samples (an ndarray if the input was an ndarray);
    if *indices* is true, also returns the kept samples' original indices.
    If NO sample survives the cut (e.g. a constant series, whose robust std
    is 0), the original series is returned unchanged.
    """
    med = NP.median(time_series)
    std = robust_std(time_series)
    # Collect (index, value) pairs for the inliers only.
    # Fix: the original did len(zip(...)), which raises TypeError on Python 3
    # where zip() is lazy; build the list explicitly instead.
    kept = [(i, x) for i, x in enumerate(time_series) if abs(x - med) < (std * n_std)]
    if not kept:
        # Nothing survived the cut; fall back to the full series.
        return time_series
    I, out = zip(*kept)
    if isinstance(time_series, NP.ndarray):
        out = NP.array(out)
    if indices:
        return out, I
    else:
        return out


####################################################################################################

# Result record returned by Stats.calc().
StatsCalc = namedtuple('StatsCalc', 'n mean stddev min max rms skewness kurtosis')


class Stats(object):
    """Simple one-pass statistics accumulator: mean, std dev, min, max, rms,
    skewness and kurtosis.

    Implemented by saving running accumulators (count, mean, running central
    moments M2..M4, mean of squares, min, max) so values can be streamed
    through add()/addm() without storing them.  See tests for example usage.
    """
    __slots__ = ('count', 'mean', 'stddev', 'min', 'max', 'rms', 'skewness', 'kurtosis',
                 'rms2', 'M2', 'M3', 'M4', 'labelStr', 'formatStr', 'missingValue')

    def __init__(self, missingValue=-9999., label=None, format=None):
        """Create Stats object, optionally set print label and float format string."""
        self.reset(missingValue)
        self.missingValue = missingValue
        self.labelStr = label
        self.formatStr = format

    def add(self, val):
        """Add one data point to the accumulators."""
        self.count += 1
        n = self.count
        if n == 1:
            self.mean = 0.
            self.M2 = 0.
            self.rms2 = 0.
            self.M3 = 0.
            self.M4 = 0.
            self.min = val
            self.max = val
        else:
            self.min = min(self.min, val)
            self.max = max(self.max, val)

        delta = val - self.mean    # use deviation from mean to prevent roundoff/overflow problems
        delta_n = delta / float(n)
        delta_n2 = delta_n * delta_n
        self.mean += delta_n
        self.rms2 += (val**2 - self.rms2) / float(n)   # running mean of squares
        term = delta * delta_n * (n-1)
        # Update 4th, then 3rd, then 2nd central moment: each update must use
        # the PREVIOUS values of the lower moments, so order matters here.
        self.M4 += term * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * self.M2 - 4 * delta_n * self.M3
        self.M3 += term * delta_n * (n - 2) - 3 * delta_n * self.M2
        self.M2 += term
        return self

    def calc(self):
        """Calculate the statistics for the data added so far.

        Returns a StatsCalc namedtuple (n, mean, stddev, min, max, rms,
        skewness, kurtosis).  Fields that cannot be computed yet (n < 2; or
        zero variance, for skewness/kurtosis) retain the missingValue sentinel.
        """
        n = self.count
        if n >= 2:
            M2 = self.M2
            self.stddev = sqrt(M2 / float(n - 1))
            self.rms = sqrt(self.rms2)
            if M2 > 0.:
                # Fix: guard against ZeroDivisionError for constant-valued data.
                self.skewness = sqrt(n) * self.M3 / (M2 * sqrt(M2))
                self.kurtosis = (n * self.M4) / (M2 * M2) - 3   # excess kurtosis
        return StatsCalc(self.count, self.mean, self.stddev, self.min, self.max,
                         self.rms, self.skewness, self.kurtosis)

    def label(self, str):
        """Label the statistics for printing."""
        self.labelStr = str
        return self

    def format(self, str):
        """Set the float format to be used in printing stats."""
        self.formatStr = str
        return self

    def __repr__(self):
        """One-line stats representation for simple printing."""
        if self.labelStr is None or self.labelStr == "":
            self.labelStr = "Stats"
        line = self.labelStr + ": "
        if self.formatStr:
            # One '%d' for n plus seven float fields.
            a = [self.formatStr for i in range(7)]   # fix: xrange is Python 2 only
            a.insert(0, '%d')
            format = ' '.join(a)
            line += format % self.calc()
        else:
            line += "N=%d mean=%f stddev=%f min=%f max=%f rms=%f skewness=%f kurtosis=%f" % self.calc()
        return line

    def reset(self, missingValue):
        """Reset the accumulators to start over."""
        self.count = 0
        self.mean = missingValue
        self.stddev = missingValue
        self.min = missingValue
        self.max = missingValue
        self.rms = missingValue
        self.skewness = missingValue
        self.kurtosis = missingValue
        self.M2 = 0.
        self.rms2 = 0.
        self.M3 = 0.
        self.M4 = 0.
        self.labelStr = None
        self.formatStr = None
        return self

    def addm(self, seq):
        """Add multiple - add a sequence of data points all at once."""
        for val in seq:
            self.add(val)
        return self


####################################################################################################


def main(args):
    """Read one float per line from a file (or stdin for '-') and print its stats."""
    fn = args[0]
    # Fix: open the file BEFORE entering try/finally, so that a failed open()
    # cannot cause a NameError on 'fid' inside the finally clause.
    fid = sys.stdin if fn == '-' else open(fn, 'r')
    try:
        stats = Stats()
        stats.addm(float(x) for x in fid)
        print(stats)
    finally:
        if fid is not sys.stdin:
            fid.close()


if __name__ == '__main__':
    main(sys.argv[1:])
# ---------------------------------------------------------------------------
# climatology/clim/util/timeJ2000.py  (new file, mode 100755)
# ---------------------------------------------------------------------------
#!/bin/env python

"""
timeJ2000.py -- Date & Time class based on native Python datetime, time, and calendar
     libraries.  Represents a Date/Time as seconds past J2000 epoch
     and provides various format conversions and date delta arithmetic.
     Also includes some new smart functions that perform desired
     transformations on a Do The Right Thing basis.
"""

# NOTE(review): this module is Python 2 only as written -- it relies on
# `types.IntType`-style constants, `print` statements, `cmp`, `__cmp__`
# and `__coerce__`, none of which exist in Python 3.

import sys, datetime, calendar, time, types

##CONSTANTS
# J2000 epoch = 2000/01/01 12:00:00 UTC, expressed in seconds past the Unix epoch.
J2000_1970_EPOCH = 946684800 + 12*60*60 #2000/01/01,12:00:00 in seconds past 1970
LATEST_TIME = 9999999999 #Highest (latest) time in J2000 to care about... useful for initializations
EARLIEST_TIME = -9999999999 #Lowest (earlist) time in J2000 to care about... useful for initializations

# Tiny message helpers; die() writes to stderr and exits with the given status.
def echo (str ): sys.stdout.write(str + "\n")
def err (str ): sys.stderr.write(str + "\n")
def warn (str ): err("---WARNING, IONOTIME: "+str)
def die (str, status=1): err("***ERROR: "+str); sys.exit(status)

##BASE TRANSFORMATIONS
def ensureYYYY(y):
    # Widen a 2-digit year to 4 digits; pivot at 50 (51-99 -> 19xx, 0-50 -> 20xx).
    if y>99: return y
    if y>50: return 1900+y
    return 2000+y

def ensureYY(y):
    # Truncate a year to its 2-digit form.
    return y%100

#transforms an hms string to a float hours
def hms_to_hours(str):
    # NOTE(review): the seconds field is divided by 360.0 here; 3600.0 is the
    # usual seconds-per-hour factor -- confirm whether this is intentional.
    return float(str[0:2])+float(str[2:4])/60.0+float(str[4:6])/360.0

def J2000_to_list(sec=0.0):
    """Convert seconds past J2000 to a [y, m, d, hh, mm, ss] list (UTC);
    fractional seconds, if any, are preserved in the seconds slot."""
    #check for fractional seconds
    frac=0.0
    if sec > int(sec):
        frac=sec-int(sec)
        sec =int(sec)
    callist=list(time.gmtime(sec+J2000_1970_EPOCH))
    #add back in fractional seconds if present
    if frac > 0.0:
        callist[5]=callist[5]+frac
    return callist[0:6]

def list_to_J2000(inlist):
    """Convert a (y, m, d[, hh, mm, ss]) sequence (UTC) to seconds past J2000;
    missing trailing fields default to zero."""
    #check for fractional seconds and remove
    clist=[0,0,0,0,0,0.0] #default to zeros everywhere
    clist[:len(inlist)]=inlist
    ss=clist[5]
    frac=0.0
    if ss > int(ss):
        frac=ss-int(ss)
        clist[5]=int(ss)
    #transform, adding fractional seconds afterwards
    return calendar.timegm(clist)-J2000_1970_EPOCH+frac

##INTELLIGENT FUNCTIONS
def valid_formats():
    """Return the tuple of format names understood by to_J2000/from_J2000."""
    return ('J2000', #int or float bare number
            'HHMMSS', #string
            'YYMMDD', #string
            'YYYYMMDD', #string
            'YYMMDDHHMMSS', #string .
            'YYYYMMDDHHMMSS', #string .
            'YYYYMMDD_HHMMSS', #string .
            'YYMMDD_HHMMSS', #string .
            'DOY', #string
            'HOD',"HOURSINDAY", #string hours of day
            'MOD',"MINUTESINDAY", #string minutes of day
            'SOD',"SECONDSINDAY", #string seconds of day
            'YYDOY', #string
            'LIST', #list(y,m,d,h,m,s)
            'HMS', #string
            'YMD', #string
            'YMDHMS', #string
            'GAIMSTRING', #string yyyy/mm/dd,hh:mm:ss.frac
            'TENETHOURLY', #string siteDOYlmm.yy.tenet
            'LOCALHMS', #string HHMMSS.F adjusted for local time (requires longitude in deg)
            'HOURLETTER'#, #string a where a(a,x) for each hour of day
#           'RINEX' #string
            )

def to_J2000(input,format=None):
    """Convert *input* (number, list/tuple, or string in one of valid_formats())
    to float seconds past J2000.  When *format* is omitted, the string format
    is guessed from the string's length."""
    sec=0 #internal representation
    if format: format=format.upper()

    #assume J2000 seconds for any bare number
    if isinstance(input,types.IntType) or isinstance(input,types.FloatType) or isinstance(input,types.LongType) or format=='J2000': return float(input)
    #if it's a list, simple... will be interpretted as y,m,d,hh,mm,ss with 0's in any unspecified slot
    elif isinstance(input,types.ListType) or isinstance(input,types.TupleType): return list_to_J2000(input)
    #if it's a string, could be many things
    elif isinstance(input,types.StringType):
        #strip off any fractional second information first
        p=input.find('.')
        frac=0.0
        if p>=0:
            if input.find('tenet') < 0:
                frac=float(input[p:])
                input =input[:p]
        #Autoguess format based on length or user-specified request
        if len(input)==len('siteDOYlmm.yy.tenet') and format=="TENETHOURLY":
            # site(4) + DOY(3) + hour-letter(1) + minute(2) + ".yy.tenet"
            (doy,hl,mm,y)=(int(input[4:7]),input[7:8],int(input[8:10]),int(input[11:13]))
            (yyyy,m,d)=J2000_to_list(list_to_J2000((ensureYYYY(int(y)),1,doy)))[0:3]
            return list_to_J2000((yyyy,m,d,ord(hl)-ord('a'),mm,0))

        if format=="DOY":
            return list_to_J2000((2000,1,int(input)))

        if format in ("HOD","HOURSINDAY"):
            return list_to_J2000((2000,1,1,int(input),0,0))

        if format in ("MOD","MINUTESINDAY"):
            return list_to_J2000((2000,1,1,0,int(input),0))

        if format in ("SOD","SECONDSINDAY"):
            return list_to_J2000((2000,1,1,0,0,int(input)))

        if format=="YYDOY":
            return list_to_J2000((ensureYYYY(int(input[0:2])),1,int(input[2:])))

        if len(input)==len('a') or format=='HOURLETTER':
            return list_to_J2000((2000,1,1,ord(input)-ord('a'),0,0))
        if len(input)==len('YYYY/MM/DD,HH:MM:SS') or format=='GAIMSTRING' or format=='ISO':
            return list_to_J2000((int(input[0:4]),
                                  int(input[5:7]),
                                  int(input[8:10]),
                                  int(input[11:13]),
                                  int(input[14:16]),
                                  int(input[17:19])+frac))
        if len(input)==len('YYYYMMDD_HHMMSS') or format=='YYYYMMDD_HHMMSS':
            return list_to_J2000((int(input[0:4]),
                                  int(input[4:6]),
                                  int(input[6:8]),
                                  int(input[9:11]),
                                  int(input[11:13]),
                                  int(input[13:15])+frac))

        if len(input)==len('YYMMDD_HHMMSS') or format=='YYMMDD_HHMMSS':
            return list_to_J2000((ensureYYYY(int(input[0:2])),
                                  int(input[2:4]),
                                  int(input[4:6]),
                                  int(input[7:9]),
                                  int(input[9:11]),
                                  int(input[11:13])+frac))

        if len(input)==len('YYYYMMDDHHMMSS') or format=='YYYYMMDDHHMMSS':
            return list_to_J2000((int(input[0:4]),
                                  int(input[4:6]),
                                  int(input[6:8]),
                                  int(input[8:10]),
                                  int(input[10:12]),
                                  int(input[12:14])+frac))

        if len(input)==len('YYMMDDHHMMSS') or format=='YYMMDDHHMMSS' or format=="YMDHMS":
            return list_to_J2000((ensureYYYY(int(input[0:2])),
                                  int(input[2:4]),
                                  int(input[4:6]),
                                  int(input[6:8]),
                                  int(input[8:10]),
                                  int(input[10:12])+frac))

        if len(input)==len('YYYYMMDD') or format=='YYYYMMDD':
            return list_to_J2000((int(input[0:4]),
                                  int(input[4:6]),
                                  int(input[6:8])))

        # Note: 'HHMMSS' needs the explicit format since it is ambiguous with 'YYMMDD'.
        if len(input)==len('HHMMSS') and format in ('HHMMSS','HMS'):
            return list_to_J2000((2000,1,1,
                                  int(input[0:2]),
                                  int(input[2:4]),
                                  int(input[4:6])+frac))

        if len(input)==len('YYMMDD') or format in ('YYMMDD','YMD'):
            return list_to_J2000((ensureYYYY(int(input[0:2])),
                                  int(input[2:4]),
                                  int(input[4:6])))

        # NOTE(review): die() takes (message, status); passing input as the
        # second argument makes it the process exit status -- confirm intent.
        die("Unknown string format",input)
    die("Unknown input type to to_J2000:",input)

def from_J2000(sec=0,format="YYYYMMDD_HHMMSS",aux=None):
    """Convert seconds past J2000 to the requested *format* (see valid_formats()).
    Fractional seconds are appended as a '.frac' suffix on string outputs."""
    #aux contains spare information, thusfar only used for site id's for filenames or longitude for localtime
    format=format.upper()
    if format == "J2000" : return sec
    (y,m,d,hh,mm,ss)=J2000_to_list(sec)
    f=""
    if ss > int(ss): f=("%f"%(ss-int(ss))).strip('0') #remove leading and trailing 0
    if format == "LIST" : return [y,m,d,hh,mm,ss]
    if format == "HOURLETTER" : return chr(hh+ord('a'))
    if format in("HOURSINDAY","HOD") : return hh+mm/60.0+ss/60.0/60.0
    if format in("MINUTESINDAY","MOD") : return hh*60+mm+ss/60.0
    if format in("SECONDSINDAY","SOD") : return (hh*60+mm)*60+ss
    if format in("HHMMSS","HMS") : return "%02d%02d%02d"%(hh,mm,ss)+f
    if format in("YYMMDD","YMD") : return "%02d%02d%02d"%(ensureYY(y),m,d)
    if format == "YYYYMMDD" : return "%04d%02d%02d"%(y,m,d)
    if format in("YYMMDDHHMMSS","YMDHMS"): return "%02d%02d%02d%02d%02d%02d"%(ensureYY(y),m,d,hh,mm,ss)+f
    if format == "YYYYMMDDHHMMSS" : return "%04d%02d%02d%02d%02d%02d"%(y,m,d,hh,mm,ss)+f
    if format == "YYMMDD_HHMMSS" : return "%02d%02d%02d_%02d%02d%02d"%(ensureYY(y),m,d,hh,mm,ss)+f
    if format == "YYYYMMDD_HHMMSS" : return "%04d%02d%02d_%02d%02d%02d"%(y,m,d,hh,mm,ss)+f
    if format == "GAIMSTRING" : return "%04d/%02d/%02d,%02d:%02d:%02d"%(y,m,d,hh,mm,ss)+f
    if format == "ISO" : return "%04d-%02d-%02dT%02d:%02d:%02dZ"%(y,m,d,hh,mm,ss)+f
    doy = time.gmtime(sec+J2000_1970_EPOCH)[7] #fetch doy
    if format == "DOY" : return "%03d"%(doy)
    if format == "YYDOY" : return "%02d%03d"%(ensureYY(y),doy)
    if format == "TENETHOURLY" :
        if not aux: aux="site"
        return "%4s%03d%1s%02d.%02d.tenet"%(aux,doy,chr(ord('a')+hh),mm,ensureYY(y))
    if format == "LOCALHMS" :
        if not aux: aux=0
        # NOTE(review): longitude is scaled by /360*24 hours here (cf. the
        # /360.0 in hms_to_hours) -- confirm degrees-to-hours factor.
        localtime = hh + aux/360.0*24.0 #in this case, aux is longitude in deg
        while (localtime < 0): localtime+=+24
        while (localtime >= 24): localtime-= 24
        return "%02d%02d%02d"%(localtime,mm,ss)+f
    die("Unrecognized format string in from_J2000 "+format)

class IonoTime:
    "Handles conversions between times and dates for all variety of ionospheric time interests"
    #internal representation is seconds past J2000
    def __init__(self,input=None):
        self.sec = 0
        self.set(input)
    def set(self,input=None,format=None):
        # NOTE(review): `if not input` treats 0/''/empty-list as "no input";
        # IonoTime(0) only works because self.sec already defaults to 0.
        if not input: return self
        if isinstance(input,IonoTime):
            self.sec=input.sec
        else:
            self.sec = to_J2000(input,format)
        return self
    def to(self,format=None,aux=None):
        if not format: return self.sec
        return from_J2000(self.sec,format,aux)
    def now(self):
        # Local wall-clock time interpreted as if UTC (via timegm in list_to_J2000).
        self.sec = to_J2000(time.localtime()[0:6])
        return self
    def nowUTC(self):
        self.sec = to_J2000(time.gmtime()[0:6])
        return self
    def addSeconds(self,s):
        self.sec+=s
        return self
    def addMinutes(self,m):
        self.sec+=m*60.0
        return self
    def addHours (self,h):
        self.sec+=h*60.0*60.0
        return self
    def addDays (self,d):
        self.sec+=d*60.0*60.0*24.0
        return self
    def addMonths (self,mi):
        # Calendar-aware month arithmetic: normalize month into 1..12, carrying years.
        (y,m,d,hh,mm,ss)=from_J2000(self.sec,"LIST")
        m+=mi
        while m > 12:
            y=y+1
            m-=12
        while m < 1:
            y=y-1
            m+=12
        self.sec=to_J2000((y,m,d,hh,mm,ss))
        return self
    def addYears (self,yi):
        (y,m,d,hh,mm,ss)=from_J2000(self.sec,"LIST")
        self.sec=to_J2000((y+yi,m,d,hh,mm,ss))
        return self
    def copy (self):
        n=IonoTime(self.sec)
        return n
    def makemidnight(self):
        # Drop the time-of-day fields (unspecified slots default to 0 in list_to_J2000).
        (y,m,d,hh,mm,ss)=from_J2000(self.sec,"LIST")
        self.sec=to_J2000((y,m,d))
        return self
    def floor(self,interval): #round current object to a specified accuracy
        (y,m,d,hh,mm,ss)=from_J2000(self.sec,"LIST")
        interval=interval.lower()
        # NOTE(review): the 'year' case uses day 0, which timegm treats as the
        # last day of the previous month -- confirm (y, 1, 1, ...) was intended.
        if interval.find('year' )>=0: self.sec=to_J2000((y, 1, 0, 0, 0, 0))
        elif interval.find('month' )>=0: self.sec=to_J2000((y, m, 1, 0, 0, 0))
        elif interval.find('day' )>=0: self.sec=to_J2000((y, m, d, 0, 0, 0))
        elif interval.find('hour' )>=0: self.sec=to_J2000((y, m, d, hh, 0, 0))
        elif interval.find('minute')>=0: self.sec=to_J2000((y, m, d, hh, mm, 0))
        elif interval.find('second')>=0: self.sec=to_J2000((y, m, d, hh, mm,int(ss)))
        else : die("IonoTime: Floor: Malformed interval: "+interval)
        return self
    # Arithmetic relies on Python 2 __coerce__ below to turn the operand into
    # a bare number, string, or list matching self's representation.
    def __sub__(self,other):
        return IonoTime(self.sec-other)
    def __add__(self,other):
        return IonoTime(self.sec+other)
#    def __iadd__(self,other):
#        return IonoTime(self.sec+other)
#    def __isub__(self,other):
#        return IonoTime(self.sec-other)
    def __cmp__(self,other):
        return cmp(self.sec,other.sec)
    def __coerce__(self,other):
        if isinstance(other,types.FloatType) or isinstance(other,types.IntType) or isinstance(other,types.LongType):
            return (self.sec,other)
        if isinstance(other,types.StringType):
            return (from_J2000(self.sec,"YYYYMMDD_HHMMSS"),other)
        if isinstance(other,types.ListType) or isinstance(other,types.TupleType):
            return (from_J2000(self.sec,"LIST"),other)
    def __repr__(self):
        return from_J2000(self.sec,"YYYYMMDD_HHMMSS")

def test():
    """Self-test: dies with a message on the first failing check."""
    print "Testing timeJ2000 routines:"
    print "Checking to_J2000"
    if not to_J2000("20040606" )==139752000 : die("FAILED YYYYMMDD test")
    if not to_J2000("040606" )==139752000 : die("FAILED YYMMDD test")
    if not to_J2000("20040606010101" )==139755661 : die("FAILED YYYYMMDDHHMMSS test")
    if not to_J2000("c" )==-36000.0 : die("FAILED HOURLETTER test")
    if not to_J2000("20040606010101.1" )==139755661.1 : die("FAILED YYYYMMDDHHMMSS.F test")
    if not to_J2000("20040606_010101" )==139755661 : die("FAILED YYYYMMDD_HHMMSS test")
    if not to_J2000("20040606_010101.1")==139755661.1 : die("FAILED YYYYMMDD_HHMMSS.F test")
    if not to_J2000("040606_010101" )==139755661 : die("FAILED YYMMDD_HHMMSS test")
    if not to_J2000("040606_010101.1" )==139755661.1 : die("FAILED YYMMDD_HHMMSS.F test")
    if not to_J2000("040606010101" )==139755661 : die("FAILED YYMMDDHHMMSS test")
    if not to_J2000("040606010101.1" )==139755661.1 : die("FAILED YYMMDDHHMMSS.F test")
    if not to_J2000("121212.1",'HHMMSS')==732.1 : die("FAILED HHMMSS test")
    if not to_J2000(10244201.1 )==10244201.1 : die("FAILED J2000 test")
    if not to_J2000((2004,6,6,1,1,1.1) )==139755661.1 : die("FAILED list test")
    if not to_J2000("103",'DOY' )==8769600 : die("FAILED DOY test")
    if not to_J2000("00103",'YYDOY' )==8769600 : die("FAILED YYDOY test")
    if not to_J2000("2004/06/06,01:01:01.1")==139755661.1: die("FAILED GAIMSTRING test")
    if not to_J2000("help158b01.04.tenet",'TENETHOURLY')==139755660.0 : die("FAILED TENETHOURLY test")
    print "Passed to_J2000"

    print "Checking from_J2000"
    if not from_J2000(139752000 ,"YYYYMMDD" )=="20040606" : die("FAILED YYYYMMDD test")
    if not from_J2000(139752000.1,"YYYYMMDD" )=="20040606" : die("FAILED YYYYMMDD test")
    if not from_J2000(139752000 ,"YYMMDD" )=="040606" : die("FAILED YYMMDD test")
    if not from_J2000(139752000.1,"YYMMDD" )=="040606" : die("FAILED YYMMDD test")
    if not from_J2000(139755661 ,"HOURLETTER" )=="b" : die("FAILED HOURLETTER test")
    if not from_J2000(139755661 ,"YYYYMMDDHHMMSS" )=="20040606010101" : die("FAILED YYYYMMDDHHMMSS test")
    if not from_J2000(139755661.1,"YYYYMMDDHHMMSS" )=="20040606010101.1" : die("FAILED YYYYMMDDHHMMSS.F test")
    if not from_J2000(139755661 ,"YYYYMMDD_HHMMSS")=="20040606_010101" : die("FAILED YYYYMMDD_HHMMSS test")
    if not from_J2000(139755661.1,"YYYYMMDD_HHMMSS")=="20040606_010101.1" : die("FAILED YYYYMMDD_HHMMSS.F test")
    if not from_J2000(139755661 ,"YYMMDD_HHMMSS" )=="040606_010101" : die("FAILED YYMMDD_HHMMSS test")
    if not from_J2000(139755661.1,"YYMMDD_HHMMSS" )=="040606_010101.1" : die("FAILED YYMMDD_HHMMSS.F test")
    if not from_J2000(139755661 ,"YYMMDDHHMMSS" )=="040606010101" : die("FAILED YYMMDDHHMMSS test")
    if not from_J2000(139755661.1,"YYMMDDHHMMSS" )=="040606010101.1" : die("FAILED YYMMDDHHMMSS.F test")
    if not from_J2000(732.1 ,"HHMMSS" )=="121212.1" : die("FAILED HHMMSS.F test")
    if not from_J2000(139752000.1,"J2000" )==139752000.1 : die("FAILED J2000 test")
#    (1,1.1) == (1,1.1000000001) ?!
#    if not from_J2000(139755661.1,"LIST" )==(2004,6,6,1,1,1.1) : die("FAILED LIST test")
    if not from_J2000(8769600 ,"DOY" )=="103" : die("FAILED DOY test")
    if not from_J2000(8769600 ,"YYDOY" )=="00103" : die("FAILED YYDOY test")
    if not from_J2000(139755661.1,"GAIMSTRING" )=="2004/06/06,01:01:01.1": die("FAILED GAIMSTRING test")
    if not from_J2000(139755661.1,"TENETHOURLY",'help')=="help158b01.04.tenet": die("FAILED TENETHOURLY test")
    print "Passed from_J2000"

    print "Testing IonoTime"
    if not IonoTime(0)+"a" =="20000101_120000a" : die("FAILED string coersion test")
    if not IonoTime(0)+1.0 ==1 : die("FAILED integer coersion test")
    if not IonoTime(0)+[1,2]==[2000,1,1,12,0,0,1,2] : die("FAILED list coersion test")
    if not IonoTime(0).addDays(2).addHours(2).addMinutes(2).addSeconds(2) == ((2*24+2)*60+2)*60+2: die("FAILED deltatime test")
    if not IonoTime(10) == IonoTime(10) : die("FAILED equivalence test")
    if not IonoTime(12) - IonoTime(10) == 2 : die("FAILED subtraction test")
    if not IonoTime(12) + IonoTime(10) == 22 : die("FAILED addition test")
    if not IonoTime(12).makemidnight().to('LOCALHMS',140) == "090000" : die("FAILED Midnight or LOCALHMS test")
    if not IonoTime(6576).floor('day').to('YYYYMMDDHHMMSS') == "20000101000000": die("FAILED floor test")
    print "Passed IonoTime"


def main(args):
    test()

if __name__ == "__main__":
    main(sys.argv[1:])

# ---------------------------------------------------------------------------
# climatology/clim/util/warn.py  (new file, mode 100644)
#
# warn.py -- Utility routines to print warning & error messages like --
#               "module: error message"
#
# NOTE(review): bare except is broad here; NameError is the expected case.
try: __file__
except: __file__ = 'warn.py' # ensure __file__ is set for warning messages
                             # each module file will execute
import sys, os
from inspect import getmodule, currentframe


def echo(*s):
    """Stringify & join any number of args and print resulting string to stdout."""
    sys.stdout.write(' '.join(map(str, s)) + '\n')

def echon(*s):
    """Same as echo() except join with newlines."""
    sys.stdout.write('\n'.join(map(str, s)) + '\n')

def echo2(*s):
    """Stringify & join any number of args and print resulting string to stderr."""
    sys.stderr.write(' '.join(map(str, s)) + '\n')

def echo2n(*s):
    """Same as echo2() except join with newlines."""
    sys.stderr.write('\n'.join(map(str, s)) + '\n')

def moduleName(file):
    """Extract a module name from the python source file name, with appended ':'."""
    return os.path.splitext(os.path.split(file)[1])[0] + ":"


# Each module must define these functions so that the module name is the proper file.

def warn(*s):
    """Print a warning message to stderr, identifying the module it came from."""
    echo2(moduleName(__file__)+':', *s)

def die(ss, status=1):
    """Print a warning message to stderr, and die with a non-zero status value.

    *ss* may be a single message string or a sequence of message parts.
    """
    # Fix: isinstance instead of `type(ss) == str`, which fails for str subclasses.
    if isinstance(ss, str):
        ss = [ss]
    warn(*ss)
    sys.exit(status)


# ---------------------------------------------------------------------------
# climatology/clim/util/wls.py  (new file, mode 100755)
#!/usr/bin/env python
#-----------------------------------------------------------------------------
# Name:        filelist.py
# Purpose:     File listing class/functions.
#
# Author:      Brian Wilson
#
# Created:     Mon Apr 10 11:01:06 2006
# Copyright:   (c) 2006, California Institute of Technology.
#              U.S. Government Sponsorship acknowledged.
#-----------------------------------------------------------------------------
#
USAGE = """
filelist.py [--help] [--bottomUp] [--directory] [--delete]
            [--fetchDir <outputDir>] [--fetchWitSubDirs]
            [--list] [--matchUrl] --quiet] [--regex '.*\.[cC]']
            [--size] [--topOnly] [--url]
            [--wildcard '*.txt.*'] [--xml] <topPaths ...>

Recursively traverse and print (with full paths or URL's) all files
under the topPath(s) that match ANY of one or more regular expressions
and/or wildcard glob) strings.  By default, it simply prints the matches,
but one can also get their sizes, fetch them, or delete them.

The topPaths can be a mixture of local and remote (ftp or http)
paths, in which case a list of URL's is returned.  If xml mode is
turned on, then the output is an XML list.

If no regex or wildcard patterns are specified, then ALL files
are returned.  If files are fetched, then the URL's are
REWRITTEN to point to the local copies.

"""
# See the bottom of the file for exact switches and example of use.

# NOTE(review): Python 2 only as written (urllib2/urlparse imports,
# `print >>stream`, raw_input, types constants).
import sys, os, re, string, getopt, types, getpass
import urllib, urllib2, urlparse, time, shutil, socket, stat
from fnmatch import fnmatchcase
from ftplib import FTP
#import dataenc

# NOTE(review): remoteUrl, makeFileUrl, parse_re_with_subst, walk, warn and
# die are referenced below but not defined in this portion of the file --
# presumably defined later in wls.py or imported; confirm.

def matchAnyThenConstrain(root, name, haveRegs, regs, haveWilds, wildCards,
                          constraintFunction):
    """Return True if the file name matches any of the compiled regular
    expressions or any of the wildcard (glob) specs, and (if present) the
    constraintFunction returns True.  The regex can be a pair of match &
    substitution patterns.  The 'name' of the file might be altered by a
    regex substitution and/or the constraintFunction.
    """
    if not haveRegs and not haveWilds:
        # No patterns at all: everything matches, subject only to the constraint.
        if constraintFunction is not None:
            return constraintFunction(root, name)
        else:
            return (True, name)
    else:
        match = False
        if haveRegs:
            for reg in regs:
                pattern, subst = reg
                if pattern.search(name):
                    match = True
                    if subst:
                        # Rewrite the matched name with the substitution pattern.
                        name = pattern.sub(subst, name)
                    break
        if haveWilds and not match:
            for wild in wildCards:
                if fnmatchcase(name, wild):
                    match = True
                    break
        if match and constraintFunction is not None:
            match, name = constraintFunction(root, name)
        return (match, name)


# Users call this function
def filelist(urlPaths, regSpecs=[], wildCards=[], needCredentials=False, userCredentials=None,
             matchFunction=matchAnyThenConstrain, constraintFunction=None,
             matchUrl=False, walkDirectories=True,
             urlMode=True, xmlMode=True, quietMode=False, verboseMode=False, getFileInfo=False,
             fetchDir=None, fetchIfNewer=False, fetchWithSubDirs=False,
             directoryMode=False, listMode=False, deleteMode=False, topDown=True,
             stream=sys.stdout):
    """Recursively traverse and print (with full paths or URL's) all files
    under the topPath(s) that match one or more regular expressions and/or
    wildcard (glob) strings, and an optional constraint (T/F) function to
    further winnow the candidate matches.  (The matchFunction can also be
    entirely replaced with custom logic.)

    By default, it simply generates the matches, but one can also fetch them,
    get their sizes, or delete them (if they are local files).
    Handles local directory paths and ftp/http URL's.

    Returns three file lists: matched, actually fetched, & destination names.
    """
    # NOTE(review): mutable default arguments (regSpecs=[], wildCards=[]) --
    # harmless here since they are only read, but fragile.
    try:
        matchedFiles = []       # source files that match criteria
        fetchedFiles = []       # files that were actually fetched this run
        destinationFiles = []   # destination (local) file names (rewritten URL)

        # Normalize the top paths: drop empties, absolutize local paths,
        # strip a trailing slash.
        topPaths = []
        for url in urlPaths:
            if url == '' or url == None: continue
            remote, protocol, netloc, path = remoteUrl(url)
            if not remote: url = os.path.abspath(url)
            if url[-1] == '/': url = url[:-1]
            topPaths.append(url)

        if needCredentials and userCredentials is None:
            userCredentials = promptForCredentials(topPaths)

        if fetchDir:
            workDir = os.path.join(fetchDir, '.tmp')
            # fetch into tmp directory & then rename so fetching is atomic
            try: os.mkdir(workDir)
            except: pass
            if not os.path.exists(workDir):
                die("filelist: Cannot write to fetch directory %s" % fetchDir)

        if isinstance(topPaths, types.StringType): topPaths = [topPaths]
        regSpecs = [s for s in regSpecs if s != '' and s != None]
        wildCards = [s for s in wildCards if s != '' and s != None]

        # Pre-compile the regex (pattern, substitution) pairs once.
        haveRegs = False; regs = []; haveWilds = False; haveMatchFunction = False
        if len(regSpecs) > 0:
            haveRegs = True
            regs = []
            for reg in regSpecs:
                (pattern, subst) = parse_re_with_subst(reg)
                regs.append( (re.compile(pattern), subst) )
        if len(wildCards) > 0:
            haveWilds = True

        prefix = ''
        extra = ''
        suffix = ''
        if deleteMode:
            suffix += ' deleted.'
            # NOTE(review): topPaths entries were abspath'd above, so '.' can
            # no longer appear in the list and this guard never fires -- confirm.
            if '.' in topPaths:
                die("filelist: Recursively deleting from the dot (.) path is not safe.  Shame.")

        if directoryMode: listMode = False
        if listMode: getFileInfo = True
        if quietMode: stream = None
        sumSizes = 0
        if xmlMode:
            matchedFiles.append('<files>')
            fetchedFiles.append('<files>')
            _output('<files>', destinationFiles, stream)
            prefix += ' <file>'
            suffix += '</file>'

        for top in topPaths:
            if verboseMode: warn('filelist: searching', top)
            topMatchCount = 0; topFetchCount = 0

            for root, dirs, files, infos in walk(top, userCredentials, walkDirectories, topDown):
                if verboseMode: warn('filelist: found files in', root)
                remote, protocol, netloc, path = remoteUrl(root)
                # In directory mode we match/emit directory names, not files.
                if directoryMode:
                    contents = dirs
                else:
                    contents = files

                for i in range(len(contents)):
                    line = ''
                    file = contents[i]
                    try:
                        info = infos[i]
                    except:
                        info = None
                    # Match against full path/URL or bare name, per matchUrl.
                    if matchUrl:
                        name = os.path.join(root, file)
                    else:
                        name = file

                    match, newname = matchFunction(root, name, haveRegs, regs,
                                                   haveWilds, wildCards, constraintFunction)
                    if match:
                        line = ''
                        topMatchCount += 1
                        fn = os.path.join(root, file)

                        # Gather size/mtime info when requested, or when local
                        # mtimes are needed for fetchIfNewer comparison.
                        if getFileInfo or (fetchIfNewer and not remote):
                            if remote:
                                if info and getFileInfo:
                                    if listMode: line = info.line
                                    extra = ' ' + str(info.size) + ' ' + str(info.modTime)
                                    sumSizes += info.size
                            else:
                                st = os.stat(fn)
                                line = ' '.join( map(str, \
                                    (st.st_mode, st.st_uid, st.st_gid, st.st_size, st.st_mtime, fn)))
                                info = FileInfo(line, st.st_size, st.st_mtime, st.st_uid, st.st_gid, st.st_mode)
                                if getFileInfo:
                                    extra = ' ' + str(info.size) + ' ' + str(info.modTime)
                                    sumSizes += info.size

                        if not remote and urlMode: fn = makeFileUrl(fn)
                        matchedFiles.append(prefix + fn + extra + suffix)

                        # The (possibly regex-rewritten) name determines the
                        # destination file name for fetching.
                        if matchUrl:
                            newfn = newname
                        else:
                            newfn = os.path.join(root, newname)
                        newr, newp, newloc, newpath = remoteUrl(newfn)
                        newfile = os.path.split(newpath)[1]

                        if fetchDir:
                            if fetchDir == '.': fetchDir = os.getcwd()
                            if fetchWithSubDirs:
                                destDir = os.path.join(fetchDir, newpath[1:])
                            else:
                                destDir = fetchDir
                            destFile = os.path.join(destDir, newfile)
                            tmpFile = os.path.join(workDir, newfile)

                            if shouldFetch(remote, destFile, fetchIfNewer, info):
                                if not quietMode:
                                    warn('filelist: Fetching ', fn)
                                    warn('filelist: Writing ', destFile)
                                try:
                                    os.makedirs(destDir)
                                except:
                                    # kludge, makedirs throws exception if any part of path exists
                                    pass
                                if remote:
                                    urllib.urlretrieve(fn, tmpFile)
                                else:
                                    shutil.copyfile(fn, tmpFile)
                                os.rename(tmpFile, destFile)  # atomic rename of file into destDir

                                topFetchCount += 1
                                fetchedFiles.append(prefix + fn + suffix)
                                if getFileInfo: line = line + ' ' + destFile

                            # now rewrite URL to point to local copy of file
                            fn = destFile
                            if not remote and urlMode: fn = makeFileUrl(fn)

                        if not listMode:
                            line = prefix + fn + extra + suffix
                        _output(line, destinationFiles, stream)
                        if deleteMode:
                            if remote:
                                die('filelist: Cannot delete remote files (yet)')
                            else:
                                os.unlink(fn)

            if verboseMode and fetchDir:
                warn('filelist: Matched %d files from %s' % (topMatchCount, top))
                warn('filelist: Fetched %d files from %s' % (topFetchCount, top))
        if fetchDir:
            # Clean up the temporary work directory.
            for f in os.listdir(workDir): os.remove(os.path.join(workDir, f))
            os.rmdir(workDir)

        if xmlMode:
            matchedFiles.append('</files>')
            fetchedFiles.append('</files>')
            # NOTE(review): an OPENING '<files>' tag is emitted here where the
            # closing '</files>' seems intended -- confirm before changing.
            _output('<files>', destinationFiles, stream)

        if getFileInfo:
            if xmlMode:
                line = '<totalSize>%s</totalSize>' % sumSizes
            else:
                line = '#filelist: total size %s' % sumSizes
            matchedFiles.append(line)
            _output(line, destinationFiles, stream)

    except KeyboardInterrupt:
        if fetchDir:
            for f in os.listdir(workDir): os.remove(os.path.join(workDir, f))
            os.rmdir(workDir)
        die('filelist: Keyboard Interrupt')

    return (matchedFiles, fetchedFiles, destinationFiles)


def shouldFetch(remote, destFile, fetchIfNewer, srcFileInfo):
    """Decide whether a source file should be copied to destFile.

    Remote files are fetched only if destFile does not exist yet; local
    sources additionally honor fetchIfNewer by comparing modification times.
    """
    if remote:
        if os.path.exists(destFile):
            doFetch = False
        else:
            doFetch = True
    else:
        if os.path.exists(destFile):
            if fetchIfNewer:
                destModTime = os.path.getmtime(destFile)
                if destModTime < srcFileInfo.modTime:
                    doFetch = True
                else:
                    doFetch = False
            else:
                doFetch = False
        else:
            doFetch = True
    return doFetch

def _output(line, lines, stream=None):
    """Internal function: Add line to output lines and optionally print to stream."""
    lines.append(line)
    if stream: print >>stream, line

class FileInfo:
    """Holder class for those file info. elements that are consistent among local
    files (output of stat), ftp directories, http, etc.  Minimum useful fields are
    modification time and size.  Line contains usual string output of ls -l.
    """
    def __init__(self, line, size, modTime, userId=None, groupId=None, protectMode=None):
        self.line=line; self.size=size; self.modTime=modTime
        self.userId=userId; self.groupId=groupId; self.protectMode=protectMode

class UserCredential(object):
    """Container for user credential info. like username, password, certificate, etc.
    """
    def __init__(self, username=None, password=None, validInterval=None, certificate=None):
        self.username = username
        self.password = password           # goes through the password property below
        self.validInterval = validInterval # tuple of Ints (days, hours, minutes)
        if password is not None and validInterval is None:
            die('UserCredential: If password is present, validInterval is also required.')
        self.certificate = certificate

    def getPassword(self):
        # NOTE(review): dataenc is only referenced via the commented-out
        # import at the top of the file, so this de/encryption path would
        # raise NameError if exercised -- confirm dataenc availability.
        pw = self._password
        if pw:
            pw, daynumber, timestamp = dataenc.pass_dec(pw)
            if dataenc.unexpired(daynumber, timestamp, self.validInterval):
                return pw
            else:
                # Credential has expired.
                return None
        else:
            return None
    def setPassword(self, pw):
        if pw and pw != '':
            self._password = dataenc.pass_enc(pw, daynumber=True, timestamp=True)
        else:
            self._password = pw
    password = property(getPassword, setPassword)

class UserCredentials:
    """Contains dictionary of (url, credential) pairs and optionally an httpProxy.
    """
    def __init__(self, httpProxy=None, credentials={}):
        # NOTE(review): mutable default argument -- instances created without
        # an explicit credentials dict SHARE one dictionary; confirm intent.
        self.httpProxy = httpProxy
        self.credentials = credentials
    def add(self, url, credential):
        self.credentials[url] = credential; return self
    def forUrl(self, url):
        # Returns the credential of the first registered key that is a prefix
        # of the url (dict iteration order, not longest-prefix).
        for key in self.credentials:
            if url.startswith(key):
                return self.credentials[key]
        return None

def promptForCredentials(urls, httpProxy=None):
    """Interactively prompt (stdin/tty) for a proxy and one credential per
    remote url; local paths are skipped."""
    if httpProxy == None:
        httpProxy = raw_input('Enter HTTP proxy [none]: ')
        if httpProxy == '': httpProxy = None
    credentials = UserCredentials(httpProxy)
    localUserName = getpass.getuser()
    for url in urls:
        remote, protocol, netloc, path = remoteUrl(url)
        if remote:
            username, password, validInterval = promptForCredential(url, localUserName)
            credential = UserCredential(username, password, validInterval)
            credentials.add(url, credential)
    return credentials

def promptForCredential(url, localUserName):
    """Prompt for (username, password, validInterval) for one url; ftp
    defaults to anonymous, otherwise to the local user name."""
    remote, protocol, netloc, path = remoteUrl(url)
    if protocol == 'ftp':
        defaultUserName = 'anonymous'
    else:
        defaultUserName = localUserName
    username = raw_input('Need credentials for URL %s\nUsername [%s]: ' \
                         % (url, defaultUserName))
    if username == '': username = defaultUserName
    password = ''
    while password == '':
        password = getpass.getpass()
    validInterval = [0, 1, 0]
    if password != '':
        response = raw_input('Enter valid time period for credential [(days, hours, minutes) = 0 1 0]: ')
        if response != '':
            validInterval = response.split()
    return (username, password, validInterval)

class DirectoryWalker:
    """Recursively walk directories using the protocol specified in a URL.
    Sublclasses handle ftp, http, sftp, local file system, etc.
    """
    def __init__(self, userCredentials=None, retries=3, sleepTime=5):
        self.userCredentials = userCredentials
        self.retries = retries       # retry count for transient remote failures
        self.sleepTime = sleepTime   # seconds to sleep between retries

    def walk(self, top, walkDirectories=True):
        """Recursively walk directories on a remote site to retrieve file lists.
+ """ + remote, protocol, netloc, path = remoteUrl(top) + status, dir_listing = self.retrieveDirList(top) + if status: + if len(dir_listing) == 0: + yield (top, [], [], []) + else: + (dirs, files, infos) = self.parseDirList(dir_listing, path) + yield (top, dirs, files, infos) + + if walkDirectories: + for dir in dirs: + # Depth-first recursion + for root, dirs, files, infos in self.walk(top + '/' + dir, walkDirectories): + yield (root, dirs, files, infos) + else: + warn('DirectoryWalker: error, unable to retrieve directory listing at', top) + yield (top, [], [], []) + + def retrieveDirList(self, url): + """Retrieve directory listing as a list of text lines. Returns (status, dirList).""" + pass + def parseDirList(self, dirList, path=None): + """Parse directory listing (text) and return three lists (dirs, files, fileInfos).""" + pass + +class FtpDirectoryWalker(DirectoryWalker): + """Recursively walk directories on an ftp site.""" + def __init__(self, userCredentials=None, retries=3, sleepTime=5): + DirectoryWalker.__init__(self, userCredentials, retries, sleepTime) + + def retrieveDirList(self, url): + """Retrieve a directory listing via ftp with retries. + """ + remote, protocol, netloc, path = remoteUrl(url) + credential = None + if self.userCredentials: + credential = self.userCredentials.forUrl(url) + dir = ''; dir_list = [] + ftp = FTP() + for i in range(self.retries): + try: + ftp.connect(netloc) + if credential is None or \ + credential.username == 'anonymous' or \ + credential.username == '': + ftp.login() + else: + ftp.login(credential.username, credential.password) + ftp.cwd(path) + ftp.retrlines('LIST', dir_list.append) + ftp.quit() + dir = '\n'.join(dir_list) + return (True, dir) + except: + pass + time.sleep(self.sleepTime) + warn('FtpDirectoryWalker: connect retry to ', netloc, path) + return (False, dir) + + def parseDirList(self, dir, path=None): + """Parse long directory listing returned by ftp or (ls -l). 
+ Separate entries into directories and files. + """ + dirs = []; files = []; infos = [] + for entry in dir.split('\n'): + fields = entry.split() + if len(fields) < 7: continue + fn = fields[-1] + if fn == '.' or fn == '..': continue + if re.match('^d', fields[0])and fields[0][7] == 'r': + dirs.append(fn) + else: + files.append(fn) + info = FileInfo(entry, int(fields[4]), '-'.join(fields[5:8]), \ + fields[2], fields[3], fields[0]) + infos.append(info) + return (dirs, files, infos) + +class DirListingParser(object): + """Base class for directory listing parsers.""" + def __init__(self, regex): + self.regex = regex + self.compiledRegex = re.compile(self.regex) + + def parse(self, dir, listingHtml): + """Return (dirs, files, infos).""" + dirs = []; files = []; infos = [] + raise NotImplementedError, "Override this method in sub class." + +class ApacheDirListingParser(DirListingParser): + """Parser class for apache.""" + def parse(self, dir, listingHtml): + dirs = []; files = []; infos = [] + items = self.compiledRegex.findall(listingHtml) + for item, itemName in items: + if itemName.strip() == 'Parent Directory': continue + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + + if name.endswith('/'): + type = 'd' + dirs.append(name[:-1]) + else: + type = '-' + files.append(name) + #not doing file info + ''' + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? 
%15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + ''' + infos.append(None) + return (dirs, files, infos) + +class CDAACDirListingParser(DirListingParser): + """Parser class for CDAAC data server.""" + def parse(self, dir, listingHtml): + dirs = []; files = []; infos = [] + items = self.compiledRegex.findall(listingHtml) + for item, itemName in items: + if itemName.strip() == 'Parent Directory': continue + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + if name.endswith('/'): + type = 'd' + dirs.append(name) + else: + type = '-' + files.append(name) + #not doing file info + ''' + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? %15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + ''' + infos.append(None) + return (dirs, files, infos) + +class HttpDirectoryWalker(DirectoryWalker): + """Recursively walk directories on an http (web) site to retrieve file lists. + Handles many styles of HTML directory listings, but still very FRAGILE. 
+ """ + + #list of directory listing parser plugins + DIR_LIST_REGEX_PLUGINS = [ + #apache 2.0.55 directory listing + ApacheDirListingParser(r'(?i)alt="\[.*?\]">\s*<A HREF="(?P<name>.*?)">(.*?)</A>'), + #CDAAC (COSMIC Data) + CDAACDirListingParser(r'(?i)<LI><A HREF="(?P<name>.*?)">(.*?)</A>'), + ] + + def __init__(self, userCredentials=None, retries=3, sleepTime=5): + DirectoryWalker.__init__(self, userCredentials, retries, sleepTime) + if self.userCredentials: + if self.userCredentials.httpProxy: + os.environ['http_proxy'] = self.userCredentials.httpProxy + # global kludge, default proxyHandler looks up proxy there + passwordMgr = urllib2.HTTPPasswordMgrWithDefaultRealm() + for url, cred in self.userCredentials.credentials.iteritems(): + passwordMgr.add_password(None, url, cred.username, cred.password) + authHandler = urllib2.HTTPBasicAuthHandler(passwordMgr) + opener = urllib2.build_opener(authHandler) + else: +# opener = urllib2.build_opener() + opener = None +# opener.add_headers = [('User-agent', 'Mozilla/5.0')] + self.opener = opener + + def retrieveDirList(self, url): + """Retrieve an HTML directory listing via http with retries. 
+ """ +### url = os.path.join(url, 'contents.html') ### hack for DAP servers at GES-DISC + dir_listing = '' + proxies = {} + for i in range(self.retries): + try: + if self.opener: + response = self.opener.open(url) + else: + response = urllib.urlopen(url) + except IOError, e: + if hasattr(e, 'reason'): + warn('HttpDirectoryWalker: Error, failed to reach server because: %s' % e.reason) + elif hasattr(e, 'code'): + warn('HttpDirectoryWalker: Server could not fulfill request, error code %s' % e.code) + else: + dir_listing = response.read() + return (True, dir_listing) + time.sleep(self.sleepTime) + warn('HttpDirectoryWalker: retrying ', url) + return (False, dir_listing) + + reDirPath = re.compile(r'(?i)<H1>.*?Index of\s*?(\S+?)\s*?</H1>') + + def parseDirList(self, dir, path): + """Parse fragile HTML directory listings returned by various HTTP servers, + including Apache and OpenDAP. Separate entries into directories and files. + """ + dirs = []; files = []; infos = [] + if path: + match = HttpDirectoryWalker.reDirPath.search(dir) + if not match: + die('HttpDirectoryWalker: Cannot find directory name %s in HTML listing:\n%s' % (path, dir)) + dirName = match.group(1) + if dirName not in path: + warn('HttpDirectoryWalker: Directory name %s in HTML listing does not agree with path %s:\n%s' % (dirName, path, dir)) + + # Try to find directory lines that contain file info + reDirListWithStat = re.compile( \ + r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>\s*(?P<dateTime>\S+ \S+)\s+?(?P<size>\S+)\s*?$') + items = reDirListWithStat.findall(dir) + # If not, then try to find simple directory lines + if len(items) == 0: + reDirList = re.compile( \ + r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>') + items = reDirList.findall(dir) + + if len(items) != 0: + dateTime = '? 
?'; size = '' + for item in items: + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + if dirName not in name: continue + + if name.endswith('/'): + type = 'd' + dirs.append(name) + else: + type = '-' + files.append(name) + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? %15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + infos.append(info) + print line + + #try plugins + else: + for plugin in self.DIR_LIST_REGEX_PLUGINS: + pluginResults = plugin.parse(dirName, dir) + if len(pluginResults[0]) != 0 or len(pluginResults[1]) != 0 or \ + len(pluginResults[2]) != 0: return pluginResults + + return (dirs, files, infos) + + +def walk(top, userCredentials=None, walkDirectories=True, topDown=True): + """Recursively walk directories to retrieve file lists. + Returns the topPath, contained subdirectories and files, and + optionally FileInfo objects (if info is included in protocol results). + Handles local directory paths and ftp/http protocols (URL's). 
+ """ + remote, protocol, netloc, path = remoteUrl(top) + if remote: + if protocol == 'ftp': + ftpWalker = FtpDirectoryWalker(userCredentials) + for root, dirs, files, infos in ftpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + elif protocol == 'http': +# import pdb; pdb.set_trace() + httpWalker = HttpDirectoryWalker(userCredentials) + for root, dirs, files,infos in httpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + elif protocol == 'sftp': + sftpWalker = SftpDirectoryWalker(userCredentials) + for root, dirs, files,infos in sftpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + else: + die('filelist: Cannot handle protocol ', protocol) + else: + if walkDirectories: + for root, dirs, files in os.walk(top, topDown): + yield (root, dirs, files, []) + else: + files = os.listdir(top) + yield (top, [], files, []) + +def remoteUrl(url): + """Returns True if the URL is remote; also returns protocol, + net location (host:port), and path.""" + protocol, netloc, path, params, query, fragment = urlparse.urlparse(url) + if protocol == '': + return (False, protocol, netloc, path) + else: + return (True, protocol, netloc, path) + + +# utils +RE_WITH_SUBST_PATTERN = re.compile(r'^s/(.+)/(.+)/$') +def parse_re_with_subst(str): + match = RE_WITH_SUBST_PATTERN.match(str) + if match: + return (match.group(1), match.group(2)) + else: + return (str, None) + +def hostName(): + return socket.gethostbyaddr(socket.gethostname())[0] + +FILE_URL_PREFIX = 'file://' + hostName() +def makeFileUrl(file): + return FILE_URL_PREFIX + file + +def warn(*str): sys.stderr.write(' '.join(str) + '\n') +def die(str, status=1): warn(str); sys.exit(status) + +def main(): + """Main function for outside scripts to call.""" + + from sys import argv + + if len(argv) < 2: die(USAGE) + try: + opts, argv = getopt.getopt(argv[1:], 'hbcdf:ilqr:stuvw:x', + ['help', 'bottomUp', 'credentials', 'delete', 'directory', + 'fetchDir=', 'fetchIfNewer', 
'fetchWithSubDirs', 'info', + 'list', 'quiet', 'regex=', 'size', 'topOnly', + 'url', 'verbose', 'wildcard=', 'xml']) + except getopt.GetoptError, (msg, bad_opt): + die("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg)) + + regSpecs = []; wildCards = []; matchUrl=False; walkDirectories = True + needCredentials = False; userCredentials = None + urlMode=False; xmlMode=False; quietMode=False; verboseMode=False; getFileInfo=False + fetchDir = None; fetchIfNewer=False; fetchWithSubDirs=False + directoryMode = False; deleteMode = False; topDown = True; listMode = False + + for opt, val in opts: + if opt in ('-h', '--help'): die(USAGE) + elif opt in ('-b', '--bottomUp'): topDown = False + elif opt in ('-c', '--credentials'): needCredentials = True + elif opt in ('-d', '--directory'): directoryMode=True + elif opt in ('--delete'): deleteMode=True + elif opt in ('-f', '--fetchDir'): fetchDir = val + # retrieve remote files to this dir + elif opt in ('--fetchIfNewer'): fetchIfNewer=True + # only fetch if src file is newer than existing dest file + elif opt in ('--fetchWithSubDirs'): fetchWithSubDirs=True + # mirror subdirectories when fetching + elif opt in ('-i', '--info'): getFileInfo=True + elif opt in ('-l', '--list'): listMode=True + elif opt in ('-m', '--matchUrl'): matchUrl=True + # regexs match entire URL/path, not just file name + elif opt in ('-q', '--quiet'): quietMode=True + # don't print files during walk + elif opt in ('-r', '--regex'): regSpecs.append(val) + elif opt in ('-s', '--size'): sizeMode=True + elif opt in ('-t', '--topOnly'): walkDirectories=False + elif opt in ('-u', '--url'): urlMode=True + # return URL's (file:, ftp:, http:, etc.) 
+ elif opt in ('-v', '--verbose'): verboseMode=True + elif opt in ('-w', '--wildcard'): wildCards.append(val) + elif opt in ('-x', '--xml'): xmlMode=True # return list in XML format + else: die(USAGE) + +# import pdb; pdb.set_trace() + + matchedFiles, fetchedFiles, destinationFiles = \ + filelist(argv, regSpecs, wildCards, needCredentials, userCredentials, + matchAnyThenConstrain, None, matchUrl, walkDirectories, + urlMode, xmlMode, quietMode, verboseMode, getFileInfo, + fetchDir, fetchIfNewer, fetchWithSubDirs, + directoryMode, listMode, deleteMode, topDown) + + if quietMode: + if listMode == 'match': + print matchedFiles + elif listMode == 'fetch': + print fetchedFiles + elif listMode == 'destination': + print destinationFiles + else: + pass + + +if __name__ == '__main__': main() http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/climatology/clim/variables.py ---------------------------------------------------------------------- diff --git a/climatology/clim/variables.py b/climatology/clim/variables.py new file mode 100755 index 0000000..c5136f0 --- /dev/null +++ b/climatology/clim/variables.py @@ -0,0 +1,140 @@ +""" + variables.py + +Interface to Get Variables out of EOS HDF4/5 and netCDF3/4 files, with +smart dataset discovery and variable caching behind it. + +""" + +import sys, os, urlparse, time +#from pyhdf.SD import SD, SDC +import netCDF4 +#from pydap.client import open_url +import numpy as N + + +def getVariables(url, varNames=None, vars={}, kind=None, arrayOnly=False, order='C', retries=2, sleep=1, set_auto_scale=True, set_auto_mask=True): + """Interface function to get variables from many file formats or via DAP. 
Here kludge for special case.""" + urlStr = url + url = urlparse.urlparse(url) + path = url.path + + if varNames is None: + varNames = url.query.split(',') + else: + if isinstance(varNames, tuple): + vars = [] + if url.scheme == 'http': + if 'dap' in urlStr.lower(): + if kind is None: kind = 'dap' + if url.query == '': + urlStr = urlStr + '?' + ','.join(varNames) + else: + if kind is None: kind = 'w10n' + + if url.scheme == '': + if kind is None: + kind = fileKind(path) + else: + kind = kind.lower() + + if kind == 'h5' or kind == 'hdf5': + pass + + elif kind == 'hdf' or kind == 'hdf4': + d = SD(path, SDC.READ) + if varNames == 'ALL': + varNames = d.datasets().keys() + for varName in varNames: + var = d.select(varName) + if arrayOnly: + if order == 'F': + var = N.array(var[:], order='F') + else: + var = var[:] + if isinstance(vars, list): + vars.append(var) + else: + vars[varName] = var + if not isinstance(vars, list): + vars['_fileHandle'] = d + + elif kind == 'nc': + d = netCDF4.Dataset(path) + d.set_auto_scale(set_auto_scale) + d.set_auto_mask(set_auto_mask) + if varNames == 'ALL': + varNames = d.variables.keys() + for varName in varNames: + var = d.variables[varName] + if arrayOnly: + if order == 'F': + var = N.array(var[:], order='F') + else: + var = var[:] + if isinstance(vars, list): + vars.append(var) + else: + vars[varName] = var + if not isinstance(vars, list): + vars['_fileHandle'] = d + + else: + if kind == 'dap': + print >>sys.stderr, 'DAP get of: %s' % urlStr + retries += 1 + retriesSave = retries + while retries > 0: + try: + d = open_url(urlStr) + retries = 0 + except: + retries -= 1 + if retries == 0: + print >>sys.stderr, 'getVariables: Error, DAP cannot open: %s' % urlStr + return (vars, d) + time.sleep(sleep) + + if varNames == 'ALL': + varNames = d.keys() + + for varName in varNames: + var = d[varName] + retries = retriesSave + while retries > 0: + try: + if arrayOnly: + if order == 'F': + var = N.array(var[:], order='F') + else: + var = var[:] 
# actually does DAP call to read array + retries = 0 + except: + retries -= 1 + if retries == 0: + print >>sys.stderr, 'getVariables: Error, DAP cannot get variable: %s' % varName + else: + time.sleep(sleep) + + if isinstance(vars, list): + vars.append(var) + else: + vars[varName] = var + if not isinstance(vars, list): + vars['_fileHandle'] = d + + + elif kind == 'w10n': + vars = None + return (vars, d) + + +def close(fh): + if hasattr(fh, 'end'): + fh.end() + elif hasattr(fh, 'close'): + fh.close() + +def fileKind(path): + return os.path.splitext(path)[1][1:].lower() +
