Well, problem solved.
As usual, it was enough to ask for help: the people you ask want more
information, you have to do some extra work to produce it, and while doing
that you notice that the two programs I spoke about were not identical
after all.
But since I did run a series of comparisons of the compression rates anyway,
here are the results.
I did a number of test runs with varying table lengths.
First kind of run: a test program writing a single table with two float32
columns, both filled with N random numbers (roughly like the sketch below).
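(The test script itself is not important; a minimal sketch of what it does
would look like the following, using the same filter settings as in the
program further below. The names TwoCols and write_test_file are only
illustrative, not the actual ones.)

import os
import random
import tables

class TwoCols(tables.IsDescription):
    x = tables.Float32Col()
    y = tables.Float32Col()

def write_test_file(fname, N):
    #same filter settings as in the real program: zlib, complevel=7
    filters = tables.Filters(complevel=7, complib="zlib", shuffle=1, fletcher32=1)
    h5 = tables.openFile(fname, mode="w", filters=filters)
    table = h5.createTable("/", "test", TwoCols, "two random float32 columns")
    row = table.row
    for n in xrange(N):
        row['x'] = random.random()
        row['y'] = random.random()
        row.append()
    h5.close()
    #report the resulting file size in MByte
    return os.path.getsize(fname)/1024./1024.

Calling something like this for N = 1, 2, 4, ... gives the kind of numbers
listed below.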
Results:
Pentium
N        size of h5 file in MByte (zlib, complevel=7)
1 0.00683594
2 0.00683594
4 0.00683594
8 0.00683594
16 0.00683594
32 0.00683594
64 0.00683594
128 0.00683594
256 0.00878906
512 0.0107422
1024 0.0146484
2048 0.0224609
4096 0.0380859
8192 0.0693359
16384 0.135742
32768 0.263672
65536 0.519531
131072 1.03223
262144 2.05566
524288 4.10352
1048576 8.20605
2097152 16.4043
4194304 32.8008
8388608 65.5908
16777216 131.174
33554432 262.341
67108864 524.681
134217728 1049.35
AMD:
1 0.0302734
2 0.0302734
4 0.0302734
8 0.0302734
16 0.0302734
32 0.0302734
64 0.0302734
128 0.0302734
256 0.0302734
512 0.0302734
1024 0.0302734
2048 0.0302734
4096 0.0537109
8192 0.078125
16384 0.149414
32768 0.268555
65536 0.506836
131072 1.00781
262144 2.0127
524288 4.01758
1048576 8.0293
2097152 16.0518
4194304 32.0742
8388608 64.1162
16777216 128.207
33554432 256.406
67108864 512.778
134217728 1025.55
I also ran my original program, which basically writes 3 tables, each
consisting of one float32 column of length N:
Pentium
1 0.015625
2 0.015625
4 0.015625
8 0.015625
16 0.015625
32 0.015625
64 0.015625
128 0.015625
256 0.0166016
512 0.0185547
1024 0.0244141
2048 0.03125
4096 0.0449219
8192 0.0751953
16384 0.131836
32768 0.261719
65536 0.504883
131072 0.986328
262144 2.14844
524288 4.47949
AMD:
1 0.0146484
2 0.0146484
4 0.0146484
8 0.0146484
16 0.0146484
32 0.0146484
64 0.015625
128 0.015625
256 0.0166016
512 0.0185547
1024 0.0214844
2048 0.0273438
4096 0.0400391
8192 0.0673828
16384 0.140625
32768 0.268555
65536 0.461914
131072 0.866211
262144 1.87598
524288 3.92285
Interestingly, both the AMD and the Pentium compress the 3*N values in the 3
tables better than the single table with 2 columns; both roughly store the
data for N=524288 in 4 MByte files. For the random-data test that is to be
expected: 524288 rows * 2 columns * 4 bytes is already 4 MiB, and random
float32 values hardly compress, so zlib gains essentially nothing there.
Also interesting: the AMD still needs 12.4 % less space than the Pentium for
the 3-tables run, but only 2.1 % less for the one-table, 2-column run (at
N=2**19).
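(A quick way to compare the library versions on the two machines, in case
different zlib/HDF5 releases play a role in the remaining difference, would
be something like the snippet below; this assumes tables.whichLibVersion is
available in both installations.)

import sys, tables
print "Python  :", sys.version.split()[0]
print "PyTables:", tables.__version__
for lib in ("hdf5", "zlib"):
    print lib, ":", tables.whichLibVersion(lib)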
So there are still discrepancies in the file sizes, but they no longer seem
alarming. If you are still interested in the program that produces the 12 %
difference, here it is:
import neueprogramme.commonfunctions as cf
import cPickle
import neueprogramme.library as l
import math,sys,re,os,scipy,scipy.signal,tables
pat_string=re.compile(r'"(.*?)"')
pat_two_strings=re.compile(r'"(.*?)".*?"(.*?)"')
class WaveformH5Class(tables.IsDescription):
value = tables.Float32Col() # float (single-precision)
class DataMarker:
def __init__(self):
self.Time=0
self.Marker=""
class DataWaveform:
def __init__(self):
self.BeginTime=0 #First value is measured at BeginTime
self.EndTime=0 #Last value is measured at EndTime-DT
self.DT=0
self.ID={}
self.ID["ChannelID"]=""
self.ID["PieceNumber"]=None
self.ID["ID"]="" #Unique ID: CHANNELID_GAPID
self.H5FileName='' #Name of the h5 file that holds the data
self.table=None
self.H5File=None #File identifier of H5File
self.Values=[]
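#write one row into "table": "key" is a list of column names, "val" the list of matching values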
def table_value_append(self,table,key,val):
for i in range(len(key)):
table.row[key[i]]=val[i]
table.row.append()
def create_table(self,H5File):
return H5File.createTable("/",self.ID["ID"],WaveformH5Class,"Channel %s, Piece %d"%(self.ID["ChannelID"],self.ID["PieceNumber"]))
def generate_ID(self,ChannelID,PieceNumber):
self.ID["ChannelID"]=ChannelID
self.ID["PieceNumber"]=PieceNumber
self.ID["ID"]="ChannelID_"+ChannelID+"_"+"PieceID_"+str(PieceNumber).zfill(3)
return self.ID
def read_channel_data_waveform_onepiece(self,file,FirstLine):
#I expect the keyword GAP or START to appear here
i=FirstLine
if len(i.split("\t"))<3:
print "Channel header is too short"
sys.exit()
self.Values=[]
try: self.BeginTime=float(i.split("\t")[1])
except: print "Couldn't read BeginTime of Channel"
try: self.DT=float(i.split("\t")[2])
except: print "Couldn't read DT of Channel"
i=file.next()
#efficient data storage
#We need an open file; if it is already open, that does no harm,
#otherwise the data is appended.
#Local instance of the file
H5File=tables.openFile(self.H5FileName,mode="a",filters=tables.Filters(complevel=7,
complib="zlib", shuffle=1, fletcher32=1))
#Next we can look for the table that is supposed to hold our data.
#In principle it should not exist yet; if it does, we should delete it
#and create it anew.
table=None
print self.ID["ID"]
os.system("date")
for node in H5File:
if node._v_name==self.ID["ID"]:
table=node
table.remove()
table=None
if table==None:
table = self.create_table(H5File)
#then we can write to it!!!
time=self.BeginTime
table.attrs.BeginTime=self.BeginTime
table.attrs.DT=self.DT
while 1:
try:
val=float(i)
#self.Values.append(val)
self.table_value_append(table,["value"],[val])
time+=self.DT
i=file.next()
except:
break
self.EndTime = len(self.Values)*self.DT+self.BeginTime
H5File.close()
return file,i
class LaufClass:
def __init__(self):
self.Spike2FileName=''
self.H5FileName=''
self.PickleFileName=''
self.channels=[]
def generate_H5FileName(self):
prefix,suffix=cf.PrefixSuffix(self.Spike2FileName)
return prefix+'.h5'
pass
def generate_PickleFileName(self):
prefix,suffix=cf.PrefixSuffix(self.Spike2FileName)
return prefix+'.cPickle'
pass
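#compact the h5 file in place: h5repack copies it into a temporary file, which is then moved back over the original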
def H5FileRepack(self):
a=self.H5FileName
b=self.H5FileName+".h5"
os.system("h5repack -i %s -o %s; mv %s %s"%(a,b,b,a))
pass
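#parse the exported Spike2 text file: collect the SUMMARY block, then read every CHANNEL section that follows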
def read_Spike2_file(self):
file=open(self.Spike2FileName)
SummaryAppendFlag=0
SummaryText=[]
# find "Summary"
for i in file:
if i.find("SUMMARY")> -1: SummaryAppendFlag=1
if len(i)<=2 and SummaryAppendFlag==1: SummaryAppendFlag=0
if SummaryAppendFlag==1: SummaryText.append(i)
if SummaryText!=[] and SummaryAppendFlag==0: break
# find all the Channel Descriptions in the Summary
channels=[]
## in principle, no need to read the summary; all this information is in the individual channel headers
if 0: self.channels=self.read_summary(SummaryText)
# go through the file and find all the channels on-the-fly
i=file.next()
while i:
if i.find("CHANNEL")>-1:
ch=ChannelClass()
ch.H5FileName=self.H5FileName
file,i=ch.read_complete_channel(file,i)
self.channels.append(ch)
else:
try:
i=file.next()
except:
break
#self.H5FileRepack()
return self.channels
def Pickle_Spike2(self):
self.channels=self.read_Spike2_file()
file=open(self.PickleFileName,"w")
cPickle.dump(self.channels,file,2)
file.close()
def read_summary(self,SummaryText):
self.channels=[]
for i in SummaryText[1:]:
channel=ChannelClass()
i_split=i.split("\t")
channel.ID = re.search(pat_string,i_split[0]).group(1)
channel.Type = re.search(pat_string,i_split[1]).group(1)
channel.Title = re.search(pat_string,i_split[2]).group(1)
if channel.Type=="Waveform":
channel.Unit = re.search(pat_string,i_split[3]).group(1)
pass
self.channels.append(channel)
return self.channels
class ChannelClass:
def __init__(self):
self.Title=''
self.Unit=''
self.Desc=''
self.ID =''
self.Type=''
self.Data=[]
def read_channel_description(self,file,firstLine):
Description=[firstLine]
for i in file: #stop when one of the keywords CHANNEL, GAP or START is found, or when a number could be read
if i.find("CHANNEL")>-1 or i.find("START")>-1 or i.find("GAP")>-1:
break
try:
value=float(i.split("\t")[0])
break
except:
Description.append(i)
return file,i,Description
def analyze_channel_description(self,Description):
if len(Description)>0: self.ID=re.search(pat_two_strings,Description[0]).group(2)
if len(Description)>1:
self.Type=re.search(pat_string,Description[1]).group(1)
if len(Description)>2:
self.Desc=re.search(pat_string,Description[2]).group(1)
if len(Description)>3:
self.Title=re.search(pat_string,Description[3]).group(1)
if self.Type=="Waveform" and len(Description)>5:
self.Unit=re.search(pat_string,Description[5]).group(1)
return self
def read_channel_data(self,file,FirstLine):
# i is the first entry of the data
self.Data=[]
i=FirstLine
if self.Type=="Waveform":
PieceNumber=0
while 1:
while not ( i.find("GAP")>-1 or i.find("START")>-1 or i.find("CHANNEL")>-1):
i=file.next()
if i.find("CHANNEL")>-1:
break
data=DataWaveform()
data.ID=data.generate_ID(self.ID,PieceNumber)
data.H5FileName=self.H5FileName
file,i=data.read_channel_data_waveform_onepiece(file,i)
self.Data.append(data)
PieceNumber=PieceNumber+1
pass
pass
if self.Type=="Evt+":
pass
if self.Type=="Marker":
while 1:
data=DataMarker()
try:
data.Time=float(i.split("\t")[0])
data.Marker=re.search(pat_string,i.split("\t")[1]).group(1)[0]
self.Data.append(data)
i=file.next()
except:
break
pass
return file,i
def read_complete_channel(self,file,FirstLine):
file,FirstLine,Description=self.read_channel_description(file,FirstLine)
self.analyze_channel_description(Description)
file,FirstLine=self.read_channel_data(file,FirstLine)
return file,FirstLine
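#return the indices of all Data entries that carry the given Marker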
def get_MarkerBlocks(self,Marker):
IndexList=[]
for i in range(len(self.Data)):
if self.Data[i].Marker==Marker: IndexList.append(i)
return IndexList
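#build [start,end] time windows from the positions of the given Marker; the last window is closed with a large sentinel time (1e7)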
def make_TimeList(self,Marker,MarkerLength,Length):
TimeList=[]
IndexList=self.get_MarkerBlocks(Marker)
for i in IndexList:
a=self.Data[i].Time
if i==len(self.Data)-1 : b =1e7
else: b= self.Data[i+1].Time
TimeList.append([a,b])
return TimeList
def get_ValueList(self,ControlChannel,Marker,MarkerLength,Length,ZeitFenster):
a,b=ZeitFenster
#ControlChannel contains the markers that denote the beginning of each pulse.
#Idea: for a given time window [a:b] I only write those values into the
#Value_List that lie in the middle of each marker of the ControlChannel.
#The length of each marker is assumed to be MarkerLength; the data to be
#extracted sits in the middle of each segment, with Length seconds of
#buffer towards both edges.
#If ControlChannel==None there is no control, and all available data in
#the time window [a:b] is returned (again with Length as buffer).
#
#If a GAP lies in between, I have to take the pieces [a:GAP_start] and
#[GAP_end:b] into account.
TimeList_bis=[]
for i in range(len(self.Data)):
a_bis=max(a,self.Data[i].BeginTime)
b_bis=min(b,self.Data[i].EndTime)
if (b_bis>a_bis):
TimeList_bis.append([a_bis,b_bis])
pass
pass
TimeList=TimeList_bis
#If there is a ControlChannel, I have to cut the TimeList into the smaller
#pieces given by MarkerLength. The buffer correction comes afterwards.
if ControlChannel!=None:
test=0
for i in ControlChannel.Data:
if i.Marker==Marker:
#there are markers of the requested kind in the control channel
test=1
break
if test:
TimeList=[]
for i in TimeList_bis:
[a,b]=i
for j in ControlChannel.Data:
if(j.Marker==Marker and j.Time >= a and j.Time+MarkerLength <= b):
TimeList.append([j.Time,j.Time+MarkerLength])
pass
pass
pass
#buffer correction
TimeList=[[i[0]+Length,i[1]-Length] for i in TimeList]
#and finally the values we are interested in!!!
ValueList=[]
for i in self.Data:
for j in TimeList:
[a,b]=j
if (a>i.BeginTime and b < i.EndTime):
a_ind=int(scipy.round((a-i.BeginTime)/i.DT))
b_ind=a_ind+int(scipy.floor((b-a)/i.DT))
data=DataWaveform()
data.DT=i.DT
data.BeginTime=i.BeginTime+a_ind*data.DT
data.EndTime=i.BeginTime+b_ind*data.DT
data.Values=i.Values[a_ind:b_ind]
ValueList.append(data)
pass
pass
pass
return ValueList #this is a list of DataWaveform objects
def calc_correlation_from_list(self,Values1,Values2,win,):
CoList=[]
for i in range(len(Values1)):
co=cf.correlate_FT(Values1[i],Values2[i],win)
if co!=None: CoList.append(co)
if len(CoList)==0: return [],()
N=len(CoList[0])
Co_mean=[scipy.mean([i[j] for i in CoList]) for j in range(N)]
Co_std =[scipy.std ([i[j] for i in CoList]) for j in range(N)]
return [[Co_mean[i],Co_mean[i]/scipy.sqrt(N),Co_std[i]] for i in
range(N)],(len(CoList),len(Values1[0]))
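#power spectrum estimation: cut the selected waveform pieces into blocks of Ng samples, correlate each block with itself (via cf.correlate_FT) and average; normalization follows the Numerical Recipes convention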
def calc_powerspectrum(self,MarkerChannel,Marker,MarkerLength,Length,ControlChannel,deltafg,cutoff=None,zusammen=0):
TimeList=MarkerChannel.make_TimeList(Marker,MarkerLength,Length)
ValueList_bis=[]
for i in TimeList:
ValueList_bis.append(self.get_ValueList(ControlChannel,Marker,MarkerLength,Length,i))
#ValueList_bis should now contain a list of lists of DataWaveform objects!
#The first level corresponds to the main subdivisions.
#In principle a power spectrum should be computed for each of them,
#which is why this procedure returns a list of power spectra.
#Exception: the zusammen flag is 1, useful when computing the spectrum
#without a stimulus.
if len(ValueList_bis)==0: return []
if len(ValueList_bis[0])==0: return []
fs=1./ValueList_bis[0][0].DT
Ng=cf.calc_Number_of_Points_for_Fourier_Transform(fs,deltafg,verbose=0)
win=scipy.signal.signaltools.get_window("triang",Ng)
ValueList=[]
Values=[]
for i in ValueList_bis:
if not zusammen: Values=[]
for j in i:
Values.extend([j.Values[k*Ng:(k+1)*Ng] for k in range(len(j.Values)/Ng)])
if not zusammen: ValueList.append(Values)
if zusammen: ValueList=[Values]
psd=[]
for i in ValueList:
psd_hier=self.calc_correlation_from_list(i,i,win)
if len(psd_hier[0])>0:
psd.append(psd_hier)
sumw=scipy.sum(win**2)
psd=[[scipy.r_[i[0]]/sumw/Ng,i[1]] for i in psd] #convention as in Numerical Recipes; for L^2/Hz it still has to be divided by the bin width
T=Ng*1./fs
psd=[[[[j/T,i[0][j][0],i[0][j][1],i[0][j][2]] for j in range(Ng/2)],i[1]] for i in psd]
return psd
def main(argv):
Spike2FileName=argv[1]
lauf=LaufClass()
lauf.Spike2FileName=Spike2FileName
lauf.H5FileName=lauf.generate_H5FileName()
lauf.PickleFileName=lauf.generate_PickleFileName()
lauf.Pickle_Spike2()
file=open(lauf.PickleFileName)
lauf.channels=cPickle.load(file)
file.close()
sys.exit()
for i in lauf.channels:
print i.Title,i.Type,i.Desc,i.ID,i.Unit
## if i.Type=="Waveform":
## print [(j.BeginTime,len(j.Values)) for j in i.Data]
## if i.Type=="Marker":
## print len(i.Data)
## if i.Type=="Waveform":
## pass
for i in ["0","a","b","c","d","e","f","g","h","i","j","k"][0:2]:
print i
zusammen=1
channel_number=0
psd=lauf.channels[channel_number].calc_powerspectrum(lauf.channels[3],i,1.01,0.1,lauf.channels[4],2,cutoff=None,zusammen=zusammen)
for j in range(len(psd)):
FileName=Spike2FileName+'_'+lauf.channels[channel_number].Title+'_'+i+'_'+str(j)+'.psd'
if zusammen:
FileName=Spike2FileName+'_'+lauf.channels[channel_number].Title+'_'+i+'_'+'all'+'.psd'
file=open(FileName,'w')
for kk in psd[j][0]:
k=abs(scipy.r_[kk])
#columns: frequency, ps (normalized as in Numerical Recipes, but 2-sided), (standard deviation, theoretical), (standard deviation, experimental)
file.write('%g %g %g %g\n'%(k[0],k[1],k[2],k[3]))
file.close()
if __name__ == "__main__":
# import psyco
# psyco.bind(main)
sys.exit(main(sys.argv))
and a sample input file is here:
"INFORMATION"
"intens 1753.smr"
""
""
""
""
""
"SUMMARY"
"1" "Waveform" "Laser" "nm" 50000 41666.666667 10000 0
"2" "Waveform" "Stimulus" "V" 50000 41666.666667 1 0
"5" "Waveform" "nerve" "mV" 50000 41666.666667 10 0
"31" "Marker" "Keyboard"
"32" "Marker" "untitled"
"CHANNEL" "1"
"Waveform"
"wegamplitude"
"Laser"
"nm" 50000
"START" 0.000000 0.000024
-24738.8
-24744.8
-24750.8
-24756.8
-24762.8
-24768.8
-24774.8
-24780.8
-24786.8
-24792.8
-24798.7
-24804.7
-24810.7
-24816.7
-24822.7
-24828.7
"CHANNEL" "2"
"Waveform"
"Spannung des elektrostatischen Reizes"
"Stimulus"
"V" 50000
"START" 0.000006 0.000024
-0.00046
-0.00061
-0.00031
-0.00046
-0.00015
-0.00015
0
-0.00031
-0.00061
0
-0.00046
-0.00046
-0.00046
-0.00031
-0.00015
-0.00031
"CHANNEL" "5"
"Waveform"
"ephysiologie"
"nerve"
"mV" 50000
"START" 0.000018 0.000024
-0.017
-0.0323
0.0028
-0.0201
-0.0323
-0.0231
-0.0307
-0.0292
-0.0155
-0.0155
-0.017
0.0043
0.0028
0.0028
0.0058
0.0287
"CHANNEL" "32"
"Marker"
"No comment"
"untitled"
3.271452 "a???" 97 0 0 0
4.282449 "a???" 97 0 0 0
Bye, Bjoern
Francesc Altet wrote :
> A Dimarts 13 Juny 2006 17:21, Björn Nadrowski va escriure:
> > Hello,
> > I am running pytables on two machines, both ubuntu dapper drake.
> > One is an amd 64 bit machine with kernel 2.6.15,
> > the other an intel pentium m with kernel 2.6.17-rc6
> >
> > I am running an identical program on both machines,
> > writing large amounts of data to disk with zlib and complevel=7.
> > It turns out that the resulting file is more than twice as large
> > on the intel machine than on the amd machine.
> >
> > Is there a reason for this behaviour?
>
> Well, it's difficult to say what is happening without looking at some example
> of your code, but if it really happens then something is going on wrong. Can
> you send a small program reproducing this?
>
> Regards,
>
> --
> >0,0< Francesc Altet http://www.carabos.com/
> V V Cárabos Coop. V. Enjoy Data
> "-"