Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-08 Thread Mike Looijmans
Here's one that passes all the tests, and is 2x as fast as the 'current' 
and 'new' implementations on random binary data. I haven't been able to 
generate data where the 'mike' version is slower:


def read_to_boundary(self, req, boundary, file, readBlockSize=65536):
    prevline = ""
    last_bound = boundary + '--'
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and (prevline[-1] == '\n'):
                file.write(prevline[:-1])
                break
            # If we get here, it's not really a boundary!
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line
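
As a quick standalone check (a hypothetical driver, not part of the attached
harness): 'req' only needs a readline(size) method, so a cStringIO object can
stand in for the request object:

from cStringIO import StringIO

# Hypothetical driver: any object with readline(size) works as 'req'.
payload = 'binary\rdata with a bare CR\r\nand a second line'
src = StringIO(payload + '\r\n--myboundary--\r\n')
out = StringIO()

read_to_boundary(None, src, '--myboundary', out)

assert out.getvalue() == payload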

I've attached a modified upload_test_harness.py that includes the 'new' and
'current' versions, plus the 'org' version (as in the 3.1 release) and the
'mike' version.


In addition, I added some profiling calls to show the impact of the 
extra 'endswith' and slices.
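
As a rough illustration of what those profiling calls are getting at (a
hypothetical micro-benchmark, not the harness code), the per-block cost of an
endswith() check, a two-byte slice compare, and a full slice copy can be
timed like this:

import timeit

# Hypothetical micro-benchmark: per-call cost on a full 64 KB block.
setup = "line = 'x' * 65536 + '\\r\\n'"
for stmt in ("line.endswith('\\r\\n')",   # cheap check, no copy
             "line[-2:] == '\\r\\n'",     # two-byte slice, then compare
             "line[:-2]"):                # copies the whole 64 KB block
    t = timeit.Timer(stmt, setup).timeit(number=100000)
    print '%-24s %.3f s' % (stmt, t)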


--
Mike Looijmans
Philips Natlab / Topic Automation
#!/usr/bin/env python

import sys
from cStringIO import StringIO
import md5

##def generate_split_file(offset=-1,
##  readBlockSize=65368,
##  fname='testfile'):
##f = open(fname, 'wb')
##f.write('a'*50)
##f.write('\r\n')
##block_size =  readBlockSize + offset
##f.write('b'*block_size)
##f.close() 

def read_to_boundary_current(self, req, boundary, file, readBlockSize):
    ''' currrent version '''
    #
    # Although technically possible for the boundary to be split by the read, this will
    # not happen because the readBlockSize is set quite high - far longer than any boundary line
    # will ever contain.
    #
    # lastCharCarried is used to detect the situation where the \r\n is split across the end of
    # a read block.
    #
    delim = ''
    lastCharCarried = False
    last_bound = boundary + '--'
    roughBoundaryLength = len(last_bound) + 128
    line = req.readline(readBlockSize)
    lineLength = len(line)
    if lineLength < roughBoundaryLength:
        sline = line.strip()
    else:
        sline = ''
    while lineLength > 0 and sline != boundary and sline != last_bound:
        if not lastCharCarried:
            file.write(delim)
            delim = ''
        else:
            lastCharCarried = False
        cutLength = 0

        if lineLength == readBlockSize:
            if line[-1:] == '\r':
                delim = '\r'
                cutLength = -1
                lastCharCarried = True

        if line[-2:] == '\r\n':
            delim += '\r\n'
            cutLength = -2
        elif line[-1:] == '\n':
            delim += '\n'
            cutLength = -1
        if cutLength != 0:
            file.write(line[:cutLength])
        else:
            file.write(line)

        line = req.readline(readBlockSize)
        lineLength = len(line)
        if lineLength < roughBoundaryLength:
            sline = line.strip()
        else:
            sline = ''

def read_to_boundary_new(self, req, boundary, file, readBlockSize):
    ''' Alexis' version
    read from the request object line by line with a maximum size,
    until the new line starts with boundary
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(readBlockSize)
        if line.startswith(boundary):
            break

        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'

        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]

        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

def read_to_boundary_org(self, req, boundary, file, readBlockSize):
    delim = ""
    line = req.readline(readBlockSize)
    while line and not line.startswith(boundary):
        odelim = delim
        if line[-2:] == "\r\n":
            delim = "\r\n"
            line = line[:-2]
        elif line[-1:] == "\n":
            delim = "\n"
            line = line[:-1]
        else:
            delim = ""
        file.write(odelim + line)
        line = req.readline(readBlockSize)

def read_to_boundary_mike(self, req, boundary, file, readBlockSize=65536):
    prevline = ""
    last_bound = boundary + '--'
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and (prevline[-1] == '\n'):
                file.write(prevline[:-1])
                break
            # If we get here, it's not really a boundary!
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line

Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-08 Thread Alexis Marrero
Thanks for that improvement; I don't like its complexity, though. I'm
testing Mike's version with my set of files and will let you all know
how it goes.


BTW, the line that reads last_bound = boundary + '--' isn't actually used in
this version, so we could drop it and save 4 CPU cycles there :)


The next test that I will run this against will be with an obscene  
amount of data for which this improvement helps a lot!


/amn
On Nov 8, 2005, at 4:47 AM, Mike Looijmans wrote:

Here's one that passes all the tests, and is 2x as fast as the  
'current' and 'new' implementations on random binary data. I  
haven't been able to generate data where the 'mike' version is slower:


def read_to_boundary(self, req, boundary, file, readBlockSize=65536):
    prevline = ""
    last_bound = boundary + '--'
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and (prevline[-1] == '\n'):
                file.write(prevline[:-1])
                break
            # If we get here, it's not really a boundary!
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line

I've attached a modified upload_test_harness.py that includes the  
new and current, also the 'org' version (as in 3.1 release) and the  
'mike' version.


In addition, I added some profiling calls to show the impact of the  
extra 'endswith' and slices.


--
Mike Looijmans
Philips Natlab / Topic Automation

Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-08 Thread Alexis Marrero
Inspired by Mike's changes, I made some changes to the new version to
improve performance while keeping readability:


def read_to_boundary_new(self, req, boundary, file, readBlockSize):
    previous_delimiter = ''
    bound_length = len(boundary)
    while 1:
        line = req.readline(readBlockSize)
        if line[:bound_length] == boundary:
            break

        if line[-2:] == '\r\n':
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'

        elif line[-1:] == '\r':
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = '\r'

        elif line[-1:] == '\n':
            if len(line[:-1]) > 0:
                file.write(previous_delimiter + line[:-1])
                previous_delimiter = '\n'

            else:
                previous_delimiter += '\n'

        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

Results are very comparable to read_to_boundary_mike

/amn
On Nov 8, 2005, at 9:07 AM, Jim Gallacher wrote:


Mike Looijmans wrote:

I've attached a modified upload_test_harness.py that includes the  
new and current, also the 'org' version (as in 3.1 release) and  
the 'mike' version.




Nice changes, Mike.

I started to get confused by the names of the various  
read_to_boundary_* functions, so I've made a slight modification to  
your code and renamed these functions.


Note also that there are some failures for the split_boundary file
using offset -2 chunk [CR], so I've changed the default offset to
include this condition.


I've also changed the generate_*file functions to embed an  
additional '\r' in a short string. Just being paranoid I guess.


Diff is attached.

If it's useful I can stick the upload_test_harness code in the  
mod_python svn repository.


Jim

--- upload_test_harness_mike.py.orig  2005-11-08 08:02:13.0 -0500
+++ upload_test_harness_mike.py  2005-11-08 08:54:15.0 -0500
@@ -14,8 +14,8 @@
 ##f.write('b'*block_size)
 ##f.close()
 
-def read_to_boundary_current(self, req, boundary, file, readBlockSize):
-''' currrent version '''
+def read_to_boundary_324(self, req, boundary, file, readBlockSize):
+''' currrent version  in 3.2.4b'''
 #
 # Although technically possible for the boundary to be split by the read, this will
 # not happen because the readBlockSize is set quite high - far longer than any boundary line
@@ -66,7 +66,7 @@
 else:
 sline = ''
 
-def read_to_boundary_new(self, req, boundary, file, readBlockSize):
+def read_to_boundary_alexis(self, req, boundary, file, readBlockSize):
 ''' Alexis' version
 read from the request object line by line with a maximum size,
 until the new line starts with boundary
@@ -89,7 +89,7 @@
 file.write(previous_delimiter + line)
 previous_delimiter = ''
 
-def read_to_boundary_org(self, req, boundary, file, readBlockSize):
+def read_to_boundary_314(self, req, boundary, file, readBlockSize):
 delim = ""
 line = req.readline(readBlockSize)
 while line and not line.startswith(boundary):
@@ -145,6 +145,8 @@
 
 f = open(fname, 'wb')
 f.write('a'*50)
+f.write('\r')
+f.write('a'*50)
 f.write('\r\n')
 
 block_size =  readBlockSize + offset
@@ -163,6 +165,8 @@
 
 f = open(fname, 'wb')
 f.write('a'*50)
+f.write('\r')
+f.write('a'*50)
 f.write('\r\n')
 
 block_size =  readBlockSize + offset
@@ -171,7 +175,7 @@
 
 f.close()
 
-read_boundaries = [read_to_boundary_current, read_to_boundary_new, read_to_boundary_org, read_to_boundary_mike]
+read_boundaries = [read_to_boundary_324, read_to_boundary_alexis, read_to_boundary_314, read_to_boundary_mike]
 
 def main(file_generator, offset, chunk, block_size=1<<16):
 fname_in = 'testfile.in'
@@ -218,6 +222,9 @@
 pass
 
 if __name__ == '__main__':
+# offsets which show problems are in [-2, -1]
+first_offset = -2
+last_offset = 0
 
 #test_chunks =  ['', '\r', '\n', '\r\n']
 
@@ -229,7 +236,7 @@
 print '='*40
 print file_gen_obj.__name__
 for chunk in test_chunks:
-for i in range(-1, 0):
+for i in range(first_offset, last_offset):
 print '-'*40
 print 'test offset', i, 'chunk',[ cname(c) for c in chunk ]
 print '-'*40





Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-07 Thread Alexis Marrero
Sorry for all these emails, but my system depends 100% on mod_python,
especially file uploading. :)


On Nov 7, 2005, at 2:04 PM, Jim Gallacher wrote:


Alexis Marrero wrote:


Jim,
Nicolas,
Thanks for sending the function that creates the test file.  
However I ran it to create the test file, and after uploading the  
file the MD5 still the same.




Did you call it with the same block size as you are using in your  
code? The '\r' character must appear in the file right at the  
readBlockSize boundary.
YES, I ran it with generate_file(offset=-1, readBlockSize=1<<16, fname='testfile')

and the MD5 of the input and output files match.



ie.

generate_file(offset=-1, readBlockSize=1<<16, fname='testfile')

Jim

PS. Please make sure you cc. python-dev when replying. It's best to  
keep discussion on the list.








Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-07 Thread Jim Gallacher

Alexis Marrero wrote:
Sorry for all this emails, 


No worries. It's a bug that needs to be fixed, so your work will benefit 
everyone. :)


Jim


Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-07 Thread Jim Gallacher

Alexis Marrero wrote:

Ok. Now I'm confused.


So am I!

I've created a test harness so we can bypass mod_python completely. It 
includes a slightly modified version of read_to_boundary which adds a 
new parameter, readBlockSize.
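
The core idea, condensed (a hypothetical sketch, not the attached harness
itself): feed one of the read_to_boundary_* variants a generated payload with
the boundary appended, then compare MD5 digests of input and output.

import md5
from cStringIO import StringIO

# Hypothetical helper: 'reader' is one of the read_to_boundary_* variants.
def check(reader, data, boundary='--myboundary', block_size=1<<16):
    src = StringIO(data + '\r\n' + boundary + '--\r\n')
    out = StringIO()
    reader(None, src, boundary, out, block_size)
    return md5.new(data).hexdigest() == md5.new(out.getvalue()).hexdigest()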


In the output from the test harness, your version is 'new' and the 
current version is 'cur'. Run it and see what happens.


Jim

$ ./upload_test_harness


generate_embedded_cr_file

test offset -1 chunk []

src 5a63347d1106afdfa264b2a61f81ae82
cur 5a63347d1106afdfa264b2a61f81ae82 PASS
new 5a63347d1106afdfa264b2a61f81ae82 PASS


test offset -1 chunk ['CR']

src 82204e52343d5b25c2e783cd59499973
cur e4af2eee73029642a114697ba59217b3 FAIL
new 82204e52343d5b25c2e783cd59499973 PASS


generate_split_boundary_file

test offset -1 chunk []

src d481990a0f0bbd8acf847cd732714555
cur d481990a0f0bbd8acf847cd732714555 PASS
new 8fa5ac9f913d778575ea871506c392a9 FAIL


test offset -1 chunk ['CR']

src 8fa5ac9f913d778575ea871506c392a9
cur d481990a0f0bbd8acf847cd732714555 FAIL
new 8fa5ac9f913d778575ea871506c392a9 PASS



What I was trying to say is that I created a file with this function:

def generate_split_file(offset=-1,
                        readBlockSize=65368,
                        fname='testfile'):
    f = open(fname, 'w')
    f.write('a'*50)
    f.write('\r\n')
    block_size =  readBlockSize + offset
    f.write('b'*block_size)
    f.close()

Then I uploaded 'testfile' using the following  
StorageField.read_to_boundary() method:


def read_to_boundary(self, req, boundary, file):
    ''' read from the request object line by line with a maximum size,
    until the new line starts with boundary
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(1<<16)
        if line.startswith(boundary):
            break

        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'

        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]

        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

And the md5 in the client is the same one as in the server.  Do you  
have different results? Let me know.


Regards,
/amn

On Nov 7, 2005, at 2:11 PM, Jim Gallacher wrote:


Jim Gallacher wrote:


Alexis Marrero wrote:


Jim,
Thanks for sending the function that creates the test file.  However 
I ran it to create the test file, and after uploading the  file the 
MD5 still the same.




Just to clarify, is this for your new read_to_boundary or the one  in 
3.2? If it's for yours then the MD5 sum *should* be the same,  since 
that's what you fixed. :)



Did you call it with the same block size as you are using in your  
code? The '\r' character must appear in the file right at the  
readBlockSize boundary.

ie.
generate_file(offset=-1, readBlockSize=1<<16, fname='testfile')











#!/usr/bin/env python

from mkfile import generate_split_file, generate_file
import sys
from StringIO import StringIO
import md5

def read_to_boundary_current(self, req, boundary, file, readBlockSize):
    ''' currrent version '''
    #
    # Although technically possible for the boundary to be split by the read, this will
    # not happen because the readBlockSize is set quite high - far longer than any boundary line
    # will ever contain.
    #
    # lastCharCarried is used to detect the situation where the \r\n is split across the end of
    # a read block.
    #
    delim = ''
    lastCharCarried = False
    last_bound = boundary + '--'
    roughBoundaryLength = len(last_bound) + 128
    line = req.readline(readBlockSize)
    lineLength = len(line)
    if lineLength < roughBoundaryLength:
        sline = line.strip()
    else:
        sline = ''
    while lineLength > 0 and sline != boundary and sline != last_bound:
        if not lastCharCarried:
            file.write(delim)
            delim = ''
        else:
            lastCharCarried = False
        cutLength = 0

        if lineLength == readBlockSize:
            if line[-1:] == '\r':
                delim = '\r'
                cutLength = -1
                lastCharCarried = True

        if line[-2:] == '\r\n':
            delim += '\r\n'
            cutLength = -2
        elif line[-1:] == '\n':
            delim += '\n'
            cutLength = -1
        if cutLength != 0:
            file.write(line[:cutLength])
        else:
            file.write(line)

        line = req.readline(readBlockSize)
        lineLength = len(line)

Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-07 Thread Alexis Marrero

New version of read_to_boundary(...)

readBlockSize = 1 << 16
def read_to_boundary(self, req, boundary, file):
    previous_delimiter = ''
    while 1:
        line = req.readline(readBlockSize)
        if line.strip().startswith(boundary):
            break

        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'

        elif line.endswith('\r'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = '\r'

        elif line.endswith('\n'):
            if len(line[:-1]) > 0:
                file.write(previous_delimiter + line[:-1])
                previous_delimiter = '\n'

            else:
                previous_delimiter += '\n'

        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

This new function passes the test for Jim's file-test generator.

[core:~] amarrero% python filegenerator.py ; md5 testfile ; cp  
testfile testfile.new ; python test.py ; md5 test.bin

MD5 (testfile) = d481990a0f0bbd8acf847cd732714555
MD5 (outputtestfile.bin) = d481990a0f0bbd8acf847cd732714555

I'm running a test with my set of files (6+) to see any other new  
issues.




On Nov 7, 2005, at 6:35 PM, Jim Gallacher wrote:


Alexis Marrero wrote:


Ok. Now I'm confused.



So am I!

I've created a test harness so we can bypass mod_python completely.  
It includes a slightly modified version of read_to_boundary which  
adds a new parameter, readBlockSize.


In the output from the test harness, your version is 'new' and the  
current version is 'cur'. Run it and see what happens.


Jim

$ ./upload_test_harness


generate_embedded_cr_file

test offset -1 chunk []

src 5a63347d1106afdfa264b2a61f81ae82
cur 5a63347d1106afdfa264b2a61f81ae82 PASS
new 5a63347d1106afdfa264b2a61f81ae82 PASS


test offset -1 chunk ['CR']

src 82204e52343d5b25c2e783cd59499973
cur e4af2eee73029642a114697ba59217b3 FAIL
new 82204e52343d5b25c2e783cd59499973 PASS


generate_split_boundary_file

test offset -1 chunk []

src d481990a0f0bbd8acf847cd732714555
cur d481990a0f0bbd8acf847cd732714555 PASS
new 8fa5ac9f913d778575ea871506c392a9 FAIL


test offset -1 chunk ['CR']

src 8fa5ac9f913d778575ea871506c392a9
cur d481990a0f0bbd8acf847cd732714555 FAIL
new 8fa5ac9f913d778575ea871506c392a9 PASS




What I was trying to say is that I created a file with this function:
def generate_split_file(offset=-1,
                        readBlockSize=65368,
                        fname='testfile'):
    f = open(fname, 'w')
    f.write('a'*50)
    f.write('\r\n')
    block_size =  readBlockSize + offset
    f.write('b'*block_size)
    f.close()
Then I uploaded 'testfile' using the following   
StorageField.read_to_boundary() method:

def read_to_boundary(self, req, boundary, file):
    ''' read from the request object line by line with a maximum size,
    until the new line starts with boundary
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(1<<16)
        if line.startswith(boundary):
            break
        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'
        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]
        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''
And the md5 in the client is the same one as in the server.  Do  
you  have different results? Let me know.

Regards,
/amn
On Nov 7, 2005, at 2:11 PM, Jim Gallacher wrote:


Jim Gallacher wrote:



Alexis Marrero wrote:



Jim,
Thanks for sending the function that creates the test file.   
However I ran it to create the test file, and after uploading  
the  file the MD5 still the same.





Just to clarify, is this for your new read_to_boundary or the  
one  in 3.2? If it's for yours then the MD5 sum *should* be the  
same,  since that's what you fixed. :)




Did you call it with the same block size as you are using in  
your  code? The '\r' character must appear in the file right at  
the  readBlockSize boundary.

ie.
generate_file(offset=-1, readBlockSize=1<<16, fname='testfile')










#!/usr/bin/env python

from mkfile import generate_split_file, generate_file
import sys
from StringIO import StringIO
import md5

def read_to_boundary_current(self, req, boundary, file, readBlockSize):
    ''' currrent version '''
    #
    # Although technically possible for the boundary to be split by the read, this will

Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-07 Thread Mike Looijmans
What I don't like at all in this implementation is the large number of
memcpy operations:

1. line.strip()
2. line[:-x]
3. previous_delimiter + ...
The average pass will perform between two and three memcopy operations 
on the read block.


Suggestion: Lose the strip() call - it serves no purpose (the boundary
will be the only thing on that line, otherwise it's not a boundary).


I've built one without any memcpy whatsoever. I've tried it with the
test harness and it appears equal to the Alexis version (it passes all
except the generate_split_boundary_file test):


def read_to_boundary(self, req, boundary, file, readBlockSize=65536):
    prevline = ""
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                file.write(prevline[:-2])
            elif prevline.endswith('\n'):
                file.write(prevline[:-1])
            break
        file.write(prevline)
        prevline = line
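
To see the split-boundary failure concretely, here is a hypothetical
standalone run (block size and payload chosen so the '\r' of the closing
'\r\n' lands exactly at the end of a read block, with the boundary shorter
than a block):

from cStringIO import StringIO

# Hypothetical driver for the function above.
src = StringIO('x' * 15 + '\r\n--b--\r\n')
out = StringIO()
read_to_boundary(None, src, '--b', out, readBlockSize=16)
print repr(out.getvalue())   # 'xxxxxxxxxxxxxxx\r' -- one stray '\r'; should be 'x' * 15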






Alexis Marrero wrote:
 New version of read_to_boundary(...)

 readBlockSize = 1 << 16
 def read_to_boundary(self, req, boundary, file):
 previous_delimiter = ''
 while 1:
 line = req.readline(readBlockSize)
 if line.strip().startswith(boundary):
 break

 if line.endswith('\r\n'):
 file.write(previous_delimiter + line[:-2])
 previous_delimiter = '\r\n'

 elif line.endswith('\r'):
 file.write(previous_delimiter + line[:-1])
 previous_delimiter = '\r'

 elif line.endswith('\n'):
 if len(line[:-1]) > 0:
 file.write(previous_delimiter + line[:-1])
 previous_delimiter = '\n'

 else:
 previous_delimiter += '\n'

 else:
 file.write(previous_delimiter + line)
 previous_delimiter = ''

 This new functions passes the test for Jim's filetest generator.


--
Mike Looijmans
Philips Natlab / Topic Automation



Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Jim Gallacher

Alexis,

Do you have a small file which shows this behaviour and could be used
for testing? Even better would be a function which would generate a test
file. This could be included in the mod_python unit tests.


Jim


Alexis Marrero wrote:
All, 

The current 3.1 mod_python implementation of
mod_python.util.StorageField.read_to_boundary reads as follows:


   203  def read_to_boundary(self, req, boundary, file):
   204      delim = ""
   205      line = req.readline()
   206      sline = line.strip()
   207      last_bound = boundary + "--"
   208      while line and sline != boundary and sline != last_bound:
   209          odelim = delim
   210          if line[-2:] == "\r\n":
   211              delim = "\r\n"
   212              line = line[:-2]
   213          elif line[-1:] == "\n":
   214              delim = "\n"
   215              line = line[:-1]
   216          file.write(odelim + line)
   217          line = req.readline()
   218          sline = line.strip()

As we have discussed previously: 
http://www.modpython.org/pipermail/mod_python/2005-March/017754.html

http://www.modpython.org/pipermail/mod_python/2005-March/017756.html
http://www.modpython.org/pipermail/mod_python/2005-November/019460.html

This triggered a couple of changes in mod_python 3.2 Beta, which reads as
follows:

    33  # Fixes memory error when upload large files such as 700+MB ISOs.
    34  readBlockSize = 65368
    35
   ...
   225  def read_to_boundary(self, req, boundary, file):
   ...
   234      delim = ''
   235      lastCharCarried = False
   236      last_bound = boundary + '--'
   237      roughBoundaryLength = len(last_bound) + 128
   238      line = req.readline(readBlockSize)
   239      lineLength = len(line)
   240      if lineLength < roughBoundaryLength:
   241          sline = line.strip()
   242      else:
   243          sline = ''
   244      while lineLength > 0 and sline != boundary and sline != last_bound:
   245          if not lastCharCarried:
   246              file.write(delim)
   247              delim = ''
   248          else:
   249              lastCharCarried = False
   250          cutLength = 0
   251          if lineLength == readBlockSize:
   252              if line[-1:] == '\r':
   253                  delim = '\r'
   254                  cutLength = -1
   255                  lastCharCarried = True
   256          if line[-2:] == '\r\n':
   257              delim += '\r\n'
   258              cutLength = -2
   259          elif line[-1:] == '\n':
   260              delim += '\n'
   261              cutLength = -1
   262          if cutLength != 0:
   263              file.write(line[:cutLength])
   264          else:
   265              file.write(line)
   266          line = req.readline(readBlockSize)
   267          lineLength = len(line)
   268          if lineLength < roughBoundaryLength:
   269              sline = line.strip()
   270          else:
   271              sline = ''

This function has a mysterious bug in it... For some files which I could
disclose (one of them being the PDF file for Apple's Pages User Manual in
Italian) the uploaded file on the server ends up with the same length
but a different sha512 (the only digest that I'm using).  The problem is a
'\r' in the middle of a chunk of data that is much larger
than readBlockSize.


Anyhow, I wrote a new function, which I believe is much simpler, and
tested it with thousands and thousands of different files, and so far it
seems to work fine.  It reads as follows:


def read_to_boundary(self, req, boundary, file):
    ''' read from the request object line by line with a maximum size,
    until the new line starts with boundary
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(1<<16)
        if line.startswith(boundary):
            break

        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'

        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]

        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

Let me know any comments on it, and if you test it and it fails, please also
let me know. I don't have a subversion account, nor do I know how to
use it, thus this email.


/amn

___
Mod_python mailing list
[EMAIL PROTECTED] mailto:[EMAIL PROTECTED]
http://mailman.modpython.org/mailman/listinfo/mod_python






Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Nicolas Lehuen
Hi guys,

In the pure "if it ain't tested, it ain't fixed" fashion, I've added a
unit test for file upload to the test suite. It uploads a randomly
generated 1 MB file to the server, and checks that the MD5 digest
returned by the server is correct. I could not reproduce Alexis' bug
report this way, however. But then I added a test with the
UNIX-HATERS handbook file ugh.pdf, and bang, here comes the bug.

I've checked both unit tests into subversion, so that you can play with them. I'm now going to test Alexis' fix.

Regards,
Nicolas

2005/11/6, Alexis Marrero [EMAIL PROTECTED]:
I don't have a function that creates the files, but I can point you to a file
that has the problem; ironically it is the Unix Haters Handbook :) Well, at
least it is not the Python HH

http://research.microsoft.com/~daniel/uhh-download.html

Its MD5 is 9e8c42be55aac825e7a34d448044d0fe. I don't know what it ends up
being after upload with read_to_boundary(). When you use the function to copy
the file you will see that the digest will be e45979254297b0ece9c182a789d7966e.

I have 5 other files, out of the 78546 files that I'm testing it against,
that have the same issues; coincidentally they are all PDF files.

Here is the script that I was testing it with.

def read_to_boundary(self, req, boundary, file):
    ''' read from the request object line by line with a maximum size,
    until the new line starts with boundary
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(1<<16)
        if line.startswith(boundary):
            break
        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'
        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]
        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''

#f = file('Perl Bookshelf [4th Ed]/mre/final/ch06.pdf.new', 'a+')
#f = file('Pages User Guide.app/Contents/Resources/Italian.lproj/Pages Manuale Utente.pdf', 'a+')
f = file('ugh.pdf.new', 'a+')
f.write('\r\n--myboundary--\r\n')
f.seek(0)
o = file('test.bin', 'wb')
read_to_boundary(None, f, '--myboundary', o)
o.close()

/amn

On Nov 6, 2005, at 11:58 AM, Jim Gallacher wrote:

Alexis,

I wanted to add that I'm testing your code.

Alexis Marrero wrote:
Let me know any comments on it and if you test it and fails please also let
me know. I don't have subversion account neither I don't know how to use it
thus this email.

You don't need an account to use subversion anonymously. Just install
subversion and grab a mod_python working copy.

$ svn co http://svn.apache.org/repos/asf/httpd/mod_python/trunk trunk

This will checkout a working copy into a new directory called trunk on your
machine. All of the following commands assume you are working in trunk/.

Make your changes in your working copy, and then create a diff with:

$ svn diff lib/python/mod_python/util.py > your-patch.diff

The other commands which you'll find immediately useful are:

svn update - update your working copy from the repository
svn status - shows status of changes in your working copy
svn -u status - shows status of your copy against the repository

I've found Version Control with Subversion is an excellent resource and is
available online.

http://svnbook.red-bean.com/en/1.1/index.html

Jim


Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Nicolas Lehuen
OK, it looks like Alexis' fix solves the problem with ugh.pdf without
breaking the other unit tests. So I think we can safely integrate his
patch. Shall I do it ?

Regards,
Nicolas

2005/11/6, Nicolas Lehuen [EMAIL PROTECTED]:
Hi guys,

In the pure "if it ain't tested, it ain't fixed" fashion, I've added a
unit test for file upload to the test suite. It uploads a randomly
generated 1 MB file to the server, and check that the MD5 digest
returned by the server is correct. I could not reproduce Alexis' bug
report this way, however. But I then I added a test with the
UNIX-HATERS handbook file ugh.pdf, and bang, here comes the bug.

I've checked in both unit tests into subversion, so that you can play with them. I'm now going to test Alexis' fix.

Regards,
Nicolas2005/11/6, Alexis Marrero [EMAIL PROTECTED]:





Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Alexis Marrero
Nicolas,

Not that I'm the one to give permission whether to integrate things or not,
but just to let you know I don't even have svn installed, so I won't do it.
At least not for a while...

BTW, if there are some cherrypy developers in this mailing list, the CherryPy
function that handles file uploads DOES also have the same issue. I'm not
subscribed to the CherryPy dev group, thus the cross posting.

From http://svn.cherrypy.org/trunk/cherrypy/_cpcgifs.py

    24      def read_lines_to_outerboundary(self):
    25          """Internal: read lines until outerboundary."""
    26          next = "--" + self.outerboundary
    27          last = next + "--"
    28          delim = ""
    29          last_line_lfend = True
    30          while 1:
    31              line = self.fp.readline(1<<16)
    32              if not line:
    33                  self.done = -1
    34                  break
    35              if line[:2] == "--" and last_line_lfend:
    36                  strippedline = line.strip()
    37                  if strippedline == next:
    38                      break
    39                  if strippedline == last:
    40                      self.done = 1
    41                      break
    42              odelim = delim
    43              if line[-2:] == "\r\n":
    44                  delim = "\r\n"
    45                  line = line[:-2]
    46                  last_line_lfend = True
    47              elif line[-1] == "\n":
    48                  delim = "\n"
    49                  line = line[:-1]
    50                  last_line_lfend = True
    51              else:
    52                  delim = ""
    53                  last_line_lfend = False
    54              self.__write(odelim + line)

/amn

On Nov 6, 2005, at 4:20 PM, Nicolas Lehuen wrote:

OK, it looks like Alexis' fix solves the problem with ugh.pdf without
breaking the other unit tests. So I think we can safely integrate his
patch. Shall I do it?

Regards,
Nicolas

Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Jim Gallacher
I've been spending some quality time with hexedit, vim and a little bit 
of python. I can now generate a file which can be used in the unit test.


The problem seems to occur when a '\r' character is right at 
readBlockSize boundary, which is 65368 in the current mod_python.util.
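
A minimal illustration of that condition (a hypothetical snippet, separate
from the attached generator): when the '\r' of a '\r\n' pair falls exactly at
the end of a read block, readline(size) hands back a block ending in a bare
'\r', and the matching '\n' only shows up on the next read.

from cStringIO import StringIO

buf = StringIO('aaaaaaa\r\nbbbb\r\n')    # the '\r' sits at offset blockSize - 1
print repr(buf.readline(8))   # 'aaaaaaa\r' -- the block ends in a bare CR
print repr(buf.readline(8))   # '\n'        -- the matching LF arrives on the next read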


I have not tested Alexis's fix yet. It's possible that it doesn't
fix the problem but is just masking it, since his code uses a
readBlockSize of 65536, which will not be a problem for ugh.pdf.


The attached script will generate a file that can be used for testing. I
haven't tried to integrate it into Nicolas's unit test as I wanted to
share my findings with everyone asap. I'll spend some more time on this
tomorrow.


Happy Bug Hunting,
Jim

Nicolas Lehuen wrote:

Hi guys,

In the pure "if it ain't tested, it ain't fixed" fashion, I've added a
unit test for file upload to the test suite. It uploads a randomly 
generated 1 MB file to the server, and check that the MD5 digest 
returned by the server is correct. I could not reproduce Alexis' bug 
report this way, however. But I then I added a test with the UNIX-HATERS 
handbook file ugh.pdf, and bang, here comes the bug.


I've checked in both unit tests into subversion, so that you can play 
with them. I'm now going to test Alexis' fix.



#!/usr/bin/env python

import sys

def generate_file(offset=-1, readBlockSize=65368, fname='testfile'):
    """ Generate a file which causes the error with file upload
    The default offset of -1 should generate a file which will
    be corrupted by the file upload.

    Test Results:

    offset = -2
    235a26d377f0b49ba2fd8706f27dc9e5  testfile.-2.bin
    235a26d377f0b49ba2fd8706f27dc9e5  testfile.-2.bin.res

    offset = -1
    ** THIS ONE FAILS **
    39626fae4bd0812a2522ebe45c23e186  testfile.-1.bin
    e1fc1008bfa9a98dde6c9481e3c83c43  testfile.-1.bin.res

    offset = 0
    8bd673f325f250a90d623defe1fff11d  testfile.0.bin
    8bd673f325f250a90d623defe1fff11d  testfile.0.bin.res

    offset = 1
    d0f4adf904cdc058f0ac7013baa83998  testfile.1.bin
    d0f4adf904cdc058f0ac7013baa83998  testfile.1.bin.res
    """

    # readBlockSize is the same as the value from mod_python.util
    f = open('%s.%d.src' % (fname, offset), 'w')
    f.write('a'*100)
    f.write('\r\n')

    block_size =  readBlockSize + offset
    f.write('b'*block_size)
    f.write('\rccc')

    f.write('d'*100)
    f.write('\r\n')

    f.close()

if __name__ == '__main__':
    generate_file(offset=int(sys.argv[1]))



Re: mod_python.util.StorageField.read_to_boundary has problems in 3.1 and 3.2

2005-11-06 Thread Jim Gallacher

Gregory (Grisha) Trubetskoy wrote:


So I guess this means we roll and vote on a 3.2.5b?



As much as it pains me to say it, yes, this is a must fix, so it's
on to 3.2.5b.


I think we need to do some more extensive testing on Alexis's fix before 
we roll 3.2.5b. His read_to_boundary is much simpler than the current 
one. This makes me wonder if there is some magic happening in the 
current version which is solving some weird problems, or is his code 
just that much better?


I'm feeling a little dull right now so all the code just blurs together. 
It also doesn't help that util.py uses *3 spaces* for the indent. Yikes. 
How the heck did that happen? :(


I'll take another look tomorrow.

Jim