Here's a version that passes all the tests and is twice as fast as the
'current' and 'new' implementations on random binary data. I haven't been
able to generate any data on which this 'mike' version is slower:
def read_to_boundary(self, req, boundary, file, readBlockSize=65536):
    # Mike Looijmans' proposed implementation: one chunk of lookbehind.
    # `prevline` holds the previously read chunk; it is only written out
    # once the next read proves it was not the final line before the
    # boundary.  A lone trailing '\r' (possibly the first half of a CRLF
    # split across two reads) is held back in `carry` the same way.
    prevline = ""
    last_bound = boundary + '--'  # NOTE(review): computed but never used
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                # Drop the CRLF that delimits the boundary line.
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and (prevline[-1] == '\n'):
                # The delimiting CRLF was split across reads: the '\r' is
                # in carry, the '\n' ends prevline; drop both.
                file.write(prevline[:-1])
                break
            # If we get here, it's not really a boundary!
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            # Hold back the trailing '\r'; it may belong to the CRLF
            # that precedes the boundary.
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line
I've attached a modified upload_test_harness.py that includes the 'new'
and 'current' implementations, as well as the 'org' version (as in the
3.1 release) and the 'mike' version.
In addition, I added some profiling calls to show the impact of the
extra 'endswith' calls and slices.
--
Mike Looijmans
Philips Natlab / Topic Automation
#!/usr/bin/env python
import sys
from cStringIO import StringIO
import md5
##def generate_split_file(offset=-1,
## readBlockSize=65368,
## fname='testfile'):
## f = open(fname, 'wb')
## f.write('a'*50)
## f.write('\r\n')
## block_size = readBlockSize + offset
## f.write('b'*block_size)
## f.close()
def read_to_boundary_current(self, req, boundary, file, readBlockSize):
    ''' current version

    Copy the MIME part body from req to file, stopping at the boundary
    (or final boundary) line.  Line-ending delimiters are withheld for
    one iteration so the CRLF that precedes the boundary is never
    written to the output.
    '''
    #
    # Although technically possible for the boundary to be split by the read, this will
    # not happen because the readBlockSize is set quite high - far longer than any boundary line
    # will ever contain.
    #
    # lastCharCarried is used to detect the situation where the \r\n is split across the end of
    # a read block.
    #
    delim = ''
    lastCharCarried = False
    last_bound = boundary + '--'
    roughBoundaryLength = len(last_bound) + 128
    line = req.readline(readBlockSize)
    lineLength = len(line)
    # Only strip() lines short enough to possibly be a boundary line.
    if lineLength < roughBoundaryLength:
        sline = line.strip()
    else:
        sline = ''
    while lineLength > 0 and sline != boundary and sline != last_bound:
        if not lastCharCarried:
            # Flush the delimiter withheld from the previous chunk.
            file.write(delim)
            delim = ''
        else:
            lastCharCarried = False
        cutLength = 0
        if lineLength == readBlockSize:
            # A full-size read may have stopped mid-line; a trailing
            # '\r' could be the first half of a CRLF split across reads.
            if line[-1:] == '\r':
                delim = '\r'
                cutLength = -1
                lastCharCarried = True
        if line[-2:] == '\r\n':
            delim += '\r\n'
            cutLength = -2
        elif line[-1:] == '\n':
            delim += '\n'
            cutLength = -1
        if cutLength != 0:
            file.write(line[:cutLength])
        else:
            file.write(line)
        line = req.readline(readBlockSize)
        lineLength = len(line)
        if lineLength < roughBoundaryLength:
            sline = line.strip()
        else:
            sline = ''
def read_to_boundary_new(self, req, boundary, file, readBlockSize):
    ''' Alexis' version
    read from the request object line by line with a maximum size,
    until the new line starts with boundary

    Trailing line delimiters are withheld for one iteration so the
    CRLF preceding the boundary is not written to the output.
    '''
    previous_delimiter = ''
    while 1:
        line = req.readline(readBlockSize)
        # An empty read means EOF; stop instead of spinning forever on
        # input that lacks a terminating boundary line (the original
        # looped infinitely in that case).
        if not line or line.startswith(boundary):
            break
        if line.endswith('\r\n'):
            file.write(previous_delimiter + line[:-2])
            previous_delimiter = '\r\n'
        elif line.endswith('\r') or line.endswith('\n'):
            file.write(previous_delimiter + line[:-1])
            previous_delimiter = line[-1:]
        else:
            file.write(previous_delimiter + line)
            previous_delimiter = ''
def read_to_boundary_org(self, req, boundary, file, readBlockSize):
    # Original (3.1 release) version, kept for comparison.  Reads
    # line-sized chunks and withholds a trailing newline until the next
    # chunk proves it was not the delimiter before the boundary.
    #
    # NOTE(review): when a readBlockSize-sized read ends in a bare '\r'
    # (a CRLF split across two reads), the '\r' is written through
    # unconditionally and the following chunk starting with '\n' is not
    # recognized as a boundary prefix - this is the corruption the
    # harness below demonstrates.
    delim = ""
    line = req.readline(readBlockSize)
    while line and not line.startswith(boundary):
        odelim = delim
        if line[-2:] == "\r\n":
            delim = "\r\n"
            line = line[:-2]
        elif line[-1:] == "\n":
            delim = "\n"
            line = line[:-1]
        else:
            delim = ""
        file.write(odelim + line)
        line = req.readline(readBlockSize)
def read_to_boundary_mike(self, req, boundary, file, readBlockSize=65536):
    """Mike Looijmans' version: one chunk of lookbehind.

    `prevline` holds the previously read chunk; it is only written out
    once the next read proves it was not the final line before the
    boundary.  A lone trailing '\\r' (possibly the first half of a CRLF
    split across two reads) is held back in `carry` the same way.

    Fixes relative to the posted version: the unused `last_bound` local
    is gone; EOF without a boundary terminates instead of looping
    forever; `prevline[-1:]` avoids an IndexError on an empty prevline.
    """
    prevline = ""
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                # Drop the CRLF that delimits the boundary line.
                file.write(prevline[:-2])
                break
            elif carry == '\r' and prevline[-1:] == '\n':
                # The delimiting CRLF was split across reads: the '\r'
                # is in carry, the '\n' ends prevline; drop both.
                file.write(prevline[:-1])
                break
            elif not line:
                # EOF with no boundary and no trailing CRLF: flush
                # everything we were holding back and stop.
                if carry is not None:
                    file.write(carry)
                file.write(prevline)
                break
            # If we get here, it's not really a boundary!
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            # Hold back the trailing '\r'; it may belong to the CRLF
            # that precedes the boundary.
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line
def get_checksum(fname):
    """Return the hex MD5 digest of the contents of file `fname`."""
    # hashlib replaces the long-deprecated md5 module (works on both
    # Python 2.5+ and Python 3).  Imported locally to keep the file's
    # top-level imports untouched.
    import hashlib
    f = open(fname, 'rb')
    try:
        data = f.read()
    finally:
        # The original leaked the file handle; close it explicitly.
        f.close()
    return hashlib.md5(data).hexdigest()
def generate_embedded_cr_file(offset=-1, readBlockSize=65368, chunk='\r', fname='testfile'):
    """Write a test file whose `chunk` lands at the edge of a read block.

    With the default offset of -1 the chunk straddles the block
    boundary, which is the case known to corrupt the file upload.
    """
    # Assemble the whole payload first, then write it in one go.
    payload = [
        'a' * 50,
        '\r\n',
        'b' * (readBlockSize + offset),
        chunk,
        'ccc',
        'd' * 50,
        '\r\n',
    ]
    f = open(fname, 'wb')
    f.write(''.join(payload))
    f.close()
def generate_split_boundary_file(offset=-1, readBlockSize=65368, chunk='\r', fname='testfile'):
    """Write a test file so the boundary string '\\r\\n--myboundary'
    (appended later by main()) starts at readBlockSize - offset.
    """
    # Padding sized so that `chunk` sits right at the read-block edge.
    payload = 'a' * 50 + '\r\n' + 'b' * (readBlockSize + offset) + chunk
    f = open(fname, 'wb')
    f.write(payload)
    f.close()
# Every implementation under test, in reporting order.
read_boundaries = [
    read_to_boundary_current,
    read_to_boundary_new,
    read_to_boundary_org,
    read_to_boundary_mike,
]
def main(file_generator, offset, chunk, block_size=1<<16):
fname_in = 'testfile.in'
fname_out_base = 'testfile.out'
file_generator(offset=offset, readBlockSize=block_size, chunk=chunk, fname=fname_in)
orig_checksum = get_checksum(fname_in)
req = StringIO()
req.write(open(fname_in, 'rb').read())
req.write('\r\n--myboundary\r\n')
src_cs = get_checksum(fname_in)
print ' src', src_cs
for rtb in read_boundaries:
name = rtb.__name__.split('_')[-1]
fname_out = fname_out_base + name
o = file(fname_out, 'wb')
req.seek(0)
rtb(None, req, '--myboundary', o, block_size)
size = o.tell()
o.close()
cs = get_checksum(fname_out)
print "%8s %s %6d" % (name, cs, size),
if cs != src_cs:
print 'FAIL'
else:
print 'PASS'
def cname(ch):
    """Return a printable name for a line-ending character.

    '\\r' -> 'CR', '\\n' -> 'LF', '' -> 'None'; anything else is shown
    as its ordinal value.
    """
    names = {'\r': 'CR', '\n': 'LF', '': 'None'}
    if ch in names:
        return names[ch]
    return ord(ch)
class DevNull:
    """Write-only file-like sink that discards everything, like /dev/null."""

    def write(self, data):
        """Accept `data` and drop it."""
        return None
if __name__ == '__main__':
    # --- Correctness tests ------------------------------------------------
    #test_chunks = ['', '\r', '\n', '\r\n']
    # only test the chunks that are currently a problem
    test_chunks = ['', '\r',]
    test_cases = (generate_embedded_cr_file, generate_split_boundary_file)
    for file_gen_obj in test_cases:
        print '='*40
        print file_gen_obj.__name__
        for chunk in test_chunks:
            for i in range(-1, 0):
                print '-'*40
                print 'test offset', i, 'chunk',[ cname(c) for c in chunk ]
                print '-'*40
                main(file_gen_obj, i, chunk)
                print
        print
    ## sys.exit(0)
    # Run profiling benchmarks with the four files
    # (on my system, the _mike and _org versions are typically almost
    # twice as fast as the _new and _current)
    import profile, random
    req = StringIO()
    blocksize = 102400
    block=[]
    r = random.Random()
    # Build one 100 KiB block of random byte values, repeated 64 times
    # below and terminated with the boundary line.
    for i in xrange(blocksize):
        block.append(chr(r.randint(0,255)))
    block = "".join(block)
    for c in range(64):
        req.write(block)
    req.write('\r\n--myboundary\r\n')
    print "Benchmark bytes:", req.tell()
    out = DevNull()
    for rtb in read_boundaries:
        req.seek(0)
        print rtb.__name__
        profile.run("%s(None, req, '--myboundary', out, 65536)" % rtb.__name__)