Here's one that passes all the tests, and is 2x as fast as the 'current' and 'new' implementations on random binary data. I haven't been able to generate data where the 'mike' version is slower:

def read_to_boundary(self, req, boundary, file, readBlockSize=65536):
    """Copy MIME-part data from `req` to `file`, stopping at `boundary`.

    Reads with req.readline(readBlockSize), so a logical line may be
    split across reads.  The CRLF that precedes the boundary belongs to
    the boundary, not the data, so each chunk is written one iteration
    late (held in `prevline`), and a chunk-final CR is held back in
    `carry` until the next read shows whether it starts the
    pre-boundary CRLF.
    """
    prevline = ""
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        # startswith(boundary) also matches the final boundary + '--'.
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and prevline.endswith('\n'):
                file.write(prevline[:-1])
                break
            # Not a real boundary terminator.  If the stream is simply
            # exhausted, flush what we are holding and stop -- the
            # previous version fell through here on EOF and spun
            # forever re-reading empty strings.
            if not line:
                if carry is not None:
                    file.write(carry)
                file.write(prevline)
                break
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            # Hold back a chunk-final CR: it may be the first half of
            # the CRLF introducing the boundary on the next read.
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line

I've attached a modified upload_test_harness.py that includes the 'new' and 'current' versions, as well as the 'org' version (as in the 3.1 release) and the 'mike' version.

In addition, I added some profiling calls to show the impact of the extra 'endswith' and slices.

--
Mike Looijmans
Philips Natlab / Topic Automation
#!/usr/bin/env python

import hashlib
import md5
import sys

from cStringIO import StringIO

##def generate_split_file(offset=-1,
##                          readBlockSize=65368,
##                          fname='testfile'):
##    f = open(fname, 'wb')
##    f.write('a'*50)
##    f.write('\r\n')
##    block_size =  readBlockSize + offset
##    f.write('b'*block_size)
##    f.close() 

def read_to_boundary_current(self, req, boundary, file, readBlockSize):
    ''' current version: copy upload data from req to file until a line
        equal to the boundary (or the final boundary + '--') is seen.

        Line terminators are written one iteration late (via delim) so
        the CRLF that belongs to the boundary is not copied into the
        output; lastCharCarried handles a CRLF split across two reads.

        NOTE(review): kept verbatim as the reference implementation
        exercised by the harness below -- it is the version whose
        corruption cases this harness demonstrates.
    '''
    #
    # Although technically possible for the boundary to be split by the read, this will
    # not happen because the readBlockSize is set quite high - far longer than any boundary line
    # will ever contain.
    #
    # lastCharCarried is used to detect the situation where the \r\n is split across the end of
    # a read block.
    #
    delim = ''
    lastCharCarried = False
    last_bound = boundary + '--'
    # Only lines shorter than this can be boundary candidates; longer
    # lines are data and skip the strip()/compare work.
    roughBoundaryLength = len(last_bound) + 128
    line = req.readline(readBlockSize)
    lineLength = len(line)
    if lineLength < roughBoundaryLength:
        sline = line.strip()
    else:
        sline = ''
    while lineLength > 0 and sline != boundary and sline != last_bound:
        if not lastCharCarried:
            file.write(delim)
            delim = ''
        else:
            # The held-back '\r' is already in delim; skip one write.
            lastCharCarried = False
        cutLength = 0

        if lineLength == readBlockSize:
            # A full-size read ending in '\r' may have its '\n' in the
            # next block: hold the '\r' back for one iteration.
            if line[-1:] == '\r':
                delim = '\r'
                cutLength = -1
                lastCharCarried = True

        if line[-2:] == '\r\n':
            delim += '\r\n'
            cutLength = -2
        elif line[-1:] == '\n':
            delim += '\n'
            cutLength = -1
        if cutLength != 0:
            file.write(line[:cutLength])
        else:
            file.write(line)

        line = req.readline(readBlockSize)
        lineLength = len(line)
        if lineLength < roughBoundaryLength:
            sline = line.strip()
        else:
            sline = ''

def read_to_boundary_new(self, req, boundary, file, readBlockSize):
    """Alexis' version: pull lines (capped at readBlockSize) from the
    request and copy them to `file` until one opens with `boundary`.
    Each line's terminator is emitted one iteration late so the one
    preceding the boundary is dropped.
    """
    held = ''
    while True:
        chunk = req.readline(readBlockSize)
        if chunk.startswith(boundary):
            break

        if chunk.endswith('\r\n'):
            file.write(held + chunk[:-2])
            held = '\r\n'
        elif chunk.endswith(('\r', '\n')):
            file.write(held + chunk[:-1])
            held = chunk[-1:]
        else:
            file.write(held + chunk)
            held = ''

def read_to_boundary_org(self, req, boundary, file, readBlockSize):
    """Original (3.1 release) version: stream lines into `file` until
    EOF or a line beginning with `boundary`.  Each trailing newline is
    written one iteration late so the newline owned by the boundary is
    discarded.
    """
    pending = ""
    while 1:
        line = req.readline(readBlockSize)
        if not line or line.startswith(boundary):
            break
        if line.endswith("\r\n"):
            body, eol = line[:-2], "\r\n"
        elif line.endswith("\n"):
            body, eol = line[:-1], "\n"
        else:
            body, eol = line, ""
        file.write(pending + body)
        pending = eol

def read_to_boundary_mike(self, req, boundary, file, readBlockSize=65536):
    """Mike's version: copy MIME-part data from `req` to `file`,
    stopping at `boundary`.

    Each chunk is written one iteration late (held in `prevline`) so
    the CRLF that precedes the boundary can be discarded, and a
    chunk-final CR is held back in `carry` until the next read shows
    whether it starts the pre-boundary CRLF.
    """
    prevline = ""
    carry = None
    while 1:
        line = req.readline(readBlockSize)
        # startswith(boundary) also matches the final boundary + '--'.
        if not line or line.startswith(boundary):
            if prevline.endswith('\r\n'):
                if carry is not None:
                    file.write(carry)
                file.write(prevline[:-2])
                break
            elif (carry == '\r') and prevline.endswith('\n'):
                file.write(prevline[:-1])
                break
            # Not a real boundary terminator.  If the stream is simply
            # exhausted, flush what we are holding and stop -- the
            # previous version fell through here on EOF and spun
            # forever re-reading empty strings.
            if not line:
                if carry is not None:
                    file.write(carry)
                file.write(prevline)
                break
        if carry is not None:
            file.write(carry)
            carry = None
        if prevline[-1:] == '\r':
            # Hold back a chunk-final CR: it may be the first half of
            # the CRLF introducing the boundary on the next read.
            file.write(prevline[:-1])
            carry = '\r'
        else:
            file.write(prevline)
        prevline = line

def get_checksum(fname):
    """Return the hex MD5 digest of the file's contents.

    Uses hashlib (the standalone md5 module is deprecated) and closes
    the file handle instead of leaking it.
    """
    f = open(fname, 'rb')
    try:
        m = hashlib.md5()
        m.update(f.read())
    finally:
        f.close()
    return m.hexdigest()

def generate_embedded_cr_file(offset=-1, readBlockSize=65368, chunk='\r', fname='testfile'):
    """ Generate a file which causes the error with file upload
        The default offset of -1 should generate a file which will
        be corrupted by the file upload.
    """
    out = open(fname, 'wb')
    pieces = (
        'a' * 50,
        '\r\n',
        'b' * (readBlockSize + offset),  # lands the next char at a block edge
        chunk,                           # the troublesome character(s)
        'ccc',
        'd' * 50,
        '\r\n',
    )
    for piece in pieces:
        out.write(piece)
    out.close()

def generate_split_boundary_file(offset=-1, readBlockSize=65368, chunk='\r', fname='testfile'):
    """ this function generates a file with a boundary string '\r\n--myboundary'
        starting at readBlockSize - offset
    """
    out = open(fname, 'wb')
    # Payload sized so `chunk` falls right at the read-block edge.
    for piece in ('a' * 50, '\r\n', 'b' * (readBlockSize + offset), chunk):
        out.write(piece)
    out.close()

# All implementations under test, in the order they are reported.
read_boundaries = [read_to_boundary_current, read_to_boundary_new, read_to_boundary_org, read_to_boundary_mike]

def main(file_generator, offset, chunk, block_size=1<<16):
    fname_in = 'testfile.in'
    fname_out_base = 'testfile.out'
    file_generator(offset=offset, readBlockSize=block_size, chunk=chunk, fname=fname_in)
    
    orig_checksum = get_checksum(fname_in)

    req = StringIO()
    req.write(open(fname_in, 'rb').read())
    req.write('\r\n--myboundary\r\n')

    src_cs = get_checksum(fname_in)
    print '     src', src_cs
    
    for rtb in read_boundaries:
        name = rtb.__name__.split('_')[-1]
        fname_out = fname_out_base + name
        o = file(fname_out, 'wb')
        req.seek(0)
        rtb(None, req, '--myboundary', o, block_size)
        size = o.tell()
        o.close() 
        cs = get_checksum(fname_out)
        print "%8s %s %6d" % (name, cs, size),
        if cs != src_cs:
            print 'FAIL'
        else:
            print 'PASS'


def cname(ch):
    """Printable name for a one-character chunk: 'CR', 'LF', 'None'
    for the empty string, otherwise the character's ordinal.
    """
    named = {'\r': 'CR', '\n': 'LF', '': 'None'}
    if ch in named:
        return named[ch]
    return ord(ch)

class DevNull:
    """File-like sink that silently discards all writes (lets the
    benchmark below measure parsing time, not disk I/O)."""

    def write(self, data):
        """Discard `data`; returns None like a normal file write."""
        pass

if __name__ == '__main__':

    #test_chunks =  ['', '\r', '\n', '\r\n']

    # only test the chunks that are currently a problem
    test_chunks =  ['', '\r',]

    # Correctness pass: run each file generator against each problem
    # chunk at offsets around the read-block edge; main() prints a
    # PASS/FAIL line per implementation.
    test_cases = (generate_embedded_cr_file, generate_split_boundary_file)
    for file_gen_obj in test_cases:
        print '='*40
        print file_gen_obj.__name__
        for chunk in test_chunks:
            for i in range(-1, 0):
                print '-'*40
                print 'test offset', i, 'chunk',[ cname(c) for c in chunk ]
                print '-'*40
                main(file_gen_obj, i, chunk)
                print
            print
##    sys.exit(0)

    # Run profiling benchmarks with the four files
    # (on my system, the _mike and _org versions are typically almost
    # twice as fast as the _new and _current)
    import profile, random
    req = StringIO()
    blocksize = 102400
    block=[]
    r = random.Random()
    # Build 100 KB of random bytes, repeat it 64 times, then append the
    # terminating boundary line.
    for i in xrange(blocksize):
        block.append(chr(r.randint(0,255)))
    block = "".join(block)
    for c in range(64):
      req.write(block)
    req.write('\r\n--myboundary\r\n')
    print "Benchmark bytes:", req.tell()
    out = DevNull()
    for rtb in read_boundaries:
        req.seek(0)
        print rtb.__name__
        # NOTE: profile.run evaluates this string in __main__'s
        # namespace, so it relies on the globals req and out by name.
        profile.run("%s(None, req, '--myboundary', out, 65536)" % rtb.__name__)

Reply via email to