Alexis Marrero wrote:
Ok. Now I'm confused.
So am I!
I've created a test harness so we can bypass mod_python completely.
It includes a slightly modified version of read_to_boundary which
adds a new parameter, readBlockSize.
In the output from the test harness, your version is 'new' and the
current version is 'cur'. Run it and see what happens.
Jim
$ ./upload_test_harness
========================================
generate_embedded_cr_file
----------------------------------------
test offset -1 chunk []
----------------------------------------
src 5a63347d1106afdfa264b2a61f81ae82
cur 5a63347d1106afdfa264b2a61f81ae82 PASS
new 5a63347d1106afdfa264b2a61f81ae82 PASS
----------------------------------------
test offset -1 chunk ['CR']
----------------------------------------
src 82204e52343d5b25c2e783cd59499973
cur e4af2eee73029642a114697ba59217b3 FAIL
new 82204e52343d5b25c2e783cd59499973 PASS
========================================
generate_split_boundary_file
----------------------------------------
test offset -1 chunk []
----------------------------------------
src d481990a0f0bbd8acf847cd732714555
cur d481990a0f0bbd8acf847cd732714555 PASS
new 8fa5ac9f913d778575ea871506c392a9 FAIL
----------------------------------------
test offset -1 chunk ['CR']
----------------------------------------
src 8fa5ac9f913d778575ea871506c392a9
cur d481990a0f0bbd8acf847cd732714555 FAIL
new 8fa5ac9f913d778575ea871506c392a9 PASS
What I was trying to say is that I created a file with this function:
def generate_split_file(offset=-1,
                        readBlockSize=65368,
                        fname='testfile'):
    """Write a test file whose trailing run of 'b' characters is sized
    readBlockSize + offset, so that a reader using readBlockSize-sized
    reads splits that run at the block edge.
    """
    filler = 'b' * (readBlockSize + offset)
    out = open(fname, 'w')
    out.write(''.join(['a' * 50, '\r\n', filler]))
    out.close()
Then I uploaded 'testfile' using the following
StorageField.read_to_boundary() method:
def read_to_boundary(self, req, boundary, file):
''' read from the request object line by line with a maximum
size,
until the new line starts with boundary
'''
previous_delimiter = ''
while 1:
line = req.readline(1<<16)
if line.startswith(boundary):
break
if line.endswith('\r\n'):
file.write(previous_delimiter + line[:-2])
previous_delimiter = '\r\n'
elif line.endswith('\r') or line.endswith('\n'):
file.write(previous_delimiter + line[:-1])
previous_delimiter = line[-1:]
else:
file.write(previous_delimiter + line)
previous_delimiter = ''
And the MD5 on the client is the same as the one on the server. Do
you get different results? Let me know.
Regards,
/amn
On Nov 7, 2005, at 2:11 PM, Jim Gallacher wrote:
Jim Gallacher wrote:
Alexis Marrero wrote:
Jim,
Thanks for sending the function that creates the test file.
However, I ran it to create the test file, and after uploading
the file the MD5 is still the same.
Just to clarify, is this for your new read_to_boundary or the
one in 3.2? If it's for yours then the MD5 sum *should* be the
same, since that's what you fixed. :)
Did you call it with the same block size as you are using in
your code? The '\r' character must appear in the file right at
the readBlockSize boundary.
ie.
generate_file(offset=-1, readBlockSize=1<<16, fname='testfile')
#!/usr/bin/env python
import hashlib
import md5
import sys
from StringIO import StringIO

from mkfile import generate_split_file, generate_file
def read_to_boundary_current(self, req, boundary, file,
readBlockSize):
''' currrent version '''
#
# Although technically possible for the boundary to be split by
the read, this will
# not happen because the readBlockSize is set quite high - far
longer than any boundary line
# will ever contain.
#
# lastCharCarried is used to detect the situation where the \r
\n is split across the end of
# a read block.
#
delim = ''
lastCharCarried = False
last_bound = boundary + '--'
roughBoundaryLength = len(last_bound) + 128
line = req.readline(readBlockSize)
lineLength = len(line)
if lineLength < roughBoundaryLength:
sline = line.strip()
else:
sline = ''
while lineLength > 0 and sline != boundary and sline !=
last_bound:
if not lastCharCarried:
file.write(delim)
delim = ''
else:
lastCharCarried = False
cutLength = 0
if lineLength == readBlockSize:
if line[-1:] == '\r':
delim = '\r'
cutLength = -1
lastCharCarried = True
if line[-2:] == '\r\n':
delim += '\r\n'
cutLength = -2
elif line[-1:] == '\n':
delim += '\n'
cutLength = -1
if cutLength != 0:
file.write(line[:cutLength])
else:
file.write(line)
line = req.readline(readBlockSize)
lineLength = len(line)
if lineLength < roughBoundaryLength:
sline = line.strip()
else:
sline = ''
def read_to_boundary_new(self, req, boundary, file, readBlockSize):
''' Alexis' version
read from the request object line by line with a maximum size,
until the new line starts with boundary
'''
previous_delimiter = ''
while 1:
line = req.readline(readBlockSize)
if line.startswith(boundary):
break
if line.endswith('\r\n'):
file.write(previous_delimiter + line[:-2])
previous_delimiter = '\r\n'
elif line.endswith('\r') or line.endswith('\n'):
file.write(previous_delimiter + line[:-1])
previous_delimiter = line[-1:]
else:
file.write(previous_delimiter + line)
previous_delimiter = ''
def get_checksum(fname):
    """Return the hex MD5 digest of the contents of fname.

    Opens the file in binary mode so the digest reflects the exact
    bytes on disk, closes the handle explicitly instead of leaking it,
    and uses hashlib rather than the deprecated md5 module.
    """
    f = open(fname, 'rb')
    try:
        data = f.read()
    finally:
        f.close()
    m = hashlib.md5()
    m.update(data)
    return m.hexdigest()
def generate_embedded_cr_file(offset=-1, readBlockSize=65368,
                              chunk='\r', fname='testfile'):
    """ Generate a file which causes the error with file upload

    The default offset of -1 should generate a file which will
    be corrupted by the file upload.
    """
    filler_length = readBlockSize + offset
    content = ''.join(['a' * 50, '\r\n',
                       'b' * filler_length,
                       chunk, 'ccc',
                       'd' * 50, '\r\n'])
    out = open(fname, 'w')
    out.write(content)
    out.close()
def generate_split_boundary_file(offset=-1, readBlockSize=65368,
                                 chunk='\r', fname='testfile'):
    """ this function generates a file with a boundary string
    CRLF + '--myboundary' starting at readBlockSize - offset
    """
    tail = 'b' * (readBlockSize + offset) + chunk
    out = open(fname, 'w')
    out.write('a' * 50)
    out.write('\r\n')
    out.write(tail)
    out.close()
def main(file_generator, offset, chunk, block_size=1<<16):
fname_in = 'testfile.in'
fname_out = 'testfile.out'
file_generator(offset=offset, readBlockSize=block_size,
chunk=chunk, fname=fname_in)
orig_checksum = get_checksum(fname_in)
req = StringIO()
req.write(open(fname_in).read())
req.write('\r\n--myboundary\r\n')
src_cs = get_checksum(fname_in)
print 'src', src_cs
fname_out = '%s.cur' % fname_out
o = file(fname_out, 'wb')
req.seek(0)
read_to_boundary_current(None, req, '--myboundary', o, block_size)
o.close()
cs = get_checksum(fname_out)
print 'cur', cs,
if cs != src_cs:
print 'FAIL'
else:
print 'PASS'
fname_out = '%s.alexis' % fname_out
o = file(fname_out, 'wb')
req.seek(0)
read_to_boundary_new(None, req, '--myboundary', o, block_size)
o.close()
cs = get_checksum(fname_out)
print 'new', cs,
if cs != src_cs:
print 'FAIL'
else:
print 'PASS'
def cname(ch):
    """Return a printable name for a delimiter character: 'CR', 'LF',
    'None' for the empty string, or the character's ordinal."""
    names = {'\r': 'CR', '\n': 'LF', '': 'None'}
    if ch in names:
        return names[ch]
    return ord(ch)
if __name__ == '__main__':
#test_chunks = ['', '\r', '\n', '\r\n']
# only test the chunks that are currently a problem
test_chunks = ['', '\r',]
test_cases =
{'generate_embedded_cr_file':generate_embedded_cr_file,
'generate_split_boundary_file': generate_split_boundary_file, }
for name,file_gen_obj in test_cases.items():
print '='*40
print name
for chunk in test_chunks:
for i in range(-1, 0):
print '-'*40
print 'test offset', i, 'chunk',[ cname(c) for c in
chunk ]
print '-'*40
main(file_gen_obj, i, chunk)
print
print