New issue 2271: Decompressing takes significantly longer than on CPython https://bitbucket.org/pypy/pypy/issues/2271/decompressing-takes-significantly-longer
John Longinotto: The following code takes about 9.5 minutes on PyPy, but only 42 seconds on CPython: ``` #!python import sys import zlib import struct def bgzip(data,blocks_at_a_time=1): if type(data) == str: d = open(data,'rb') else: d = data cache = '' bytes_read = 0 magic = d.read(4) blocks_left_to_grab = blocks_at_a_time while magic: if not magic: break # a child's heart bytes_read += 4 if magic != "\x1f\x8b\x08\x04": print "ERROR: The input file is not in a format I understand :("; exit() header_data = magic + d.read(8) header_size = 12 extra_len = struct.unpack("<H", header_data[-2:])[0] while header_size-12 < extra_len: header_data += d.read(4) bytes_read += 4 subfield_id = header_data[-4:-2] subfield_len = struct.unpack("<H", header_data[-2:])[0] subfield_data = d.read(subfield_len); bytes_read += subfield_len header_data += subfield_data header_size += subfield_len + 4 if subfield_id == 'BC': block_size = struct.unpack("<H", subfield_data)[0] raw_data = d.read(block_size - extra_len - 19); bytes_read += (block_size-extra_len-19) crc_data = d.read(8); bytes_read += 8 zipped_data = header_data + raw_data + crc_data unzipped_data = zlib.decompress(zipped_data,31) # Could parallize this in a worker poolchen expected_crc = crc_data[:4] expected_size = struct.unpack("<I", crc_data[4:])[0] if len(unzipped_data) != expected_size: print 'ERROR: Failed to unpack due to a Type 1 CRC error. Could the BAM be corrupted?'; exit() crc = zlib.crc32(unzipped_data) if crc < 0: crc = struct.pack("<i", crc) else: crc = struct.pack("<I", crc) if expected_crc != crc: print 'ERROR: Failed to unpack due to a Type 2 CRC error. Could the BAM be corrupted?'; exit() magic = d.read(4) if len(unzipped_data) > 0: cache += unzipped_data blocks_left_to_grab -= 1 if blocks_left_to_grab == 0: yield cache cache = '' blocks_left_to_grab = blocks_at_a_time if cache != '': yield cache d.close() data_generator = bgzip(sys.argv[-1],blocks_at_a_time=300) for block in data_generator: pass ``` Run via: the_code.py ./ENCFF001LCU.bam The input file ENCFF001LCU.bam can be downloaded from https://www.encodeproject.org/files/ENCFF001LCU/@@download/ENCFF001LCU.bam _______________________________________________ pypy-issue mailing list pypy-issue@python.org https://mail.python.org/mailman/listinfo/pypy-issue