# Here's what I have now.
# Its intent is to test the correctness of iterating leaves when the
# structure is append-only, without using a tree.
# There is presently a logic error demonstrated when it is run; there
# may be many, many logic errors in it.
# I gave the class the possibly-confusing name "Chunk"; that seems OK.
# block tree structure
# blocks written to are leaves
# depth is log2 leaves
# when a flush is made, all blocks are written, and also enough nodes such that every leaf can be accessed within depth lookups.
# consider we have an existing tree
# with, say, m flushes, containing n leaves. we'll call the leaf count n.
# each flush shows which leaves it has
# additionally, with the final flush, each leaf has an existing depth.
# when we reflush, we need to provide a new index for any leaves that become too deep.
# which leaves are too deep?
# we could basically walk them all to find out. this would be a consistent first approach.
class Simple:
    """Append-only store of flushed writes, iterable as (depth, offset, data) leaves.

    Writes are buffered in ``pending``; ``flush()`` snapshots them into a
    ``Flush`` node that links back to the previous flush.  ``leaves()``
    walks the chain, always yielding the newest data covering each offset.
    """

    class Chunk:
        # A single contiguous write: bytes `data` occupying [start, end).
        def __init__(self, offset, data):
            self.start = offset
            self.data = data
            self.end = self.start + len(self.data)

    class Flush:
        # A flush has a list of new leaves (writes) and a list of
        # (start, end, flush) index entries pointing at older data.
        def __init__(self, *writes, prev_flush=None):
            self.prev_flush = prev_flush
            self.data = writes
            # find extents of the new writes
            # BUG FIX: the original called min()/max() unconditionally,
            # which raises ValueError on an empty flush
            if self.data:
                start = min(write.start for write in self.data)
                end = max(write.end for write in self.data)
            else:
                start = end = 0 if prev_flush is None else prev_flush.start
            if prev_flush is None:
                self.start = start
                self.end = end
                self.index = []
                return
            self.start = min(start, prev_flush.start)
            self.end = max(end, prev_flush.end)
            # a single range covering everything the previous flush knew;
            # NOTE(review): rebalancing (splitting ranges so every leaf
            # stays within a bounded depth) is still unimplemented
            self.index = [(prev_flush.start, prev_flush.end, prev_flush)]

        def leaves(self, start=None, end=None, depth=0):
            """Yield (depth, offset, data) segments covering [start, end).

            Within one flush, later writes override earlier ones; any
            write in this flush overrides data reached via the index.
            Offsets nobody ever wrote are skipped (holes yield nothing).

            BUG FIX: the original never advanced `offset` (infinite loop),
            inverted the slice start (`next_write.start - offset`), sliced
            with absolute rather than chunk-relative offsets, treated the
            tuple entries of `self.index` as Flush objects, and ignored
            overlap between writes of the same flush.
            """
            if start is None:
                start = self.start
            if end is None:
                end = self.end
            offset = start
            while offset < end:
                # latest (highest-priority) write covering this offset
                covering = None
                for write in self.data:
                    if write.start <= offset < write.end:
                        covering = write
                if covering is not None:
                    # emit from the covering write until it ends, the
                    # range ends, or a later (overriding) write begins
                    subend = min(end, covering.end)
                    after = False
                    for write in self.data:
                        if write is covering:
                            after = True
                        elif after and write.start < write.end and offset < write.start < subend:
                            subend = write.start
                    lo = offset - covering.start
                    hi = subend - covering.start
                    yield (depth, offset, covering.data[lo:hi])
                    offset = subend
                    continue
                # no write covers offset: the gap runs at most to the
                # next write's start (zero-length writes don't count)
                subend = end
                for write in self.data:
                    if write.start < write.end and offset < write.start < subend:
                        subend = write.start
                # delegate to older data if an index range covers offset
                delegate = None
                for istart, iend, flush in self.index:
                    if istart <= offset < iend:
                        delegate = (iend, flush)
                        break
                    if offset < istart < subend:
                        # stop before the next index range begins
                        subend = istart
                if delegate is not None:
                    iend, flush = delegate
                    subend = min(subend, iend)
                    yield from flush.leaves(offset, subend, depth + 1)
                # else: a hole nobody ever wrote; skip it silently
                offset = subend
            assert offset == end

    def __init__(self, latest=None):
        self.tip = latest     # most recent Flush, or None before the first flush
        self.pending = []     # Chunks written since the last flush

    def write(self, offset, data):
        """Buffer a write of `data` at `offset`; visible after the next flush()."""
        self.pending.append(self.Chunk(offset, data))

    def flush(self):
        """Snapshot pending writes into a new Flush chained onto the old tip."""
        self.tip = self.Flush(*self.pending, prev_flush=self.tip)
        self.pending = []

    def leaves(self, start=None, end=None):
        """Iterate (depth, offset, data) leaves of the newest flush.

        BUG FIX: the original implicitly returned None before the first
        flush, which crashed callers that iterate the result; yield
        nothing instead.
        """
        if self.tip is None:
            return iter(())
        return self.tip.leaves(start, end)
if __name__ == '__main__':
    import random
    import sys

    SIZE = 4096
    # each flush adds one level of recursion to the leaves() walk, so the
    # default limit (~1000) would be hit before the 1024th flush
    sys.setrecursionlimit(10_000)
    store = Simple()
    # reference model: a flat buffer every write is also applied to
    # BUG FIX: was hard-coded bytearray(4096); use SIZE for consistency
    comparison = bytearray(SIZE)
    for flushes in range(1024):
        for writes in range(1024):
            start = random.randint(0, SIZE)
            end = random.randint(0, SIZE)
            # the two endpoints come back unordered; normalize start <= end
            start, end = (start, end) if start <= end else (end, start)
            data = random.randbytes(end - start)
            comparison[start:end] = data
            store.write(start, data)
        store.flush()
        last_offset = 0
        max_depth = 0
        for depth, offset, data in store.leaves():
            # the gap since the previous leaf must be bytes nobody wrote
            assert comparison[last_offset:offset] == bytes(offset - last_offset)
            last_offset = offset + len(data)
            assert comparison[offset:last_offset] == data
            max_depth = max(depth, max_depth)
        # the tail beyond the final leaf must also be unwritten zeros
        assert comparison[last_offset:] == bytes(len(comparison) - last_offset)
        # BUG FIX: original printed `flush` (undefined name -> NameError);
        # the loop variable is `flushes`
        print(flushes, max_depth, 'OK')