UncheckedArray and closure issue

DMisener Sun, 18 Aug 2024 09:30:20 -0700

Thanks! That does eliminate the problem.

In my production code vs the above exemplar **elements** is indeed memory safe.


The results of incorporating it into my production code were interesting... I 
thought replacing the indirect sort using a **seq** copy to/from would speed 
things up. **Apparently not! :-)**

Running **with** _intermediate seqs_ and **without** on **3,000,000 words** 
gives the following:
    
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger -d:SORT_BY_SEQ r 
google_dictionary
    Elapsed Time [Sort using SORT_BY_SEQ] 2.331s
    Enter words:
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger -d:SORT_BY_SEQ r 
google_dictionary
    Elapsed Time [Sort using SORT_BY_SEQ] 2.407s
    Enter words:
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger -d:SORT_BY_SEQ r 
google_dictionary
    Elapsed Time [Sort using SORT_BY_SEQ] 2.325s
    Enter words:
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger r google_dictionary
    Elapsed Time [Sort] 2.471s
    Enter words:
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger r google_dictionary
    Elapsed Time [Sort] 2.497s
    Enter words:
    
    rm google_dict.scf; nim -p=./ --hints:off -d:danger r google_dictionary
    Elapsed Time [Sort] 2.515s
    Enter words:
    
    
    Run

**Another case of premature optimization :-)**

Still a valuable learning exercise in using _runtime dynamically sized arrays_.

Here is the _dictutil.nim_ code for anyone who is interested. The primary area 
of interest is the variant under `if sortRequired`.
    
    
    import algorithm, buffer, math, std/memfiles, strutils, util
    
    const
      # Size in bytes of one 32-bit slot; descriptor-table sizes in this file
      # are computed as a count multiplied by this value.
      INT32_SIZE = 4
      # Bytes skipped from the dictionary's base address to reach the
      # descriptor table: three int32-sized slots.
      HEADER_SIZE = 3 * INT32_SIZE
    
    type
      # One packed 32-bit word per entry: the accessors in this file read the
      # entry's start offset from the upper 24 bits and its length from the
      # low 8 bits.
      Descriptor = distinct int32
      
      # Raw (untraced) pointer to a header of three int32 fields. Values of
      # this type are produced in this file by casting the base address of a
      # memory region.
      Dictionary* = ptr object
        entries*: int32       # number of entries (descriptor count)
        maxEntrySize*: int32  # used as the capacity of scratch word strings
        entriesSize*: int32   # size given for the entry content area
    
    proc newDictionary*(
        entries: int32,
        entriesSize: int32,
        maxEntrySize: int32 = 0,
        base: pointer = nil
      ): Dictionary =
      ## Treats the memory at `base` as a dictionary header, fills in its three
      ## fields and returns `base` typed as a `Dictionary`.
      ##
      ## No memory is allocated here, so `base` must point to writable storage
      ## large enough for the header. The `nil` default is kept for interface
      ## compatibility but is rejected, because the fields are written through
      ## the pointer straight away.
      doAssert base != nil, "newDictionary: `base` must not be nil"

      result = cast[Dictionary](base)
      result.entries = entries
      result.entriesSize = entriesSize
      result.maxEntrySize = maxEntrySize
    
    # Syntactic sugar: views items 0 .. length - 1 of `elements` as an
    # openArray so they can be passed to openArray-taking routines.
    template withLength(elements: untyped, length: int): untyped =
      elements.toOpenArray(0, length - 1)
    
    proc newDescriptor(start: int, length: int): Descriptor {.inline.} =
      ## Packs `start` and `length` into one descriptor word: `start` is shifted
      ## left by 8 bits and OR-ed with `length`. Neither value is range-checked.
      let packed = (start shl 8) or length
      cast[Descriptor](packed)
    
    proc start(descriptor: Descriptor): int {.inline.} =
      ## Extracts the start offset stored above the low 8 bits of `descriptor`;
      ## the mask keeps only the 24 bits that remain after the shift.
      let raw = cast[int32](descriptor)
      (raw shr 8) and 0x00FFFFFF
    
    proc length(descriptor: Descriptor): int {.inline.} =
      ## Extracts the entry length stored in the low 8 bits of `descriptor`.
      let raw = cast[int32](descriptor)
      raw and 0xFF
    
    # Address and byte length of one source entry, as yielded by the
    # `entrySource` iterator. FIXME: use Descriptor
    type StartLength* = tuple[start: pointer, length: int]
    
    proc getDescriptor(
        dictionary: var Dictionary,
        index: int
      ): Descriptor {.inline.} =
      ## Returns descriptor number `index` from the table that begins
      ## `HEADER_SIZE` bytes past the dictionary's base address. `index` is
      ## not bounds-checked.
      let tableAddress = cast[int](dictionary) + HEADER_SIZE
      let table = cast[ptr UncheckedArray[Descriptor]](tableAddress)
      table[index]
    
    proc getWord(
      dictionary: var Dictionary,
      descriptor: Descriptor,
      word: var string) {.inline.} =
      ## Copies the entry described by `descriptor` into `word`, replacing its
      ## previous contents.
      ##
      ## Entry bytes are read from the area that follows the descriptor table,
      ## which itself starts `HEADER_SIZE` bytes past the dictionary's base
      ## address. Passing the same `word` repeatedly lets its storage be reused.
      let
        descriptorsSize = dictionary.entries * INT32_SIZE
        descriptorsBase = cast[int](dictionary) + HEADER_SIZE
        contentBase = descriptorsBase + descriptorsSize
        length = descriptor.length

      # Size the string before touching its storage: taking `word[0].addr` on
      # an empty string is an index error when checks are enabled, and copying
      # first would write beyond the string's current length.
      word.setLen length

      if length > 0:
        copyMem(
          word[0].addr,
          cast[pointer](contentBase + descriptor.start),
          length
        )
    
    proc getWord*(
        dictionary: var Dictionary,
        index: int,
        word: var string) {.inline.} =
      ## Fetches descriptor number `index` and copies the entry it describes
      ## into `word`.
      let descriptor = dictionary.getDescriptor(index)
      dictionary.getWord(descriptor, word)
    
    proc buildDictionary*(
        filename: string,
        entries: int32,
        entriesSize: int32,
        maxEntrySize: int32 = 0,
        sortRequired = false,
        entrySource: iterator(): StartLength {.closure.}
    ): Dictionary {.discardable.} =
      ## Creates the memory-mapped file `filename` and fills it with a header,
      ## a table of `entries` descriptors and the entry bytes produced by
      ## `entrySource`.
      ##
      ## * `entriesSize` is used as the capacity of the entry content area.
      ## * `maxEntrySize` is stored in the header and sizes the scratch strings
      ##   used while sorting.
      ## * When `sortRequired` is true the descriptor table is sorted
      ##   case-insensitively by the word each descriptor refers to. Compiling
      ##   with `-d:SORT_BY_SEQ` sorts a temporary `seq` copy and writes it
      ##   back; otherwise the mapped table is sorted in place.
      ##
      ## Returns a `Dictionary` pointing at the start of the mapping. The
      ## mapping is not closed in this proc.

      let
        descriptorsSize = entries * INT32_SIZE
        mmFileSize = HEADER_SIZE + descriptorsSize + entriesSize

      var memFile = memfiles.open(
          filename,
          mode = fmWrite,
          newFileSize = mmFileSize.roundUp
        )

      # Layout inside the mapping: header, descriptor table, entry content.
      let descriptorsBase = cast[int](memFile.mem) + HEADER_SIZE

      var
        content: Buffer
        descriptors: ptr UncheckedArray[Descriptor]

      descriptors =
        cast[ptr UncheckedArray[Descriptor]](cast[pointer](descriptorsBase))
      content.data = cast[pointer](descriptorsBase + descriptorsSize)
      content.capacity = entriesSize

      result = newDictionary(entries, entriesSize, maxEntrySize, memFile.mem)

      var index = 0

      for entry in entrySource():
        # The table has exactly `entries` slots and the content area begins
        # right after it, so writing slot `entries` or beyond would overwrite
        # entry bytes.
        doAssert index < entries.int,
          "buildDictionary: entrySource yielded more than `entries` items"

        let length = entry.length
        descriptors[index] = newDescriptor(content.len, length)
        content.append entry.start, length
        index.inc

      if sortRequired:
        when defined(SORT_BY_SEQ):
          # Sort entries -- too bad need a temporary copy
          benchmark "Sort using SORT_BY_SEQ":
            var
              dict = result
              length = dict.entries
              tempDescriptors = newSeq[Descriptor](length)
              wordA = newStringOfCap maxEntrySize
              wordB = newStringOfCap maxEntrySize

            for i in 0 ..< length: tempDescriptors[i] = descriptors[i]

            tempDescriptors.sort do (a, b: Descriptor) -> int:
              dict.getWord a, wordA
              dict.getWord b, wordB
              wordA.cmpIgnoreCase wordB

            for i in 0 ..< length: descriptors[i] = tempDescriptors[i]
        else:
          benchmark "Sort":
            var
              dict = result
              length = dict.entries
              wordA = newStringOfCap maxEntrySize
              wordB = newStringOfCap maxEntrySize

            # NOTE(review): this local re-binding of `descriptors` is kept
            # exactly as in the original; presumably it is the workaround for
            # the closure issue this thread discusses -- confirm before
            # removing it.
            var descriptors = cast[ptr UncheckedArray[Descriptor]](descriptors)

            (descriptors.withLength length).sort do (a, b: Descriptor) -> int:
              dict.getWord a, wordA
              dict.getWord b, wordB
              wordA.cmpIgnoreCase wordB
    
    proc loadDictionary*(filename: string): Dictionary =
      ## Maps `filename` using the default arguments of `memfiles.open` and
      ## returns the base of the mapping typed as a `Dictionary`. The mapping
      ## is not closed by this proc.
      let mapping = memfiles.open(filename)
      cast[Dictionary](mapping.mem)
    
    proc find*(dictionary: var Dictionary, target: string): int {.inline.} =
      ## Binary search for `target`, compared case-insensitively against the
      ## entries in descriptor order. Returns the index of a matching entry,
      ## or -1 when there is none.
      var
        lo = 0
        hi: int = dictionary.entries - 1
        candidate = newStringOfCap dictionary.maxEntrySize

      result = -1 # stays -1 if the target string is not found

      while lo <= hi:
        let middle = (lo + hi) div 2
        dictionary.getWord(middle, candidate)

        let order = candidate.cmpIgnoreCase(target)
        if order < 0:
          lo = middle + 1
        elif order > 0:
          hi = middle - 1
        else:
          return middle
    
    proc sanityCheck*(dictionary: var Dictionary, words = dictionary.entries) =
      ## Reads each of the first `words` entries and looks it up again with
      ## `find`, echoing every entry whose lookup fails.
      var probe = newStringOfCap dictionary.maxEntrySize

      for position in 0 ..< words:
        dictionary.getWord(position, probe)
        if dictionary.find(probe) >= 0:
          continue
        echo "Word at index ", position, " not found: ", probe.repr
    
    proc show*(dictionary: var Dictionary, words = dictionary.entries) =
      ## For debugging: echoes the first `words` entries, one per line.
      var current = newStringOfCap dictionary.maxEntrySize

      for position in 0 ..< words:
        dictionary.getWord(position, current)
        echo "word: ", current
    
    proc showStats*(dictionary: var Dictionary) =
      ## For debugging: echoes the summed length of all entries and the length
      ## of the longest one.
      var
        totalBytes = 0
        longest = 0
        current = newStringOfCap dictionary.maxEntrySize

      for position in 0 ..< dictionary.entries:
        dictionary.getWord(position, current)
        totalBytes += current.len
        longest = max(longest, current.len)

      echo "Entries size:   ", totalBytes
      echo "Max entry size: ", longest
    
    proc queryLookup*(dictionary: var Dictionary) =
      ## Interactive lookup: prompts once, then reads words from stdin and
      ## echoes the index and stored form of each word found, or a
      ## "not found" message. Stops at the first empty line or at end of
      ## input.
      var
        searchWord = newStringOfCap dictionary.maxEntrySize
        word = newStringOfCap dictionary.maxEntrySize

      echo "Enter words:"

      # The bool-returning `readLine` reports end of input by returning false
      # instead of raising `EOFError`, so piped or redirected input ends the
      # session cleanly. It also reuses `searchWord` for every line.
      while stdin.readLine(searchWord):
        if searchWord == "": break
        let index = dictionary.find searchWord

        if index >= 0:
          dictionary.getWord index, word
          echo "Index = ", index, "  Word = ", word
        else:
          echo "Word not found: ", searchWord
    
    
    Run

UncheckedArray and closure issue

Reply via email to