Background:

I have a 3,000,000 entry unsorted **word2vec** model (_each word has a 300 
floating point vector_). I have to sort this wordlist in alphabetic order and 
also reorder the associated vectors to allow binary search lookups of _word - > 
vector_

The problem is that I can't seem to write **generic** Nim sorts which handles 
these dynamically **UncheckedArray** _mmap based_ data structures.

The following is a simplified **failing** exemplar [ _Sorry I couldn 't make it 
shorter :-(_ ]

Any assistance would be greatly appreciated.

I am particularly interested in using large **mmap structures** (_whose 
dimensionality is not known at compile time_) as to avoid unnecessary string 
handling overhead
    
    
    import algorithm, strutils
    
    # Used to reorder original data inplace once sort is performed
    proc reorderInPlace[T](
      elements: var openArray[T];
      index: var openArray[int]) =
      
      for i in 0 ..< elements.len:
        while index[i] != i:
          let targetIndex = index[i]
          swap elements[i], elements[targetIndex]
          swap index[i], index[targetIndex]
    
    # Want to dynamically allocate ordinal data structure at run-time
    proc createUncheckedArray*(n: int): ptr UncheckedArray[int] =
      let rawMem = cast[pointer](alloc(n * sizeof(int)))
      cast[ptr UncheckedArray[int]](rawMem)
    
    # Future: return sorted index orderings to reorder additional associated 
datastores
    proc sortInPlace[T](elements: var openArray[T]) =
      let length = elements.len
      
      # Ideally want non-hardwired index orderings
      
      discard """
      var index = createUncheckedArray length
      for i in 0 ..< length: index[i] = i
      
      # Sort doesn't seem to like UncheckedArrays :-(
      index.sort do (a, b: int) -> int: elements[a].cmpIgnoreCase elements[b]
      
      generates compile time error:
      Expression: sort(index)do (a, b: int) -> int:
      elements[a].cmpIgnoreCase elements[b]
      [1] index: ptr UncheckedArray[int]
      [2] do (a, b: int) -> int:
      elements[a].cmpIgnoreCase elements[b]: void
      
      Expected one of (first mismatch at [position]):
      [1] func sort[T](a: var openArray[T]; cmp: proc (x, y: T): int 
{.closure.};
                   order = SortOrder.Ascending)
      [1] proc sort[T](a: var openArray[T]; order = SortOrder.Ascending)
      """
      
      # Even when I kludge a hardwired 0..n array I get a runtime error
      var index: array[4, int]
      for i in 0 ..< length: index[i] = i
      
      index.sort do (a, b: int) -> int: elements[a].cmpIgnoreCase elements[b]
      
      discard """
      Genererates runtime error:
      ... Error: 'elements' is of type <var seq[string]>
      which cannot be captured as it would violate memory safety,
      declared here: 
/Users/dennismisener/work/Nim/test_reorder_in_place.nim(17, 21);
      using '-d:nimNoLentIterators' helps in some cases.
      Consider using a <ref var seq[string]> which can be captured.
      """
      
      elements.reorderInPlace index
      
      # dealloc index # for UncheckedArray
    
    # Example usage
    
    when isMainModule:
      # Test reorder array - works!
      var
        elements1 = ['a', 'b', 'c', 'd']
        index1 = [3, 0, 1, 2]
      
      reorderInPlace(elements1, index1)
      assert elements1 == ['b', 'c', 'd', 'a']
      
      # Test reorder seq - works!
      var
        elements2 = @['a', 'b', 'c', 'd']
        index2 = [3, 2, 1, 0]
      
      reorderInPlace(elements2, index2)
      assert elements2 == @['d', 'c', 'b', 'a']
      
      # Now Test sortInplace - Fails
      var words = @["dog", "cat", "apple", "bob"]
      
      words.sortInPlace
    
    
    
    
    Run

Reply via email to