I wonder if sequtils.deduplicate() should be renamed to deduplicated(), since it
returns a new seq, like sorted()? And maybe it should be noted that it is O(n^2),
which is fine for small collections, but not so nice for large ones.
And maybe we should add an O(n) deduplicate to the stdlib -- the sets module is
really fast, as shown below.
import sets
import random, times
import sequtils # deduplicate
# O(n) expected time (hashing).
proc uniqCopy[T](s: openArray[T]): seq[T] =
  ## Returns a copy of `s` with duplicates removed, preserving the order of
  ## first occurrences. Handles empty input (returns an empty seq).
  let h = s.toOrderedSet
  # newSeqOfCap allocates capacity without the zero-initialization that
  # newSeq(result, h.len) performed, and `add` removes the manual index.
  result = newSeqOfCap[T](h.len)
  for el in h:
    result.add el
# O(n) expected time (hashing).
proc uniq[T](s: var seq[T]) =
  ## Removes duplicates from `s` in place, keeping the first occurrence of
  ## each value in its original relative order.
  let unique = s.toOrderedSet
  var pos = 0
  # Overwrite the front of `s` with the deduplicated values, then drop
  # the leftover tail.
  for value in unique:
    s[pos] = value
    inc pos
  s.setLen(pos)
# O(n^2)
proc xdeduplicate[T](s: var seq[T]) =
  ## Removes duplicates from `s` in place, keeping the first occurrence of
  ## each value. Quadratic: each element is compared against every element
  ## already kept. No extra allocation, unlike the set-based variants.
  var i, j: int            # i scans the original seq; j is the write cursor
  while i < s.len:
    s[j] = s[i]            # tentatively keep s[i] at the write position
    var k = j
    while k > 0:           # scan the kept prefix s[0..j-1] for a duplicate
      dec k
      if s[k] == s[j]:
        dec(j)             # duplicate found: cancel the tentative keep
        break              # (the inc(j) below restores j to its old value)
    inc(j)
    inc(i)
  s.setlen(j)              # truncate to the deduplicated prefix
# --- Demo: all three variants agree on a small input -------------------
var a = @[1, 3, 1, 2, 2, 5, 2]
var b = a                      # seqs have value semantics: b is a copy
echo a.uniqCopy
a.xdeduplicate
b.uniq
echo a
echo b

# --- Benchmark: 1000 random ints in 0..999 -----------------------------
# NOTE(review): unseeded random() gives a fixed sequence, which keeps the
# benchmark input reproducible across runs.
var t: float                   # start timestamp, set before each run
var x: array[1000, int]
for el in mitems(x): el = random(1000)

var z = @x
t = cpuTime()
z = z.deduplicate
# Fixed typo in the output labels: "dedublicate" -> "deduplicate".
echo "sequtils.deduplicate: ", cpuTime() - t

z = @x
t = cpuTime()
z.xdeduplicate
echo "inplace deduplicate: ", cpuTime() - t

z = @x
t = cpuTime()
z = z.uniqCopy
echo "uniqCopy: ", cpuTime() - t

z = @x
t = cpuTime()
z.uniq
echo "uniq: ", cpuTime() - t
# nim c -d:release t.nim
# Output
# @[1, 3, 2, 5]
# @[1, 3, 2, 5]
# @[1, 3, 2, 5]
# sequtils.deduplicate: 0.0001410000000000001
# inplace deduplicate: 0.0001329999999999999
# uniqCopy: 3.099999999999999e-05
# uniq: 2.700000000000011e-05