String "interning" attempt

cblake Wed, 17 Feb 2021 10:30:20 -0800

Here is a minimally optimized module to do interning - basically 30 lines of a 
specialized hash table, 30 of the main logic, and a little test thing (more 
useful if you change the XXX 1024 to 2). Only minimal error checking is done 
and interning `""` may not work and several things should probably be made into 
compile-time parameters like `lenBits`, but it seems mostly ok.


Time/space measuring it on the above linked SOWPODS file as input suggests it 
is about 2.5x faster and uses < 1/4 the space according to 
[cgmemtime](https://github.com/gsauthof/cgmemtime). { As a witty fella named 
@Araq once observed, "optimization is specialization". ;-) }: 
    
    
    import hashes
    
    type InTab*[Q, N] = object
      data: seq[N] # could adix/seqUint to mem optimize
      count: int
    
    proc find[Q, N](t: InTab[Q, N], q: Q|N, h: Hash): int =
      let mask = t.data.len - 1
      var i = h and mask
      while t.data[i] != N(0):
        if t.data[i] == q: return i
        i = (i + 1) and mask
      -i - 1 # missing, return -(insert point) - 1
    
    proc grow[Q, N](t: var InTab[Q, N]) =
      var alt = newSeq[N](2 * t.data.len)
      swap t.data, alt
      for n in alt:
        if n != N(0): t.data[-t.find(n, n.hash) - 1] = n
    
    proc getOrPut*[Q, N](t: var InTab[Q, N], q: Q, n: N): N =
      if t.data.len == 0: t.data.setLen 1024 #XXX 2 to test
      let h = q.hash
      var i = t.find(q, h)
      if i < 0:
        if (t.count + 1) * 4 > t.data.len * 3:
          t.grow; i = t.find(q, h)
        i = -i - 1
        t.data[i] = n
        t.count.inc
      t.data[i]
    
    type InStr* = distinct uint32       #spc =~ 11..15*nInStr()
    const lenBits = 6                   # 2^6=63 is v.long
    const lenMask = (1 shl lenBits) - 1 # 2^26 is much unique
    var tab: InTab[string, InStr]
    var dat = newStringOfCap(6000)
    proc off*(n: InStr): int = n.int shr lenBits
    proc len*(n: InStr): int = n.int and lenMask
    proc `$`*(n: InStr): string = dat[n.off ..< n.off + n.len]
    proc `==`*(a, b: InStr): bool {.borrow.} # the fast one
    proc space*(): int = dat.len + tab.data.len * InStr.sizeof
    proc nInStr*(): int = tab.count
    
    proc `==`*(a: InStr, b: string): bool =
      proc mcmp(a, b: pointer; n: csize): cint
           {. importc: "memcmp", header: "<string.h>" .}
      a.len == b.len and
        mcmp(dat[a.off].addr, b[0].unsafeAddr, b.len) == 0
    proc `==`*(a: string, b: InStr): bool = b == a # swap args
    
    proc hash(n: InStr): Hash = # MUST MATCH hash(string)!
      let p = cast[ptr UncheckedArray[byte]](dat[n.off].addr)
      hash(toOpenArray[byte](p, 0, n.len - 1))
    
    proc intern*(s: string): InStr =
      if s.len > lenMask:
        raise newException(ValueError, "s too long")
      if dat.len + s.len >= 1 shl (InStr.sizeof * 8 - lenBits):
        raise newException(ValueError, "too much unique")
      let n = InStr((dat.len.InStr.int shl lenBits) or s.len)
      result = tab.getOrPut(s, n)
      if result == n: dat.add(s)
    
    when isMainModule:
      import strutils
      for itr in 1..2:
        for w in "hello more interned world".split():
          let a=intern(w);echo a.int," ",a," ",tab.data," ",dat
      echo intern("hello")=="hello", intern("there")=="world"
      echo tab.data," ",dat
    
    
    Run

Oh, and for completeness here is my test program for that SOWPODS..pretty 
trivial but @Serge seems new to Nim: 
    
    
    import os, strutils
    when defined(trivialIntern):
      import intern0
    else:
      import intern
    for w in open(paramStr(1)).readAll.split(): discard intern(w)
    
    
    Run

where `intern0.nim` is just my above 10-line `Table & seq` variant with `Str` 
-> `InStr` (for "interned string").

Exact space will, of course, always depend upon average word length in your 
corpus. It's like 9.11 bytes in SOWPODS but like 7.46 bytes in `(cd 
/usr/src/linux-5.6.12; cat **/*.txt)` for purely alphabetic words. But I added 
an `nInStr()` and `space()` calls to assess that if you like.

Anyway, in spite of the many authors there are only like 21965 words in the 
Linux in-kernel doc corpus. So, 128 KiB for the hash table part and 142067 for 
the data part or total of 266 KiB means almost fully L2 resident (L2 is usually 
256 KiB, but it has been growing).

String "interning" attempt

Reply via email to