As @Stefan mentioned, the string stuff can be slow. My `MemSlice` techniques 
might be more broadly interesting. The tokenizer below only handles "strict" 
one-character delimiting, not, e.g., runs of repeated whitespace. Also, I agree 
completely with @arnetheduck that algorithm choices matter more than languages 
for Nim vs C comparisons.

That said, the Nim below is faster than the C version on my machine (215 ms for 
C vs 176 ms for Nim). @enthus1ast can try it on his data and report back if he 
likes. There are several little `defined` switches to play with (see the 
compile note after the code).

FWIW, the C version referenced is a rather lame one (a fixed 64 KiEntry hash 
table with external chaining for collision resolution). I didn't try 
profile-guided optimization, which could improve the Nim results somewhat. Since 
the C version "cheats" a bit by pre-sizing its hash table, my version below 
takes an estimate of the number of unique words that will be found as a 
command-line argument and pre-sizes the Nim table to fit that estimate.
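
For illustration, here is a minimal, hypothetical sketch of just the pre-sizing 
idea (the `sizeGuess` value and string keys are made up); the full program below 
does the same thing with `MemSlice` keys:

    import tables

    let sizeGuess = 300_000   # hypothetical estimate of unique words
    # rightSize turns an element-count estimate into an initial capacity,
    # so the table never has to grow (and rehash) while counting.
    var count = initTable[string, int](rightSize(sizeGuess))
    count.mgetOrPut("example", 0).inc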

Also, **beware** `CountTable.sort()`. It uses Shell's sort internally, which is 
not a fast algorithm. The counts are always integers and typically small, so a 
radix sort would be the best choice, but even regular old merge sort from 
`algorithm` is a lot better. On my test input with 291,000 unique words, the 
version using `count.sort` took 120 seconds instead of 176 milliseconds, almost 
3 full orders of magnitude slower. Times seemed insensitive to the hash function 
(order 1% variations), though both the sorting cost and the hash sensitivity 
will obviously be data-set dependent. For very high-entropy distributions you 
will want to avoid the current `CountTable.sort()`. Also, `CountTable` itself 
was a little slower than a regular `Table` using the same sorting procedure, so 
you might want to just avoid `CountTable` in general in performance-sensitive 
cases.
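
To make that replacement pattern concrete without the memory-mapping machinery, 
here is a minimal sketch with toy string keys (names and data are just 
illustrative); the full program below applies the same idea to `MemSlice` keys:

    import tables, algorithm

    var ct = initCountTable[string]()
    for w in ["apple", "pear", "apple", "plum", "pear", "apple"]:
      ct.inc(w)

    # Instead of ct.sort() (Shell's sort), copy the (count, key) pairs out
    # and sort them with algorithm.sort (a merge sort), descending by count.
    var counts = newSeq[tuple[cnt: int, key: string]]()
    for key, cnt in ct.pairs():
      counts.add((cnt: cnt, key: key))
    counts.sort(proc (x, y: tuple[cnt: int, key: string]): int = cmp(y.cnt, x.cnt))
    for c in counts:
      echo c.cnt, " ", c.key

The full program, with the `MemSlice` hashing and splitting helpers, follows: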
    
    
    import memfiles, hashes, tables, os, algorithm, strutils
    
    proc hash*(ms: MemSlice): Hash {.inline.} =
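      ## Hash a MemSlice; the variant is chosen at compile time via
      ## -d:useNimHash, -d:useCB2, or -d:useCB4 (default: simple byte-wise hash).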
      when defined(useNimHash):
        result = hashData(ms.data, ms.size)
      elif defined(useCB2):
        proc c_memhash(h0: int, a: pointer, size: int): int {.nodecl,
                importc: "memhash_cb2", header: getHomeDir() & "s/cb/hash_cb2.c".}
        result = c_memhash(0, ms.data, ms.size)
      elif defined(useCB4):
        proc c_memhash(h0: int, a: pointer, size: int): int {.nodecl,
                importc: "memhash_cb4", header: getHomeDir() & "s/cb/hash_cb4.c".}
        result = c_memhash(0, ms.data, ms.size)
      else:
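        # Default: a simple shift-add-xor hash over the bytes of the slice.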
        result = 0
        for i in 0 ..< ms.size:
          result = (result + (result shl 5)) xor int(cast[cstring](ms.data)[i])
    
    proc split*(ms: MemSlice, fields: var seq[MemSlice], nA: int,
                sep=' ', size = -1): int =
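      ## Split `ms` on the single-character `sep` into the pre-allocated
      ## `fields` (at most `nA` slots), returning the number of fields found.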
      proc c_memchr(s: cstring, c: char, n: csize): cstring {.
           importc: "memchr", header: "<string.h>".}
      proc `+!`(p: cstring, i: int): cstring {.inline.} =
        cast[cstring](cast[int](p) +% i)
      proc `-!`(p, q: cstring): int {.inline.} = cast[int](p) -% cast[int](q)
      var b   = cast[cstring](ms.data)
      var eob = cast[cstring](ms.data) +! (if size != -1: size else: ms.size)
      var e   = c_memchr(b, sep, eob -! b)
      while e != nil:
        fields[result].data = cast[pointer](b)
        fields[result].size = e -! b
        result += 1
        if result == nA - 2:  #max ix nA-1; Need one more slot for final field
          break
        b = e +! 1
        e = c_memchr(b, sep, eob -! b)
      fields[result].data = cast[pointer](b)
      fields[result].size = eob -! b
      result += 1
    
    proc main(input: string, sizeGuess: int) =
      const mxCols = 1024     #file format-dependent max
      when defined(useCountTab):
        var count = initCountTable[MemSlice](rightSize(sizeGuess))
      else:
        var count = initTable[MemSlice, int](rightSize(sizeGuess))
      var fields = newSeq[MemSlice](mxCols)
      for line in memSlices(memfiles.open(input)):
        fields.setLen(split(line, fields, mxCols, ' '))
        for word in fields:
          when defined(useCountTab):
            count.inc(word)
          else:
            inc(count.mgetOrPut(word, 0))
      stderr.write count.len, " unique words\n"
      when defined(useCountTab) and defined(useCountSort):
        count.sort()          #uses Shell's sort internally
        for key, cnt in count:
          stdout.write cnt, " "
          discard stdout.writeBuffer(cast[cstring](key.data), key.size)
          stdout.write "\n"
      else:
        var answer = newSeq[tuple[cnt: int, key: MemSlice]](count.len)
        var i = 0
        for key, cnt in count.pairs():
          answer[i].cnt = cnt
          answer[i].key = key
          inc(i)
        proc myCmp(x, y: tuple[cnt: int, key: MemSlice]): int =
          -cmp(x.cnt, y.cnt)
        answer.sort(myCmp)
        for ans in answer:
          stdout.write ans.cnt, " "
          discard stdout.writeBuffer(cast[cstring](ans.key.data), ans.key.size)
          stdout.write "\n"
    
    #Must call with (seekable/mmapable) input file name and a size guess
    main(paramStr(1), parseInt(paramStr(2)))
    
    
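To experiment with the `defined` switches, compile with the corresponding `-d:` 
flags and give the program an input file plus a unique-word estimate, e.g. 
`nim c -d:release -d:useCountTab wordcount.nim` and then 
`./wordcount input.txt 300000` (the file names here are just placeholders). Note 
that `-d:useCB2`/`-d:useCB4` expect the corresponding `hash_cb2.c`/`hash_cb4.c` 
sources under `~/s/cb/`.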
