Here is a minimally optimized module to do interning - basically 30 lines of a specialized hash table, 30 of the main logic, and a little test thing (more useful if you change the XXX 1024 to 2). Only minimal error checking is done and interning `""` may not work and several things should probably be made into compile-time parameters like `lenBits`, but it seems mostly ok.
Time/space measuring it on the above linked SOWPODS file as input suggests it is about 2.5x faster and uses < 1/4 the space according to [cgmemtime](https://github.com/gsauthof/cgmemtime). { As a witty fella named @Araq once observed, "optimization is specialization". ;-) }: import hashes type InTab*[Q, N] = object data: seq[N] # could adix/seqUint to mem optimize count: int proc find[Q, N](t: InTab[Q, N], q: Q|N, h: Hash): int = let mask = t.data.len - 1 var i = h and mask while t.data[i] != N(0): if t.data[i] == q: return i i = (i + 1) and mask -i - 1 # missing, return -(insert point) - 1 proc grow[Q, N](t: var InTab[Q, N]) = var alt = newSeq[N](2 * t.data.len) swap t.data, alt for n in alt: if n != N(0): t.data[-t.find(n, n.hash) - 1] = n proc getOrPut*[Q, N](t: var InTab[Q, N], q: Q, n: N): N = if t.data.len == 0: t.data.setLen 1024 #XXX 2 to test let h = q.hash var i = t.find(q, h) if i < 0: if (t.count + 1) * 4 > t.data.len * 3: t.grow; i = t.find(q, h) i = -i - 1 t.data[i] = n t.count.inc t.data[i] type InStr* = distinct uint32 #spc =~ 11..15*nInStr() const lenBits = 6 # 2^6=63 is v.long const lenMask = (1 shl lenBits) - 1 # 2^26 is much unique var tab: InTab[string, InStr] var dat = newStringOfCap(6000) proc off*(n: InStr): int = n.int shr lenBits proc len*(n: InStr): int = n.int and lenMask proc `$`*(n: InStr): string = dat[n.off ..< n.off + n.len] proc `==`*(a, b: InStr): bool {.borrow.} # the fast one proc space*(): int = dat.len + tab.data.len * InStr.sizeof proc nInStr*(): int = tab.count proc `==`*(a: InStr, b: string): bool = proc mcmp(a, b: pointer; n: csize): cint {. importc: "memcmp", header: "<string.h>" .} a.len == b.len and mcmp(dat[a.off].addr, b[0].unsafeAddr, b.len) == 0 proc `==`*(a: string, b: InStr): bool = b == a # swap args proc hash(n: InStr): Hash = # MUST MATCH hash(string)! let p = cast[ptr UncheckedArray[byte]](dat[n.off].addr) hash(toOpenArray[byte](p, 0, n.len - 1)) proc intern*(s: string): InStr = if s.len > lenMask: raise newException(ValueError, "s too long") if dat.len + s.len >= 1 shl (InStr.sizeof * 8 - lenBits): raise newException(ValueError, "too much unique") let n = InStr((dat.len.InStr.int shl lenBits) or s.len) result = tab.getOrPut(s, n) if result == n: dat.add(s) when isMainModule: import strutils for itr in 1..2: for w in "hello more interned world".split(): let a=intern(w);echo a.int," ",a," ",tab.data," ",dat echo intern("hello")=="hello", intern("there")=="world" echo tab.data," ",dat Run Oh, and for completeness here is my test program for that SOWPODS..pretty trivial but @Serge seems new to Nim: import os, strutils when defined(trivialIntern): import intern0 else: import intern for w in open(paramStr(1)).readAll.split(): discard intern(w) Run where `intern0.nim` is just my above 10-line `Table & seq` variant with `Str` -> `InStr` (for "interned string"). Exact space will, of course, always depend upon average word length in your corpus. It's like 9.11 bytes in SOWPODS but like 7.46 bytes in `(cd /usr/src/linux-5.6.12; cat **/*.txt)` for purely alphabetic words. But I added an `nInStr()` and `space()` calls to assess that if you like. Anyway, in spite of the many authors there are only like 21965 words in the Linux in-kernel doc corpus. So, 128 KiB for the hash table part and 142067 for the data part or total of 266 KiB means almost fully L2 resident (L2 is usually 256 KiB, but it has been growing).