With that version, the run time on my file ([enwik8](http://mattmahoney.net/dc/enwik8.zip))
drops to 53 s, versus 64 s for the Python version. But I can't see the results with that version,
and if I save the string slice I will be back to the same problem.
The Python version:
from timeit import default_timer as timer
# Length, in characters, of each substring that is counted; passed to
# counter() as its `size` argument.
WORD_SIZE = 12
# Maximum number of distinct substrings kept in the count table at once;
# passed to counter() as its `k` argument.
K = 10000
def window(line, size):
    """Yield every contiguous slice of ``line`` that is ``size`` items long.

    Slices are produced left to right, advancing one position at a time, so
    consecutive slices overlap by ``size - 1`` items. Nothing is yielded when
    ``line`` is shorter than ``size``.
    """
    last_start = len(line) - size
    yield from (line[start:start + size] for start in range(last_start + 1))
def counter(file, size, k):
    """Approximate the most frequent ``size``-character substrings of a file.

    The whole file is read as text and scanned with ``window``, one position
    at a time. At most ``k`` candidate substrings are tracked, each with a
    running count (the Misra-Gries "frequent items" scheme):

    * a substring already in the table has its count incremented;
    * a new substring is added with count 1 while the table holds fewer
      than ``k`` entries;
    * otherwise the new substring is discarded, every stored count is
      decremented, and entries whose count was 1 are evicted.

    Because of the decrement step, a returned count never exceeds the true
    number of occurrences, but it may be lower.

    Args:
        file: Path of the text file to scan (opened in text mode with the
            platform default encoding).
        size: Window length in characters.
        k: Maximum number of substrings tracked at once.

    Returns:
        dict mapping each surviving substring to its current count.
    """
    # One read() instead of `lines += line` per line: same text, no repeated
    # string concatenation, and the handle is closed when the block exits.
    with open(file) as handle:
        text = handle.read()

    counts = {}
    for word in window(text, size):
        if word in counts:
            counts[word] += 1
        elif len(counts) < k:
            counts[word] = 1
        else:
            # Table is full and `word` is new: decrement everything and drop
            # the entries that would reach zero. `word` itself is not stored.
            counts = {
                kept: seen - 1 for kept, seen in counts.items() if seen > 1
            }
    return counts
def printTop(table, top):
    """Print the ``top`` highest-valued entries of ``table``, one per line.

    Keys are ordered by their value in ``table``, largest first; keys with
    equal values keep the table's iteration order. Each line shows the
    1-based rank, the key in single quotes (with every newline character
    replaced by a backslash followed by ``n``), and the key's value.

    Args:
        table: Mapping from string keys to comparable values.
        top: Maximum number of lines to print.
    """
    ranked = sorted(table, key=lambda entry: table[entry], reverse=True)
    for rank, key in enumerate(ranked, start=1):
        if rank > top:
            break
        shown = key.replace('\n', '\\n')
        print("{}: '{}' -> {}".format(rank, shown, table[key]))
# Benchmark driver: time one full counter() pass over the file "enwik8"
# (path relative to the working directory), then print the 30 entries with
# the highest counts.
t0 = timer()
res = counter("enwik8", WORD_SIZE, K)
# NOTE(review): `timer` is timeit.default_timer, which the Python docs define
# as time.perf_counter, so this is elapsed wall-clock time even though the
# label says "CPU time". Only the counting pass is timed, not the printing.
print("CPU time [s] ", timer() - t0)
printTop(res, 30)