# word segmentation by dynamic programming following Norvig (in Lua)

```You'll have to download count_1w.txt and tell this program where to find
it; see the code for details.```
```

#!/usr/bin/lua
-- segmentation code from Darius Bacon’s JavaScript version of
-- Norvig’s word segmenter

-- function from http://lua-users.org/wiki/SplitJoin
function justWords(str)
local t = {}
local function helper(word) table.insert(t, word) return "" end
if not str:gsub("%w+", helper):find"%S" then return t end
end

wordfreqs = {}
for line in io.lines(filename) do
split = justWords(line)
wordfreqs[split[1]] = split[2]
end

maxWordLength = 0
for word in pairs(wordfreqs) do
if   maxWordLength < string.len(word)
then maxWordLength = string.len(word) end
end

NT = 1024908267229           -- Number of tokens.
end

-- estimate the probability of a word
function Pw(word)
if wordfreqs[word] ~= nil
then return wordfreqs[word] / NT
else return 10 / (NT * 10^string.len(word))
end
end

function memoize1(f)
local memos = {}
return (function(x)
if memos[x] == nil
then memos[x] = {v=f(x)} end
return memos[x].v
end)
end

-- Return a list of words such that table.concat(wordlist) == words,
-- along with its probability. We pick the most probable such list.

segment = memoize1(function(words)
if words == '' then return {words = {}, P = 1} end
local best = {P = 0}
for ii = 1, math.min(string.len(words), maxWordLength) do
local word, result = string.sub(words, 1, ii), segment(string.sub(words,
ii+1))
local P = Pw(word) * result.P
if best.P < P
then best = {words = {word, unpack(result.words)}, P = P} end
end
return best
end)

-- to test:
--
print(unpack(segment('returnalistofwordssuchthattableconcatwordlistequalswords').words))

-- For interactive use, pass arguments on the command line:
-- ./segment.lua expertsexchange.com
if #{...} > 0 then
input = string.lower(table.concat({...})) -- lowercase version of input
input = string.gsub(input, '[^%a]', '')   -- remove non-letters

-- from http://norvig.com/ngrams/
local infile = "/home/kragen/pkgs/ngrams/count_1w.txt"