You'll have to download count_1w.txt and tell this program where to find it; see the code for details.
#!/usr/bin/lua -- segmentation code from Darius Bacon’s JavaScript version of -- Norvig’s word segmenter -- function from http://lua-users.org/wiki/SplitJoin function justWords(str) local t = {} local function helper(word) table.insert(t, word) return "" end if not str:gsub("%w+", helper):find"%S" then return t end end function load_word_list(filename) wordfreqs = {} for line in io.lines(filename) do split = justWords(line) wordfreqs[split[1]] = split[2] end maxWordLength = 0 for word in pairs(wordfreqs) do if maxWordLength < string.len(word) then maxWordLength = string.len(word) end end NT = 1024908267229 -- Number of tokens. end -- estimate the probability of a word function Pw(word) if wordfreqs[word] ~= nil then return wordfreqs[word] / NT else return 10 / (NT * 10^string.len(word)) end end function memoize1(f) local memos = {} return (function(x) if memos[x] == nil then memos[x] = {v=f(x)} end return memos[x].v end) end -- Return a list of words such that table.concat(wordlist) == words, -- along with its probability. We pick the most probable such list. segment = memoize1(function(words) if words == '' then return {words = {}, P = 1} end local best = {P = 0} for ii = 1, math.min(string.len(words), maxWordLength) do local word, result = string.sub(words, 1, ii), segment(string.sub(words, ii+1)) local P = Pw(word) * result.P if best.P < P then best = {words = {word, unpack(result.words)}, P = P} end end return best end) -- to test: -- loadfile("segment.lua")() -- print(unpack(segment('returnalistofwordssuchthattableconcatwordlistequalswords').words)) -- For interactive use, pass arguments on the command line: -- ./segment.lua expertsexchange.com if #{...} > 0 then input = string.lower(table.concat({...})) -- lowercase version of input input = string.gsub(input, '[^%a]', '') -- remove non-letters -- from http://norvig.com/ngrams/ local infile = "/home/kragen/pkgs/ngrams/count_1w.txt" print('Loading word usage statistics from `'..infile..'`...') load_word_list(infile) print('loaded. analyzing: "'..input..'"') print(table.concat(segment(input).words, ' ')) end -- To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks