You'll have to download count_1w.txt and tell this program where to find
it; see the code for details.


#!/usr/bin/lua
-- segmentation code from Darius Bacon’s JavaScript version of
-- Norvig’s word segmenter

-- function from http://lua-users.org/wiki/SplitJoin
function justWords(str)
  local t = {}
  local function helper(word) table.insert(t, word) return "" end
  if not str:gsub("%w+", helper):find"%S" then return t end
end

function load_word_list(filename)
   wordfreqs = {}
   for line in io.lines(filename) do
      split = justWords(line)
      wordfreqs[split[1]] = split[2]
   end

   maxWordLength = 0
   for word in pairs(wordfreqs) do
      if   maxWordLength < string.len(word) 
      then maxWordLength = string.len(word) end
   end

   NT = 1024908267229           -- Number of tokens.
end


-- estimate the probability of a word
function Pw(word)
   if wordfreqs[word] ~= nil 
   then return wordfreqs[word] / NT
   else return 10 / (NT * 10^string.len(word))
   end
end

function memoize1(f)
   local memos = {}
   return (function(x)
             if memos[x] == nil
             then memos[x] = {v=f(x)} end
             return memos[x].v
          end)
end

-- Return a list of words such that table.concat(wordlist) == words,
-- along with its probability. We pick the most probable such list.

segment = memoize1(function(words)
   if words == '' then return {words = {}, P = 1} end
   local best = {P = 0}
   for ii = 1, math.min(string.len(words), maxWordLength) do
      local word, result = string.sub(words, 1, ii), segment(string.sub(words, 
ii+1))
      local P = Pw(word) * result.P
      if best.P < P
      then best = {words = {word, unpack(result.words)}, P = P} end
   end
   return best
end)

-- to test:
-- loadfile("segment.lua")()
-- 
print(unpack(segment('returnalistofwordssuchthattableconcatwordlistequalswords').words))

-- For interactive use, pass arguments on the command line:
-- ./segment.lua expertsexchange.com
if #{...} > 0 then
   input = string.lower(table.concat({...})) -- lowercase version of input
   input = string.gsub(input, '[^%a]', '')   -- remove non-letters

   -- from http://norvig.com/ngrams/
   local infile = "/home/kragen/pkgs/ngrams/count_1w.txt"
   print('Loading word usage statistics from `'..infile..'`...')
   load_word_list(infile)
   print('loaded. analyzing: "'..input..'"')
   print(table.concat(segment(input).words, ' '))
end
-- 
To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks

Reply via email to