Hey all,

I would like to be able to recognise a string with a regular expression.

I couldn't get MatchExe to work for now -- having tried for about a day,
I gave up[1] -- so I've implemented a 'recognise' function in the
Transducer class. 

$ ./lt-regexp bee*r ber
0       1       b       b       
1       2       e       e       
2       2       e       e       
2       3       r       r       
3

bee*r ber true

$ ./lt-regexp bee*r br
0       1       b       b       
1       2       e       e       
2       2       e       e       
2       3       r       r       
3

bee*r br false

I'm attaching a diff, if anyone is interested, and the source for the
lt-regexp program. 

Fran

1. This is surely an indication of my deficiency, not the code's.
Index: transducer.cc
===================================================================
--- transducer.cc	(revision 34527)
+++ transducer.cc	(working copy)
@@ -656,3 +656,46 @@
     fwprintf(output, L"%d\n", *it3);
   }
 }
+
+bool // returns true iff at least one state still alive after the last character is final
+Transducer::recognise(wstring patro, Alphabet &a, FILE *err) // err is referenced only by the commented-out trace below
+{
+  bool accepted = false;
+  vector<int> states ; // states still alive after the characters consumed so far
+  states.push_back(getInitial());
+  
+  // Advance the set of alive states once for each character of the input string
+  for(wstring::iterator it = patro.begin(); it != patro.end(); it++)  
+  {
+    vector<int> new_state; // states reached by consuming *it; nothing here removes duplicates
+    int sym = *it; // only referenced by the commented-out trace below
+    // For each of the current alive states
+    for(vector<int>::iterator it2 = states.begin(); it2 != states.end(); it2++)
+    {
+      multimap<int, int> p = transitions[*it2]; // by-value copy of this state's transitions; key is decoded below, value is the target state
+      // For each of the transitions in the state
+      for(multimap<int, int>::iterator it3 = p.begin(); it3 != p.end(); it3++)
+      { 
+        
+	pair<int, int> t = a.decode(it3->first); // NOTE(review): t.first is presumably the input-side symbol -- confirm against Alphabet::decode
+        wstring l = L"";
+        a.getSymbol(l, t.first); // presumably writes the textual form of t.first into l -- confirm against Alphabet::getSymbol
+        //fwprintf(err, L"step: %S %C (%d), state: %d, trans: %S, targ: %d\n", patro.c_str(), *it, sym, *it2, l.c_str(), it3->second);
+        if(l.find(*it) != wstring::npos || l == L"") // taken when l contains the current character anywhere, or when l is empty
+        { // NOTE(review): an empty l (presumably epsilon) consumes an input character here -- confirm this is intended
+          new_state.push_back(it3->second); 
+        }
+      }
+    }
+    states = new_state; // if this is empty, the remaining characters cannot add any state
+  }
+  for(vector<int>::iterator it4 = states.begin(); it4 != states.end(); it4++) // accept if any surviving state is final
+  {
+    if(isFinal(*it4)) 
+    {
+      accepted = true;
+    }
+  }
+
+  return accepted;
+}
Index: transducer.h
===================================================================
--- transducer.h	(revision 34527)
+++ transducer.h	(working copy)
@@ -146,6 +146,13 @@
   bool isFinal(int const state) const;
 
   /**
+   * Test if a pattern is recognised by the FST, decoding transition symbols with alphabet a
+   * @param patro widestring of the pattern to be recognised
+   * @return true if the pattern is recognised by the transducer
+   */
+  bool recognise(wstring patro, Alphabet &a, FILE *err = stderr);
+
+  /**
    * Set the state as a final or not, yes by default
    * @param state the state
    * @param value if true, the state is set as final state
/* 
 * g++ -o lt-regexp lt-regexp.cc -I/usr/include/libxml2 -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3 -lxml2
 */
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <list>
#include <set>
#include <stdexcept>
#include <string>

#include <lttoolbox/ltstr.h>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/transducer.h>
#include <lttoolbox/alphabet.h>
#include <lttoolbox/pool.h>
#include <lttoolbox/state.h>
#include <lttoolbox/regexp_compiler.h>
#include <lttoolbox/match_exe.h>
#include <lttoolbox/match_state.h>
#include <lttoolbox/xml_parse_util.h>

/**
 * Convert a multibyte C string, interpreted in the current C locale, into a
 * wide string.
 *
 * The result is sized from the input, so arguments of any length are handled
 * (there is no fixed-size intermediate buffer).
 *
 * @param arg NUL-terminated multibyte string; a null pointer yields an empty
 *            wide string
 * @return the converted wide string
 * @throws std::invalid_argument if arg contains a byte sequence that is not
 *         valid in the current locale
 */
std::wstring ws(char *arg)
{
  if(arg == nullptr)
  {
    return std::wstring();
  }

  // Every wide character is produced from at least one input byte, so the
  // byte length is an upper bound on the number of wide characters needed.
  const std::size_t max_chars = std::strlen(arg);
  std::wstring converted(max_chars, L'\0');
  if(max_chars == 0)
  {
    return converted;
  }

  const std::size_t num_chars = std::mbstowcs(&converted[0], arg, max_chars);
  if(num_chars == static_cast<std::size_t>(-1))
  {
    // mbstowcs signals an invalid multibyte sequence with (size_t)-1; using
    // that value as a length would be meaningless, so report it instead.
    throw std::invalid_argument("ws: argument is not a valid multibyte string in the current locale");
  }

  converted.resize(num_chars);
  return converted;
}

int main (int argc, char** argv)
{
  Alphabet alphabet;
  Transducer t;
  RegexpCompiler re;
  bool matched;

  LtLocale::tryToSetLocale();

  if(argc < 3) 
  {
    wcout << L"Usage: lt-regexp <pattern> <string to match>" << endl;
    exit(-1);
  }

  FILE *output = stdout;
  wstring pattern = ws(argv[1]);
  wstring s = ws(argv[2]);

  re.initialize(&alphabet);
  re.compile(pattern);
  t = re.getTransducer();
  t.minimize();

  t.show(alphabet, output);

  matched = t.recognise(s, alphabet, stderr); 

  wcout << endl << pattern << L" " << s << L" " << (matched ? L"true" : L"false")<< endl;
}
------------------------------------------------------------------------------
All the data continuously generated in your IT infrastructure contains a
definitive record of customers, application performance, security
threats, fraudulent activity and more. Splunk takes this data and makes
sense of it. Business sense. IT sense. Common sense.
http://p.sf.net/sfu/splunk-d2d-oct
_______________________________________________
Apertium-stuff mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/apertium-stuff

Reply via email to