Hey all,
I would like to be able to recognise a string with a regular expression.
I couldn't get MatchExe to work -- after trying for about a day,
I gave up[1] -- so for now I've implemented a 'recognise' function in the
Transducer class.
$ ./lt-regexp bee*r ber
0 1 b b
1 2 e e
2 2 e e
2 3 r r
3
bee*r ber true
$ ./lt-regexp bee*r br
0 1 b b
1 2 e e
2 2 e e
2 3 r r
3
bee*r br false
I'm attaching a diff, if anyone is interested, and the source for the
lt-regexp program.
Fran
1. This is surely an indication of my deficiency, not the code's.
Index: transducer.cc
===================================================================
--- transducer.cc (revision 34527)
+++ transducer.cc (working copy)
@@ -656,3 +656,46 @@
fwprintf(output, L"%d\n", *it3);
}
}
+
+bool
+Transducer::recognise(wstring patro, Alphabet &a, FILE *err)
+{
+ bool accepted = false;
+ vector<int> states ; // states still alive after the input read so far; seeded with the initial state
+ states.push_back(getInitial());
+
+ // Consume the input string one character at a time, advancing every alive state in parallel
+ for(wstring::iterator it = patro.begin(); it != patro.end(); it++)
+ {
+ vector<int> new_state; // states reached by consuming the current character
+ int sym = *it; // only referenced by the commented-out trace below
+ // For each of the current alive states
+ for(vector<int>::iterator it2 = states.begin(); it2 != states.end(); it2++)
+ {
+ multimap<int, int> p = transitions[*it2]; // by-value copy of this state's transition table
+ // For each of the transitions in the state (key: encoded symbol pair, value: target state)
+ for(multimap<int, int>::iterator it3 = p.begin(); it3 != p.end(); it3++)
+ {
+ // Decode the transition key into a symbol pair and fetch the text of its first member
+ pair<int, int> t = a.decode(it3->first);
+ wstring l = L"";
+ a.getSymbol(l, t.first);
+ //fwprintf(err, L"step: %S %C (%d), state: %d, trans: %S, targ: %d\n", patro.c_str(), *it, sym, *it2, l.c_str(), it3->second);
+ if(l.find(*it) != wstring::npos || l == L"") // NOTE(review): substring match, and empty-text transitions also consume a character -- confirm intended
+ {
+ new_state.push_back(it3->second);
+ }
+ }
+ }
+ states = new_state; // the reached states become the alive set for the next character
+ }
+ for(vector<int>::iterator it4 = states.begin(); it4 != states.end(); it4++)
+ {
+ if(isFinal(*it4))
+ {
+ accepted = true;
+ }
+ }
+ // accepted is true iff at least one state alive after the whole input is final
+ return accepted;
+}
Index: transducer.h
===================================================================
--- transducer.h (revision 34527)
+++ transducer.h (working copy)
@@ -146,6 +146,13 @@
bool isFinal(int const state) const;
/**
+ * Test if a pattern is recognised by the FST
+ * @param patro a widestring of the pattern to be recognised
+ * @return true if the pattern is recognised by the transducer
+ */
+ bool recognise(wstring patro, Alphabet &a, FILE *err = stderr);
+
+ /**
* Set the state as a final or not, yes by default
* @param state the state
* @param value if true, the state is set as final state
/*
* g++ -o lt-regexp lt-regexp.cc -I/usr/include/libxml2 -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3 -lxml2
*/
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <list>
#include <set>
#include <stdexcept>
#include <string>
#include <lttoolbox/ltstr.h>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/transducer.h>
#include <lttoolbox/alphabet.h>
#include <lttoolbox/pool.h>
#include <lttoolbox/state.h>
#include <lttoolbox/regexp_compiler.h>
#include <lttoolbox/match_exe.h>
#include <lttoolbox/match_state.h>
#include <lttoolbox/xml_parse_util.h>
/**
 * Convert a multibyte C string, encoded according to the current C locale,
 * into a wide string.
 *
 * The destination is sized from the input instead of using a fixed stack
 * buffer, so arguments of any length are converted without overflow.
 *
 * @param arg NUL-terminated multibyte string; a null pointer yields an
 *            empty result
 * @return the converted wide string
 * @throws std::invalid_argument if arg contains a byte sequence that is not
 *         valid in the current locale
 */
std::wstring ws(char *arg)
{
  if(!arg)
  {
    return std::wstring();
  }

  // Every wide character is produced from at least one byte, so the byte
  // length is a safe upper bound on the number of wide characters.
  std::size_t const max_chars = std::strlen(arg);
  std::wstring result(max_chars, L'\0');
  if(max_chars == 0)
  {
    return result;
  }

  std::size_t const num_chars = std::mbstowcs(&result[0], arg, max_chars);
  if(num_chars == static_cast<std::size_t>(-1))
  {
    // mbstowcs reports an invalid sequence with (size_t)-1; using that value
    // as a length would be meaningless, so fail loudly instead.
    throw std::invalid_argument("ws: argument is not a valid multibyte string in the current locale");
  }

  // Multi-byte characters make the result shorter than the byte count.
  result.resize(num_chars);
  return result;
}
int main (int argc, char** argv)
{
Alphabet alphabet;
Transducer t;
RegexpCompiler re;
bool matched;
LtLocale::tryToSetLocale();
if(argc < 3)
{
wcout << L"Usage: lt-regexp <pattern> <string to match>" << endl;
exit(-1);
}
FILE *output = stdout;
wstring pattern = ws(argv[1]);
wstring s = ws(argv[2]);
re.initialize(&alphabet);
re.compile(pattern);
t = re.getTransducer();
t.minimize();
t.show(alphabet, output);
matched = t.recognise(s, alphabet, stderr);
wcout << endl << pattern << L" " << s << L" " << (matched ? L"true" : L"false")<< endl;
}
------------------------------------------------------------------------------
All the data continuously generated in your IT infrastructure contains a
definitive record of customers, application performance, security
threats, fraudulent activity and more. Splunk takes this data and makes
sense of it. Business sense. IT sense. Common sense.
http://p.sf.net/sfu/splunk-d2d-oct
_______________________________________________
Apertium-stuff mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/apertium-stuff