El dl 01 de 08 de 2011 a les 11:35 +0000, en/na Francis Tyers va
escriure:
> Hello everyone
>
> At the moment, apertium-pretransfer accepts the output of the tagger
> (without surface forms) and splits MLUs (joined with '+') into two.
>
> As I'm working with output from the tagger with surface forms, it would
> be useful to have a mode in pretransfer that does this, but also strips
> the surface form.
>
> So instead of:
>
> in: ^per<pr>+el<det><def><m><pl>$
> out: ^per<pr>$ ^el<det><def><m><pl>$
>
> it would be
>
> in: ^pels/per<pr>+el<det><def><m><pl>$
> out: ^per<pr>$ ^el<det><def><m><pl>$
>
> I suggest calling the option -n (this matches the cg-proc option with
> the same function, --no-word-forms)
>
> Any objections ?
Here is the patch. I've also taken the liberty of adding '~' as a
compound word boundary, something that Unhammer and I have been
thinking of doing for a while. The '~' symbol has not yet been used
anywhere in analysis (only in generation).
It now works the same as '+', except that no space is output between
the parts. Here are some examples:
$ echo '^de<pr>+el<det><def><m><sg>$' | apertium-pretransfer
^de<pr>$ ^el<det><def><m><sg>$
$ echo '^del/de<pr>+el<det><def><m><sg>$' | apertium-pretransfer -n
^de<pr>$ ^el<det><def><m><sg>$
$ echo '^arbeidsmiljø<n><nt><sg><ind>~lov<n><m><sg><def>$' |
apertium-pretransfer
^arbeidsmiljø<n><nt><sg><ind>$^lov<n><m><sg><def>$
Note, there is an outstanding "bug"(?) with pretransfer where the
multiword queue gets appended to the first part of joined analyses, not
the second:
$ echo '^arbeidsmiljø<n><nt><sg><ind>+lov<n><m><sg><def># plan$' |
apertium-pretransfer
^arbeidsmiljø# plan<n><nt><sg><ind>$ ^lov<n><m><sg><def>$
If there is a joined analysis with a multiword queue, should the queue
go on the first or the last part of the join?
Fran
Index: apertium/apertium_pretransfer.cc
===================================================================
--- apertium/apertium_pretransfer.cc (revision 32331)
+++ apertium/apertium_pretransfer.cc (working copy)
@@ -56,7 +56,7 @@
}
}
-void procWord(FILE *input, FILE *output)
+void procWord(FILE *input, FILE *output, bool surface_forms)
{
int mychar;
wstring buffer = L"";
@@ -64,6 +64,12 @@
bool buffer_mode = false;
bool in_tag = false;
bool queuing = false;
+
+ if(surface_forms)
+ {
+ while((mychar = fgetwc_unlocked(input)) != L'/') ;
+ }
+
while((mychar = fgetwc_unlocked(input)) != L'$')
{
if(feof(input))
@@ -97,14 +103,19 @@
if(buffer_mode)
{
- if(mychar != L'+' || (mychar == L'+' && in_tag == true))
+ if((mychar != L'+' || (mychar == L'+' && in_tag == true)) &&
+ (mychar != L'~' || (mychar == L'~' && in_tag == true)))
{
buffer += static_cast<wchar_t>(mychar);
}
- else if(in_tag == false)
+ else if(in_tag == false && mychar == L'+')
{
buffer.append(L"$ ^");
}
+ else if(in_tag == false && mychar == L'~')
+ {
+ buffer.append(L"$^");
+ }
}
else
{
@@ -123,7 +134,7 @@
fputws_unlocked(buffer.c_str(), output);
}
-void processStream(FILE *input, FILE *output, bool null_flush)
+void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms)
{
while(true)
{
@@ -147,7 +158,7 @@
case L'^':
fputwc_unlocked(mychar, output);
- procWord(input, output);
+ procWord(input, output, surface_forms);
fputwc_unlocked(L'$', output);
break;
@@ -180,6 +191,7 @@
{
LtLocale::tryToSetLocale();
bool null_flush = false;
+ bool surface_forms = false;
#if HAVE_GETOPT_LONG
int option_index=0;
@@ -190,13 +202,14 @@
static struct option long_options[] =
{
{"null-flush", no_argument, 0, 'z'},
+ {"no-surface-forms", no_argument, 0, 'n'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
- int c=getopt_long(argc, argv, "zh", long_options, &option_index);
+ int c=getopt_long(argc, argv, "nzh", long_options, &option_index);
#else
- int c=getopt(argc, argv, "zh");
+ int c=getopt(argc, argv, "nzh");
#endif
if (c==-1)
break;
@@ -206,7 +219,11 @@
case 'z':
null_flush = true;
break;
-
+
+ case 'n':
+ surface_forms = true;
+ break;
+
case 'h':
default:
usage(argv[0]);
@@ -257,5 +274,5 @@
_setmode(_fileno(output), _O_U8TEXT);
#endif
- processStream(input, output, null_flush);
+ processStream(input, output, null_flush, surface_forms);
}
------------------------------------------------------------------------------
Got Input? Slashdot Needs You.
Take our quick survey online. Come on, we don't ask for help often.
Plus, you'll get a chance to win $100 to spend on ThinkGeek.
http://p.sf.net/sfu/slashdot-survey
_______________________________________________
Apertium-stuff mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/apertium-stuff