El dl 01 de 08 de 2011 a les 11:35 +0000, en/na Francis Tyers va
escriure:
> Hello everyone
> 
> At the moment, apertium-pretransfer accepts the output of the tagger
> (without surface forms) and splits MLUs (joined with '+') into two.
> 
> As I'm working with output from the tagger with surface forms, it would
> be useful to have a mode for pretransfer that does this but also strips
> the surface form.
> 
> So instead of:
>  
>  in: ^per<pr>+el<det><def><m><pl>$
>  out: ^per<pr>$ ^el<det><def><m><pl>$
> 
> it would be 
> 
>  in: ^pels/per<pr>+el<det><def><m><pl>$
>  out: ^per<pr>$ ^el<det><def><m><pl>$
> 
> I suggest calling the option -n (the same letter as the cg-proc option
> with the same function, --no-word-forms)
> 
> Any objections ?

Here is the patch. I've also taken the liberty of adding '~' as a
compound word boundary, something that Unhammer and I have been
thinking of doing for a while. The '~' symbol has not yet been used
anywhere in analysis (only in generation).

With this patch, '~' works the same as '+', except that no space is
output between the two parts. Here are some examples:

$ echo '^de<pr>+el<det><def><m><sg>$' | apertium-pretransfer
^de<pr>$ ^el<det><def><m><sg>$

$ echo '^del/de<pr>+el<det><def><m><sg>$' | apertium-pretransfer -n
^de<pr>$ ^el<det><def><m><sg>$

$ echo '^arbeidsmiljø<n><nt><sg><ind>~lov<n><m><sg><def>$' |
apertium-pretransfer 
^arbeidsmiljø<n><nt><sg><ind>$^lov<n><m><sg><def>$

Note, there is an outstanding "bug"(?) with pretransfer where the
multiword queue gets appended to the first part of joined analyses, not
the second:

$ echo '^arbeidsmiljø<n><nt><sg><ind>+lov<n><m><sg><def># plan$' |
apertium-pretransfer 
^arbeidsmiljø# plan<n><nt><sg><ind>$ ^lov<n><m><sg><def>$

If there is a joined analysis with a multiword queue, should the queue
go on the first or the last part of the join?

Fran
Index: apertium/apertium_pretransfer.cc
===================================================================
--- apertium/apertium_pretransfer.cc	(revision 32331)
+++ apertium/apertium_pretransfer.cc	(working copy)
@@ -56,7 +56,7 @@
   }
 }
 
-void procWord(FILE *input, FILE *output)
+void procWord(FILE *input, FILE *output, bool surface_forms)
 {
   int mychar;
   wstring buffer = L"";
@@ -64,6 +64,12 @@
   bool buffer_mode = false;
   bool in_tag = false;
   bool queuing = false;
+
+  if(surface_forms)
+  {
+    while((mychar = fgetwc_unlocked(input)) != L'/') ;
+  } 
+
   while((mychar = fgetwc_unlocked(input)) != L'$')
   {
     if(feof(input))
@@ -97,14 +103,19 @@
 
     if(buffer_mode)
     { 
-      if(mychar != L'+' || (mychar == L'+' && in_tag == true))
+      if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && 
+         (mychar != L'~' || (mychar == L'~' && in_tag == true)))
       {
         buffer += static_cast<wchar_t>(mychar);
       }
-      else if(in_tag == false)
+      else if(in_tag == false && mychar == L'+')
       {
         buffer.append(L"$ ^");
       }
+      else if(in_tag == false && mychar == L'~')
+      {
+        buffer.append(L"$^");
+      }
     }
     else
     {
@@ -123,7 +134,7 @@
   fputws_unlocked(buffer.c_str(), output);
 }
 
-void processStream(FILE *input, FILE *output, bool null_flush)
+void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms)
 {
   while(true)
   {
@@ -147,7 +158,7 @@
  
       case L'^':
         fputwc_unlocked(mychar, output);
-        procWord(input, output);
+        procWord(input, output, surface_forms);
         fputwc_unlocked(L'$', output);
         break;
       
@@ -180,6 +191,7 @@
 { 
   LtLocale::tryToSetLocale();
   bool null_flush = false;
+  bool surface_forms = false;
   
 #if HAVE_GETOPT_LONG
   int option_index=0;
@@ -190,13 +202,14 @@
     static struct option long_options[] =
     {
       {"null-flush", no_argument, 0, 'z'},
+      {"no-surface-forms", no_argument, 0, 'n'},
       {"help", no_argument, 0, 'h'},
       {0, 0, 0, 0}
     };
 
-    int c=getopt_long(argc, argv, "zh", long_options, &option_index);
+    int c=getopt_long(argc, argv, "nzh", long_options, &option_index);
 #else
-    int c=getopt(argc, argv, "zh");
+    int c=getopt(argc, argv, "nzh");
 #endif
     if (c==-1)
       break;
@@ -206,7 +219,11 @@
       case 'z':
         null_flush = true;
         break;
-      
+
+      case 'n':
+        surface_forms = true;
+        break;
+       
       case 'h':
       default:
         usage(argv[0]);
@@ -257,5 +274,5 @@
     _setmode(_fileno(output), _O_U8TEXT);
 #endif
 
-  processStream(input, output, null_flush);
+  processStream(input, output, null_flush, surface_forms);
 }
------------------------------------------------------------------------------
Got Input?   Slashdot Needs You.
Take our quick survey online.  Come on, we don't ask for help often.
Plus, you'll get a chance to win $100 to spend on ThinkGeek.
http://p.sf.net/sfu/slashdot-survey
_______________________________________________
Apertium-stuff mailing list
apertium-stuff@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/apertium-stuff

Reply via email to