Revision: 9491
http://languagetool.svn.sourceforge.net/languagetool/?rev=9491&view=rev
Author: jaumeortola
Date: 2013-02-25 10:17:36 +0000 (Mon, 25 Feb 2013)
Log Message:
-----------
[ca] Add script used for tagging verbs according to regional variants.
Modified Paths:
--------------
trunk/languagetool/languagetool-core/CHANGES.txt
Added Paths:
-----------
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/tag_verbs.pl
Modified: trunk/languagetool/languagetool-core/CHANGES.txt
===================================================================
--- trunk/languagetool/languagetool-core/CHANGES.txt 2013-02-25 10:04:34 UTC
(rev 9490)
+++ trunk/languagetool/languagetool-core/CHANGES.txt 2013-02-25 10:17:36 UTC
(rev 9491)
@@ -12,6 +12,7 @@
-fixed multiple false alarms.
-improved sentence and word tokenization.
-the tagger dictionary has been fixed and expanded (added 9000 nameplaces).
+ -Verbal forms have been tagged according to regional variants (with script
tag_verbs.pl).
-Hunspell dictionary (Softcatalà) has been replaced with LT tagger
dictionary
for spellchecking.
Added:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/tag_verbs.pl
===================================================================
---
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/tag_verbs.pl
(rev 0)
+++
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/tag_verbs.pl
2013-02-25 10:17:36 UTC (rev 9491)
@@ -0,0 +1,456 @@
+# Etiqueta formes verbals
+# C=català central, V=valencià, B=Balear
+# X=C+V, Y=C+B, Z=V+B
+
+use strict;
+use warnings;
+use autodie;
+
+# Input and output file names, resolved relative to the current directory.
+# $out receives everything the main loop prints to OUTFILE; $out2 receives
+# only the entries the loop prints to OUTFILE2 instead.
+my $f1 = "freeling_utf8.txt";
+my $out = "freeling_utf8_verbs_reetiquetats.txt";
+my $out2 = "verbs_descartats.txt";
+# Per-line scratch variables, filled from regex captures inside the main
+# loop: the word form (first field), its root (second field) and its tag
+# (third field).
+my $paraula = "";
+my $arrel = "";
+my $postag = "";
+# Set to 1 as soon as a rule has written the current line to an output file.
+my $printed=0;
+
+# Three-argument open keeps the mode separate from the file name, so a name
+# beginning with '>' or '|' can never change how the file is opened. The
+# bareword handle names are kept because the main loop refers to them.
+open(FILE1, "<", $f1) or die "Could not open file: $! \n";
+open(OUTFILE, ">", $out) or die "Cannot open $out for writing \n";
+# The message now names $out2, the file actually being opened here.
+open(OUTFILE2, ">", $out2) or die "Cannot open $out2 for writing \n";
+# Main loop: read FILE1 line by line and copy every line to OUTFILE, with a
+# regional-variant letter appended to the tag of verb entries.
+#
+# Every rule matches a line of the shape "<word> <root> <tag>" (three fields
+# separated by single spaces, tag starting with V). An optional single
+# character after the tag is matched by the trailing ".?" but is never copied
+# to the output. A matching rule prints "<word> <root> <tag>" immediately
+# followed by one letter (per the legend at the top of the script):
+#   C = central Catalan, V = Valencian, B = Balearic,
+#   X = C+V, Y = C+B, Z = V+B.
+#
+# The rules form a single if/elsif chain, so only the first rule whose outer
+# pattern matches is considered. If that rule's inner condition then rejects
+# the line, $printed stays 0 and the fallback at the bottom of the loop body
+# handles it: a line shaped like a verb entry gets "0" appended, any other
+# line is copied unchanged.
+#
+# NOTE(review): some long statements below appear split across two or three
+# lines, with continuation lines lacking the leading '+'. This is presumably
+# mail line wrapping; confirm against the repository copy, because a line
+# break inside a regex without /x is matched literally.
+while(my $line = <FILE1>)
+{
+ $printed=0;
+ chomp($line);
+ if ($line =~ /^([^ ]+)([aeiï]ra|[aeiï]res|[àéí]rem|[àéí]reu|[aeiï]ren)
([^ ]+) (VMSI...).?$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ elsif ($line =~ /^(.*ten[cgd].*) (.*tenir) (V......).?$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ elsif ($line =~ /^(.*ven[cgd].*) (.*venir) (V......).?$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ elsif ($line =~ /^(vén[cg].*) (venir) (V......).?$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ elsif ($line =~ /^([^ ]+)(e) ([^ ]+) (V.[^N]....).?$/)
+ {
+ $paraula="$1$2";
+ $arrel="$3";
+ $postag="$4";
+ # NOTE(review): the root pattern below is an ungrouped alternation, so
+ # '^' binds only to the first alternative and '$' only to the last.
+ if (($paraula !~ /^(he|ve|vine)$/)
+ && ($arrel !~ /^.*obrir|.*omplir|.*córrer$/))
#caution: maniobre
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ elsif ($line =~ /^(.*(omple|obre)) ([^ ]+) (VMM02S0).?$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Y\n"; #Catalan - Balearic: omple-li
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]+)(a|en?) ([^ ]+) (V.SP...).?$/)
+ {
+ $paraula="$1$2";
+ if ($paraula!~ /^(s|c)àpig(a|uen)$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]+)(es) ([^ ]+) (V.SP...).?$/)
+ {
+ $paraula="$1$2";
+ # NOTE(review): ungrouped alternation; as written, any word ending in
+ # "fes" is excluded, not only the word "fes" itself. Confirm the intent.
+ if ($paraula!~ /^(s|c)àpigues|fes$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]+)(en?) ([^ ]+) (V.M....).?$/)
+ {
+ $paraula="$1$2";
+ # NOTE(review): ungrouped alternation; the middle alternatives (vine,
+ # pren, ven, fen, ...) are unanchored and match anywhere in the word.
+ if ($paraula!~
/^(s|c)àpiguen|vine|.*obre|.*omple|pren|ven|fen|.*corre$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]+)(a) ([^ ]+) (V.M03S0).?$/)
+ {
+ $paraula="$1$2";
+ if ($paraula!~ /^(s|c)àpiga$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^
]+)(asses|àssem|àsseu|assen|esses|éssem|ésseu|essen|isses|íssem|ísseu|issen)
([^ ]+) (V.SI...).?$/)
+ {
+ $paraula="$1$2";
+ # NOTE(review): this test only rejects an empty $paraula, which the outer
+ # pattern cannot produce, so the branch is always taken.
+ if ($paraula!~ /^$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+(assis|àssim|àssiu|assin)) ([^ ]+) (V.SI...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$3";
+ #if ($arrel!~ /^.+escar$/)
+ {
+ print OUTFILE "$1 $3 $4";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]+)(essis|éssim|éssiu|essin|issis|íssim|íssiu|issin)
([^ ]+) (V.SI...).?$/)
+ {
+ $paraula="$1$2";
+ # NOTE(review): always true, for the same reason as the /^$/ test above.
+ if ($paraula!~ /^$/)
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "Y\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^
]+)(.*[iï](sc|sca|sques|squem|squeu|squen)|.+e(sca|sques|squen)) ([^ ]+)
(V.SI...).?$/)
+ {
+ $paraula="$1$2";
+ $arrel="$5";
+ if (($paraula!~ /^.*(visc|visquem|visqueu)$/) &&
($arrel!~ /^.*[ei]scar$/))
+ {
+ print OUTFILE "$1$2 $5 $6";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]*)(òmplic|òbric|córrec) ([^ ]+) (V......).?$/)
+ {
+ {
+ print OUTFILE "$1$2 $3 $4";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^([^ ]*)((tra|ja|ve|cre)e(m|nt|u)) ([^ ]+) (V......).?$/)
+ {
+ $paraula="$1$2";
+ $arrel="$5";
+ if ($arrel!~ /^(.*crear|desvear)$/)
+ {
+ print OUTFILE "$1$2 $5 $6";
+ print OUTFILE "V\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+(esqui|esquis|esquin)|.*fé) ([^ ]+) (V......).?$/)
+ {
+ $paraula="$1";
+ $arrel="$3";
+ if ($arrel!~ /^.+escar$/)
+ {
+ print OUTFILE "$1 $3 $4";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ }
+ # Entries tagged VMIP1S0. A form that is not in the exclusion list below,
+ # and whose root does not end in "sentir", is tagged B. The remaining forms
+ # are classified by the elsif chain that follows, first match first.
+ elsif ($line =~ /^([^ ]+) ([^ ]+) (VMIP1S0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ if (($paraula !~
/^(.+[eo]|.+scric|.*tr[ea]c|.*faig|.*[vt]inc|.*[ïie]sc|.*córrec|acut|tix|tiny|pertanc|planc|complanc|guard|absolc|adic|aparec|aprehenc|aprenc|assec|atenc|bec|benveig|caic|calc|carvenc|cloc|coc|colc|commoc|comparec|complac|componc|comprenc|concloc|condolc|conec|confonc|contenc|contradic|corfonc|corprenc|corresponc|crec|dec|decaic|defenc|depenc|desaparec|desaprenc|desatenc|descloc|descomplac|descomponc|desconec|descrec|desdic|desentenc|despenc|desplac|desponc|desprenc|dic|difonc|dissolc|distenc|dolc|duc|embec|emprenc|encenc|encloc|enduc|enfonc|entenc|entredic|entreploc|entreveig|equivalc|escaic|estenc|estic|excloc|expenc|exsolc|fonc|incloc|infonc|interdic|llec|maldic|malentenc|malprenc|malveig|malvenc|malvull|mamprenc|marfonc|menyscrec|moc|molc|noc|ofenc|olc|parec|plac|ploc|ponc|predic|prenc|pretenc|prevalc|preveig|promoc|puc|rac|reabsolc|reaparec|rebec|recaic|recloc|recoc|recomponc|reconec|redic|refonc|remoc|remolc|reprenc|resolc|responc|retonc|retrovenc|reveig|ric|romanc|salprenc|sec|sobreentenc|sobreprenc|sobresec|sobrevalc|solc|somoc|somric|sorprenc|suspenc|sé|tolc|tonc|transfonc|ullprenc|vaig|valc|veig|venc|vull|ajac|ajec|hac|hec|jac|jec)$/)
+ && ($arrel !~ /^.*sentir$/))
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ elsif ($paraula =~
/^(desatrac|plovisc|nevisc|envisc|entreobr|reompl|tenc|ompl|obr|vénc|abstenc|advenc|avenc|captenc|cartenc|contrafaç|contravenc|convenc|desavenc|desconvenc|desfaç|detenc|entretenc|entrevenc|esdevenc|estrafaç|intervenc|mantenc|menystenc|obtenc|obvenc|perfaç|pervenc|prevenc|provenc|rarefaç|reconvenc|refaç|retenc|revenc|satisfaç|sobrevenc|sostenc|subvenc|viltenc|faç)$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ elsif ($paraula =~ /^(.+esc)$/) #Valencian+Balearic
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Z\n";
+ $printed=1;
+ }
+ elsif ($paraula =~ /^.+[qg]uo$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Y\n"; #Catalan+Balearic
+ $printed=1;
+ }
+ # This is the only rule that writes to OUTFILE2 instead of OUTFILE. Since
+ # $printed is set, the entry does not reach OUTFILE at all.
+ # NOTE(review): the root pattern is an ungrouped alternation; '^' binds
+ # only to the first alternative and '$' only to the last.
+ elsif (($paraula =~ /^.+([^e]i|ï)(xo)$/)
+ && ($arrel !~
/^.+eixir|.+uixir|.+uixar|.+aixar|.+oixar|.+àixer|.*néixer$/))
+ #removes: fornixo, aclameïxo
+ {
+ print OUTFILE2 "$paraula $arrel $postag";
+ print OUTFILE2 "Y\n"; #Catalan+Balearic
+ $printed=1;
+ }
+ elsif ($paraula =~ /^.+o$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "C\n"; #Catalan
+ $printed=1;
+ }
+ elsif ($paraula =~
/^(acut|complanc|pertanc|planc|isc|desisc|reïsc|sobreïsc|tix|tiny)$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Z\n"; #Valencian+Balearic
+ $printed=1;
+ }
+ elsif (($paraula =~ /^.*sent$/) && ($arrel =~/^.*sentir$/))
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Z\n"; #Valencian+Balearic
+ $printed=1;
+ }
+ elsif ($paraula =~ /^.+[ïi]sc$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "V\n"; #Valencian
+ $printed=1;
+ }
+ elsif ($paraula =~ /^.*[vt]inc$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "X\n"; #Valencian+Catalan
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+a[um]) ([^ ]+) (VM[MI].[12]P0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ if (($paraula!~ /^va[um]$/) && ($arrel!~ /^.*caure$/))
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*corrs?) ([^ ]+) (V......).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ #if (($paraula!~ /^va[um]$/) && ($arrel!~ /^.*caure$/))
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+às) ([^ ]+) (V.SI...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ #if ($arrel!~ /^.+escar$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "Z\n"; #Valencian+Balearic
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+eu) ([^ ]+) (VMM02P0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ if ($arrel!~ /^.+(er|re)$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "X\n"; #Catalan+Valencian
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+([^e]i|ï)(x|xes|xen)) ([^ ]+) (V.IP...|VMM02S0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$4";
+ $postag="$5";
+ # NOTE(review): ungrouped alternation; '^' binds only to the first
+ # alternative and '$' only to the last.
+ if ($arrel !~
/^.+eixir|.+uixir|.+uixar|.+aixar|.+oixar|.+àixer|.*néixer$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "V\n"; #Valencian
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+[iï][ns]?) ([^ ]+) (V.SP...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ # NOTE(review): ungrouped alternation; only "tixi" is anchored at the
+ # start and only "tinyin" at the end.
+ if ($paraula !~
/^tixi|tixis|tixin|tinyi|tinyis|tinyin$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Y\n"; #Catalan Balearic
+ $printed=1;
+ }
+ else
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n"; #Balearic
+ $printed=1;
+ }
+
+ }
+ elsif ($line =~ /^(.+[iï]n?) ([^ ]+) (V.M03[SP].).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ # NOTE(review): same ungrouped alternation as in the previous rule.
+ if ($paraula !~
/^tixi|tixis|tixin|tinyi|tinyis|tinyin$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Y\n"; #Catalan Balearic
+ $printed=1;
+ }
+ else
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n"; #Balearic
+ $printed=1;
+ }
+ }
+ # Exceptions for omplir/obrir. The present indicative omple/obre is valid
+ # in all cases.
+ elsif ($line =~ /^(.*(ompli|obri)[ns]?) ([^ ]+) (VMIP...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$3";
+ $postag="$4";
+ #if ($arrel !~ /^.*obrir|.*omplir$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Z\n"; #Valencian-Balearic present indicative: obri, ompli
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*(ompli|obri)) ([^ ]+) (VMM02S0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$3";
+ $postag="$4";
+ #if ($arrel !~ /^.*obrir|.*omplir$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ # NOTE(review): this rule matches the tag VMM02S0, not VMIP...; the
+ # "present indicative" wording may be copied from the rule above.
+ print OUTFILE "V\n"; #Valencian present indicative: obri, ompli
+ $printed=1;
+ }
+ }
+
+ # More exceptions: tix, tiny, vist, deim and the like. Imperatives: dis,
+ # obtín, obtén, etc.
+
+ elsif ($line =~ /^(tix|tixes|tixen|tiny|tinys|tinyen) ([^ ]+)
(V.IP...|VMM....).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "Z\n"; #Valencian Balearic
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*[fdv]eim|.*[fdv]eis) ([^ ]+) (V.IP...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n"; # Balearic
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*facem|.*faceu|.*feis) ([^ ]+) (V.M....|V.SP...).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n"; # Balearic
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.+[ií]s) (.*dir) (VMM02S0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "V\n"; # Valencian
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*tín|tin) (.*tenir) (VMM02S0).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "V\n"; # Valencian
+ $printed=1;
+ }
+ }
+ elsif ($line =~ /^(.*igue[mu]) (.+ir) (V......).?$/)
+ {
+ $paraula="$1";
+ $arrel="$2";
+ $postag="$3";
+ if ($arrel!~
/^(dir|desdir|maldir|adir|contradir|entredir|interdir|predir|redir)$/)
+ {
+ print OUTFILE "$paraula $arrel $postag";
+ print OUTFILE "B\n";
+ $printed=1;
+ }
+ }
+
+
+
+ # Fallback for lines that no rule above has written.
+ if (!$printed)
+ {
+ # A verb-shaped entry without a regional tag gets the marker "0".
+ if ($line =~ /^([^ ]+) ([^ ]+) (V......).?$/)
+ {
+ print OUTFILE "$1 $2 $3";
+ print OUTFILE "0\n";
+ }
+ # Anything else is copied through unchanged.
+ else
+ {
+ print OUTFILE "$line\n";
+ }
+ }
+
+}
+# Release the input handle and both output handles.
+close(FILE1);
+close(OUTFILE);
+close(OUTFILE2);
+
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_d2d_feb
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits