Darkdadaah has submitted this change and it was merged.
Change subject: Add option to find a pattern in the pages
......................................................................
Add option to find a pattern in the pages
- The pattern is given with option -r "perl regexp pattern"; only pages whose text matches it are kept
- Also use \r to rewrite the progress line in place
Change-Id: I242071b5a46ce8ba74bcd43c29353bdaaacd0da7
---
M scripts/list_vs_dump.pl
1 file changed, 21 insertions(+), 5 deletions(-)
Approvals:
Darkdadaah: Verified; Looks good to me, approved
diff --git a/scripts/list_vs_dump.pl b/scripts/list_vs_dump.pl
index 6ea095d..2844574 100755
--- a/scripts/list_vs_dump.pl
+++ b/scripts/list_vs_dump.pl
@@ -16,6 +16,7 @@
use wiktio::parser qw( parseArticle printArticle
parseLanguage printLanguage parseType printType is_gentile) ;
use wiktio::pron_tools qw(cherche_prononciation simple_prononciation
section_prononciation) ;
our %opt ;
+my %count = ('pages' => 0, 'mots' => 0);
#################################################
# Message about this program and how to use it
@@ -38,6 +39,8 @@
-p list, : utiliser ces espaces de nommage
-m : Utiliser les pages de l'espace principal (en combinaison
avec -P or -p)
+ -r <str> : Ne garder que les pages ayant ce motif regex dans leur texte
+
EOF
exit ;
}
@@ -46,7 +49,7 @@
# Command line options processing
sub init()
{
- getopts( 'hi:I:o:PHp:m', \%opt ) or usage() ;
+ getopts( 'hi:I:o:PHp:mr:', \%opt ) or usage() ;
usage() if $opt{h} ;
usage( "Chemin du dump (-i)" ) if not $opt{i} ;
@@ -109,9 +112,18 @@
my ($titre, $article, $dico, $sql) = @_ ;
# Précorrection
+ my $regexok = 0;
foreach my $line (@$article) {
+ if (not $regexok and $opt{r} and $line =~ /$opt{r}/) {
+ $regexok=1;
+ }
# Ligne de traduction ou prononciation
$line =~ s/\*\*? ?\{\{[^\}\{]+?\}\} ?:.+$/ /g ;
+ }
+
+ # Pas trouvé le motif indispensable
+ if ($opt{r} and not $regexok) {
+ return 0;
}
# Wikisource qualité
@@ -187,6 +199,7 @@
my $cols = join("\t", @cols) ;
print $sql "$cols\n" ;
$num_mots++ ;
+ $count{mots}++;
}
return $num_mots ;
@@ -226,7 +239,6 @@
# Read dump
open(DUMP, dump_input($opt{i})) or die "Couldn't open '$opt{i}': $!\n" ;
my $title = '' ;
-my $n = 0 ;
my $complete_article = 0 ;
my @article = () ;
@@ -237,6 +249,7 @@
print STDERR "Allowed namespaces: ".join(", ", @namespaces)."\n" ;
open(my $sql, ">$sqlfile") or die("$sqlfile: $!") ;
+$|=1;
while(<DUMP>) {
if ( /<title>(.+?)<\/title>/ ) {
$title = $1 ;
@@ -305,16 +318,19 @@
my $mots_en_plus = article($title, \@article, $dico,
$sql) ;
$num_mots += $mots_en_plus if $mots_en_plus ;
######################################
- $n++ ;
- print "[$n] $title\n" if $n%10000==0 ;
+ $count{pages}++ ;
+ print STDERR "[$count{pages}] [$count{mots}] $title
\r" if $count{pages}%1000==0 ;
}
$complete_article = 0 ;
}
}
+$|=0;
+print STDERR "\n";
close(DUMP) ;
close($sql) ;
-print "Total = $n\n" ;
+print "Total = $count{pages}\n" ;
+print "Total = $count{mots}\n" if $count{mots};
my $num_dico = keys %$dico ;
print "Articles dico: $num_dico\n" ;
--
To view, visit https://gerrit.wikimedia.org/r/71044
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I242071b5a46ce8ba74bcd43c29353bdaaacd0da7
Gerrit-PatchSet: 1
Gerrit-Project: wiktionary/anagrimes
Gerrit-Branch: master
Gerrit-Owner: Darkdadaah <[email protected]>
Gerrit-Reviewer: Darkdadaah <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits