Darkdadaah has submitted this change and it was merged.

Change subject: Add option to find a pattern in the pages
......................................................................


Add option to find a pattern in the pages

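- New option -r <pattern>: keep only the pages whose text matches the given
  Perl regular expression; pages that do not match are skipped
- Also use \r so the progress line is rewritten in place instead of printing
  a new line each time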

Change-Id: I242071b5a46ce8ba74bcd43c29353bdaaacd0da7
---
M scripts/list_vs_dump.pl
1 file changed, 21 insertions(+), 5 deletions(-)

Approvals:
  Darkdadaah: Verified; Looks good to me, approved



diff --git a/scripts/list_vs_dump.pl b/scripts/list_vs_dump.pl
index 6ea095d..2844574 100755
--- a/scripts/list_vs_dump.pl
+++ b/scripts/list_vs_dump.pl
@@ -16,6 +16,7 @@
 use wiktio::parser                     qw( parseArticle printArticle 
parseLanguage printLanguage parseType printType is_gentile) ;
 use wiktio::pron_tools         qw(cherche_prononciation simple_prononciation 
section_prononciation) ;
 our %opt ;
+my %count = ('pages' => 0, 'mots' => 0);
 
 #################################################
 # Message about this program and how to use it
@@ -38,6 +39,8 @@
        -p list,  : utiliser ces espaces de nommage
        -m        : Utiliser les pages de l'espace principal (en combinaison 
avec -P or -p)
        
+       -r <str>  : Ne garder que les pages ayant ce motif regex dans leur texte
+       
 EOF
        exit ;
 }
@@ -46,7 +49,7 @@
 # Command line options processing
 sub init()
 {
-       getopts( 'hi:I:o:PHp:m', \%opt ) or usage() ;
+       getopts( 'hi:I:o:PHp:mr:', \%opt ) or usage() ;
        usage() if $opt{h} ;
        
        usage( "Chemin du dump (-i)" ) if not $opt{i} ;
@@ -109,9 +112,18 @@
        my ($titre, $article, $dico, $sql) = @_ ;
        
        # Précorrection
+       my $regexok = 0;
        foreach my $line (@$article) {
+               if (not $regexok and $opt{r} and $line =~ /$opt{r}/) {
+                       $regexok=1;
+               }
                # Ligne de traduction ou prononciation
                $line =~ s/\*\*? ?\{\{[^\}\{]+?\}\} ?:.+$/ /g ;
+       }
+       
+       # Pas trouvé le motif indispensable
+       if ($opt{r} and not $regexok) {
+               return 0;
        }
        
        # Wikisource qualité
@@ -187,6 +199,7 @@
                my $cols = join("\t", @cols) ;
                print $sql "$cols\n" ;
                $num_mots++ ;
+               $count{mots}++;
        }
        
        return $num_mots ;
@@ -226,7 +239,6 @@
 # Read dump
 open(DUMP, dump_input($opt{i})) or die "Couldn't open '$opt{i}': $!\n" ;
 my $title = '' ;
-my $n = 0 ;
 my $complete_article = 0 ;
 my @article = () ;
 
@@ -237,6 +249,7 @@
 print STDERR "Allowed namespaces: ".join(", ", @namespaces)."\n" ;
 open(my $sql, ">$sqlfile") or die("$sqlfile: $!") ;
 
+$|=1;
 while(<DUMP>) {
        if ( /<title>(.+?)<\/title>/ ) {
                $title = $1 ;
@@ -305,16 +318,19 @@
                        my $mots_en_plus = article($title, \@article, $dico, 
$sql) ;
                        $num_mots += $mots_en_plus if $mots_en_plus ;
                        ######################################
-                       $n++ ;
-                       print "[$n] $title\n" if $n%10000==0 ;
+                       $count{pages}++ ;
+                       print STDERR "[$count{pages}] [$count{mots}] $title     
     \r" if $count{pages}%1000==0 ;
                }
                $complete_article = 0 ;
        }
 }
+$|=0;
+print STDERR "\n";
 close(DUMP) ;
 close($sql) ;
 
-print "Total = $n\n" ;
+print "Total = $count{pages}\n" ;
+print "Total = $count{mots}\n" if $count{mots};
 
 my $num_dico = keys %$dico ;
 print "Articles dico: $num_dico\n" ;

-- 
To view, visit https://gerrit.wikimedia.org/r/71044
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I242071b5a46ce8ba74bcd43c29353bdaaacd0da7
Gerrit-PatchSet: 1
Gerrit-Project: wiktionary/anagrimes
Gerrit-Branch: master
Gerrit-Owner: Darkdadaah <[email protected]>
Gerrit-Reviewer: Darkdadaah <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to