Hello,

I am a beginner  in perl and I have segmentation fault in my code. the
code run perfectly for the until the third iteration and it produce
segmentation fault  in the inner while loop in the 3rd iteration of
the outer while loop.
Would you please help me with hints.
Your help will be appreciated

--
Motaz SAAD




#!/usr/bin/perl -w

# Code : Dake
use strict;
use Parse::MediaWikiDump;
use utf8;
use XML::Parser;
use XML::Writer;
use IO::File;

binmode STDOUT, ":utf8";



#my $file = shift(@ARGV) or die "must specify a Mediawiki dump file";
my $enWiki = 'enwiki-latest-pages-articles.xml';
my $frWiki = 'frwiki-20111123-pages-articles.xml';
my $arWiki = 'arwiki-20111118-pages-articles.xml';

my $enPages = Parse::MediaWikiDump::Pages->new($enWiki);
my $frPages = Parse::MediaWikiDump::Pages->new($frWiki);
my $arPages = Parse::MediaWikiDump::Pages->new($arWiki);


my $output = new IO::File("enfrar.xml", ">:utf8");

my $writer = new XML::Writer(OUTPUT => $output, DATA_MODE =>
1,DATA_INDENT => 2);
$writer->xmlDecl("UTF-8");

my $enPage;
#my $frPage;
#my $arPage;

my $enId;
#my $frId;
#my $arId;

my $enTitle;
#my $arTitle;
#my $frTitle;

my $enText;
#my $frText;
#my $arText;



my $EnCount = 0;
my $EnArFrCount = 0;
my $EnArCount = 0;
my $EnFrCount = 0;
my $testCount = 1;


#my $category;

$writer->startTag("en-fr-ar-wiki");

while(defined($enPage = $enPages->next)) {#for each english article
        #main namespace only
        next unless $enPage->namespace eq '';

        $enId = $enPage->id;
        $enTitle = $enPage->title;
        #$category = $page->category;
        $enText = $enPage->text;

        $EnCount++;


        if($enTitle eq "A") {next;}







        #if (($$text =~ /\[\[en:/i) && ($$text =~ /\[\[ar:/i))

        if (  ($$enText =~ m/\[\[fr:/i)  && ($$enText =~ m/\[\[ar:/i)  ) {#
if the english article contains links for arabic and french articles
                print "\nlinks found for ar & fr in en article entitled:   ",
$enTitle , "\n";

                my $frPage;
                my $arPage;

                my $frId = "id not found";
                my $arId = "id not found";

                my $arTitle;
                my $frTitle;

                my $frText;
                my $arText;
                $frText = "text not found";
                $arText = "text not found";


                $EnArFrCount++;
                $$enText =~ /\[\[fr:(.*?)\]/  ;
                $frTitle = $1;
                $$enText =~ /\[\[ar:(.*?)\]/  ;
                $arTitle = $1;


                #enforce the MediaWiki case rules
                #$frTitle = case_fixer($frTitle);
                print "searching for fr text in fr wiki\n";

                while(defined($frPage = $frPages->next)) {#find the the french
article id and text,,, search by article's title
                #main namespace only
                next unless $frPage->namespace eq '';
                        if ($frPage->title eq $frTitle) {
                                my $frTextRef = $frPage->text;
                                $frText = $$frTextRef;
                                $frId = $frPage->id;
                                print "fr text found\n";
                                last;
                        }
                }#end while for extracting french article id and text

########################################

                #enforce the MediaWiki case rules
                #$arTitle = case_fixer($arTitle);
                print "searching for ar text in ar wiki\n";

                while(defined($arPage = $arPages->next)) {#find the the arabic
article id and text,,, search by article's title
                        #main namespace only
                        next unless $arPage->namespace eq '';
                        if ($arPage->title eq $arTitle) {
                                my $arTextRef = $arPage->text;
                                $arText = $$arTextRef;
                                $arId = $arPage->id;
                                print "ar text found\n";
                                last;
                        }
                }#end while for extracting arabic article id and text





                print $enId;
                print ",";
                print $enTitle;
                print ",";
                print $frTitle;
                print ",";
                print $arTitle;
                print"\n";

                #SQL
                $writer->startTag("page");
########################################
                $writer->startTag("en");

                $writer->startTag("id");
                $writer->characters($enId);
                $writer->endTag();

                $writer->startTag("title");
                $writer->characters($enTitle);
                $writer->endTag();

                $writer->startTag("text");
                $writer->characters($$enText);
                $writer->endTag();

                $writer->endTag();

                print "finish writing english artile's id, title, and text to 
xml
file\n";
########################################
                $writer->startTag("fr");

                $writer->startTag("id");
                $writer->characters($frId);
                $writer->endTag();

                $writer->startTag("title");
                $writer->characters($frTitle);
                $writer->endTag();

                $writer->startTag("text");
                $writer->characters($frText);
                $writer->endTag();

                $writer->endTag();
                print "finish writing french artile's id, title, and text to 
xml file
\n";
########################################
                $writer->startTag("ar");

                $writer->startTag("id");
                $writer->characters($arId);
                $writer->endTag();

                $writer->startTag("title");
                $writer->characters($arTitle);
                $writer->endTag();

                $writer->startTag("text");
                $writer->characters($arText);
                $writer->endTag();

                $writer->endTag();
                print "finish writing arabic artile's id, title, and text to 
xml file
\n";
########################################


                $writer->endTag();
                print "closing page tag for the article num: ", $testCount, " 
\n";
                $testCount++;

#               print $testCount; print "\t";
#               if ($testCount == 10){
#                       print "\n";
#                       last;
#               }


        }#end if for en ar fr

        if (($$enText =~ /\[\[fr:/i)){

                #print $page->title, "\n";
                $EnFrCount ++;
        }

        if (($$enText =~ /\[\[ar:/i)){
                #print $page->title, "\n";
                $EnArCount ++;
        }
        #print "working !!!!\n";

}# end for while loop (for each english article)


$writer->endTag();
$writer->end();
$output->close();


print "\n\n";
print "English : ";
print $EnCount;

print "\n\n";
print "English/Arabic/French : ";
print $EnArFrCount;

print "\n\n";
print "English/Arabic : ";
print $EnArCount;

print "\n\n";
print "English/French : ";
print $EnFrCount;



#removes any case sensativity from the very first letter of the title
#but not from the optional namespace name
sub case_fixer {
  my $title = shift;

  #check for namespace
  if ($title =~ /^(.+?):(.+)/) {
    $title = $1 . ':' . ucfirst($2);
  } else {
    $title = ucfirst($title);
  }

  return $title;
}




-- 
To unsubscribe, e-mail: beginners-unsubscr...@perl.org
For additional commands, e-mail: beginners-h...@perl.org
http://learn.perl.org/


Reply via email to