And for those who want to run test by their self, here attached is my
tests comparing pure Perl parsing and various SAX parser (which need to
be installed):
XML::SAX::PurePerl
XML::LibXML::SAX::Parser
XML::SAX::Expat
XML::SAX::ExpatXS
SAX parsing is done directly, without using MARC::File::XML in order to
have raw figures. Parsing in MARC::File::XML should slow down a little
bit but I can't say of what magnitude.
#!/usr/bin/perl
use warnings;
use strict;
use XML::Simple;
use MARC::File::XML;
use Time::HiRes qw(gettimeofday);
# Number of time the following record is parsed
my $max = 1000;
# Tested SAX parsers
my @xml_parsers = qw(
XML::SAX::PurePerl
XML::LibXML::SAX::Parser
XML::SAX::Expat
XML::SAX::ExpatXS
);
my $raw_xml = <<EOS;
<record>
<leader>00675cam a22002051 4500</leader>
<controlfield tag="001"> 10026159 </controlfield>
<controlfield tag="003">DLC</controlfield>
<controlfield tag="005">20050815184409.0</controlfield>
<controlfield tag="008">830916s1910 gw 000 0 ger
</controlfield>
<datafield tag="010" ind1=" " ind2=" ">
<subfield code="a"> 10026159 </subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(OCoLC)9914473</subfield>
</datafield>
<datafield tag="040" ind1=" " ind2=" ">
<subfield code="a">DLC</subfield>
<subfield code="c">OCU</subfield>
<subfield code="d">OCU</subfield>
<subfield code="d">DLC</subfield>
</datafield>
<datafield tag="042" ind1=" " ind2=" ">
<subfield code="a">premarc</subfield>
</datafield>
<datafield tag="050" ind1="0" ind2="0">
<subfield code="a">PA6792.Z9</subfield>
<subfield code="b">G4</subfield>
</datafield>
<datafield tag="100" ind1="1" ind2=" ">
<subfield code="a">Germann, Peter.</subfield>
</datafield>
<datafield tag="245" ind1="1" ind2="4">
<subfield code="a">Die sogenannten Sententiae Varronis.</subfield>
<subfield code="c">Von Peter Germann.</subfield>
</datafield>
<datafield tag="260" ind1=" " ind2=" ">
<subfield code="a">Paderborn,</subfield>
<subfield code="b">F. SchoÌningh,</subfield>
<subfield code="c">1910.</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">2 p. l., 98 p., 1 l.</subfield>
<subfield code="c">24 cm.</subfield>
</datafield>
<datafield tag="440" ind1=" " ind2="0">
<subfield code="a">Studien zur Geschichte und Kultur des Altertums
...</subfield>
<subfield code="v">3. Bd., 6. Hft</subfield>
</datafield>
<datafield tag="600" ind1="1" ind2="0">
<subfield code="a">Varro, Marcus Terentius.</subfield>
<subfield code="k">Spurious and doubtful works.</subfield>
<subfield code="t">Sententiae Varronis.</subfield>
</datafield>
</record>
EOS
# Pure Perl parser
sub get_record_from_xml {
my $raw = shift;
return unless $raw;
return undef unless $raw =~ /<record/;
my @parts = split />/, $raw;
my $record = MARC::Record->new();
my @fields;
while ( @parts ) {
$_ = shift @parts;
$_ = shift @parts if /<record/;
if ( /<leader/ ) {
$_ = shift @parts;
/(.*)<\/leader/;
$record->leader($_);
next;
}
if ( /<controlfield\s*tag="(.*)"/ ) {
my $tag = $1;
$_ = shift @parts;
s/<\/controlfield//;
push @fields, MARC::Field->new( $tag, $_ );
next;
}
if ( /<datafield\s*tag="(.*?)"\s*ind1="(.*?)"\s*ind2="(.*)"/ ) {
my ($tag, $ind1, $ind2) = ($1, $2, $3);
my @subf;
while ( @parts && $parts[0] =~ /<subfield.*code="(.*)"/ ) {
my $letter = $1;
shift @parts;
$_ = shift @parts;
s/<\/subfield//;
push @subf, $letter => $_;
}
push @fields, MARC::Field->new($tag, $ind1, $ind2, @subf);
shift @parts;
next;
}
last;
}
$record->{_fields} = \...@fields;
return $record;
};
my $xs = XML::Simple->new();
# SAX parser
sub get_record_from_xml_sax {
my ($raw) = @_;
return unless $raw;
my $ref = eval { $xs->XMLin($raw, forcearray => [ 'subfield' ] ) };
return undef if $@;
my $record = MARC::Record->new();
$record->leader( $ref->{leader} );
my @fields_control = map {
MARC::Field->new( $_->{tag}, $_->{content} );
} @{$ref->{controlfield}};
my @fields_std = map {
my @sf = map { ($_->{code}, $_->{content}) } @{$_->{subfield}};
MARC::Field->new(
$_->{tag},
$_->{ind1},
$_->{ind2},
@sf,
);
} @{$ref->{datafield}};
$record->{_fields} = [ @fields_control, @fields_std ];
}
sub parse_with_perl {
my $start = gettimeofday;
for ( my $count = 0; $count < $max; $count++ ) {
my $record = get_record_from_xml($raw_xml);
}
print "Parsed $max MARC::Record objects from XML in pure Perl: ",
gettimeofday - $start, "\n";
}
sub parse_with_sax {
for my $sax_parser ( @xml_parsers ) {
$XML::SAX::ParserPackage = $sax_parser;
my $count = 0;
my $start = gettimeofday;
for ( my $count = 0; $count < $max; $count++ ) {
my $record = get_record_from_xml_sax( $raw_xml );
$count++;
last if $count == $max;
}
print "Parsed $max MARC::Record objects from XML using ",
"$sax_parser : ", gettimeofday - $start, "\n";
}
}
parse_with_perl();
parse_with_sax();
_______________________________________________
Koha-devel mailing list
[email protected]
http://lists.koha-community.org/cgi-bin/mailman/listinfo/koha-devel
website : http://www.koha-community.org/
git : http://git.koha-community.org/
bugs : http://bugs.koha-community.org/