Houghton,Andrew
Mon, 25 Jan 2010 07:08:14 -0800
> From: Nolte, Jennifer [mailto:jennifer.no...@yale.edu] > Sent: Monday, January 25, 2010 09:48 AM > To: perl4lib@perl.org > Subject: Splitting a large file of MARC records into smaller files > > Hello- > > I am working with files of MARC records that are over a million records > each. I'd like to split them down into smaller chunks, preferably using > a command line. MARCedit works, but is slow and made for the desktop. > I've looked around and haven't found anything truly useful- Endeavor's > MARCsplit comes close but doesn't separate files into even numbers, > only by matching criteria, so there could be lots of record duplication > between files. > > Any idea where to begin? I am a (super) novice Perl person.
I use the following handy script, which I created many, many years ago.
Consider it to be in the public domain.
#!perl
#
# Usage:
# perl MARC21-split.pl [-d#] [-n#] [-pPrefix] [-sSuffix] *.marc
#
# perl MARC21-split.pl -d3 -n10000 -pbib -s.marc *.marc
#
# Creates files with three digits sequence number that have 10,000
# records per file: bib001.marc, bib002.marc, etc.
#
# Options:
# -d number of digits for sequence number
# -n number of records per file
# -p prefix text before sequence number
# -s suffix text after sequence number
#
package main; # The current package name
require 5.003; # The current package requires Perl v5.003 or later.
use strict;
use Carp; # Provides croak() for fatal errors.
# NOTE: the script used to push '.' onto the front of @INC before loading
# Carp. It needs no local modules, and doing so would let a Carp.pm in the
# current directory be loaded instead of the real one, so that was removed.
my $PACKAGE = 'main';
###################### VARIABLES #####################
my $crlf = "\n"; # Line terminator for the summary output.
my $recd = "\x1D"; # MARC21 record terminator.
my $fldd = "\x1E"; # MARC21 field terminator (not used below, kept for reference).
my $subd = "\x1F"; # MARC21 subfield delimiter (not used below, kept for reference).
###################### SUBROUTINES #####################
# Return the positive integer that starts $text, or $default when $text does
# not start with digits or the number is zero.
sub _leading_count {
    my ($text, $default) = @_;
    if ($text =~ m/^(\d+)/ && $1 > 0) {
        return int($1);
    }
    return $default;
}
###################### INLINE CODE #####################
# Change Perls default record delimiter so that <MARC> returns one MARC
# record (terminator included) per read.
$/ = $recd;
# Set defaults for command line options.
my $recs = 1;
my $digits = 2;
my $prefix = '';
my $suffix = '.mrc';
# Initialize total record count to zero.
my $total = 0;
# Number of output files created so far; also the sequence number of the
# most recently opened output file.
my $files = 0;
# State of the current output file: its name, whether the OUT handle is
# open, and how many records have been written to it.
my $FileOUT = undef;
my $out_open = 0;
my $out_recs = 0;
print STDERR join("\r\nARG=",'',@ARGV),"\r\n";
# Process command line. Options and file names may be mixed; an option
# applies to everything that follows it.
my $arg;
foreach $arg (@ARGV) {
    # Work on a copy so that @ARGV itself is left untouched.
    my $FileMARC = $arg;
    # Process command line options.
    if ($FileMARC =~ s/^-[Dd]//) {
        # Digits in the sequence number; falls back to 1 when invalid.
        $digits = _leading_count($FileMARC, 1);
        next;
    } elsif ($FileMARC =~ s/^-[Nn]//) {
        # Records per output file; falls back to 1 when invalid.
        $recs = _leading_count($FileMARC, 1);
        next;
    } elsif ($FileMARC =~ s/^-[Pp]//) {
        $prefix = $FileMARC;
        next;
    } elsif ($FileMARC =~ s/^-[Ss]//) {
        $suffix = $FileMARC;
        next;
    }
    # Open file from command line. Two-argument open is kept so the script
    # still runs on the Perl version required above.
    open(MARC,'<'.$FileMARC) ||
        croak("$PACKAGE:: Cannot open input file '$FileMARC': $!");
    # MARC records are binary data; prevent newline translation on
    # platforms that distinguish text files from binary files.
    binmode(MARC);
    # Count each record in the file.
    my $count = 0;
    while (<MARC>) {
        # Open new output file when necessary.
        if (!$out_open) {
            ++$files;
            my $pattern = sprintf('%%s%%0%uu%%s',int($digits));
            $FileOUT = sprintf($pattern,$prefix,$files,$suffix);
            # Open output file.
            open(OUT,'>'.$FileOUT) ||
                croak("$PACKAGE:: Cannot open output file '$FileOUT': $!");
            binmode(OUT);
            $out_open = 1;
            $out_recs = 0;
        }
        print(OUT $_) ||
            croak("$PACKAGE:: Cannot write to output file '$FileOUT': $!");
        ++$total;
        ++$out_recs;
        ++$count;
        # Close output file when full. Buffered write errors surface at
        # close time, so the result is checked.
        if ($out_recs >= $recs) {
            close(OUT) ||
                croak("$PACKAGE:: Cannot close output file '$FileOUT': $!");
            $out_open = 0;
        }
    }
    # Close file from command line.
    close(MARC);
    # Output total records in file and file name.
    print STDERR join("\t",$count,$FileMARC),$crlf;
}
# Close the last output file when it was only partially filled.
if ($out_open) {
    close(OUT) ||
        croak("$PACKAGE:: Cannot close output file '$FileOUT': $!");
    $out_open = 0;
}
# Output total record count and file count.
print STDERR join("\t",$total,"Total Records"),$crlf;
print STDERR join("\t",$files,"Total Files"),$crlf;