Dave Sherohman writes
> Hey, all! Long-time Perl programmer, but new to the world of libraries,
> so I'm not all that familiar with all the data formats used in these
> parts.
> I am attempting to use some code which depends on Net::OAI::Harvester,
> but my attempts to install OAI::Harvester are running into problems
> with:
I am not so familiar with the oai harvesting tools in Perl, so
forgive me if I am giving you incorrect information. My vague
recollection is that there are several oai harvesters for Perl.
The one I use is a different one, I think: HTTP::OAI. I suggest
you try with this. FWIW, I attach a script that I use to download
OAI archives. I used to keep a collection of them, based on an
opendoar listing. I think I will soon stop it.
I hope this is helpful.
Cheers,
Thomas Krichel http://openlib.org/home/krichel
http://authorclaim.org/profile/pkr1
skype: thomaskrichel
#!/usr/bin/perl -w
use lib '/home/mamf/usr/share/perl/';
use strict;
use Data::Dumper;
use Data::Random qw(:all);
use List::Util qw(shuffle);
use File::Basename;
use File::Compare;
use File::Copy;
use File::Find;
use File::Path;
use File::Listing qw(parse_dir);
use File::Temp qw/ tempfile tempdir /;
use File::Touch;
use LWP::Simple;
use HTTP::OAI;
use Storable;
use XML::DOM;
use XML::LibXML;
use Time::Piece;
use Time::Seconds;
# use Sys::RunAlone;
## home-grown
use Mamf::Common;
## the size of the files, in terms of OAI_DC records
my $batch_size=100;
## directories
my $home=$ENV{'HOME'};
my $log_dir="$home/public_html/log";
my $amf_file="$home/amf/oa/oa.amf.xml";
## renewal time of 30 days, in seconds
my $renewal_time=30*24*60*60;
## counters
my $collection_count=0;
my $no_oai_count=0;
## XML and standards constants
my $amf_ns='http://amf.openlib.org';
my $doar_ns='http://opendoar.org';
my $freelib_ns='http://3lib.org';
my $collection_prefix='info:3lib:oa:';
## run parameter
my $verbose=0;
##
## first argument will be an archive to do; when it is absent
## (or false) all archives are harvested
##
my $to_do_archive=$ARGV[0];
##
## parse the amf file to find the already existing
## 3lib ids and the oai interfaces, recorded in doar
##
## gives the oai_url for an id
my $oai_urls;
## gives the id for an oai_url
my $ids;
## gives the rID for an oai_url
my $rIDs;
## gives the metadata_formats for an id
my $metadata_format;
##
## open log file, one file per day
##
my $date=`date -I`;
chomp $date;
my $log_file="$log_dir/down_oa_$date.log";
## three-argument open in append mode: the handle is written to with
## print, and harvest_all() reopens the same file repeatedly, so the
## earlier entries must survive.
## NOTE(review): append mode is assumed here -- confirm that the log
## was not meant to be truncated at start-up.
open(LOG, '>>', $log_file)
    or warn "cannot open log file $log_file: $!\n";
binmode(LOG,":utf8:");
## populate these variables, deletes
## archives not to get
parse_oa_amf();
## create in_dirs variable, that contains input
## directories
my @in_dirs;
## an indicator of the input directory
my $in_dir;
foreach my $archive (keys %{$metadata_format}) {
    my $format=$metadata_format->{$archive};
    my $archive_dir="$home/opt/$format/oa/$archive";
    if(not defined($in_dir->{$format})) {
        push(@in_dirs,$archive_dir);
    }
    ## double meaning array: the hash is tested with the format as
    ## key above, but filled with the archive id as key here
    $in_dir->{$archive}=$archive_dir;
}
if(not $to_do_archive) {
    harvest_all();
}
else {
    print "doing $to_do_archive\n";
    ## try to catch errors if it bombs out, but do report them
    eval {
        harvest_to_dir($to_do_archive);
    } ;
    if($@) {
        warn "error at id $to_do_archive: $@\n";
    }
}
exit;
##
## harvest_all: harvest every known archive, in random order.
##
## Shuffles the ids that are the keys of $oai_urls and asks
## get_ineligble_archives() which archives should not be renewed.
## An id with a true entry in that result is only logged, together
## with that entry; every other id is handed to harvest_to_dir().
## Progress and errors are written to the LOG handle.
##
## Takes no arguments.
##
sub harvest_all {
    my @rand_ids=shuffle(keys %{$oai_urls}) ;
    my $ineligible=get_ineligble_archives($renewal_time);
    foreach my $id (@rand_ids) {
        ## (re)open the log on every pass, since the error branch
        ## below closes it; append mode keeps the earlier entries
        open(LOG, '>>', $log_file)
            or warn "cannot open log file $log_file: $!\n";
        binmode(LOG,":utf8:");
        my $date=`date --rfc-3339=seconds`;
        chomp $date;
        print LOG "at: $date ";
        if($ineligible->{$id}) {
            print LOG "not renewing ".$id.", rID ".
                $rIDs->{$id}.", ".$ineligible->{$id}."\n";
            next;
        }
        ## try to catch errors if it bombs out
        print LOG "get: $id, rID $rIDs->{$id} from $oai_urls->{$id}\n";
        eval {
            harvest_to_dir($id);
        } ;
        if($@) {
            print LOG "error at id $id: $@\n";
            close LOG;
        }
    }
    close LOG;
}
## get_ineligble_archives: ask is_eligible() about every input
## directory listed in @in_dirs.
##
## Argument: $max_ago, passed through unchanged to is_eligible().
##
## Returns a hash reference.  The key is the archive id, i.e. the
## last path component of the directory, which has to be exactly
## six characters long; the value is whatever is_eligible() returned
## for that directory.  Directories whose last component is not six
## characters long are left out.
sub get_ineligble_archives {
    my $max_ago=shift;
    ## result, a hash reference
    my $r={};
    foreach my $in_dir (@in_dirs) {
        if(not -d $in_dir) {
            print LOG "making $in_dir\n";
            ## mkdir only creates the last component; say so in the
            ## log when it fails instead of carrying on silently
            mkdir($in_dir)
                or print LOG "could not make $in_dir: $!\n";
        }
        ## (an earlier version listed a $format_dir with `ls` and
        ## built each archive directory from it; @in_dirs now holds
        ## the archive directories themselves)
        ## it has to have 6-char names
        if(not $in_dir=~m|/([^/]{6})$|) {
            next;
        }
        my $id=$1;
        $r->{$id}=is_eligible($in_dir,$max_ago);
    }
    return $r;
}
## check for archiving time
sub is_eligible {
## list xml files, but report no error if they are
## not there
## code kept as a transition
my $archive_dir=shift;
my $max_ago=shift;
my $now=time();
if(not -d $archive_dir) {
print no such dir: $archive_dir\n;
}
## check if it is locked
my $lock_file=$archive_dir/lock;
if(-f $lock_file) {
## remove lock file