Dave Sherohman writes

> Hey, all! Long-time Perl programmer, but new to the world of libraries,
> so I'm not all that familiar with all the data formats used in these
> parts.
>
> I am attempting to use some code which depends on Net::OAI::Harvester,
> but my attempts to install OAI::Harvester are running into problems
> with:

I am not so familiar with the OAI harvesting tools in Perl, so forgive me if I am giving you incorrect information. My vague recollection is that there are several OAI harvesters for Perl. The one I use is a different one, I think: HTTP::OAI. I suggest you try that one. FWIW, I attach a script that I use to download OAI archives. I used to keep a collection of them, based on an OpenDOAR listing. I think I will soon stop it.
I hope this is helpful. Cheers, Thomas Krichel http://openlib.org/home/krichel http://authorclaim.org/profile/pkr1 skype: thomaskrichel
#!/usr/bin/perl -w

##
## Harvest OAI-PMH archives listed in an AMF file into per-archive
## directories, in batches of $batch_size records per XML file.
##
## usage: script            harvest every archive that is due
##        script ARCHIVE    harvest only the archive with that id
##

use lib '/home/mamf/usr/share/perl/';

use strict;
use warnings;

use Data::Dumper;
use Data::Random qw(:all);
use List::Util qw(shuffle);
use File::Basename;
use File::Compare;
use File::Copy;
use File::Find;
use File::Glob qw(bsd_glob);
use File::Path;
use File::Listing qw(parse_dir);
use File::Temp qw/ tempfile tempdir /;
use File::Touch;
use LWP::Simple;
use HTTP::OAI;
use POSIX qw(strftime);
use Storable;
use XML::DOM;
use XML::LibXML;
use Time::Piece;
use Time::Seconds;
# use Sys::RunAlone;

## home-grown
use Mamf::Common;

## the size of the files, in terms of OAI_DC records
my $batch_size = 100;

## directories
my $home     = $ENV{'HOME'};
my $log_dir  = "$home/public_html/log";
my $amf_file = "$home/amf/oa/oa.amf.xml";

## renewal time of 30 days
my $renewal_time = 30 * 24 * 60 * 60;

## counters
my $collection_count = 0;
my $no_oai_count     = 0;

## XML and standards constants
my $amf_ns            = 'http://amf.openlib.org';
my $doar_ns           = 'http://opendoar.org';
my $freelib_ns        = 'http://3lib.org';
my $collection_prefix = 'info:3lib:oa:';

## run parameter
my $verbose = 0;

##
## first argument will be an archive to do
##
my $to_do_archive = $ARGV[0];

##
## filled by parse_oa_amf() from the amf file: the already existing
## 3lib ids and the oai interfaces, recorded in doar
##
## gives the oai_url for an id
my $oai_urls = {};
## gives the id for an oai_url
my $ids = {};
## gives the rID for an id
my $rIDs = {};
## gives the metadata format for an id
my $metadata_format = {};

##
## open log file
##
## Opened once, in append mode and unbuffered, so that a second run
## on the same day does not wipe the earlier log and every line is
## on disk as soon as it is printed.
##
my $date     = strftime('%Y-%m-%d', CORE::localtime());
my $log_file = "$log_dir/down_oa_$date.log";
open(LOG, '>>', $log_file)
    or die "cannot open log file $log_file: $!\n";
binmode(LOG, ':utf8');
{
    my $previous = select(LOG);
    $| = 1;
    select($previous);
}

## populate these variables, deletes
## archives not to get
parse_oa_amf();

## all input directories, one per archive
my @in_dirs;
## gives the input directory for an id
my $archive_dir_of = {};
foreach my $archive (keys %{$metadata_format}) {
    my $dir = "$home/opt/$metadata_format->{$archive}/oa/$archive";
    push(@in_dirs, $dir);
    $archive_dir_of->{$archive} = $dir;
}

if (not $to_do_archive) {
    harvest_all();
}
else {
    print "doing $to_do_archive\n";
    eval { harvest_to_dir($to_do_archive); };
    ## report the failure instead of swallowing it
    if ($@) {
        print LOG "error at id $to_do_archive: $@\n";
        warn "error at id $to_do_archive: $@\n";
    }
}
close(LOG);
exit;

##################################################################

## current local time, formatted like `date --rfc-3339=seconds`
sub _timestamp {
    my $stamp = strftime('%Y-%m-%d %H:%M:%S%z', CORE::localtime());
    ## strftime gives +0100, rfc 3339 wants +01:00
    $stamp =~ s/([+-]\d\d)(\d\d)\z/$1:$2/;
    return $stamp;
}

## seconds between $now and the modification time of $file,
## undef if the file can not be stat'ed
sub _age_of {
    my ($file, $now) = @_;
    my $mtime = (stat($file))[9];
    return if not defined $mtime;
    return $now - $mtime;
}

## a number of seconds as a rounded number of days
sub _days_ago {
    my $seconds = shift;
    return sprintf("%.0f", Time::Seconds->new($seconds)->days);
}

## write one batch of records to "$dir/$batch_number.xml",
## $where only labels the verbose log line
sub _save_batch {
    my ($dir, $batch_number, $records, $where) = @_;
    my $xml_file = "$dir/$batch_number.xml";
    my $string   = "<records>\n$records</records>\n";
    if ($verbose) {
        print LOG "records as string: \n$string\n";
    }
    Mamf::Common::save_diff($string, $xml_file);
    if ($verbose) {
        print LOG "stored $xml_file $where\n";
    }
    return;
}

##
## shuffle the oai_url, find what archives to download
##
## Every archive with a known OAI url is visited in random order.
## Archives that get_ineligble_archives() reports a reason for are
## skipped; the others are harvested, and a harvest that dies is
## logged without stopping the run.
##
sub harvest_all {
    my @rand_ids   = shuffle(keys %{$oai_urls});
    my $ineligible = get_ineligble_archives($renewal_time);
    foreach my $id (@rand_ids) {
        print LOG "at: " . _timestamp() . " ";
        if ($ineligible->{$id}) {
            print LOG "not renewing " . $id . ", rID "
                . $rIDs->{$id} . ", " . $ineligible->{$id} . "\n";
            next;
        }
        ## try to catch errors if it bombs out
        print LOG "get: $id, rID $rIDs->{$id} from $oai_urls->{$id}\n";
        eval { harvest_to_dir($id); };
        if ($@) {
            print LOG "error at id $id: $@\n";
        }
    }
    return;
}

##
## Go through all input directories, creating the missing ones.
##
## argument: the renewal time in seconds
## returns:  a hash reference, keyed by archive id; the value is the
##           reason why the archive should not be harvested now, or
##           undef if it should
##
sub get_ineligble_archives {
    my $max_ago    = shift;
    my $reason_for = {};
    foreach my $dir (@in_dirs) {
        if (not -d $dir) {
            print LOG "making $dir\n";
            ## mkpath also makes missing parents; a failure here
            ## must not stop the whole run
            eval { mkpath($dir); 1 }
                or print LOG "can not make $dir: $@\n";
        }
        ## it has to have a 6-char name
        if (not $dir =~ m|/([^/]{6})$|) {
            next;
        }
        my $id = $1;
        $reason_for->{$id} = is_eligible($dir, $max_ago);
    }
    return $reason_for;
}

##
## check for archiving time
##
## arguments: the archive directory, the renewal time in seconds
## returns:   despite the name, a string saying why the archive is
##            NOT eligible (locked, or done less than the renewal
##            time ago), and undef when it is eligible
##
sub is_eligible {
    my $archive_dir = shift;
    my $max_ago     = shift;
    my $now         = time();
    if (not -d $archive_dir) {
        print "no such dir: $archive_dir\n";
    }

    ## check if it is locked
    my $lock_file = "$archive_dir/lock";
    if (-f $lock_file) {
        my $age = _age_of($lock_file, $now);
        ## remove lock file if old, the archive is then eligible
        if (defined $age and $age > $max_ago) {
            unlink $lock_file;
            return;
        }
        return "lock file $lock_file is present";
    }

    ## look for the file "done"
    ## this should be the main indicator, say in december 2009
    my $done_file = "$archive_dir/done";
    if (-f $done_file) {
        my $age = _age_of($done_file, $now);
        if (defined $age and $age < $max_ago) {
            return "done " . _days_ago($age) . " days ago, (done)";
        }
        ## if it is there, it is the master indication
        return;
    }

    ## the old way: the most recently modified xml file decides.
    ## bsd_glob instead of `ls`, no shell is involved and a
    ## directory without xml files just gives an empty list
    my $newest_age;
    foreach my $xml_file (bsd_glob("$archive_dir/*.xml")) {
        my $age = _age_of($xml_file, $now);
        next if not defined $age;
        if (not defined $newest_age or $age < $newest_age) {
            $newest_age = $age;
        }
    }
    if (defined $newest_age and $newest_age < $max_ago) {
        return "done " . _days_ago($newest_age) . " days ago (files)";
    }
    return;
}

##
## harvest to a directory
##
## argument: the archive id
##
## Takes a lock file in the archive directory, lists all records of
## the archive in its metadata format and saves them in files of
## $batch_size records, named 0.xml, 1.xml, ...  On success the
## "done" file is touched. The lock file is removed on every normal
## return; it stays behind if something dies on the way, and
## is_eligible() removes it once it is older than the renewal time.
##
sub harvest_to_dir {
    my $id = shift;

    my $metadata_prefix = $metadata_format->{$id};
    my $dir             = $archive_dir_of->{$id};
    ## an id given on the command line may not be in the amf file
    if (not defined $dir) {
        print LOG "no directory known for $id\n";
        return;
    }
    print LOG "in_dir is $dir\n";

    ## the directory has to exist before the lock can be put in it
    if (not -d $dir) {
        print LOG "making $dir\n";
        mkpath($dir);
    }

    ## create lock file
    my $lock_file = "$dir/lock";
    touch($lock_file);

    my $baseURL = $oai_urls->{$id};
    if (not defined($baseURL)) {
        print LOG "no URL defined for $id\n";
        unlink $lock_file;
        return;
    }

    my $h  = HTTP::OAI::Harvester->new(baseURL => $baseURL);
    my $lr = $h->ListRecords(metadataPrefix => $metadata_prefix);
    if (not $lr) {
        print LOG "no response\n";
        unlink $lock_file;
        return;
    }
    if ($lr->is_error) {
        print LOG "Error harvesting: " . $lr->message . "\n";
        unlink $lock_file;
        return;
    }
    if ($lr->responseDate) {
        print LOG "responseDate => ", $lr->responseDate, "\n",
            "requestURL => ", $lr->requestURL, "\n";
    }
    else {
        print LOG "no reponse data in request \n";
    }

    my $identifier_count  = 0;
    my $count_records     = 0;
    my $batch_count       = 0;
    my $records_as_string = '';

    while (my $rec = $lr->next) {
        my $identifier = $rec->identifier;
        my $metadata   = $rec->metadata;
        ## used where the identifier is printed or stored
        my $shown_identifier = defined $identifier ? $identifier : '';
        if ($rec->is_error) {
            print LOG "Error: ", $rec->code, " (", $rec->message, ")\n";
        }
        if ($identifier) {
            $identifier_count++;
        }
        if (not $metadata) {
            if ($verbose) {
                print LOG "no metadata for identifier $shown_identifier\n";
            }
            next;
        }

        if ($verbose) {
            print LOG "metadata $count_records found $shown_identifier\n";
        }
        my $dom = $metadata->dom;
        if ($verbose) {
            print LOG "DOM called\n";
        }
        my $metadata_element = $dom->documentElement;
        if ($verbose) {
            print LOG "DOM document_element called\n";
        }
        $count_records++;

        ## wrap the metadata into a <record id="..."> element
        my $record_element = XML::LibXML::Element->new('record');
        if ($verbose) {
            print LOG "element created\n";
        }
        $record_element->setAttribute('id', $shown_identifier);
        if ($verbose) {
            print LOG "attribute set\n";
        }
        my $contents_element
            = Mamf::Common::first_element_child($metadata_element);
        $record_element->appendChild($contents_element);
        my $string = $record_element->toString(0);
        $string = Mamf::Common::linify_xml($string);
        $records_as_string .= $string;
        if ($verbose) {
            print LOG "appended metadata\n";
            print LOG "appended record\n";
            print LOG "record elemnet done\n";
        }

        ## A full batch is saved right after the record that filled
        ## it. Testing this on every iteration, as was done before,
        ## wrote an empty batch file for each record without
        ## metadata that came directly after a full batch.
        if (not($count_records % $batch_size)) {
            _save_batch($dir, $batch_count, $records_as_string, 'in loop');
            $batch_count++;
            ## reset records
            $records_as_string = '';
        }
    }
    if ($verbose) {
        print LOG "out of loop\n";
    }

    ## next() also ends when fetching a later part of the list
    ## fails, so the response is checked again here, as the
    ## HTTP::OAI::Harvester documentation does. What was received
    ## is kept, but the archive is not marked as done.
    if ($lr->is_error) {
        print LOG "Error harvesting: " . $lr->message . "\n";
        if (length $records_as_string) {
            _save_batch($dir, $batch_count, $records_as_string,
                'out of loop');
        }
        unlink $lock_file;
        print LOG "incomplete; identifiers: $identifier_count"
            . " metadata: $count_records\n";
        return;
    }

    ## store the last batch, this is done even if it is empty
    _save_batch($dir, $batch_count, $records_as_string, 'out of loop');

    ## set the "done file"
    my $done_file = "$dir/done";
    touch($done_file);
    unlink $lock_file;
    print LOG
        "done; identifiers: $identifier_count metadata: $count_records\n";
    return;
}

##
## start parsing
##
## Reads the AMF file and fills $oai_urls, $ids, $rIDs and
## $metadata_format for every collection that has an OAI base url
## and is not excluded. The data directory of an excluded
## collection is removed. Dies on a collection whose id is missing
## or does not start with $collection_prefix.
##
sub parse_oa_amf {
    my $parser = XML::LibXML->new();
    $parser->keep_blanks(0);
    my $amf_node = $parser->parse_file($amf_file)->documentElement;

    ## get all AMF collection nodes
    my @collection_nodes
        = $amf_node->getElementsByTagNameNS($amf_ns, 'collection');
    foreach my $collection_node (@collection_nodes) {
        $collection_count++;
        my $id = $collection_node->getAttribute('id');
        if (not defined $id) {
            die "collection without id\n";
        }
        $id =~ s|^\Q$collection_prefix\E|| or die "invalid id $id\n";

        ## assume that there is only one oai url
        my ($oai_url_node)
            = $collection_node->getElementsByTagNameNS($doar_ns,
            'rOaiBaseUrl');
        my $oai_url = $oai_url_node ? $oai_url_node->textContent : '';
        ## happens quite often ;-(
        if (not $oai_url) {
            $no_oai_count++;
            next;
        }

        # <freelib:metadata_format name="oai_hal"/>
        my $format = 'oai_dc';
        my ($metadata_node)
            = $collection_node->getElementsByTagNameNS($freelib_ns,
            'metadata_format');
        if ($metadata_node and $metadata_node->hasAttribute('name')) {
            $format = $metadata_node->getAttribute('name');
        }

        my ($status_node)
            = $collection_node->getElementsByTagNameNS($freelib_ns,
            'status');
        if (    $status_node
            and $status_node->hasAttribute('value')
            and $status_node->getAttribute('value') eq 'exclude')
        {
            ## remove data if there is; the directory is built the
            ## same way as the input directories are. Only a plain
            ## 6-char id is trusted to be part of a path to delete.
            my $dir = "$home/opt/$format/oa/$id";
            if ($id =~ m|\A[^/]{6}\z| and -d $dir) {
                rmtree($dir);
            }
            ## nothing is recorded for an excluded collection, so
            ## its directory is not made again later
            next;
        }
        $metadata_format->{$id} = $format;

        ## get the rID for reporting to doar
        my ($repository_node)
            = $collection_node->getElementsByTagName('doar:repository');
        my $rID
            = $repository_node
            ? $repository_node->getAttribute('rID')
            : undef;
        $rIDs->{$id}     = defined $rID ? $rID : 'unknown';
        $oai_urls->{$id} = $oai_url;
        $ids->{$oai_url} = $id;
    }
    print LOG
        "$collection_count collections $no_oai_count without oai\n";
    return;
}

__END__