Dave Sherohman writes

> Hey, all!  Long-time Perl programmer, but new to the world of libraries,
> so I'm not all that familiar with all the data formats used in these
> parts.
> 
> I am attempting to use some code which depends on Net::OAI::Harvester,
> but my attempts to install OAI::Harvester are running into problems
> with:
 
  I am not so familiar with the OAI harvesting tools in Perl, so
  forgive me if I am giving you incorrect information. My vague
  recollection is that there are several OAI harvesters for Perl.
  The one I use is a different one, I think: HTTP::OAI. I suggest
  you try that one. FWIW, I attach a script that I use to download
  OAI archives. I used to keep a collection of them, based on an
  OpenDOAR listing. I think I will soon stop doing that.

  I hope this is helpful.

  Cheers,

  Thomas Krichel                    http://openlib.org/home/krichel
                                http://authorclaim.org/profile/pkr1
                                               skype: thomaskrichel
#!/usr/bin/perl -w

use lib '/home/mamf/usr/share/perl/';

use strict;
use Data::Dumper;
use Data::Random qw(:all);
use List::Util qw(shuffle);
use File::Basename;
use File::Compare;
use File::Copy;
use File::Find;
use File::Path;
use File::Listing qw(parse_dir);
use File::Temp qw/ tempfile tempdir /;
use File::Touch;
use LWP::Simple;
use HTTP::OAI;
use Storable;
use XML::DOM;
use XML::LibXML;
use Time::Piece;
use Time::Seconds;
# use Sys::RunAlone;

## home-grown
use Mamf::Common;

## number of metadata records written to each output batch file
## (see harvest_to_dir)
my $batch_size=100;


## directories: everything lives below the home directory of the
## user running the script
my $home=$ENV{'HOME'};
my $log_dir="$home/public_html/log";
## the AMF file that lists the collections; read by parse_oa_amf
my $amf_file ="$home/amf/oa/oa.amf.xml";


## renewal time of 30 days, in seconds; handed to
## get_ineligble_archives as the maximum age
my $renewal_time=30*24*60*60;


## counters, updated by parse_oa_amf
my $collection_count=0;
my $no_oai_count=0;


## XML and standards constants
my $amf_ns='http://amf.openlib.org';
my $doar_ns='http://opendoar.org';
my $freelib_ns='http://3lib.org';
## prefix stripped from the collection id attribute to get the
## short archive id
my $collection_prefix='info:3lib:oa:';


## run parameter: a true value adds step-by-step detail to the log
my $verbose=0;



##
## the first argument, if given, is the id of the single archive
## to harvest; without it all archives are harvested
##
my $to_do_archive=$ARGV[0];

##
## lookup tables, filled by parse_oa_amf from the amf file:
## the already existing 3lib ids and the oai interfaces,
## recorded in doar
##


## hash reference: gives the oai_url for an id
my $oai_urls;
## hash reference: gives the id for an oai_url
## (filled by parse_oa_amf, not read elsewhere in this script)
my $ids;
## hash reference: gives the rID for an id
my $rIDs;
## hash reference: gives the metadata format (metadataPrefix)
## for an id
my $metadata_format;


##
## open log file
##
## There is one log file per calendar day.  Time::Piece, which is
## already loaded, supplies the ISO date (YYYY-MM-DD), so no
## external date command is needed.
my $date=localtime()->ymd;
my $log_file="$log_dir/down_oa_$date.log";
## Logging is best effort: a failure is reported on STDERR but
## does not stop the run.
open(LOG,'>',$log_file)
  or warn "cannot open log file $log_file: $!\n";
binmode(LOG,":utf8:");

## populate the lookup tables ($oai_urls, $ids, $rIDs and
## $metadata_format) from the AMF file
parse_oa_amf();

## list of all archive directories, scanned by
## get_ineligble_archives
my @in_dirs;
## hash reference: gives the archive directory for an id
my $in_dir;

## Every archive gets its own directory below the directory of
## its metadata format.  The directory is always recorded; the
## former "not yet defined" test looked up a format name in a
## hash that is keyed by archive id, so it never had a useful
## effect.
foreach my $archive (keys %{$metadata_format}) {
  my $dir="$home/opt/$metadata_format->{$archive}/oa/$archive";
  push(@in_dirs,$dir);
  $in_dir->{$archive}=$dir;
}


if(not $to_do_archive) {
  harvest_all();
}
else {
  print "doing $to_do_archive\n";
  eval {
    harvest_to_dir($to_do_archive);
  };
  ## report a failed harvest rather than dropping the error
  if(my $error=$@) {
    print LOG "error at id $to_do_archive: $error\n";
    warn "error at id $to_do_archive: $error\n";
  }
}

exit;

##################################################################


##
## Harvest all known archives, in random order.
##
## Archives that get_ineligble_archives reports as locked or
## recently done are only mentioned in the log.  Each harvest
## runs inside an eval, so that one failing archive does not
## stop the others.
##
sub harvest_all {
  my @rand_ids=shuffle(keys %{$oai_urls}) ;
  ## gives, for an id, the reason why the archive is skipped
  my $ineligible=get_ineligble_archives($renewal_time);
  foreach my $id (@rand_ids) {
    ## The log is re-opened in append mode for every archive and
    ## closed again at the end of the iteration, which writes the
    ## entries for this archive out to the file.
    open(LOG,'>>',$log_file)
      or warn "cannot append to log file $log_file: $!\n";
    binmode(LOG,":utf8:");
    my $date=`date --rfc-3339=seconds`;
    chomp $date;
    print LOG "at: $date ";
    if($ineligible->{$id}) {
      print LOG "not renewing ".$id.", rID ".
        $rIDs->{$id}.", ".$ineligible->{$id}."\n";
    }
    else {
      print LOG  "get: $id, rID $rIDs->{$id} from $oai_urls->{$id}\n";
      ## try to catch errors if it bombs out
      eval {
        harvest_to_dir($id);
      } ;
      if($@) {
        print LOG "error at id $id: $@\n";
      }
    }
    close LOG;
  }
  ## covers the case of an empty list, where the handle opened by
  ## the main program is still open
  close LOG;
}




## Find the archives that must not be harvested right now.
##
## Takes the maximum age in seconds, which is passed on to
## is_eligible.  Returns a hash reference that gives, for each
## archive id, the reason for skipping the archive, or undef if
## the archive may be harvested.  Missing archive directories are
## created on the way.
sub get_ineligble_archives {
  my $max_ago=shift;
  ## result, a hash reference; empty rather than undefined if no
  ## directory qualifies
  my $r={};
  foreach my $dir (@in_dirs) {
    if(not -d $dir) {
      print LOG "making $dir\n";
      ## mkdir fails if the parent directory is missing; say so
      ## in the log
      mkdir($dir)
        or print LOG "could not make $dir: $!\n";
    }
    ## the archive id is the last path component, and it has to
    ## be exactly 6 characters long
    if(not $dir=~m|/([^/]{6})$|) {
      next;
    }
    my $id=$1;
    $r->{$id}=is_eligible($dir,$max_ago);
  }
  return $r;
}


## Check whether an archive directory has to be left alone.
##
## Arguments: the archive directory, and the maximum age in
## seconds.
##
## Note the return convention, which is the opposite of what the
## name suggests: the sub returns a text that says why the
## archive is NOT to be harvested, and nothing (undef in scalar
## context) when the archive may be harvested.
##
## The checks, in order:
##  - a file "lock" blocks the archive, unless it is older than
##    the maximum age, in which case it is removed;
##  - a file "done", if present, decides alone: younger than the
##    maximum age means skip, older means harvest;
##  - otherwise (the old way) the newest *.xml file decides.
sub is_eligible {
  my $archive_dir=shift;
  my $max_ago=shift;
  my $now=time();
  if(not -d $archive_dir) {
    print "no such dir: $archive_dir\n";
  }
  ## an age in seconds as a rounded number of days
  my $days=sub {
    my $seconds=shift;
    return sprintf("%.0f", Time::Seconds->new($seconds)->days);
  };
  ## check if it is locked
  my $lock_file="$archive_dir/lock";
  if(-f $lock_file) {
    ## element 9 of stat is the modification time
    my $ago=$now-(stat($lock_file))[9];
    ## remove lock file if old
    if($ago > $max_ago) {
      unlink $lock_file;
      return;
    }
    return "lock file $lock_file is present";
  }
  ## look for the file "done"
  ## if it is there, it is the master indication
  my $done_file="$archive_dir/done";
  if(-f $done_file) {
    my $ago=$now-(stat($done_file))[9];
    if($ago < $max_ago) {
      return "done ".$days->($ago)." days ago, (done)";
    }
    return;
  }
  ## the old way: look at the newest xml file.  The directory is
  ## read directly rather than through "ls" in a shell, so odd
  ## characters in the path can do no harm; a directory that can
  ## not be read counts as having no files.
  my $newest;
  if(opendir(my $dh,$archive_dir)) {
    foreach my $name (readdir($dh)) {
      ## like the shell pattern *.xml, skip hidden files
      next unless $name=~m/\.xml\z/ and $name!~m/\A\./;
      my $mtime=(stat("$archive_dir/$name"))[9];
      next unless defined($mtime);
      if(not defined($newest) or $mtime > $newest) {
        $newest=$mtime;
      }
    }
    closedir($dh);
  }
  if(defined($newest)) {
    my $ago=$now-$newest;
    if($ago < $max_ago) {
      return "done ".$days->($ago)." days ago (files)";
    }
  }
  return;
}

## Harvest one archive into its directory.
##
## Argument: the archive id.  The base URL, the metadata format
## and the directory are looked up in the file-level tables.
##
## The records are fetched with ListRecords and written in
## batches of $batch_size to the files 0.xml, 1.xml, ... in the
## archive directory.  A file "lock" exists in the directory
## while the harvest runs, and a file "done" is touched when the
## harvest has completed without an error.
##
## Nothing useful is returned.  Errors raised by the libraries
## are not caught here; the callers run this sub inside an eval.
sub harvest_to_dir {
  my $id=shift;
  my $metadata_prefix=$metadata_format->{$id};
  my $out_dir=$in_dir->{$id};
  my $identifier_count=0;
  my $count_records=0;
  my $batch_count=0;
  ## an id that is not in the tables has no directory
  if(not defined($out_dir)) {
    print LOG "no directory known for $id\n";
    return;
  }
  print LOG "in_dir is $out_dir\n";
  ## the directory has to exist before the lock file can be
  ## created in it
  if(not -d $out_dir) {
    print "making $out_dir\n";
    mkpath($out_dir);
  }
  ## create lock file
  my $lock_file="$out_dir/lock";
  touch($lock_file);
  my $baseURL=$oai_urls->{$id};
  if(not defined($baseURL)) {
    print LOG "no URL defined for $id\n";
    unlink $lock_file;
    return;
  }
  my $h = HTTP::OAI::Harvester->new(baseURL=>$baseURL);
  my $lr = $h->ListRecords(metadataPrefix=> $metadata_prefix);
  if(not $lr) {
    print LOG "no response\n";
    unlink $lock_file;
    return;
  }
  if( $lr->is_error ) {
    print LOG "Error harvesting: " . $lr->message . "\n";
    unlink $lock_file;
    return;
  }
  if($lr->responseDate) {
    print LOG "responseDate => ", $lr->responseDate, "\n",
      "requestURL => ", $lr->requestURL, "\n";
  }
  else {
    print LOG "no reponse data in request \n";
  }
  ## the records collected for the current batch, as XML text
  my $records_as_string='';
  ## write the current batch to its file; the argument is only
  ## used in the verbose log
  my $save_batch=sub {
    my $where=shift;
    my $xml_file="$out_dir/$batch_count.xml";
    my $string="<records>\n$records_as_string</records>\n";
    if($verbose) {
      print LOG "records as string: \n$string\n";
    }
    Mamf::Common::save_diff($string,$xml_file);
    if($verbose) {
      print LOG "stored $xml_file $where\n";
    }
  };
  while( my $rec = $lr->next ) {
    my $identifier=$rec->identifier;
    my $metadata=$rec->metadata;
    if( $rec->is_error ) {
      print LOG "Error: ", $rec->code, " (", $rec->message, ")\n";
    }
    if($identifier) {
      $identifier_count++;
    }
    if(not $metadata) {
      if($verbose) {
        print LOG "no metadata for identifier $identifier\n";
      }
      next;
    }
    if($verbose) {
      print LOG "metadata $count_records found $identifier\n";
    }
    my $metadata_element=$metadata->dom->documentElement;
    ## NOTE(review): first_element_child lives in Mamf::Common;
    ## the test below assumes that it returns a false value when
    ## there is no element child -- to be confirmed
    my $contents_element=
      Mamf::Common::first_element_child($metadata_element);
    if(not $contents_element) {
      print LOG "empty metadata for identifier $identifier\n";
      next;
    }
    $count_records++;
    ## wrap the metadata into a <record id="..."> element
    my $record_element=XML::LibXML::Element->new('record');
    $record_element->setAttribute('id', "$identifier");
    $record_element->appendChild($contents_element);
    $records_as_string.=
      Mamf::Common::linify_xml($record_element->toString(0));
    if($verbose) {
      print LOG "appended record\n";
    }
    ## A full batch is written right after the record that
    ## completes it.  The test is made only when a record has
    ## been added, so that records without metadata that follow
    ## a full batch do not cause empty batch files.
    if(not ($count_records % $batch_size)) {
      $save_batch->("in loop\n");
      $batch_count++;
      ## reset records
      $records_as_string='';
    }
  }
  if($verbose) {
    print LOG "out of loop\n";
  }
  ## An error in a later request of the list (one that follows a
  ## resumption token) ends the loop early and is only visible on
  ## the response object afterwards.
  my $failed=$lr->is_error;
  if($failed) {
    print LOG "Error harvesting: " . $lr->message . "\n";
  }
  ## store the last batch
  $save_batch->("out of loop");
  ## set the "done file", but only for a complete harvest, so
  ## that a broken harvest is not taken for a finished one
  if(not $failed) {
    touch("$out_dir/done");
  }
  unlink $lock_file;
  if($failed) {
    print LOG "incomplete; identifiers: $identifier_count metadata: $count_records\n";
  }
  else {
    print LOG "done; identifiers: $identifier_count metadata: $count_records\n";
  }
}


## Parse the AMF file and fill the file-level lookup tables.
##
## For every AMF collection that has an OAI base URL, this sets
##   $metadata_format->{$id}  (default 'oai_dc'),
## and, unless the collection has the status 'exclude',
##   $oai_urls->{$id}, $ids->{$oai_url} and $rIDs->{$id}.
## The harvested data of an excluded collection is removed.
## The counters $collection_count and $no_oai_count are updated,
## and a summary line is written to the log.
##
## Dies if the file can not be parsed, or if a collection id
## does not start with $collection_prefix.
sub parse_oa_amf {
  my $parser = XML::LibXML->new();
  $parser->keep_blanks(0);
  my $amf_node = $parser->parse_file($amf_file)->documentElement;
  ## get all AMF collection nodes
  my @collection_nodes =
    $amf_node->getElementsByTagNameNS($amf_ns,'collection');
  foreach my $collection_node (@collection_nodes) {
    $collection_count++;
    my $id=$collection_node->getAttribute('id');
    ## strip the prefix; \Q..\E makes sure that it is taken
    ## literally and not as a pattern
    if(not defined($id) or not $id=~s|^\Q$collection_prefix\E||) {
      die "invalid id ".(defined($id) ? $id : '(none)')."\n";
    }
    ## assume that there is only one oai url
    my $oai_url_node=$collection_node->
      getElementsByTagNameNS($doar_ns,'rOaiBaseUrl')->[0];
    ## a missing or empty url happens quite often ;-(
    my $oai_url=$oai_url_node ? $oai_url_node->textContent : '';
    if(not $oai_url) {
      $no_oai_count++;
      next;
    }
    # <freelib:metadata_format name="oai_hal"/>
    my $metadata_node=$collection_node->
      getElementsByTagNameNS($freelib_ns,'metadata_format')->[0];
    my $format='oai_dc';
    if($metadata_node and $metadata_node->hasAttribute('name')) {
      $format=$metadata_node->getAttribute('name');
    }
    $metadata_format->{$id}=$format;
    my $status_node=$collection_node->
      getElementsByTagNameNS($freelib_ns,'status')->[0];
    if($status_node and $status_node->hasAttribute('value')
       and $status_node->getAttribute('value') eq 'exclude') {
      ## Remove data if there is.  The path has the same layout
      ## as the archive directories built by the main program.
      ## It used to be built from the whole $metadata_format
      ## hash reference, so that nothing was ever removed.
      my $exclude_dir="$home/opt/$format/oa/$id";
      if($id!~m|^\w[\w.-]*$|) {
        ## never remove anything for an id that could point
        ## outside of the archive directories
        print LOG "not removing data for odd id $id\n";
      }
      elsif(-d $exclude_dir) {
        rmtree($exclude_dir);
      }
      next;
    }
    ## get the rID for reporting to doar; it is only used in
    ## the log, so a collection without it is still harvested
    my $repository_node=$collection_node->
      getElementsByTagName('doar:repository')->[0];
    my $rID='';
    if($repository_node and $repository_node->hasAttribute('rID')) {
      $rID=$repository_node->getAttribute('rID');
    }
    else {
      print LOG "no rID found for $id\n";
    }
    $rIDs->{$id}=$rID;
    $oai_urls->{$id}=$oai_url;
    $ids->{$oai_url}=$id;
  }
  print LOG "$collection_count collections $no_oai_count without oai\n";
}



__END__;

Reply via email to