Re: identify encoding from a file

2016-02-06 Thread Thomas Krichel
  Marios lyberak writes

> i have a file which is generated out of an old Paradox database,
>
> and i try to figure out what is the encoding of these strangely represented
> characters

  I know of no way to automate this, and I don't think anybody else
  does. You just simply need to read the file with various encodings
  set at parsing, and manually inspect whether you get the right
  output.

  Your Paradox manual may be of help to reduce the number of candidate
  character sets.

-- 

  Cheers,

  Thomas Krichel  http://openlib.org/home/krichel
  skype:thomaskrichel


Re: Perl module to transform XSL to JSON

2013-09-23 Thread Thomas Krichel
  dasos ili writes

> could you please suggest me a tool in order to transform an xsl
> file i have managed to get from XML, into JSON?
  
  I think that, since XSL is in XML format, a JSON <-> XML converter
  will do. A Google search reveals

http://search.cpan.org/~ken/XML-XML2JSON-0.06/lib/XML/XML2JSON.pm

  I have not used it.

  However since my answer is so simple I suspect I don't fully
  understand your problem.
-- 

  Cheers,

  Thomas Krichel  http://openlib.org/home/krichel
  skype:thomaskrichel


Re: OAI::Harvester installation help

2011-05-17 Thread Thomas Krichel
  Dave Sherohman writes

> Hey, all!  Long-time Perl programmer, but new to the world of libraries,
> so I'm not all that familiar with all the data formats used in these
> parts.
>
> I am attempting to use some code which depends on Net::OAI::Harvester,
> but my attempts to install OAI::Harvester are running into problems
> with:
 
  I am not so familiar with the oai harvesting tools in Perl, so 
  forgive me if I am giving you incorrect information. My vague
  recollection is that there are several oai harvesters for Perl.
  The one I use is a different one, I think: HTTP::OAI. I suggest
  you try with this. FWIW, I attach a script that I use to download
  OAI archives. I used to keep a collection of them, based on an
  opendoar listing. I think I will soon stop it.

  I hope this is helpful.

  Cheers,

  Thomas Krichel  http://openlib.org/home/krichel
http://authorclaim.org/profile/pkr1
   skype: thomaskrichel
#!/usr/bin/perl -w

use lib '/home/mamf/usr/share/perl/';

use strict;
use Data::Dumper;
use Data::Random qw(:all);
use List::Util qw(shuffle);
use File::Basename;
use File::Compare;
use File::Copy;
use File::Find;
use File::Path;
use File::Listing qw(parse_dir);
use File::Temp qw/ tempfile tempdir /;
use File::Touch;
use LWP::Simple;
use HTTP::OAI;
use Storable;
use XML::DOM;
use XML::LibXML;
use Time::Piece;
use Time::Seconds;
# use Sys::RunAlone;

## home-grown
use Mamf::Common;

## the size of the files, in terms of OAI_DC records
my $batch_size=100;


## directories; every path below is rooted in the user's home directory
my $home=$ENV{'HOME'};
my $log_dir="$home/public_html/log";
my $amf_file="$home/amf/oa/oa.amf.xml";


## renewal time of 30 days, expressed in seconds
my $renewal_time=30*24*60*60;


## counters
my $collection_count=0;
my $no_oai_count=0;


## XML and standards constants
my $amf_ns='http://amf.openlib.org';
my $doar_ns='http://opendoar.org';
my $freelib_ns='http://3lib.org';
my $collection_prefix='info:3lib:oa:';


## run parameter
my $verbose=0;



##
## first argument will be an archive to do; when it is absent
## (or false) all archives are harvested
##
my $to_do_archive=$ARGV[0];

##
## parse the amf file to find the already existing
## 3lib ids and the oai interfaces, recorded in doar
##
## The four variables below are declared empty here and are used
## as hash references further down in the script.
##


## gives  the oai_url for an id
my $oai_urls;
## gives the id for an oai_url
my $ids;
## gives the rID for an oai_url
my $rIDs;
## gives the metadata_formats for an id
my $metadata_format;


##
## open log file
##
## The log file is named after today's date as printed by `date -I`
## (YYYY-MM-DD) and lives in $log_dir.
my $date=`date -I`;
chomp $date;
my $log_file="$log_dir/down_oa_$date.log";
## NOTE(review): the open mode was lost in the archived copy of this
## script. Append mode is assumed, since the same file is reopened
## for every archive in harvest_all() -- confirm.
## The bareword handle LOG is kept because the subs below print to it.
open(LOG, '>>', $log_file)
  or die "cannot open log file $log_file: $!";
binmode(LOG, ':utf8');

## populate these variables, deletes
## archives not to get
parse_oa_amf();

## create in_dirs variable, that contains input
## directories
my @in_dirs;
## an indicator of the input directory
my $in_dir;

foreach my $archive (keys %{$metadata_format}) {
  my $format=$metadata_format->{$archive};
  ## the harvest directory of this archive, grouped by metadata format
  my $archive_dir="$home/opt/$format/oa/$archive";
  ## NOTE(review): $in_dir is looked up by format here but is only
  ## ever filled by archive id below, so this test succeeds unless an
  ## archive id happens to equal a format name -- confirm the intent.
  if(not defined($in_dir->{$format})) {
    push(@in_dirs,$archive_dir);
  }
  ## double meaning array
  $in_dir->{$archive}=$archive_dir;
}


## Without an archive id on the command line every archive is
## harvested; otherwise only the named one is.
if(not $to_do_archive) {
  harvest_all();
}
else {
  print "doing $to_do_archive\n";
  ## the eval keeps a failing harvest from aborting the script
  eval {
    harvest_to_dir($to_do_archive);
  };
  ## report the failure instead of dropping it silently
  if($@) {
    warn "error at id $to_do_archive: $@\n";
  }
}

exit;

##


##
## shuffle the oai_url, find what archives to download
##
## Walks over all ids in $oai_urls in random order. Ids that
## get_ineligble_archives() flags are logged as "not renewing" and
## skipped; all others are passed to harvest_to_dir(). A harvest
## that dies is logged and the loop carries on with the next id.
## Takes no arguments; its return value is not used by the caller.
##
sub harvest_all {
  my @rand_ids=shuffle(keys %{$oai_urls});
  my $ineligible=get_ineligble_archives($renewal_time);
  foreach my $id (@rand_ids) {
    ## The log is (re)opened for every archive; opening an already
    ## open handle closes it first.
    ## NOTE(review): the open mode was lost in the archived copy;
    ## append is assumed, anything else would truncate the log on
    ## every iteration -- confirm.
    ## A failure only warns, so a log problem does not stop the harvest.
    open(LOG, '>>', $log_file)
      or warn "cannot open log file $log_file: $!";
    binmode(LOG, ':utf8');
    my $date=`date --rfc-3339=seconds`;
    chomp $date;
    print LOG "at: $date ";
    ## a true entry is printed below as the reason for skipping
    if($ineligible->{$id}) {
      print LOG "not renewing ".$id.", rID ".
        $rIDs->{$id}.", ".$ineligible->{$id}."\n";
      next;
    }
    ## try to catch errors if it bombs out
    print LOG " get: $id, rID $rIDs->{$id} from $oai_urls->{$id}\n";
    eval {
      harvest_to_dir($id);
    };
    if($@) {
      print LOG "error at id $id: $@\n";
      ## closing flushes the error to disk right away
      close LOG;
    }
  }
  close LOG;
}




## Collect, for every input directory in @in_dirs, what
## is_eligible() says about it.
##
## Argument: $max_ago, handed through unchanged to is_eligible();
##   harvest_all() passes the renewal time in seconds.
## Returns: a hash reference keyed by the 6-character archive id
##   taken from the end of the directory path. harvest_all() treats
##   a true value as "do not harvest" and prints it as the reason.
## Side effects: creates missing input directories, writes to LOG.
sub get_ineligble_archives {
  my $max_ago=shift;
  ## result, a hash reference (empty when there are no input dirs)
  my $r={};
  foreach my $dir (@in_dirs) {
    if(not -d $dir) {
      print LOG "making $dir\n";
      ## plain mkdir: parent directories are not created
      mkdir $dir
        or print LOG "could not make $dir: $!\n";
    }
    ## The directories in @in_dirs are per-archive directories; an
    ## older version listed the sub-directories of a format directory.
    ## the last path component has to be a 6-char name
    if(not $dir=~m|/([^/]{6})$|) {
      next;
    }
    my $id=$1;
    $r->{$id}=is_eligible($dir,$max_ago);
  }
  return $r;
}


## check for archiving time
sub is_eligible {
  ## list xml files, but report no error if they are 
  ## not there
  ## code kept as a transition
  my $archive_dir=shift;
  my $max_ago=shift;
  my $now=time();
  if(not -d $archive_dir) {
print no such dir: $archive_dir\n;
  }
  ## check if it is locked
  my $lock_file=$archive_dir/lock; 
  if(-f $lock_file) {
## remove lock file