Hi,

I recently put something together. Hope it's still useful. It even has 
some POD doc ... ;-)

Ekkehard

[snip]

#!perl -w
use English;
use Getopt::Std;
use strict;

#==========================================================================
=pod

=item @str strings($file, $maxbytes, $translate, $minchar, $offset)

This method is similar to the GNU version of the strings utility. It
reads the file $file upto the $maxbytes byte and extracts all strings
it encounters. A string is here a sequence of at least $minchar bytes
in the range 0x40 - 0x176 or any umlaut followed by a byte not out of
those character set. If $translate is set to 1 any umlauts and german
'sz' found in IBM encoding are translated to ISO-8859-1 encoding. If
the $offset parameter is not 0 each element in the returned list
contains the offset within the file before the string and - tab
separated - the string found. The offset is given in either decimal
(d), octal (o) or hexadecimal (x) notation.

=cut
#==========================================================================
sub strings
{
    my ($file, $maxbytes, $translate, $minchar, $offset) = @ARG;

    return unless -r $file;
    return unless $translate =~ /^[01]$/;
    return unless $minchar   =~ /^\d+$/;
    return unless $offset    =~ /^[0dox]{1}$/;

    $maxbytes = 3000 unless defined $maxbytes;
    my $data;

    open    IN,"<$file"          or return;
    binmode IN;
    read    IN, $data, $maxbytes or return;
    close   IN                   or return;

    #            AE  OE  UE  ae  oe  ue  sz
    my $IBM  = '\216\231\232\204\224\201\341';
    my $ANSI = '\304\326\334\344\366\374\337';

    # What we consider as 'ascii':
    my $ascii="\040-\176$IBM$ANSI";

    my @strings;
    while ($data =~ /([$ascii]{$minchar,})[^$ascii]/g)
    {
        if ($offset ne "0")
        {
            push @strings, 
sprintf("%$offset",pos($data)-length($1)-1)."\t$1";
        }
        else
        {
            push @strings, $1;
        }
    }

    # Translate IBM to ANSI encoding. We can not use the variables
    # here as the translation table is generated at compile time (see
    # tr documentation) !!!
    if ($translate == 1)
    {
        foreach (@strings)
        { tr [\216\231\232\204\224\201\341]
             [\304\326\334\344\366\374\337]; }
    }

    return @strings;
}


our $opt_b = 0;
our $opt_n = 4;
our $opt_t = 0;
our $opt_T = 0;
our $opt_h;

getopts('b:n:t:Th');

if ($opt_h || @ARGV==0)
{
    print <<HEND;

Usage: pstrings [-n <minchars>] [-b <maxbytes>] [-t radix] [-T] [-h] 
<filename>

     -n at least minchars characters make up the string
     -b at most maxbytes are read from the file if specified.
        Otherwise the entire file is read.
     -t Print the offset within the file before each string. The single
        character argument specifies the radix of the offset: o for octal,
        x for hexadecimal, or d for decimal.
     -T translates IBM encoded umlauts to ISO encoded ones
     -h shows this help

The filename has to be in a NTish form.

HEND

    exit 0;
}

my $file = $ARGV[0];
$file =~ s|\\|/|g;

die "Canot read from $file: $!" unless -r $file;

$opt_b = -s _ if $opt_b == 0;

my @s = strings($file,$opt_b,$opt_T,$opt_n, $opt_t) or die "No strings: 
$!";

binmode STDOUT;
print join("\n",@s);

exit 0;

_______________________________________________
ActivePerl mailing list
[EMAIL PROTECTED]
To unsubscribe: http://listserv.ActiveState.com/mailman/mysubs

Reply via email to