James marks wrote:
The problem is most likely in your algorithm. Show us the code.


(Oops. Replied only to Charles by accident. Reposting to the list:)

Sorry. I was posting a part of the real code only to avoid posting an overly long string of code. The problem, it seemed to me, was more likely some other limitation than Perl since the code ran fine on files up to a certain size. However, I'll post the whole code if you'd like:

There is nothing in your code that I can see that would not work with a large file (if you consider 21.5 MB large.) :-) Perhaps it is a limitation of the operating system?



(The script creates a file that can be loaded into a MySQL database.)


#!/usr/bin/perl

use warnings;
use strict;

# Source HTML file to read and the SQL load file to generate from it.
my $source_file = "/users/jamesmarks/desktop/published_stories.htm";
my $destination_file = "/users/jamesmarks/desktop/published_stories.tb";
# Position of the current line within its record, and lines read overall.
my $line_count = 0;
my $total_line_count = 0;

# Three-argument open keeps the mode separate from the file name, so a name
# containing '<', '>' or '|' (or surrounding whitespace) cannot change how
# the file is opened.
open FILE_IN, '<', $source_file or die "Cannot open source file: $!";
open FILE_OUT, '>', $destination_file or die "Cannot open destination file: $!";


# Make FILE_OUT the default output handle: every bare print below writes to
# the destination file rather than STDOUT.
select FILE_OUT;

# Emit the schema first; the single-quoted heredoc tag disables interpolation.
print <<'DEFINE_TABLE';
USE trib_stories;

DROP TABLE IF EXISTS story;

CREATE TABLE story (
story_id INT AUTO_INCREMENT,
issue_date DATE,
section VARCHAR(10),
byline VARCHAR(25),
staff_writer INT,
headline VARCHAR(50),
subhead VARCHAR(255),
body_copy TEXT,
caption_1 VARCHAR(255),
caption_2 VARCHAR(255),
caption_3 VARCHAR(255),
caption_4 VARCHAR(255),
caption_5 VARCHAR(255),
caption_6 VARCHAR(255),
PRIMARY KEY (story_id),
INDEX index1 (issue_date, section, byline),
FULLTEXT (headline),
FULLTEXT (body_copy)
);

DEFINE_TABLE

while (<FILE_IN>) {
    chomp();
    $line_count++;
    $total_line_count++;

You could use the built-in $. variable (since it is there already) instead of your own $total_line_count and then define $line_count as:


     my $line_count = $. % 15;


    if ($line_count == 1) {
        print "INSERT INTO story\nSET story_id = NULL,\n";
    } elsif ($line_count == 2) {
        m{(\d\d?)/(\d\d?)/(\d\d\d\d)};
        my $issue_date = "$3-$1-$2";

You should only use the numeric variables ($1, $2, ...) if the match succeeded; otherwise they will still contain the values from the last successful match!



if ($issue_date eq "<BR>") {

$issue_date will *never* be equal to '<BR>'.


            $issue_date = "";
        }
        print "issue_date = \"$issue_date\",\n";
    } elsif ($line_count == 3) {
        m{<TD>(.*)</TD>};
        my $section = $1;
        if ($section eq "<BR>") {
            $section = "";
        }
        print "section = \"$section\",\n";
    } elsif ($line_count == 4) {
        m{<TD>(.*)</TD>};
        my $byline = $1;
        if ($byline eq "<BR>") {
            $byline = "";
        }
        print "byline = \"$byline\",\n";
    } elsif ($line_count == 5) {
        m{<TD>(.*)</TD>};
        my $staff_writer = $1;
        if ($staff_writer eq "<BR>") {
            $staff_writer = "";
        }
        print "staff_writer = \"$staff_writer\",\n";
    } elsif ($line_count == 6) {
        s/<BR>//g;
          ^^^^^^^^^

        m{<TD>(.*)</TD>};
        my $headline = $1;
        if ($headline eq "<BR>") {

$headline will *never* be equal to '<BR>'.


            $headline = "";
        }
        print "headline = \"$headline\",\n";
    } elsif ($line_count == 7) {
        s/<BR>//g;
          ^^^^^^^^^

        m{<TD>(.*)</TD>};
        my $subhead = $1;
        if ($subhead eq "<BR>") {

$subhead will *never* be equal to '<BR>'.


            $subhead = "";
        }
        print "subhead = \"$subhead\",\n";
    } elsif ($line_count == 8) {
        m{<TD>(.*)</TD>};
        my $body_copy = $1;
        if ($body_copy eq "<BR>") {
            $body_copy = "";
        }
        print "body_copy = \"$body_copy\",\n";
    } elsif ($line_count == 9) {
        m{<TD>(.*)</TD>};
        my $caption_1 = $1;
        if ($caption_1 eq "<BR>") {
            $caption_1 = "";
        }
        print "caption_1 = \"$caption_1\",\n";
    } elsif ($line_count == 10) {
        m{<TD>(.*)</TD>};
        my $caption_2 = $1;
        if ($caption_2 eq "<BR>") {
            $caption_2 = "";
        }
        print "caption_2 = \"$caption_2\",\n";
    } elsif ($line_count == 11) {
        m{<TD>(.*)</TD>};
        my $caption_3 = $1;
        if ($caption_3 eq "<BR>") {
            $caption_3 = "";
        }
        print "caption_3 = \"$caption_3\",\n";
    } elsif ($line_count == 12) {
        m{<TD>(.*)</TD>};
        my $caption_4 = $1;
        if ($caption_4 eq "<BR>") {
            $caption_4 = "";
        }
        print "caption_4 = \"$caption_4\",\n";
    } elsif ($line_count == 13) {
        m{<TD>(.*)</TD>};
        my $caption_5 = $1;
        if ($caption_5 eq "<BR>") {
            $caption_5 = "";
        }
        print "caption_5 = \"$caption_5\",\n";
    } elsif ($line_count == 14) {
        m{<TD>(.*)</TD>};
        my $caption_6 = $1;
        if ($caption_6 eq "<BR>") {
            $caption_6 = "";
        }
        print "caption_6 = \"$caption_6\";\n\n";
    }
    if ($line_count == 15) {
        $line_count = 0;
    }
}

close FILE_IN;
close FILE_OUT;

It looks like you have a lot of duplicated code that could be condensed:

# UNTESTED

# Maps a line's position within its 15-line record to the column it fills.
# Position 1 only opens the INSERT statement and position 0 (every 15th
# line) is skipped, so neither has an entry here.
my %field = (
    2  => 'issue_date',
    3  => 'section',
    4  => 'byline',
    5  => 'staff_writer',
    6  => 'headline',
    7  => 'subhead',
    8  => 'body_copy',
    9  => 'caption_1',
    10 => 'caption_2',
    11 => 'caption_3',
    12 => 'caption_4',
    13 => 'caption_5',
    14 => 'caption_6',
    );

# Builds the `column = "value"` clause for one data line of a record.
#   $position - the line's position within the record (2..14)
#   $line     - the chomped input line
# Returns the text to print, including its terminator.
# NOTE(review): the value is interpolated unescaped, so a double quote in the
# input will break the generated SQL -- confirm the source never contains one.
sub _field_clause {
    my ( $position, $line ) = @_;

    my $value = '';
    if ( $position == 2 ) {
        # M/D/YYYY -> YYYY-M-D.  Slice indices are zero-based, so the year
        # (third capture) is index 2.  A failed match leaves the date empty
        # instead of reusing captures from an earlier match.
        my @date = $line =~ m{(\d\d?)/(\d\d?)/(\d\d\d\d)};
        $value = join '-', @date[ 2, 0, 1 ] if @date;
    }
    else {
        # As in the original script: headline and subhead drop embedded <BR>
        # tags; any other cell is blanked only when <BR> is all it holds.
        $line =~ s/<BR>//g if $position == 6 || $position == 7;
        ( $value ) = $line =~ m{<TD>(.*)</TD>};
        $value = '' if !defined $value or $value eq '<BR>';
    }

    # The last column ends the INSERT statement; the others continue it.
    my $terminator = $position == 14 ? ";\n\n" : ",\n";
    return qq($field{$position} = "$value"$terminator);
}

while ( <FILE_IN> ) {
    chomp;
    my $line_count = $. % 15;
    next unless $line_count;  # skip every 15th line

    if ( $line_count == 1 ) {
        print "INSERT INTO story\nSET story_id = NULL,\n";
        next;
    }

    print _field_clause( $line_count, $_ );
}

# $. still holds the number of the last line read from FILE_IN.
my $total_line_count = $.;




John -- use Perl; program fulfillment

--
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
<http://learn.perl.org/> <http://learn.perl.org/first-response>




Reply via email to