Reputation: 103824
I have a Perl script to analyze many megabytes of data line-by-line.
As an example, I will use the close of the Dow Jones Average by day for several lines.
The data is read and the layout of the data in the file is straightforward. In this example, it is:
Date Open High Low Close Volume Adj-Close
As I read the data, many calculations are performed. Some of the data and the calculations are kept for later use. Let's say that the new data is:
Date Open Adj-Close %change [more data to be added]
Here is some example code:
use warnings;
use strict;

# One input line is: Date,Open,High,Low,Close,Volume,Adj-Close.
# A successful list-context match yields the nine captured fields in order.
my $dow_line = qr/^(\d+)-(\d+)-(\d+), #date YYYY-MM-DD
(\d+\.\d+), #open
(\d+\.\d+), #High
(\d+\.\d+), #Low
(\d+\.\d+), #Close
(\d+), #Vol
(\d+\.\d+)/x; #adj cl

# Every kept record is an array ref: [year, mon, day, open, adj close, change].
my @trades;

while (<DATA>) {
    chomp;

    # An empty capture list means the line did not match; report it and move on.
    my @fields = /$dow_line/;
    unless (@fields) {
        print "$_ does not match...\n";
        next;
    }

    my ($year,$mon,$day,$open,$high,$low,$close,$vol,$ad_close) = @fields;

    # Fractional move from the open to the adjusted close.
    my $drp=($ad_close-$open)/$open;
    # HERE Created:
    push @trades, [$year,$mon,$day,$open,$ad_close,$drp];
}

# widely separated and in multiple places...
for my $trade_ref (@trades) {
    #HERE Referenced
    my ($year,$mon,$day,$open,$ad_close,$drp)=@$trade_ref;
    print "$year-$mon-$day $open,$ad_close,$drp\n";
}
# Dow Jones data by day...
__DATA__
2010-10-08,10948.50,11055.29,10901.12,11006.48,3871420000,11006.48
2010-10-07,10968.41,11032.17,10878.04,10948.58,3910550000,10948.58
2010-10-06,10936.79,11015.86,10880.08,10967.65,4073160000,10967.65
2010-10-05,10752.63,10982.98,10752.63,10944.72,4068840000,10944.72
2010-10-04,10828.85,10875.54,10682.66,10751.27,3604110000,10751.27
2010-10-01,10789.72,10907.41,10759.14,10829.68,4298910000,10829.68
2010-09-30,10835.96,10960.99,10732.27,10788.05,4284160000,10788.05
2010-09-29,10857.98,10901.96,10759.75,10835.28,3990280000,10835.28
2010-09-28,10809.85,10905.44,10714.03,10858.14,4025840000,10858.14
2010-09-27,10860.03,10902.52,10776.44,10812.04,3587860000,10812.04
2010-09-24,10664.39,10897.83,10664.39,10860.26,4123950000,10860.26
2010-09-23,10738.48,10779.65,10610.12,10662.42,3847850000,10662.42
2010-09-22,10761.11,10829.75,10682.40,10739.31,3911070000,10739.31
2010-09-21,10753.39,10844.89,10674.83,10761.03,4175660000,10761.03
2010-09-20,10608.08,10783.51,10594.38,10753.62,3364080000,10753.62
2010-09-17,10595.44,10689.29,10529.67,10607.85,4086140000,10607.85
2010-09-16,10571.75,10624.58,10499.43,10594.83,3364080000,10594.83
2010-09-15,10526.42,10609.21,10453.15,10572.73,3369840000,10572.73
2010-09-14,10544.81,10622.69,10460.34,10526.49,4521050000,10526.49
2010-09-13,10458.60,10605.73,10458.45,10544.13,4521050000,10544.13
2010-09-10,10415.01,10502.80,10376.34,10462.77,3061160000,10462.77
2010-09-09,10388.22,10515.86,10359.23,10415.24,3387770000,10415.24
2010-09-08,10338.57,10460.50,10318.93,10387.01,3224640000,10387.01
2010-09-07,10446.80,10448.99,10304.44,10340.69,3107380000,10340.69
2010-09-03,10321.92,10484.71,10321.92,10447.93,3534500000,10447.93
2010-09-02,10270.08,10350.98,10211.80,10320.10,3704210000,10320.10
2010-09-01,10016.01,10305.87,10016.01,10269.47,4396880000,10269.47
2010-08-31,10006.42,10101.53,9915.73,10014.72,4038770000,10014.72
At the place in the code marked #HERE, notice that I first push an anonymous array onto another named array for later access. Later (far later in the real program) I access the same array by reference.
So far I have just been cutting the text of the "template" [$year,$mon,$day,$open,$ad_close,$drp]
from the first # HERE and manually pasting my ($year,$mon,$day,$open,$ad_close,$drp)=@$trade_ref;
into the other # HERE's in the program. Gotta be a better way...
Is there a way that I can have a template of the anonymous array that I push so it automatically is referenced in an orderly way in the other cases through the script? In the real script, the analyzed results such as $drp
change and I want a change in the creation of the data to be handled gracefully by later routines even if I don't change my ($year,$mon,$day,$open,$ad_close,$drp)=@$trade_ref;
If this were a C program, the definition could be a macro in a single place...
I am thinking about pushing an anonymous hash instead of an array with the hash having name/value pairs that have the name of the value then the value of that value. Theoretically I know that would work, but strikes me as wasteful and slow. There are approximately 2 GB of data in the real data set, and I can zip through that quickly with the current design.
Is there a better way?
Upvotes: 2
Views: 263
Reputation: 29854
A blessed array isn't going to be that much more load than an un-blessed array.
package Trade;
use strict;
use warnings;
use English qw<@LAST_MATCH_START @LAST_MATCH_END>;

# Trade - one trading day, stored as a blessed array.
#
#   Trade->new($line)    parse a CSV line (or $_ when called without arguments)
#   Trade->new(@fields)  wrap an already split list of slot values as-is
#   Trade->trades        every Trade created so far, in creation order
#   $t->SLOT / $t->SLOT($value)   get / set one slot (see @slots)
#   $t->date             "year-month-day"
#   $t->format($tmpl)    render a template such as '$date $open,$drop\n'

# Slot names in storage order: element N of every Trade holds $slots[N].
my @slots = qw<year month day open high low close vol ad_close drop>;

# Every name usable as a $placeholder in format(). Stored slots map to their
# index; computed fields, which have no storage of their own, map to -1.
my %slot_for = (
    ( map { $slots[$_] => $_ } 0 .. $#slots ),
    ( map { $_ => -1 } qw<date> ),
);

# Layout of one input line: Date,Open,High,Low,Close,Volume,Adj-Close.
my $trade_line = qr/^(\d+)-(\d+)-(\d+), #date YYYY-MM-DD
    (\d+\.\d+),                         #open
    (\d+\.\d+),                         #High
    (\d+\.\d+),                         #Low
    (\d+\.\d+),                         #Close
    (\d+),                              #Vol
    (\d+\.\d+)                          #adj cl
/x;

# Generate one combined getter/setter per slot.
foreach my $i ( 0 .. $#slots ) {
    my $name = $slots[$i];

    # Built while 'strict refs' is still in force, so calling an accessor on
    # something that is not a reference dies instead of quietly touching a
    # package array of that name.
    my $accessor = sub {
        my ( $self, $value ) = @_;
        return $self->[$i] if @_ == 1;    # invocant only: plain read
        $self->[$i] = $value;
        return $self->[$i];
    };

    # Only the symbol-table assignment needs symbolic references.
    no strict 'refs';
    *{$name} = $accessor;
}

# Registry of every Trade built by new().
my @trades;

# Compiled templates, keyed by the template text given to format().
my %format_for;

# trades() - list of all Trade objects created so far (a copy of the registry).
sub trades {
    return @trades;
}

# new( [$line | @fields] ) - construct and register a Trade.
#
# With no argument the line is taken from $_. A single argument is parsed as
# a CSV line and the fractional change from open to adjusted close is appended
# as the 'drop' slot. Several arguments are stored unchanged.
#
# Returns the new object. A line that cannot be parsed, or whose open is
# zero (which makes the change undefined), produces a warning and an empty
# return, and nothing is registered.
sub new {
    my ( $class, @args ) = @_;

    @args = ($_) unless @args;

    if ( @args == 1 ) {
        my $line = shift @args;

        # A failed list-context match yields the empty list.
        @args = defined $line ? ( $line =~ $trade_line ) : ();

        if ( !@args || $args[3] == 0 ) {
            my $shown = defined $line ? $line : '(undef)';
            $shown =~ s/\s+\z//;
            warn "Trade->new: skipping unparseable line: $shown\n";
            return;
        }

        my ( $open, $ad_close ) = @args[ 3, 8 ];
        push @args, ( $ad_close - $open ) / $open;
    }

    my $self = bless \@args, $class;
    push @trades, $self;
    return $self;
}

# format($template) - render this trade through a template.
#
# Each $name that is a known slot or computed field is replaced by that
# value. Unknown $names and all other text, including '%', are emitted
# literally, and a backslash followed by 'n' becomes a newline. Compiled
# templates are cached, so reusing a template costs one hash lookup.
sub format {
    my ( $self, $format ) = @_;

    my $compiled = $format_for{$format} ||= _compile_format($format);

    return sprintf $compiled->{format},
        map { $_->($self) } @{ $compiled->{list} };
}

# _compile_format($template) - turn a template into a sprintf pattern plus
# the accessor code refs that supply its arguments, in order.
sub _compile_format {
    my ($template) = @_;

    my @getters;
    my $pattern = '';
    my $cursor  = 0;    # offset of the first template character not yet copied

    while ( $template =~ m/\$(\w+)/g ) {
        my $name = $1;
        my ( $from, $to ) = ( $LAST_MATCH_START[0], $LAST_MATCH_END[0] );
        next unless exists $slot_for{$name};

        $pattern .= _literal_text( substr $template, $cursor, $from - $cursor );
        $pattern .= '%s';
        push @getters, __PACKAGE__->can($name);
        $cursor = $to;
    }
    $pattern .= _literal_text( substr $template, $cursor );

    return { format => $pattern, list => \@getters };
}

# _literal_text($text) - prepare template text that lies between placeholders
# for use inside a sprintf pattern.
sub _literal_text {
    my ($text) = @_;
    $text =~ s/%/%%/g;      # keep sprintf from reading '%' as a conversion
    $text =~ s/\\n/\n/g;    # the two characters '\' 'n' mean newline
    return $text;
}

# date() - the trade date as "year-month-day".
sub date {
    my ($self) = @_;
    return join '-', $self->year, $self->month, $self->day;
}
package main;

# Build one Trade per input line; with no argument Trade->new parses $_.
while (<DATA>) {
    Trade->new;
}

# Report every recorded trade through a single shared template.
foreach ( Trade->trades() ) {
    print $_->format( '$date $open,$ad_close,$drop\n' );
}
__DATA__
2010-10-08,10948.50,11055.29,10901.12,11006.48,3871420000,11006.48
2010-10-07,10968.41,11032.17,10878.04,10948.58,3910550000,10948.58
2010-10-06,10936.79,11015.86,10880.08,10967.65,4073160000,10967.65
2010-10-05,10752.63,10982.98,10752.63,10944.72,4068840000,10944.72
2010-10-04,10828.85,10875.54,10682.66,10751.27,3604110000,10751.27
2010-10-01,10789.72,10907.41,10759.14,10829.68,4298910000,10829.68
2010-09-30,10835.96,10960.99,10732.27,10788.05,4284160000,10788.05
2010-09-29,10857.98,10901.96,10759.75,10835.28,3990280000,10835.28
2010-09-28,10809.85,10905.44,10714.03,10858.14,4025840000,10858.14
2010-09-27,10860.03,10902.52,10776.44,10812.04,3587860000,10812.04
2010-09-24,10664.39,10897.83,10664.39,10860.26,4123950000,10860.26
2010-09-23,10738.48,10779.65,10610.12,10662.42,3847850000,10662.42
2010-09-22,10761.11,10829.75,10682.40,10739.31,3911070000,10739.31
2010-09-21,10753.39,10844.89,10674.83,10761.03,4175660000,10761.03
2010-09-20,10608.08,10783.51,10594.38,10753.62,3364080000,10753.62
2010-09-17,10595.44,10689.29,10529.67,10607.85,4086140000,10607.85
2010-09-16,10571.75,10624.58,10499.43,10594.83,3364080000,10594.83
2010-09-15,10526.42,10609.21,10453.15,10572.73,3369840000,10572.73
2010-09-14,10544.81,10622.69,10460.34,10526.49,4521050000,10526.49
2010-09-13,10458.60,10605.73,10458.45,10544.13,4521050000,10544.13
2010-09-10,10415.01,10502.80,10376.34,10462.77,3061160000,10462.77
2010-09-09,10388.22,10515.86,10359.23,10415.24,3387770000,10415.24
2010-09-08,10338.57,10460.50,10318.93,10387.01,3224640000,10387.01
2010-09-07,10446.80,10448.99,10304.44,10340.69,3107380000,10340.69
2010-09-03,10321.92,10484.71,10321.92,10447.93,3534500000,10447.93
2010-09-02,10270.08,10350.98,10211.80,10320.10,3704210000,10320.10
2010-09-01,10016.01,10305.87,10016.01,10269.47,4396880000,10269.47
2010-08-31,10006.42,10101.53,9915.73,10014.72,4038770000,10014.72
Upvotes: 1
Reputation: 171
It might also be worth investigating whether you could hide away the contents of the array in a blessed array reference. Having a trades class, pushing array objects onto your @trades array and then having a method to return the data and the price will hide away some of the problems you have with code duplication but at the risk of slightly slower run time because of the method calls. A direct call into the array using a named constant will be quicker and I assume that's more important than anything else.
Upvotes: 0
Reputation: 4700
use strict; use warnings;
# Named indexes into one trade record, as laid out by parse_dow().
use constant {
    YEAR     => 0,
    MON      => 1,
    DAY      => 2,
    OPEN     => 3,
    AD_CLOSE => 4,
    DRP      => 5,
};

my @trades;

# Parse each input line and keep only the records that came back non-empty.
while (<DATA>) {
    chomp;
    my $dow = parse_dow( $_ );
    next unless @$dow;
    push @trades, $dow;
}

# One line per trade; the array slice joins its fields with single spaces.
foreach (@trades) {
    print "@{$_}[YEAR, MON, DAY, OPEN, AD_CLOSE, DRP]\n";
}
# parse_dow($line) - turn one CSV line into a trade record.
#
# Input layout: Date,Open,High,Low,Close,Volume,Adj-Close, with the date
# written as YYYY-MM-DD.
#
# Returns an array ref [year, mon, day, open, adj close, change], where
# change is (adj close - open) / open. Returns an empty array ref when the
# line is missing fields, the prices are not numeric, or the open is zero
# (which makes the change undefined), so callers can test @$result to skip
# unusable lines.
sub parse_dow {
    my ($dow) = @_;
    return [] unless defined $dow;

    # Only the date, the open and the adjusted close are carried forward.
    my ( $date, $open, $ad_close ) = ( split /,/, $dow )[ 0, 1, 6 ];
    return [] unless defined $date && defined $ad_close;

    my ( $year, $mon, $day ) = split /-/, $date;
    return [] unless defined $day;

    # Check before doing arithmetic: a non-numeric price would only warn and
    # then count as zero.
    my $number = qr/\A [+-]? (?: \d+ \.? \d* | \. \d+ ) (?: [eE] [+-]? \d+ )? \z/x;
    return [] unless $open =~ $number && $ad_close =~ $number;
    return [] if $open == 0;

    my $drp = ( $ad_close - $open ) / $open;
    return [ $year, $mon, $day, $open, $ad_close, $drp ];
}
__DATA__
2010-10-08,10948.50,11055.29,10901.12,11006.48,3871420000,11006.48
2010-10-07,10968.41,11032.17,10878.04,10948.58,3910550000,10948.58
2010-10-06,10936.79,11015.86,10880.08,10967.65,4073160000,10967.65
Upvotes: 3
Reputation: 239861
# Several constants in one statement need a hash reference. A parenthesised
# list would instead define a single list constant named YEAR.
use constant { YEAR => 0, MON => 1, DAY => 2, OPEN => 3, AD_CLOSE => 4, DRP => 5 };
# Index a stored record by name instead of by position.
$trade_ref->[YEAR]
etc.
Upvotes: 4