Tiendu
Tiendu

Reputation: 77

Delete multidimensional hash in a loop

I'm doing a hierarchical clustering and I need to remove the clustered elements before moving on to the next step. I did the code for a single-dimensional hash and it ran fine. Now I have a two-dimensional hash, I'm unable to delete the elements.

use strict;
use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my @array = ('A1', 'A2', 'A3');

my %distances;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors += ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            $distances{$key_1}{$key_2} = $deviation_vectors ** 0.5 unless $distances{$key_2}{$key_1};
        };
    };
};
my @values;
while (my ($key, $element) = each %distances) {
    while (my ($element, $value) = each %{$element}) {
        push @values, $value;
    };
};
my $min = (sort {$a <=> $b} @values)[0];
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors += ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_});
                };
# Problem here
# Delete doesn't completely remove the element, it returns a hash with an empty key element
                delete $hash{$key_1};
                delete $hash{$key_2};
            };
        };
    };
};
print Dumper \%hash;

sub mean {
    my @data = @_;
    my $sum;
    foreach (@data) {
        $sum += $_;
    };
    return ($sum / @data)
};

This is the result I got...

$VAR1 = {
          'S4' => {},
          'S2' => {},
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S2,S4' => {
                       'A2' => 7,
                       'A1' => '6.5',
                       'A3' => '3.5'
                     },
          'S1' => {
                    'A3' => 5,
                    'A1' => 10,
                    'A2' => 11
                  },
          'S5' => {
                    'A3' => 10,
                    'A1' => 3,
                    'A2' => 2
                  }
        };

'S2' and 'S4' need to be completely removed from the hash.

Upvotes: 1

Views: 146

Answers (2)

Tiendu
Tiendu

Reputation: 77

So this is the subroutine I've made from @Polar Bear solution. It has three parameters, the first one will be the input data, the second one will be the array of sub-elements, then the last one will be the threshold where we want to stop the subroutine.

...
sub agglomerative_clustering {
    my %data = %{$_[0]};
    my @array = @{$_[1]};
    my $threshold = $_[2];
    my $size = keys %data;
    my %clusters;
    for (my $i = 1; $i < $size; $i++) {
        my (%distances, $find, @keys);
        @keys = sort keys %data;
        for my $index_1 (0 .. $#keys) {
            for my $index_2 (1 + $index_1 .. $#keys) {
                my ($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
                $distance += ($data{$key_1}{$_} - $data{$key_2}{$_}) ** 2 foreach @array;
                $distance = sqrt($distance);
                $distances{$key_1}{$key_2} = $distance;
                $find->{min} = $distance unless $find->{min};
                $find->{key} = [$key_1, $key_2] unless $find->{key};
                if ($find->{min} > $distance) {
                    $find->{min} = $distance;
                    $find->{key} = [$key_1, $key_2];
                };
            };
        };
        my ($key_1, $key_2) = $find->{key}->@*;
        $data{"$key_1,$key_2"}{$_} = mean($data{$key_1}{$_}, $data{$key_2}{$_}) foreach @array;
        delete @data{($key_1, $key_2)};
        last if $find->{min} >= $threshold;
        %clusters = %data;
    };
    return %clusters;
};

sub mean {
    my @data = @_;
    my $sum;
    $sum += $_ for @data;    
    return $sum / @data;
};
...

Upvotes: 0

Polar Bear
Polar Bear

Reputation: 6818

Please inspect following code which is based on provided code with some modification to remove excessive loop cycles with introduction of two indexes.

Perhaps hash %distances in this algorithm is excessive, it is kept for demonstration purpose only as it can be useful to OP.

NOTE: the code is provided for an demonstration purpose in an attempt to improve code readability

INFO: $distance ** 0.5 is better written as sqrt($distance), documentation sqrt

use strict;
use warnings;
use feature 'say';

use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my(%distances, $deviation, @array, @keys);

@array = qw(A1 A2 A3);
@keys  = sort keys %hash;

for my $index_1 (0..$#keys) {
    for my $index_2 (1+$index_1..$#keys) {
        my($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
        
        $distance += ( $hash{$key_1}{$_} - $hash{$key_2}{$_} ) ** 2 for @array;
        $distance = $distance ** 0.5;

        $distances{$key_1}{$key_2} = $distance;

        $deviation->{min} = $distance unless $deviation->{min};
         
        if( $deviation->{min} > $distance ) {
            $deviation->{min} = $distance;
            $deviation->{keys} = [$key_1, $key_2];
        }
    }
}

my($key_1, $key_2) = $deviation->{keys}->@*;

$hash{"$key_1,$key_2"}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) for @array;

delete @hash{($key_1, $key_2)};

say Dumper(\%hash);

exit 0;

sub mean {
    my @data = @_;
    my $sum;
    
    $sum += $_ for @data;
    
    return $sum / @data;
}

Output sample

$VAR1 = {
          'S2,S4' => {
                       'A1' => '6.5',
                       'A2' => '7',
                       'A3' => '3.5'
                     },
          'S5' => {
                    'A1' => 3,
                    'A3' => 10,
                    'A2' => 2
                  },
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S1' => {
                    'A2' => 11,
                    'A3' => 5,
                    'A1' => 10
                  }
        };

Upvotes: 1

Related Questions