JasperTack
JasperTack

Reputation: 4457

Messages getting lost in erlang

I am having some issues with messages in erlang that get lost.

The code that I am using works 100% correct when I use it manually, only when my code is used in a 'load test' where I test the code in parallel with a lot of requests, some messages are never received at the receiver part. When logging all the steps and values of the parameters, I found out that the address where I send the message to, is the correct one. Also the message itself has no problems.

My question is the following: is there knowledge of this 'losing of messages' in erlang yet, could this be some bug in erlang itself?

I could post some code that I am using if needed, but I don't think it would add a lot of value to this question in particular.

UPDATE: the main parts of my application. This is quite a lot of code to illustrate my problem, but I can not reproduce my problem in a simplified version. The application is an allocation system, i.e. it will reserve collections of cells in a grid in parallel. The important parts are: the globalManager, an actor that will control the entire allocation system. The rowManager will manage one row of the entire grid and will lock that row when a reservation is made. When a region of cells has to be reserved, the function request_specific_cells is called. This function will send a reservation request to all rowmanagers of whom the row has to be modified. When the row manager has reserved the region in its row it will send a confirmation to the globalManager. When all rowmanagers have send the confirmation, a confirmation will be send to the process that initiated the request, when one of the managers failed, the globalmanager will send a failure.

    globalManager(Grid) ->
    receive
        {Pid, request_specific_cells, ReservationId, Coordinates, Ctr, XX} ->
            NewGrid = request_specific_cells(Grid, Pid, ReservationId, Coordinates, Ctr, XX);

        {Pid, confirm_region, ResId, Rid, Sid, Region, Section, Ctr, XX} ->
            NewGrid = confirm_region(Grid, Pid, ResId, Rid, Sid, Region, Section, Ctr, XX);

        {Pid, failed_region, Rid, Region, Ctr, XX} ->
            NewGrid = failed_region(Grid, Pid, Rid, Region, Ctr, XX);

        Else ->
            erlang:display({unexpectedMessage, actor, Else}),
            NewGrid = Grid
    end,
    globalManager(NewGrid).


request_specific_cells(Grid, Pid, ReservationId, Coordinates, Ctr, XX) ->
    {{Width, Height}, GridRows, MaxAllocationSize, FreeCells, {UnspecificRequests, NextId}, PendingRequests, BlockedRows} = Grid,
    {X, Y, W, H} = Coordinates,
    Rows         = lists:seq(Y,Y+H-1),
    % Is one of the blocks that have to be reserved currently blocked?
    BlockedRow   = lists:foldl(fun(B, Acc) -> Acc xor search_list(B,BlockedRows) end, false, Rows),
    Request      = lists:keyfind(ReservationId, 1, UnspecificRequests),
    {ReservationId, _} = Request,
    % Now we need the addresses of the sections in which the regions has to be reserved.
    SubSectionIds = [ SPid || {_,SPid} <- [ lists:keyfind(Row, 1, GridRows) || Row <- Rows]],
    % Storing request enables us to rollback if one of the registrations fails.
    NewPendingRequests = PendingRequests ++ [{length(PendingRequests), 0, lists:map(fun(S) -> {S,null} end, SubSectionIds)}],
    % Send a registration command with the needed section to each corresponding section manager.
    [SPid ! {self(), request, Pid, ReservationId, length(PendingRequests), Coordinates, Ctr, XX} || SPid<- SubSectionIds],
    NewBlockedRows = Rows ++ BlockedRows,
    {{Width, Height}, GridRows, MaxAllocationSize, FreeCells, {UnspecificRequests, NextId}, NewPendingRequests, NewBlockedRows}
    end.


confirm_region(Grid, Pid, URid, Rid, Sid, Region, Section, Cttr, XX) ->
    {Dimensions, GridRows, MaxAllocationSize, FreeCells, {UnspecificRequests, NextId}, PendingRequests, BlockedRows} = Grid,
    {_,RY,_,_} = Region,
    if
        % All blocks have confirmed the reservation so the entire request is successful
        (Ctr+1) == length(Spids) -> 
                NewUnspecificRequests = lists:keydelete(URid, 1, UnspecificRequests),
                NewPendingRequests = lists:keydelete(Rid, 1, PendingRequests),
                NewSpids = lists:keyreplace(Sid, 1, Spids, {Sid, Section}),
                [Spid ! {self(), confirm_region, Sec} || {Spid, Sec} <- NewSpids],
                Pid ! {self(), request_specific_cells, Rid, success};
        true -> 
                NewUnspecificRequests = UnspecificRequests,
                % Safe the region that has to be marked/rolled back in the row
                NewSpids = lists:keyreplace(Sid, 1, Spids, {Sid, Section}),
                % Increase counter of confirmations
                NewPendingRequests = lists:keyreplace(Rid, 1, PendingRequests, {Rid, Ctr+1, NewSpids})
    end,
    NewBlockedRows = delete_list(RY, BlockedRows)
    {Dimensions, GridRows, MaxAllocationSize, FreeCells, {NewUnspecificRequests, NextId}, NewPendingRequests, NewBlockedRows}.



rowManager(Row) ->
    receive
        {Mid, request, Pid, URid, Rid, Region, Ctr, XX} ->
            NewRow = request_region(Row, Mid, Pid, URid, Rid, Region, Ctr, XX);
        Else ->
            erlang:display({unexpectedMessage, rowManager, Else}),
            NewRow = Row
    end,

    rowManager(NewRow).

request_region(Row, Mid, Pid, URid, Rid, Coordinates, Ctr, XX) ->
    {RY, Content, Modified} = Row,
    {X,_,W,_}    = Coordinates,
    if
        Modified == false -> 
                Free = region_is_empty({X,1,W,1}, Content),
                if
                    Free -> NewModified = true,
                            NewContent = mark_region({X,1,W,1}, Content, reserved),
                            Mid ! {Pid, confirm_region, URid, Rid, self(), Coordinates, {X,1,W,1}, Ctr, XX};
                    true -> NewModified = false,
                            NewContent = Content,
                            Mid ! {Pid, failed_region, Rid, Coordinates, Ctr, XX}
                end;
        true -> NewModified = false,
                NewContent = Content,
                Mid ! {Pid, failed_region, Rid, Coordinates, Ctr, XX}
    end,
    {RY, NewContent, NewModified}. 

This code will be used by the reserver:

request_specific_cells(FollowUpPid, ReservationId, {X, Y, Width, Height}, Ctr, XX) ->
   FollowUpPid ! {self(), request_specific_cells, ReservationId, {X, Y, Width, Height}, Ctr, XX},
   receive
      {FollowUpPid, request_specific_cells, ReservationId, SuccessOrFailure} ->
        SuccessOrFailure
end.

I think that this receiver dies before an answer was received because I know that

Pid ! {self(), request_specific_cells, Rid, success}

from the confirm/9 function is always executed with the correct values, but is not always received at the function.

Upvotes: 1

Views: 511

Answers (1)

Dmitry Belyaev
Dmitry Belyaev

Reputation: 2593

Erlang has strong message delivery guarantees inside the same node if the receiver is alive.

It seems that you have some race condition in your code. Try to write smaller example of your application that has the same problem and post it here.

Upvotes: 3

Related Questions