Skip to content

Commit 3f392e6

Browse files
committed
Ensure batch size is non negative when making a pipelined AER
add assertion add some logging and a test add some logging and a test
1 parent 324d9bc commit 3f392e6

File tree

4 files changed

+78
-33
lines changed

4 files changed

+78
-33
lines changed

src/ra_server.erl

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -480,23 +480,26 @@ handle_leader({PeerId, #append_entries_reply{term = Term, success = true,
480480
State1 = put_peer(PeerId, Peer, State0),
481481
Effects00 = maybe_promote_peer(PeerId, State1, []),
482482
{State2, Effects0} = evaluate_quorum(State1, Effects00),
483-
{State, Effects1} = process_pending_consistent_queries(State2,
483+
{State3, Effects1} = process_pending_consistent_queries(State2,
484484
Effects0),
485-
Effects = [{next_event, info, pipeline_rpcs} | Effects1],
486-
case State of
485+
Effects2 = [{next_event, info, pipeline_rpcs} | Effects1],
486+
case State3 of
487487
#{cluster := #{Id := _}} ->
488488
% leader is in the cluster
489-
{leader, State, Effects};
489+
{leader, State3, Effects2};
490490
#{commit_index := CI,
491491
cluster_index_term := {CITIndex, _}}
492492
when CI >= CITIndex ->
493493
% leader is not in the cluster and the new cluster
494494
% config has been committed
495495
% time to say goodbye
496496
?INFO("~ts: leader not in new cluster - goodbye", [LogId]),
497+
%% need to make pipelined rpcs here as cannot use next event
498+
{State, _, Effects} =
499+
make_pipelined_rpc_effects(State3, Effects2),
497500
{stop, State, Effects};
498501
_ ->
499-
{leader, State, Effects}
502+
{leader, State3, Effects2}
500503
end
501504
end;
502505
handle_leader({PeerId, #append_entries_reply{term = Term}},
@@ -2086,11 +2089,15 @@ make_pipelined_rpc_effects(#{cfg := #cfg{id = Id,
20862089
end,
20872090
%% ensure we don't pass a batch size that would allow
20882091
%% the peer to go over the max pipeline count
2089-
BatchSize = min(MaxBatchSize,
2090-
MaxPipelineCount - NumInFlight),
2092+
%% we'd only really get here if Force=true so setting
2093+
%% a single entry batch size should be fine
2094+
BatchSize = max(1,
2095+
min(MaxBatchSize,
2096+
MaxPipelineCount - NumInFlight)),
20912097
{NewNextIdx, Eff, S} =
20922098
make_rpc_effect(PeerId, Peer0, BatchSize, S0,
20932099
EntryCache),
2100+
?assert(NewNextIdx >= NextIdx),
20942101
Peer = Peer0#{next_index => NewNextIdx,
20952102
commit_index_sent => CommitIndex},
20962103
NewNumInFlight = NewNextIdx - MatchIdx - 1,
@@ -2149,12 +2156,17 @@ make_rpc_effect(PeerId, #{next_index := Next}, MaxBatchSize,
21492156
PrevTerm, MaxBatchSize,
21502157
State#{log => Log},
21512158
EntryCache);
2152-
{LastIdx, _} ->
2159+
{SnapIdx, _} ->
2160+
?DEBUG("~ts: sending snapshot to ~w as their next index ~b "
2161+
"is lower than snapshot index ~b", [log_id(State),
2162+
PeerId, Next,
2163+
SnapIdx]),
2164+
?assert(PrevIdx < SnapIdx),
21532165
SnapState = ra_log:snapshot_state(Log),
21542166
%% don't increment the next index here as we will do
21552167
%% that once the snapshot is fully replicated
21562168
%% and we don't pipeline entries until after snapshot
2157-
{LastIdx,
2169+
{SnapIdx,
21582170
{send_snapshot, PeerId, {SnapState, Id, Term}},
21592171
State#{log => Log}}
21602172
end
@@ -2861,16 +2873,16 @@ apply_with({Idx, Term, {'$ra_cluster_change', CmdMeta, NewCluster, ReplyMode}},
28612873
State = case State0 of
28622874
#{cluster_index_term := {CI, CT}}
28632875
when Idx > CI andalso Term >= CT ->
2864-
?DEBUG("~ts: applying ra cluster change to ~w",
2865-
[log_id(State0), maps:keys(NewCluster)]),
2876+
?DEBUG("~ts: applying ra cluster change at index ~b to ~w",
2877+
[log_id(State0), Idx, maps:keys(NewCluster)]),
28662878
%% we are recovering and should apply the cluster change
28672879
State0#{cluster => NewCluster,
28682880
membership => get_membership(NewCluster, State0),
28692881
cluster_change_permitted => true,
28702882
cluster_index_term => {Idx, Term}};
28712883
_ ->
2872-
?DEBUG("~ts: committing ra cluster change to ~w",
2873-
[log_id(State0), maps:keys(NewCluster)]),
2884+
?DEBUG("~ts: committing ra cluster change at index ~b to ~w",
2885+
[log_id(State0), Idx, maps:keys(NewCluster)]),
28742886
%% else just enable further cluster changes again
28752887
State0#{cluster_change_permitted => true}
28762888
end,
@@ -3078,17 +3090,23 @@ pre_append_log_follower({Idx, Term, Cmd} = Entry,
30783090
% cluster
30793091
case Cmd of
30803092
{'$ra_cluster_change', _, Cluster, _} ->
3093+
?DEBUG("~ts: ~ts: follower applying ra cluster change to ~w",
3094+
[log_id(State), ?FUNCTION_NAME, maps:keys(Cluster)]),
30813095
State#{cluster => Cluster,
30823096
cluster_index_term => {Idx, Term}};
30833097
_ ->
30843098
% revert back to previous cluster
30853099
{PrevIdx, PrevTerm, PrevCluster} = maps:get(previous_cluster, State),
3100+
?DEBUG("~ts: ~ts: follower reverting cluster change to ~w",
3101+
[log_id(State), ?FUNCTION_NAME, maps:keys(PrevCluster)]),
30863102
State1 = State#{cluster => PrevCluster,
30873103
cluster_index_term => {PrevIdx, PrevTerm}},
30883104
pre_append_log_follower(Entry, State1)
30893105
end;
30903106
pre_append_log_follower({Idx, Term, {'$ra_cluster_change', _, Cluster, _}},
30913107
State) ->
3108+
?DEBUG("~ts: ~ts: follower applying ra cluster change to ~w",
3109+
[log_id(State), ?FUNCTION_NAME, maps:keys(Cluster)]),
30923110
State#{cluster => Cluster,
30933111
membership => get_membership(Cluster, State),
30943112
cluster_index_term => {Idx, Term}};

src/ra_server_proc.erl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,17 +1142,20 @@ handle_enter(RaftState, OldRaftState,
11421142
{ServerState, Effects} = ra_server:handle_state_enter(RaftState,
11431143
OldRaftState,
11441144
ServerState0),
1145+
LastApplied = do_state_query(last_applied, State),
11451146
case RaftState == leader orelse OldRaftState == leader of
11461147
true ->
11471148
%% ensure transitions from and to leader are logged at a higher
11481149
%% level
1149-
?NOTICE("~ts: ~s -> ~s in term: ~b machine version: ~b",
1150+
?NOTICE("~ts: ~s -> ~s in term: ~b machine version: ~b, last applied ~b",
11501151
[log_id(State), OldRaftState, RaftState,
1151-
current_term(State), machine_version(State)]);
1152+
current_term(State), machine_version(State),
1153+
LastApplied]);
11521154
false ->
1153-
?DEBUG("~ts: ~s -> ~s in term: ~b machine version: ~b",
1155+
?DEBUG("~ts: ~s -> ~s in term: ~b machine version: ~b, last applied ~b",
11541156
[log_id(State), OldRaftState, RaftState,
1155-
current_term(State), machine_version(State)])
1157+
current_term(State), machine_version(State),
1158+
LastApplied])
11561159
end,
11571160
handle_effects(RaftState, Effects, cast,
11581161
State#state{server_state = ServerState}).

test/coordination_SUITE.erl

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ all_tests() ->
4343
start_cluster_majority,
4444
start_cluster_minority,
4545
grow_cluster,
46+
shrink_cluster_with_snapshot,
4647
send_local_msg,
4748
local_log_effect,
4849
leaderboard,
@@ -382,6 +383,45 @@ grow_cluster(Config) ->
382383
stop_peers(Peers),
383384
ok.
384385

386+
shrink_cluster_with_snapshot(Config) ->
387+
%% this test removes leaders to ensure the remaining cluster can
388+
%% resume activity ok
389+
PrivDir = ?config(data_dir, Config),
390+
ClusterName = ?config(cluster_name, Config),
391+
Peers = start_peers([s1,s2,s3], PrivDir),
392+
ServerIds = server_ids(ClusterName, Peers),
393+
[A, B, C] = ServerIds,
394+
395+
Machine = {module, ?MODULE, #{}},
396+
{ok, _, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds),
397+
{ok, _, Leader1} = ra:members(ServerIds),
398+
399+
%% run some activity to create a snapshot
400+
[_ = ra:process_command(Leader1, {banana, I})
401+
|| I <- lists:seq(1, 5000)],
402+
403+
Fun = fun F(L0) ->
404+
{ok, _, L} = ra:process_command(L0, banana),
405+
F(L)
406+
end,
407+
Pid = spawn(fun () -> Fun(Leader1) end),
408+
timer:sleep(100),
409+
410+
exit(Pid, kill),
411+
{ok, _, _} = ra:remove_member(Leader1, Leader1),
412+
413+
414+
timer:sleep(500),
415+
416+
{ok, _, Leader2} = ra:members(ServerIds),
417+
418+
ct:pal("old leader ~p, new leader ~p", [Leader1, Leader2]),
419+
{ok, O, _} = ra:member_overview(Leader2),
420+
ct:pal("overview2 ~p", [O]),
421+
stop_peers(Peers),
422+
?assertMatch(#{cluster_change_permitted := true}, O),
423+
ok.
424+
385425
send_local_msg(Config) ->
386426
PrivDir = ?config(data_dir, Config),
387427
ClusterName = ?config(cluster_name, Config),

test/ra_server_SUITE.erl

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,22 +1097,6 @@ append_entries_reply_success_promotes_nonvoter(_Config) ->
10971097
{aux, eval}]} = ra_server:handle_leader({N2, Ack2}, State3),
10981098
ok.
10991099

1100-
% leader_make_rpcs(_Config) ->
1101-
% N1 = ?N1, N2 = ?N2, N3 = ?N3,
1102-
% Cluster = #{N1 => new_peer_with(#{next_index => 1, match_index => 0}),
1103-
% N2 => new_peer_with(#{next_index => 4, match_index => 3,
1104-
% commit_index_sent => 3}),
1105-
% N3 => new_peer_with(#{next_index => 2, match_index => 1})},
1106-
% State0 = (base_state(3, ?FUNCTION_NAME))#{commit_index => 3,
1107-
% last_applied => 3,
1108-
% cluster => Cluster,
1109-
% machine_state => <<"hi3">>},
1110-
% {leader, State1, Effs1} = ra_server:handle_leader(pipeline_rpcs, State0),
1111-
% % Res = ra_server:make_rpcs(State0),
1112-
% ct:pal("reas ~p", [ra_server:make_rpcs(State1)]),
1113-
1114-
% ok.
1115-
11161100
follower_aer_dupe(_Config) ->
11171101
N1 = ?N1,
11181102
N2 = ?N2,

0 commit comments

Comments
 (0)