Skip to content

Commit b4e28c8

Browse files
authored
Add jittered exponential backoff (#54)
1 parent e35e74e commit b4e28c8

File tree

6 files changed

+189
-27
lines changed

6 files changed

+189
-27
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ filtering out all progress messages, as it wasn't working reliably.
2020
road for namespaces. foo.bar.Buz is parsed into [foo, bar, <<"Buz">>] (if foo
2121
and bar are already existing atoms, but 'Buz' is not).
2222
- Upgrade grisp dependency to 2.8.0.
23+
- Add jittered exponential backoff for reconnection.
2324

2425
## Fixed
2526

rebar.lock

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
{<<"gun">>,{pkg,<<"gun">>,<<"2.1.0">>},1},
77
{<<"jarl">>,
88
{git,"https://github.com/grisp/jarl.git",
9-
{ref,"24d53cc7b521b126588be1f36afecd1d4eb59db3"}},
9+
{ref,"8d20c3ca314aa5c448d5e16f52a2b887e11b26f7"}},
1010
0},
1111
{<<"jsx">>,{pkg,<<"jsx">>,<<"3.1.0">>},0},
1212
{<<"mapz">>,{pkg,<<"mapz">>,<<"2.4.0">>},1}]}.

src/grisp_connect_client.erl

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@
5656

5757
-define(GRISP_IO_PROTOCOL, <<"grisp-io-v1">>).
5858
-define(FORMAT(FMT, ARGS), iolist_to_binary(io_lib:format(FMT, ARGS))).
59-
-define(STD_TIMEOUT, 1000).
6059
-define(CONNECT_TIMEOUT, 5000).
6160
-define(ENV(KEY, GUARDS), fun() ->
6261
case application:get_env(grisp_connect, KEY) of
@@ -156,26 +155,38 @@ idle(cast, connect, Data) ->
156155
{next_state, waiting_ip, Data};
157156
?HANDLE_COMMON.
158157

159-
waiting_ip(enter, _OldState, Data) ->
160-
Delay = case Data#data.retry_count > 0 of
161-
true -> ?STD_TIMEOUT;
162-
false -> 0
163-
end,
164-
{keep_state_and_data, [{state_timeout, Delay, retry}]};
165-
waiting_ip(state_timeout, retry, Data = #data{retry_count = RetryCount}) ->
158+
% @doc State waiting_ip is used to check the device has an IP address.
159+
% The first time entering this state, the check will be performed right away.
160+
% If the device do not have an IP address, it will wait a fixed amount of time
161+
% and check again, without incrementing the retry counter.
162+
waiting_ip(enter, _OldState, _Data) ->
163+
% First IP check do not have any delay
164+
{keep_state_and_data, [{state_timeout, 0, check_ip}]};
165+
waiting_ip(state_timeout, check_ip, Data) ->
166166
case check_inet_ipv4() of
167167
{ok, IP} ->
168168
?LOG_INFO(#{event => checked_ip, ip => IP}),
169169
{next_state, connecting, Data};
170170
invalid ->
171171
?LOG_DEBUG(#{event => waiting_ip}),
172-
{repeat_state, Data#data{retry_count = RetryCount + 1,
173-
last_error = no_ip_available}}
172+
{keep_state_and_data, [{state_timeout, 1000, check_ip}]}
174173
end;
175174
?HANDLE_COMMON.
176175

177-
connecting(enter, _OldState, Data) ->
178-
{keep_state, Data, [{state_timeout, 0, connect}]};
176+
% @doc State connecting is used to establish a connection to grisp.io.
177+
connecting(enter, _OldState, #data{retry_count = 0}) ->
178+
{keep_state_and_data, [{state_timeout, 0, connect}]};
179+
connecting(enter, _OldState, #data{retry_count = RetryCount}) ->
180+
%% Calculate the connection delay in milliseconds with exponential backoff.
181+
%% The delay is selected randomly between `1000' and
182+
%% `2 ^ RETRY_COUNT - 1000' with a maximum value of `64000'.
183+
%% Loosely inspired by https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
184+
MinDelay = 1000,
185+
MaxDelay = 64000,
186+
MaxRandomDelay = min(MaxDelay, (1 bsl RetryCount) * 1000) - MinDelay,
187+
Delay = MinDelay + rand:uniform(MaxRandomDelay),
188+
?LOG_DEBUG("Scheduling connection attempt in ~w ms", [Delay]),
189+
{keep_state_and_data, [{state_timeout, Delay, connect}]};
179190
connecting(state_timeout, connect, Data = #data{conn = undefined}) ->
180191
?LOG_INFO(#{description => <<"Connecting to grisp.io">>,
181192
event => connecting}),
@@ -307,17 +318,23 @@ handle_connection_message(Data, Msg) ->
307318
keep_state_and_data
308319
end.
309320

321+
% @doc Setup the state machine to rety connecting to grisp.io if the maximum
322+
% number of allowed atempts has not been reached.
323+
% Otherwise, the state machine will give up and go back to idle.
310324
reconnect(Data = #data{retry_count = RetryCount,
311325
max_retries = MaxRetries,
312326
last_error = LastError}, Reason)
313-
when MaxRetries =/= infinity, RetryCount > MaxRetries ->
327+
when MaxRetries =/= infinity, RetryCount >= MaxRetries ->
314328
Error = case Reason of undefined -> LastError; E -> E end,
315329
?LOG_ERROR(#{description => <<"Max retries reached, giving up connecting to grisp.io">>,
316330
event => max_retries_reached, last_error => LastError}),
317331
{next_state, idle, Data#data{retry_count = 0, last_error = Error}};
318-
reconnect(Data = #data{retry_count = RetryCount,
319-
last_error = LastError}, Reason) ->
332+
reconnect(Data = #data{retry_count = RetryCount, last_error = LastError},
333+
Reason) ->
320334
Error = case Reason of undefined -> LastError; E -> E end,
335+
% When reconnecting we always increment the retry counter, even if we
336+
% where connected and it was reset to 0, the next step will always be
337+
% retry number 1. It should never reconnect right away.
321338
{next_state, waiting_ip,
322339
Data#data{retry_count = RetryCount + 1, last_error = Error}}.
323340

test/grisp_connect_api_SUITE.erl

Lines changed: 147 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ all() ->
3434

3535
init_per_testcase(TestCase, Config)
3636
when TestCase =:= bad_client_version_test;
37-
TestCase =:= bad_server_version_test ->
37+
TestCase =:= bad_server_version_test;
38+
TestCase =:= exponential_backoff_test ->
3839
Config;
3940
init_per_testcase(TestCase, Config) ->
4041
CertDir = cert_dir(),
@@ -51,7 +52,8 @@ init_per_testcase(TestCase, Config) ->
5152

5253
end_per_testcase(TestCase, Config)
5354
when TestCase =:= bad_client_version_test;
54-
TestCase =:= bad_server_version_test ->
55+
TestCase =:= bad_server_version_test;
56+
TestCase =:= exponential_backoff_test ->
5557
Config;
5658
end_per_testcase(_, Config) ->
5759
ok = application:stop(grisp_connect),
@@ -62,6 +64,140 @@ end_per_testcase(_, Config) ->
6264

6365
%--- Tests ---------------------------------------------------------------------
6466

67+
exponential_backoff_test(_) ->
68+
CertDir = cert_dir(),
69+
CallRef = make_ref(),
70+
Self = self(),
71+
72+
% First we test the exponential backoff algorithm when failing to connect
73+
74+
Apps = grisp_connect_test_server:start(#{
75+
cert_dir => CertDir,
76+
init_callback => fun(Req, Opts) ->
77+
Self ! {CallRef, init, os:timestamp()},
78+
{ok, cowboy_req:reply(400, #{}, <<"Canceled">>, Req), Opts}
79+
end}),
80+
81+
{ok, _} = application:ensure_all_started(grisp_emulation),
82+
application:load(grisp_connect),
83+
{ok, OldConectEnv} = application:get_env(grisp_connect, connect),
84+
application:set_env(grisp_connect, connect, false),
85+
{ok, _} = application:ensure_all_started(grisp_connect),
86+
87+
{T1, T2, T3, T4} = try
88+
T1a = os:timestamp(),
89+
grisp_connect:connect(),
90+
T2a = receive
91+
{CallRef, init, V2} -> V2
92+
after 300 ->
93+
erlang:error(timeout2)
94+
end,
95+
T3a = receive
96+
{CallRef, init, V3} -> V3
97+
after 2300 ->
98+
erlang:error(timeout3)
99+
end,
100+
T4a = receive
101+
{CallRef, init, V4} -> V4
102+
after 4300 ->
103+
erlang:error(timeout4)
104+
end,
105+
{T1a, T2a, T3a, T4a}
106+
catch
107+
C1:R1:S1 ->
108+
ok = application:stop(grisp_connect),
109+
application:set_env(grisp_connect, connect, OldConectEnv),
110+
erlang:raise(C1, R1, S1)
111+
after
112+
grisp_connect_test_server:wait_disconnection(),
113+
?assertEqual([], flush()),
114+
grisp_connect_test_server:stop(Apps)
115+
end,
116+
117+
% Then we allow the connection to succeed
118+
119+
grisp_connect_test_server:start(#{
120+
cert_dir => CertDir,
121+
init_callback => fun(Req, Opts) ->
122+
Self ! {CallRef, init, os:timestamp()},
123+
Req2 = cowboy_req:set_resp_header(<<"sec-websocket-protocol">>, <<"grisp-io-v1">>, Req),
124+
{cowboy_websocket, Req2, Opts}
125+
end}),
126+
try
127+
T5 = receive
128+
{CallRef, init, V5} -> V5
129+
after 8300 ->
130+
erlang:error(timeout5)
131+
end,
132+
133+
?assertMatch(ok, wait_connection()),
134+
D1 = timer:now_diff(T2, T1) div 1000,
135+
D2 = timer:now_diff(T3, T2) div 1000,
136+
D3 = timer:now_diff(T4, T3) div 1000,
137+
D4 = timer:now_diff(T5, T4) div 1000,
138+
?assert(D1 < 300, D1),
139+
?assert(D2 > 900, D2), % 100 ms less for reliability
140+
?assert(D2 < 100 + 1000 * 1 bsl 1, D2), % Extra 100 ms for reliability
141+
?assert(D3 > 900, D3), % 100 ms less for reliability
142+
?assert(D3 < 100 + 1000 * 1 bsl 2, D3), % Extra 100 ms for reliability
143+
?assert(D4 > 900, D4), % 100 ms less for reliability
144+
?assert(D4 < 100 + 1000 * 1 bsl 3, D4) % Extra 100 ms for reliability
145+
146+
catch
147+
C2:R2:S2 ->
148+
ok = application:stop(grisp_connect),
149+
application:set_env(grisp_connect, connect, OldConectEnv),
150+
erlang:raise(C2, R2, S2)
151+
after
152+
grisp_connect_test_server:stop(Apps),
153+
?assertEqual([], flush())
154+
end,
155+
156+
% Wait for grisp_connect to not be connected anymore
157+
fun WaitNotConnected() ->
158+
case grisp_connect:is_connected() of
159+
false -> ok;
160+
true ->
161+
timer:sleep(50),
162+
WaitNotConnected()
163+
end
164+
end(),
165+
T6 = os:timestamp(),
166+
167+
% Finally we test the delay is reset when reconnecting
168+
169+
grisp_connect_test_server:start(#{
170+
cert_dir => CertDir,
171+
init_callback => fun(Req, Opts) ->
172+
Self ! {CallRef, init, os:timestamp()},
173+
{ok, cowboy_req:reply(400, #{}, <<"Canceled">>, Req), Opts}
174+
end}),
175+
try
176+
T7 = receive
177+
{CallRef, init, V7} -> V7
178+
after 2300 ->
179+
erlang:error(timeout7)
180+
end,
181+
T8 = receive
182+
{CallRef, init, V8} -> V8
183+
after 4300 ->
184+
erlang:error(timeout8)
185+
end,
186+
D5 = timer:now_diff(T7, T6) div 1000,
187+
D6 = timer:now_diff(T8, T7) div 1000,
188+
?assert(D5 > 900, D5), % 100 ms less for reliability
189+
?assert(D5 < 100 + 1000 * 1 bsl 1, D5), % Extra 100 ms for reliability
190+
?assert(D6 > 900, D6), % 100 ms less for reliability
191+
?assert(D6 < 100 + 1000 * 1 bsl 2, D6) % Extra 100 ms for reliability
192+
after
193+
ok = application:stop(grisp_connect),
194+
application:set_env(grisp_connect, connect, OldConectEnv),
195+
grisp_connect_test_server:wait_disconnection(),
196+
?assertEqual([], flush()),
197+
grisp_connect_test_server:stop(Apps)
198+
end,
199+
ok.
200+
65201
bad_client_version_test(_) ->
66202
CertDir = cert_dir(),
67203
Apps = grisp_connect_test_server:start(#{
@@ -70,12 +206,14 @@ bad_client_version_test(_) ->
70206
try
71207
{ok, _} = application:ensure_all_started(grisp_emulation),
72208
application:load(grisp_connect),
209+
{ok, OldMaxRetryEnv} = application:get_env(grisp_connect, ws_max_retries),
73210
application:set_env(grisp_connect, ws_max_retries, 2),
74211
{ok, _} = application:ensure_all_started(grisp_connect),
75212
try
76213
?assertMatch({error, ws_upgrade_failed}, wait_connection())
77214
after
78-
ok = application:stop(grisp_connect)
215+
ok = application:stop(grisp_connect),
216+
application:set_env(grisp_connect, ws_max_retries, OldMaxRetryEnv)
79217
end
80218
after
81219
grisp_connect_test_server:wait_disconnection(),
@@ -92,13 +230,17 @@ bad_server_version_test(_) ->
92230
try
93231
{ok, _} = application:ensure_all_started(grisp_emulation),
94232
application:load(grisp_connect),
233+
{ok, OldMaxRetryEnv} = application:get_env(grisp_connect, ws_max_retries),
95234
application:set_env(grisp_connect, ws_max_retries, 2),
96235
{ok, _} = application:ensure_all_started(grisp_connect),
97236
try
98237
% There is no way to know the reason why gun closed the connection
99-
?assertMatch({error, {closed, _}}, wait_connection())
238+
% but we don't want jarl to crash because the protocol couldn't be
239+
% negociated.
240+
?assertMatch({error, normal}, wait_connection())
100241
after
101-
ok = application:stop(grisp_connect)
242+
ok = application:stop(grisp_connect),
243+
application:set_env(grisp_connect, ws_max_retries, OldMaxRetryEnv)
102244
end
103245
after
104246
grisp_connect_test_server:wait_disconnection(),

test/grisp_connect_reconnect_SUITE.erl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,18 @@ end_per_testcase(_, Config) ->
5252
%--- Tests ---------------------------------------------------------------------
5353

5454
reconnect_on_gun_crash_test(_) ->
55-
?assertMatch(ok, wait_connection(100)),
55+
?assertMatch(ok, wait_connection()),
5656
GunPid = connection_gun_pid(),
5757
proc_lib:stop(GunPid),
5858
?assertMatch(ok, wait_disconnection()),
59-
?assertMatch(ok, wait_connection()).
59+
?assertMatch(ok, wait_connection(2300)).
6060

6161
reconnect_on_disconnection_test(Config) ->
6262
?assertMatch(ok, wait_connection()),
6363
stop_cowboy(),
6464
?assertMatch(ok, wait_disconnection()),
6565
start_cowboy(#{cert_dir => cert_dir()}),
66-
?assertMatch(ok, wait_connection(1200)),
66+
?assertMatch(ok, wait_connection(2300)),
6767
Config.
6868

6969
reconnect_on_ping_timeout_test(_) ->
@@ -73,16 +73,16 @@ reconnect_on_ping_timeout_test(_) ->
7373
% Now decrease ping timeout so that the WS closes after just 1 second
7474
application:set_env(grisp_connect, ws_ping_timeout, 1500),
7575
?assertMatch(ok, wait_disconnection()),
76-
?assertMatch(ok, wait_connection(1200)),
76+
?assertMatch(ok, wait_connection(2300)),
7777
?assertMatch(ok, wait_disconnection()),
78-
?assertMatch(ok, wait_connection(1200)),
78+
?assertMatch(ok, wait_connection(2300)),
7979
?assertMatch(ok, wait_disconnection()).
8080

8181
reconnect_on_closed_frame_test(_) ->
8282
?assertMatch(ok, wait_connection()),
8383
close_websocket(),
8484
?assertMatch(ok, wait_disconnection()),
85-
?assertMatch(ok, wait_connection(1200)).
85+
?assertMatch(ok, wait_connection(2300)).
8686

8787

8888
%--- Internal Functions --------------------------------------------------------

test/grisp_connect_test_server.erl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ wait_disconnection() ->
199199

200200
%--- Websocket Callbacks -------------------------------------------------------
201201

202+
init(Req, Opts = #{init_callback := Fun}) when is_function(Fun, 2) ->
203+
Fun(Req, Opts);
202204
init(Req, Opts) ->
203205
ExpVer = maps:get(expected_protocol, Opts, <<"grisp-io-v1">>),
204206
SelVer = maps:get(selected_protocol, Opts, <<"grisp-io-v1">>),

0 commit comments

Comments
 (0)