Skip to content

Commit 2ad2b25

Browse files
committed
upstream: fix handling of the busy_flag for keepalive connections
There are situations where a race condition can happen with keepalive connections with a mix of failed states of intermediary connections, an example can be HTTP output + AWS auth, where the AWS auth failed and could not be obtained in the DNS resolution state). This commit ensures the busy_flag is set as soon as the connection is created and only removed when released or an exception happens. When destroying connections, we now not only validate the busy flag but as well check if the connection is not assigned to a priority queue. I also added a new check for the event structure clearing up the states before its used again with the event loop interface. The fixes address the issue described in: #10458 Signed-off-by: Eduardo Silva <[email protected]>
1 parent 3588a8d commit 2ad2b25

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

src/flb_upstream.c

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ static int prepare_destroy_conn(struct flb_connection *u_conn)
530530
/* Add node to destroy queue */
531531
mk_list_add(&u_conn->_head, &uq->destroy_queue);
532532

533+
533534
flb_upstream_decrement_total_connections_count(u);
534535

535536
/*
@@ -555,8 +556,13 @@ static inline int prepare_destroy_conn_safe(struct flb_connection *u_conn)
555556

556557
static int destroy_conn(struct flb_connection *u_conn)
557558
{
558-
/* Delay the destruction of busy connections */
559-
if (u_conn->busy_flag) {
559+
/*
560+
* Delay the destruction if the connection is still marked busy or
561+
* if there are pending events referencing it (e.g. queued in the
562+
* priority bucket queue).
563+
*/
564+
if (u_conn->busy_flag ||
565+
u_conn->event._priority_head.prev != NULL) {
560566
return 0;
561567
}
562568

@@ -614,8 +620,9 @@ static struct flb_connection *create_conn(struct flb_upstream *u)
614620
flb_debug("[upstream] connection #%i failed to %s:%i",
615621
conn->fd, u->tcp_host, u->tcp_port);
616622

617-
prepare_destroy_conn_safe(conn);
623+
/* allow the connection to be destroyed immediately */
618624
conn->busy_flag = FLB_FALSE;
625+
prepare_destroy_conn_safe(conn);
619626

620627
return NULL;
621628
}
@@ -758,9 +765,14 @@ struct flb_connection *flb_upstream_conn_get(struct flb_upstream *u)
758765

759766
flb_stream_release_lock(&u->base);
760767

768+
/* mark connection as busy so it won't be freed until released */
769+
conn->busy_flag = FLB_TRUE;
770+
761771
err = flb_socket_error(conn->fd);
762772

763773
if (!FLB_EINPROGRESS(err) && err != 0) {
774+
/* allow immediate cleanup on failure */
775+
conn->busy_flag = FLB_FALSE;
764776
flb_debug("[upstream] KA connection #%i is in a failed state "
765777
"to: %s:%i, cleaning up",
766778
conn->fd, u->tcp_host, u->tcp_port);
@@ -799,6 +811,8 @@ struct flb_connection *flb_upstream_conn_get(struct flb_upstream *u)
799811
}
800812

801813
if (conn != NULL) {
814+
/* newly created connections return in an idle state; mark busy */
815+
conn->busy_flag = FLB_TRUE;
802816
flb_connection_reset_io_timeout(conn);
803817
flb_upstream_increment_busy_connections_count(u);
804818
}
@@ -828,6 +842,9 @@ int flb_upstream_conn_release(struct flb_connection *conn)
828842
struct flb_upstream *u = conn->upstream;
829843
struct flb_upstream_queue *uq;
830844

845+
/* allow pending destroy to free this connection once we're done */
846+
conn->busy_flag = FLB_FALSE;
847+
831848
flb_upstream_decrement_busy_connections_count(u);
832849

833850
uq = flb_upstream_queue_get(u);
@@ -855,6 +872,18 @@ int flb_upstream_conn_release(struct flb_connection *conn)
855872
* notified if the 'available keepalive connection' gets disconnected by
856873
* the remote endpoint we need to add it again.
857874
*/
875+
/*
876+
* Ensure the event is not registered before we add it back. In some
877+
* error paths the previous handler might not have removed the event
878+
* from the loop which would cause epoll_ctl(ADD) to fail with
879+
* EEXIST. Remove it here if still registered and then reset the
880+
* structure state.
881+
*/
882+
if (MK_EVENT_IS_REGISTERED((&conn->event))) {
883+
mk_event_del(conn->evl, &conn->event);
884+
}
885+
886+
MK_EVENT_NEW(&conn->event);
858887
conn->event.handler = cb_upstream_conn_ka_dropped;
859888

860889
ret = mk_event_add(conn->evl,

0 commit comments

Comments
 (0)