Skip to content

Commit 2265e79

Browse files
authored
DRIVERS-1541 Retry KMS decrypt requests on transient errors (#783)
1 parent 3f5b2f3 commit 2265e79

23 files changed

+700
-9
lines changed

integrating.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,19 +203,26 @@ The responses from one or more HTTP messages to KMS.
203203
(Note, the driver MAY fan out all HTTP requests at the same time).
204204
2. For each context:
205205

206-
a. Create/reuse a TLS socket connected to the endpoint indicated by
206+
a. Delay the message by the time in microseconds indicated by
207+
`mongocrypt_kms_ctx_usleep` if the returned value is greater than 0.
208+
209+
b. Create/reuse a TLS socket connected to the endpoint indicated by
207210
`mongocrypt_kms_ctx_endpoint`. The endpoint string is a host name with
208211
a port number separated by a colon. E.g.
209212
"kms.us-east-1.amazonaws.com:443". A port number will always be
210213
included. Drivers may assume the host name is not an IP address or IP
211214
literal.
212215

213-
b. Write the message from `mongocrypt_kms_ctx_message` to the
216+
c. Write the message from `mongocrypt_kms_ctx_message` to the
214217
> socket.
215218

216-
c. Feed the reply back with `mongocrypt_kms_ctx_feed`. Repeat
219+
d. Feed the reply back with `mongocrypt_kms_ctx_feed`. Repeat
217220
> until `mongocrypt_kms_ctx_bytes_needed` returns 0.
218221

222+
If any step encounters a network error, continue to the next KMS context if
223+
`mongocrypt_kms_ctx_fail` returns true. Otherwise, abort and report an
224+
error.
225+
219226
3. When done feeding all replies, call `mongocrypt_ctx_kms_done`.
220227

221228
**Applies to...**

kms-message/src/kms_message/kms_response_parser.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ kms_response_parser_error (kms_response_parser_t *parser);
5757
KMS_MSG_EXPORT (void)
5858
kms_response_parser_destroy (kms_response_parser_t *parser);
5959

60+
KMS_MSG_EXPORT (void)
61+
kms_response_parser_reset (kms_response_parser_t *parser);
62+
6063
#ifdef __cplusplus
6164
} /* extern "C" */
6265
#endif

kms-message/src/kms_response_parser.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ _parser_init (kms_response_parser_t *parser)
3838
parser->kmip = NULL;
3939
}
4040

41+
void
42+
kms_response_parser_reset (kms_response_parser_t *parser)
43+
{
44+
KMS_ASSERT(!parser->kmip); // KMIP is not-yet supported.
45+
_parser_destroy(parser);
46+
_parser_init(parser);
47+
}
48+
4149
kms_response_parser_t *
4250
kms_response_parser_new (void)
4351
{

src/mongocrypt-ctx-datakey.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ static mongocrypt_kms_ctx_t *_next_kms_ctx(mongocrypt_ctx_t *ctx) {
4040
BSON_ASSERT_PARAM(ctx);
4141

4242
dkctx = (_mongocrypt_ctx_datakey_t *)ctx;
43-
if (dkctx->kms_returned) {
43+
if (!dkctx->kms.should_retry && dkctx->kms_returned) {
4444
return NULL;
4545
}
46+
dkctx->kms.should_retry = false; // Reset retry state.
4647
dkctx->kms_returned = true;
4748
return &dkctx->kms;
4849
}

src/mongocrypt-ctx-private.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ typedef struct __mongocrypt_ctx_opts_t {
6363
_mongocrypt_buffer_t key_material;
6464
mongocrypt_encryption_algorithm_t algorithm;
6565
_mongocrypt_kek_t kek;
66+
bool retry_enabled;
6667

6768
struct {
6869
mongocrypt_index_type_t value;

src/mongocrypt-ctx.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ mongocrypt_ctx_t *mongocrypt_ctx_new(mongocrypt_t *crypt) {
307307
ctx->crypt = crypt;
308308
ctx->status = mongocrypt_status_new();
309309
ctx->opts.algorithm = MONGOCRYPT_ENCRYPTION_ALGORITHM_NONE;
310+
ctx->opts.retry_enabled = crypt->retry_enabled;
310311
ctx->state = MONGOCRYPT_CTX_DONE;
311312
return ctx;
312313
}
@@ -513,8 +514,9 @@ mongocrypt_kms_ctx_t *mongocrypt_ctx_next_kms_ctx(mongocrypt_ctx_t *ctx) {
513514
return NULL;
514515
}
515516

517+
mongocrypt_kms_ctx_t *ret;
516518
switch (ctx->state) {
517-
case MONGOCRYPT_CTX_NEED_KMS: return ctx->vtable.next_kms_ctx(ctx);
519+
case MONGOCRYPT_CTX_NEED_KMS: ret = ctx->vtable.next_kms_ctx(ctx); break;
518520
case MONGOCRYPT_CTX_ERROR: return NULL;
519521
case MONGOCRYPT_CTX_DONE:
520522
case MONGOCRYPT_CTX_NEED_KMS_CREDENTIALS:
@@ -525,6 +527,11 @@ mongocrypt_kms_ctx_t *mongocrypt_ctx_next_kms_ctx(mongocrypt_ctx_t *ctx) {
525527
case MONGOCRYPT_CTX_READY:
526528
default: _mongocrypt_ctx_fail_w_msg(ctx, "wrong state"); return NULL;
527529
}
530+
531+
if (ret) {
532+
ret->retry_enabled = ctx->opts.retry_enabled;
533+
}
534+
return ret;
528535
}
529536

530537
bool mongocrypt_ctx_provide_kms_providers(mongocrypt_ctx_t *ctx, mongocrypt_binary_t *kms_providers_definition) {

src/mongocrypt-key-broker.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,13 @@ mongocrypt_kms_ctx_t *_mongocrypt_key_broker_next_kms(_mongocrypt_key_broker_t *
817817
return NULL;
818818
}
819819

820+
// Check if any requests need retry
821+
for (key_returned_t *ptr = kb->keys_returned; ptr != NULL; ptr = ptr->next) {
822+
if (ptr->kms.should_retry) {
823+
ptr->kms.should_retry = false;
824+
return &ptr->kms;
825+
}
826+
}
820827
while (kb->decryptor_iter) {
821828
if (!kb->decryptor_iter->decrypted) {
822829
key_returned_t *key_returned;

src/mongocrypt-kms-ctx-private.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,14 @@ struct _mongocrypt_kms_ctx_t {
5656
char *endpoint;
5757
_mongocrypt_log_t *log;
5858
char *kmsid;
59+
int64_t sleep_usec;
60+
int attempts;
61+
bool retry_enabled;
62+
bool should_retry;
5963
};
6064

65+
static const int kms_max_attempts = 3;
66+
6167
bool _mongocrypt_kms_ctx_init_aws_decrypt(mongocrypt_kms_ctx_t *kms,
6268
_mongocrypt_opts_kms_providers_t *kms_providers,
6369
_mongocrypt_key_doc_t *key,

src/mongocrypt-kms-ctx.c

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@
2020
#include "mongocrypt-crypto-private.h"
2121
#include "mongocrypt-ctx-private.h"
2222
#include "mongocrypt-endpoint-private.h"
23+
#include "mongocrypt-kek-private.h"
2324
#include "mongocrypt-kms-ctx-private.h"
2425
#include "mongocrypt-log-private.h"
2526
#include "mongocrypt-opts-private.h"
2627
#include "mongocrypt-private.h"
2728
#include "mongocrypt-status-private.h"
2829
#include "mongocrypt-util-private.h"
2930
#include "mongocrypt.h"
31+
#include <bson/bson.h>
3032
#include <kms_message/kms_azure_request.h>
3133
#include <kms_message/kms_b64.h>
3234
#include <kms_message/kms_gcp_request.h>
@@ -142,6 +144,9 @@ _init_common(mongocrypt_kms_ctx_t *kms, _mongocrypt_log_t *log, _kms_request_typ
142144
kms->status = mongocrypt_status_new();
143145
kms->req_type = kms_type;
144146
_mongocrypt_buffer_init(&kms->result);
147+
kms->sleep_usec = 0;
148+
kms->attempts = 0;
149+
kms->should_retry = false;
145150
}
146151

147152
bool _mongocrypt_kms_ctx_init_aws_decrypt(mongocrypt_kms_ctx_t *kms,
@@ -427,11 +432,21 @@ uint32_t mongocrypt_kms_ctx_bytes_needed(mongocrypt_kms_ctx_t *kms) {
427432
if (!mongocrypt_status_ok(kms->status) || !_mongocrypt_buffer_empty(&kms->result)) {
428433
return 0;
429434
}
435+
if (kms->should_retry) {
436+
return 0;
437+
}
430438
want_bytes = kms_response_parser_wants_bytes(kms->parser, DEFAULT_MAX_KMS_BYTE_REQUEST);
431439
BSON_ASSERT(want_bytes >= 0);
432440
return (uint32_t)want_bytes;
433441
}
434442

443+
/* Return the delay, in microseconds, currently stored on the KMS context
 * (its `sleep_usec` field). A NULL context yields 0.
 */
int64_t mongocrypt_kms_ctx_usleep(mongocrypt_kms_ctx_t *kms) {
    return kms ? kms->sleep_usec : 0;
}
449+
435450
static void
436451
_handle_non200_http_status(int http_status, const char *body, size_t body_len, mongocrypt_status_t *status) {
437452
BSON_ASSERT_PARAM(body);
@@ -455,6 +470,55 @@ _handle_non200_http_status(int http_status, const char *body, size_t body_len, m
455470
CLIENT_ERR("Error in KMS response. HTTP status=%d. Response body=\n%s", http_status, body);
456471
}
457472

473+
/* Compute a randomized retry delay in microseconds.
 *
 * The un-jittered delay is 0.2 seconds for the first attempt and doubles with
 * every further attempt, capped at 20 seconds. The returned value is a random
 * fraction of that delay plus one microsecond, so it lies in
 * [1, delay + 1] and is never zero.
 *
 * `attempts` is the 1-based attempt count and must be positive (asserted).
 *
 * The C library random number generator is seeded from the wall clock on the
 * first call.
 */
static int64_t backoff_time_usec(int64_t attempts) {
    static bool seeded = false;
    if (!seeded) {
        srand((uint32_t)time(NULL));
        seeded = true;
    }

    /* Exponential backoff with jitter. */
    const int64_t base = 200000;  /* 0.2 seconds */
    const int64_t max = 20000000; /* 20 seconds */
    BSON_ASSERT(attempts > 0);

    /* Double step by step and stop at the cap. Computing
     * base * (1 << (attempts - 1)) directly is undefined for shift counts of
     * 63 or more and overflows int64_t well before that; the loop yields the
     * same value for every input where the direct form was well defined. */
    int64_t backoff = base;
    for (int64_t step = 1; step < attempts && backoff < max; step++) {
        backoff *= 2;
    }
    if (backoff > max) {
        backoff = max;
    }

    /* Full jitter: scale by a random factor in [0, 1], then add 1 so the
     * result is always positive. */
    return (int64_t)((double)rand() / (double)RAND_MAX * (double)backoff) + 1;
}
492+
493+
/* Decide whether an HTTP status returned by a KMS provider is worth retrying.
 *
 * The set of retryable statuses depends on the request type:
 *   - AWS encrypt/decrypt:      408, 429, 500, 502, 503, 509
 *   - Azure wrapkey/unwrapkey:  408, 429, 500, 502, 503, 504
 *   - GCP encrypt/decrypt:      408, 429, and any 5xx
 * Every other request type is never retried.
 *
 * Returns true when `http_status` is retryable for request type `t`.
 */
static bool should_retry_http(int http_status, _kms_request_type_t t) {
    static const int retryable_aws[] = {408, 429, 500, 502, 503, 509};
    static const int retryable_azure[] = {408, 429, 500, 502, 503, 504};

    if (t == MONGOCRYPT_KMS_AWS_ENCRYPT || t == MONGOCRYPT_KMS_AWS_DECRYPT) {
        for (size_t i = 0; i < sizeof(retryable_aws) / sizeof(retryable_aws[0]); i++) {
            if (http_status == retryable_aws[i]) {
                return true;
            }
        }
    } else if (t == MONGOCRYPT_KMS_AZURE_WRAPKEY || t == MONGOCRYPT_KMS_AZURE_UNWRAPKEY) {
        for (size_t i = 0; i < sizeof(retryable_azure) / sizeof(retryable_azure[0]); i++) {
            if (http_status == retryable_azure[i]) {
                return true;
            }
        }
    } else if (t == MONGOCRYPT_KMS_GCP_ENCRYPT || t == MONGOCRYPT_KMS_GCP_DECRYPT) {
        /* Explicit 5xx range. The earlier `http_status / 500 == 1` form also
         * accepted every value from 600 through 999. */
        const bool is_server_error = http_status >= 500 && http_status <= 599;
        if (http_status == 408 || http_status == 429 || is_server_error) {
            return true;
        }
    }
    return false;
}
515+
516+
/* Flag a KMS context for another attempt.
 *
 * Increments the attempt counter, stores the backoff delay computed from the
 * new attempt count in `sleep_usec`, and raises `should_retry`.
 */
static void set_retry(mongocrypt_kms_ctx_t *kms) {
    const int attempt_number = kms->attempts + 1;

    kms->attempts = attempt_number;
    kms->sleep_usec = backoff_time_usec(attempt_number);
    kms->should_retry = true;
}
521+
458522
/* An AWS KMS context has received full response. Parse out the result or error.
459523
*/
460524
static bool _ctx_done_aws(mongocrypt_kms_ctx_t *kms, const char *json_field) {
@@ -485,6 +549,21 @@ static bool _ctx_done_aws(mongocrypt_kms_ctx_t *kms, const char *json_field) {
485549
}
486550
body = kms_response_get_body(response, &body_len);
487551

552+
if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
553+
if (kms->attempts >= kms_max_attempts) {
554+
// Wrap error to indicate maximum retries occurred.
555+
_handle_non200_http_status(http_status, body, body_len, status);
556+
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
557+
kms_max_attempts,
558+
mongocrypt_status_message(status, NULL));
559+
goto fail;
560+
} else {
561+
ret = true;
562+
set_retry(kms);
563+
goto fail;
564+
}
565+
}
566+
488567
if (http_status != 200) {
489568
_handle_non200_http_status(http_status, body, body_len, status);
490569
goto fail;
@@ -643,6 +722,21 @@ static bool _ctx_done_azure_wrapkey_unwrapkey(mongocrypt_kms_ctx_t *kms) {
643722
}
644723
body = kms_response_get_body(response, &body_len);
645724

725+
if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
726+
if (kms->attempts >= kms_max_attempts) {
727+
// Wrap error to indicate maximum retries occurred.
728+
_handle_non200_http_status(http_status, body, body_len, status);
729+
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
730+
kms_max_attempts,
731+
mongocrypt_status_message(status, NULL));
732+
goto fail;
733+
} else {
734+
ret = true;
735+
set_retry(kms);
736+
goto fail;
737+
}
738+
}
739+
646740
if (body_len == 0) {
647741
CLIENT_ERR("Empty KMS response. HTTP status=%d", http_status);
648742
goto fail;
@@ -737,6 +831,21 @@ static bool _ctx_done_gcp(mongocrypt_kms_ctx_t *kms, const char *json_field) {
737831
}
738832
body = kms_response_get_body(response, &body_len);
739833

834+
if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
835+
if (kms->attempts >= kms_max_attempts) {
836+
// Wrap error to indicate maximum retries occurred.
837+
_handle_non200_http_status(http_status, body, body_len, status);
838+
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
839+
kms_max_attempts,
840+
mongocrypt_status_message(status, NULL));
841+
goto fail;
842+
} else {
843+
ret = true;
844+
set_retry(kms);
845+
goto fail;
846+
}
847+
}
848+
740849
if (http_status != 200) {
741850
_handle_non200_http_status(http_status, body, body_len, status);
742851
goto fail;
@@ -995,6 +1104,53 @@ static bool _ctx_done_kmip_decrypt(mongocrypt_kms_ctx_t *kms_ctx) {
9951104
return ret;
9961105
}
9971106

1107+
/* Report that a KMS request failed with a network error and ask whether it
 * may be retried.
 *
 * Returns true when the context has been marked for retry: the attempt counter
 * is incremented, a backoff delay is stored, and the response parser (if any)
 * is reset so a fresh reply can be fed.
 *
 * Returns false when `kms` is NULL, or when the request must not be retried
 * because retry is disabled, the maximum number of attempts has been reached,
 * or the request type is not in the retryable list. In every false case except
 * a NULL `kms`, an error is recorded through CLIENT_ERR.
 */
bool mongocrypt_kms_ctx_fail(mongocrypt_kms_ctx_t *kms) {
    /* Only a NULL context returns silently; there is nowhere to record an
     * error. A disabled retry must fall through to the check below so the
     * failure is recorded rather than dropped. */
    if (!kms) {
        return false;
    }

    kms->should_retry = false;
    /* NOTE(review): CLIENT_ERR is invoked without a status argument, so it
     * presumably writes to this local `status` — confirm against the macro. */
    mongocrypt_status_t *status = kms->status;

    if (!kms->retry_enabled) {
        CLIENT_ERR("KMS request failed due to network error");
        return false;
    }

    if (kms->attempts >= kms_max_attempts) {
        CLIENT_ERR("KMS request failed after %d retries due to a network error", kms_max_attempts);
        return false;
    }

    // Check if request type is retryable. Some requests are non-idempotent and cannot be safely retried.
    static const _kms_request_type_t retryable_types[] = {MONGOCRYPT_KMS_AWS_ENCRYPT,
                                                          MONGOCRYPT_KMS_AWS_DECRYPT,
                                                          MONGOCRYPT_KMS_AZURE_WRAPKEY,
                                                          MONGOCRYPT_KMS_AZURE_UNWRAPKEY,
                                                          MONGOCRYPT_KMS_GCP_ENCRYPT,
                                                          MONGOCRYPT_KMS_GCP_DECRYPT};
    bool is_retryable = false;
    for (size_t i = 0; i < sizeof(retryable_types) / sizeof(retryable_types[0]); i++) {
        if (retryable_types[i] == kms->req_type) {
            is_retryable = true;
            break;
        }
    }
    if (!is_retryable) {
        CLIENT_ERR("KMS request failed due to network error");
        return false;
    }

    // Mark KMS context as retryable. Return again in `mongocrypt_ctx_next_kms_ctx`.
    set_retry(kms);

    // Reset intermediate state of parser.
    if (kms->parser) {
        kms_response_parser_reset(kms->parser);
    }
    return true;
}
1153+
9981154
bool mongocrypt_kms_ctx_feed(mongocrypt_kms_ctx_t *kms, mongocrypt_binary_t *bytes) {
9991155
if (!kms) {
10001156
return false;

src/mongocrypt-private.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ struct _mongocrypt_t {
125125
_mongo_crypt_v1_vtable csfle;
126126
/// Pointer to the global csfle_lib object. Should not be freed directly.
127127
mongo_crypt_v1_lib *csfle_lib;
128+
bool retry_enabled;
128129
};
129130

130131
typedef enum {

0 commit comments

Comments
 (0)