From 1924d5d05b3167322d41f0ca129086e75f96c0f4 Mon Sep 17 00:00:00 2001 From: Sven Krieger <37476281+svkrieger@users.noreply.github.com> Date: Wed, 30 Apr 2025 12:56:10 +0200 Subject: [PATCH 1/2] Change usage of retry-after header With this change the retry-after header will be treated as an absolute value and not used for exponential backoff. Whenever the retry-after interval sent by the service broker is larger than the calculated interval using the base interval and exponential backoff, the retry-after interval will be used. When the CC calculated interval is larger the retry-after interval will be ignored. --- app/jobs/reoccurring_job.rb | 4 +- spec/unit/jobs/reoccurring_job_spec.rb | 146 +++++++++++++++++++++---- 2 files changed, 125 insertions(+), 25 deletions(-) diff --git a/app/jobs/reoccurring_job.rb b/app/jobs/reoccurring_job.rb index 2c6c9ed0fb7..1738cd01068 100644 --- a/app/jobs/reoccurring_job.rb +++ b/app/jobs/reoccurring_job.rb @@ -30,7 +30,7 @@ def maximum_duration_seconds=(duration) end def polling_interval_seconds - [@polling_interval || 0, default_polling_interval_seconds].max + @polling_interval || 0 end def polling_interval_seconds=(interval) @@ -59,7 +59,7 @@ def default_polling_exponential_backoff end def next_execution_in - polling_interval_seconds * (default_polling_exponential_backoff**retry_number) + [polling_interval_seconds, default_polling_interval_seconds * (default_polling_exponential_backoff**retry_number)].max end def next_enqueue_would_exceed_maximum_duration? diff --git a/spec/unit/jobs/reoccurring_job_spec.rb b/spec/unit/jobs/reoccurring_job_spec.rb index 8f7196ab086..f94c74b44b2 100644 --- a/spec/unit/jobs/reoccurring_job_spec.rb +++ b/spec/unit/jobs/reoccurring_job_spec.rb @@ -118,11 +118,13 @@ def perform expect(job.polling_interval_seconds).to eq(24.hours) end - context 'exponential backoff rate' do - context 'updates the polling interval' do - it 'when changing exponential backoff rate only' do + describe 'exponential backoff rate' do + context 'when changing exponential backoff rate only' do + before do TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 2.0 + end + it 'updates the polling interval' do enqueued_time = 0 Timecop.freeze do @@ -141,11 +143,15 @@ def perform end end end + end - it 'when changing exponential backoff rate and default polling interval' do + context 'when changing exponential backoff rate and default polling interval' do + before do TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3 TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 10 + end + it 'updates the polling interval' do enqueued_time = 0 Timecop.freeze do @@ -164,36 +170,130 @@ def perform end end end + end - it 'when changing exponential backoff rate and retry_after from the job' do - TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3 - TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 10 + describe 'changing exponential backoff rate and retry_after from the job' do + context 'when retry-after is larger than calculated backoff' do + let(:fake_job) { FakeJob.new(retry_after: [20, 30]) } - enqueued_time = 0 + before do + TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3 + TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 10 + end - Timecop.freeze do - Jobs::Enqueuer.new(queue: Jobs::Queues.generic).enqueue_pollable(FakeJob.new(retry_after: [20, 30])) - execute_all_jobs(expected_successes: 1, expected_failures: 0) - enqueued_time = Time.now + it 'uses retry-after interval' do + enqueued_time = 0 + + Timecop.freeze do + Jobs::Enqueuer.new(queue: Jobs::Queues.generic).enqueue_pollable(fake_job) + execute_all_jobs(expected_successes: 1, expected_failures: 0) + enqueued_time = Time.now + end + + # the job should run after 20s (20s > 10 * 1.3^0) + Timecop.freeze(19.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(21.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 30s (30s > 10 * 1.3^1) + Timecop.freeze(29.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(31.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end end + end - # the job should run after 20s * 1.3^0 = 20 seconds - Timecop.freeze(19.seconds.after(enqueued_time)) do - execute_all_jobs(expected_successes: 0, expected_failures: 0) + context 'when retry-after is smaller than calculated backoff' do + let(:fake_job) { FakeJob.new(retry_after: [10, 20]) } + + before do + TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3 + TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 30 end - Timecop.freeze(21.seconds.after(enqueued_time)) do - enqueued_time = Time.now - execute_all_jobs(expected_successes: 1, expected_failures: 0) + it 'uses calculated interval' do + enqueued_time = 0 + + Timecop.freeze do + Jobs::Enqueuer.new(queue: Jobs::Queues.generic).enqueue_pollable(fake_job) + execute_all_jobs(expected_successes: 1, expected_failures: 0) + enqueued_time = Time.now + end + + # the job should run after 30s (30s > 10s) + Timecop.freeze(29.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(31.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 30s (30s * 1.3^1 = 39 > 20s) + Timecop.freeze(38.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(40.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end end + end - # the job should run after 30s * 1.3^1 = 39 seconds - Timecop.freeze(38.seconds.after(enqueued_time)) do - execute_all_jobs(expected_successes: 0, expected_failures: 0) + context 'when calculated backoff gets larger than retry-after' do + let(:fake_job) { FakeJob.new(retry_after: [15, 15, 15]) } + + before do + TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 2 + TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 5 end - Timecop.freeze(40.seconds.after(enqueued_time)) do - execute_all_jobs(expected_successes: 1, expected_failures: 0) + it 'uses retry-after until calculated backoff is larger' do + enqueued_time = 0 + + Timecop.freeze do + Jobs::Enqueuer.new(queue: Jobs::Queues.generic).enqueue_pollable(fake_job) + execute_all_jobs(expected_successes: 1, expected_failures: 0) + enqueued_time = Time.now + end + + # the job should run after 15s (15s > 5s (5 * 2^0)) + Timecop.freeze(14.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(16.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 15s (15s > 10s (5 * 2^1)) + Timecop.freeze(14.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(16.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 20s (20s > 15s (5 * 2^2)) + Timecop.freeze(19.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(21.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end end end end From 47ef7e9eb1efa286b1cf7ff7d97e64428e915cf1 Mon Sep 17 00:00:00 2001 From: Sven Krieger <37476281+svkrieger@users.noreply.github.com> Date: Fri, 2 May 2025 13:25:35 +0200 Subject: [PATCH 2/2] Introduce config to cap service related polling interval The maximum polling interval was hardcoded to 24h when set on the job. Though, the exponential backoff calculation did not take this into account. Now the maximum can be configured through `broker_client_maximum_async_poll_interval_seconds` and will be taken into account for all scenarios. Default is still 24h and the config is optional. --- app/jobs/reoccurring_job.rb | 12 +++- config/cloud_controller.yml | 1 + .../config_schemas/base/api_schema.rb | 1 + .../config_schemas/base/worker_schema.rb | 1 + spec/unit/jobs/reoccurring_job_spec.rb | 59 ++++++++++++++++++- 5 files changed, 71 insertions(+), 3 deletions(-) diff --git a/app/jobs/reoccurring_job.rb b/app/jobs/reoccurring_job.rb index 1738cd01068..115996f2fae 100644 --- a/app/jobs/reoccurring_job.rb +++ b/app/jobs/reoccurring_job.rb @@ -35,7 +35,7 @@ def polling_interval_seconds def polling_interval_seconds=(interval) interval = interval.to_i if interval.is_a? String - @polling_interval = interval.clamp(default_polling_interval_seconds, 24.hours) + @polling_interval = interval.clamp(default_polling_interval_seconds, maximum_polling_interval) end private @@ -58,8 +58,16 @@ def default_polling_exponential_backoff Config.config.get(:broker_client_async_poll_exponential_backoff_rate) end + def maximum_polling_interval + Config.config.get(:broker_client_max_async_poll_interval_seconds) + end + def next_execution_in - [polling_interval_seconds, default_polling_interval_seconds * (default_polling_exponential_backoff**retry_number)].max + # use larger polling_interval. Either from job or calculated. + polling_interval = [polling_interval_seconds, default_polling_interval_seconds * (default_polling_exponential_backoff**retry_number)].max + + # cap polling interval at maximum_polling_interval + [polling_interval, maximum_polling_interval].min end def next_enqueue_would_exceed_maximum_duration? diff --git a/config/cloud_controller.yml b/config/cloud_controller.yml index 649a71d6f04..f0cbb85e6c2 100644 --- a/config/cloud_controller.yml +++ b/config/cloud_controller.yml @@ -71,6 +71,7 @@ max_retained_revisions_per_app: 100 broker_client_default_async_poll_interval_seconds: 60 broker_client_max_async_poll_duration_minutes: 10080 broker_client_async_poll_exponential_backoff_rate: 1.0 +broker_client_max_async_poll_interval_seconds: 86400 broker_client_response_parser: log_errors: true diff --git a/lib/cloud_controller/config_schemas/base/api_schema.rb b/lib/cloud_controller/config_schemas/base/api_schema.rb index 0c4ffd8092f..a0c7ba2b434 100644 --- a/lib/cloud_controller/config_schemas/base/api_schema.rb +++ b/lib/cloud_controller/config_schemas/base/api_schema.rb @@ -208,6 +208,7 @@ class ApiSchema < VCAP::Config broker_client_default_async_poll_interval_seconds: Integer, broker_client_max_async_poll_duration_minutes: Integer, broker_client_async_poll_exponential_backoff_rate: Numeric, + broker_client_max_async_poll_interval_seconds: Integer, optional(:broker_client_response_parser) => { log_errors: bool, log_validators: bool, diff --git a/lib/cloud_controller/config_schemas/base/worker_schema.rb b/lib/cloud_controller/config_schemas/base/worker_schema.rb index 84726503417..67ab3655c34 100644 --- a/lib/cloud_controller/config_schemas/base/worker_schema.rb +++ b/lib/cloud_controller/config_schemas/base/worker_schema.rb @@ -115,6 +115,7 @@ class WorkerSchema < VCAP::Config broker_client_default_async_poll_interval_seconds: Integer, broker_client_max_async_poll_duration_minutes: Integer, broker_client_async_poll_exponential_backoff_rate: Numeric, + broker_client_max_async_poll_interval_seconds: Integer, optional(:broker_client_response_parser) => { log_errors: bool, log_validators: bool, diff --git a/spec/unit/jobs/reoccurring_job_spec.rb b/spec/unit/jobs/reoccurring_job_spec.rb index f94c74b44b2..14cd72632d1 100644 --- a/spec/unit/jobs/reoccurring_job_spec.rb +++ b/spec/unit/jobs/reoccurring_job_spec.rb @@ -109,7 +109,7 @@ def perform end end - it 'keeps the polling interval within the bounds' do + it 'keeps the polling interval within the default bounds' do job = FakeJob.new job.polling_interval_seconds = 5 expect(job.polling_interval_seconds).to eq(60) @@ -118,6 +118,18 @@ def perform expect(job.polling_interval_seconds).to eq(24.hours) end + context 'when maximum polling interval is configured' do + before do + TestConfig.config[:broker_client_max_async_poll_interval_seconds] = 1800 + end + + it 'limits the polling interval to the configured maximum' do + job = FakeJob.new + job.polling_interval_seconds = 10.days + expect(job.polling_interval_seconds).to eq(1800) + end + end + describe 'exponential backoff rate' do context 'when changing exponential backoff rate only' do before do @@ -295,6 +307,51 @@ def perform execute_all_jobs(expected_successes: 1, expected_failures: 0) end end + + context 'when maximum polling interval is configured' do + before do + TestConfig.config[:broker_client_max_async_poll_interval_seconds] = 18 + end + + it 'limits the polling interval to the configured maximum' do + enqueued_time = 0 + + Timecop.freeze do + Jobs::Enqueuer.new(queue: Jobs::Queues.generic).enqueue_pollable(fake_job) + execute_all_jobs(expected_successes: 1, expected_failures: 0) + enqueued_time = Time.now + end + + # the job should run after 15s (15s > 5s (5 * 2^0)) + Timecop.freeze(14.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(16.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 15s (15s > 10s (5 * 2^1)) + Timecop.freeze(14.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(16.seconds.after(enqueued_time)) do + enqueued_time = Time.now + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + + # the job should run after 18s (capped at ) + Timecop.freeze(17.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 0, expected_failures: 0) + end + + Timecop.freeze(19.seconds.after(enqueued_time)) do + execute_all_jobs(expected_successes: 1, expected_failures: 0) + end + end + end end end