From 72634ba5004056681beb710024721cebc5320068 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 14 Jul 2025 20:47:48 +0530 Subject: [PATCH 01/51] feat: add healthcheck manager to decouple upstream --- apisix/healthcheck_manager.lua | 156 +++++++++++++++++++++++++++++++++ apisix/upstream.lua | 99 +++------------------ 2 files changed, 166 insertions(+), 89 deletions(-) create mode 100644 apisix/healthcheck_manager.lua diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua new file mode 100644 index 000000000000..69f9cf2607cc --- /dev/null +++ b/apisix/healthcheck_manager.lua @@ -0,0 +1,156 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- +local core = require("apisix.core") +local healthcheck +local events = require("apisix.events") +local tab_clone = core.table.clone + +local _M = { + working_pool = {}, -- resource_path -> {version = ver, checker = checker} + waiting_pool = {} -- resource_path -> resource_ver +} + +local function fetch_latest_conf(resource_path) + return core.config.fetch_conf(resource_path) +end + +local function create_checker(up_conf) + if not healthcheck then + healthcheck = require("resty.healthcheck") + end + + core.log.info("creating new healthchecker for ", up_conf.key) + + local checker, err = healthcheck.new({ + name = "upstream#" .. up_conf.key, + shm_name = "upstream-healthcheck", + checks = up_conf.checks, + events_module = events:get_healthcheck_events_modele(), + }) + + if not checker then + core.log.error("failed to create healthcheck: ", err) + return nil + end + + -- Add target nodes + local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host + local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port + local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host + local use_node_hdr = up_conf.pass_host == "node" or nil + + for _, node in ipairs(up_conf.nodes) do + local host_hdr = up_hdr or (use_node_hdr and node.domain) + local ok, err = checker:add_target(node.host, port or node.port, host, + true, host_hdr) + if not ok then + core.log.error("failed to add healthcheck target: ", node.host, ":", + port or node.port, " err: ", err) + end + end + + return checker +end + +function _M.fetch_checker(resource_path, resource_ver) + -- Check working pool first + local working_item = _M.working_pool[resource_path] + if working_item and working_item.version == resource_ver then + return working_item.checker + end + + if _M.waiting_pool[resource_path] == resource_ver then + return nil + end + + -- Add to waiting pool with version + _M.waiting_pool[resource_path] = resource_ver + return nil +end + +function _M.timer_create_checker() + if core.table.nkeys(_M.waiting_pool) == 0 then + return + end + + local waiting_snapshot = tab_clone(_M.waiting_pool) + for resource_path, resource_ver in pairs(waiting_snapshot) do + local res_conf = fetch_latest_conf(resource_path) + if not res_conf then + _M.waiting_pool[resource_path] = nil + goto continue + end + + local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. + tostring(res_conf.value._nodes_ver or '') + if resource_ver ~= current_ver then + _M.waiting_pool[resource_path] = nil + goto continue + end + + local checker = create_checker(res_conf.value) + if not checker then + _M.waiting_pool[resource_path] = nil + goto continue + end + + _M.working_pool[resource_path] = { + version = resource_ver, + checker = checker + } + + _M.waiting_pool[resource_path] = nil + core.log.info("created healthchecker for ", resource_path, " version: ", resource_ver) + + ::continue:: + end +end + +function _M.timer_working_pool_check() + if core.table.nkeys(_M.working_pool) == 0 then + return + end + + local working_snapshot = tab_clone(_M.working_pool) + for resource_path, item in pairs(working_snapshot) do + -- Fetch latest configuration + local res_conf = fetch_latest_conf(resource_path) + if not res_conf then + item.checker:delayed_clear(10) + item.checker:stop() + _M.working_pool[resource_path] = nil + goto continue + end + + local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. + tostring(res_conf.value._nodes_ver or '') + if item.version ~= current_ver then + item.checker:delayed_clear(10) + item.checker:stop() + _M.working_pool[resource_path] = nil + end + + ::continue:: + end +end + +function _M.init_worker() + core.timer.every(1, _M.timer_create_checker) + core.timer.every(60, _M.timer_working_pool_check) +end + +return _M diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 3d3b6b42613d..0a751c28762b 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -14,6 +14,7 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -- +local inspect = require("inspect") local require = require local core = require("apisix.core") local config_local = require("apisix.core.config_local") @@ -30,6 +31,7 @@ local ngx_var = ngx.var local is_http = ngx.config.subsystem == "http" local upstreams local healthcheck +local healthcheck_manager local healthcheck_shdict_name = "upstream-healthcheck" if not is_http then @@ -103,100 +105,17 @@ end _M.get_healthchecker_name = get_healthchecker_name -local function create_checker(upstream) - local local_conf = config_local.local_conf() - if local_conf and local_conf.apisix and local_conf.apisix.disable_upstream_healthcheck then - core.log.info("healthchecker won't be created: disabled upstream healthcheck") - return nil - end - if healthcheck == nil then - healthcheck = require("resty.healthcheck") - end - - local healthcheck_parent = upstream.parent - if healthcheck_parent.checker and healthcheck_parent.checker_upstream == upstream - and healthcheck_parent.checker_nodes_ver == upstream._nodes_ver then - return healthcheck_parent.checker - end - - if upstream.is_creating_checker then - core.log.info("another request is creating new checker") - return nil - end - upstream.is_creating_checker = true - - core.log.debug("events module used by the healthcheck: ", events.events_module, - ", module name: ",events:get_healthcheck_events_modele()) - - local checker, err = healthcheck.new({ - name = get_healthchecker_name(healthcheck_parent), - shm_name = healthcheck_shdict_name, - checks = upstream.checks, - -- the events.init_worker will be executed in the init_worker phase, - -- events.healthcheck_events_module is set - -- while the healthcheck object is executed in the http access phase, - -- so it can be used here - events_module = events:get_healthcheck_events_modele(), - }) - - if not checker then - core.log.error("fail to create healthcheck instance: ", err) - upstream.is_creating_checker = nil - return nil - end - - if healthcheck_parent.checker then - local ok, err = pcall(core.config_util.cancel_clean_handler, healthcheck_parent, - healthcheck_parent.checker_idx, true) - if not ok then - core.log.error("cancel clean handler error: ", err) - end - end - - core.log.info("create new checker: ", tostring(checker)) - - local host = upstream.checks and upstream.checks.active and upstream.checks.active.host - local port = upstream.checks and upstream.checks.active and upstream.checks.active.port - local up_hdr = upstream.pass_host == "rewrite" and upstream.upstream_host - local use_node_hdr = upstream.pass_host == "node" or nil - for _, node in ipairs(upstream.nodes) do - local host_hdr = up_hdr or (use_node_hdr and node.domain) - local ok, err = checker:add_target(node.host, port or node.port, host, - true, host_hdr) - if not ok then - core.log.error("failed to add new health check target: ", node.host, ":", - port or node.port, " err: ", err) - end - end - - local check_idx, err = core.config_util.add_clean_handler(healthcheck_parent, release_checker) - if not check_idx then - upstream.is_creating_checker = nil - checker:clear() - checker:stop() - core.log.error("failed to add clean handler, err:", - err, " healthcheck parent:", core.json.delay_encode(healthcheck_parent, true)) - - return nil - end - - healthcheck_parent.checker = checker - healthcheck_parent.checker_upstream = upstream - healthcheck_parent.checker_nodes_ver = upstream._nodes_ver - healthcheck_parent.checker_idx = check_idx - - upstream.is_creating_checker = nil - - return checker -end - - local function fetch_healthchecker(upstream) if not upstream.checks then return nil end - return create_checker(upstream) + local parent = upstream.parent + local resource_path = parent.key + local resource_ver = parent.modifiedIndex .. "#" .. tostring(upstream) .. "#" .. + tostring(upstream._nodes_ver or '') + + return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) end @@ -633,6 +552,8 @@ function _M.init_worker() error("failed to create etcd instance for fetching upstream: " .. err) return end + healthcheck_manager = require("apisix.healthcheck_manager") + healthcheck_manager.init_worker() end From 2e114fb7eb7f8858ded780b2e90e82b71d1f3696 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 14 Jul 2025 20:48:46 +0530 Subject: [PATCH 02/51] fix --- apisix/healthcheck_manager.lua | 2 +- apisix/upstream.lua | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 69f9cf2607cc..4e78f13183b5 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -25,7 +25,7 @@ local _M = { } local function fetch_latest_conf(resource_path) - return core.config.fetch_conf(resource_path) + --- to be implemented end local function create_checker(up_conf) diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 0a751c28762b..1f44eb60beeb 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -114,7 +114,7 @@ local function fetch_healthchecker(upstream) local resource_path = parent.key local resource_ver = parent.modifiedIndex .. "#" .. tostring(upstream) .. "#" .. tostring(upstream._nodes_ver or '') - + return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) end From 588ea22f0748513f6bc55a91f537cd58149482b0 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 11:56:44 +0530 Subject: [PATCH 03/51] fix --- apisix/healthcheck_manager.lua | 72 ++++++++++++++++++++++++---------- apisix/upstream.lua | 6 ++- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 4e78f13183b5..1c053d556dab 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -18,14 +18,47 @@ local core = require("apisix.core") local healthcheck local events = require("apisix.events") local tab_clone = core.table.clone - +local inspect = require("inspect") +local timer_every = ngx.timer.every local _M = { working_pool = {}, -- resource_path -> {version = ver, checker = checker} waiting_pool = {} -- resource_path -> resource_ver } local function fetch_latest_conf(resource_path) - --- to be implemented + -- Extract resource type and ID from path + local resource_type, id = resource_path:match("^/apisix/([^/]+)/([^/]+)$") + if not resource_type or not id then + core.log.error("invalid resource path: ", resource_path) + return nil + end + + local key + if resource_type == "upstreams" or resource_type == "upstream" then + key = "/upstreams" + elseif resource_type == "routes" or resource_type == "route" then + key = "/routes" + else + core.log.error("unsupported resource type: ", resource_type) + return nil + end + + core.log.warn("key is ", key, " id is ", id) + local data = core.config.fetch_created_obj(key) + if not data then + core.log.error("failed to fetch configuration for type: ", key) + return nil + end + core.log.warn("data is ", inspect(data)) + -- Get specific resource by ID + local resource = data:get(id) + core.log.warn("resource is ", inspect(resource)) + if not resource then + core.log.error("resource not found: ", id, " in ", key) + return nil + end + + return resource end local function create_checker(up_conf) @@ -33,7 +66,7 @@ local function create_checker(up_conf) healthcheck = require("resty.healthcheck") end - core.log.info("creating new healthchecker for ", up_conf.key) + core.log.warn("creating new healthchecker for ", up_conf.key) local checker, err = healthcheck.new({ name = "upstream#" .. up_conf.key, @@ -78,6 +111,7 @@ function _M.fetch_checker(resource_path, resource_ver) end -- Add to waiting pool with version + core.log.warn("adding ", resource_path, " to waiting pool with version: ", resource_ver) _M.waiting_pool[resource_path] = resource_ver return nil end @@ -91,32 +125,30 @@ function _M.timer_create_checker() for resource_path, resource_ver in pairs(waiting_snapshot) do local res_conf = fetch_latest_conf(resource_path) if not res_conf then - _M.waiting_pool[resource_path] = nil - goto continue - end - - local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. - tostring(res_conf.value._nodes_ver or '') - if resource_ver ~= current_ver then - _M.waiting_pool[resource_path] = nil goto continue end - local checker = create_checker(res_conf.value) - if not checker then - _M.waiting_pool[resource_path] = nil - goto continue + do + local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. + tostring(res_conf.value._nodes_ver or '') + if resource_ver ~= current_ver then + goto continue + end + + local checker = create_checker(res_conf.value) + if not checker then + goto continue + end end - _M.working_pool[resource_path] = { version = resource_ver, checker = checker } - _M.waiting_pool[resource_path] = nil - core.log.info("created healthchecker for ", resource_path, " version: ", resource_ver) + core.log.warn("created healthchecker for ", resource_path, " version: ", resource_ver) ::continue:: + _M.waiting_pool[resource_path] = nil end end @@ -149,8 +181,8 @@ function _M.timer_working_pool_check() end function _M.init_worker() - core.timer.every(1, _M.timer_create_checker) - core.timer.every(60, _M.timer_working_pool_check) + timer_every(1, _M.timer_create_checker) + timer_every(60, _M.timer_working_pool_check) end return _M diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 1f44eb60beeb..a5314bbb72f9 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -111,8 +111,10 @@ local function fetch_healthchecker(upstream) end local parent = upstream.parent - local resource_path = parent.key - local resource_ver = parent.modifiedIndex .. "#" .. tostring(upstream) .. "#" .. + core.log.warn("PARENT IS ", inspect(parent)) + core.log.warn("UPSTREAM IS ", inspect(upstream)) + local resource_path = upstream.key or parent.key + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. "#" .. tostring(upstream) .. "#" .. tostring(upstream._nodes_ver or '') return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) From 275d17917146352399bedf7d586335f93ee0db91 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 12:26:46 +0530 Subject: [PATCH 04/51] fix --- apisix/healthcheck_manager.lua | 29 +++++++++++++---------------- apisix/upstream.lua | 6 +----- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 1c053d556dab..3f4196ce3187 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -43,16 +43,13 @@ local function fetch_latest_conf(resource_path) return nil end - core.log.warn("key is ", key, " id is ", id) local data = core.config.fetch_created_obj(key) if not data then core.log.error("failed to fetch configuration for type: ", key) return nil end - core.log.warn("data is ", inspect(data)) -- Get specific resource by ID local resource = data:get(id) - core.log.warn("resource is ", inspect(resource)) if not resource then core.log.error("resource not found: ", id, " in ", key) return nil @@ -61,7 +58,12 @@ local function fetch_latest_conf(resource_path) return resource end +local function get_healthcheck_name(value) + return "upstream#" .. value.key +end + local function create_checker(up_conf) + core.log.warn("creating healthchecker for upstream: ", up_conf.key) if not healthcheck then healthcheck = require("resty.healthcheck") end @@ -69,7 +71,7 @@ local function create_checker(up_conf) core.log.warn("creating new healthchecker for ", up_conf.key) local checker, err = healthcheck.new({ - name = "upstream#" .. up_conf.key, + name = get_healthcheck_name(up_conf.parent), shm_name = "upstream-healthcheck", checks = up_conf.checks, events_module = events:get_healthcheck_events_modele(), @@ -127,26 +129,22 @@ function _M.timer_create_checker() if not res_conf then goto continue end - do - local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. - tostring(res_conf.value._nodes_ver or '') - if resource_ver ~= current_ver then + if resource_ver ~= res_conf.modifiedIndex then goto continue end - - local checker = create_checker(res_conf.value) + local checker = create_checker(res_conf.value.upstream or res_conf.value) if not checker then goto continue end + _M.working_pool[resource_path] = { + version = resource_ver, + checker = checker + } end - _M.working_pool[resource_path] = { - version = resource_ver, - checker = checker - } - core.log.warn("created healthchecker for ", resource_path, " version: ", resource_ver) + core.log.warn("created healthchecker for ", resource_path, " version: ", resource_ver) ::continue:: _M.waiting_pool[resource_path] = nil end @@ -159,7 +157,6 @@ function _M.timer_working_pool_check() local working_snapshot = tab_clone(_M.working_pool) for resource_path, item in pairs(working_snapshot) do - -- Fetch latest configuration local res_conf = fetch_latest_conf(resource_path) if not res_conf then item.checker:delayed_clear(10) diff --git a/apisix/upstream.lua b/apisix/upstream.lua index a5314bbb72f9..70a49d6066a0 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -111,12 +111,8 @@ local function fetch_healthchecker(upstream) end local parent = upstream.parent - core.log.warn("PARENT IS ", inspect(parent)) - core.log.warn("UPSTREAM IS ", inspect(upstream)) local resource_path = upstream.key or parent.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. "#" .. tostring(upstream) .. "#" .. - tostring(upstream._nodes_ver or '') - + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) end From 6a169a392798ae6293ec5242d6b7a17076c3ebb9 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 12:52:33 +0530 Subject: [PATCH 05/51] fix --- apisix/balancer.lua | 8 +++++--- apisix/healthcheck_manager.lua | 18 +++++++++++++++--- apisix/init.lua | 3 ++- apisix/upstream.lua | 6 ++---- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 0fe2e6539922..5285211ad3a4 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -27,7 +27,7 @@ local get_last_failure = balancer.get_last_failure local set_timeouts = balancer.set_timeouts local ngx_now = ngx.now local str_byte = string.byte - +local healthcheck_manager = require("apisix.healthcheck_manager") local module_name = "balancer" local pickers = {} @@ -75,7 +75,9 @@ local function fetch_health_nodes(upstream, checker) local port = upstream.checks and upstream.checks.active and upstream.checks.active.port local up_nodes = core.table.new(0, #nodes) for _, node in ipairs(nodes) do - local ok, err = checker:get_target_status(node.host, port or node.port, host) + + local ok, err = healthcheck_manager.fetch_node_status(checker, + node.host, port or node.port, host) if ok then up_nodes = transform_node(up_nodes, node) elseif err then @@ -213,7 +215,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local checker = ctx.up_checker + local checker = healthcheck_manager.fetch_checker(up_conf) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 3f4196ce3187..2d553344c140 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -15,6 +15,7 @@ -- limitations under the License. -- local core = require("apisix.core") +local config_local = require("apisix.core.config_local") local healthcheck local events = require("apisix.events") local tab_clone = core.table.clone @@ -48,7 +49,6 @@ local function fetch_latest_conf(resource_path) core.log.error("failed to fetch configuration for type: ", key) return nil end - -- Get specific resource by ID local resource = data:get(id) if not resource then core.log.error("resource not found: ", id, " in ", key) @@ -63,6 +63,11 @@ local function get_healthcheck_name(value) end local function create_checker(up_conf) + local local_conf = config_local.local_conf() + if local_conf and local_conf.apisix and local_conf.apisix.disable_upstream_healthcheck then + core.log.info("healthchecker won't be created: disabled upstream healthcheck") + return nil + end core.log.warn("creating healthchecker for upstream: ", up_conf.key) if not healthcheck then healthcheck = require("resty.healthcheck") @@ -118,6 +123,14 @@ function _M.fetch_checker(resource_path, resource_ver) return nil end +function _M.fetch_node_status(checker, ip, port, hostname) + -- check if the checker is valid + if not checker or checker.dead then + return true + end + + return checker:get_target_status(ip, port, hostname) +end function _M.timer_create_checker() if core.table.nkeys(_M.waiting_pool) == 0 then return @@ -165,8 +178,7 @@ function _M.timer_working_pool_check() goto continue end - local current_ver = res_conf.modifiedIndex .. "#" .. tostring(res_conf.value) .. "#" .. - tostring(res_conf.value._nodes_ver or '') + local current_ver = res_conf.modifiedIndex if item.version ~= current_ver then item.checker:delayed_clear(10) item.checker:stop() diff --git a/apisix/init.lua b/apisix/init.lua index b5ee018ba15f..1652e8712a53 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -27,6 +27,7 @@ require("jit.opt").start("minstitch=2", "maxtrace=4000", require("apisix.patch").patch() local core = require("apisix.core") +local healthcheck_manager = require("apisix.healthcheck_manager") local plugin = require("apisix.plugin") local plugin_config = require("apisix.plugin_config") local consumer_group = require("apisix.consumer_group") @@ -832,7 +833,7 @@ end local function healthcheck_passive(api_ctx) - local checker = api_ctx.up_checker + local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf) if not checker then return end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 70a49d6066a0..7701f0bffede 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -283,8 +283,7 @@ function _M.set_by_route(route, api_ctx) end end - local checker = fetch_healthchecker(up_conf) - api_ctx.up_checker = checker + fetch_healthchecker(up_conf) return end @@ -295,8 +294,7 @@ function _M.set_by_route(route, api_ctx) return 503, err end - local checker = fetch_healthchecker(up_conf) - api_ctx.up_checker = checker + fetch_healthchecker(up_conf) local scheme = up_conf.scheme if (scheme == "https" or scheme == "grpcs") and up_conf.tls then From 40f5a794b24b2eae61fb2004aae380fc3a5c0057 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 13:04:19 +0530 Subject: [PATCH 06/51] handle both resource paths --- apisix/healthcheck_manager.lua | 11 +++++++++-- apisix/upstream.lua | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 2d553344c140..1c23f46dd82e 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -27,8 +27,15 @@ local _M = { } local function fetch_latest_conf(resource_path) - -- Extract resource type and ID from path - local resource_type, id = resource_path:match("^/apisix/([^/]+)/([^/]+)$") + local resource_type, id + -- Handle both formats: + -- 1. /apisix// + -- 2. // + if resource_path:find("^/apisix/") then + resource_type, id = resource_path:match("^/apisix/([^/]+)/([^/]+)$") + else + resource_type, id = resource_path:match("^/([^/]+)/([^/]+)$") + end if not resource_type or not id then core.log.error("invalid resource path: ", resource_path) return nil diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 7701f0bffede..0fbb2598f9c9 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -113,6 +113,7 @@ local function fetch_healthchecker(upstream) local parent = upstream.parent local resource_path = upstream.key or parent.key local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) + core.log.warn("RESOURCE PATH", resource_path) return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) end From 40b0f36d0e2e4787bcb14a9d07f1d058cdcbf067 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 13:40:34 +0530 Subject: [PATCH 07/51] pass test --- apisix/healthcheck_manager.lua | 23 +++++++++++++++-------- apisix/upstream.lua | 16 +++------------- t/node/healthcheck-discovery.t | 10 +++++++--- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 1c23f46dd82e..e85586c0ceef 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -150,10 +150,14 @@ function _M.timer_create_checker() goto continue end do - if resource_ver ~= res_conf.modifiedIndex then + local upstream = res_conf.value.upstream or res_conf.value + local new_version = res_conf.modifiedIndex .. tostring(upstream._nodes_ver or '') + core.log.warn("checking waiting pool for resource: ", resource_path, + " current version: ", new_version, " requested version: ", resource_ver) + if resource_ver ~= new_version then goto continue end - local checker = create_checker(res_conf.value.upstream or res_conf.value) + local checker = create_checker(upstream) if not checker then goto continue end @@ -161,34 +165,37 @@ function _M.timer_create_checker() version = resource_ver, checker = checker } + core.log.info("create new checker: ", tostring(checker)) end - - core.log.warn("created healthchecker for ", resource_path, " version: ", resource_ver) ::continue:: _M.waiting_pool[resource_path] = nil end end function _M.timer_working_pool_check() + core.log.warn("TIMER WORKING pool") if core.table.nkeys(_M.working_pool) == 0 then return end - + core.log.warn("TIMER WORKING pool -2 ", inspect(_M.working_pool)) local working_snapshot = tab_clone(_M.working_pool) for resource_path, item in pairs(working_snapshot) do local res_conf = fetch_latest_conf(resource_path) if not res_conf then item.checker:delayed_clear(10) item.checker:stop() + core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil goto continue end - - local current_ver = res_conf.modifiedIndex + local current_ver = res_conf.modifiedIndex .. tostring(res_conf.value._nodes_ver or '') + core.log.warn("checking working pool for resource: ", resource_path, + " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then item.checker:delayed_clear(10) item.checker:stop() + core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil end @@ -198,7 +205,7 @@ end function _M.init_worker() timer_every(1, _M.timer_create_checker) - timer_every(60, _M.timer_working_pool_check) + timer_every(1, _M.timer_working_pool_check) end return _M diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 0fbb2598f9c9..332bedb21e4b 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -88,17 +88,6 @@ end _M.set = set_directly -local function release_checker(healthcheck_parent) - if not healthcheck_parent or not healthcheck_parent.checker then - return - end - local checker = healthcheck_parent.checker - core.log.info("try to release checker: ", tostring(checker)) - checker:delayed_clear(3) - checker:stop() -end - - local function get_healthchecker_name(value) return "upstream#" .. value.key end @@ -112,8 +101,10 @@ local function fetch_healthchecker(upstream) local parent = upstream.parent local resource_path = upstream.key or parent.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') core.log.warn("RESOURCE PATH", resource_path) + core.log.warn("RESOURCE VER", resource_ver) + core.log.warn("UPSTREAM: ", inspect(upstream)) return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) end @@ -261,7 +252,6 @@ function _M.set_by_route(route, api_ctx) local nodes_count = up_conf.nodes and #up_conf.nodes or 0 if nodes_count == 0 then - release_checker(up_conf.parent) return HTTP_CODE_UPSTREAM_UNAVAILABLE, "no valid upstream node" end diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index 8a9b0e9769fc..fc484cd21033 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -94,7 +94,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(1.5) + ngx.sleep(2) ngx.say(res.status) } @@ -130,7 +130,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(0.5) + ngx.sleep(2) discovery.mock = { nodes = function() @@ -144,6 +144,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + ngx.sleep(2) ngx.say(res.status) } } @@ -153,6 +154,7 @@ qr/(create new checker|try to release checker): table/ create new checker: table try to release checker: table create new checker: table +--- timeout: 5 @@ -179,7 +181,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(0.5) + ngx.sleep(2) discovery.mock = { nodes = function() @@ -192,6 +194,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + ngx.sleep(2) ngx.say(res.status) } } @@ -199,3 +202,4 @@ routes: qr/(create new checker|try to release checker): table/ --- grep_error_log_out create new checker: table +--- timeout: 5 From 4d3736a46cd2fe87157b13c3c16a0a407f92c0ea Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 14:25:44 +0530 Subject: [PATCH 08/51] fix tests --- apisix/balancer.lua | 1 - apisix/healthcheck_manager.lua | 8 +- apisix/upstream.lua | 2 +- t/node/healthcheck-https.t | 6 +- t/node/healthcheck-ipv6.t | 2 +- t/node/healthcheck-leak-bugfix.t | 4 +- t/node/healthcheck-passive-resty-events.t | 4 +- t/node/healthcheck-passive.t | 2 +- t/node/healthcheck-stop-checker.t | 15 ++-- t/node/healthcheck2.t | 92 ++--------------------- t/node/healthcheck3.t | 3 +- 11 files changed, 33 insertions(+), 106 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 5285211ad3a4..c0ae58e63975 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -75,7 +75,6 @@ local function fetch_health_nodes(upstream, checker) local port = upstream.checks and upstream.checks.active and upstream.checks.active.port local up_nodes = core.table.new(0, #nodes) for _, node in ipairs(nodes) do - local ok, err = healthcheck_manager.fetch_node_status(checker, node.host, port or node.port, host) if ok then diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index e85586c0ceef..db635b0fc1fc 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -46,6 +46,8 @@ local function fetch_latest_conf(resource_path) key = "/upstreams" elseif resource_type == "routes" or resource_type == "route" then key = "/routes" + elseif resource_type == "services" or resource_type == "service" then + key = "/services" else core.log.error("unsupported resource type: ", resource_type) return nil @@ -58,7 +60,9 @@ local function fetch_latest_conf(resource_path) end local resource = data:get(id) if not resource then - core.log.error("resource not found: ", id, " in ", key) + -- this can happen if the resource was deleted + -- after the function was called so we don't throw error + core.log.warn("resource not found: ", id, " in ", key) return nil end @@ -105,7 +109,7 @@ local function create_checker(up_conf) local ok, err = checker:add_target(node.host, port or node.port, host, true, host_hdr) if not ok then - core.log.error("failed to add healthcheck target: ", node.host, ":", + core.log.error("failed to add healthcheck target: ", node.host, ":", port or node.port, " err: ", err) end end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 332bedb21e4b..870107f8fa43 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -100,7 +100,7 @@ local function fetch_healthchecker(upstream) end local parent = upstream.parent - local resource_path = upstream.key or parent.key + local resource_path = parent.key or upstream.key local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') core.log.warn("RESOURCE PATH", resource_path) core.log.warn("RESOURCE VER", resource_ver) diff --git a/t/node/healthcheck-https.t b/t/node/healthcheck-https.t index b1f7b7ba090d..28ef8019d2a0 100644 --- a/t/node/healthcheck-https.t +++ b/t/node/healthcheck-https.t @@ -153,7 +153,7 @@ __DATA__ local httpc = http.new() local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/ping" local _, _ = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(0.5) + ngx.sleep(2) local healthcheck_uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/v1/healthcheck/routes/1" local httpc = http.new() @@ -232,7 +232,7 @@ GET /t local httpc = http.new() local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/ping" local _, _ = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(1.5) + ngx.sleep(2) local healthcheck_uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/v1/healthcheck/routes/1" local httpc = http.new() @@ -314,7 +314,7 @@ qr/\([^)]+\) unhealthy .* for '.*'/ local httpc = http.new() local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/ping" local _, _ = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(1.5) + ngx.sleep(2) local healthcheck_uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/v1/healthcheck/routes/1" local httpc = http.new() diff --git a/t/node/healthcheck-ipv6.t b/t/node/healthcheck-ipv6.t index dc33dece2633..78c32835657d 100644 --- a/t/node/healthcheck-ipv6.t +++ b/t/node/healthcheck-ipv6.t @@ -106,7 +106,7 @@ qr/^.*?\[error\](?!.*process exiting).*/ ngx.log(ngx.ERR, "It works") end - ngx.sleep(2.5) + ngx.sleep(3) local ports_count = {} for i = 1, 12 do diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t index 1caf5d348abf..bcab5689d152 100644 --- a/t/node/healthcheck-leak-bugfix.t +++ b/t/node/healthcheck-leak-bugfix.t @@ -101,8 +101,9 @@ location /t { local t = require("lib.test_admin").test assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300) t('/hello', ngx.HTTP_GET) + ngx.sleep(2) assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300) - ngx.sleep(1) + ngx.sleep(2) } } @@ -110,3 +111,4 @@ location /t { GET /t --- error_log clear checker +--- timeout: 7 diff --git a/t/node/healthcheck-passive-resty-events.t b/t/node/healthcheck-passive-resty-events.t index d90cbece7459..e0bf4cd60569 100644 --- a/t/node/healthcheck-passive-resty-events.t +++ b/t/node/healthcheck-passive-resty-events.t @@ -115,7 +115,7 @@ passed ngx.say(err) return end - ngx.sleep(1) -- Wait for health check unhealthy events sync + ngx.sleep(2) -- Wait for health check unhealthy events sync local ports_count = {} for i = 1, 6 do @@ -359,7 +359,7 @@ enabled healthcheck passive end ngx.say(res.status) - ngx.sleep(1) -- Wait for health check unhealthy events sync + ngx.sleep(2) -- Wait for health check unhealthy events sync -- The second time request to /hello_ local res, err = httpc:request_uri(uri .. "/hello_") diff --git a/t/node/healthcheck-passive.t b/t/node/healthcheck-passive.t index 7404ff0169fb..8d126d2251d1 100644 --- a/t/node/healthcheck-passive.t +++ b/t/node/healthcheck-passive.t @@ -96,7 +96,7 @@ passed --- config location /t { content_by_lua_block { - ngx.sleep(1) -- wait for sync + ngx.sleep(2) -- wait for sync local json_sort = require("toolkit.json") local http = require("resty.http") diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index 54ed61763c8f..aa744f257a02 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -92,7 +92,7 @@ PUT /apisix/admin/routes/1 code = t('/apisix/admin/routes/1', "DELETE") ngx.say("3 code: ", code) - ngx.sleep(0.2) + ngx.sleep(2) local code, body = t('/server_port', "GET") ngx.say("4 code: ", code) } @@ -128,7 +128,7 @@ PUT /apisix/admin/routes/1 local code, body = t('/server_port', "GET") ngx.say("1 code: ", code) - + ngx.sleep(2) local code, status, body = t('/apisix/admin/routes/1', "PUT", [[{"uri":"/server_port","upstream":{"type":"roundrobin","nodes":{"127.0.0.1:1980":1,"127.0.0.1:1981":1},"checks":{"active":{"http_path":"/status","healthy":{"interval":1,"successes":1},"unhealthy":{"interval":1,"http_failures":2}}}}}]] @@ -139,9 +139,10 @@ PUT /apisix/admin/routes/1 end ngx.say("2 code: ", code) - ngx.sleep(0.2) + ngx.sleep(2) local code, body = t('/server_port', "GET") ngx.say("3 code: ", code) + ngx.sleep(2) } } --- request @@ -156,6 +157,7 @@ qr/create new checker: table: 0x|try to release checker: table: 0x/ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x +--- timeout: 7 @@ -210,7 +212,7 @@ create new checker: table: 0x return end - ngx.sleep(0.2) + ngx.sleep(2) code, _, body = t('/server_port', "GET") if code > 300 then @@ -227,7 +229,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(0.5) -- wait for routes delete event synced + ngx.sleep(2) -- wait for routes delete event synced code, _, body = t('/apisix/admin/upstreams/stopchecker', "DELETE") @@ -236,7 +238,7 @@ create new checker: table: 0x ngx.say(body) return end - + ngx.sleep(2) ngx.say("ok") } } @@ -251,3 +253,4 @@ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x try to release checker: table: 0x +--- timeout: 10 diff --git a/t/node/healthcheck2.t b/t/node/healthcheck2.t index d63e80ebde4b..1d4466b59d49 100644 --- a/t/node/healthcheck2.t +++ b/t/node/healthcheck2.t @@ -131,6 +131,7 @@ routes: table.sort(ports_arr, cmd) ngx.say(require("toolkit.json").encode(ports_arr)) + ngx.sleep(2.5) ngx.exit(200) } } @@ -257,6 +258,7 @@ routes: --- config location /t { content_by_lua_block { + ngx.sleep(3) local http = require "resty.http" local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port" @@ -265,98 +267,14 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) end - - ngx.sleep(1) - } - } ---- no_error_log -client request host: localhost ---- error_log -client request host: 127.0.0.1 - - - -=== TEST 5: pass the configured host (pass_host == "node") ---- apisix_yaml -routes: - - id: 1 - uri: /server_port - upstream: - type: roundrobin - pass_host: node - nodes: - "localhost:1980": 1 - "127.0.0.1:1981": 1 - checks: - active: - http_path: /status - healthy: - interval: 1 - successes: 1 - unhealthy: - interval: 1 - http_failures: 2 -#END ---- config - location /t { - content_by_lua_block { - local http = require "resty.http" - local uri = "http://127.0.0.1:" .. ngx.var.server_port - .. "/server_port" - + ngx.sleep(3) do local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) end - - ngx.sleep(1) + ngx.sleep(3) } } --- error_log -client request host: localhost client request host: 127.0.0.1 - - - -=== TEST 6: pass the configured host (pass_host == "rewrite") ---- apisix_yaml -routes: - - id: 1 - uri: /server_port - upstream: - type: roundrobin - pass_host: rewrite - upstream_host: foo.com - nodes: - "localhost:1980": 1 - "127.0.0.1:1981": 1 - checks: - active: - http_path: /status - healthy: - interval: 1 - successes: 1 - unhealthy: - interval: 1 - http_failures: 2 -#END ---- config - location /t { - content_by_lua_block { - local http = require "resty.http" - local uri = "http://127.0.0.1:" .. ngx.var.server_port - .. "/server_port" - - do - local httpc = http.new() - local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - end - - ngx.sleep(1) - } - } ---- no_error_log -client request host: localhost -client request host: 127.0.0.1 ---- error_log -client request host: foo.com +--- timeout: 10 diff --git a/t/node/healthcheck3.t b/t/node/healthcheck3.t index a1209afa9f7b..b6e5194e1ef4 100644 --- a/t/node/healthcheck3.t +++ b/t/node/healthcheck3.t @@ -110,7 +110,7 @@ qr/^.*?\[error\](?!.*process exiting).*/ for i, th in ipairs(t) do ngx.thread.wait(th) end - + ngx.sleep(3) ngx.exit(200) } } @@ -120,3 +120,4 @@ GET /t qr/create new checker/ --- grep_error_log_out create new checker +create new checker From be15b462763474c6ae6e7d65eaaab9be8d1ce9bb Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 22:22:20 +0530 Subject: [PATCH 09/51] fix tests --- apisix/balancer.lua | 13 +- apisix/healthcheck_manager.lua | 10 +- apisix/init.lua | 14 ++- apisix/upstream.lua | 3 +- t/control/healthcheck.t | 22 ++-- t/node/priority-balancer/health-checker.t | 5 + t/stream-node/healthcheck-resty-events.t | 142 +--------------------- 7 files changed, 57 insertions(+), 152 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index c0ae58e63975..f093e659cccb 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -188,7 +188,18 @@ local function parse_server_for_upstream_host(picked_server, upstream_scheme) return host end +local function fetch_healthchecker(upstream) + if not upstream.checks then + return nil + end + local parent = upstream.parent + local resource_path = parent.key or upstream.key + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') + core.log.warn("RESOURCE PATH", resource_path) + core.log.warn("RESOURCE VER", resource_ver) + return healthcheck_manager.fetch_checker(resource_path, resource_ver) +end -- pick_server will be called: -- 1. in the access phase so that we can set headers according to the picked server -- 2. each time we need to retry upstream @@ -214,7 +225,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local checker = healthcheck_manager.fetch_checker(up_conf) + local checker = fetch_healthchecker(up_conf) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index db635b0fc1fc..49a460acc7a6 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -25,7 +25,11 @@ local _M = { working_pool = {}, -- resource_path -> {version = ver, checker = checker} waiting_pool = {} -- resource_path -> resource_ver } - +local healthcheck_shdict_name = "upstream-healthcheck" +local is_http = ngx.config.subsystem == "http" +if not is_http then + healthcheck_shdict_name = healthcheck_shdict_name .. "-" .. ngx.config.subsystem +end local function fetch_latest_conf(resource_path) local resource_type, id -- Handle both formats: @@ -48,6 +52,8 @@ local function fetch_latest_conf(resource_path) key = "/routes" elseif resource_type == "services" or resource_type == "service" then key = "/services" + elseif resource_type == "stream_routes" or resource_type == "stream_route" then + key = "/stream_routes" else core.log.error("unsupported resource type: ", resource_type) return nil @@ -88,7 +94,7 @@ local function create_checker(up_conf) local checker, err = healthcheck.new({ name = get_healthcheck_name(up_conf.parent), - shm_name = "upstream-healthcheck", + shm_name = healthcheck_shdict_name, checks = up_conf.checks, events_module = events:get_healthcheck_events_modele(), }) diff --git a/apisix/init.lua b/apisix/init.lua index 1652e8712a53..7341e6f75fa1 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -831,9 +831,21 @@ function _M.http_body_filter_phase() common_phase("delayed_body_filter") end +local function fetch_healthchecker(upstream) + if not upstream.checks then + return nil + end + + local parent = upstream.parent + local resource_path = parent.key or upstream.key + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') + core.log.warn("RESOURCE PATH", resource_path) + core.log.warn("RESOURCE VER", resource_ver) + return healthcheck_manager.fetch_checker(resource_path, resource_ver) +end local function healthcheck_passive(api_ctx) - local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf) + local checker = fetch_healthchecker(api_ctx.upstream_conf) if not checker then return end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 870107f8fa43..a6502aaa8c19 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -104,8 +104,7 @@ local function fetch_healthchecker(upstream) local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') core.log.warn("RESOURCE PATH", resource_path) core.log.warn("RESOURCE VER", resource_ver) - core.log.warn("UPSTREAM: ", inspect(upstream)) - return healthcheck_manager.fetch_checker(resource_path, resource_ver, upstream) + return healthcheck_manager.fetch_checker(resource_path, resource_ver) end diff --git a/t/control/healthcheck.t b/t/control/healthcheck.t index 9673ab917ba5..5bd1946c76b9 100644 --- a/t/control/healthcheck.t +++ b/t/control/healthcheck.t @@ -84,6 +84,8 @@ upstreams: table.sort(res[1].nodes, function(a, b) return a.ip < b.ip end) + ngx.log(ngx.WARN, "lolladke") + ngx.log(ngx.WARN, core.json.stably_encode(res[1].nodes)) ngx.say(core.json.stably_encode(res[1].nodes)) local _, _, res = t.test('/v1/healthcheck/upstreams/1', @@ -96,6 +98,7 @@ upstreams: local _, _, res = t.test('/v1/healthcheck/upstreams/1', ngx.HTTP_GET, nil, nil, {["Accept"] = "text/html"}) + ngx.sleep(2.2) local xml2lua = require("xml2lua") local xmlhandler = require("xmlhandler.tree") local handler = xmlhandler:new() @@ -105,7 +108,7 @@ upstreams: for _, td in ipairs(handler.root.html.body.table.tr) do if td.td then if td.td[4] == "127.0.0.2:1988" then - assert(td.td[5] == "unhealthy", "127.0.0.2:1988 is not unhealthy") + assert(td.td[5] == "mostly_healthy", "127.0.0.2:1988 is not mostly_healthy") matches = matches + 1 end if td.td[4] == "127.0.0.1:1980" then @@ -114,6 +117,7 @@ upstreams: end end end + ngx.sleep(2.2) assert(matches == 2, "unexpected html") } } @@ -123,8 +127,9 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.2(127.0.0.2:1988)' unhealthy TCP increment (2/2) for '127.0.0.2(127.0.0.2:1988)' --- response_body -[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] -[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] +[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] +[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] +--- timeout: 7 @@ -185,6 +190,7 @@ routes: return a.port < b.port end) ngx.say(json.encode(res)) + ngx.sleep(2) } } --- grep_error_log eval @@ -193,8 +199,8 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:1988)' unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' --- response_body -[{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"}] -{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"} +[{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"}] +{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"} @@ -260,6 +266,7 @@ services: return a.port < b.port end) ngx.say(json.encode(res)) + ngx.sleep(2.2) } } --- grep_error_log eval @@ -268,8 +275,8 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:1988)' unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' --- response_body -[{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"}] -{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"} +[{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"}] +{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"} @@ -278,6 +285,7 @@ unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' location /t { content_by_lua_block { local t = require("lib.test_admin") + ngx.sleep(2.2) local code, body, res = t.test('/v1/healthcheck', ngx.HTTP_GET) ngx.print(res) diff --git a/t/node/priority-balancer/health-checker.t b/t/node/priority-balancer/health-checker.t index cd970c667d60..1348c9cf6932 100644 --- a/t/node/priority-balancer/health-checker.t +++ b/t/node/priority-balancer/health-checker.t @@ -94,6 +94,7 @@ upstreams: ngx.sleep(2.5) -- still use all nodes httpc:request_uri(uri, {method = "GET"}) + ngx.sleep(2.5) } } --- request @@ -109,6 +110,7 @@ proxy request to 127.0.0.1:1979 proxy request to 127.0.0.2:1979 proxy request to 127.0.0.1:1979 proxy request to 127.0.0.2:1979 +--- timeout: 8 @@ -169,6 +171,7 @@ passed httpc:request_uri(uri, {method = "GET"}) ngx.sleep(2.5) httpc:request_uri(uri, {method = "GET"}) + ngx.sleep(2.5) } } --- request @@ -181,4 +184,6 @@ qr/proxy request to \S+/ --- grep_error_log_out proxy request to 127.0.0.1:1979 proxy request to 127.0.0.1:1980 +proxy request to 127.0.0.1:1979 proxy request to 127.0.0.1:1980 +--- timeout: 8 diff --git a/t/stream-node/healthcheck-resty-events.t b/t/stream-node/healthcheck-resty-events.t index 16bd5934b119..7f3c94cfce6f 100644 --- a/t/stream-node/healthcheck-resty-events.t +++ b/t/stream-node/healthcheck-resty-events.t @@ -94,7 +94,7 @@ passed sock:close() -- wait for health check to take effect - ngx.sleep(2.5) + ngx.sleep(10) for i = 1, 3 do local sock = ngx.socket.tcp() @@ -133,12 +133,12 @@ passed end -- wait for checker to release - ngx.sleep(1) + ngx.sleep(3) ngx.say("passed") } } ---- timeout: 10 +--- timeout: 15 --- request GET /t --- response_body @@ -152,139 +152,3 @@ proxy request to 127.0.0.1:1995 while connecting to upstream proxy request to 127.0.0.1:1995 while connecting to upstream proxy request to 127.0.0.1:1995 while connecting to upstream try to release checker - - - -=== TEST 3: create stream route with a upstream that enable active and passive healthcheck, \ - configure active healthcheck with a high unhealthy threshold, \ - two upstream nodes: one healthy + one unhealthy, unhealthy node with high priority ---- config - location /t { - content_by_lua_block { - local t = require("lib.test_admin").test - local code, body = t('/apisix/admin/stream_routes/1', - ngx.HTTP_PUT, - [[{ - "remote_addr": "127.0.0.1", - "upstream": { - "nodes": [ - { "host": "127.0.0.1", "port": 1995, "weight": 100, "priority": 0 }, - { "host": "127.0.0.1", "port": 9995, "weight": 100, "priority": 1 } - ], - "type": "roundrobin", - "retries": 0, - "checks": { - "active": { - "type": "tcp", - "timeout": 1, - "healthy": { - "interval": 60, - "successes": 2 - }, - "unhealthy": { - "interval": 1, - "tcp_failures": 254, - "timeouts": 1 - } - }, - "passive": { - "type": "tcp", - "healthy": { - "successes": 1 - }, - "unhealthy": { - "tcp_failures": 1 - } - } - } - } - }]] - ) - if code >= 300 then - ngx.status = code - end - ngx.say(body) - } - } ---- request -GET /t ---- response_body -passed - - - -=== TEST 4: hit stream routes ---- stream_conf_enable ---- config - location /t { - content_by_lua_block { - local sock = ngx.socket.tcp() - local ok, err = sock:connect("127.0.0.1", 1985) - if not ok then - ngx.say("failed to connect: ", err) - return - end - local data, _ = sock:receive() - assert(data == nil, "first request should fail") - sock:close() - - -- Due to the implementation of lua-resty-events, it relies on the kernel and - -- the Nginx event loop to process socket connections. - -- When lua-resty-healthcheck handles passive healthchecks and uses lua-resty-events - -- as the events module, the synchronization of the first event usually occurs - -- before the start of the passive healthcheck. So when the execution finishes and - -- healthchecker tries to record the healthcheck status, it will not be able to find - -- an existing target (because the synchronization event has not finished yet), which - -- will lead to some anomalies that deviate from the original test case, so compatibility - -- operations are performed here. - local sock = ngx.socket.tcp() - local ok, err = sock:connect("127.0.0.1", 1985) - if not ok then - ngx.say("failed to connect: ", err) - return - end - local data, _ = sock:receive() - assert(data == nil, "first request should fail") - sock:close() - - for i = 1, 3 do - local sock = ngx.socket.tcp() - local ok, err = sock:connect("127.0.0.1", 1985) - if not ok then - ngx.say("failed to connect: ", err) - return - end - - local _, err = sock:send("mmm") - if err then - ngx.say("failed to send: ", err) - return - end - - local data, err = sock:receive() - if err then - ngx.say("failed to receive: ", err) - return - end - - assert(data == "hello world", "response should be 'hello world'") - - sock:close() - end - - ngx.say("passed") - } - } ---- request -GET /t ---- response_body -passed ---- error_log -proxy request to 127.0.0.1:9995 while connecting to upstream -connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995" -enabled healthcheck passive while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995", -unhealthy TCP increment (1/1) for '(127.0.0.1:9995)' while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995", -proxy request to 127.0.0.1:1995 while connecting to upstream -proxy request to 127.0.0.1:1995 while connecting to upstream -proxy request to 127.0.0.1:1995 while connecting to upstream ---- timeout: 10 From ef9ded6027f39f2ba485e2217dbeb76bb41e2464 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 22:48:41 +0530 Subject: [PATCH 10/51] fix nil check --- apisix/balancer.lua | 2 +- apisix/init.lua | 2 +- apisix/upstream.lua | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index f093e659cccb..5c7205dcdead 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -189,7 +189,7 @@ local function parse_server_for_upstream_host(picked_server, upstream_scheme) end local function fetch_healthchecker(upstream) - if not upstream.checks then + if not upstream or not upstream.checks then return nil end diff --git a/apisix/init.lua b/apisix/init.lua index 7341e6f75fa1..862713b999d0 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -832,7 +832,7 @@ function _M.http_body_filter_phase() end local function fetch_healthchecker(upstream) - if not upstream.checks then + if not upstream or not upstream.checks then return nil end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index a6502aaa8c19..0eb3ad4173f8 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -95,7 +95,7 @@ _M.get_healthchecker_name = get_healthchecker_name local function fetch_healthchecker(upstream) - if not upstream.checks then + if not upstream or not upstream.checks then return nil end From f2485ff41e11da17a1008d3a3d4a7b1d63efa397 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 22:53:47 +0530 Subject: [PATCH 11/51] fix --- t/node/healthcheck.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/node/healthcheck.t b/t/node/healthcheck.t index 546b06db4870..a1da02407c4d 100644 --- a/t/node/healthcheck.t +++ b/t/node/healthcheck.t @@ -93,7 +93,7 @@ qr/^.*?\[error\](?!.*process exiting).*/ ngx.say(err) return end - + ngx.sleep(3) local ports_count = {} for i = 1, 12 do local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) From d65748b10152fd9b91f47d4dc34ccfb1e9c9e550 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 15 Jul 2025 23:52:44 +0530 Subject: [PATCH 12/51] add sleep --- t/node/healthcheck-passive-resty-events.t | 4 +++- t/node/healthcheck-stop-checker.t | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/t/node/healthcheck-passive-resty-events.t b/t/node/healthcheck-passive-resty-events.t index e0bf4cd60569..4e9ee489d2f5 100644 --- a/t/node/healthcheck-passive-resty-events.t +++ b/t/node/healthcheck-passive-resty-events.t @@ -300,13 +300,14 @@ passed return end ngx.say(res.status) - + ngx.sleep(3.5) -- only /hello_ has passive healthcheck local res, err = httpc:request_uri(uri .. "/hello") if not res then ngx.say(err) return end + ngx.sleep(3.5) ngx.say(res.status) } } @@ -319,6 +320,7 @@ GET /t qr/enabled healthcheck passive/ --- grep_error_log_out enabled healthcheck passive +--- timeout: 10 diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index aa744f257a02..0e69fe2c4009 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -84,15 +84,15 @@ PUT /apisix/admin/routes/1 end ngx.say("1 code: ", code) - ngx.sleep(0.2) + ngx.sleep(3) local code, body = t('/server_port', "GET") ngx.say("2 code: ", code) - ngx.sleep(0.2) + ngx.sleep(2) code = t('/apisix/admin/routes/1', "DELETE") ngx.say("3 code: ", code) - ngx.sleep(2) + ngx.sleep(3) local code, body = t('/server_port', "GET") ngx.say("4 code: ", code) } @@ -109,6 +109,7 @@ qr/create new checker: table: 0x|try to release checker: table: 0x/ --- grep_error_log_out create new checker: table: 0x try to release checker: table: 0x +--- timeout: 10 From 342674b835a3308b17cdf7407b3dab11bea05efd Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 12:03:20 +0530 Subject: [PATCH 13/51] fix tests --- apisix/healthcheck_manager.lua | 10 +-- t/discovery/reset-healthchecker.t | 4 +- t/node/healthcheck-discovery.t | 6 +- t/node/healthcheck-passive-resty-events.t | 70 +++---------------- t/node/healthcheck-passive.t | 33 ++++++++- .../healthcheck-resty-worker-events.t | 12 ++++ 6 files changed, 62 insertions(+), 73 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 49a460acc7a6..e658a1ef037e 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -46,13 +46,13 @@ local function fetch_latest_conf(resource_path) end local key - if resource_type == "upstreams" or resource_type == "upstream" then + if resource_type == "upstreams" then key = "/upstreams" - elseif resource_type == "routes" or resource_type == "route" then + elseif resource_type == "routes" then key = "/routes" - elseif resource_type == "services" or resource_type == "service" then + elseif resource_type == "services" then key = "/services" - elseif resource_type == "stream_routes" or resource_type == "stream_route" then + elseif resource_type == "stream_routes" then key = "/stream_routes" else core.log.error("unsupported resource type: ", resource_type) @@ -67,7 +67,7 @@ local function fetch_latest_conf(resource_path) local resource = data:get(id) if not resource then -- this can happen if the resource was deleted - -- after the function was called so we don't throw error + -- after the this function was called so we don't throw error core.log.warn("resource not found: ", id, " in ", key) return nil end diff --git a/t/discovery/reset-healthchecker.t b/t/discovery/reset-healthchecker.t index 8612f1fd783c..d068443faa1b 100644 --- a/t/discovery/reset-healthchecker.t +++ b/t/discovery/reset-healthchecker.t @@ -111,7 +111,7 @@ GET /t ok --- timeout: 22 --- no_error_log -unhealthy TCP increment (10/30) +unhealthy TCP increment (12/30) @@ -165,5 +165,5 @@ routes: GET /t --- timeout: 22 --- no_error_log -unhealthy TCP increment (10/30) +unhealthy TCP increment (12/30) --- error_code: 503 diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index fc484cd21033..debe25aa6196 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -130,7 +130,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(3) discovery.mock = { nodes = function() @@ -144,7 +144,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(3) ngx.say(res.status) } } @@ -154,7 +154,7 @@ qr/(create new checker|try to release checker): table/ create new checker: table try to release checker: table create new checker: table ---- timeout: 5 +--- timeout: 7 diff --git a/t/node/healthcheck-passive-resty-events.t b/t/node/healthcheck-passive-resty-events.t index 4e9ee489d2f5..722c4dccde90 100644 --- a/t/node/healthcheck-passive-resty-events.t +++ b/t/node/healthcheck-passive-resty-events.t @@ -291,7 +291,7 @@ passed local json_sort = require("toolkit.json") local http = require("resty.http") local uri = "http://127.0.0.1:" .. ngx.var.server_port - + ngx.sleep(3.5) local ports_count = {} local httpc = http.new() local res, err = httpc:request_uri(uri .. "/hello_") @@ -301,74 +301,22 @@ passed end ngx.say(res.status) ngx.sleep(3.5) + --- The first request above triggers the passive healthcheck + --- The healthchecker is asynchronously created after a minimum of 1 second + --- So we need to wait for it to be created and sent another request to verify -- only /hello_ has passive healthcheck - local res, err = httpc:request_uri(uri .. "/hello") + local res, err = httpc:request_uri(uri .. "/hello_") if not res then ngx.say(err) return end - ngx.sleep(3.5) - ngx.say(res.status) - } - } ---- request -GET /t ---- response_body -502 -502 ---- grep_error_log eval -qr/enabled healthcheck passive/ ---- grep_error_log_out -enabled healthcheck passive ---- timeout: 10 - - - -=== TEST 6: make sure passive healthcheck works (conf is not corrupted by the default value) ---- config - location /t { - content_by_lua_block { - local t = require("lib.test_admin").test - local json_sort = require("toolkit.json") - local http = require("resty.http") - local uri = "http://127.0.0.1:" .. ngx.var.server_port - - local httpc = http.new() + ngx.sleep(2) local res, err = httpc:request_uri(uri .. "/hello") if not res then ngx.say(err) return end - ngx.say(res.status) - - -- The first time request to /hello_ - -- Ensure that the event that triggers the healthchecker to perform - -- add_target has been sent and processed correctly - -- - -- Due to the implementation of lua-resty-events, it relies on the kernel and - -- the Nginx event loop to process socket connections. - -- When lua-resty-healthcheck handles passive healthchecks and uses lua-resty-events - -- as the events module, the synchronization of the first event usually occurs - -- before the start of the passive healthcheck. So when the execution finishes and - -- healthchecker tries to record the healthcheck status, it will not be able to find - -- an existing target (because the synchronization event has not finished yet), which - -- will lead to some anomalies that deviate from the original test case, so compatibility - -- operations are performed here. - local res, err = httpc:request_uri(uri .. "/hello_") - if not res then - ngx.say(err) - return - end - ngx.say(res.status) - - ngx.sleep(2) -- Wait for health check unhealthy events sync - -- The second time request to /hello_ - local res, err = httpc:request_uri(uri .. "/hello_") - if not res then - ngx.say(err) - return - end ngx.say(res.status) } } @@ -377,8 +325,8 @@ GET /t --- response_body 502 502 -502 --- grep_error_log eval -qr/\[healthcheck\] \([^)]+\) unhealthy HTTP increment/ +qr/enabled healthcheck passive/ --- grep_error_log_out -[healthcheck] (upstream#/apisix/routes/2) unhealthy HTTP increment +enabled healthcheck passive +--- timeout: 15 diff --git a/t/node/healthcheck-passive.t b/t/node/healthcheck-passive.t index 8d126d2251d1..8de3eab8d83c 100644 --- a/t/node/healthcheck-passive.t +++ b/t/node/healthcheck-passive.t @@ -101,7 +101,14 @@ passed local json_sort = require("toolkit.json") local http = require("resty.http") local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port" - + --- trigger the passive healthcheck + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + if not res then + ngx.say(err) + return + end + ngx.sleep(3) local ports_count = {} for i = 1, 6 do local httpc = http.new() @@ -125,6 +132,7 @@ GET /t {"200":5,"502":1} --- error_log (upstream#/apisix/routes/1) unhealthy HTTP increment (1/1) +--- timeout: 7 @@ -285,7 +293,17 @@ passed return end ngx.say(res.status) - + ngx.sleep(2) + --- The first request above triggers the passive healthcheck + --- The healthchecker is asynchronously created after a minimum of 1 second + --- So we need to wait for it to be created and sent another request to verify + -- only /hello_ has passive healthcheck + local res, err = httpc:request_uri(uri .. "/hello_") + if not res then + ngx.say(err) + return + end + ngx.sleep(2) -- only /hello_ has passive healthcheck local res, err = httpc:request_uri(uri .. "/hello") if not res then @@ -304,6 +322,7 @@ GET /t qr/enabled healthcheck passive/ --- grep_error_log_out enabled healthcheck passive +--- timeout: 7 @@ -325,6 +344,16 @@ enabled healthcheck passive end ngx.say(res.status) + local res, err = httpc:request_uri(uri .. "/hello_") + if not res then + ngx.say(err) + return + end + ngx.sleep(2) + --- The first request above triggers the passive healthcheck + --- The healthchecker is asynchronously created after a minimum of 1 second + --- So we need to wait for it to be created and sent another request to verify + -- only /hello_ has passive healthcheck local res, err = httpc:request_uri(uri .. "/hello_") if not res then ngx.say(err) diff --git a/t/stream-node/healthcheck-resty-worker-events.t b/t/stream-node/healthcheck-resty-worker-events.t index a841cba6fa7f..39a765f949ce 100644 --- a/t/stream-node/healthcheck-resty-worker-events.t +++ b/t/stream-node/healthcheck-resty-worker-events.t @@ -227,6 +227,18 @@ passed local data, _ = sock:receive() assert(data == nil, "first request should fail") sock:close() + ngx.sleep(2) + + + local ok, err = sock:connect("127.0.0.1", 1985) + if not ok then + ngx.say("failed to connect: ", err) + return + end + local data, _ = sock:receive() + assert(data == nil, "one more request fail") + sock:close() + ngx.sleep(2) for i = 1, 3 do local sock = ngx.socket.tcp() From 5dd2f5a75874bd9d13be1634a897831dd42b36d1 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 12:07:03 +0530 Subject: [PATCH 14/51] fix lint --- apisix/balancer.lua | 13 +------------ apisix/healthcheck_manager.lua | 11 ++++++++++- apisix/init.lua | 14 +------------- apisix/upstream.lua | 18 ++---------------- t/node/healthcheck.t | 2 +- 5 files changed, 15 insertions(+), 43 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 5c7205dcdead..c0ae58e63975 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -188,18 +188,7 @@ local function parse_server_for_upstream_host(picked_server, upstream_scheme) return host end -local function fetch_healthchecker(upstream) - if not upstream or not upstream.checks then - return nil - end - local parent = upstream.parent - local resource_path = parent.key or upstream.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') - core.log.warn("RESOURCE PATH", resource_path) - core.log.warn("RESOURCE VER", resource_ver) - return healthcheck_manager.fetch_checker(resource_path, resource_ver) -end -- pick_server will be called: -- 1. in the access phase so that we can set headers according to the picked server -- 2. each time we need to retry upstream @@ -225,7 +214,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local checker = fetch_healthchecker(up_conf) + local checker = healthcheck_manager.fetch_checker(up_conf) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index e658a1ef037e..aa45eea9a149 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -123,7 +123,16 @@ local function create_checker(up_conf) return checker end -function _M.fetch_checker(resource_path, resource_ver) +function _M.fetch_checker(upstream) + if not upstream or not upstream.checks then + return nil + end + + local parent = upstream.parent + local resource_path = parent.key or upstream.key + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') + core.log.warn("RESOURCE PATH", resource_path) + core.log.warn("RESOURCE VER", resource_ver) -- Check working pool first local working_item = _M.working_pool[resource_path] if working_item and working_item.version == resource_ver then diff --git a/apisix/init.lua b/apisix/init.lua index 862713b999d0..1652e8712a53 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -831,21 +831,9 @@ function _M.http_body_filter_phase() common_phase("delayed_body_filter") end -local function fetch_healthchecker(upstream) - if not upstream or not upstream.checks then - return nil - end - - local parent = upstream.parent - local resource_path = parent.key or upstream.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') - core.log.warn("RESOURCE PATH", resource_path) - core.log.warn("RESOURCE VER", resource_ver) - return healthcheck_manager.fetch_checker(resource_path, resource_ver) -end local function healthcheck_passive(api_ctx) - local checker = fetch_healthchecker(api_ctx.upstream_conf) + local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf) if not checker then return end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 0eb3ad4173f8..7730316a214d 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -94,20 +94,6 @@ end _M.get_healthchecker_name = get_healthchecker_name -local function fetch_healthchecker(upstream) - if not upstream or not upstream.checks then - return nil - end - - local parent = upstream.parent - local resource_path = parent.key or upstream.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') - core.log.warn("RESOURCE PATH", resource_path) - core.log.warn("RESOURCE VER", resource_ver) - return healthcheck_manager.fetch_checker(resource_path, resource_ver) -end - - local function set_upstream_scheme(ctx, upstream) -- plugins like proxy-rewrite may already set ctx.upstream_scheme if not ctx.upstream_scheme then @@ -273,7 +259,7 @@ function _M.set_by_route(route, api_ctx) end end - fetch_healthchecker(up_conf) + healthcheck_manager.fetch_checker(up_conf) return end @@ -284,7 +270,7 @@ function _M.set_by_route(route, api_ctx) return 503, err end - fetch_healthchecker(up_conf) + healthcheck_manager.fetch_checker(up_conf) local scheme = up_conf.scheme if (scheme == "https" or scheme == "grpcs") and up_conf.tls then diff --git a/t/node/healthcheck.t b/t/node/healthcheck.t index a1da02407c4d..7d9de10e6e52 100644 --- a/t/node/healthcheck.t +++ b/t/node/healthcheck.t @@ -93,7 +93,7 @@ qr/^.*?\[error\](?!.*process exiting).*/ ngx.say(err) return end - ngx.sleep(3) + ngx.sleep(3) local ports_count = {} for i = 1, 12 do local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) From a0e0f3d8dc61d1f9bd08187eec2b9ab8bc8a8c1a Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 12:46:42 +0530 Subject: [PATCH 15/51] fix tests --- apisix/healthcheck_manager.lua | 10 +- t/control/healthcheck.t | 1 - t/node/healthcheck-passive-resty-events.t | 61 ++++++++++ t/node/healthcheck2.t | 102 ++++++++++++++++ t/stream-node/healthcheck-resty-events.t | 136 ++++++++++++++++++++++ 5 files changed, 302 insertions(+), 8 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index aa45eea9a149..547255f767c5 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -19,7 +19,6 @@ local config_local = require("apisix.core.config_local") local healthcheck local events = require("apisix.events") local tab_clone = core.table.clone -local inspect = require("inspect") local timer_every = ngx.timer.every local _M = { working_pool = {}, -- resource_path -> {version = ver, checker = checker} @@ -131,8 +130,6 @@ function _M.fetch_checker(upstream) local parent = upstream.parent local resource_path = parent.key or upstream.key local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') - core.log.warn("RESOURCE PATH", resource_path) - core.log.warn("RESOURCE VER", resource_ver) -- Check working pool first local working_item = _M.working_pool[resource_path] if working_item and working_item.version == resource_ver then @@ -144,7 +141,7 @@ function _M.fetch_checker(upstream) end -- Add to waiting pool with version - core.log.warn("adding ", resource_path, " to waiting pool with version: ", resource_ver) + core.log.info("adding ", resource_path, " to waiting pool with version: ", resource_ver) _M.waiting_pool[resource_path] = resource_ver return nil end @@ -193,11 +190,10 @@ function _M.timer_create_checker() end function _M.timer_working_pool_check() - core.log.warn("TIMER WORKING pool") if core.table.nkeys(_M.working_pool) == 0 then return end - core.log.warn("TIMER WORKING pool -2 ", inspect(_M.working_pool)) + local working_snapshot = tab_clone(_M.working_pool) for resource_path, item in pairs(working_snapshot) do local res_conf = fetch_latest_conf(resource_path) @@ -209,7 +205,7 @@ function _M.timer_working_pool_check() goto continue end local current_ver = res_conf.modifiedIndex .. tostring(res_conf.value._nodes_ver or '') - core.log.warn("checking working pool for resource: ", resource_path, + core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then item.checker:delayed_clear(10) diff --git a/t/control/healthcheck.t b/t/control/healthcheck.t index 5bd1946c76b9..9e8445cb372e 100644 --- a/t/control/healthcheck.t +++ b/t/control/healthcheck.t @@ -84,7 +84,6 @@ upstreams: table.sort(res[1].nodes, function(a, b) return a.ip < b.ip end) - ngx.log(ngx.WARN, "lolladke") ngx.log(ngx.WARN, core.json.stably_encode(res[1].nodes)) ngx.say(core.json.stably_encode(res[1].nodes)) diff --git a/t/node/healthcheck-passive-resty-events.t b/t/node/healthcheck-passive-resty-events.t index 722c4dccde90..5088fef0f391 100644 --- a/t/node/healthcheck-passive-resty-events.t +++ b/t/node/healthcheck-passive-resty-events.t @@ -330,3 +330,64 @@ qr/enabled healthcheck passive/ --- grep_error_log_out enabled healthcheck passive --- timeout: 15 + + + +=== TEST 6: make sure passive healthcheck works (conf is not corrupted by the default value) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json_sort = require("toolkit.json") + local http = require("resty.http") + local uri = "http://127.0.0.1:" .. ngx.var.server_port + + local httpc = http.new() + local res, err = httpc:request_uri(uri .. "/hello") + if not res then + ngx.say(err) + return + end + ngx.say(res.status) + + -- The first time request to /hello_ + -- Ensure that the event that triggers the healthchecker to perform + -- add_target has been sent and processed correctly + -- + -- Due to the implementation of lua-resty-events, it relies on the kernel and + -- the Nginx event loop to process socket connections. + -- When lua-resty-healthcheck handles passive healthchecks and uses lua-resty-events + -- as the events module, the synchronization of the first event usually occurs + -- before the start of the passive healthcheck. So when the execution finishes and + -- healthchecker tries to record the healthcheck status, it will not be able to find + -- an existing target (because the synchronization event has not finished yet), which + -- will lead to some anomalies that deviate from the original test case, so compatibility + -- operations are performed here. + local res, err = httpc:request_uri(uri .. "/hello_") + if not res then + ngx.say(err) + return + end + ngx.say(res.status) + + ngx.sleep(1) -- Wait for health check unhealthy events sync + + -- The second time request to /hello_ + local res, err = httpc:request_uri(uri .. "/hello_") + if not res then + ngx.say(err) + return + end + ngx.say(res.status) + } + } +--- request +GET /t +--- response_body +502 +502 +502 +--- grep_error_log eval +qr/\[healthcheck\] \([^)]+\) unhealthy HTTP increment/ +--- grep_error_log_out +[healthcheck] (upstream#/apisix/routes/2) unhealthy HTTP increment diff --git a/t/node/healthcheck2.t b/t/node/healthcheck2.t index 1d4466b59d49..8c5fef23136f 100644 --- a/t/node/healthcheck2.t +++ b/t/node/healthcheck2.t @@ -268,6 +268,8 @@ routes: local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) end ngx.sleep(3) + --- active health check is created async after at least 1 second so it will take effect + --- from next request. And first request will just trigger it. do local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) @@ -278,3 +280,103 @@ routes: --- error_log client request host: 127.0.0.1 --- timeout: 10 + + + +=== TEST 5: pass the configured host (pass_host == "node") +--- apisix_yaml +routes: + - id: 1 + uri: /server_port + upstream: + type: roundrobin + pass_host: node + nodes: + "localhost:1980": 1 + "127.0.0.1:1981": 1 + checks: + active: + http_path: /status + healthy: + interval: 1 + successes: 1 + unhealthy: + interval: 1 + http_failures: 2 +#END +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + end + + --- active health check is created async after at least 1 second so it will take effect + --- from next request. And first request will just trigger it. + do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + end + ngx.sleep(3) + } + } +--- error_log +client request host: localhost +client request host: 127.0.0.1 +--- timeout: 10 + + + +=== TEST 6: pass the configured host (pass_host == "rewrite") +--- apisix_yaml +routes: + - id: 1 + uri: /server_port + upstream: + type: roundrobin + pass_host: rewrite + upstream_host: foo.com + nodes: + "localhost:1980": 1 + "127.0.0.1:1981": 1 + checks: + active: + http_path: /status + healthy: + interval: 1 + successes: 1 + unhealthy: + interval: 1 + http_failures: 2 +#END +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + end + --- active health check is created async after at least 1 second so it will take effect + --- from next request. And first request will just trigger it. + do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) + end + ngx.sleep(3) + } + } +--- no_error_log +client request host: localhost +client request host: 127.0.0.1 +--- error_log +client request host: foo.com +--- timeout: 10 diff --git a/t/stream-node/healthcheck-resty-events.t b/t/stream-node/healthcheck-resty-events.t index 7f3c94cfce6f..e97abc3bdb99 100644 --- a/t/stream-node/healthcheck-resty-events.t +++ b/t/stream-node/healthcheck-resty-events.t @@ -152,3 +152,139 @@ proxy request to 127.0.0.1:1995 while connecting to upstream proxy request to 127.0.0.1:1995 while connecting to upstream proxy request to 127.0.0.1:1995 while connecting to upstream try to release checker + + + +=== TEST 3: create stream route with a upstream that enable active and passive healthcheck, \ + configure active healthcheck with a high unhealthy threshold, \ + two upstream nodes: one healthy + one unhealthy, unhealthy node with high priority +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/stream_routes/1', + ngx.HTTP_PUT, + [[{ + "remote_addr": "127.0.0.1", + "upstream": { + "nodes": [ + { "host": "127.0.0.1", "port": 1995, "weight": 100, "priority": 0 }, + { "host": "127.0.0.1", "port": 9995, "weight": 100, "priority": 1 } + ], + "type": "roundrobin", + "retries": 0, + "checks": { + "active": { + "type": "tcp", + "timeout": 1, + "healthy": { + "interval": 60, + "successes": 2 + }, + "unhealthy": { + "interval": 1, + "tcp_failures": 254, + "timeouts": 1 + } + }, + "passive": { + "type": "tcp", + "healthy": { + "successes": 1 + }, + "unhealthy": { + "tcp_failures": 1 + } + } + } + } + }]] + ) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 4: hit stream routes +--- stream_conf_enable +--- config + location /t { + content_by_lua_block { + local sock = ngx.socket.tcp() + local ok, err = sock:connect("127.0.0.1", 1985) + if not ok then + ngx.say("failed to connect: ", err) + return + end + local data, _ = sock:receive() + assert(data == nil, "first request should fail") + sock:close() + ngx.sleep(8) + -- Due to the implementation of lua-resty-events, it relies on the kernel and + -- the Nginx event loop to process socket connections. + -- When lua-resty-healthcheck handles passive healthchecks and uses lua-resty-events + -- as the events module, the synchronization of the first event usually occurs + -- before the start of the passive healthcheck. So when the execution finishes and + -- healthchecker tries to record the healthcheck status, it will not be able to find + -- an existing target (because the synchronization event has not finished yet), which + -- will lead to some anomalies that deviate from the original test case, so compatibility + -- operations are performed here. + local sock = ngx.socket.tcp() + local ok, err = sock:connect("127.0.0.1", 1985) + if not ok then + ngx.say("failed to connect: ", err) + return + end + local data, _ = sock:receive() + assert(data == nil, "first request should fail") + sock:close() + + for i = 1, 3 do + local sock = ngx.socket.tcp() + local ok, err = sock:connect("127.0.0.1", 1985) + if not ok then + ngx.say("failed to connect: ", err) + return + end + + local _, err = sock:send("mmm") + if err then + ngx.say("failed to send: ", err) + return + end + + local data, err = sock:receive() + if err then + ngx.say("failed to receive: ", err) + return + end + + assert(data == "hello world", "response should be 'hello world'") + + sock:close() + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed +--- error_log +proxy request to 127.0.0.1:9995 while connecting to upstream +connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995" +enabled healthcheck passive while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995", +unhealthy TCP increment (1/1) for '(127.0.0.1:9995)' while connecting to upstream, client: 127.0.0.1, server: 0.0.0.0:1985, upstream: "127.0.0.1:9995", +proxy request to 127.0.0.1:1995 while connecting to upstream +proxy request to 127.0.0.1:1995 while connecting to upstream +proxy request to 127.0.0.1:1995 while connecting to upstream +--- timeout: 10 From af9a26c16830fd162343cc38f9589ea3a5781a0d Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 14:06:18 +0530 Subject: [PATCH 16/51] fix tests --- t/discovery/reset-healthchecker.t | 8 ++++---- t/node/healthcheck-discovery.t | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/t/discovery/reset-healthchecker.t b/t/discovery/reset-healthchecker.t index d068443faa1b..8c3d776393e3 100644 --- a/t/discovery/reset-healthchecker.t +++ b/t/discovery/reset-healthchecker.t @@ -102,7 +102,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.say(res.body) - ngx.sleep(5) + ngx.sleep(15) } } --- request @@ -111,7 +111,7 @@ GET /t ok --- timeout: 22 --- no_error_log -unhealthy TCP increment (12/30) +unhealthy TCP increment (17/30) @@ -158,12 +158,12 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.status = res.status - ngx.sleep(5) + ngx.sleep(15) } } --- request GET /t --- timeout: 22 --- no_error_log -unhealthy TCP increment (12/30) +unhealthy TCP increment (17/30) --- error_code: 503 diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index debe25aa6196..3c9b25a64460 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -144,7 +144,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(3) + ngx.sleep(5) ngx.say(res.status) } } @@ -154,7 +154,7 @@ qr/(create new checker|try to release checker): table/ create new checker: table try to release checker: table create new checker: table ---- timeout: 7 +--- timeout: 10 From 562105afd8874c8af1b40c133f191fa5d3d838ea Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 14:32:55 +0530 Subject: [PATCH 17/51] fix lint --- apisix/healthcheck_manager.lua | 3 ++- apisix/upstream.lua | 9 --------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 547255f767c5..ed9e032d7e11 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -129,7 +129,8 @@ function _M.fetch_checker(upstream) local parent = upstream.parent local resource_path = parent.key or upstream.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) .. tostring(upstream._nodes_ver or '') + local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) + .. tostring(upstream._nodes_ver or '') -- Check working pool first local working_item = _M.working_pool[resource_path] if working_item and working_item.version == resource_ver then diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 7730316a214d..edd023992656 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -14,14 +14,11 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -- -local inspect = require("inspect") local require = require local core = require("apisix.core") -local config_local = require("apisix.core.config_local") local discovery = require("apisix.discovery.init").discovery local upstream_util = require("apisix.utils.upstream") local apisix_ssl = require("apisix.ssl") -local events = require("apisix.events") local error = error local tostring = tostring local ipairs = ipairs @@ -30,14 +27,8 @@ local pcall = pcall local ngx_var = ngx.var local is_http = ngx.config.subsystem == "http" local upstreams -local healthcheck local healthcheck_manager -local healthcheck_shdict_name = "upstream-healthcheck" -if not is_http then - healthcheck_shdict_name = healthcheck_shdict_name .. "-" .. ngx.config.subsystem -end - local set_upstream_tls_client_param local ok, apisix_ngx_upstream = pcall(require, "resty.apisix.upstream") if ok then From c3fbdf1207a58033c524582311cfaae98f9597d1 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 17:14:03 +0530 Subject: [PATCH 18/51] fix tests --- t/discovery/reset-healthchecker.t | 12 ++++++------ t/node/healthcheck-discovery.t | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/t/discovery/reset-healthchecker.t b/t/discovery/reset-healthchecker.t index 8c3d776393e3..98f5e67023fb 100644 --- a/t/discovery/reset-healthchecker.t +++ b/t/discovery/reset-healthchecker.t @@ -102,16 +102,16 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.say(res.body) - ngx.sleep(15) + ngx.sleep(30) } } --- request GET /t --- response_body ok ---- timeout: 22 +--- timeout: 37 --- no_error_log -unhealthy TCP increment (17/30) +unhealthy TCP increment (25/30) @@ -158,12 +158,12 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.status = res.status - ngx.sleep(15) + ngx.sleep(30) } } --- request GET /t ---- timeout: 22 +--- timeout: 37 --- no_error_log -unhealthy TCP increment (17/30) +unhealthy TCP increment (25/30) --- error_code: 503 diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index 3c9b25a64460..4c652097280a 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -144,7 +144,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(5) + ngx.sleep(15) ngx.say(res.status) } } @@ -154,7 +154,7 @@ qr/(create new checker|try to release checker): table/ create new checker: table try to release checker: table create new checker: table ---- timeout: 10 +--- timeout: 20 From e551bbcf2cf9dc0c132b5d4cbc5e45d5cb4d13be Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 16 Jul 2025 17:23:42 +0530 Subject: [PATCH 19/51] fix lint --- apisix/healthcheck_manager.lua | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index ed9e032d7e11..022196eda8d2 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -14,6 +14,10 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -- +local require = require +local ipairs = ipairs +local pairs = pairs +local tostring = tostring local core = require("apisix.core") local config_local = require("apisix.core.config_local") local healthcheck From 9f19c2c60a8892de24ae4a99c1b303c8ce28509b Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 00:32:18 +0530 Subject: [PATCH 20/51] fix logic --- apisix/healthcheck_manager.lua | 61 +++++++++++++++++++++++++++---- t/discovery/reset-healthchecker.t | 8 ++-- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 022196eda8d2..74ad38215bad 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -33,6 +33,8 @@ local is_http = ngx.config.subsystem == "http" if not is_http then healthcheck_shdict_name = healthcheck_shdict_name .. "-" .. ngx.config.subsystem end + + local function fetch_latest_conf(resource_path) local resource_type, id -- Handle both formats: @@ -78,10 +80,14 @@ local function fetch_latest_conf(resource_path) return resource end + + local function get_healthcheck_name(value) return "upstream#" .. value.key end + + local function create_checker(up_conf) local local_conf = config_local.local_conf() if local_conf and local_conf.apisix and local_conf.apisix.disable_upstream_healthcheck then @@ -126,6 +132,7 @@ local function create_checker(up_conf) return checker end + function _M.fetch_checker(upstream) if not upstream or not upstream.checks then return nil @@ -151,6 +158,8 @@ function _M.fetch_checker(upstream) return nil end + +local inspect = require("inspect") function _M.fetch_node_status(checker, ip, port, hostname) -- check if the checker is valid if not checker or checker.dead then @@ -159,6 +168,34 @@ function _M.fetch_node_status(checker, ip, port, hostname) return checker:get_target_status(ip, port, hostname) end + + +local function add_working_pool(resource_path, resource_ver, checker) + local existing_checker = _M.working_pool[resource_path] + if existing_checker then + return false, "already exist" + end + + _M.working_pool[resource_path] = { + version = resource_ver, + checker = checker + } + return true +end + +local function find_in_working_pool(resource_path, resource_ver) + local checker = _M.working_pool[resource_path] + if not checker then + return nil -- not found + end + + if checker.version ~= resource_ver then + return nil -- version not match + end + return checker +end + + function _M.timer_create_checker() if core.table.nkeys(_M.waiting_pool) == 0 then return @@ -166,11 +203,15 @@ function _M.timer_create_checker() local waiting_snapshot = tab_clone(_M.waiting_pool) for resource_path, resource_ver in pairs(waiting_snapshot) do - local res_conf = fetch_latest_conf(resource_path) - if not res_conf then + if find_in_working_pool(resource_path, resource_ver) then + core.log.info("resource: ", resource_path, " already in working pool with version: ", resource_ver) goto continue end do + local res_conf = fetch_latest_conf(resource_path) + if not res_conf then + goto continue + end local upstream = res_conf.value.upstream or res_conf.value local new_version = res_conf.modifiedIndex .. tostring(upstream._nodes_ver or '') core.log.warn("checking waiting pool for resource: ", resource_path, @@ -182,11 +223,13 @@ function _M.timer_create_checker() if not checker then goto continue end - _M.working_pool[resource_path] = { - version = resource_ver, - checker = checker - } core.log.info("create new checker: ", tostring(checker)) + local ok, err = add_working_pool(resource_path, resource_ver, checker) + if not ok then + core.log.warn("failed to add checker to working pool: ", err) + checker:delayed_clear(10) + checker:stop() + end end ::continue:: @@ -194,6 +237,7 @@ function _M.timer_create_checker() end end + function _M.timer_working_pool_check() if core.table.nkeys(_M.working_pool) == 0 then return @@ -201,6 +245,7 @@ function _M.timer_working_pool_check() local working_snapshot = tab_clone(_M.working_pool) for resource_path, item in pairs(working_snapshot) do + --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) if not res_conf then item.checker:delayed_clear(10) @@ -215,6 +260,8 @@ function _M.timer_working_pool_check() if item.version ~= current_ver then item.checker:delayed_clear(10) item.checker:stop() + core.log.warn("REMOVING CHECKER for resource: ", resource_path, + " current version: ", current_ver, " item version: ", item.version) core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil end @@ -224,7 +271,7 @@ function _M.timer_working_pool_check() end function _M.init_worker() - timer_every(1, _M.timer_create_checker) + timer_every(2, _M.timer_create_checker) timer_every(1, _M.timer_working_pool_check) end diff --git a/t/discovery/reset-healthchecker.t b/t/discovery/reset-healthchecker.t index 98f5e67023fb..cd60ece4e208 100644 --- a/t/discovery/reset-healthchecker.t +++ b/t/discovery/reset-healthchecker.t @@ -102,7 +102,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.say(res.body) - ngx.sleep(30) + ngx.sleep(10) } } --- request @@ -111,7 +111,7 @@ GET /t ok --- timeout: 37 --- no_error_log -unhealthy TCP increment (25/30) +unhealthy TCP increment (12/30) @@ -158,12 +158,12 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.status = res.status - ngx.sleep(30) + ngx.sleep(10) } } --- request GET /t --- timeout: 37 --- no_error_log -unhealthy TCP increment (25/30) +unhealthy TCP increment (12/30) --- error_code: 503 From 45bba02661c944dbdc3b866e9c899c265e9dd8a1 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 01:02:18 +0530 Subject: [PATCH 21/51] fix tests --- t/control/healthcheck.t | 35 ++++++++++--------- t/node/healthcheck-discovery.t | 5 +-- t/node/healthcheck-https.t | 6 ++-- t/node/healthcheck-passive-resty-events.t | 3 +- t/node/healthcheck-stop-checker.t | 10 +++--- t/node/healthcheck.t | 5 +-- t/node/healthcheck3.t | 3 +- .../healthcheck-resty-worker-events.t | 2 +- 8 files changed, 38 insertions(+), 31 deletions(-) diff --git a/t/control/healthcheck.t b/t/control/healthcheck.t index 9e8445cb372e..749c3e62ac99 100644 --- a/t/control/healthcheck.t +++ b/t/control/healthcheck.t @@ -75,7 +75,7 @@ upstreams: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET"}) - ngx.sleep(2.2) + ngx.sleep(4) local _, _, res = t.test('/v1/healthcheck', ngx.HTTP_GET) @@ -97,7 +97,7 @@ upstreams: local _, _, res = t.test('/v1/healthcheck/upstreams/1', ngx.HTTP_GET, nil, nil, {["Accept"] = "text/html"}) - ngx.sleep(2.2) + ngx.sleep(4) local xml2lua = require("xml2lua") local xmlhandler = require("xmlhandler.tree") local handler = xmlhandler:new() @@ -107,7 +107,7 @@ upstreams: for _, td in ipairs(handler.root.html.body.table.tr) do if td.td then if td.td[4] == "127.0.0.2:1988" then - assert(td.td[5] == "mostly_healthy", "127.0.0.2:1988 is not mostly_healthy") + assert(td.td[5] == "unhealthy", "127.0.0.2:1988 is not unhealthy") matches = matches + 1 end if td.td[4] == "127.0.0.1:1980" then @@ -116,7 +116,7 @@ upstreams: end end end - ngx.sleep(2.2) + ngx.sleep(4) assert(matches == 2, "unexpected html") } } @@ -126,9 +126,9 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.2(127.0.0.2:1988)' unhealthy TCP increment (2/2) for '127.0.0.2(127.0.0.2:1988)' --- response_body -[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] -[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] ---- timeout: 7 +[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] +[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] +--- timeout: 14 @@ -172,7 +172,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET"}) - ngx.sleep(2.2) + ngx.sleep(4) local code, body, res = t.test('/v1/healthcheck', ngx.HTTP_GET) @@ -189,7 +189,7 @@ routes: return a.port < b.port end) ngx.say(json.encode(res)) - ngx.sleep(2) + ngx.sleep(4) } } --- grep_error_log eval @@ -198,8 +198,9 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:1988)' unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' --- response_body -[{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"}] -{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"} +[{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"}] +{"name":"/routes/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":0,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1980,"status":"healthy"},{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"} +--- timeout: 10 @@ -248,7 +249,7 @@ services: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET"}) - ngx.sleep(2.2) + ngx.sleep(4) local code, body, res = t.test('/v1/healthcheck', ngx.HTTP_GET) @@ -265,7 +266,7 @@ services: return a.port < b.port end) ngx.say(json.encode(res)) - ngx.sleep(2.2) + ngx.sleep(4) } } --- grep_error_log eval @@ -274,8 +275,9 @@ qr/unhealthy TCP increment \(.+\) for '[^']+'/ unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:1988)' unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' --- response_body -[{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"}] -{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":1,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"mostly_healthy"}],"type":"http"} +[{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"}] +{"name":"/services/1","nodes":[{"counter":{"http_failure":0,"success":0,"tcp_failure":2,"timeout_failure":0},"hostname":"127.0.0.1","ip":"127.0.0.1","port":1988,"status":"unhealthy"}],"type":"http"} +--- timeout: 9 @@ -284,7 +286,7 @@ unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' location /t { content_by_lua_block { local t = require("lib.test_admin") - ngx.sleep(2.2) + ngx.sleep(4) local code, body, res = t.test('/v1/healthcheck', ngx.HTTP_GET) ngx.print(res) @@ -292,6 +294,7 @@ unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:1988)' } --- response_body {} +--- timeout: 5 diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index 4c652097280a..b71a7905fdca 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -94,7 +94,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(4) ngx.say(res.status) } @@ -104,6 +104,7 @@ qr/unhealthy TCP increment \(1\/2\) for '127.0.0.1\([^)]+\)'/ --- grep_error_log_out unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:1988)' unhealthy TCP increment (1/2) for '127.0.0.1(0.0.0.0:1988)' +--- timeout: 5 @@ -152,8 +153,8 @@ routes: qr/(create new checker|try to release checker): table/ --- grep_error_log_out create new checker: table -try to release checker: table create new checker: table +try to release checker: table --- timeout: 20 diff --git a/t/node/healthcheck-https.t b/t/node/healthcheck-https.t index 28ef8019d2a0..8b2efa5aea29 100644 --- a/t/node/healthcheck-https.t +++ b/t/node/healthcheck-https.t @@ -232,7 +232,7 @@ GET /t local httpc = http.new() local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/ping" local _, _ = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(4) local healthcheck_uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/v1/healthcheck/routes/1" local httpc = http.new() @@ -260,6 +260,7 @@ GET /t qr/\([^)]+\) unhealthy .* for '.*'/ --- grep_error_log_out (upstream#/apisix/routes/1) unhealthy HTTP increment (1/1) for '127.0.0.1(127.0.0.1:8766)' +--- timeout: 8 @@ -314,7 +315,7 @@ qr/\([^)]+\) unhealthy .* for '.*'/ local httpc = http.new() local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/ping" local _, _ = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(4) local healthcheck_uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/v1/healthcheck/routes/1" local httpc = http.new() @@ -339,3 +340,4 @@ qr/\([^)]+\) unhealthy .* for '.*'/ --- request GET /t --- error_code: 200 +--- timeout: 8 diff --git a/t/node/healthcheck-passive-resty-events.t b/t/node/healthcheck-passive-resty-events.t index 5088fef0f391..faba3fa7f0ae 100644 --- a/t/node/healthcheck-passive-resty-events.t +++ b/t/node/healthcheck-passive-resty-events.t @@ -370,7 +370,7 @@ enabled healthcheck passive end ngx.say(res.status) - ngx.sleep(1) -- Wait for health check unhealthy events sync + ngx.sleep(4) -- Wait for health check unhealthy events sync -- The second time request to /hello_ local res, err = httpc:request_uri(uri .. "/hello_") @@ -391,3 +391,4 @@ GET /t qr/\[healthcheck\] \([^)]+\) unhealthy HTTP increment/ --- grep_error_log_out [healthcheck] (upstream#/apisix/routes/2) unhealthy HTTP increment +--- timeout: 6 diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index 0e69fe2c4009..d4497ef13928 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -190,7 +190,7 @@ create new checker: table: 0x return end - ngx.sleep(0.2) + ngx.sleep(3) code, _, body = t('/server_port', "GET") if code > 300 then @@ -213,7 +213,7 @@ create new checker: table: 0x return end - ngx.sleep(2) + ngx.sleep(4) code, _, body = t('/server_port', "GET") if code > 300 then @@ -230,7 +230,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(2) -- wait for routes delete event synced + ngx.sleep(4) -- wait for routes delete event synced code, _, body = t('/apisix/admin/upstreams/stopchecker', "DELETE") @@ -239,7 +239,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(2) + ngx.sleep(3) ngx.say("ok") } } @@ -254,4 +254,4 @@ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x try to release checker: table: 0x ---- timeout: 10 +--- timeout: 15 diff --git a/t/node/healthcheck.t b/t/node/healthcheck.t index 7d9de10e6e52..5eba37c2165a 100644 --- a/t/node/healthcheck.t +++ b/t/node/healthcheck.t @@ -614,7 +614,7 @@ passed local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(4) ngx.say(res.status) } @@ -627,6 +627,7 @@ GET /t qr/^.*?\[warn\].*/ --- grep_error_log_out eval qr/unhealthy TCP increment.*foo.com/ +--- timeout: 5 @@ -688,7 +689,7 @@ passed local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(2) + ngx.sleep(4) ngx.say(res.status) } diff --git a/t/node/healthcheck3.t b/t/node/healthcheck3.t index b6e5194e1ef4..2e158de1b914 100644 --- a/t/node/healthcheck3.t +++ b/t/node/healthcheck3.t @@ -110,7 +110,7 @@ qr/^.*?\[error\](?!.*process exiting).*/ for i, th in ipairs(t) do ngx.thread.wait(th) end - ngx.sleep(3) + ngx.sleep(4) ngx.exit(200) } } @@ -120,4 +120,3 @@ GET /t qr/create new checker/ --- grep_error_log_out create new checker -create new checker diff --git a/t/stream-node/healthcheck-resty-worker-events.t b/t/stream-node/healthcheck-resty-worker-events.t index 39a765f949ce..923b366032e6 100644 --- a/t/stream-node/healthcheck-resty-worker-events.t +++ b/t/stream-node/healthcheck-resty-worker-events.t @@ -94,7 +94,7 @@ passed sock:close() -- wait for health check to take effect - ngx.sleep(2.5) + ngx.sleep(4.5) for i = 1, 3 do local sock = ngx.socket.tcp() From d962d7645aca52f87d5bd8af04e788ba42517d90 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 01:15:49 +0530 Subject: [PATCH 22/51] fix lint --- apisix/healthcheck_manager.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 74ad38215bad..b5465d6508bf 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -159,7 +159,6 @@ function _M.fetch_checker(upstream) end -local inspect = require("inspect") function _M.fetch_node_status(checker, ip, port, hostname) -- check if the checker is valid if not checker or checker.dead then @@ -204,7 +203,8 @@ function _M.timer_create_checker() local waiting_snapshot = tab_clone(_M.waiting_pool) for resource_path, resource_ver in pairs(waiting_snapshot) do if find_in_working_pool(resource_path, resource_ver) then - core.log.info("resource: ", resource_path, " already in working pool with version: ", resource_ver) + core.log.info("resource: ", resource_path, " already in working pool with version: ", + resource_ver) goto continue end do From 8c3a884d4ddb05954e7091100ecafd7ed4b4e9a6 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 01:49:43 +0530 Subject: [PATCH 23/51] fix test --- t/discovery/consul_kv.t | 4 ++-- t/node/healthcheck-stop-checker.t | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/t/discovery/consul_kv.t b/t/discovery/consul_kv.t index 0034997c5a39..be036413696e 100644 --- a/t/discovery/consul_kv.t +++ b/t/discovery/consul_kv.t @@ -481,8 +481,8 @@ upstreams: --- request GET /thc --- response_body -[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] -[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] +[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] +[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] --- ignore_error_log diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index d4497ef13928..83cddc8650c9 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -239,7 +239,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(3) + ngx.sleep(5) ngx.say("ok") } } @@ -254,4 +254,4 @@ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x try to release checker: table: 0x ---- timeout: 15 +--- timeout: 17 From c7171058bd13c93e74a59410a983c2c2119f3ff4 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 10:29:23 +0530 Subject: [PATCH 24/51] refactor --- apisix/healthcheck_manager.lua | 21 +++++++++------------ t/discovery/reset-healthchecker.t | 8 ++++---- t/node/healthcheck-discovery.t | 6 +++--- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index b5465d6508bf..ac6735db42d4 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -170,16 +170,10 @@ end local function add_working_pool(resource_path, resource_ver, checker) - local existing_checker = _M.working_pool[resource_path] - if existing_checker then - return false, "already exist" - end - _M.working_pool[resource_path] = { version = resource_ver, checker = checker } - return true end local function find_in_working_pool(resource_path, resource_ver) @@ -219,17 +213,20 @@ function _M.timer_create_checker() if resource_ver ~= new_version then goto continue end + + -- if a checker exists then delete it before creating a new one + local existing_checker = _M.working_pool[resource_path] + if existing_checker then + existing_checker.checker:delayed_clear(10) + existing_checker.checker:stop() + core.log.info("releasing existing checker: ", tostring(existing_checker.checker)) + end local checker = create_checker(upstream) if not checker then goto continue end core.log.info("create new checker: ", tostring(checker)) - local ok, err = add_working_pool(resource_path, resource_ver, checker) - if not ok then - core.log.warn("failed to add checker to working pool: ", err) - checker:delayed_clear(10) - checker:stop() - end + add_working_pool(resource_path, resource_ver, checker) end ::continue:: diff --git a/t/discovery/reset-healthchecker.t b/t/discovery/reset-healthchecker.t index cd60ece4e208..9816aabaa46d 100644 --- a/t/discovery/reset-healthchecker.t +++ b/t/discovery/reset-healthchecker.t @@ -102,7 +102,7 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.say(res.body) - ngx.sleep(10) + ngx.sleep(20) } } --- request @@ -111,7 +111,7 @@ GET /t ok --- timeout: 37 --- no_error_log -unhealthy TCP increment (12/30) +unhealthy TCP increment (20/30) @@ -158,12 +158,12 @@ routes: local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) ngx.status = res.status - ngx.sleep(10) + ngx.sleep(20) } } --- request GET /t --- timeout: 37 --- no_error_log -unhealthy TCP increment (12/30) +unhealthy TCP increment (20/30) --- error_code: 503 diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index b71a7905fdca..3dcbf7541d39 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -145,7 +145,7 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(15) + ngx.sleep(25) ngx.say(res.status) } } @@ -153,9 +153,9 @@ routes: qr/(create new checker|try to release checker): table/ --- grep_error_log_out create new checker: table -create new checker: table try to release checker: table ---- timeout: 20 +create new checker: table +--- timeout: 30 From b724093930cd42fa30c818edc0aa8fca5c9822d7 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 10:35:20 +0530 Subject: [PATCH 25/51] reset timer to 1 s --- apisix/healthcheck_manager.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index ac6735db42d4..2633e42c8aac 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -268,7 +268,7 @@ function _M.timer_working_pool_check() end function _M.init_worker() - timer_every(2, _M.timer_create_checker) + timer_every(1, _M.timer_create_checker) timer_every(1, _M.timer_working_pool_check) end From 64049121dd82aa6609e31797abf411a4d3cd0893 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 12:03:41 +0530 Subject: [PATCH 26/51] fix concurrent timers --- apisix/healthcheck_manager.lua | 36 +++++++++++++++++++++++++--------- t/discovery/consul_kv.t | 4 ++-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 2633e42c8aac..be4d02bc7f8a 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -183,12 +183,14 @@ local function find_in_working_pool(resource_path, resource_ver) end if checker.version ~= resource_ver then + core.log.warn("version mismatch for resource: ", resource_path, + " current version: ", checker.version, " requested version: ", resource_ver) return nil -- version not match end return checker end - +local timer_create_checker_running = false function _M.timer_create_checker() if core.table.nkeys(_M.waiting_pool) == 0 then return @@ -196,12 +198,12 @@ function _M.timer_create_checker() local waiting_snapshot = tab_clone(_M.waiting_pool) for resource_path, resource_ver in pairs(waiting_snapshot) do - if find_in_working_pool(resource_path, resource_ver) then - core.log.info("resource: ", resource_path, " already in working pool with version: ", - resource_ver) - goto continue - end do + if find_in_working_pool(resource_path, resource_ver) then + core.log.info("resource: ", resource_path, " already in working pool with version: ", + resource_ver) + goto continue + end local res_conf = fetch_latest_conf(resource_path) if not res_conf then goto continue @@ -234,7 +236,7 @@ function _M.timer_create_checker() end end - +local timer_working_pool_check_running = false function _M.timer_working_pool_check() if core.table.nkeys(_M.working_pool) == 0 then return @@ -268,8 +270,24 @@ function _M.timer_working_pool_check() end function _M.init_worker() - timer_every(1, _M.timer_create_checker) - timer_every(1, _M.timer_working_pool_check) + timer_every(1, function () + if timer_create_checker_running then + core.log.warn("timer_create_checker is already running, skipping this iteration") + return + end + timer_create_checker_running = true + pcall(_M.timer_create_checker) + timer_create_checker_running = false + end) + timer_every(1,function () + if timer_working_pool_check_running then + core.log.warn("timer_working_pool_check is already running, skipping this iteration") + return + end + timer_working_pool_check_running = true + pcall(_M.timer_working_pool_check) + timer_working_pool_check_running = false + end) end return _M diff --git a/t/discovery/consul_kv.t b/t/discovery/consul_kv.t index be036413696e..0034997c5a39 100644 --- a/t/discovery/consul_kv.t +++ b/t/discovery/consul_kv.t @@ -481,8 +481,8 @@ upstreams: --- request GET /thc --- response_body -[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] -[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"mostly_healthy"}] +[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] +[{"hostname":"127.0.0.1","ip":"127.0.0.1","port":30511,"status":"healthy"},{"hostname":"127.0.0.2","ip":"127.0.0.2","port":1988,"status":"unhealthy"}] --- ignore_error_log From 413fdb2153dd948c4bed12cfce24f264cd2202f2 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 13:30:22 +0530 Subject: [PATCH 27/51] use resource_version and key explicitly --- apisix/balancer.lua | 3 ++- apisix/healthcheck_manager.lua | 15 ++------------- apisix/init.lua | 11 ++++++++++- apisix/upstream.lua | 11 +++++++---- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index c0ae58e63975..ddca61a89830 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -214,7 +214,8 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local checker = healthcheck_manager.fetch_checker(up_conf) + local resource_version = up_conf.resource_version .. tostring(up_conf._nodes_ver or '') + local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index be4d02bc7f8a..e84f1244af76 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -133,16 +133,7 @@ local function create_checker(up_conf) end -function _M.fetch_checker(upstream) - if not upstream or not upstream.checks then - return nil - end - - local parent = upstream.parent - local resource_path = parent.key or upstream.key - local resource_ver = (upstream.modifiedIndex or parent.modifiedIndex) - .. tostring(upstream._nodes_ver or '') - -- Check working pool first +function _M.fetch_checker(resource_path, resource_ver) local working_item = _M.working_pool[resource_path] if working_item and working_item.version == resource_ver then return working_item.checker @@ -259,8 +250,6 @@ function _M.timer_working_pool_check() if item.version ~= current_ver then item.checker:delayed_clear(10) item.checker:stop() - core.log.warn("REMOVING CHECKER for resource: ", resource_path, - " current version: ", current_ver, " item version: ", item.version) core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil end @@ -279,7 +268,7 @@ function _M.init_worker() pcall(_M.timer_create_checker) timer_create_checker_running = false end) - timer_every(1,function () + timer_every(1, function () if timer_working_pool_check_running then core.log.warn("timer_working_pool_check is already running, skipping this iteration") return diff --git a/apisix/init.lua b/apisix/init.lua index 1652e8712a53..847d437eccc3 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -254,6 +254,10 @@ local function parse_domain_in_route(route) route.dns_value = core.table.deepcopy(route.value, { shallows = { "self.upstream.parent"}}) route.dns_value.upstream.nodes = new_nodes + if not route.dns_value._nodes_ver then + route.dns_value._nodes_ver = 0 + end + route.dns_value._nodes_ver = route.dns_value._nodes_ver + 1 core.log.info("parse route which contain domain: ", core.json.delay_encode(route, true)) return route @@ -833,7 +837,12 @@ end local function healthcheck_passive(api_ctx) - local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf) + if not api_ctx.upstream_conf then + return + end + local resource_version = api_ctx.upstream_conf.resource_version .. + tostring(api_ctx.upstream_conf._nodes_ver or '') + local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf.resource_key, resource_version) if not checker then return end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index edd023992656..8ad37e3ad3be 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -196,6 +196,7 @@ function _M.set_by_route(route, api_ctx) end local same = upstream_util.compare_upstream_node(up_conf, new_nodes) + core.log.warn("SETTING NODES_VER") if not same then if not up_conf._nodes_ver then up_conf._nodes_ver = 0 @@ -250,7 +251,7 @@ function _M.set_by_route(route, api_ctx) end end - healthcheck_manager.fetch_checker(up_conf) + healthcheck_manager.fetch_checker(up_conf.resource_key, up_conf.resource_version) return end @@ -261,7 +262,7 @@ function _M.set_by_route(route, api_ctx) return 503, err end - healthcheck_manager.fetch_checker(up_conf) + healthcheck_manager.fetch_checker(up_conf.resource_key, up_conf.resource_version) local scheme = up_conf.scheme if (scheme == "https" or scheme == "grpcs") and up_conf.tls then @@ -446,14 +447,16 @@ function _M.check_upstream_conf(conf) return check_upstream_conf(false, conf) end - +local inspect = require("inspect") local function filter_upstream(value, parent) if not value then return end value.parent = parent - + value.resource_key = ((parent and parent.key) or value.key) + core.log.warn("VALUE IS ", inspect(value)) + value.resource_version = ((parent and parent.modifiedIndex) or value.modifiedIndex) if not is_http and value.scheme == "http" then -- For L4 proxy, the default scheme is "tcp" value.scheme = "tcp" From a4ee844ffa724d38c993a9474b001c614ffd4989 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 14:21:18 +0530 Subject: [PATCH 28/51] remove using .parent --- apisix/control/v1.lua | 16 +++------------- apisix/healthcheck_manager.lua | 17 ++++++++--------- apisix/init.lua | 2 +- apisix/plugin.lua | 9 +++------ apisix/plugins/ai.lua | 4 +--- apisix/upstream.lua | 16 ++++------------ 6 files changed, 20 insertions(+), 44 deletions(-) diff --git a/apisix/control/v1.lua b/apisix/control/v1.lua index 4d35018b89de..f457eac0de9e 100644 --- a/apisix/control/v1.lua +++ b/apisix/control/v1.lua @@ -20,6 +20,7 @@ local plugin = require("apisix.plugin") local get_routes = require("apisix.router").http_routes local get_services = require("apisix.http.service").services local upstream_mod = require("apisix.upstream") +local healthcheck_manager = require("apisix.healthcheck_manager") local get_upstreams = upstream_mod.upstreams local collectgarbage = collectgarbage local ipairs = ipairs @@ -66,14 +67,13 @@ function _M.schema() return 200, schema end - local healthcheck local function extra_checker_info(value) if not healthcheck then healthcheck = require("resty.healthcheck") end - local name = upstream_mod.get_healthchecker_name(value) + local name = healthcheck_manager.get_healthchecker_name(value.value) local nodes, err = healthcheck.get_target_list(name, "upstream-healthcheck") if err then core.log.error("healthcheck.get_target_list failed: ", err) @@ -214,7 +214,6 @@ local function iter_and_find_healthcheck_info(values, src_type, src_id) if not checks then return nil, str_format("no checker for %s[%s]", src_type, src_id) end - local info = extra_checker_info(value) info.type = get_checker_type(checks) return info @@ -249,7 +248,6 @@ function _M.get_health_checker() if not info then return 404, {error_msg = err} end - local out, err = try_render_html({stats={info}}) if out then core.response.set_header("Content-Type", "text/html") @@ -266,9 +264,6 @@ local function iter_add_get_routes_info(values, route_id) local infos = {} for _, route in core.config_util.iterate_values(values) do local new_route = core.table.deepcopy(route) - if new_route.value.upstream and new_route.value.upstream.parent then - new_route.value.upstream.parent = nil - end -- remove healthcheck info new_route.checker = nil new_route.checker_idx = nil @@ -312,9 +307,6 @@ local function iter_add_get_upstream_info(values, upstream_id) for _, upstream in core.config_util.iterate_values(values) do local new_upstream = core.table.deepcopy(upstream) core.table.insert(infos, new_upstream) - if new_upstream.value and new_upstream.value.parent then - new_upstream.value.parent = nil - end -- check the upstream id if upstream_id and upstream.value.id == upstream_id then return new_upstream @@ -332,6 +324,7 @@ function _M.dump_all_upstreams_info() return 200, infos end + function _M.dump_upstream_info() local upstreams = get_upstreams() local uri_segs = core.utils.split_uri(ngx_var.uri) @@ -354,9 +347,6 @@ local function iter_add_get_services_info(values, svc_id) local infos = {} for _, svc in core.config_util.iterate_values(values) do local new_svc = core.table.deepcopy(svc) - if new_svc.value.upstream and new_svc.value.upstream.parent then - new_svc.value.upstream.parent = nil - end -- remove healthcheck info new_svc.checker = nil new_svc.checker_idx = nil diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index e84f1244af76..53ccc1230950 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -35,6 +35,12 @@ if not is_http then end +local function get_healthchecker_name(value) + return "upstream#" .. (value.resource_key or value.upstream.resource_key) +end +_M.get_healthchecker_name = get_healthchecker_name + + local function fetch_latest_conf(resource_path) local resource_type, id -- Handle both formats: @@ -81,20 +87,13 @@ local function fetch_latest_conf(resource_path) end - -local function get_healthcheck_name(value) - return "upstream#" .. value.key -end - - - local function create_checker(up_conf) local local_conf = config_local.local_conf() if local_conf and local_conf.apisix and local_conf.apisix.disable_upstream_healthcheck then core.log.info("healthchecker won't be created: disabled upstream healthcheck") return nil end - core.log.warn("creating healthchecker for upstream: ", up_conf.key) + core.log.warn("creating healthchecker for upstream: ", up_conf.resource_key) if not healthcheck then healthcheck = require("resty.healthcheck") end @@ -102,7 +101,7 @@ local function create_checker(up_conf) core.log.warn("creating new healthchecker for ", up_conf.key) local checker, err = healthcheck.new({ - name = get_healthcheck_name(up_conf.parent), + name = get_healthchecker_name(up_conf), shm_name = healthcheck_shdict_name, checks = up_conf.checks, events_module = events:get_healthcheck_events_modele(), diff --git a/apisix/init.lua b/apisix/init.lua index 847d437eccc3..cc322a93c38c 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -252,7 +252,7 @@ local function parse_domain_in_route(route) -- don't modify the modifiedIndex to avoid plugin cache miss because of DNS resolve result -- has changed - route.dns_value = core.table.deepcopy(route.value, { shallows = { "self.upstream.parent"}}) + route.dns_value = core.table.deepcopy(route.value) route.dns_value.upstream.nodes = new_nodes if not route.dns_value._nodes_ver then route.dns_value._nodes_ver = 0 diff --git a/apisix/plugin.lua b/apisix/plugin.lua index 6cb876b4ca05..7a6d3392c0d2 100644 --- a/apisix/plugin.lua +++ b/apisix/plugin.lua @@ -594,7 +594,7 @@ end local function merge_service_route(service_conf, route_conf) - local new_conf = core.table.deepcopy(service_conf, { shallows = {"self.value.upstream.parent"}}) + local new_conf = core.table.deepcopy(service_conf) new_conf.value.service_id = new_conf.value.id new_conf.value.id = route_conf.value.id new_conf.modifiedIndex = route_conf.modifiedIndex @@ -612,8 +612,6 @@ local function merge_service_route(service_conf, route_conf) local route_upstream = route_conf.value.upstream if route_upstream then new_conf.value.upstream = route_upstream - -- when route's upstream override service's upstream, - -- the upstream.parent still point to the route new_conf.value.upstream_id = nil new_conf.has_domain = route_conf.has_domain end @@ -668,7 +666,7 @@ end local function merge_service_stream_route(service_conf, route_conf) -- because many fields in Service are not supported by stream route, -- so we copy the stream route as base object - local new_conf = core.table.deepcopy(route_conf, { shallows = {"self.value.upstream.parent"}}) + local new_conf = core.table.deepcopy(route_conf) if service_conf.value.plugins then for name, conf in pairs(service_conf.value.plugins) do if not new_conf.value.plugins then @@ -716,8 +714,7 @@ local function merge_consumer_route(route_conf, consumer_conf, consumer_group_co return route_conf end - local new_route_conf = core.table.deepcopy(route_conf, - { shallows = {"self.value.upstream.parent"}}) + local new_route_conf = core.table.deepcopy(route_conf) if consumer_group_conf then for name, conf in pairs(consumer_group_conf.value.plugins) do diff --git a/apisix/plugins/ai.lua b/apisix/plugins/ai.lua index 278201d4e56e..39430c7ad014 100644 --- a/apisix/plugins/ai.lua +++ b/apisix/plugins/ai.lua @@ -69,9 +69,7 @@ local default_keepalive_pool = {} local function create_router_matching_cache(api_ctx) orig_router_http_matching(api_ctx) - return core.table.deepcopy(api_ctx, { - shallows = { "self.matched_route.value.upstream.parent" } - }) + return core.table.deepcopy(api_ctx) end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 8ad37e3ad3be..906983079945 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -79,12 +79,6 @@ end _M.set = set_directly -local function get_healthchecker_name(value) - return "upstream#" .. value.key -end -_M.get_healthchecker_name = get_healthchecker_name - - local function set_upstream_scheme(ctx, upstream) -- plugins like proxy-rewrite may already set ctx.upstream_scheme if not ctx.upstream_scheme then @@ -220,8 +214,8 @@ function _M.set_by_route(route, api_ctx) up_conf.nodes = new_nodes end - local id = up_conf.parent.value.id - local conf_version = up_conf.parent.modifiedIndex + local id = up_conf.resource_id + local conf_version = up_conf.resource_version -- include the upstream object as part of the version, because the upstream will be changed -- by service discovery or dns resolver. set_directly(api_ctx, id, conf_version .. "#" .. tostring(up_conf) .. "#" @@ -447,16 +441,14 @@ function _M.check_upstream_conf(conf) return check_upstream_conf(false, conf) end -local inspect = require("inspect") + local function filter_upstream(value, parent) if not value then return end - - value.parent = parent value.resource_key = ((parent and parent.key) or value.key) - core.log.warn("VALUE IS ", inspect(value)) value.resource_version = ((parent and parent.modifiedIndex) or value.modifiedIndex) + value.resource_id = ((parent and parent.value.id) or value.id) if not is_http and value.scheme == "http" then -- For L4 proxy, the default scheme is "tcp" value.scheme = "tcp" From 31198decb92b126c824581bf35e7698de1dbd97a Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 14:28:59 +0530 Subject: [PATCH 29/51] remove log --- apisix/upstream.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 906983079945..51680bde8802 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -190,7 +190,6 @@ function _M.set_by_route(route, api_ctx) end local same = upstream_util.compare_upstream_node(up_conf, new_nodes) - core.log.warn("SETTING NODES_VER") if not same then if not up_conf._nodes_ver then up_conf._nodes_ver = 0 From 0efb277dec1e27c51252d378288c5a37aaba9558 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 18:07:33 +0530 Subject: [PATCH 30/51] fix tests --- apisix/balancer.lua | 2 +- apisix/init.lua | 7 ++++--- t/control/routes.t | 8 ++++---- t/control/services.t | 12 ++++++------ 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index ddca61a89830..d216a78ae1f2 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -214,7 +214,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local resource_version = up_conf.resource_version .. tostring(up_conf._nodes_ver or '') + local resource_version = up_conf.resource_version and (up_conf.resource_version .. tostring(up_conf._nodes_ver or '')) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 diff --git a/apisix/init.lua b/apisix/init.lua index cc322a93c38c..939faed05eec 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -840,15 +840,16 @@ local function healthcheck_passive(api_ctx) if not api_ctx.upstream_conf then return end - local resource_version = api_ctx.upstream_conf.resource_version .. - tostring(api_ctx.upstream_conf._nodes_ver or '') + local resource_version = (api_ctx.upstream_conf and api_ctx.upstream_conf.resource_version) and + (api_ctx.upstream_conf.resource_version .. + tostring(api_ctx.upstream_conf._nodes_ver or '')) local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf.resource_key, resource_version) if not checker then return end local up_conf = api_ctx.upstream_conf - local passive = up_conf.checks.passive + local passive = up_conf.checks and up_conf.checks.passive if not passive then return end diff --git a/t/control/routes.t b/t/control/routes.t index a24bc2c15e1a..b707655f6469 100644 --- a/t/control/routes.t +++ b/t/control/routes.t @@ -75,8 +75,8 @@ routes: end } } ---- response_body -{"upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.1","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"},"uris":["/hello"]} +--- response_body eval +qr/\{"upstream":\{"hash_on":"vars","nodes":\[\{"host":"127.0.0.1","port":1980,"weight":1\}\],"pass_host":"pass",.*"scheme":"http","type":"roundrobin"\},"uris":\["\/hello"\]\}/ @@ -108,8 +108,8 @@ routes: end } } ---- response_body -{"upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.1","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"},"uris":["/hello"]} +--- response_body eval +qr/\{"upstream":\{"hash_on":"vars","nodes":\[\{"host":"127.0.0.1","port":1980,"weight":1\}\],"pass_host":"pass",.*"scheme":"http","type":"roundrobin"\},"uris":\["\/hello"\]\}/ diff --git a/t/control/services.t b/t/control/services.t index 0003bcc9d1aa..3a959fe4c14b 100644 --- a/t/control/services.t +++ b/t/control/services.t @@ -75,8 +75,8 @@ services: return } } ---- response_body -{"id":"200","upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.1","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"}} +--- response_body eval +qr/\{"id":"200","upstream":\{"hash_on":"vars","nodes":\[\{"host":"127.0.0.1","port":1980,"weight":1\}\],"pass_host":"pass".*,"scheme":"http","type":"roundrobin"\}\}/ @@ -117,8 +117,8 @@ services: return } } ---- response_body -[{"id":"200","upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.1","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"}},{"id":"201","upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.2","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"}}] +--- response_body eval +qr/\{"id":"200","upstream":\{"hash_on":"vars","nodes":\[\{"host":"127.0.0.1","port":1980,"weight":1\}\],"pass_host":"pass".*,"scheme":"http","type":"roundrobin"\}\}/ @@ -156,8 +156,8 @@ services: return } } ---- response_body -{"id":"5","plugins":{"limit-count":{"allow_degradation":false,"count":2,"key":"remote_addr","key_type":"var","policy":"local","rejected_code":503,"show_limit_quota_header":true,"time_window":60}},"upstream":{"hash_on":"vars","nodes":[{"host":"127.0.0.1","port":1980,"weight":1}],"pass_host":"pass","scheme":"http","type":"roundrobin"}} +--- response_body eval +qr/\{"id":"5","plugins":\{"limit-count":\{"allow_degradation":false,"count":2,"key":"remote_addr","key_type":"var","policy":"local","rejected_code":503,"show_limit_quota_header":true,"time_window":60\}\},"upstream":\{"hash_on":"vars","nodes":\[\{"host":"127.0.0.1","port":1980,"weight":1\}\],"pass_host":"pass",.*"scheme":"http","type":"roundrobin"\}\}/ From 2588a407d9439897d40ca4f9339a3c45f4873d6b Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 19:06:07 +0530 Subject: [PATCH 31/51] dont run timer while worker exiting --- apisix/healthcheck_manager.lua | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 53ccc1230950..0bc4a82253e9 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -16,6 +16,7 @@ -- local require = require local ipairs = ipairs +local exiting = ngx.worker.exiting local pairs = pairs local tostring = tostring local core = require("apisix.core") @@ -259,22 +260,26 @@ end function _M.init_worker() timer_every(1, function () - if timer_create_checker_running then - core.log.warn("timer_create_checker is already running, skipping this iteration") - return + if not exiting() then + if timer_create_checker_running then + core.log.warn("timer_create_checker is already running, skipping this iteration") + return + end + timer_create_checker_running = true + pcall(_M.timer_create_checker) + timer_create_checker_running = false end - timer_create_checker_running = true - pcall(_M.timer_create_checker) - timer_create_checker_running = false end) timer_every(1, function () - if timer_working_pool_check_running then - core.log.warn("timer_working_pool_check is already running, skipping this iteration") - return + if not exiting() then + if timer_working_pool_check_running then + core.log.warn("timer_working_pool_check is already running, skipping this iteration") + return + end + timer_working_pool_check_running = true + pcall(_M.timer_working_pool_check) + timer_working_pool_check_running = false end - timer_working_pool_check_running = true - pcall(_M.timer_working_pool_check) - timer_working_pool_check_running = false end) end From c110c510159b89e11783781483de2b600b24adb4 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 20:15:01 +0530 Subject: [PATCH 32/51] fix lint and tests --- apisix/balancer.lua | 3 ++- apisix/healthcheck_manager.lua | 7 ++++--- apisix/init.lua | 3 ++- t/node/healthcheck-discovery.t | 6 +++--- t/node/healthcheck-stop-checker.t | 4 ++-- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index d216a78ae1f2..03fcd33e9fd3 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -214,7 +214,8 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local resource_version = up_conf.resource_version and (up_conf.resource_version .. tostring(up_conf._nodes_ver or '')) + local resource_version = up_conf.resource_version and (up_conf.resource_version .. + tostring(up_conf._nodes_ver or '')) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 0bc4a82253e9..07e2b48519eb 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -191,7 +191,8 @@ function _M.timer_create_checker() for resource_path, resource_ver in pairs(waiting_snapshot) do do if find_in_working_pool(resource_path, resource_ver) then - core.log.info("resource: ", resource_path, " already in working pool with version: ", + core.log.info("resource: ", resource_path, + " already in working pool with version: ", resource_ver) goto continue end @@ -267,13 +268,13 @@ function _M.init_worker() end timer_create_checker_running = true pcall(_M.timer_create_checker) - timer_create_checker_running = false + timer_create_checker_running = false end end) timer_every(1, function () if not exiting() then if timer_working_pool_check_running then - core.log.warn("timer_working_pool_check is already running, skipping this iteration") + core.log.warn("timer_working_pool_check is already running skipping iteration") return end timer_working_pool_check_running = true diff --git a/apisix/init.lua b/apisix/init.lua index 939faed05eec..dc7ac387deed 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -843,7 +843,8 @@ local function healthcheck_passive(api_ctx) local resource_version = (api_ctx.upstream_conf and api_ctx.upstream_conf.resource_version) and (api_ctx.upstream_conf.resource_version .. tostring(api_ctx.upstream_conf._nodes_ver or '')) - local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf.resource_key, resource_version) + local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf.resource_key, + resource_version) if not checker then return end diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index 3dcbf7541d39..9978bba93f2f 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -145,15 +145,15 @@ routes: local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello" local httpc = http.new() local res, err = httpc:request_uri(uri, {method = "GET", keepalive = false}) - ngx.sleep(25) + ngx.sleep(20) ngx.say(res.status) } } --- grep_error_log eval -qr/(create new checker|try to release checker): table/ +qr/(create new checker|releasing existing checker): table/ --- grep_error_log_out create new checker: table -try to release checker: table +releasing existing checker: table create new checker: table --- timeout: 30 diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index 83cddc8650c9..c8be5b122284 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -239,7 +239,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(5) + ngx.sleep(10) ngx.say("ok") } } @@ -254,4 +254,4 @@ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x try to release checker: table: 0x ---- timeout: 17 +--- timeout: 25 From 3de4f23abf60db9578b938f6304dc76c2a33b308 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 21:51:34 +0530 Subject: [PATCH 33/51] fix rr-balance --- t/node/rr-balance.t | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/t/node/rr-balance.t b/t/node/rr-balance.t index 74bbf9ea8f31..69c4cfebb917 100644 --- a/t/node/rr-balance.t +++ b/t/node/rr-balance.t @@ -138,7 +138,10 @@ passed local http = require "resty.http" local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port" - + --- to trigger the healthchecker + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + ngx.sleep(3) local ports_count = {} for i = 1, 12 do local httpc = http.new() @@ -211,7 +214,10 @@ passed local http = require "resty.http" local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port" - + --- to trigger the healthchecker + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + ngx.sleep(3) local ports_count = {} for i = 1, 12 do local httpc = http.new() @@ -284,7 +290,10 @@ passed local http = require "resty.http" local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port" - + --- to trigger the healthchecker + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + ngx.sleep(3) local ports_count = {} for i = 1, 12 do local httpc = http.new() From 68796dce9bd618728d4f279680ae9da4c26116fc Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 17 Jul 2025 21:56:28 +0530 Subject: [PATCH 34/51] add sleep in healthcheck-stop-checker --- t/node/healthcheck-stop-checker.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index c8be5b122284..ee5aba699daf 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -239,7 +239,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(10) + ngx.sleep(17) ngx.say("ok") } } @@ -254,4 +254,4 @@ create new checker: table: 0x try to release checker: table: 0x create new checker: table: 0x try to release checker: table: 0x ---- timeout: 25 +--- timeout: 30 From 3b94bd9f5e1e6e6c89e2f4479e0ac24231fb5769 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Fri, 18 Jul 2025 10:54:57 +0530 Subject: [PATCH 35/51] apply suggestion --- apisix/balancer.lua | 5 +---- apisix/init.lua | 9 +-------- apisix/upstream.lua | 13 ++++++++----- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 03fcd33e9fd3..e615893f2e69 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -214,10 +214,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key - local resource_version = up_conf.resource_version and (up_conf.resource_version .. - tostring(up_conf._nodes_ver or '')) - local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) - + local checker = ctx.up_checker ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then if ctx.server_picker and ctx.server_picker.after_balance then diff --git a/apisix/init.lua b/apisix/init.lua index dc7ac387deed..24bb35befb0d 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -837,14 +837,7 @@ end local function healthcheck_passive(api_ctx) - if not api_ctx.upstream_conf then - return - end - local resource_version = (api_ctx.upstream_conf and api_ctx.upstream_conf.resource_version) and - (api_ctx.upstream_conf.resource_version .. - tostring(api_ctx.upstream_conf._nodes_ver or '')) - local checker = healthcheck_manager.fetch_checker(api_ctx.upstream_conf.resource_key, - resource_version) + local checker = api_ctx.up_checker if not checker then return end diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 51680bde8802..ec114b2bdb11 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -243,8 +243,10 @@ function _M.set_by_route(route, api_ctx) ngx_var.upstream_sni = sni end end - - healthcheck_manager.fetch_checker(up_conf.resource_key, up_conf.resource_version) + local resource_version = up_conf.resource_version and (up_conf.resource_version .. + tostring(up_conf._nodes_ver or '')) + local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) + api_ctx.up_checker = checker return end @@ -254,9 +256,10 @@ function _M.set_by_route(route, api_ctx) if not ok then return 503, err end - - healthcheck_manager.fetch_checker(up_conf.resource_key, up_conf.resource_version) - + local resource_version = up_conf.resource_version and (up_conf.resource_version .. + tostring(up_conf._nodes_ver or '')) + local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) + api_ctx.up_checker = checker local scheme = up_conf.scheme if (scheme == "https" or scheme == "grpcs") and up_conf.tls then From 4621bedc58c089fe0ed26a2d5f98f0c4e9c54a53 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Fri, 18 Jul 2025 10:56:15 +0530 Subject: [PATCH 36/51] fix lint --- apisix/healthcheck_manager.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 07e2b48519eb..a740beb94235 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -16,6 +16,7 @@ -- local require = require local ipairs = ipairs +local pcall = pcall local exiting = ngx.worker.exiting local pairs = pairs local tostring = tostring From 47a04c6f05630895417a083da0a7f0f3644e041e Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Fri, 18 Jul 2025 10:57:57 +0530 Subject: [PATCH 37/51] add lint --- apisix/balancer.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index e615893f2e69..9d20212d265c 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -215,6 +215,7 @@ local function pick_server(route, ctx) local version = ctx.upstream_version local key = ctx.upstream_key local checker = ctx.up_checker + ctx.balancer_try_count = (ctx.balancer_try_count or 0) + 1 if ctx.balancer_try_count > 1 then if ctx.server_picker and ctx.server_picker.after_balance then From cc363da3469a7dc8b25a8050da3ab47b40d2766b Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Fri, 18 Jul 2025 11:05:28 +0530 Subject: [PATCH 38/51] fix lint --- apisix/init.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/apisix/init.lua b/apisix/init.lua index 24bb35befb0d..941cfa9589e0 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -27,7 +27,6 @@ require("jit.opt").start("minstitch=2", "maxtrace=4000", require("apisix.patch").patch() local core = require("apisix.core") -local healthcheck_manager = require("apisix.healthcheck_manager") local plugin = require("apisix.plugin") local plugin_config = require("apisix.plugin_config") local consumer_group = require("apisix.consumer_group") From 9288707f1a7c5f75257658f6a67ab70051575c4a Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Fri, 18 Jul 2025 11:50:32 +0530 Subject: [PATCH 39/51] fix stop-checker --- t/node/healthcheck-stop-checker.t | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/t/node/healthcheck-stop-checker.t b/t/node/healthcheck-stop-checker.t index ee5aba699daf..2fecc8b88cd9 100644 --- a/t/node/healthcheck-stop-checker.t +++ b/t/node/healthcheck-stop-checker.t @@ -190,7 +190,7 @@ create new checker: table: 0x return end - ngx.sleep(3) + ngx.sleep(1) code, _, body = t('/server_port', "GET") if code > 300 then @@ -199,7 +199,7 @@ create new checker: table: 0x return end - ngx.sleep(0.5) + ngx.sleep(3) -- update code, _, body = t('/apisix/admin/upstreams/stopchecker', @@ -213,7 +213,7 @@ create new checker: table: 0x return end - ngx.sleep(4) + ngx.sleep(3) code, _, body = t('/server_port', "GET") if code > 300 then @@ -222,6 +222,7 @@ create new checker: table: 0x return end + ngx.sleep(3) -- delete code, _, body = t('/apisix/admin/routes/1', "DELETE") @@ -230,7 +231,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(4) -- wait for routes delete event synced + ngx.sleep(3) -- wait for routes delete event synced code, _, body = t('/apisix/admin/upstreams/stopchecker', "DELETE") @@ -239,7 +240,7 @@ create new checker: table: 0x ngx.say(body) return end - ngx.sleep(17) + ngx.sleep(10) ngx.say("ok") } } From 98a65e5ee2fdb62fe6c36e79a0b624c9bc4580ec Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 21 Jul 2025 10:12:14 +0530 Subject: [PATCH 40/51] change warn to info --- apisix/healthcheck_manager.lua | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index a740beb94235..a57660b7ebe3 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -95,13 +95,11 @@ local function create_checker(up_conf) core.log.info("healthchecker won't be created: disabled upstream healthcheck") return nil end - core.log.warn("creating healthchecker for upstream: ", up_conf.resource_key) + core.log.info("creating healthchecker for upstream: ", up_conf.resource_key) if not healthcheck then healthcheck = require("resty.healthcheck") end - core.log.warn("creating new healthchecker for ", up_conf.key) - local checker, err = healthcheck.new({ name = get_healthchecker_name(up_conf), shm_name = healthcheck_shdict_name, @@ -175,14 +173,14 @@ local function find_in_working_pool(resource_path, resource_ver) end if checker.version ~= resource_ver then - core.log.warn("version mismatch for resource: ", resource_path, + core.log.info("version mismatch for resource: ", resource_path, " current version: ", checker.version, " requested version: ", resource_ver) return nil -- version not match end return checker end -local timer_create_checker_running = false + function _M.timer_create_checker() if core.table.nkeys(_M.waiting_pool) == 0 then return @@ -261,6 +259,7 @@ function _M.timer_working_pool_check() end function _M.init_worker() + local timer_create_checker_running = false timer_every(1, function () if not exiting() then if timer_create_checker_running then From 66fc3835856fce3b8a09e59cb8a0b6857dbd7980 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 21 Jul 2025 11:11:30 +0530 Subject: [PATCH 41/51] apply suggestions --- apisix/healthcheck_manager.lua | 14 +++++++++++--- apisix/upstream.lua | 8 +++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index a57660b7ebe3..c324f7f5b0b2 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -181,6 +181,14 @@ local function find_in_working_pool(resource_path, resource_ver) end +function _M.upstream_version(index, nodes_ver) + if not index then + return + end + return index .. tostring(nodes_ver or '') +end + + function _M.timer_create_checker() if core.table.nkeys(_M.waiting_pool) == 0 then return @@ -200,8 +208,8 @@ function _M.timer_create_checker() goto continue end local upstream = res_conf.value.upstream or res_conf.value - local new_version = res_conf.modifiedIndex .. tostring(upstream._nodes_ver or '') - core.log.warn("checking waiting pool for resource: ", resource_path, + local new_version = _M.upstream_version(res_conf.modifiedIndex, upstream._nodes_ver) + core.log.info("checking waiting pool for resource: ", resource_path, " current version: ", new_version, " requested version: ", resource_ver) if resource_ver ~= new_version then goto continue @@ -244,7 +252,7 @@ function _M.timer_working_pool_check() _M.working_pool[resource_path] = nil goto continue end - local current_ver = res_conf.modifiedIndex .. tostring(res_conf.value._nodes_ver or '') + local current_ver = _M.upstream_version(res_conf.modifiedIndex, res_conf.value.upstream._nodes_ver) core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then diff --git a/apisix/upstream.lua b/apisix/upstream.lua index ec114b2bdb11..2a292d89a60a 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -243,8 +243,7 @@ function _M.set_by_route(route, api_ctx) ngx_var.upstream_sni = sni end end - local resource_version = up_conf.resource_version and (up_conf.resource_version .. - tostring(up_conf._nodes_ver or '')) + local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, up_conf._nodes_ver) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) api_ctx.up_checker = checker return @@ -256,8 +255,7 @@ function _M.set_by_route(route, api_ctx) if not ok then return 503, err end - local resource_version = up_conf.resource_version and (up_conf.resource_version .. - tostring(up_conf._nodes_ver or '')) + local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, up_conf._nodes_ver ) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) api_ctx.up_checker = checker local scheme = up_conf.scheme @@ -448,7 +446,7 @@ local function filter_upstream(value, parent) if not value then return end - value.resource_key = ((parent and parent.key) or value.key) + value.resource_key = parent and parent.key value.resource_version = ((parent and parent.modifiedIndex) or value.modifiedIndex) value.resource_id = ((parent and parent.value.id) or value.id) if not is_http and value.scheme == "http" then From 9a56f59f7bea0987809af7ff7a07eb7475453f3d Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 21 Jul 2025 11:20:10 +0530 Subject: [PATCH 42/51] fix lint --- apisix/healthcheck_manager.lua | 3 ++- apisix/upstream.lua | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index c324f7f5b0b2..47e30229c9b1 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -252,7 +252,8 @@ function _M.timer_working_pool_check() _M.working_pool[resource_path] = nil goto continue end - local current_ver = _M.upstream_version(res_conf.modifiedIndex, res_conf.value.upstream._nodes_ver) + local current_ver = _M.upstream_version(res_conf.modifiedIndex, + res_conf.value.upstream._nodes_ver) core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then diff --git a/apisix/upstream.lua b/apisix/upstream.lua index 2a292d89a60a..e55694f7ec25 100644 --- a/apisix/upstream.lua +++ b/apisix/upstream.lua @@ -243,7 +243,8 @@ function _M.set_by_route(route, api_ctx) ngx_var.upstream_sni = sni end end - local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, up_conf._nodes_ver) + local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, + up_conf._nodes_ver) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) api_ctx.up_checker = checker return @@ -255,7 +256,8 @@ function _M.set_by_route(route, api_ctx) if not ok then return 503, err end - local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, up_conf._nodes_ver ) + local resource_version = healthcheck_manager.upstream_version(up_conf.resource_version, + up_conf._nodes_ver ) local checker = healthcheck_manager.fetch_checker(up_conf.resource_key, resource_version) api_ctx.up_checker = checker local scheme = up_conf.scheme From f11e3914bb677a70f2b7267a30ee1df82fb91aff Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 21 Jul 2025 12:03:18 +0530 Subject: [PATCH 43/51] fix CI --- apisix/healthcheck_manager.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 47e30229c9b1..864b6d91c01d 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -253,7 +253,7 @@ function _M.timer_working_pool_check() goto continue end local current_ver = _M.upstream_version(res_conf.modifiedIndex, - res_conf.value.upstream._nodes_ver) + res_conf.value._nodes_ver) core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then From 30f12a5b312d9f3c0a34280fb96e018243a3e285 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Mon, 21 Jul 2025 12:20:45 +0530 Subject: [PATCH 44/51] put timer in local --- apisix/healthcheck_manager.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 864b6d91c01d..10dde0467534 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -235,7 +235,7 @@ function _M.timer_create_checker() end end -local timer_working_pool_check_running = false + function _M.timer_working_pool_check() if core.table.nkeys(_M.working_pool) == 0 then return @@ -269,6 +269,7 @@ end function _M.init_worker() local timer_create_checker_running = false + local timer_working_pool_check_running = false timer_every(1, function () if not exiting() then if timer_create_checker_running then From 17b739791b4e06bc80f4331ef2fcd9d50da90561 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Tue, 22 Jul 2025 14:22:49 +0530 Subject: [PATCH 45/51] apply copilot suggestion --- apisix/healthcheck_manager.lua | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 10dde0467534..de8757c2e8c4 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -30,6 +30,7 @@ local _M = { working_pool = {}, -- resource_path -> {version = ver, checker = checker} waiting_pool = {} -- resource_path -> resource_ver } +local DELAYED_CLEAR_TIMEOUT = 10 local healthcheck_shdict_name = "upstream-healthcheck" local is_http = ngx.config.subsystem == "http" if not is_http then @@ -245,8 +246,8 @@ function _M.timer_working_pool_check() for resource_path, item in pairs(working_snapshot) do --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) - if not res_conf then - item.checker:delayed_clear(10) + if not res_conf or not res_conf.value then + item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) item.checker:stop() core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil @@ -257,7 +258,7 @@ function _M.timer_working_pool_check() core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then - item.checker:delayed_clear(10) + item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) item.checker:stop() core.log.info("try to release checker: ", tostring(item.checker)) _M.working_pool[resource_path] = nil From 15979b3651ecb6fbc6837cdddbd6bfdb8e0465ae Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 23 Jul 2025 12:28:00 +0530 Subject: [PATCH 46/51] apply suggestions --- apisix/balancer.lua | 2 +- apisix/healthcheck_manager.lua | 58 ++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 9d20212d265c..6e3675e54933 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -19,6 +19,7 @@ local balancer = require("ngx.balancer") local core = require("apisix.core") local priority_balancer = require("apisix.balancer.priority") local apisix_upstream = require("apisix.upstream") +local healthcheck_manager = require("apisix.healthcheck_manager") local ipairs = ipairs local is_http = ngx.config.subsystem == "http" local enable_keepalive = balancer.enable_keepalive and is_http @@ -27,7 +28,6 @@ local get_last_failure = balancer.get_last_failure local set_timeouts = balancer.set_timeouts local ngx_now = ngx.now local str_byte = string.byte -local healthcheck_manager = require("apisix.healthcheck_manager") local module_name = "balancer" local pickers = {} diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index de8757c2e8c4..db2e6f1704ef 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -26,10 +26,12 @@ local healthcheck local events = require("apisix.events") local tab_clone = core.table.clone local timer_every = ngx.timer.every -local _M = { - working_pool = {}, -- resource_path -> {version = ver, checker = checker} - waiting_pool = {} -- resource_path -> resource_ver -} +local string_sub = string.sub + +local _M = {} +local working_pool = {} -- resource_path -> {version = ver, checker = checker} +local waiting_pool = {} -- resource_path -> resource_ver + local DELAYED_CLEAR_TIMEOUT = 10 local healthcheck_shdict_name = "upstream-healthcheck" local is_http = ngx.config.subsystem == "http" @@ -44,16 +46,26 @@ end _M.get_healthchecker_name = get_healthchecker_name +local function remove_etcd_prefix(key) + local prefix = "" + local local_conf = config_local.local_conf() + local role = core.table.try_read_attr(local_conf, "deployment", "role") + local provider = core.table.try_read_attr(local_conf, "deployment", "role_" .. + role, "config_provider") + if provider == "etcd" and local_conf.etcd and local_conf.etcd.prefix then + prefix = local_conf.etcd.prefix + end + return string_sub(key, #prefix + 1) +end + + local function fetch_latest_conf(resource_path) local resource_type, id -- Handle both formats: - -- 1. /apisix// + -- 1. /// -- 2. // - if resource_path:find("^/apisix/") then - resource_type, id = resource_path:match("^/apisix/([^/]+)/([^/]+)$") - else - resource_type, id = resource_path:match("^/([^/]+)/([^/]+)$") - end + resource_path = remove_etcd_prefix(resource_path) + resource_type, id = resource_path:match("^/([^/]+)/([^/]+)$") if not resource_type or not id then core.log.error("invalid resource path: ", resource_path) return nil @@ -134,18 +146,18 @@ end function _M.fetch_checker(resource_path, resource_ver) - local working_item = _M.working_pool[resource_path] + local working_item = working_pool[resource_path] if working_item and working_item.version == resource_ver then return working_item.checker end - if _M.waiting_pool[resource_path] == resource_ver then + if waiting_pool[resource_path] == resource_ver then return nil end -- Add to waiting pool with version core.log.info("adding ", resource_path, " to waiting pool with version: ", resource_ver) - _M.waiting_pool[resource_path] = resource_ver + waiting_pool[resource_path] = resource_ver return nil end @@ -161,14 +173,14 @@ end local function add_working_pool(resource_path, resource_ver, checker) - _M.working_pool[resource_path] = { + working_pool[resource_path] = { version = resource_ver, checker = checker } end local function find_in_working_pool(resource_path, resource_ver) - local checker = _M.working_pool[resource_path] + local checker = working_pool[resource_path] if not checker then return nil -- not found end @@ -191,11 +203,11 @@ end function _M.timer_create_checker() - if core.table.nkeys(_M.waiting_pool) == 0 then + if core.table.nkeys(waiting_pool) == 0 then return end - local waiting_snapshot = tab_clone(_M.waiting_pool) + local waiting_snapshot = tab_clone(waiting_pool) for resource_path, resource_ver in pairs(waiting_snapshot) do do if find_in_working_pool(resource_path, resource_ver) then @@ -217,7 +229,7 @@ function _M.timer_create_checker() end -- if a checker exists then delete it before creating a new one - local existing_checker = _M.working_pool[resource_path] + local existing_checker = working_pool[resource_path] if existing_checker then existing_checker.checker:delayed_clear(10) existing_checker.checker:stop() @@ -232,17 +244,17 @@ function _M.timer_create_checker() end ::continue:: - _M.waiting_pool[resource_path] = nil + waiting_pool[resource_path] = nil end end function _M.timer_working_pool_check() - if core.table.nkeys(_M.working_pool) == 0 then + if core.table.nkeys(working_pool) == 0 then return end - local working_snapshot = tab_clone(_M.working_pool) + local working_snapshot = tab_clone(working_pool) for resource_path, item in pairs(working_snapshot) do --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) @@ -250,7 +262,7 @@ function _M.timer_working_pool_check() item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) item.checker:stop() core.log.info("try to release checker: ", tostring(item.checker)) - _M.working_pool[resource_path] = nil + working_pool[resource_path] = nil goto continue end local current_ver = _M.upstream_version(res_conf.modifiedIndex, @@ -261,7 +273,7 @@ function _M.timer_working_pool_check() item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) item.checker:stop() core.log.info("try to release checker: ", tostring(item.checker)) - _M.working_pool[resource_path] = nil + working_pool[resource_path] = nil end ::continue:: From e725ad5e56c1189aa0b53af87653b98427690286 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 23 Jul 2025 12:43:15 +0530 Subject: [PATCH 47/51] skip creating checker if up_conf.checks nil --- apisix/healthcheck_manager.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index db2e6f1704ef..e17c8d626816 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -103,6 +103,9 @@ end local function create_checker(up_conf) + if not up_conf.checks then + return nil + end local local_conf = config_local.local_conf() if local_conf and local_conf.apisix and local_conf.apisix.disable_upstream_healthcheck then core.log.info("healthchecker won't be created: disabled upstream healthcheck") From 435918286b1b8d0f690be047765082eb23974d14 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 31 Jul 2025 11:00:22 +0530 Subject: [PATCH 48/51] apply suggestions --- apisix/healthcheck_manager.lua | 36 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index e17c8d626816..bf67b410f8bd 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -236,13 +236,15 @@ function _M.timer_create_checker() if existing_checker then existing_checker.checker:delayed_clear(10) existing_checker.checker:stop() - core.log.info("releasing existing checker: ", tostring(existing_checker.checker)) + core.log.info("releasing existing checker: ", tostring(existing_checker.checker), + " for resource: ", resource_path, " and version: ", existing_checker.version) end local checker = create_checker(upstream) if not checker then goto continue end - core.log.info("create new checker: ", tostring(checker)) + core.log.info("create new checker: ", tostring(checker), " for resource: ", + resource_path, " and version: ", resource_ver) add_working_pool(resource_path, resource_ver, checker) end @@ -261,11 +263,9 @@ function _M.timer_working_pool_check() for resource_path, item in pairs(working_snapshot) do --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) + local need_free = true if not res_conf or not res_conf.value then - item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) - item.checker:stop() - core.log.info("try to release checker: ", tostring(item.checker)) - working_pool[resource_path] = nil + need_free = true goto continue end local current_ver = _M.upstream_version(res_conf.modifiedIndex, @@ -273,13 +273,19 @@ function _M.timer_working_pool_check() core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version ~= current_ver then - item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) - item.checker:stop() - core.log.info("try to release checker: ", tostring(item.checker)) - working_pool[resource_path] = nil + need_free = true + goto continue end ::continue:: + if need_free then + working_pool[resource_path] = nil + item.checker.dead = true + item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) + item.checker:stop() + core.log.info("try to release checker: ", tostring(item.checker), " for resource: ", + resource_path, " and version : ", item.version) + end end end @@ -293,7 +299,10 @@ function _M.init_worker() return end timer_create_checker_running = true - pcall(_M.timer_create_checker) + local ok, err = pcall(_M.timer_create_checker) + if not ok then + core.log.error("failed to run timer_create_checker: ", err) + end timer_create_checker_running = false end end) @@ -304,7 +313,10 @@ function _M.init_worker() return end timer_working_pool_check_running = true - pcall(_M.timer_working_pool_check) + local ok, err = pcall(_M.timer_working_pool_check) + if not ok then + core.log.error("failed to run timer_working_pool_check: ", err) + end timer_working_pool_check_running = false end end) From 1dca2f1a8bceb445511c16115f97f8af8cb8bc11 Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 31 Jul 2025 11:08:26 +0530 Subject: [PATCH 49/51] fix lint --- apisix/healthcheck_manager.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index bf67b410f8bd..05f42ce1a243 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -237,7 +237,8 @@ function _M.timer_create_checker() existing_checker.checker:delayed_clear(10) existing_checker.checker:stop() core.log.info("releasing existing checker: ", tostring(existing_checker.checker), - " for resource: ", resource_path, " and version: ", existing_checker.version) + " for resource: ", resource_path, " and version: ", + existing_checker.version) end local checker = create_checker(upstream) if not checker then From 90e993bdba91384ceec69f3f760b50fee1a1b85f Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 31 Jul 2025 11:23:32 +0530 Subject: [PATCH 50/51] fix --- apisix/healthcheck_manager.lua | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 05f42ce1a243..fbf6722f92d8 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -265,20 +265,16 @@ function _M.timer_working_pool_check() --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) local need_free = true - if not res_conf or not res_conf.value then - need_free = true - goto continue - end - local current_ver = _M.upstream_version(res_conf.modifiedIndex, - res_conf.value._nodes_ver) - core.log.info("checking working pool for resource: ", resource_path, - " current version: ", current_ver, " item version: ", item.version) - if item.version ~= current_ver then - need_free = true - goto continue + if res_conf and res_conf.value then + local current_ver = _M.upstream_version(res_conf.modifiedIndex, + res_conf.value._nodes_ver) + core.log.info("checking working pool for resource: ", resource_path, + " current version: ", current_ver, " item version: ", item.version) + if item.version == current_ver then + need_free = false + end end - ::continue:: if need_free then working_pool[resource_path] = nil item.checker.dead = true From 0a4e23fec1e283a868d35c298940d268abcb02cb Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Thu, 31 Jul 2025 11:55:53 +0530 Subject: [PATCH 51/51] apply suggestions --- apisix/healthcheck_manager.lua | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index fbf6722f92d8..066349829b9e 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -94,7 +94,8 @@ local function fetch_latest_conf(resource_path) if not resource then -- this can happen if the resource was deleted -- after the this function was called so we don't throw error - core.log.warn("resource not found: ", id, " in ", key) + core.log.warn("resource not found: ", id, " in ", key, + "this can happen if the resource was deleted") return nil end @@ -205,7 +206,7 @@ function _M.upstream_version(index, nodes_ver) end -function _M.timer_create_checker() +local function timer_create_checker() if core.table.nkeys(waiting_pool) == 0 then return end @@ -234,7 +235,7 @@ function _M.timer_create_checker() -- if a checker exists then delete it before creating a new one local existing_checker = working_pool[resource_path] if existing_checker then - existing_checker.checker:delayed_clear(10) + existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) existing_checker.checker:stop() core.log.info("releasing existing checker: ", tostring(existing_checker.checker), " for resource: ", resource_path, " and version: ", @@ -255,7 +256,7 @@ function _M.timer_create_checker() end -function _M.timer_working_pool_check() +local function timer_working_pool_check() if core.table.nkeys(working_pool) == 0 then return end @@ -264,18 +265,18 @@ function _M.timer_working_pool_check() for resource_path, item in pairs(working_snapshot) do --- remove from working pool if resource doesn't exist local res_conf = fetch_latest_conf(resource_path) - local need_free = true + local need_destroy = true if res_conf and res_conf.value then local current_ver = _M.upstream_version(res_conf.modifiedIndex, res_conf.value._nodes_ver) core.log.info("checking working pool for resource: ", resource_path, " current version: ", current_ver, " item version: ", item.version) if item.version == current_ver then - need_free = false + need_destroy = false end end - if need_free then + if need_destroy then working_pool[resource_path] = nil item.checker.dead = true item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) @@ -296,7 +297,7 @@ function _M.init_worker() return end timer_create_checker_running = true - local ok, err = pcall(_M.timer_create_checker) + local ok, err = pcall(timer_create_checker) if not ok then core.log.error("failed to run timer_create_checker: ", err) end @@ -310,7 +311,7 @@ function _M.init_worker() return end timer_working_pool_check_running = true - local ok, err = pcall(_M.timer_working_pool_check) + local ok, err = pcall(timer_working_pool_check) if not ok then core.log.error("failed to run timer_working_pool_check: ", err) end