@@ -23,62 +23,72 @@ import (
2323func TestFallback (tt * testing.T ) {
2424 t := commontesthelpers .NewE (tt )
2525 cs := getClients (t )
26+ ctx := context .TODO ()
2627
2728 t .Log ("Starting the fallback test" )
28- clusterStateWaitPollTimeout , clusterMustBeReadyFor , waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform (t , cs )
29+ clusterStateWaitPollTimeout , clusterMustBeReadyForBeforeTest , clusterMustBeReadyFor , waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform (t , cs )
2930
3031 // before starting a new test, make sure the current state of the cluster is good
31- ensureClusterInGoodState (t , cs , clusterStateWaitPollTimeout , clusterMustBeReadyFor )
32+ ensureClusterInGoodState (ctx , t , cs , clusterStateWaitPollTimeout , clusterMustBeReadyForBeforeTest )
3233
3334 // cause a disruption: add a "non-existing-flag" entry under apiServerArguments in the unsupported config
3435 cfg := getDefaultUnsupportedConfigForCurrentPlatform (t , cs )
3536 cfg ["apiServerArguments" ] = map [string ][]string {"non-existing-flag" : {"true" }}
3637 setUnsupportedConfig (t , cs , cfg )
3738
3839 // validate that the fallback degraded condition is reported, then assert the fallback on the node status and the KAS pod annotation for the failed revision
39- waitForFallbackDegradedCondition (t , cs , waitForFallbackDegradedConditionTimeout )
40+ waitForFallbackDegradedCondition (ctx , t , cs , waitForFallbackDegradedConditionTimeout )
4041 nodeName , failedRevision := assertFallbackOnNodeStatus (t , cs )
4142 assertKasPodAnnotatedOnNode (t , cs , failedRevision , nodeName )
4243
43- // clean up
44+ // clean up: restore the default unsupported config; some extra time (clusterMustBeReadyFor) is needed to wait for the KAS operator to be ready
4445 setUnsupportedConfig (t , cs , getDefaultUnsupportedConfigForCurrentPlatform (t , cs ))
45- err := waitForClusterInGoodState (t , cs , clusterStateWaitPollTimeout , clusterMustBeReadyFor )
46+ err := waitForClusterInGoodState (ctx , t , cs , clusterStateWaitPollTimeout , clusterMustBeReadyFor )
4647 require .NoError (t , err )
4748}
4849
4950// ensureClusterInGoodState applies the default unsupported config and then requires waitForClusterInGoodState to succeed, using mustBeReadyFor as the readiness period and waitPollTimeout as the overall poll timeout
5051// in addition, in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the test starts; the test fails (require.NoError) if the wait returns an error
51- func ensureClusterInGoodState (t testing.TB , cs clientSet , waitPollTimeout , mustBeReadyFor time.Duration ) {
52+ func ensureClusterInGoodState (ctx context. Context , t testing.TB , cs clientSet , waitPollTimeout , mustBeReadyFor time.Duration ) {
5253 setUnsupportedConfig (t , cs , getDefaultUnsupportedConfigForCurrentPlatform (t , cs ))
53- err := waitForClusterInGoodState (t , cs , waitPollTimeout , mustBeReadyFor )
54+ err := waitForClusterInGoodState (ctx , t , cs , waitPollTimeout , mustBeReadyFor )
5455 require .NoError (t , err )
5556}
5657
5758// waitForClusterInGoodState polls the "cluster" operator resource until no node is progressing and mustBeReadyFor has elapsed since polling started; it returns whatever wait.Poll returns (an error if waitPollTimeout expires first)
58- func waitForClusterInGoodState (t testing.TB , cs clientSet , waitPollTimeout , mustBeReadyFor time.Duration ) error {
59+ func waitForClusterInGoodState (ctx context. Context , t testing.TB , cs clientSet , waitPollTimeout , mustBeReadyFor time.Duration ) error {
5960 t .Helper ()
6061
6162 startTs := time .Now ()
62- t .Logf ("Waiting %s for the cluster to be in a good condition, interval = 10s , timeout %v" , mustBeReadyFor .String (), waitPollTimeout )
63+ t .Logf ("Waiting %s for the cluster to be in a good condition, interval = 20s , timeout %v" , mustBeReadyFor .String (), waitPollTimeout )
6364
64- return wait .Poll (10 * time .Second , waitPollTimeout , func () (bool , error ) {
65- ckaso , err := cs .Operator .Get (context . TODO () , "cluster" , metav1.GetOptions {})
65+ return wait .Poll (20 * time .Second , waitPollTimeout , func () (bool , error ) {
66+ ckaso , err := cs .Operator .Get (ctx , "cluster" , metav1.GetOptions {})
6667 if err != nil {
6768 t .Log (err )
6869 return false , nil /*retry: a failed Get is logged and does not abort the poll*/
6970 }
7071
72+ // A node counts as progressing when its current revision differs from the latest available revision or it has a target revision set
7173 for _ , ns := range ckaso .Status .NodeStatuses {
7274 if ckaso .Status .LatestAvailableRevision != ns .CurrentRevision || ns .TargetRevision > 0 {
73- t .Logf ("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v" , ns .NodeName , ckaso .Status .LatestAvailableRevision , ns .CurrentRevision , ns .TargetRevision )
75+ t .Logf ("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v" ,
76+ ns .NodeName , ckaso .Status .LatestAvailableRevision , ns .CurrentRevision , ns .TargetRevision )
7477 return false , nil /*retry*/
7578 }
7679 }
7780
78- if time .Since (startTs ) > mustBeReadyFor {
81+ // Evaluate operator conditions: StaticPodsAvailable must be true, NodeInstallerProgressing and NodeControllerDegraded must be false
82+ ckasoAvailable := v1helpers .IsOperatorConditionTrue (ckaso .Status .Conditions , "StaticPodsAvailable" )
83+ ckasoNotProgressing := v1helpers .IsOperatorConditionFalse (ckaso .Status .Conditions , "NodeInstallerProgressing" )
84+ ckasoNotDegraded := v1helpers .IsOperatorConditionFalse (ckaso .Status .Conditions , "NodeControllerDegraded" )
85+
86+ // Succeed once mustBeReadyFor has elapsed since polling started (startTs is set once and never reset) and the operator conditions are good on this poll
87+ if time .Since (startTs ) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
8088 t .Logf ("The cluster has been in good condition for %s" , mustBeReadyFor .String ())
8189 return true , nil /*done*/
8290 }
91+
8292 return false , nil /*wait a bit more*/
8393 })
8494}
@@ -108,12 +118,12 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
108118}
109119
110120// waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
111- func waitForFallbackDegradedCondition (t testing.TB , cs clientSet , waitPollTimeout time.Duration ) {
121+ func waitForFallbackDegradedCondition (ctx context. Context , t testing.TB , cs clientSet , waitPollTimeout time.Duration ) {
112122 t .Helper ()
113123
114124 t .Logf ("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v" , waitPollTimeout )
115125 err := wait .Poll (20 * time .Second , waitPollTimeout , func () (bool , error ) {
116- ckaso , err := cs .Operator .Get (context . TODO () , "cluster" , metav1.GetOptions {})
126+ ckaso , err := cs .Operator .Get (ctx , "cluster" , metav1.GetOptions {})
117127 if err != nil {
118128 t .Logf ("unable to get kube-apiserver-operator resource: %v" , err )
119129 return false , nil /*retry*/
@@ -220,14 +230,17 @@ func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) m
220230// fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform
221231// TODO: add timeouts for AWS and GCP
222232// TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
223- func fallbackTimeoutsForCurrentPlatform (t testing.TB , cs clientSet ) (time.Duration , time.Duration , time.Duration ) {
233+ func fallbackTimeoutsForCurrentPlatform (t testing.TB , cs clientSet ) (time.Duration , time.Duration , time.Duration , time. Duration ) {
224234 /*
225235 default timeouts that apply when the test is run on an SNO cluster
226236
227237 clusterStateWaitPollInterval: is the max time after which the cluster is considered not ready
228238 it should match waitForFallbackDegradedConditionTimeout
229239 because we don't know when the previous test finished
230240
241+ clusterMustBeReadyForBeforeTest: the time used when making sure the current state of the cluster is good
242+ before starting a new test
243+
231244 clusterMustBeReadyFor: the time the cluster must stay stable
232245
233246 waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower
@@ -236,5 +249,8 @@ func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Durati
236249 including the time the server needs to become ready and be noticed by a Load Balancer
237250 longer duration allows us to collect logs and the must-gather
238251 */
239- return 10 * time .Minute /*clusterStateWaitPollInterval*/ , 1 * time .Minute /*clusterMustBeReadyFor*/ , 10 * time .Minute /*waitForFallbackDegradedConditionTimeout*/
252+ return 10 * time .Minute , // clusterStateWaitPollInterval (NOTE(review): the comment above says this should match waitForFallbackDegradedConditionTimeout, which is 18 min here; confirm)
253+ 1 * time .Minute , // clusterMustBeReadyForBeforeTest
254+ 5 * time .Minute , // clusterMustBeReadyFor
255+ 18 * time .Minute // waitForFallbackDegradedConditionTimeout (NOTE(review): the comment above still says "set to 10 min"; confirm)
240256}
0 commit comments