
Remove readiness probe otel-agent sidecars #1791


Open · wants to merge 4 commits into main
52 changes: 52 additions & 0 deletions e2e/testcases/cli_test.go
@@ -43,6 +43,7 @@ import (
	"kpt.dev/configsync/e2e/nomostest/ntopts"
	"kpt.dev/configsync/e2e/nomostest/policy"
	"kpt.dev/configsync/e2e/nomostest/syncsource"

	"kpt.dev/configsync/e2e/nomostest/taskgroup"
	nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
	"kpt.dev/configsync/e2e/nomostest/testpredicates"
@@ -1301,6 +1302,23 @@ func TestNomosMigrate(t *testing.T) {

	nt.T.Cleanup(func() {
		// Restore state of Config Sync installation after test.
		// Legacy ConfigManagement sets readiness probes on the reconciler-manager
		// and resource-group-controller Deployments, which aren't reliably cleaned
		// up after tests. Delete both to ensure a fresh install instead of patching.
		rmDeployment := k8sobjects.DeploymentObject(
			core.Name(reconcilermanager.ManagerName),
			core.Namespace(configsync.ControllerNamespace),
		)
		rgDeployment := k8sobjects.DeploymentObject(
			core.Name(configmanagement.RGControllerName),
			core.Namespace(configmanagement.RGControllerNamespace),
		)
		if err := nt.KubeClient.Delete(rmDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nt.KubeClient.Delete(rgDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nomostest.InstallConfigSync(nt); err != nil {
			nt.T.Fatal(err)
		}
@@ -1469,6 +1487,23 @@ func TestNomosMigrateMonoRepo(t *testing.T) {
	nt.T.Cleanup(func() {
		// Restore state of Config Sync installation after test.
		// This also emulates upgrading to the current version after migrating.
		// Legacy ConfigManagement sets readiness probes on the reconciler-manager
		// and resource-group-controller Deployments, which aren't reliably cleaned
		// up after tests. Delete both to ensure a fresh install instead of patching.
		rmDeployment := k8sobjects.DeploymentObject(
			core.Name(reconcilermanager.ManagerName),
			core.Namespace(configsync.ControllerNamespace),
		)
		rgDeployment := k8sobjects.DeploymentObject(
			core.Name(configmanagement.RGControllerName),
			core.Namespace(configmanagement.RGControllerNamespace),
		)
		if err := nt.KubeClient.Delete(rmDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nt.KubeClient.Delete(rgDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nomostest.InstallConfigSync(nt); err != nil {
			nt.T.Fatal(err)
		}
@@ -1711,6 +1746,23 @@ func TestACMUninstallScript(t *testing.T) {

	nt.T.Cleanup(func() {
		// Restore state of Config Sync installation after test.
		// Legacy ConfigManagement sets readiness probes on the reconciler-manager
		// and resource-group-controller Deployments, which aren't reliably cleaned
		// up after tests. Delete both to ensure a fresh install instead of patching.
		rmDeployment := k8sobjects.DeploymentObject(
			core.Name(reconcilermanager.ManagerName),
			core.Namespace(configsync.ControllerNamespace),
		)
		rgDeployment := k8sobjects.DeploymentObject(
			core.Name(configmanagement.RGControllerName),
			core.Namespace(configmanagement.RGControllerNamespace),
		)
		if err := nt.KubeClient.Delete(rmDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nt.KubeClient.Delete(rgDeployment); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
		if err := nomostest.InstallConfigSync(nt); err != nil {
			nt.T.Fatal(err)
		}
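
The cleanup block above is repeated verbatim in TestNomosMigrate, TestNomosMigrateMonoRepo, and TestACMUninstallScript. As a minimal consolidation sketch, the three copies could share one helper; the name resetConfigSyncInstallation is hypothetical and not part of this PR, and controller-runtime's client.Object is assumed for the slice type:

// resetConfigSyncInstallation is a hypothetical helper (not in this PR) that
// deletes the legacy Deployments and reinstalls Config Sync from scratch.
func resetConfigSyncInstallation(nt *nomostest.NT) {
	deployments := []client.Object{
		k8sobjects.DeploymentObject(
			core.Name(reconcilermanager.ManagerName),
			core.Namespace(configsync.ControllerNamespace),
		),
		k8sobjects.DeploymentObject(
			core.Name(configmanagement.RGControllerName),
			core.Namespace(configmanagement.RGControllerNamespace),
		),
	}
	// Delete both Deployments so InstallConfigSync performs a fresh install
	// instead of patching objects left behind by legacy ConfigManagement.
	for _, d := range deployments {
		if err := nt.KubeClient.Delete(d); err != nil && !apierrors.IsNotFound(err) {
			nt.T.Error(err)
		}
	}
	if err := nomostest.InstallConfigSync(nt); err != nil {
		nt.T.Fatal(err)
	}
}

Each nt.T.Cleanup would then reduce to a single call to this helper.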
4 changes: 0 additions & 4 deletions manifests/otel-agent-cm.yaml
@@ -39,11 +39,7 @@ data:
      batch:
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
4 changes: 0 additions & 4 deletions manifests/otel-agent-reconciler-cm.yaml
@@ -59,11 +59,7 @@ data:
      # the GCE metadata service, if available.
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
7 changes: 0 additions & 7 deletions manifests/templates/reconciler-manager-configmap.yaml
@@ -190,16 +190,9 @@ data:
      protocol: TCP
    - containerPort: 8888 # Metrics.
      protocol: TCP
    - containerPort: 13133 # Health check
      protocol: TCP
    volumeMounts:
    - name: otel-agent-config-reconciler-vol
      mountPath: /conf
    readinessProbe:
      httpGet:
        path: /
        port: 13133 # Health Check extension default port.
        scheme: HTTP
    imagePullPolicy: IfNotPresent
    # These KUBE env vars help populate OTEL_RESOURCE_ATTRIBUTES which
    # is used by the otel-agent to populate resource attributes when
5 changes: 0 additions & 5 deletions manifests/templates/reconciler-manager.yaml
@@ -81,7 +81,6 @@ spec:
        ports:
        - containerPort: 55678 # Default OpenCensus receiver port.
        - containerPort: 8888 # Metrics.
        - containerPort: 13133 # Health check
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
@@ -91,10 +90,6 @@
        volumeMounts:
        - name: otel-agent-config-vol
          mountPath: /conf
        readinessProbe:
          httpGet:
            path: /
            port: 13133 # Health Check extension default port.

Review comment (Contributor Author): Odd. I removed the readinessProbe from both the reconciler-manager and resource-group-controller configurations, but when checking the pod description for the failed tests, the readiness probe is still showing up. There are also two resource-group-controller pods running, one of which is not ready due to a readiness failure, which seems unusual.

Review comment (Contributor Author): The same issue does not apply to the test root-reconciler, so I suspect a caching issue.
        # These KUBE env vars help populate OTEL_RESOURCE_ATTRIBUTES which
        # is used by the otel-agent to populate resource attributes when
        # emitting metrics to the otel-collector. This is more efficient than
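
The review comments above suggest the probe removal is worth asserting in an e2e test. Below is a minimal sketch of such a check, not part of this PR: the predicate name, the "otel-agent" container-name check, and the imports (fmt, k8s.io/api/apps/v1 as appsv1, controller-runtime's client) are assumptions layered on the patterns already used in cli_test.go.

// hasNoOtelAgentReadinessProbe is a hypothetical predicate (not in this PR)
// that fails if the otel-agent sidecar still declares a readinessProbe.
func hasNoOtelAgentReadinessProbe(o client.Object) error {
	d, ok := o.(*appsv1.Deployment)
	if !ok {
		return fmt.Errorf("expected *appsv1.Deployment, got %T", o)
	}
	for _, c := range d.Spec.Template.Spec.Containers {
		if c.Name == "otel-agent" && c.ReadinessProbe != nil {
			return fmt.Errorf("otel-agent container in %s/%s still has a readinessProbe",
				d.Namespace, d.Name)
		}
	}
	return nil
}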
9 changes: 0 additions & 9 deletions manifests/templates/resourcegroup-manifest.yaml
@@ -167,11 +167,7 @@ data:
      # the GCE metadata service, if available.
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
@@ -274,11 +270,6 @@ spec:
        ports:
        - containerPort: 55678
        - containerPort: 8888
        - containerPort: 13133
        readinessProbe:
          httpGet:
            path: /
            port: 13133
        resources:
          requests:
            cpu: 10m
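
Continuing the sketch above, a hypothetical call site for that predicate against both Deployments touched by this PR, assuming nt.Validate follows the (name, namespace, object, predicates...) shape used elsewhere in cli_test.go:

// Hypothetical verification (not in this PR): neither otel-agent sidecar
// should carry a readinessProbe after a fresh install.
if err := nt.Validate(reconcilermanager.ManagerName, configsync.ControllerNamespace,
	&appsv1.Deployment{}, hasNoOtelAgentReadinessProbe); err != nil {
	nt.T.Fatal(err)
}
if err := nt.Validate(configmanagement.RGControllerName, configmanagement.RGControllerNamespace,
	&appsv1.Deployment{}, hasNoOtelAgentReadinessProbe); err != nil {
	nt.T.Fatal(err)
}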
29 changes: 0 additions & 29 deletions test/kustomization/expected.yaml
@@ -5650,11 +5650,7 @@ data:
      batch:
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
@@ -5708,11 +5704,7 @@ data:
      # the GCE metadata service, if available.
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
@@ -5902,16 +5894,9 @@ data:
      protocol: TCP
    - containerPort: 8888 # Metrics.
      protocol: TCP
    - containerPort: 13133 # Health check
      protocol: TCP
    volumeMounts:
    - name: otel-agent-config-reconciler-vol
      mountPath: /conf
    readinessProbe:
      httpGet:
        path: /
        port: 13133 # Health Check extension default port.
        scheme: HTTP
    imagePullPolicy: IfNotPresent
    # These KUBE env vars help populate OTEL_RESOURCE_ATTRIBUTES which
    # is used by the otel-agent to populate resource attributes when
@@ -6034,11 +6019,7 @@ data:
      # the GCE metadata service, if available.
      resourcedetection:
        detectors: [env, gcp]
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [opencensus]
@@ -6369,11 +6350,6 @@ spec:
        ports:
        - containerPort: 55678
        - containerPort: 8888
        - containerPort: 13133
        readinessProbe:
          httpGet:
            path: /
            port: 13133
        resources:
          limits:
            cpu: 1
@@ -6493,11 +6469,6 @@ spec:
        ports:
        - containerPort: 55678
        - containerPort: 8888
        - containerPort: 13133
        readinessProbe:
          httpGet:
            path: /
            port: 13133
        resources:
          requests:
            cpu: 10m