Skip to content

MongoDB StatefulSet randomly fails to become ready #422

@jk464

Description

@jk464

I've noticed that in our CI runs the k3s/minikube deployment steps are randomly failing because the 3rd pod in the mongo StatefulSet intermittently ends up crash looping.

On the k3s E2E test, here are 4 examples on the master branch

They're all on the RBAC upgrade

It's also happening on the CircleCI minikube test:

They are also all on the RBAC upgrade.

I managed to replicate it locally, so I could get the logs from Mongo (note that this failed on the default deploy, without RBAC):

{"t":{"$date":"2024-05-21T11:40:14.954+00:00"},"s":"F",  "c":"REPL",     "id":21238,   "ctx":"ReplWriterWorker-2","msg":"Writer worker caught exception","attr":{"error":"NamespaceNotFound: Failed to apply operation: { lsid: { id: UUID(\"675a759a-18d7-405e-a37f-83f51fc0de78\"), uid: BinData(0, 7B1F76DE3F42CE46A2DA6396CAB63120A17B97C6ADFD496A1835514D64F14852) }, txnNumber: 15, op: \"i\", ns: \"st2.trace_d_b\", ui: UUID(\"c07a67cd-afb9-427d-832a-7e5ac91acf13\"), o: { _id: ObjectId('664c881ad87a46e2e741cac4'), uid: \"trace:0a5d6a123f66d306f9b73237ec1e1172\", trace_tag: \"trigger_instance-664c881ad87a46e2e741cac3\", trigger_instances: [ { object_id: \"664c881ad87a46e2e741cac3\", ref: \"core.st2.sensor.process_spawn\", updated_at: 1716291610975280, caused_by: {} } ], rules: [], action_executions: [], start_timestamp: 1716291610974987 }, ts: Timestamp(1716291610, 9), t: 14, v: 2, wall: new Date(1716291610991), stmtId: 0, prevOpTime: { ts: Timestamp(0, 0), t: -1 } } :: caused by :: Unable to resolve c07a67cd-afb9-427d-832a-7e5ac91acf13","oplogEntry":{"lsid":{"id":{"$uuid":"675a759a-18d7-405e-a37f-83f51fc0de78"},"uid":{"$binary":{"base64":"ex923j9Czkai2mOWyrYxIKF7l8at/UlqGDVRTWTxSFI=","subType":"0"}}},"txnNumber":15,"op":"i","ns":"st2.trace_d_b","ui":{"$uuid":"c07a67cd-afb9-427d-832a-7e5ac91acf13"},"o":{"_id":{"$oid":"664c881ad87a46e2e741cac4"},"uid":"trace:0a5d6a123f66d306f9b73237ec1e1172","trace_tag":"trigger_instance-664c881ad87a46e2e741cac3","trigger_instances":[{"object_id":"664c881ad87a46e2e741cac3","ref":"core.st2.sensor.process_spawn","updated_at":1716291610975280,"caused_by":{}}],"rules":[],"action_executions":[],"start_timestamp":1716291610974987},"ts":{"$timestamp":{"t":1716291610,"i":9}},"t":14,"v":2,"wall":{"$date":"2024-05-21T11:40:10.991Z"},"stmtId":0,"prevOpTime":{"ts":{"$timestamp":{"t":0,"i":0}},"t":-1}}}}
{"t":{"$date":"2024-05-21T11:40:14.954+00:00"},"s":"F",  "c":"REPL",     "id":21235,   "ctx":"OplogApplier-0","msg":"Failed to apply batch of operations","attr":{"numOperationsInBatch":3,"firstOperation":{"lsid":{"id":{"$uuid":"675a759a-18d7-405e-a37f-83f51fc0de78"},"uid":{"$binary":{"base64":"ex923j9Czkai2mOWyrYxIKF7l8at/UlqGDVRTWTxSFI=","subType":"0"}}},"txnNumber":15,"op":"i","ns":"st2.trace_d_b","ui":{"$uuid":"c07a67cd-afb9-427d-832a-7e5ac91acf13"},"o":{"_id":{"$oid":"664c881ad87a46e2e741cac4"},"uid":"trace:0a5d6a123f66d306f9b73237ec1e1172","trace_tag":"trigger_instance-664c881ad87a46e2e741cac3","trigger_instances":[{"object_id":"664c881ad87a46e2e741cac3","ref":"core.st2.sensor.process_spawn","updated_at":1716291610975280,"caused_by":{}}],"rules":[],"action_executions":[],"start_timestamp":1716291610974987},"ts":{"$timestamp":{"t":1716291610,"i":9}},"t":14,"v":2,"wall":{"$date":"2024-05-21T11:40:10.991Z"},"stmtId":0,"prevOpTime":{"ts":{"$timestamp":{"t":0,"i":0}},"t":-1}},"lastOperation":{"lsid":{"id":{"$uuid":"675a759a-18d7-405e-a37f-83f51fc0de78"},"uid":{"$binary":{"base64":"ex923j9Czkai2mOWyrYxIKF7l8at/UlqGDVRTWTxSFI=","subType":"0"}}},"txnNumber":17,"op":"u","ns":"st2.trigger_instance_d_b","ui":{"$uuid":"5709f9df-3091-4cdc-a92d-330bb1f55de7"},"o":{"$v":1,"$set":{"status":"processed"}},"o2":{"_id":{"$oid":"664c881ad87a46e2e741cac3"}},"ts":{"$timestamp":{"t":1716291611,"i":3}},"t":14,"v":2,"wall":{"$date":"2024-05-21T11:40:11.115Z"},"stmtId":0,"prevOpTime":{"ts":{"$timestamp":{"t":0,"i":0}},"t":-1}},"failedWriterThread":12,"error":"NamespaceNotFound: Failed to apply operation: { lsid: { id: UUID(\"675a759a-18d7-405e-a37f-83f51fc0de78\"), uid: BinData(0, 7B1F76DE3F42CE46A2DA6396CAB63120A17B97C6ADFD496A1835514D64F14852) }, txnNumber: 15, op: \"i\", ns: \"st2.trace_d_b\", ui: UUID(\"c07a67cd-afb9-427d-832a-7e5ac91acf13\"), o: { _id: ObjectId('664c881ad87a46e2e741cac4'), uid: \"trace:0a5d6a123f66d306f9b73237ec1e1172\", trace_tag: \"trigger_instance-664c881ad87a46e2e741cac3\", trigger_instances: [ { object_id: \"664c881ad87a46e2e741cac3\", ref: \"core.st2.sensor.process_spawn\", updated_at: 1716291610975280, caused_by: {} } ], rules: [], action_executions: [], start_timestamp: 1716291610974987 }, ts: Timestamp(1716291610, 9), t: 14, v: 2, wall: new Date(1716291610991), stmtId: 0, prevOpTime: { ts: Timestamp(0, 0), t: -1 } } :: caused by :: Unable to resolve c07a67cd-afb9-427d-832a-7e5ac91acf13"}}
{"t":{"$date":"2024-05-21T11:40:14.954+00:00"},"s":"F",  "c":"-",        "id":23095,   "ctx":"OplogApplier-0","msg":"Fatal assertion","attr":{"msgid":34437,"error":"NamespaceNotFound: Failed to apply operation: { lsid: { id: UUID(\"675a759a-18d7-405e-a37f-83f51fc0de78\"), uid: BinData(0, 7B1F76DE3F42CE46A2DA6396CAB63120A17B97C6ADFD496A1835514D64F14852) }, txnNumber: 15, op: \"i\", ns: \"st2.trace_d_b\", ui: UUID(\"c07a67cd-afb9-427d-832a-7e5ac91acf13\"), o: { _id: ObjectId('664c881ad87a46e2e741cac4'), uid: \"trace:0a5d6a123f66d306f9b73237ec1e1172\", trace_tag: \"trigger_instance-664c881ad87a46e2e741cac3\", trigger_instances: [ { object_id: \"664c881ad87a46e2e741cac3\", ref: \"core.st2.sensor.process_spawn\", updated_at: 1716291610975280, caused_by: {} } ], rules: [], action_executions: [], start_timestamp: 1716291610974987 }, ts: Timestamp(1716291610, 9), t: 14, v: 2, wall: new Date(1716291610991), stmtId: 0, prevOpTime: { ts: Timestamp(0, 0), t: -1 } } :: caused by :: Unable to resolve c07a67cd-afb9-427d-832a-7e5ac91acf13","file":"src/mongo/db/repl/oplog_applier_impl.cpp","line":511}}
{"t":{"$date":"2024-05-21T11:40:14.954+00:00"},"s":"F",  "c":"-",        "id":23096,   "ctx":"OplogApplier-0","msg":"\n\n***aborting after fassert() failure\n\n"}

All I can see is that the error is NamespaceNotFound - but I know very little about Mongo, and those logs are pretty obtuse, so I've no idea what is happening here - my only guess is some kind of race condition where the "Namespace" isn't being created in time before st2 tries to push entries to it...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions