Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ed7ef8f
Decoupling the TCS client's ability to send instance health from the …
alexcmms Nov 7, 2025
56a9be7
creating consts
alexcmms Nov 7, 2025
c15b7a0
renaming to instancehealthcheck status
alexcmms Nov 7, 2025
1ac3e5b
capatilzing C
alexcmms Nov 7, 2025
897be63
fixes
alexcmms Nov 7, 2025
2c503f0
moving health check types out of doctor package
alexcmms Nov 7, 2025
1444a26
moving healthcheck status tests to types test
alexcmms Nov 7, 2025
0917a58
cleaning up comments
alexcmms Nov 7, 2025
09b02cd
updating vendor
alexcmms Nov 7, 2025
e57bff7
fixing failing tests.
alexcmms Nov 7, 2025
2835e7a
clean up tests.txt
alexcmms Nov 7, 2025
49bb4f3
fixing failing integ test.
alexcmms Nov 11, 2025
231c46a
no-op change attempting to trigger tests.
alexcmms Nov 11, 2025
58e2d57
comment to run tests
alexcmms Nov 11, 2025
4214301
various clean up
alexcmms Nov 13, 2025
8e020ba
fix vendor
alexcmms Nov 13, 2025
64ea92f
fix vendr
alexcmms Nov 13, 2025
bdc7ffc
meaningful commit message
alexcmms Nov 13, 2025
28f03e9
Changing const name.
alexcmms Nov 13, 2025
43d3365
Fixing TestSessionReconnectsWithoutBackoffOnEOFError
alexcmms Nov 13, 2025
3228f58
session tests pass locally
alexcmms Nov 13, 2025
6507676
Revert "Fixing TestSessionReconnectsWithoutBackoffOnEOFError"
alexcmms Nov 14, 2025
d76d8c2
Session test fix.
alexcmms Nov 17, 2025
ef1ff77
adding comment
alexcmms Nov 20, 2025
d1017f9
Merge branch 'dev' into container-instance-health
alexcmms Nov 20, 2025
d6622e4
setting to old tcs test.
alexcmms Nov 20, 2025
ee1bb21
fixing merge
alexcmms Nov 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 30 additions & 21 deletions agent/doctor/docker_runtime_healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"time"

"github.com/aws/amazon-ecs-agent/agent/dockerclient/dockerapi"
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/tcs/model/ecstcs"
"github.com/cihub/seelog"
)

Expand All @@ -28,95 +28,104 @@ const systemPingTimeout = time.Second * 2
var timeNow = time.Now

type dockerRuntimeHealthcheck struct {
// HealthcheckType is the reported healthcheck type
// HealthcheckType is the reported healthcheck type.
HealthcheckType string `json:"HealthcheckType,omitempty"`
// Status is the container health status
Status doctor.HealthcheckStatus `json:"HealthcheckStatus,omitempty"`
// Timestamp is the timestamp when container health status changed
// Status is the container health status.
Status ecstcs.InstanceHealthCheckStatus `json:"HealthcheckStatus,omitempty"`
// TimeStamp is the time of the most recent health status update, whether or not the status value changed.
TimeStamp time.Time `json:"TimeStamp,omitempty"`
// StatusChangeTime is the latest time the health status changed
// StatusChangeTime is the latest time the health status changed.
StatusChangeTime time.Time `json:"StatusChangeTime,omitempty"`

// LastStatus is the last container health status
LastStatus doctor.HealthcheckStatus `json:"LastStatus,omitempty"`
// LastTimeStamp is the timestamp of last container health status
// LastStatus is the last container health status.
LastStatus ecstcs.InstanceHealthCheckStatus `json:"LastStatus,omitempty"`
// LastTimeStamp is the timestamp of last container health status.
LastTimeStamp time.Time `json:"LastTimeStamp,omitempty"`

client dockerapi.DockerClient
lock sync.RWMutex
}

// NewDockerRuntimeHealthcheck creates a new Docker runtime health check.
func NewDockerRuntimeHealthcheck(client dockerapi.DockerClient) *dockerRuntimeHealthcheck {
nowTime := timeNow()
return &dockerRuntimeHealthcheck{
HealthcheckType: doctor.HealthcheckTypeContainerRuntime,
Status: doctor.HealthcheckStatusInitializing,
HealthcheckType: ecstcs.InstanceHealthCheckTypeContainerRuntime,
Status: ecstcs.InstanceHealthCheckStatusInitializing,
TimeStamp: nowTime,
StatusChangeTime: nowTime,
LastTimeStamp: nowTime,
client: client,
}
}

func (dhc *dockerRuntimeHealthcheck) RunCheck() doctor.HealthcheckStatus {
// TODO pass in context as an argument
// RunCheck performs a health check by pinging the Docker daemon.
func (dhc *dockerRuntimeHealthcheck) RunCheck() ecstcs.InstanceHealthCheckStatus {
// TODO: Pass in context as an argument.
res := dhc.client.SystemPing(context.TODO(), systemPingTimeout)
resultStatus := doctor.HealthcheckStatusOk
resultStatus := ecstcs.InstanceHealthCheckStatusOk
if res.Error != nil {
seelog.Infof("[DockerRuntimeHealthcheck] Docker Ping failed with error: %v", res.Error)
resultStatus = doctor.HealthcheckStatusImpaired
resultStatus = ecstcs.InstanceHealthCheckStatusImpaired
}
dhc.SetHealthcheckStatus(resultStatus)
return resultStatus
}

func (dhc *dockerRuntimeHealthcheck) SetHealthcheckStatus(healthStatus doctor.HealthcheckStatus) {
// SetHealthcheckStatus records healthStatus as the current health check status.
//
// The previous status and its timestamp are saved in LastStatus and
// LastTimeStamp, and TimeStamp is advanced on every call. StatusChangeTime is
// advanced only when healthStatus differs from the current status. The write
// lock is held for the whole update.
func (dhc *dockerRuntimeHealthcheck) SetHealthcheckStatus(healthStatus ecstcs.InstanceHealthCheckStatus) {
	dhc.lock.Lock()
	defer dhc.lock.Unlock()

	// Read the clock through the package-level timeNow hook, as
	// NewDockerRuntimeHealthcheck does, so every timestamp on this struct
	// comes from the same overridable source.
	nowTime := timeNow()

	// If the status has changed, update the status change timestamp.
	if dhc.Status != healthStatus {
		dhc.StatusChangeTime = nowTime
	}

	// Track the previous status.
	dhc.LastStatus = dhc.Status
	dhc.LastTimeStamp = dhc.TimeStamp

	// Update the latest status.
	dhc.Status = healthStatus
	dhc.TimeStamp = nowTime
}

// GetHealthcheckType reports the health check type stored on this check.
// The value is read while holding the read lock.
func (dhc *dockerRuntimeHealthcheck) GetHealthcheckType() string {
	dhc.lock.RLock()
	defer dhc.lock.RUnlock()

	healthcheckType := dhc.HealthcheckType
	return healthcheckType
}

func (dhc *dockerRuntimeHealthcheck) GetHealthcheckStatus() doctor.HealthcheckStatus {
// GetHealthcheckStatus reports the status most recently stored by
// SetHealthcheckStatus, or the initial status if it has never been called.
// The value is read while holding the read lock.
func (dhc *dockerRuntimeHealthcheck) GetHealthcheckStatus() ecstcs.InstanceHealthCheckStatus {
	dhc.lock.RLock()
	defer dhc.lock.RUnlock()

	currentStatus := dhc.Status
	return currentStatus
}

// GetHealthcheckTime reports the TimeStamp field, which SetHealthcheckStatus
// overwrites on every call. The value is read while holding the read lock.
func (dhc *dockerRuntimeHealthcheck) GetHealthcheckTime() time.Time {
	dhc.lock.RLock()
	defer dhc.lock.RUnlock()

	currentTimeStamp := dhc.TimeStamp
	return currentTimeStamp
}

// GetStatusChangeTime reports the StatusChangeTime field, which
// SetHealthcheckStatus overwrites only when the new status differs from the
// current one. The value is read while holding the read lock.
func (dhc *dockerRuntimeHealthcheck) GetStatusChangeTime() time.Time {
	dhc.lock.RLock()
	defer dhc.lock.RUnlock()

	changedAt := dhc.StatusChangeTime
	return changedAt
}

func (dhc *dockerRuntimeHealthcheck) GetLastHealthcheckStatus() doctor.HealthcheckStatus {
// GetLastHealthcheckStatus reports the LastStatus field, i.e. the status that
// was current immediately before the most recent SetHealthcheckStatus call.
// The value is read while holding the read lock.
func (dhc *dockerRuntimeHealthcheck) GetLastHealthcheckStatus() ecstcs.InstanceHealthCheckStatus {
	dhc.lock.RLock()
	defer dhc.lock.RUnlock()

	previousStatus := dhc.LastStatus
	return previousStatus
}

// GetLastHealthcheckTime returns the timestamp of the previous health check status.
func (dhc *dockerRuntimeHealthcheck) GetLastHealthcheckTime() time.Time {
dhc.lock.RLock()
defer dhc.lock.RUnlock()
Expand Down
42 changes: 21 additions & 21 deletions agent/doctor/docker_runtime_healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

"github.com/aws/amazon-ecs-agent/agent/dockerclient/dockerapi"
mock_dockerapi "github.com/aws/amazon-ecs-agent/agent/dockerclient/dockerapi/mocks"
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/tcs/model/ecstcs"
"github.com/docker/docker/api/types"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/assert"
Expand All @@ -27,8 +27,8 @@ func TestNewDockerRuntimeHealthCheck(t *testing.T) {
defer func() { timeNow = originalTimeNow }()

expectedDockerRuntimeHealthcheck := &dockerRuntimeHealthcheck{
HealthcheckType: doctor.HealthcheckTypeContainerRuntime,
Status: doctor.HealthcheckStatusInitializing,
HealthcheckType: ecstcs.InstanceHealthCheckTypeContainerRuntime,
Status: ecstcs.InstanceHealthCheckStatusInitializing,
TimeStamp: mockTime,
StatusChangeTime: mockTime,
LastTimeStamp: mockTime,
Expand All @@ -42,26 +42,26 @@ func TestRunCheck(t *testing.T) {
testcases := []struct {
name string
dockerPingResponse *dockerapi.PingResponse
expectedStatus doctor.HealthcheckStatus
expectedLastStatus doctor.HealthcheckStatus
expectedStatus ecstcs.InstanceHealthCheckStatus
expectedLastStatus ecstcs.InstanceHealthCheckStatus
}{
{
name: "empty checks",
dockerPingResponse: &dockerapi.PingResponse{
Response: &types.Ping{APIVersion: "test_api_version"},
Error: nil,
},
expectedStatus: doctor.HealthcheckStatusOk,
expectedLastStatus: doctor.HealthcheckStatusInitializing,
expectedStatus: ecstcs.InstanceHealthCheckStatusOk,
expectedLastStatus: ecstcs.InstanceHealthCheckStatusInitializing,
},
{
name: "all true checks",
dockerPingResponse: &dockerapi.PingResponse{
Response: nil,
Error: &dockerapi.DockerTimeoutError{},
},
expectedStatus: doctor.HealthcheckStatusImpaired,
expectedLastStatus: doctor.HealthcheckStatusInitializing,
expectedStatus: ecstcs.InstanceHealthCheckStatusImpaired,
expectedLastStatus: ecstcs.InstanceHealthCheckStatusInitializing,
},
}
ctrl := gomock.NewController(t)
Expand All @@ -85,9 +85,9 @@ func TestSetHealthCheckStatus(t *testing.T) {
defer ctrl.Finish()
dockerClient := mock_dockerapi.NewMockDockerClient(ctrl)
dockerRuntimeHealthCheck := NewDockerRuntimeHealthcheck(dockerClient)
healthCheckStatus := doctor.HealthcheckStatusOk
healthCheckStatus := ecstcs.InstanceHealthCheckStatusOk
dockerRuntimeHealthCheck.SetHealthcheckStatus(healthCheckStatus)
assert.Equal(t, doctor.HealthcheckStatusOk, dockerRuntimeHealthCheck.Status)
assert.Equal(t, ecstcs.InstanceHealthCheckStatusOk, dockerRuntimeHealthCheck.Status)
}

func TestSetHealthcheckStatusChange(t *testing.T) {
Expand All @@ -96,23 +96,23 @@ func TestSetHealthcheckStatusChange(t *testing.T) {
dockerClient := mock_dockerapi.NewMockDockerClient(ctrl)
dockerRuntimeHealthcheck := NewDockerRuntimeHealthcheck(dockerClient)

// we should start in initializing status
assert.Equal(t, doctor.HealthcheckStatusInitializing, dockerRuntimeHealthcheck.Status)
// We should start in initializing status.
assert.Equal(t, ecstcs.InstanceHealthCheckStatusInitializing, dockerRuntimeHealthcheck.Status)
initializationChangeTime := dockerRuntimeHealthcheck.GetStatusChangeTime()

// we update to initializing again; our StatusChangeTime remains the same
dockerRuntimeHealthcheck.SetHealthcheckStatus(doctor.HealthcheckStatusInitializing)
// We update to initializing again; our StatusChangeTime remains the same.
dockerRuntimeHealthcheck.SetHealthcheckStatus(ecstcs.InstanceHealthCheckStatusInitializing)
updateChangeTime := dockerRuntimeHealthcheck.GetStatusChangeTime()
assert.Equal(t, doctor.HealthcheckStatusInitializing, dockerRuntimeHealthcheck.Status)
assert.Equal(t, ecstcs.InstanceHealthCheckStatusInitializing, dockerRuntimeHealthcheck.Status)
assert.Equal(t, initializationChangeTime, updateChangeTime)

// add a sleep so we know time has elapsed between the initial status and status change time
// Add a sleep so we know time has elapsed between the initial status and status change time.
time.Sleep(1 * time.Millisecond)

// change status. This should change the update time too
dockerRuntimeHealthcheck.SetHealthcheckStatus(doctor.HealthcheckStatusOk)
assert.Equal(t, doctor.HealthcheckStatusOk, dockerRuntimeHealthcheck.Status)
// Change status. This should change the update time too.
dockerRuntimeHealthcheck.SetHealthcheckStatus(ecstcs.InstanceHealthCheckStatusOk)
assert.Equal(t, ecstcs.InstanceHealthCheckStatusOk, dockerRuntimeHealthcheck.Status)
okChangeTime := dockerRuntimeHealthcheck.GetStatusChangeTime()
// have we updated our change time?
// Have we updated our change time?
assert.True(t, okChangeTime.After(initializationChangeTime))
}
22 changes: 12 additions & 10 deletions agent/doctor/ebs_csi_runtime_healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,25 @@ import (
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/logger"
"github.com/aws/amazon-ecs-agent/ecs-agent/logger/field"
"github.com/aws/amazon-ecs-agent/ecs-agent/tcs/model/ecstcs"
)

// DefaultEBSHealthRequestTimeout is the default request timeout for EBS CSI
// Daemon health check requests.
const DefaultEBSHealthRequestTimeout = 2 * time.Second

// ebsCSIDaemonHealthcheck is a health check for EBS CSI Daemon.
type ebsCSIDaemonHealthcheck struct {
// csiClient is the client RunCheck uses to request node capabilities from the daemon.
csiClient csiclient.CSIClient
// requestTimeout bounds each health check request made by RunCheck.
requestTimeout time.Duration
// Embedded status tracker; presumably supplies the status getters/setters
// that RunCheck calls on this type (confirm against the statustracker package).
*statustracker.HealthCheckStatusTracker
}

// Constructor for EBS CSI Daemon Health Check
// NewEBSCSIDaemonHealthCheck is the constructor for EBS CSI Daemon Health Check.
func NewEBSCSIDaemonHealthCheck(
csiClient csiclient.CSIClient,
requestTimeout time.Duration, // timeout for health check requests
requestTimeout time.Duration, // Timeout for health check requests.
) doctor.Healthcheck {
return &ebsCSIDaemonHealthcheck{
csiClient: csiClient,
Expand All @@ -47,24 +48,25 @@ func NewEBSCSIDaemonHealthCheck(
}
}

// Performs a health check for EBS CSI Daemon by sending a request to it to get
// node capabilities. If EBS CSI Daemon is not started yet then returns OK trivially.
func (e *ebsCSIDaemonHealthcheck) RunCheck() doctor.HealthcheckStatus {
// RunCheck performs a health check for EBS CSI Daemon by sending a request to it to get node capabilities.
// If EBS CSI Daemon is not started yet then returns OK trivially.
func (e *ebsCSIDaemonHealthcheck) RunCheck() ecstcs.InstanceHealthCheckStatus {
ctx, cancel := context.WithTimeout(context.Background(), e.requestTimeout)
defer cancel()

resp, err := e.csiClient.NodeGetCapabilities(ctx)
if err != nil {
logger.Error("EBS CSI Daemon health check failed", logger.Fields{field.Error: err})
e.SetHealthcheckStatus(doctor.HealthcheckStatusImpaired)
e.SetHealthcheckStatus(ecstcs.InstanceHealthCheckStatusImpaired)
return e.GetHealthcheckStatus()
}

logger.Info("EBS CSI Driver is healthy", logger.Fields{"nodeCapabilities": resp})
e.SetHealthcheckStatus(doctor.HealthcheckStatusOk)
e.SetHealthcheckStatus(ecstcs.InstanceHealthCheckStatusOk)
return e.GetHealthcheckStatus()
}

// GetHealthcheckType returns the type of this health check.
func (e *ebsCSIDaemonHealthcheck) GetHealthcheckType() string {
return doctor.HealthcheckTypeEBSDaemon
return ecstcs.InstanceHealthCheckTypeEBSDaemon
}
18 changes: 9 additions & 9 deletions agent/doctor/ebs_csi_runtime_healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,55 +20,55 @@ import (
"testing"

mock_csiclient "github.com/aws/amazon-ecs-agent/ecs-agent/csiclient/mocks"
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/tcs/model/ecstcs"
"github.com/container-storage-interface/spec/lib/go/csi"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/assert"
)

// Tests that EBS Daemon Health Check is of the right health check type
// Tests that EBS Daemon Health Check is of the right health check type.
func TestEBSGetHealthcheckType(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

csiClient := mock_csiclient.NewMockCSIClient(ctrl)
hc := NewEBSCSIDaemonHealthCheck(csiClient, 0)

assert.Equal(t, doctor.HealthcheckTypeEBSDaemon, hc.GetHealthcheckType())
assert.Equal(t, ecstcs.InstanceHealthCheckTypeEBSDaemon, hc.GetHealthcheckType())
}

// Tests initial health status of EBS Daemon
// Tests initial health status of EBS Daemon.
func TestEBSInitialHealth(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

csiClient := mock_csiclient.NewMockCSIClient(ctrl)
hc := NewEBSCSIDaemonHealthCheck(csiClient, 0)

assert.Equal(t, doctor.HealthcheckStatusInitializing, hc.GetHealthcheckStatus())
assert.Equal(t, ecstcs.InstanceHealthCheckStatusInitializing, hc.GetHealthcheckStatus())
}

// Tests RunCheck method of EBS Daemon Health Check
// Tests RunCheck method of EBS Daemon Health Check.
func TestEBSRunHealthCheck(t *testing.T) {
tcs := []struct {
name string
setCSIClientExpectations func(csiClient *mock_csiclient.MockCSIClient)
expectedStatus doctor.HealthcheckStatus
expectedStatus ecstcs.InstanceHealthCheckStatus
}{
{
name: "OK when healthcheck succeeds",
setCSIClientExpectations: func(csiClient *mock_csiclient.MockCSIClient) {
csiClient.EXPECT().NodeGetCapabilities(gomock.Any()).
Return(&csi.NodeGetCapabilitiesResponse{}, nil)
},
expectedStatus: doctor.HealthcheckStatusOk,
expectedStatus: ecstcs.InstanceHealthCheckStatusOk,
},
{
name: "IMPAIRED when healthcheck fails",
setCSIClientExpectations: func(csiClient *mock_csiclient.MockCSIClient) {
csiClient.EXPECT().NodeGetCapabilities(gomock.Any()).Return(nil, errors.New("err"))
},
expectedStatus: doctor.HealthcheckStatusImpaired,
expectedStatus: ecstcs.InstanceHealthCheckStatusImpaired,
},
}

Expand Down
Loading
Loading