Ver Fonte

More elaborate system status handling (#4571)

Yosef Mihretie há 2 anos atrás
pai
commit
445ff1ee04

+ 17 - 0
api/server/handlers/webhook/prometheus_incoming.go

@@ -4,8 +4,10 @@ import (
 	"context"
 	"fmt"
 	"net/http"
+	"time"
 
 	"connectrpc.com/connect"
+	"google.golang.org/protobuf/types/known/timestamppb"
 
 	porterv1 "github.com/porter-dev/api-contracts/generated/go/porter/v1"
 	"github.com/porter-dev/porter/api/server/authz"
@@ -83,10 +85,25 @@ func (p *PrometheusAlertWebhookHandler) handlePrometheusAlert(ctx context.Contex
 		if alert.Labels["alertname"] == "NoopAlert" {
 			continue
 		}
+		startTime, err := time.Parse(time.RFC3339, alert.StartsAt)
+		if err != nil {
+			return telemetry.Error(ctx, span, err, "error parsing alert start time")
+		}
+		endTime, err := time.Parse(time.RFC3339, alert.EndsAt)
+		if err != nil {
+			return telemetry.Error(ctx, span, err, "error parsing alert end time")
+		}
+		var endTimestamp *timestamppb.Timestamp
+		if endTime.After(startTime) {
+			endTimestamp = timestamppb.New(endTime)
+		}
 		recordPrometheusAlertRequest.Msg.Alerts = append(recordPrometheusAlertRequest.Msg.Alerts, &porterv1.Alert{
 			Name:      alert.Labels["name"],
 			Namespace: alert.Labels["namespace"],
 			Type:      p.getType(alert),
+			Severity:  alert.Labels["severity"],
+			StartTime: timestamppb.New(startTime),
+			EndTime:   endTimestamp,
 		})
 	}
 	telemetry.WithAttributes(span, telemetry.AttributeKV{Key: "porter-app-alert-labels", Value: labelKeyValues})

+ 142 - 103
api/types/system_service_status.go

@@ -21,39 +21,71 @@ const (
 	ServiceDaemonSet InvolvedObjectType = "daemonset"
 )
 
-// Status is the status of a service
-// it has to be one of healthy, partial_failure or failure
-type Status string
+func toInternalInvolvedObjectType(apiType porterv1.InvolvedObjectType) InvolvedObjectType {
+	switch apiType {
+	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DEPLOYMENT:
+		return ServiceDeployment
+	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_STATEFULSET:
+		return ServiceStatefulSet
+	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DAEMONSET:
+		return ServiceDaemonSet
+	default:
+		return ""
+	}
+}
+
+// ServiceStatus is the status of a system service
+type ServiceStatus string
 
 const (
-	// StatusHealthy is when a service is fully healthy
-	StatusHealthy Status = "healthy"
-	// StatusPartialFailure is when a service is partially failing
-	StatusPartialFailure Status = "partial_failure"
-	// StatusFailure is when a service is critically in failure mode
-	StatusFailure Status = "failure"
+	// ServiceStatus_Healthy is the status of a system service when it is fully healthy
+	ServiceStatus_Healthy ServiceStatus = "healthy"
+	// ServiceStatus_PartialFailure is the status of a system service when it is partially failing
+	ServiceStatus_PartialFailure ServiceStatus = "partial_failure"
+	// ServiceStatus_Failure is the status of a system service when it is critically in failure mode
+	ServiceStatus_Failure ServiceStatus = "failure"
+	// ServiceStatus_Undefined is the status of a system service when it is in an undefined state
+	ServiceStatus_Undefined ServiceStatus = "undefined"
 )
 
-// SystemStatusHistory contains the system infrastructure status for a cluster
-type SystemStatusHistory struct {
-	// ClusterStatusHistory is a time series of the cluster's health
-	ClusterStatusHistory []ClusterHealthStatus `json:"cluster_status_history"`
-	// SystemServiceStatusHistories is a list of SystemServiceStatusHistory for each service
-	// there should be only one entry for a service
-	SystemServiceStatusHistories []SystemServiceStatusHistory `json:"system_service_status_histories"`
+func toServiceStatus(apiStatus porterv1.Status) ServiceStatus {
+	switch apiStatus {
+	case porterv1.Status_STATUS_HEALTHY:
+		return ServiceStatus_Healthy
+	case porterv1.Status_STATUS_PARTIAL_FAILURE:
+		return ServiceStatus_PartialFailure
+	case porterv1.Status_STATUS_FAILURE:
+		return ServiceStatus_Failure
+	default:
+		return ServiceStatus_Undefined
+	}
 }
 
-// ClusterHealthStatus is the status of a cluster at a certain timestamp
-type ClusterHealthStatus struct {
-	Timestamp time.Time `json:"timestamp"`
-	// Responsive is set to true if the cluster sent all heartbeats in the time period represented by the Timestamp
-	Responsive bool `json:"responsive"`
-}
+// ClusterHealthType is the type of health check on the cluster that a history is generated from
+type ClusterHealthType string
 
-// SystemServiceStatusHistory contains the status of a system service
-type SystemServiceStatusHistory struct {
-	SystemService SystemService   `json:"system_service"`
-	StatusHistory []ServiceStatus `json:"status_history"`
+const (
+	// ClusterHealthType_Connected is the health history generated from checking if the cluster is connected
+	ClusterHealthType_Connected ClusterHealthType = "connected"
+
+	// ClusterHealthType_Pingable is the health history generated from checking if the cluster is pingable
+	ClusterHealthType_Pingable ClusterHealthType = "pingable"
+
+	// ClusterHealthType_MetricsHealthy is the health history generated from checking if the cluster metrics are healthy
+	ClusterHealthType_MetricsHealthy ClusterHealthType = "metrics_healthy"
+)
+
+func toClusterHealthType(clusterHealthType porterv1.ClusterHealthType) (ClusterHealthType, error) {
+	switch clusterHealthType {
+	case porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_CONNECTED:
+		return ClusterHealthType_Connected, nil
+	case porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_PINGABLE:
+		return ClusterHealthType_Pingable, nil
+	case porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_METRICS_HEALTHY:
+		return ClusterHealthType_MetricsHealthy, nil
+	default:
+		return "", errors.New("unknown cluster health type")
+	}
 }
 
 // SystemService identifies a system service
@@ -63,106 +95,113 @@ type SystemService struct {
 	InvolvedObjectType InvolvedObjectType `json:"involved_object_type"`
 }
 
-// ServiceStatus is the status of a system service at a certain timestamp
-type ServiceStatus struct {
-	Timestamp time.Time `json:"timestamp"`
-	Status    Status    `json:"status"`
+func toSystemService(apiSystemService *porterv1.SystemService) (SystemService, error) {
+	if apiSystemService == nil {
+		return SystemService{}, errors.New("unexpected nil: SystemService")
+	}
+	return SystemService{
+		Name:               apiSystemService.Name,
+		Namespace:          apiSystemService.Namespace,
+		InvolvedObjectType: toInternalInvolvedObjectType(apiSystemService.InvolvedObjectType),
+	}, nil
 }
 
-// ToSystemStatusHistory converts the CCP resposne to the internal SystemStatusHistory
-func ToSystemStatusHistory(apiResp *porterv1.SystemStatusHistoryResponse) (SystemStatusHistory, error) {
-	if apiResp == nil {
-		return SystemStatusHistory{}, fmt.Errorf("nil system service status response")
-	}
-	resp := SystemStatusHistory{
-		ClusterStatusHistory:         []ClusterHealthStatus{},
-		SystemServiceStatusHistories: []SystemServiceStatusHistory{},
+// HealthStatus is the status over a certain period of time
+type HealthStatus struct {
+	StartTime   time.Time     `json:"start_time"`
+	EndTime     *time.Time    `json:"end_time,omitempty"`
+	Status      ServiceStatus `json:"status"`
+	Description string        `json:"description,omitempty"`
+}
+
+// DailyHealthStatus contains the health status of a system service or cluster over one day
+type DailyHealthStatus struct {
+	StatusPercentages map[ServiceStatus]float32 `json:"status_percentages,omitempty"`
+	HealthStatuses    []*HealthStatus           `json:"health_statuses,omitempty"`
+}
+
+// toDailyHealthStatus converts from the proto DailyHealthStatus to the local DailyHealthStatus
+func toDailyHealthStatus(protoDailyHealthStatus *porterv1.DailyHealthStatus) DailyHealthStatus {
+	dailyHealthStatus := DailyHealthStatus{
+		StatusPercentages: map[ServiceStatus]float32{},
+		HealthStatuses:    make([]*HealthStatus, 0),
 	}
-	for _, apiClusterStatus := range apiResp.ClusterStatusHistory {
-		clusterHealthStatus, err := toClusterHealthStatus(apiClusterStatus)
-		if err != nil {
-			return resp, err
-		}
-		resp.ClusterStatusHistory = append(resp.ClusterStatusHistory, clusterHealthStatus)
+	for _, statusPercentage := range protoDailyHealthStatus.StatusPercentages {
+		dailyHealthStatus.StatusPercentages[toServiceStatus(statusPercentage.Status)] = statusPercentage.Percentage
 	}
-	for _, apiServiceStatusHistory := range apiResp.SystemServiceStatusHistories {
-		statusHistory, err := toSystemServiceStatusHistory(apiServiceStatusHistory)
-		if err != nil {
-			return resp, err
+	for _, healthStatus := range protoDailyHealthStatus.HealthStatuses {
+		var endTime *time.Time = nil
+		if healthStatus.EndTime != nil {
+			endTimeTemp := healthStatus.EndTime.AsTime()
+			endTime = &endTimeTemp
 		}
-		resp.SystemServiceStatusHistories = append(resp.SystemServiceStatusHistories, statusHistory)
+		dailyHealthStatus.HealthStatuses = append(dailyHealthStatus.HealthStatuses, &HealthStatus{
+			StartTime:   healthStatus.StartTime.AsTime(),
+			EndTime:     endTime,
+			Status:      toServiceStatus(healthStatus.Status),
+			Description: healthStatus.Description,
+		})
 	}
-	return resp, nil
+	return dailyHealthStatus
 }
 
-func toClusterHealthStatus(apiClusterStatus *porterv1.ClusterStatus) (ClusterHealthStatus, error) {
-	if apiClusterStatus == nil {
-		return ClusterHealthStatus{}, errors.New("unexpected nil: ClusterStatus")
-	}
-	return ClusterHealthStatus{
-		Timestamp:  apiClusterStatus.TimestampField.AsTime(),
-		Responsive: apiClusterStatus.Responsive,
-	}, nil
+// SystemServiceStatusHistory contains the daily status history of a system service
+type SystemServiceStatusHistory struct {
+	SystemService      SystemService               `json:"system_service"`
+	DailyHealthHistory map[int32]DailyHealthStatus `json:"daily_health_history,omitempty"`
 }
 
-func toSystemServiceStatusHistory(apiServiceStatusHistory *porterv1.SystemServiceStatusHistory) (SystemServiceStatusHistory, error) {
-	if apiServiceStatusHistory == nil {
+func toSystemServiceStatusHistory(protoSystemServiceStatusHistory *porterv1.SystemServiceStatusHistory) (SystemServiceStatusHistory, error) {
+	if protoSystemServiceStatusHistory == nil {
 		return SystemServiceStatusHistory{}, errors.New("unexpected nil: SystemServiceStatusHistory")
 	}
-	systemService, err := toSystemService(apiServiceStatusHistory.SystemService)
+	systemService, err := toSystemService(protoSystemServiceStatusHistory.SystemService)
 	if err != nil {
 		return SystemServiceStatusHistory{}, err
 	}
 	resp := SystemServiceStatusHistory{
-		SystemService: systemService,
-		StatusHistory: []ServiceStatus{},
+		SystemService:      systemService,
+		DailyHealthHistory: map[int32]DailyHealthStatus{},
 	}
-	for _, apiStatus := range apiServiceStatusHistory.StatusHistory {
-		status, err := toStatus(apiStatus.Status)
-		if err != nil {
-			return resp, err
-		}
-		resp.StatusHistory = append(resp.StatusHistory, ServiceStatus{
-			Timestamp: apiStatus.TimestampField.AsTime(),
-			Status:    status,
-		})
+	for day, protoDailyHealthStatus := range protoSystemServiceStatusHistory.DailyStatusHistory {
+		resp.DailyHealthHistory[day] = toDailyHealthStatus(protoDailyHealthStatus)
 	}
 	return resp, nil
 }
 
-func toSystemService(apiSystemService *porterv1.SystemService) (SystemService, error) {
-	if apiSystemService == nil {
-		return SystemService{}, errors.New("unexpected nil: SystemService")
-	}
-	return SystemService{
-		Name:               apiSystemService.Name,
-		Namespace:          apiSystemService.Namespace,
-		InvolvedObjectType: toInternalInvolvedObjectType(apiSystemService.InvolvedObjectType),
-	}, nil
+// SystemStatusHistory contains the system infrastructure status for a cluster
+type SystemStatusHistory struct {
+	ClusterStatusHistories       map[ClusterHealthType]map[int32]DailyHealthStatus `json:"cluster_status_histories,omitempty"`
+	SystemServiceStatusHistories []SystemServiceStatusHistory                      `json:"system_service_status_histories,omitempty"`
 }
 
-func toInternalInvolvedObjectType(apiType porterv1.InvolvedObjectType) InvolvedObjectType {
-	switch apiType {
-	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DEPLOYMENT:
-		return ServiceDeployment
-	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_STATEFULSET:
-		return ServiceStatefulSet
-	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DAEMONSET:
-		return ServiceDaemonSet
-	default:
-		return ""
+// ToSystemStatusHistory converts the CCP response to the internal SystemStatusHistory
+func ToSystemStatusHistory(apiResp *porterv1.SystemStatusHistoryResponse) (SystemStatusHistory, error) {
+	if apiResp == nil {
+		return SystemStatusHistory{}, fmt.Errorf("nil system service status response")
 	}
-}
-
-func toStatus(apiStatus porterv1.Status) (Status, error) {
-	switch apiStatus {
-	case porterv1.Status_STATUS_HEALTHY:
-		return StatusHealthy, nil
-	case porterv1.Status_STATUS_PARTIAL_FAILURE:
-		return StatusPartialFailure, nil
-	case porterv1.Status_STATUS_FAILURE:
-		return StatusFailure, nil
-	default:
-		return "", errors.New("unknown service status")
+	resp := SystemStatusHistory{
+		ClusterStatusHistories:       map[ClusterHealthType]map[int32]DailyHealthStatus{},
+		SystemServiceStatusHistories: []SystemServiceStatusHistory{},
+	}
+	for _, clusterHealthHistory := range apiResp.ClusterStatusHistories {
+		clusterHealthType, err := toClusterHealthType(clusterHealthHistory.ClusterHealthType)
+		if err != nil {
+			return resp, fmt.Errorf("unknown cluster health type: %s", clusterHealthHistory.ClusterHealthType)
+		}
+		// We don't expect duplicate cluster health types in the response, so overwriting the entry for this type should be safe
+		resp.ClusterStatusHistories[clusterHealthType] = map[int32]DailyHealthStatus{}
+		for day, protoDailyHealthStatus := range clusterHealthHistory.DailyStatusHistory {
+			dailyHealthStatus := toDailyHealthStatus(protoDailyHealthStatus)
+			resp.ClusterStatusHistories[clusterHealthType][day] = dailyHealthStatus
+		}
+	}
+	for _, apiServiceStatusHistory := range apiResp.SystemServiceStatusHistories {
+		statusHistory, err := toSystemServiceStatusHistory(apiServiceStatusHistory)
+		if err != nil {
+			return resp, err
+		}
+		resp.SystemServiceStatusHistories = append(resp.SystemServiceStatusHistories, statusHistory)
 	}
+	return resp, nil
 }

+ 76 - 60
dashboard/src/main/home/infrastructure-dashboard/tabs/SystemStatus.tsx

@@ -13,11 +13,8 @@ import { useClusterContext } from "../ClusterContextProvider";
 type Props = {};
 
 type StatusData = {
-  cluster_responsive: Array<{
-    timestamp: string;
-    responsive: boolean;
-  }>;
-  services: Record<string, GroupedService[]>;
+  cluster_health_histories: Record<string, Record<number, DailyHealthStatus>>;
+  service_health_histories_grouped: Record<string, GroupedService[]>;
 };
 
 type SystemService = {
@@ -26,43 +23,50 @@ type SystemService = {
   involved_object_type: string;
 };
 
-type SystemStatus = {
-  timestamp: string;
-  status: "failure" | "healthy" | "partial_failure";
+type HealthStatus = {
+  start_time: string;
+  end_time: string;
+  status: "failure" | "healthy" | "partial_failure" | "undefined";
+  description: string;
 };
 
-type Service = {
+type DailyHealthStatus = {
+  status_percentages: Record<string, number>;
+  health_statuses: HealthStatus[];
+}
+
+type ServiceStatusHistory = {
   system_service: SystemService;
-  status_history: SystemStatus[];
+  daily_health_history: Record<number, DailyHealthStatus>;
 };
 
 // If you're also grouping services by namespace and want a type for the grouped structure:
 type GroupedService = {
   system_service: SystemService;
-  status_history: SystemStatus[];
+  daily_health_history: Record<number, DailyHealthStatus>;
 };
 
 type GroupedServices = Record<string, GroupedService[]>;
 
 // Initialize statusData with empty arrays
 const initialState: StatusData = {
-  cluster_responsive: [],
-  services: {},
+  cluster_health_histories: {},
+  service_health_histories_grouped: {},
 };
 
-const groupServicesByNamespace = (services: Service[]): GroupedServices => {
-  return services.reduce<GroupedServices>((acc, service) => {
-    const { namespace } = service.system_service;
+const groupServicesByNamespace = (serviceStatusHistories: ServiceStatusHistory[]): GroupedServices => {
+  return serviceStatusHistories.reduce<GroupedServices>((acc, serviceStatusHistory) => {
+    const { namespace } = serviceStatusHistory.system_service;
     if (!acc[namespace]) {
       acc[namespace] = [];
     }
     acc[namespace].push({
       system_service: {
-        name: service.system_service.name,
-        namespace: service.system_service.namespace,
-        involved_object_type: service.system_service.involved_object_type,
+        name: serviceStatusHistory.system_service.name,
+        namespace: serviceStatusHistory.system_service.namespace,
+        involved_object_type: serviceStatusHistory.system_service.involved_object_type,
       },
-      status_history: service.status_history,
+      daily_health_history: serviceStatusHistory.daily_health_history,
     });
     return acc;
   }, {});
@@ -92,8 +96,8 @@ const SystemStatus: React.FC<Props> = () => {
           data.system_service_status_histories
         );
         setStatusData({
-          cluster_responsive: data.cluster_status_history,
-          services: groupedServices,
+          cluster_health_histories: data.cluster_status_histories,
+          service_health_histories_grouped: groupedServices,
         });
       })
       .catch((err) => {
@@ -103,40 +107,54 @@ const SystemStatus: React.FC<Props> = () => {
 
   return (
     <>
-      <Expandable
-        alt
-        preExpanded
-        header={
-          <Container row>
-            <Text size={16}>Cluster reachable</Text>
-            <Spacer x={1} inline />
-            <Text color="#01a05d">Operational</Text>
-          </Container>
-        }
-      >
-        <StatusBars>
-          {Array.from({ length: 90 }).map((_, i) => {
-            const statusIndex = 89 - i;
-            const responsive =
-              statusData?.cluster_responsive[statusIndex]?.responsive || true; // Provide "true" as the default value
-            return (
-              <Bar
-                key={i}
-                isFirst={i === 0}
-                isLast={i === 89}
-                status={responsive ? "healthy" : "failure"} // Use "responsive" if the value is true, otherwise "unknown"
-              />
-            );
-          })}
-        </StatusBars>
-        <Spacer y={0.5} />
+      <React.Fragment key={"Cluster Health"}>
+        <Spacer y={1} />
+        <Expandable
+          alt
+          preExpanded
+          header={
+            <Container row>
+              <Text size={16}>{"Cluster Health"}</Text>
+              <Spacer x={1} inline />
+              <Text color="#01a05d">Operational</Text>
+            </Container>
+          }
+        >
+          {
+            statusData?.cluster_health_histories &&
+            Object.keys(statusData?.cluster_health_histories).map((key) => {
+              return (
+                <React.Fragment key={key}>
+                  <Text color="helper">{key}</Text>
+                  <Spacer y={0.25} />
+                  <StatusBars>
+                    {Array.from({ length: 90 }).map((_, i) => {
+                      const status =
+                        statusData?.cluster_health_histories[key][89 - i] ? "failure" : "healthy";
+                      return (
+                        <Bar
+                          key={i}
+                          isFirst={i === 0}
+                          isLast={i === 89}
+                          status={status}
+                        />
+                      );
+                    })}
+                  </StatusBars>
+                  <Spacer y={0.25} />
+                </React.Fragment>
+              );
+            })}
+        </Expandable>
+        <Spacer y={0.25} />
         <Container row spaced>
           <Text color="helper">90 days ago</Text>
           <Text color="helper">Today</Text>
         </Container>
-      </Expandable>
-      {statusData?.services &&
-        Object.keys(statusData?.services).map((key) => {
+      </React.Fragment>
+
+      {statusData?.service_health_histories_grouped &&
+        Object.keys(statusData?.service_health_histories_grouped).map((key) => {
           return (
             <React.Fragment key={key}>
               <Spacer y={1} />
@@ -151,23 +169,21 @@ const SystemStatus: React.FC<Props> = () => {
                   </Container>
                 }
               >
-                {statusData.services[key].map((service: Service) => {
+                {statusData.service_health_histories_grouped[key].map((serviceStatusHistory: ServiceStatusHistory) => {
                   return (
-                    <React.Fragment key={service.system_service.name}>
-                      <Text color="helper">{service.system_service.name}</Text>
+                    <React.Fragment key={serviceStatusHistory.system_service.name}>
+                      <Text color="helper">{serviceStatusHistory.system_service.name}</Text>
                       <Spacer y={0.25} />
                       <StatusBars>
                         {Array.from({ length: 90 }).map((_, i) => {
-                          const statusIndex = 89 - i;
+                          const status =
+                            (serviceStatusHistory.daily_health_history?.[89 - i] === undefined) ? "healthy" : "failure";
                           return (
                             <Bar
                               key={i}
                               isFirst={i === 0}
                               isLast={i === 89}
-                              status={
-                                service.status_history[statusIndex]?.status ||
-                                "healthy"
-                              }
+                              status={status}
                             />
                           );
                         })}

+ 1 - 1
go.mod

@@ -85,7 +85,7 @@ require (
 	github.com/matryer/is v1.4.0
 	github.com/nats-io/nats.go v1.24.0
 	github.com/open-policy-agent/opa v0.44.0
-	github.com/porter-dev/api-contracts v0.2.155
+	github.com/porter-dev/api-contracts v0.2.156
 	github.com/riandyrn/otelchi v0.5.1
 	github.com/santhosh-tekuri/jsonschema/v5 v5.0.1
 	github.com/stefanmcshane/helm v0.0.0-20221213002717-88a4a2c6e77d

+ 2 - 2
go.sum

@@ -1552,8 +1552,8 @@ github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/polyfloyd/go-errorlint v0.0.0-20210722154253-910bb7978349/go.mod h1:wi9BfjxjF/bwiZ701TzmfKu6UKC357IOAtNr0Td0Lvw=
-github.com/porter-dev/api-contracts v0.2.155 h1:sRCFsoq1Gvtm/rnvNknSs7CrSvGLZD7Mp7sgy20kB4I=
-github.com/porter-dev/api-contracts v0.2.155/go.mod h1:VV5BzXd02ZdbWIPLVP+PX3GKawJSGQnxorVT2sUZALU=
+github.com/porter-dev/api-contracts v0.2.156 h1:IooB1l6tl+jiGecj2IzYsPoIJxnePaJntDpKSwJBxgc=
+github.com/porter-dev/api-contracts v0.2.156/go.mod h1:VV5BzXd02ZdbWIPLVP+PX3GKawJSGQnxorVT2sUZALU=
 github.com/porter-dev/switchboard v0.0.3 h1:dBuYkiVLa5Ce7059d6qTe9a1C2XEORFEanhbtV92R+M=
 github.com/porter-dev/switchboard v0.0.3/go.mod h1:xSPzqSFMQ6OSbp42fhCi4AbGbQbsm6nRvOkrblFeXU4=
 github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI=

+ 35 - 0
internal/models/cluster_health_report.go

@@ -0,0 +1,35 @@
+package models
+
+import (
+	"database/sql"
+
+	"github.com/google/uuid"
+	"github.com/jmoiron/sqlx/types"
+	"gorm.io/gorm"
+)
+
+// ClusterHealthReport represents a cluster health record in the database
+type ClusterHealthReport struct {
+	gorm.Model
+
+	ID        uuid.UUID    `gorm:"type:uuid;primaryKey" json:"id"`
+	CreatedAt sql.NullTime `json:"created_at"`
+	UpdatedAt sql.NullTime `json:"updated_at"`
+
+	// ProjectID is the ID of the project that this cluster belongs to
+	ProjectID uint `json:"project_id"`
+
+	// ClusterID is the ID of the cluster that this health record belongs to
+	ClusterID uint `json:"cluster_id"`
+
+	// Type is the type of health check
+	Type string `json:"type"`
+
+	// Metadata is the metadata associated with the health check
+	Metadata types.JSONText `json:"metadata"`
+}
+
+// TableName overrides the table name
+func (ClusterHealthReport) TableName() string {
+	return "cluster_health_report"
+}

+ 19 - 8
internal/models/system_service_status.go

@@ -1,6 +1,7 @@
 package models
 
 import (
+	"database/sql"
 	"time"
 
 	"github.com/google/uuid"
@@ -19,17 +20,27 @@ type SystemServiceStatus struct {
 	// UpdatedAt is the time (UTC) that the status was last updated.
 	UpdatedAt time.Time `json:"updated_at"`
 
-	ProjectID uint `json:"project_id"`
+	// StartTime is the time at which the status was first observed
+	StartTime sql.NullTime `db:"start_time"`
 
-	ClusterID uint `json:"cluster_id"`
+	// EndTime is the time at which the status was last observed
+	// If null, this means the status might not have been resolved yet
+	EndTime sql.NullTime `db:"end_time"`
 
-	// InvolvedObjectType is the type of k8s object that the service runs
-	// this is currently expected to be "Deployment", "StatefulSet" or "DaemonSet"
-	InvolvedObjectType string `json:"involved_object_type"`
+	// ProjectID is the ID of the project that this system service belongs to
+	ProjectID uint `db:"project_id"`
 
-	Name string `json:"name"`
+	// ClusterID is the ID of the cluster that this system service belongs to
+	ClusterID uint `db:"cluster_id"`
 
-	Namespace string `json:"namespace"`
+	// InvolvedObjectType is the type of Kubernetes object this service is
+	InvolvedObjectType string `db:"involved_object_type"`
+
+	Name string `db:"name"`
+
+	Namespace string `db:"namespace"`
+
+	Severity string `db:"severity"`
 
 	// Any other relevant metadata. This field allows us to be flexible in the future.
 	Metadata JSONB `json:"metadata" sql:"type:jsonb" gorm:"type:jsonb"`
@@ -37,5 +48,5 @@ type SystemServiceStatus struct {
 
 // TableName overrides the table name
 func (SystemServiceStatus) TableName() string {
-	return "system_service_status"
+	return "system_service_status_v2"
 }

+ 1 - 0
internal/repository/gorm/migrate.go

@@ -87,5 +87,6 @@ func AutoMigrate(db *gorm.DB, debug bool) error {
 		&ints.SlackIntegration{},
 		&models.Ipam{},
 		&models.AppEventWebhooks{},
+		&models.ClusterHealthReport{},
 	)
 }