|
|
@@ -421,7 +421,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
var ok bool
|
|
|
defaultRegion, ok = util.GetRegion(nodeList[0].Labels)
|
|
|
if !ok {
|
|
|
- log.DedupedWarningf(5, "Failed to locate default region")
|
|
|
+ log.DedupedWarningf(5, "Failed to read default region from labels on node %s", nodeList[0].Name)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -471,7 +471,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
// TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
|
|
|
nodes, err := cmme.Model.GetNodeCost(cmme.CloudProvider)
|
|
|
if err != nil {
|
|
|
- log.Warnf("Metric emission: error getting Node cost: %s", err)
|
|
|
+ log.Warnf("Error getting Node cost: %s", err)
|
|
|
}
|
|
|
for nodeName, node := range nodes {
|
|
|
// Emit costs, guarding against NaN inputs for custom pricing.
|
|
|
@@ -535,24 +535,27 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
// don't record cpuCost, ramCost, or gpuCost in the case of wild outliers
|
|
|
// k8s api sometimes causes cost spikes as described here:
|
|
|
// https://github.com/opencost/opencost/issues/927
|
|
|
- if cpuCost < outlierFactor*avgCosts.CpuCostAverage {
|
|
|
+ cpuOutlierCutoff := outlierFactor * avgCosts.CpuCostAverage
|
|
|
+ if cpuCost < cpuOutlierCutoff {
|
|
|
cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(cpuCost)
|
|
|
avgCosts.CpuCostAverage = (avgCosts.CpuCostAverage*avgCosts.NumCpuDataPoints + cpuCost) / (avgCosts.NumCpuDataPoints + 1)
|
|
|
avgCosts.NumCpuDataPoints += 1
|
|
|
} else {
|
|
|
- log.Warnf("CPU cost outlier detected; skipping data point.")
|
|
|
+ log.Debugf("CPU cost outlier detected; skipping data point: %s had %f as cost, which is above %f.", nodeName, cpuCost, cpuOutlierCutoff)
|
|
|
}
|
|
|
- if ramCost < outlierFactor*avgCosts.RamCostAverage {
|
|
|
+ ramOutlierCutoff := outlierFactor * avgCosts.RamCostAverage
|
|
|
+ if ramCost < ramOutlierCutoff {
|
|
|
cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(ramCost)
|
|
|
avgCosts.RamCostAverage = (avgCosts.RamCostAverage*avgCosts.NumRamDataPoints + ramCost) / (avgCosts.NumRamDataPoints + 1)
|
|
|
avgCosts.NumRamDataPoints += 1
|
|
|
} else {
|
|
|
- log.Warnf("RAM cost outlier detected; skipping data point.")
|
|
|
+ log.Debugf("RAM cost outlier detected; skipping data point: %s had %f as cost, which is above %f.", nodeName, ramCost, ramOutlierCutoff)
|
|
|
}
|
|
|
// skip redording totalCost if any constituent costs were outliers
|
|
|
- if cpuCost < outlierFactor*avgCosts.CpuCostAverage &&
|
|
|
- ramCost < outlierFactor*avgCosts.RamCostAverage {
|
|
|
+ if cpuCost < cpuOutlierCutoff && ramCost < ramOutlierCutoff {
|
|
|
cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(totalCost)
|
|
|
+ } else {
|
|
|
+ log.Debugf("CPU and RAM outlier detected, not recording node %s total cost %f", nodeName, totalCost)
|
|
|
}
|
|
|
|
|
|
nodeCostAverages[labelKey] = avgCosts
|
|
|
@@ -568,7 +571,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
// TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
|
|
|
loadBalancers, err := cmme.Model.GetLBCost(cmme.CloudProvider)
|
|
|
if err != nil {
|
|
|
- log.Warnf("Metric emission: error getting LoadBalancer cost: %s", err)
|
|
|
+ log.Warnf("Error getting LoadBalancer cost: %s", err)
|
|
|
}
|
|
|
for lbKey, lb := range loadBalancers {
|
|
|
// TODO: parse (if necessary) and calculate cost associated with loadBalancer based on dynamic cloud prices fetched into each lb struct on GetLBCost() call
|
|
|
@@ -644,7 +647,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
|
|
|
parameters, ok := storageClassMap[pv.Spec.StorageClassName]
|
|
|
if !ok {
|
|
|
- log.Debugf("Unable to find parameters for storage class \"%s\". Does pv \"%s\" have a storageClassName?", pv.Spec.StorageClassName, pv.Name)
|
|
|
+ log.Debugf("Unable to find parameters for storage class \"%s\". Pv \"%s\" might have an empty or invalid storageClassName.", pv.Spec.StorageClassName, pv.Name)
|
|
|
}
|
|
|
var region string
|
|
|
if r, ok := util.GetRegion(pv.Labels); ok {
|
|
|
@@ -670,44 +673,44 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
// longer exist
|
|
|
for labelString, seen := range nodeSeen {
|
|
|
if !seen {
|
|
|
- log.Debugf("Removing %s from nodes", labelString)
|
|
|
+ log.Debugf("Removing metrics for %s, no data observed recently", labelString)
|
|
|
labels := getLabelStringsFromKey(labelString)
|
|
|
|
|
|
ok := cmme.NodeTotalPriceRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from totalprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from totalprice", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from totalprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from totalprice", labels)
|
|
|
}
|
|
|
ok = cmme.NodeSpotRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from spot records", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from spot records", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from spot records", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from spot records", labels)
|
|
|
}
|
|
|
ok = cmme.CPUPriceRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from cpuprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from cpuprice", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from cpuprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from cpuprice", labels)
|
|
|
}
|
|
|
ok = cmme.GPUPriceRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from gpuprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from gpuprice", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from gpuprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from gpuprice", labels)
|
|
|
}
|
|
|
ok = cmme.GPUCountRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from gpucount", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from gpucount", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from gpucount", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from gpucount", labels)
|
|
|
}
|
|
|
ok = cmme.RAMPriceRecorder.DeleteLabelValues(labels...)
|
|
|
if ok {
|
|
|
- log.Debugf("removed %s from ramprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, removed from ramprice", labels)
|
|
|
} else {
|
|
|
- log.Errorf("FAILURE TO REMOVE %s from ramprice", labelString)
|
|
|
+ log.Debugf("No data observed for node with labels %v, but we could not remove it from ramprice", labels)
|
|
|
}
|
|
|
delete(nodeSeen, labelString)
|
|
|
delete(nodeCostAverages, labelString)
|
|
|
@@ -720,7 +723,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
labels := getLabelStringsFromKey(labelString)
|
|
|
ok := cmme.LBCostRecorder.DeleteLabelValues(labels...)
|
|
|
if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete LoadBalancer with labels: %v", labels)
|
|
|
+ log.Debugf("Failed to delete LoadBalancer with labels: %v", labels)
|
|
|
}
|
|
|
delete(loadBalancerSeen, labelString)
|
|
|
} else {
|
|
|
@@ -730,17 +733,21 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
for labelString, seen := range containerSeen {
|
|
|
if !seen {
|
|
|
labels := getLabelStringsFromKey(labelString)
|
|
|
- ok := cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
- if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete RAMAllocation with labels: %v", labels)
|
|
|
- }
|
|
|
- ok = cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
- if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete CPUAllocation with labels: %v", labels)
|
|
|
- }
|
|
|
- ok = cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
- if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete GPUAllocation with labels: %v", labels)
|
|
|
+ if labels[1] == unmountedPVsContainer { // special "pod" to contain the mounted PVs - does not have RAM/CPU/...
|
|
|
+ ok := cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
+ if !ok {
|
|
|
+ log.Debugf("Failed to delete RAMAllocation for container with labels: %v", labels)
|
|
|
+ }
|
|
|
+ ok = cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
+ if !ok {
|
|
|
+ log.Debugf("Failed to delete CPUAllocation for container with labels: %v", labels)
|
|
|
+ }
|
|
|
+ ok = cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
+ if !ok {
|
|
|
+ log.Debugf("Failed to delete GPUAllocation for container with labels: %v", labels)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ log.Debugf("Did not try to delete RAM/CPU/GPU for fake '%s' container: %v", unmountedPVsContainer, labels)
|
|
|
}
|
|
|
delete(containerSeen, labelString)
|
|
|
} else {
|
|
|
@@ -752,7 +759,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
labels := getLabelStringsFromKey(labelString)
|
|
|
ok := cmme.PersistentVolumePriceRecorder.DeleteLabelValues(labels...)
|
|
|
if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete PVPrice with labels: %v", labels)
|
|
|
+ log.Debugf("Failed to delete PVPrice for PV with labels: %v", labels)
|
|
|
}
|
|
|
delete(pvSeen, labelString)
|
|
|
} else {
|
|
|
@@ -764,7 +771,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
labels := getLabelStringsFromKey(labelString)
|
|
|
ok := cmme.PVAllocationRecorder.DeleteLabelValues(labels...)
|
|
|
if !ok {
|
|
|
- log.Errorf("Metric emission: failed to delete PVAllocation with labels: %v", labels)
|
|
|
+ log.Debugf("Failed to delete PVAllocation for PVC with labels: %v", labels)
|
|
|
}
|
|
|
delete(pvcSeen, labelString)
|
|
|
} else {
|