2
0

datasource.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. package source
  2. import (
  3. "time"
  4. "github.com/julienschmidt/httprouter"
  5. "github.com/opencost/opencost/core/pkg/clusters"
  6. "github.com/opencost/opencost/core/pkg/diagnostics"
  7. )
  // MetricsQuerier declares the queries available against a metrics data source.
  // Every method except QueryDataCoverage takes a start and end time.Time and
  // returns a *Future parameterized by a query-specific result type. Future and
  // the result types are declared outside this block.
  // NOTE(review): presumably each Future is resolved by the caller once the
  // underlying query completes — confirm against the Future declaration.
  8. type MetricsQuerier interface {
  9. // Cluster Disks
  10. // Local Cluster Disks
  11. QueryLocalStorageActiveMinutes(start, end time.Time) *Future[LocalStorageActiveMinutesResult]
  12. QueryLocalStorageUsedAvg(start, end time.Time) *Future[LocalStorageUsedAvgResult]
  13. QueryLocalStorageUsedMax(start, end time.Time) *Future[LocalStorageUsedMaxResult]
  14. QueryLocalStorageBytes(start, end time.Time) *Future[LocalStorageBytesResult]
  // NOTE(review): the KM-prefixed variants return UID-keyed result types; what
  // "KM" stands for is not visible in this file — confirm and spell it out.
  15. QueryKMLocalStorageUsedAvg(start, end time.Time) *Future[NodeUIDValueResult]
  16. QueryKMLocalStorageUsedMax(start, end time.Time) *Future[NodeUIDValueResult]
  17. QueryKMLocalStorageBytes(start, end time.Time) *Future[UIDValueResult]
  18. // Nodes
  19. QueryNodeInfo(start, end time.Time) *Future[NodeInfoResult]
  20. QueryNodeUptime(start, end time.Time) *Future[UptimeResult]
  21. QueryNodeActiveMinutes(start, end time.Time) *Future[NodeActiveMinutesResult]
  22. QueryNodeCPUCoresCapacity(start, end time.Time) *Future[NodeCPUCoresCapacityResult]
  23. QueryNodeCPUCoresAllocatable(start, end time.Time) *Future[NodeCPUCoresAllocatableResult]
  24. QueryNodeRAMBytesCapacity(start, end time.Time) *Future[NodeRAMBytesCapacityResult]
  25. QueryNodeRAMBytesAllocatable(start, end time.Time) *Future[NodeRAMBytesAllocatableResult]
  26. QueryNodeGPUCount(start, end time.Time) *Future[NodeGPUCountResult]
  27. QueryNodeCPUModeTotal(start, end time.Time) *Future[NodeCPUModeTotalResult]
  28. QueryNodeIsSpot(start, end time.Time) *Future[NodeIsSpotResult]
  29. QueryNodeRAMSystemPercent(start, end time.Time) *Future[NodeRAMSystemPercentResult]
  30. QueryNodeRAMUserPercent(start, end time.Time) *Future[NodeRAMUserPercentResult]
  31. QueryNodeResourceCapacities(start, end time.Time) *Future[ResourceResult]
  32. QueryNodeResourcesAllocatable(start, end time.Time) *Future[ResourceResult]
  33. // Load Balancers
  34. QueryLBActiveMinutes(start, end time.Time) *Future[LBActiveMinutesResult]
  35. QueryLBPricePerHr(start, end time.Time) *Future[LBPricePerHrResult]
  36. // Cluster Management
  37. QueryClusterInfo(start, end time.Time) *Future[ClusterInfoResult]
  38. QueryClusterUptime(start, end time.Time) *Future[UptimeResult]
  39. QueryClusterManagementDuration(start, end time.Time) *Future[ClusterManagementDurationResult]
  40. QueryClusterManagementPricePerHr(start, end time.Time) *Future[ClusterManagementPricePerHrResult]
  41. // Pods
  42. QueryPods(start, end time.Time) *Future[PodsResult]
  43. QueryPodsUID(start, end time.Time) *Future[PodsResult]
  44. QueryPodInfo(start, end time.Time) *Future[PodInfoResult]
  45. QueryPodUptime(start, end time.Time) *Future[UptimeResult]
  46. QueryPodOwners(start, end time.Time) *Future[OwnerResult]
  47. QueryPodPVCVolumes(start, end time.Time) *Future[PodPVCVolumeResult]
  48. QueryPodNetworkEgressBytes(start, end time.Time) *Future[PodNetworkBytesResult]
  49. QueryPodNetworkIngressBytes(start, end time.Time) *Future[PodNetworkBytesResult]
  50. // Containers
  51. QueryContainerUptime(start, end time.Time) *Future[ContainerUptimeResult]
  52. QueryContainerResourceRequests(start, end time.Time) *Future[ContainerResourceResult]
  53. QueryContainerResourceLimits(start, end time.Time) *Future[ContainerResourceResult]
  54. // RAM
  55. QueryRAMBytesAllocated(start, end time.Time) *Future[RAMBytesAllocatedResult]
  56. QueryRAMRequests(start, end time.Time) *Future[RAMRequestsResult]
  57. QueryRAMLimits(start, end time.Time) *Future[RAMLimitsResult]
  58. QueryRAMUsageAvg(start, end time.Time) *Future[RAMUsageAvgResult]
  59. QueryRAMUsageMax(start, end time.Time) *Future[RAMUsageMaxResult]
  60. QueryNodeRAMPricePerGiBHr(start, end time.Time) *Future[NodeRAMPricePerGiBHrResult]
  61. // CPU
  62. QueryCPUCoresAllocated(start, end time.Time) *Future[CPUCoresAllocatedResult]
  63. QueryCPURequests(start, end time.Time) *Future[CPURequestsResult]
  64. QueryCPULimits(start, end time.Time) *Future[CPULimitsResult]
  65. QueryCPUUsageAvg(start, end time.Time) *Future[CPUUsageAvgResult]
  66. QueryCPUUsageMax(start, end time.Time) *Future[CPUUsageMaxResult]
  67. QueryNodeCPUPricePerHr(start, end time.Time) *Future[NodeCPUPricePerHrResult]
  68. // GPU
  69. QueryGPUsAllocated(start, end time.Time) *Future[GPUsAllocatedResult]
  70. QueryGPUsRequested(start, end time.Time) *Future[GPUsRequestedResult]
  71. QueryGPUsUsageAvg(start, end time.Time) *Future[GPUsUsageAvgResult]
  72. QueryGPUsUsageMax(start, end time.Time) *Future[GPUsUsageMaxResult]
  73. QueryNodeGPUPricePerHr(start, end time.Time) *Future[NodeGPUPricePerHrResult]
  74. QueryGPUInfo(start, end time.Time) *Future[GPUInfoResult]
  75. QueryIsGPUShared(start, end time.Time) *Future[IsGPUSharedResult]
  76. // DCGM devices
  77. QueryDCGMDeviceInfo(start, end time.Time) *Future[DCGMDeviceInfoResult]
  78. QueryDCGMDeviceUptime(start, end time.Time) *Future[DCGMDeviceUptimeResult]
  79. QueryDCGMContainerUsageAvg(start, end time.Time) *Future[DCGMDeviceContainerUsageResult]
  80. QueryDCGMContainerUsageMax(start, end time.Time) *Future[DCGMDeviceContainerUsageResult]
  81. // GPU saturation (USE method). Sources return no results when the
  82. // underlying DCGM fields are unavailable; absence is never zero.
  83. QueryGPUThrottleViolationRatio(start, end time.Time) *Future[GPUSaturationResult]
  84. QueryGPUThrottleReasonRatio(start, end time.Time) *Future[GPUSaturationResult]
  85. QueryGPUMemoryUsedRatioAvg(start, end time.Time) *Future[GPUSaturationResult]
  86. QueryGPUMemoryUsedRatioMax(start, end time.Time) *Future[GPUSaturationResult]
  87. QueryGPUMemoryPressureRatio(start, end time.Time) *Future[GPUSaturationResult]
  88. QueryGPUXIDErrorCount(start, end time.Time) *Future[GPUSaturationResult]
  89. QueryGPUDRAMActiveAvg(start, end time.Time) *Future[GPUSaturationResult]
  90. QueryGPUDRAMActiveMax(start, end time.Time) *Future[GPUSaturationResult]
  91. QueryGPUSMActiveAvg(start, end time.Time) *Future[GPUSaturationResult]
  92. QueryGPUSMOccupancyAvg(start, end time.Time) *Future[GPUSaturationResult]
  93. QueryGPUPCIeTxBytesAvg(start, end time.Time) *Future[GPUSaturationResult]
  94. QueryGPUPCIeRxBytesAvg(start, end time.Time) *Future[GPUSaturationResult]
  95. QueryGPUNVLinkTxBytesAvg(start, end time.Time) *Future[GPUSaturationResult]
  96. QueryGPUNVLinkRxBytesAvg(start, end time.Time) *Future[GPUSaturationResult]
  97. // Device-level GPU metrics (DeviceInfo / DevicePerformance support):
  98. // grouped by device identity only, all from the default dcgm-exporter
  99. // configuration.
  100. QueryGPUDevicePowerAvg(start, end time.Time) *Future[GPUDeviceMetricResult]
  101. QueryGPUDeviceTempAvg(start, end time.Time) *Future[GPUDeviceMetricResult]
  102. QueryGPUDeviceUsageAvg(start, end time.Time) *Future[GPUDeviceMetricResult]
  103. QueryGPUDeviceUsageMax(start, end time.Time) *Future[GPUDeviceMetricResult]
  104. QueryGPUDeviceMemoryUsedAvg(start, end time.Time) *Future[GPUDeviceMetricResult]
  105. QueryGPUDeviceMemoryUsedMax(start, end time.Time) *Future[GPUDeviceMetricResult]
  106. // PVC
  107. QueryPodPVCAllocation(start, end time.Time) *Future[PodPVCAllocationResult]
  108. QueryPVCBytesRequested(start, end time.Time) *Future[PVCBytesRequestedResult]
  109. QueryPVCInfo(start, end time.Time) *Future[PVCInfoResult]
  110. QueryKMPVCInfo(start, end time.Time) *Future[PVCInfoResult]
  111. QueryPVCUptime(start, end time.Time) *Future[UptimeResult]
  112. QueryPVCBytesUsedAverage(start, end time.Time) *Future[PVCUIDValueResult]
  113. QueryPVCBytesUsedMax(start, end time.Time) *Future[PVCUIDValueResult]
  114. // PV
  115. QueryPVBytes(start, end time.Time) *Future[PVBytesResult]
  116. QueryPVPricePerGiBHour(start, end time.Time) *Future[PVPricePerGiBHourResult]
  117. QueryPVInfo(start, end time.Time) *Future[PVInfoResult]
  118. QueryPVActiveMinutes(start, end time.Time) *Future[PVActiveMinutesResult]
  119. QueryPVUsedAverage(start, end time.Time) *Future[PVUsedAvgResult]
  120. QueryPVUsedMax(start, end time.Time) *Future[PVUsedMaxResult]
  121. QueryKMPVInfo(start, end time.Time) *Future[PVInfoResult]
  122. QueryPVUptime(start, end time.Time) *Future[UptimeResult]
  123. // Deployment
  124. QueryDeploymentInfo(start, end time.Time) *Future[DeploymentInfoResult]
  125. QueryDeploymentUptime(start, end time.Time) *Future[UptimeResult]
  126. QueryDeploymentLabels(start, end time.Time) *Future[LabelsResult]
  127. QueryDeploymentAnnotations(start, end time.Time) *Future[AnnotationsResult]
  128. QueryDeploymentMatchLabels(start, end time.Time) *Future[DeploymentLabelsResult]
  129. // StatefulSet
  130. QueryStatefulSetInfo(start, end time.Time) *Future[StatefulSetInfoResult]
  131. QueryStatefulSetUptime(start, end time.Time) *Future[UptimeResult]
  132. QueryStatefulSetLabels(start, end time.Time) *Future[LabelsResult]
  133. QueryStatefulSetAnnotations(start, end time.Time) *Future[AnnotationsResult]
  134. QueryStatefulSetMatchLabels(start, end time.Time) *Future[StatefulSetLabelsResult]
  135. // DaemonSet
  136. QueryDaemonSetInfo(start, end time.Time) *Future[DaemonSetInfoResult]
  137. QueryDaemonSetUptime(start, end time.Time) *Future[UptimeResult]
  138. QueryDaemonSetLabels(start, end time.Time) *Future[LabelsResult]
  139. QueryDaemonSetAnnotations(start, end time.Time) *Future[AnnotationsResult]
  140. // Job
  141. QueryJobInfo(start, end time.Time) *Future[JobInfoResult]
  142. QueryJobUptime(start, end time.Time) *Future[UptimeResult]
  143. QueryJobLabels(start, end time.Time) *Future[LabelsResult]
  144. QueryJobAnnotations(start, end time.Time) *Future[AnnotationsResult]
  145. // CronJob
  146. QueryCronJobInfo(start, end time.Time) *Future[CronJobInfoResult]
  147. QueryCronJobUptime(start, end time.Time) *Future[UptimeResult]
  148. QueryCronJobLabels(start, end time.Time) *Future[LabelsResult]
  149. QueryCronJobAnnotations(start, end time.Time) *Future[AnnotationsResult]
  150. // ReplicaSet
  151. QueryReplicaSetInfo(start, end time.Time) *Future[ReplicaSetInfoResult]
  152. QueryReplicaSetUptime(start, end time.Time) *Future[UptimeResult]
  153. QueryReplicaSetLabels(start, end time.Time) *Future[LabelsResult]
  154. QueryReplicaSetAnnotations(start, end time.Time) *Future[AnnotationsResult]
  155. QueryReplicaSetOwners(start, end time.Time) *Future[OwnerResult]
  156. // Namespace
  157. QueryNamespaceInfo(start, end time.Time) *Future[NamespaceInfoResult]
  158. QueryNamespaceUptime(start, end time.Time) *Future[UptimeResult]
  159. // Service
  160. QueryServiceInfo(start, end time.Time) *Future[ServiceInfoResult]
  161. QueryServiceUptime(start, end time.Time) *Future[UptimeResult]
  162. QueryServiceSelectorLabels(start, end time.Time) *Future[ServiceLabelsResult]
  163. // Network Egress
  164. QueryNetZoneGiB(start, end time.Time) *Future[NetZoneGiBResult]
  165. QueryNetZonePricePerGiB(start, end time.Time) *Future[NetZonePricePerGiBResult]
  166. QueryNetRegionGiB(start, end time.Time) *Future[NetRegionGiBResult]
  167. QueryNetRegionPricePerGiB(start, end time.Time) *Future[NetRegionPricePerGiBResult]
  168. QueryNetInternetGiB(start, end time.Time) *Future[NetInternetGiBResult]
  169. QueryNetInternetPricePerGiB(start, end time.Time) *Future[NetInternetPricePerGiBResult]
  170. QueryNetInternetServiceGiB(start, end time.Time) *Future[NetInternetServiceGiBResult]
  171. QueryNetNatGatewayPricePerGiB(start, end time.Time) *Future[NetNatGatewayPricePerGiBResult]
  172. QueryNetNatGatewayGiB(start, end time.Time) *Future[NetNatGatewayGiBResult]
  173. QueryNetTransferBytes(start, end time.Time) *Future[NetTransferBytesResult]
  174. // Network Ingress
  175. QueryNetZoneIngressGiB(start, end time.Time) *Future[NetZoneIngressGiBResult]
  176. QueryNetRegionIngressGiB(start, end time.Time) *Future[NetRegionIngressGiBResult]
  177. QueryNetInternetIngressGiB(start, end time.Time) *Future[NetInternetIngressGiBResult]
  178. QueryNetInternetServiceIngressGiB(start, end time.Time) *Future[NetInternetServiceIngressGiBResult]
  // Note: the ingress NAT gateway price query reuses the egress result type
  // (NetNatGatewayPricePerGiBResult) rather than an ingress-specific one.
  179. QueryNetNatGatewayIngressPricePerGiB(start, end time.Time) *Future[NetNatGatewayPricePerGiBResult]
  180. QueryNetNatGatewayIngressGiB(start, end time.Time) *Future[NetNatGatewayIngressGiBResult]
  181. QueryNetReceiveBytes(start, end time.Time) *Future[NetReceiveBytesResult]
  182. // Annotations
  183. QueryNamespaceAnnotations(start, end time.Time) *Future[NamespaceAnnotationsResult]
  184. QueryPodAnnotations(start, end time.Time) *Future[PodAnnotationsResult]
  185. // Labels
  186. QueryNodeLabels(start, end time.Time) *Future[NodeLabelsResult]
  187. QueryNamespaceLabels(start, end time.Time) *Future[NamespaceLabelsResult]
  188. QueryPodLabels(start, end time.Time) *Future[PodLabelsResult]
  // Pod owner lookups (DaemonSet and Job owners, judging by the method names);
  // these are grouped under "Labels" above but are not label queries.
  189. QueryPodsWithDaemonSetOwner(start, end time.Time) *Future[PodsWithDaemonSetOwnerResult]
  190. QueryPodsWithJobOwner(start, end time.Time) *Future[PodsWithJobOwnerResult]
  191. // ReplicaSet -> Controller mapping
  192. QueryPodsWithReplicaSetOwner(start, end time.Time) *Future[PodsWithReplicaSetOwnerResult]
  193. QueryReplicaSetsWithoutOwners(start, end time.Time) *Future[ReplicaSetsWithoutOwnersResult]
  194. QueryReplicaSetsWithRollout(start, end time.Time) *Future[ReplicaSetsWithRolloutResult]
  195. // ResourceQuotas
  196. QueryResourceQuotaInfo(start, end time.Time) *Future[ResourceQuotaInfoResult]
  197. QueryResourceQuotaUptime(start, end time.Time) *Future[UptimeResult]
  198. QueryResourceQuotaSpecCPURequestAverage(start, end time.Time) *Future[ResourceResult]
  199. QueryResourceQuotaSpecCPURequestMax(start, end time.Time) *Future[ResourceResult]
  200. QueryResourceQuotaSpecRAMRequestAverage(start, end time.Time) *Future[ResourceResult]
  201. QueryResourceQuotaSpecRAMRequestMax(start, end time.Time) *Future[ResourceResult]
  202. QueryResourceQuotaSpecCPULimitAverage(start, end time.Time) *Future[ResourceResult]
  203. QueryResourceQuotaSpecCPULimitMax(start, end time.Time) *Future[ResourceResult]
  204. QueryResourceQuotaSpecRAMLimitAverage(start, end time.Time) *Future[ResourceResult]
  205. QueryResourceQuotaSpecRAMLimitMax(start, end time.Time) *Future[ResourceResult]
  206. QueryResourceQuotaStatusUsedCPURequestAverage(start, end time.Time) *Future[ResourceResult]
  207. QueryResourceQuotaStatusUsedCPURequestMax(start, end time.Time) *Future[ResourceResult]
  208. QueryResourceQuotaStatusUsedRAMRequestAverage(start, end time.Time) *Future[ResourceResult]
  209. QueryResourceQuotaStatusUsedRAMRequestMax(start, end time.Time) *Future[ResourceResult]
  210. QueryResourceQuotaStatusUsedCPULimitAverage(start, end time.Time) *Future[ResourceResult]
  211. QueryResourceQuotaStatusUsedCPULimitMax(start, end time.Time) *Future[ResourceResult]
  212. QueryResourceQuotaStatusUsedRAMLimitAverage(start, end time.Time) *Future[ResourceResult]
  213. QueryResourceQuotaStatusUsedRAMLimitMax(start, end time.Time) *Future[ResourceResult]
  214. // Data Coverage Query
  // QueryDataCoverage is the only method here that does not return a Future: it
  // takes a limit in days and returns two time.Time values and an error.
  // NOTE(review): presumably the two times bound the window for which data is
  // available, looking back at most limitDays — confirm against implementations.
  215. QueryDataCoverage(limitDays int) (time.Time, time.Time, error)
  216. }
  // OpenCostDataSource is the interface a metrics data source exposes. It covers
  // endpoint and diagnostics registration, access to a MetricsQuerier, cluster
  // metadata accessors, and two duration settings.
  217. type OpenCostDataSource interface {
  218. // RegisterEndPoints registers any custom endpoints that can be used for diagnostics or debug purposes.
  219. RegisterEndPoints(router *httprouter.Router)
  220. // RegisterDiagnostics registers any custom data source diagnostics with the `DiagnosticService` that can
  221. // be used to report externally.
  222. RegisterDiagnostics(diagService diagnostics.DiagnosticService)
  223. // Metrics returns a MetricsQuerier that can be used to query historical metrics data from the data source.
  224. Metrics() MetricsQuerier
  225. // ClusterMap returns a mapping of cluster identifier to ClusterInfo for all known clusters (local only for
  226. // single cluster deployments).
  227. ClusterMap() clusters.ClusterMap
  228. // ClusterInfo returns the ClusterInfoProvider for the local cluster.
  229. ClusterInfo() clusters.ClusterInfoProvider
  // BatchDuration returns a time.Duration.
  // NOTE(review): presumably the time span covered by a single query batch — confirm
  // against implementations and replace this note with the actual contract.
  230. BatchDuration() time.Duration
  // Resolution returns a time.Duration.
  // NOTE(review): presumably the sample resolution (step) used when querying the
  // data source — confirm against implementations.
  231. Resolution() time.Duration
  232. }