Просмотр исходного кода

Merge branch 'develop' into claude/opencost-security-alert-hmaC8

Alex Meijer 3 недели назад
Родитель
Commit
e73bea17b8
46 изменённых файлов: 5839 добавлений и 716 удалений
  1. 193 0
      .github/workflows/update-go-version.yaml
  2. 1 1
      CLAUDE.md
  3. 25 0
      CONTRIBUTING.md
  4. 0 61
      Dockerfile
  5. 0 30
      Dockerfile.debug
  6. 11 11
      README.md
  7. 20 2
      core/pkg/exporter/decoder.go
  8. 4 4
      core/pkg/opencost/bingen.go
  9. 551 153
      core/pkg/opencost/opencost_codecs.go
  10. 219 41
      core/pkg/opencost/opencost_codecs_test.go
  11. 215 196
      core/pkg/util/buffer.go
  12. 90 8
      core/pkg/util/buffer_test.go
  13. 493 0
      core/pkg/util/bufferhelper.go
  14. 68 0
      core/pkg/util/bufferpool.go
  15. 311 0
      core/pkg/util/bufferpool_test.go
  16. 310 0
      core/pkg/util/monitor/memory/helpers.go
  17. 418 0
      core/pkg/util/monitor/memory/helpers_test.go
  18. 226 0
      core/pkg/util/monitor/memory/memorylimitstats.go
  19. 141 0
      core/pkg/util/monitor/memory/memorylimitstats_test.go
  20. 123 0
      core/pkg/util/monitor/memorylimiter.go
  21. 160 0
      core/pkg/util/stringutil/lrubank.go
  22. 408 0
      core/pkg/util/stringutil/lrubank_test.go
  23. 48 0
      core/pkg/util/stringutil/mapbank.go
  24. 17 0
      core/pkg/util/stringutil/noopbank.go
  25. 27 44
      core/pkg/util/stringutil/stringutil.go
  26. 105 4
      core/pkg/util/stringutil/stringutil_test.go
  27. 37 0
      docs/ROADMAP.md
  28. 1 1
      go.mod
  29. 2 2
      go.sum
  30. 140 91
      pkg/carbon/carbonassets.go
  31. 254 0
      pkg/carbon/carbonassets_test.go
  32. 45 8
      pkg/cloud/alibaba/provider.go
  33. 69 3
      pkg/cloud/alibaba/provider_test.go
  34. 124 24
      pkg/cloud/aws/provider.go
  35. 377 7
      pkg/cloud/aws/provider_test.go
  36. 200 0
      pkg/cloud/aws/spotpricehistory.go
  37. 239 0
      pkg/cloud/aws/spotpricehistory_test.go
  38. 0 1
      pkg/cloud/provider/providerconfig.go
  39. 4 0
      pkg/cmd/costmodel/costmodel.go
  40. 9 0
      pkg/costmodel/csv_export.go
  41. 8 7
      pkg/costmodel/csv_export_test.go
  42. 21 3
      pkg/customcost/ingestor.go
  43. 95 5
      pkg/customcost/ingestor_test.go
  44. 7 8
      pkg/customcost/pipelineservice_test.go
  45. 1 1
      pkg/env/costmodel.go
  46. 22 0
      pkg/env/costmodel_test.go

+ 193 - 0
.github/workflows/update-go-version.yaml

@@ -0,0 +1,193 @@
+name: Update Go Version in "go.mod"
+
+on:
+  schedule:
+    - cron: '0 14 * * 1'  # Run every Monday at 2 PM UTC
+  workflow_dispatch:  # Allow manual triggering
+
+
+concurrency:
+  group: golang-ver-bump
+  cancel-in-progress: true
+
+jobs:
+  check-and-update-go:
+    name: Check and Update Go Version
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        repository: 'opencost/opencost'
+        ref: 'develop'
+        fetch-depth: 0
+        path: ./opencost 
+
+    - name: Setup Go
+      uses: actions/setup-go@v6
+      with:
+         go-version: 'stable'
+
+    - name: Check for existing open PRs
+      id: check-existing-prs
+      run: |
+        # Check if there are any open PRs with the go-version-update label
+        EXISTING_PRS=$(gh pr list --label "go-version-update" --state open --json number,title,url)
+        if [ "$(echo "$EXISTING_PRS" | jq 'length')" -gt 0 ]; then
+          echo "Found existing open PR(s) with go-version-update label:"
+          echo "$EXISTING_PRS" | jq -r '.[] | "  - #\(.number): \(.title) (\(.url))"'
+          echo "skip_update=true" >> $GITHUB_OUTPUT
+        else
+          echo "No existing open PRs found, proceeding with update check"
+          echo "skip_update=false" >> $GITHUB_OUTPUT
+        fi
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN}}
+        GH_REPO: ${{ github.repository }}
+
+    - name: Get current Go version from go.mod
+      id: current-version
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        CURRENT_VERSION=$(grep '^go ' go.mod | awk '{print $2}')
+        echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
+        echo "Current Go version: $CURRENT_VERSION"
+
+    - name: Get latest Go version
+      id: latest-version
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        # Get the latest Go version from golang.org/dl
+        LATEST_VERSION=$(curl -sS https://go.dev/VERSION?m=text | grep -o 'go[0-9]\+\.[0-9]\+\.[0-9]\+' | head -1)
+        echo "latest_version=$LATEST_VERSION" >> $GITHUB_OUTPUT
+        echo "Latest Go version: $LATEST_VERSION"
+
+    - name: Compare versions
+      id: version-check
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        CURRENT_VERSION="${{ steps.current-version.outputs.current_version }}"
+        LATEST_VERSION="${{ steps.latest-version.outputs.latest_version }}"
+
+        # Remove 'go' prefix for comparison
+        CURRENT_NUM=$(echo "$CURRENT_VERSION" | sed 's/^go//')
+        LATEST_NUM=$(echo "$LATEST_VERSION" | sed 's/^go//')
+
+        echo "Raw current version: $CURRENT_VERSION"
+        echo "Raw latest version: $LATEST_VERSION"
+        echo "Current version number: $CURRENT_NUM"
+        echo "Latest version number: $LATEST_NUM"
+
+        # Validate that both versions are non-empty and match an expected pattern (e.g., 1.22.1)
+        VERSION_REGEX='^[0-9]+\.[0-9]+(\.[0-9]+)?$'
+
+        if ! echo "$CURRENT_NUM" | grep -Eq "$VERSION_REGEX"; then
+          echo "Error: CURRENT_VERSION '$CURRENT_VERSION' (normalized: '$CURRENT_NUM') is invalid or empty."
+          exit 1
+        fi
+
+        if ! echo "$LATEST_NUM" | grep -Eq "$VERSION_REGEX"; then
+          echo "Error: LATEST_VERSION '$LATEST_VERSION' (normalized: '$LATEST_NUM') is invalid or empty."
+          exit 1
+        fi
+
+        if [ "$CURRENT_NUM" = "$LATEST_NUM" ]; then
+          echo "Go version is up to date"
+          echo "update_needed=false" >> $GITHUB_OUTPUT
+        else
+          # Use version-aware sorting to determine which version is greater
+          HIGHEST_VERSION=$(printf '%s\n' "$CURRENT_NUM" "$LATEST_NUM" | sort -V | tail -n1)
+
+          if [ "$HIGHEST_VERSION" = "$LATEST_NUM" ]; then
+            echo "Newer Go version available: $LATEST_VERSION (current: $CURRENT_VERSION)"
+            echo "update_needed=true" >> $GITHUB_OUTPUT
+            echo "new_version=$LATEST_VERSION" >> $GITHUB_OUTPUT
+          else
+            echo "Current Go version ($CURRENT_VERSION) is newer than or equal to latest reported version ($LATEST_VERSION); no update needed."
+            echo "update_needed=false" >> $GITHUB_OUTPUT
+          fi
+        fi
+
+    - name: Update go.mod with new Go version
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        NEW_VERSION="$(echo '${{ steps.version-check.outputs.new_version }}' | tr -d 'go')"
+        echo "Updating all go.mod files under $(pwd) to use Go version: $NEW_VERSION"
+
+        # Recursively update every go.mod we find under the working directory.
+        # (Skip vendor directories to avoid accidental edits of vendored content.)
+        while IFS= read -r go_mod_file; do
+          echo "Updating: $go_mod_file"
+          sed -i -E "s/^go[[:space:]].*/go ${NEW_VERSION}/" "$go_mod_file"
+        done < <(find . -name go.mod -type f -not -path "*/vendor/*")
+        
+
+    - name: Run go mod tidy
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        echo "Running go mod tidy for every module found under $(pwd)"
+
+        # Tidy once per module directory (dirname of each go.mod we find).
+        # Skip vendor directories to avoid editing vendored content.
+        mapfile -t tidy_dirs < <(find . -name go.mod -type f -not -path "*/vendor/*" -exec dirname {} \; | sort -u)
+        for d in "${tidy_dirs[@]}"; do
+          echo "Tidying: $d"
+          (cd "$d" && go mod tidy)
+        done
+
+    - name: Check for changes
+      id: changes
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        if [ -z "$(git status --porcelain)" ]; then
+          echo "No changes detected, skipping PR creation"
+          echo "skip_pr=true" >> $GITHUB_OUTPUT
+        else
+          echo "Changes detected, will create PR" 
+          echo "skip_pr=false" >> $GITHUB_OUTPUT
+          # Generate a unique branch name with timestamp
+          branch_name="update-go-version-$(date +%Y%m%d-%H%M%S)"
+          echo "branch_name=$branch_name" >> $GITHUB_OUTPUT
+        fi
+
+    - name: Create Pull Request
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true' && steps.changes.outputs.skip_pr == 'false'
+      id: create-pr
+      uses: peter-evans/create-pull-request@v8
+      with:
+        path: ./opencost
+        token: ${{ secrets.GITHUB_TOKEN }}
+        branch: ${{ steps.changes.outputs.branch_name }}
+        title: 'chore: update Go version to ${{ steps.version-check.outputs.new_version }}'
+        commit-message: 'chore: update Go version to ${{ steps.version-check.outputs.new_version }}'
+        delete-branch: true
+        add-paths: |
+          go.mod
+          go.sum
+          **/go.mod
+          **/go.sum
+        body: |
+          Automated PR to update Go version from ${{ steps.current-version.outputs.current_version }} to ${{ steps.version-check.outputs.new_version }}.
+          
+          This PR was automatically generated after detecting a newer Go version is available.
+        base: develop
+        labels: automated,go-version-update
+    
+
+    - name: Skip message - no update needed
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'false'
+      run: |
+        echo "No Go version update needed. Current version is up to date."
+
+    - name: Skip message - existing PR
+      if: steps.check-existing-prs.outputs.skip_update == 'true'
+      run: |
+        echo "Skipping Go version update because an existing PR with go-version-update label is already open." 

+ 1 - 1
CLAUDE.md

@@ -144,7 +144,7 @@ just validate-protobuf
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `MCP_SERVER_ENABLED` | `true` | Enable MCP server |
+| `MCP_SERVER_ENABLED` | `false` | Enable MCP server |
 | `MCP_HTTP_PORT` | `8081` | MCP server HTTP port |
 
 ### Cloud Providers

+ 25 - 0
CONTRIBUTING.md

@@ -126,6 +126,31 @@ To run these tests:
 - Navigate to cost-model/test
 - Run `go test -timeout 700s` from the testing directory. The tests right now take about 10 minutes (600s) to run because they bring up and down pods and wait for Prometheus to scrape data about them.
 
+## Code Review Standards
+
+All pull requests must be reviewed before merging. The review process ensures:
+
+### What reviewers check:
+- **Correctness:** Does the code do what it claims?
+- **Tests:** Are new features and bug fixes covered by tests?
+- **Style:** Does the code follow Go conventions (`gofmt`, `go vet`)?
+- **Security:** Are inputs validated? Are credentials handled safely?
+- **Performance:** Are there obvious performance issues (unbounded allocations, N+1 queries)?
+
+### Review requirements:
+- At least one approval from a Committer or Maintainer is required
+- The reviewer must be a different person than the PR author
+- For security-sensitive changes, review by a Maintainer is required
+- Emergency fixes may bypass review with post-merge review required within 48 hours (per [GOVERNANCE.md](GOVERNANCE.md))
+
+## Regression Tests
+
+When fixing a bug, contributors SHOULD add a test that reproduces the bug before applying the fix. This ensures the bug does not recur. As a project-wide goal, at least 50% of bugs fixed in any six-month window should have corresponding regression tests. This is tracked by maintainers using issues labeled `bug` and measured during release reviews; it is an aspirational target for the project as a whole, not a requirement applied to individual contributors.
+
+## Finding Issues to Work On
+
+Look for issues labeled [`good first issue`](https://github.com/opencost/opencost/labels/good%20first%20issue) or [`help wanted`](https://github.com/opencost/opencost/labels/help%20wanted) for a curated list of tasks suitable for new contributors.
+
 ## Certificate of Origin
 
 By contributing to this project, you certify that your contribution was created in whole or in part by you and that you have the right to submit it under the open source license indicated in the project. In other words, please confirm that you, as a contributor, have the legal right to make the contribution. This is enforced on Pull Requests and requires `Signed-off-by` with the email address for the author in the commit message.

+ 0 - 61
Dockerfile

@@ -1,61 +0,0 @@
-FROM --platform=$BUILDPLATFORM golang:1.25.5-alpine3.21 AS build-env
-
-WORKDIR /app
-
-# This ensures that CGO is disabled for go test running AND for the build
-# step. This prevents a build failure when building an ARM64 image with
-# docker buildx. I believe this is because the ARM64 version of the
-# golang:latest image does not contain GCC, while the AMD64 version does.
-ARG CGO_ENABLED=0
-
-# Get dependencies - will also be cached if we won't change mod/sum
-COPY go.* .
-COPY core/go.* core/
-COPY modules/collector-source/go.* modules/collector-source/
-COPY modules/prometheus-source/go.* modules/prometheus-source/
-RUN go mod download
-
-ARG version=dev
-ARG	commit=HEAD
-
-ARG TARGETOS
-ARG TARGETARCH
-ENV GOOS=$TARGETOS
-ENV GOARCH=$TARGETARCH
-
-# COPY the source code as the last step
-COPY . .
-
-# Build the binary
-RUN set -e ;\
-    cd cmd/costmodel;\
-    go build -a -installsuffix cgo \
-    -ldflags \
-    "-X github.com/opencost/opencost/core/pkg/version.Version=${version} \
-    -X github.com/opencost/opencost/core/pkg/version.GitCommit=${commit}" \
-    -o /go/bin/app
-
-FROM alpine:latest
-
-LABEL org.opencontainers.image.description="Cross-cloud cost allocation models for Kubernetes workloads"
-LABEL org.opencontainers.image.documentation=https://opencost.io/docs/
-LABEL org.opencontainers.image.licenses=Apache-2.0
-LABEL org.opencontainers.image.source=https://github.com/opencost/opencost
-LABEL org.opencontainers.image.title=kubecost-cost-model
-LABEL org.opencontainers.image.url=https://opencost.io
-
-RUN apk add --update --no-cache ca-certificates
-
-COPY --from=build-env /go/bin/app /go/bin/app
-ADD --chmod=400 ./THIRD_PARTY_LICENSES.txt /THIRD_PARTY_LICENSES.txt
-ADD --chmod=500 ./configs/default.json /models/default.json
-ADD --chmod=500 ./configs/azure.json /models/azure.json
-ADD --chmod=500 ./configs/aws.json /models/aws.json
-ADD --chmod=500 ./configs/gcp.json /models/gcp.json
-ADD --chmod=500 ./configs/alibaba.json /models/alibaba.json
-ADD --chmod=500 ./configs/oracle.json /models/oracle.json
-ADD --chmod=500 ./configs/otc.json /models/otc.json
-RUN chown -R 1001:1001 /models
-
-USER 1001
-ENTRYPOINT ["/go/bin/app"]

+ 0 - 30
Dockerfile.debug

@@ -1,30 +0,0 @@
-# This dockerfile is for development purposes only; do not use this for production deployments
-FROM golang:alpine
-# The prebuilt binary path. This Dockerfile assumes the binary will be built
-# outside of Docker.
-ARG binary_path
-
-LABEL org.opencontainers.image.description="Cross-cloud cost allocation models for Kubernetes workloads"
-LABEL org.opencontainers.image.documentation=https://opencost.io/docs/
-LABEL org.opencontainers.image.licenses=Apache-2.0
-LABEL org.opencontainers.image.source=https://github.com/opencost/opencost
-LABEL org.opencontainers.image.title=kubecost-cost-model
-LABEL org.opencontainers.image.url=https://opencost.io
-
-WORKDIR /app
-RUN apk add --update --no-cache ca-certificates
-RUN go install github.com/go-delve/delve/cmd/dlv@latest
-
-ADD --chmod=644 ./THIRD_PARTY_LICENSES.txt /THIRD_PARTY_LICENSES.txt
-ADD --chmod=644 ./configs/default.json /models/default.json
-ADD --chmod=644 ./configs/azure.json /models/azure.json
-ADD --chmod=644 ./configs/aws.json /models/aws.json
-ADD --chmod=644 ./configs/gcp.json /models/gcp.json
-ADD --chmod=644 ./configs/alibaba.json /models/alibaba.json
-ADD --chmod=644 ./configs/oracle.json /models/oracle.json
-ADD --chmod=644 ./configs/otc.json /models/otc.json
-
-COPY ${binary_path} main
-
-ENTRYPOINT ["/go/bin/dlv exec --listen=:40000 --api-version=2 --headless=true --accept-multiclient --log --continue /app/main"]
-EXPOSE 9003 40000

+ 11 - 11
README.md

@@ -54,17 +54,17 @@ Note: The standalone Kubernetes manifest files have been removed. Please use Hel
 
 ## MCP Server
 
-The OpenCost MCP (Model Context Protocol) server provides AI agents with access to cost allocation and asset data through a standardized interface. The MCP server is **enabled by default** in all OpenCost deployments, runs on port 8081, and is **built into the Helm chart** for easy production deployment. Users have full control to disable it or configure custom ports and settings.
+The OpenCost MCP (Model Context Protocol) server provides AI agents with access to cost allocation and asset data through a standardized interface. The MCP server is **disabled by default** (opt-in) in all OpenCost deployments, runs on port 8081, and is **built into the Helm chart** for easy production deployment. Users have full control to enable it or configure custom ports and settings.
 
 ### Features
 
-- **Enabled by Default**: MCP server starts automatically with OpenCost
-- **Full User Control**: Easy to disable or configure port and settings
+- **Opt-in by Default**: MCP server is disabled by default to minimize the attack surface and must be explicitly enabled
+- **Full User Control**: Easy to enable or configure port and settings
 - **Allocation Queries**: Retrieve cost allocation data with filtering and aggregation
 - **Asset Queries**: Access detailed asset information including nodes, disks, load balancers, and more
 - **Cloud Cost Queries**: Query cloud cost data with provider, service, and region filtering
 - **HTTP Transport**: Uses HTTP for reliable communication with MCP clients
-- **Zero Configuration**: Works out of the box with default OpenCost deployment
+- **Simple Configuration**: Easy to enable using standard environment variables
 - **Helm Integration**: Built into the official Helm chart for production deployments
 
 ### Quick Start
@@ -105,19 +105,19 @@ opencost:
 helm repo add opencost https://opencost.github.io/opencost-helm-chart
 helm repo update
 
-# Deploy OpenCost with MCP server (enabled by default)
-helm install opencost opencost/opencost
+# Deploy OpenCost with MCP server enabled (opt-in)
+helm install opencost opencost/opencost --set opencost.mcp.enabled=true
 
 # Access MCP server via port forwarding (example)
 kubectl port-forward svc/opencost 8081:8081
 ```
 
-The MCP server is **enabled by default** in the Helm chart. For custom configuration:
+The MCP server is **disabled by default** in the Helm chart. For custom configuration:
 
 ```bash
-# Deploy with MCP server disabled
+# Deploy with MCP server enabled
 helm install opencost opencost/opencost \
-  --set opencost.mcp.enabled=false
+  --set opencost.mcp.enabled=true
 
 # Deploy with custom MCP port
 helm install opencost opencost/opencost \
@@ -132,8 +132,8 @@ helm install opencost opencost/opencost \
 
 | Configuration | Command | Description |
 |---------------|---------|-------------|
-| **Default** | `helm install opencost opencost/opencost` | MCP enabled on port 8081 |
-| **Disable** | `--set opencost.mcp.enabled=false` | Completely disable MCP server |
+| **Default** | `helm install opencost opencost/opencost` | MCP disabled by default |
+| **Enable** | `--set opencost.mcp.enabled=true` | Enable MCP server on port 8081 |
 | **Custom Port** | `--set opencost.mcp.port=9091` | Use different port |
 | **Debug Mode** | `--set opencost.mcp.extraEnv.MCP_LOG_LEVEL=debug` | Enable debug logging |
 

+ 20 - 2
core/pkg/exporter/decoder.go

@@ -12,14 +12,22 @@ import (
 )
 
 type Decoder[T any] func([]byte) (*T, error)
+type StreamingDecoder[T any] func(io.Reader) (*T, error)
 
-// BinaryMarshalerPtr[T] is a generic constraint to ensure types passed to the encoder implement
-// encoding.BinaryMarshaler and are pointers to T.
+// BinaryMarshalerPtr[T] is a generic constraint to ensure types passed to the decoder implement
+// encoding.BinaryUnmarshaler and are pointers to T.
 type BinaryUnmarshalerPtr[T any] interface {
 	encoding.BinaryUnmarshaler
 	*T
 }
 
+// BinaryUnmarshalWithReaderPtr[T] is a generic constraint to ensure types passed to the decoder
+// implement an unmarshal from io.Reader method.
+type BinaryUnmarshalWithReaderPtr[T any] interface {
+	UnmarshalBinaryFromReader(reader io.Reader) error
+	*T
+}
+
 func BingenDecoder[T any, U BinaryUnmarshalerPtr[T]](data []byte) (*T, error) {
 	var set U = new(T)
 
@@ -31,6 +39,16 @@ func BingenDecoder[T any, U BinaryUnmarshalerPtr[T]](data []byte) (*T, error) {
 	return set, nil
 }
 
+func StreamingBingenDecoder[T any, U BinaryUnmarshalWithReaderPtr[T]](reader io.Reader) (*T, error) {
+	var set U = new(T)
+
+	err := set.UnmarshalBinaryFromReader(reader)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode bingen: %w", err)
+	}
+	return set, nil
+}
+
 func JSONDecoder[T any](data []byte) (*T, error) {
 	var instance = new(T)
 	err := json.Unmarshal(data, instance)

+ 4 - 4
core/pkg/opencost/bingen.go

@@ -30,7 +30,7 @@ package opencost
 // @bingen:generate:AssetLabels
 // @bingen:generate:AssetProperties
 // @bingen:generate:AssetProperty
-// @bingen:generate[stringtable,preprocess,postprocess]:AssetSet
+// @bingen:generate[streamable,stringtable,preprocess,postprocess]:AssetSet
 // @bingen:generate:AssetSetRange
 // @bingen:generate:Breakdown
 // @bingen:generate:Cloud
@@ -46,7 +46,7 @@ package opencost
 // Allocation Version Set: Includes Allocation pipeline specific resources
 // @bingen:set[name=Allocation,version=25]
 // @bingen:generate[migrate]:Allocation
-// @bingen:generate[stringtable]:AllocationSet
+// @bingen:generate[streamable,stringtable]:AllocationSet
 // @bingen:generate:AllocationSetRange
 // @bingen:generate:AllocationProperties
 // @bingen:generate:AllocationProperty
@@ -64,14 +64,14 @@ package opencost
 // @bingen:set[name=CloudCost,version=3]
 // @bingen:generate:CloudCost
 // @bingen:generate:CostMetric
-// @bingen:generate[stringtable]:CloudCostSet
+// @bingen:generate[streamable,stringtable]:CloudCostSet
 // @bingen:generate:CloudCostSetRange
 // @bingen:generate:CloudCostProperties
 // @bingen:generate:CloudCostLabels
 // @bingen:end
 
 // @bingen:set[name=NetworkInsight,version=1]
-// @bingen:generate:NetworkInsightSet
+// @bingen:generate[streamable,stringtable]:NetworkInsightSet
 // @bingen:generate:NetworkInsight
 // @bingen:generate:NetworkTrafficDirection
 // @bingen:generate:NetworkTrafficType

Разница между файлами не показана из-за её большого размера
+ 551 - 153
core/pkg/opencost/opencost_codecs.go


+ 219 - 41
core/pkg/opencost/opencost_codecs_test.go

@@ -1,16 +1,52 @@
 package opencost
 
 import (
+	"bytes"
+	"io"
 	"testing"
 	"time"
 )
 
-func TestAllocation_BinaryEncoding(t *testing.T) {
+type UnmarshalFunc func(BingenUnmarshalable, []byte) error
+
+func RunAllocation_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAllocationSet_BinaryEncoding(t *testing.T) {
-	// TODO niko
+func RunAllocationSet_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
+	end := time.Now().UTC().Truncate(day)
+	start := end.Add(-day * 10)
+
+	for start.Before(end) {
+		set0 := GenerateMockAllocationSetClusterIdle(start)
+
+		bytes, err := set0.MarshalBinary()
+		if err != nil {
+			t.Fatalf("Failed to AllocationSet.MarshalBinary: %s", err)
+			return
+		}
+
+		set1 := new(AllocationSet)
+		err = unmarshal(set1, bytes)
+		if err != nil {
+			t.Fatalf("Failed to AllocationSet.UnmarshalBinary: %s", err)
+			return
+		}
+
+		for key, alloc := range set1.Allocations {
+			other, ok := set0.Allocations[key]
+			if !ok {
+				t.Fatalf("Failed to match Allocation for key: %s", key)
+				return
+			}
+
+			if !alloc.Equal(other) {
+				t.Fatalf("allocations for key: %s did not match", key)
+			}
+		}
+
+		start = start.Add(day)
+	}
 }
 
 func BenchmarkAllocationSetRange_BinaryEncoding(b *testing.B) {
@@ -78,7 +114,7 @@ func BenchmarkAllocationSetRange_BinaryEncoding(b *testing.B) {
 	}
 }
 
-func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
+func RunAllocationSetRange_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	endYesterday := time.Now().UTC().Truncate(day)
 	startYesterday := endYesterday.Add(-day)
 	startD2 := startYesterday
@@ -86,7 +122,6 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 	startD0 := startD1.Add(-day)
 
 	var asr0, asr1 *AllocationSetRange
-	var bs []byte
 	var err error
 
 	asr0 = NewAllocationSetRange(
@@ -94,20 +129,29 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 		GenerateMockAllocationSetClusterIdle(startD1),
 		GenerateMockAllocationSetClusterIdle(startD2),
 	)
-
-	bs, err = asr0.MarshalBinary()
-	if err != nil {
-		t.Fatalf("AllocationSetRange.Binary: unexpected error: %s", err)
-		return
+	asrSets0 := [][]byte{}
+	for _, as := range asr0.Allocations {
+		bytes, err := as.MarshalBinary()
+		if err != nil {
+			t.Fatalf("Failed to marshal allocation set into []byte: %s", err)
+			return
+		}
+		asrSets0 = append(asrSets0, bytes)
 	}
 
-	asr1 = &AllocationSetRange{}
-	err = asr1.UnmarshalBinary(bs)
-	if err != nil {
-		t.Fatalf("AllocationSetRange.Binary: unexpected error: %s", err)
-		return
+	asrSets1 := []*AllocationSet{}
+	for _, bytes := range asrSets0 {
+		allocSet := new(AllocationSet)
+		err = unmarshal(allocSet, bytes)
+		if err != nil {
+			t.Fatalf("AllocationSet.Binary: unexpected error: %s", err)
+			return
+		}
+		asrSets1 = append(asrSets1, allocSet)
 	}
 
+	asr1 = NewAllocationSetRange(asrSets1...)
+
 	if asr0.Length() != asr1.Length() {
 		t.Fatalf("AllocationSetRange.Binary: expected %d; found %d", asr0.Length(), asr1.Length())
 	}
@@ -143,7 +187,7 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestAny_BinaryEncoding(t *testing.T) {
+func RunAny_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	start := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	end := start.Add(24 * time.Hour)
 	window := NewWindow(&start, &end)
@@ -167,7 +211,7 @@ func TestAny_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Any{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Any.Binary: unexpected error: %s", err)
 	}
@@ -192,15 +236,15 @@ func TestAny_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestAsset_BinaryEncoding(t *testing.T) {
+func RunAsset_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAssetSet_BinaryEncoding(t *testing.T) {
+func RunAssetSet_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAssetSetRange_BinaryEncoding(t *testing.T) {
+func RunAssetSetRange_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	endYesterday := time.Now().UTC().Truncate(day)
 	startYesterday := endYesterday.Add(-day)
 	startD2 := startYesterday
@@ -224,7 +268,7 @@ func TestAssetSetRange_BinaryEncoding(t *testing.T) {
 	}
 
 	asr1 = &AssetSetRange{}
-	err = asr1.UnmarshalBinary(bs)
+	err = unmarshal(asr1, bs)
 	if err != nil {
 		t.Fatalf("AssetSetRange.Binary: unexpected error: %s", err)
 		return
@@ -263,7 +307,7 @@ func TestAssetSetRange_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestBreakdown_BinaryEncoding(t *testing.T) {
+func RunBreakdown_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var b0, b1 *Breakdown
 	var bs []byte
 	var err error
@@ -281,7 +325,7 @@ func TestBreakdown_BinaryEncoding(t *testing.T) {
 	}
 
 	b1 = &Breakdown{}
-	err = b1.UnmarshalBinary(bs)
+	err = unmarshal(b1, bs)
 	if err != nil {
 		t.Fatalf("Breakdown.Binary: unexpected error: %s", err)
 	}
@@ -300,7 +344,7 @@ func TestBreakdown_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestCloudAny_BinaryEncoding(t *testing.T) {
+func RunCloudAny_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -319,7 +363,7 @@ func TestCloudAny_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Cloud{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("CloudAny.Binary: unexpected error: %s", err)
 	}
@@ -329,7 +373,7 @@ func TestCloudAny_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestClusterManagement_BinaryEncoding(t *testing.T) {
+func RunClusterManagement_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -358,7 +402,7 @@ func TestClusterManagement_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestDisk_BinaryEncoding(t *testing.T) {
+func RunDisk_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -389,7 +433,7 @@ func TestDisk_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Disk{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Disk.Binary: unexpected error: %s", err)
 	}
@@ -399,7 +443,7 @@ func TestDisk_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestNode_BinaryEncoding(t *testing.T) {
+func RunNode_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -441,7 +485,7 @@ func TestNode_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Node{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Node.Binary: unexpected error: %s", err)
 	}
@@ -451,7 +495,7 @@ func TestNode_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestProperties_BinaryEncoding(t *testing.T) {
+func RunProperties_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var p0, p1 *AllocationProperties
 	var bs []byte
 	var err error
@@ -464,7 +508,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -501,7 +545,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -527,7 +571,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -537,7 +581,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestShared_BinaryEncoding(t *testing.T) {
+func RunShared_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -556,7 +600,7 @@ func TestShared_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &SharedAsset{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("SharedAsset.Binary: unexpected error: %s", err)
 	}
@@ -566,7 +610,7 @@ func TestShared_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestWindow_BinaryEncoding(t *testing.T) {
+func RunWindow_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var w0, w1 Window
 	var bs []byte
 	var err error
@@ -578,7 +622,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -598,7 +642,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -618,7 +662,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -638,7 +682,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -650,3 +694,137 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: expected %v; found %v", w0.End(), w1.End())
 	}
 }
+
+type BingenUnmarshalable interface {
+	UnmarshalBinary([]byte) error
+	UnmarshalBinaryFromReader(io.Reader) error
+}
+
+func UnmarshalBingenBytes(value BingenUnmarshalable, b []byte) error {
+	return value.UnmarshalBinary(b)
+}
+
+func UnmarshalBingenReader(value BingenUnmarshalable, b []byte) error {
+	// convert bytes to reader in order to leverage io.Reader string table
+	reader := bytes.NewReader(b)
+	return value.UnmarshalBinaryFromReader(reader)
+}
+
+func RunAllOpencostBingenCodecTests(t *testing.T, unmarshal UnmarshalFunc) {
+	tests := []struct {
+		name string
+		f    func(*testing.T, UnmarshalFunc)
+	}{
+		{
+			name: "RunAllocation_BinaryEncodingTest",
+			f:    RunAllocation_BinaryEncodingTest,
+		},
+		{
+			name: "RunAllocationSet_BinaryEncodingTest",
+			f:    RunAllocationSet_BinaryEncodingTest,
+		},
+		{
+			name: "RunAllocationSetRange_BinaryEncodingTest",
+			f:    RunAllocationSetRange_BinaryEncodingTest,
+		},
+		{
+			name: "RunAny_BinaryEncodingTest",
+			f:    RunAny_BinaryEncodingTest,
+		},
+		{
+			name: "RunAsset_BinaryEncodingTest",
+			f:    RunAsset_BinaryEncodingTest,
+		},
+		{
+			name: "RunAssetSet_BinaryEncodingTest",
+			f:    RunAssetSet_BinaryEncodingTest,
+		},
+		{
+			name: "RunAssetSetRange_BinaryEncodingTest",
+			f:    RunAssetSetRange_BinaryEncodingTest,
+		},
+		{
+			name: "RunBreakdown_BinaryEncodingTest",
+			f:    RunBreakdown_BinaryEncodingTest,
+		},
+		{
+			name: "RunCloudAny_BinaryEncodingTest",
+			f:    RunCloudAny_BinaryEncodingTest,
+		},
+		{
+			name: "RunClusterManagement_BinaryEncodingTest",
+			f:    RunClusterManagement_BinaryEncodingTest,
+		},
+		{
+			name: "RunDisk_BinaryEncodingTest",
+			f:    RunDisk_BinaryEncodingTest,
+		},
+		{
+			name: "RunNode_BinaryEncodingTest",
+			f:    RunNode_BinaryEncodingTest,
+		},
+		{
+			name: "RunProperties_BinaryEncodingTest",
+			f:    RunProperties_BinaryEncodingTest,
+		},
+		{
+			name: "RunShared_BinaryEncodingTest",
+			f:    RunShared_BinaryEncodingTest,
+		},
+		{
+			name: "RunWindow_BinaryEncodingTest",
+			f:    RunWindow_BinaryEncodingTest,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(tt *testing.T) {
+			test.f(tt, unmarshal)
+		})
+	}
+}
+
+func TestOpencostBingenDefaultsWithBytes(t *testing.T) {
+	config := DefaultBingenConfiguration()
+	ConfigureBingen(config)
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenBytes)
+}
+
+func TestOpencostBingenFileStringTableEnabledWithBytes(t *testing.T) {
+	// This test _should_ still run the slice based string table because raw []byte
+	// data always uses the string slice table
+	config := DefaultBingenConfiguration()
+	config.FileBackedStringTableEnabled = true
+	config.FileBackedStringTableDir = t.TempDir()
+	ConfigureBingen(config)
+
+	// reset configuration to default on completion
+	defer ConfigureBingen(DefaultBingenConfiguration())
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenBytes)
+}
+
+func TestOpencostBingenDefaultsWithReader(t *testing.T) {
+	// This test _should_ still run the slice based string table because we haven't configured
+	// bingen to use the file string table
+	config := DefaultBingenConfiguration()
+	ConfigureBingen(config)
+
+	// we use the reader to unmarshal instead of []bytes
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenReader)
+}
+
+func TestOpencostBingenFileStringTableEnabledWithReader(t *testing.T) {
+	// This test _should_ use the file backed string table because we have enabled it AND
+	// we're using a reader
+	config := DefaultBingenConfiguration()
+	config.FileBackedStringTableEnabled = true
+	config.FileBackedStringTableDir = t.TempDir()
+	ConfigureBingen(config)
+
+	// reset configuration to default on completion
+	defer ConfigureBingen(DefaultBingenConfiguration())
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenReader)
+}

+ 215 - 196
core/pkg/util/buffer.go

@@ -1,30 +1,36 @@
 package util
 
 import (
+	"bufio"
 	"bytes"
-	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 	"math"
+	"os"
 	"unsafe"
 
 	"github.com/opencost/opencost/core/pkg/util/stringutil"
 )
 
+var bytePool *bufferPool = newBufferPool()
+
 // NonPrimitiveTypeError represents an error where the user provided a non-primitive data type for reading/writing
 var NonPrimitiveTypeError error = errors.New("Type provided to read/write does not fit inside 8 bytes.")
 
 // Buffer is a utility type which implements a very basic binary protocol for
 // writing core go types.
 type Buffer struct {
-	b *bytes.Buffer
+	b  *bufio.Reader
+	bw *bytes.Buffer
 }
 
 // NewBuffer creates a new Buffer instance using LittleEndian ByteOrder.
 func NewBuffer() *Buffer {
 	var b bytes.Buffer
 	return &Buffer{
-		b: &b,
+		b:  nil,
+		bw: &b,
 	}
 }
 
@@ -32,7 +38,7 @@ func NewBuffer() *Buffer {
 // The new buffer assumes ownership of the byte slice.
 func NewBufferFromBytes(b []byte) *Buffer {
 	return &Buffer{
-		b: bytes.NewBuffer(b),
+		bw: bytes.NewBuffer(b),
 	}
 }
 
@@ -41,180 +47,298 @@ func NewBufferFromBytes(b []byte) *Buffer {
 func NewBufferFrom(b *Buffer) *Buffer {
 	bb := b.Bytes()
 	return &Buffer{
-		b: bytes.NewBuffer(bb),
+		bw: bytes.NewBuffer(bb),
+	}
+}
+
+// NewBufferFromReader creates a new Buffer instance using the provided io.Reader. This
+// buffer is set to read-only.
+func NewBufferFromReader(reader io.Reader) *Buffer {
+	return &Buffer{
+		b:  bufio.NewReader(reader),
+		bw: nil,
 	}
 }
 
-// WriteBool writes a bool value to the buffer.
-func (b *Buffer) WriteBool(t bool) {
-	write(b.b, t)
+// WriteBool writes a bool value to the buffer
+func (b *Buffer) WriteBool(i bool) {
+	b.checkRO()
+	writeBool(b.bw, i)
 }
 
 // WriteInt writes an int value to the buffer.
 func (b *Buffer) WriteInt(i int) {
-	write(b.b, int32(i))
+	b.checkRO()
+	writeInt(b.bw, i)
 }
 
 // WriteInt8 writes an int8 value to the buffer.
 func (b *Buffer) WriteInt8(i int8) {
-	write(b.b, i)
+	b.checkRO()
+	writeInt8(b.bw, i)
 }
 
 // WriteInt16 writes an int16 value to the buffer.
 func (b *Buffer) WriteInt16(i int16) {
-	write(b.b, i)
+	b.checkRO()
+	writeInt16(b.bw, i)
 }
 
 // WriteInt32 writes an int32 value to the buffer.
 func (b *Buffer) WriteInt32(i int32) {
-	write(b.b, i)
+	b.checkRO()
+	writeInt32(b.bw, i)
 }
 
 // WriteInt64 writes an int64 value to the buffer.
 func (b *Buffer) WriteInt64(i int64) {
-	write(b.b, i)
+	b.checkRO()
+	writeInt64(b.bw, i)
 }
 
 // WriteUInt writes a uint value to the buffer.
 func (b *Buffer) WriteUInt(i uint) {
-	write(b.b, i)
+	b.checkRO()
+	writeUint(b.bw, i)
 }
 
 // WriteUInt8 writes a uint8 value to the buffer.
 func (b *Buffer) WriteUInt8(i uint8) {
-	write(b.b, i)
+	b.checkRO()
+	writeUint8(b.bw, i)
 }
 
 // WriteUInt16 writes a uint16 value to the buffer.
 func (b *Buffer) WriteUInt16(i uint16) {
-	write(b.b, i)
+	b.checkRO()
+	writeUint16(b.bw, i)
 }
 
 // WriteUInt32 writes a uint32 value to the buffer.
 func (b *Buffer) WriteUInt32(i uint32) {
-	write(b.b, i)
+	b.checkRO()
+	writeUint32(b.bw, i)
 }
 
 // WriteUInt64 writes a uint64 value to the buffer.
 func (b *Buffer) WriteUInt64(i uint64) {
-	write(b.b, i)
+	b.checkRO()
+	writeUint64(b.bw, i)
 }
 
 // WriteFloat32 writes a float32 value to the buffer.
 func (b *Buffer) WriteFloat32(i float32) {
-	write(b.b, i)
+	b.checkRO()
+	writeFloat32(b.bw, i)
 }
 
 // WriteFloat64 writes a float64 value to the buffer.
 func (b *Buffer) WriteFloat64(i float64) {
-	write(b.b, i)
+	b.checkRO()
+	writeFloat64(b.bw, i)
 }
 
 // WriteString writes the string's length as a uint16 followed by the string contents.
 func (b *Buffer) WriteString(i string) {
+	b.checkRO()
 	s := stringToBytes(i)
 
 	// string lengths are limited to uint16 - See ReadString()
 	if len(s) > math.MaxUint16 {
 		s = s[:math.MaxUint16]
 	}
-	write(b.b, uint16(len(s)))
-	b.b.Write(s)
+	writeUint16(b.bw, uint16(len(s)))
+	b.bw.Write(s)
 }
 
 // WriteBytes writes the contents of the byte slice to the buffer.
 func (b *Buffer) WriteBytes(bytes []byte) {
-	b.b.Write(bytes)
+	b.checkRO()
+	b.bw.Write(bytes)
+}
+
+// Bytes returns the unread portion of the underlying buffer storage. If the buffer was
+// created with an `io.Reader`, then the remaining unread bytes are drained into a byte
+// slice and returned.
+func (b *Buffer) Bytes() []byte {
+	if b.bw != nil {
+		return b.bw.Bytes()
+	}
+
+	bytes, err := io.ReadAll(b.b)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "failed to read remaining bytes from Buffer: %s\n", err)
+	}
+	return bytes
+}
+
+func (b *Buffer) Peek(length int) ([]byte, error) {
+	if b.bw != nil {
+		return nil, fmt.Errorf("unsupported Peek() operation on read/write buffer.")
+	}
+	return b.b.Peek(length)
+}
+
+// checkRO panics if the buffer is read-only (reader-backed); kept tiny so the compiler can inline it.
+func (b *Buffer) checkRO() {
+	if b.bw == nil {
+		panic("Buffer is set to read-only")
+	}
 }
 
 // ReadBool reads a bool value from the buffer.
 func (b *Buffer) ReadBool() bool {
 	var i bool
-	read(b.b, &i)
+	if b.bw != nil {
+		readBool(b.bw, &i)
+		return i
+	}
+
+	readBuffBool(b.b, &i)
 	return i
 }
 
 // ReadInt reads an int value from the buffer.
 func (b *Buffer) ReadInt() int {
-	var i int32
-	read(b.b, &i)
-	return int(i)
+	var i int
+	if b.bw != nil {
+		readInt(b.bw, &i)
+		return i
+	}
+
+	readBuffInt(b.b, &i)
+	return i
 }
 
 // ReadInt8 reads an int8 value from the buffer.
 func (b *Buffer) ReadInt8() int8 {
 	var i int8
-	read(b.b, &i)
+	if b.bw != nil {
+		readInt8(b.bw, &i)
+		return i
+	}
+
+	readBuffInt8(b.b, &i)
 	return i
 }
 
 // ReadInt16 reads an int16 value from the buffer.
 func (b *Buffer) ReadInt16() int16 {
 	var i int16
-	read(b.b, &i)
+	if b.bw != nil {
+		readInt16(b.bw, &i)
+		return i
+	}
+
+	readBuffInt16(b.b, &i)
 	return i
 }
 
 // ReadInt32 reads an int32 value from the buffer.
 func (b *Buffer) ReadInt32() int32 {
 	var i int32
-	read(b.b, &i)
+	if b.bw != nil {
+		readInt32(b.bw, &i)
+		return i
+	}
+
+	readBuffInt32(b.b, &i)
 	return i
 }
 
 // ReadInt64 reads an int64 value from the buffer.
 func (b *Buffer) ReadInt64() int64 {
 	var i int64
-	read(b.b, &i)
+	if b.bw != nil {
+		readInt64(b.bw, &i)
+		return i
+	}
+
+	readBuffInt64(b.b, &i)
 	return i
 }
 
 // ReadUInt reads a uint value from the buffer.
 func (b *Buffer) ReadUInt() uint {
 	var i uint
-	read(b.b, &i)
+	if b.bw != nil {
+		readUint(b.bw, &i)
+		return i
+	}
+
+	readBuffUint(b.b, &i)
 	return i
 }
 
 // ReadUInt8 reads a uint8 value from the buffer.
 func (b *Buffer) ReadUInt8() uint8 {
 	var i uint8
-	read(b.b, &i)
+	if b.bw != nil {
+		readUint8(b.bw, &i)
+		return i
+	}
+
+	readBuffUint8(b.b, &i)
 	return i
 }
 
 // ReadUInt16 reads a uint16 value from the buffer.
 func (b *Buffer) ReadUInt16() uint16 {
 	var i uint16
-	read(b.b, &i)
+	if b.bw != nil {
+		readUint16(b.bw, &i)
+		return i
+	}
+
+	readBuffUint16(b.b, &i)
 	return i
 }
 
 // ReadUInt32 reads a uint32 value from the buffer.
 func (b *Buffer) ReadUInt32() uint32 {
 	var i uint32
-	read(b.b, &i)
+	if b.bw != nil {
+		readUint32(b.bw, &i)
+		return i
+	}
+
+	readBuffUint32(b.b, &i)
 	return i
 }
 
 // ReadUInt64 reads a uint64 value from the buffer.
 func (b *Buffer) ReadUInt64() uint64 {
 	var i uint64
-	read(b.b, &i)
+	if b.bw != nil {
+		readUint64(b.bw, &i)
+		return i
+	}
+
+	readBuffUint64(b.b, &i)
 	return i
 }
 
 // ReadFloat32 reads a float32 value from the buffer.
 func (b *Buffer) ReadFloat32() float32 {
 	var i float32
-	read(b.b, &i)
+	if b.bw != nil {
+		readFloat32(b.bw, &i)
+		return i
+	}
+
+	readBuffFloat32(b.b, &i)
 	return i
 }
 
 // ReadFloat64 reads a float64 value from the buffer.
 func (b *Buffer) ReadFloat64() float64 {
 	var i float64
-	read(b.b, &i)
+	if b.bw != nil {
+		readFloat64(b.bw, &i)
+		return i
+	}
+
+	readBuffFloat64(b.b, &i)
 	return i
 }
 
@@ -222,172 +346,67 @@ func (b *Buffer) ReadFloat64() float64 {
 // then uses the length to extract the exact length []byte representing the string.
 func (b *Buffer) ReadString() string {
 	var l uint16
-	read(b.b, &l)
-	return bytesToString(b.b.Next(int(l)))
-}
+	if b.bw != nil {
+		readUint16(b.bw, &l)
+		return bytesToString(b.bw.Next(int(l)))
+	}
 
-// ReadBytes reads the specified length from the buffer and returns the byte slice.
-func (b *Buffer) ReadBytes(length int) []byte {
-	return b.b.Next(length)
-}
+	readBuffUint16(b.b, &l)
 
-// Bytes returns the unread portion of the underlying buffer storage.
-func (b *Buffer) Bytes() []byte {
-	return b.b.Bytes()
-}
-
-// Read reads structured binary data from r into data.
-func read(r *bytes.Buffer, data interface{}) error {
-	order := binary.LittleEndian
-
-	var b [8]byte
-	if n := intDataSize(data); n != 0 {
-		bs := b[:n]
-
-		if _, err := readFull(r, bs); err != nil {
-			return err
-		}
-
-		switch data := data.(type) {
-		case *bool:
-			*data = bs[0] != 0
-		case *int8:
-			*data = int8(bs[0])
-		case *uint8:
-			*data = bs[0]
-		case *int16:
-			*data = int16(order.Uint16(bs))
-		case *uint16:
-			*data = order.Uint16(bs)
-		case *int32:
-			*data = int32(order.Uint32(bs))
-		case *uint32:
-			*data = order.Uint32(bs)
-		case *int64:
-			*data = int64(order.Uint64(bs))
-		case *uint64:
-			*data = order.Uint64(bs)
-		case *float32:
-			*data = math.Float32frombits(order.Uint32(bs))
-		case *float64:
-			*data = math.Float64frombits(order.Uint64(bs))
-		default:
-			n = 0 // fast path doesn't apply
-		}
-
-		if n != 0 {
-			return nil
-		}
+	bytes := bytePool.Get(int(l))
+	defer bytePool.Put(bytes)
+
+	_, err := readBuffFull(b.b, bytes)
+	if err != nil {
+		return ""
 	}
 
-	return NonPrimitiveTypeError
+	return bytesToString(bytes)
 }
 
-// read full is a bytes.Buffer specific implementation of ioutil.ReadFull() which
-// avoids escaping our stack allocated scratch bytes
-func readFull(r *bytes.Buffer, buf []byte) (n int, err error) {
-	min := len(buf)
-	for n < min && err == nil {
-		var nn int
-		nn, err = r.Read(buf[n:])
-		n += nn
-	}
-	if n >= min {
-		err = nil
-	} else if n > 0 && err == io.EOF {
-		err = io.ErrUnexpectedEOF
-	}
-	return
-}
-
-// Write writes the binary representation of data into w.
-func write(w *bytes.Buffer, data interface{}) error {
-	order := binary.LittleEndian
-
-	var b [8]byte
-	if n := intDataSize(data); n != 0 {
-		bs := b[:n]
-
-		switch v := data.(type) {
-		case *bool:
-			if *v {
-				bs[0] = 1
-			} else {
-				bs[0] = 0
-			}
-		case bool:
-			if v {
-				bs[0] = 1
-			} else {
-				bs[0] = 0
-			}
-		case *int8:
-			bs[0] = byte(*v)
-		case int8:
-			bs[0] = byte(v)
-		case *uint8:
-			bs[0] = *v
-		case uint8:
-			bs[0] = v
-		case *int16:
-			order.PutUint16(bs, uint16(*v))
-		case int16:
-			order.PutUint16(bs, uint16(v))
-		case *uint16:
-			order.PutUint16(bs, *v)
-		case uint16:
-			order.PutUint16(bs, v)
-		case *int32:
-			order.PutUint32(bs, uint32(*v))
-		case int32:
-			order.PutUint32(bs, uint32(v))
-		case *uint32:
-			order.PutUint32(bs, *v)
-		case uint32:
-			order.PutUint32(bs, v)
-		case *int64:
-			order.PutUint64(bs, uint64(*v))
-		case int64:
-			order.PutUint64(bs, uint64(v))
-		case *uint64:
-			order.PutUint64(bs, *v)
-		case uint64:
-			order.PutUint64(bs, v)
-		case *float32:
-			order.PutUint32(bs, math.Float32bits(*v))
-		case float32:
-			order.PutUint32(bs, math.Float32bits(v))
-		case *float64:
-			order.PutUint64(bs, math.Float64bits(*v))
-		case float64:
-			order.PutUint64(bs, math.Float64bits(v))
-		}
-
-		_, err := w.Write(bs)
-		return err
+// ReadBytes reads the specified length from the buffer and returns the byte slice.
+func (b *Buffer) ReadBytes(length int) []byte {
+	if b.bw != nil {
+		return b.bw.Next(length)
 	}
 
-	return NonPrimitiveTypeError
-}
-
-// intDataSize returns the size of the data required to represent the data when encoded.
-// It returns zero if the type cannot be implemented by the fast path in Read or Write.
-func intDataSize(data interface{}) int {
-	switch data.(type) {
-	case bool, int8, uint8, *bool, *int8, *uint8:
-		return 1
-	case int16, uint16, *int16, *uint16:
-		return 2
-	case int32, uint32, *int32, *uint32:
-		return 4
-	case int64, uint64, *int64, *uint64:
-		return 8
-	case float32, *float32:
-		return 4
-	case float64, *float64:
-		return 8
+	bytes := make([]byte, length)
+	_, err := readBuffFull(b.b, bytes)
+	if err != nil {
+		return bytes
 	}
-	return 0
+
+	return bytes
+}
+
+// bytesAsString converts a []byte into a string in place. Note that you should use this helper
+// when the []byte slice contains _only_ the string data and isn't part of a larger underlying array.
+// For example, a case where you should *not* use this helper:
+//
+//	func parseString(buffer *bytes.Buffer, length int) string {
+//	  bytes := buffer.Next(length)   // this extracts a sub-slice of the underlying byte array from pos->pos+length
+//
+//	  return bytesAsString(bytes)
+//	}
+//
+// Now both the []byte AND the value string are linked and neither can be GC'd until the other one is GC'd.
+// This is especially problematic once other references to the byte array are dropped: the string pins
+// the entire backing array alive, e.g. retaining a 1024-byte array just to hold an 11-byte string.
+//
+// An example where it _is_ ok, and recommended to drop the underlying []byte reference is the following:
+//
+//	func parseString(reader io.Reader, length int) string {
+//	  bytes := make([]byte, length)
+//	  io.ReadFull(reader, bytes)
+//
+//	  return bytesAsString(bytes)
+//	}
+//
+// In this case, we've created a byte array just big enough for the string, filled it from the reader,
+// and then cast the byte array to a string in place, after which the byte slice reference is dropped.
+// This avoids the extra allocation and copy that string(bytes) would incur.
+func bytesAsString(b []byte) string {
+	return unsafe.String(unsafe.SliceData(b), len(b))
 }
 
 // Conversion from byte slice to string
@@ -401,7 +420,7 @@ func bytesToString(b []byte) string {
 	// cached string. If it does _not_ exist, then we use the passed func() string to allocate a new
 	// string and cache it. This will prevent us from allocating throw-away strings just to
 	// check our cache.
-	pinned := unsafe.String(unsafe.SliceData(b), len(b))
+	pinned := bytesAsString(b)
 
 	return stringutil.BankFunc(pinned, func() string {
 		return string(b)

+ 90 - 8
core/pkg/util/buffer_test.go

@@ -2,8 +2,9 @@ package util
 
 import (
 	"bytes"
+	"io"
 	"math"
-	"math/rand"
+	"math/rand/v2"
 	"runtime"
 	"strings"
 	"testing"
@@ -182,8 +183,8 @@ func TestBufferWriteReadInt64(t *testing.T) {
 func TestBufferBytes(t *testing.T) {
 	buf := NewBuffer()
 
-	buf.WriteInt(42)
-	buf.WriteFloat64(3.14)
+	buf.WriteInt(-42)
+	buf.WriteFloat64(-3.14)
 
 	unreadBytes := buf.Bytes()
 
@@ -192,11 +193,11 @@ func TestBufferBytes(t *testing.T) {
 	intVal := newBuf.ReadInt()
 	floatVal := newBuf.ReadFloat64()
 
-	if intVal != 42 {
-		t.Errorf("Expected int value to be 42, got %v", intVal)
+	if intVal != -42 {
+		t.Errorf("Expected int value to be -42, got %v", intVal)
 	}
-	if floatVal != 3.14 {
-		t.Errorf("Expected float value to be 3.14, got %v", floatVal)
+	if floatVal != -3.14 {
+		t.Errorf("Expected float value to be -3.14, got %v", floatVal)
 	}
 }
 
@@ -224,7 +225,7 @@ const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 func generateRandomString(ln int) string {
 	b := make([]byte, ln)
 	for i := range b {
-		b[i] = letters[rand.Intn(len(letters))]
+		b[i] = letters[rand.IntN(len(letters))]
 	}
 	return string(b)
 }
@@ -283,6 +284,87 @@ func TestStringBytes(t *testing.T) {
 	}
 }
 
+type randomByteReader struct {
+	bytes []byte
+	pos   int
+}
+
+func newRandomByteReader(bytes []byte) *randomByteReader {
+	return &randomByteReader{
+		bytes: bytes,
+		pos:   0,
+	}
+}
+
+// Read copies a random number of bytes (1-4) into b on each call, simulating the
+// partial reads a buffered network stream can produce; it returns io.EOF at the end.
+func (sbr *randomByteReader) Read(b []byte) (int, error) {
+	if sbr.pos >= len(sbr.bytes) {
+		return 0, io.EOF
+	}
+
+	toCopy := rand.IntN(4) + 1
+	if toCopy > len(b) {
+		toCopy = len(b)
+	}
+
+	var err error
+	remaining := len(sbr.bytes) - sbr.pos
+	if toCopy > remaining {
+		err = io.EOF
+		toCopy = remaining
+	}
+
+	bytesCopied := copy(b, sbr.bytes[sbr.pos:sbr.pos+toCopy])
+	sbr.pos += bytesCopied
+
+	return bytesCopied, err
+}
+
+func TestBufferReaderSupport(t *testing.T) {
+	buf := NewBuffer()
+	buf.WriteBool(true)
+	buf.WriteInt(42)
+	buf.WriteFloat64(3.14)
+	buf.WriteString("Testing, 1, 2, 3!")
+	buf.WriteUInt64(uint64(123456))
+	buf.WriteInt16(44)
+	buf.WriteFloat32(float32(5.0))
+
+	reader := newRandomByteReader(buf.Bytes())
+	readerBuff := NewBufferFromReader(reader)
+
+	b := readerBuff.ReadBool()
+	i := readerBuff.ReadInt()
+	f := readerBuff.ReadFloat64()
+	s := readerBuff.ReadString()
+	ui64 := readerBuff.ReadUInt64()
+	i16 := readerBuff.ReadInt16()
+	f32 := readerBuff.ReadFloat32()
+
+	if !b {
+		t.Errorf("expected true, got: false")
+	}
+	if i != 42 {
+		t.Errorf("expected 42, got: %d", i)
+	}
+	if f != 3.14 {
+		t.Errorf("expected 3.14, got: %f", f)
+	}
+	if s != "Testing, 1, 2, 3!" {
+		t.Errorf("expected 'Testing, 1, 2, 3!', got: '%s'", s)
+	}
+	if ui64 != uint64(123456) {
+		t.Errorf("expected 123456, got: %d", ui64)
+	}
+	if i16 != int16(44) {
+		t.Errorf("expected 44, got: %d", i16)
+	}
+	if f32 != float32(5.0) {
+		t.Errorf("expected 5.0, got: %f", f32)
+	}
+}
+
 func TestTooLargeStringTruncate(t *testing.T) {
 	normalStr := generateRandomString(100)
 	bigStr := generateRandomString(math.MaxUint16 + (math.MaxUint16 / 2))

+ 493 - 0
core/pkg/util/bufferhelper.go

@@ -0,0 +1,493 @@
+package util
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"io"
+	"math"
+)
+
+func readBool(r *bytes.Buffer, data *bool) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = b != 0
+	return nil
+}
+
+func readInt8(r *bytes.Buffer, data *int8) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = int8(b)
+	return nil
+}
+
+func readUint8(r *bytes.Buffer, data *uint8) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = uint8(b)
+	return nil
+}
+
+func readInt16(r *bytes.Buffer, data *int16) error {
+	order := binary.LittleEndian
+	var b [2]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int16(order.Uint16(bs))
+	return nil
+}
+
+func readUint16(r *bytes.Buffer, data *uint16) error {
+	order := binary.LittleEndian
+	var b [2]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint16(bs)
+	return nil
+}
+
+func readInt(r *bytes.Buffer, data *int) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int(int32(order.Uint32(bs)))
+	return nil
+}
+
+func readInt32(r *bytes.Buffer, data *int32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int32(order.Uint32(bs))
+	return nil
+}
+
+func readUint(r *bytes.Buffer, data *uint) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = uint(order.Uint32(bs))
+	return nil
+}
+
+func readUint32(r *bytes.Buffer, data *uint32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint32(bs)
+	return nil
+}
+
+func readInt64(r *bytes.Buffer, data *int64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int64(order.Uint64(bs))
+	return nil
+}
+
+func readUint64(r *bytes.Buffer, data *uint64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint64(bs)
+	return nil
+}
+
+func readFloat32(r *bytes.Buffer, data *float32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = math.Float32frombits(order.Uint32(bs))
+	return nil
+}
+
+func readFloat64(r *bytes.Buffer, data *float64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = math.Float64frombits(order.Uint64(bs))
+	return nil
+}
+
+func readBuffBool(r *bufio.Reader, data *bool) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = b != 0
+	return nil
+}
+
+func readBuffInt8(r *bufio.Reader, data *int8) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = int8(b)
+	return nil
+}
+
+func readBuffUint8(r *bufio.Reader, data *uint8) error {
+	b, err := r.ReadByte()
+	if err != nil {
+		return err
+	}
+
+	*data = uint8(b)
+	return nil
+}
+
+func readBuffInt16(r *bufio.Reader, data *int16) error {
+	order := binary.LittleEndian
+	var b [2]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int16(order.Uint16(bs))
+	return nil
+}
+
+func readBuffUint16(r *bufio.Reader, data *uint16) error {
+	order := binary.LittleEndian
+	var b [2]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint16(bs)
+	return nil
+}
+
+func readBuffInt(r *bufio.Reader, data *int) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int(int32(order.Uint32(bs)))
+	return nil
+}
+
+func readBuffInt32(r *bufio.Reader, data *int32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int32(order.Uint32(bs))
+	return nil
+}
+
+func readBuffUint(r *bufio.Reader, data *uint) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = uint(order.Uint32(bs))
+	return nil
+}
+
+func readBuffUint32(r *bufio.Reader, data *uint32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint32(bs)
+	return nil
+}
+
+func readBuffInt64(r *bufio.Reader, data *int64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = int64(order.Uint64(bs))
+	return nil
+}
+
+func readBuffUint64(r *bufio.Reader, data *uint64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = order.Uint64(bs)
+	return nil
+}
+
+func readBuffFloat32(r *bufio.Reader, data *float32) error {
+	order := binary.LittleEndian
+	var b [4]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = math.Float32frombits(order.Uint32(bs))
+	return nil
+}
+
+func readBuffFloat64(r *bufio.Reader, data *float64) error {
+	order := binary.LittleEndian
+	var b [8]byte
+
+	bs := b[:]
+	_, err := readBuffFull(r, bs)
+	if err != nil {
+		return err
+	}
+
+	*data = math.Float64frombits(order.Uint64(bs))
+	return nil
+}
+
+// readBuffFull is a bufio.Reader-specific implementation of io.ReadFull that
+// avoids letting the caller's stack-allocated scratch bytes escape to the heap.
+func readBuffFull(r *bufio.Reader, buf []byte) (n int, err error) {
+	min := len(buf)
+	for n < min && err == nil {
+		var nn int
+		nn, err = r.Read(buf[n:])
+		n += nn
+	}
+	if n >= min {
+		err = nil
+	} else if n > 0 && err == io.EOF {
+		err = io.ErrUnexpectedEOF
+	}
+	return
+}
+
+// readFull is a bytes.Buffer-specific implementation of io.ReadFull that
+// avoids letting the caller's stack-allocated scratch bytes escape to the heap.
+func readFull(r *bytes.Buffer, buf []byte) (n int, err error) {
+	min := len(buf)
+	for n < min && err == nil {
+		var nn int
+		nn, err = r.Read(buf[n:])
+		n += nn
+	}
+	if n >= min {
+		err = nil
+	} else if n > 0 && err == io.EOF {
+		err = io.ErrUnexpectedEOF
+	}
+	return
+}
+
+func writeBool(w *bytes.Buffer, data bool) error {
+	if data {
+		w.WriteByte(1)
+		return nil
+	}
+
+	w.WriteByte(0)
+	return nil
+}
+
+func writeInt8(w *bytes.Buffer, data int8) error {
+	w.WriteByte(byte(data))
+	return nil
+}
+
+func writeUint8(w *bytes.Buffer, data uint8) error {
+	w.WriteByte(byte(data))
+	return nil
+}
+
+func writeInt16(w *bytes.Buffer, data int16) error {
+	var b [2]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint16(bs, uint16(data))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeUint16(w *bytes.Buffer, data uint16) error {
+	var b [2]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint16(bs, data)
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeInt32(w *bytes.Buffer, data int32) error {
+	var b [4]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint32(bs, uint32(data))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeUint32(w *bytes.Buffer, data uint32) error {
+	var b [4]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint32(bs, data)
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeInt(w *bytes.Buffer, data int) error {
+	var b [4]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint32(bs, uint32(int32(data)))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeUint(w *bytes.Buffer, data uint) error {
+	var b [4]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint32(bs, uint32(data))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeInt64(w *bytes.Buffer, data int64) error {
+	var b [8]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint64(bs, uint64(data))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeUint64(w *bytes.Buffer, data uint64) error {
+	var b [8]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint64(bs, data)
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeFloat32(w *bytes.Buffer, data float32) error {
+	var b [4]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint32(bs, math.Float32bits(data))
+	_, err := w.Write(bs)
+	return err
+}
+
+func writeFloat64(w *bytes.Buffer, data float64) error {
+	var b [8]byte
+	bs := b[:]
+
+	binary.LittleEndian.PutUint64(bs, math.Float64bits(data))
+	_, err := w.Write(bs)
+	return err
+}

+ 68 - 0
core/pkg/util/bufferpool.go

@@ -0,0 +1,68 @@
+package util
+
+import (
+	"math"
+	"math/bits"
+	"sync"
+)
+
// bufferPool holds "tiered" []byte `sync.Pool` instances, one per power-of-two
// capacity from 1<<0 through 1<<16 (the tier that serves lengths up to
// math.MaxUint16).
type bufferPool struct {
	pools [17]sync.Pool
}

// newBufferPool initializes every tier so that pool i allocates buffers of
// exactly 1<<i bytes when it is empty.
func newBufferPool() *bufferPool {
	bp := new(bufferPool)

	for i := range bp.pools {
		size := 1 << i // captured per-iteration by the closure below
		bp.pools[i].New = func() any {
			return make([]byte, size)
		}
	}
	return bp
}

// poolIndex returns the pool index for a buffer of the given size: the tier
// whose capacity (1<<i) is the smallest power of two >= length.
func poolIndex(length int) int {
	return bits.Len32(uint32(length - 1))
}

// putIndex returns the pool index for returning a buffer with the given capacity.
// It is the inverse of poolIndex: given a capacity that was originally handed out
// by Get, it finds the pool that owns it.
//
// Because Get always returns buffers with capacity 1<<i, the capacity here will
// always be a power of two. bits.Len32(1<<i) = i+1, so we subtract 1 to recover i.
func putIndex(capacity int) int {
	return bits.Len32(uint32(capacity)) - 1
}

// isPowerOfTwo reports whether capacity is a positive power of two.
//
// Fix: the previous version classified 0 (and some negative values) as powers
// of two; callers happened to guard against 0 separately, but the guard now
// lives here so the predicate is correct on its own.
func isPowerOfTwo(capacity int) bool {
	return capacity > 0 && capacity&(capacity-1) == 0
}

// Get returns a []byte with len == length drawn from the smallest tier that
// fits. Lengths beyond the pooled range are allocated directly, and
// non-positive lengths yield nil.
func (bp *bufferPool) Get(length int) []byte {
	if length <= 0 {
		return nil
	}

	// Beyond our pool range: allocate directly
	if length > math.MaxUint16 {
		return make([]byte, length)
	}

	buf := bp.pools[poolIndex(length)].Get().([]byte)
	return buf[:length]
}

// Put returns a buffer to the tier it came from so Get can reuse it. Buffers
// whose capacity is not a power of two within the tiered range (e.g. slices
// allocated outside the pool, or oversize direct allocations) are dropped for
// the GC to collect.
//
// Fix: the upper bound was previously math.MaxUint16 (65535), which silently
// discarded every buffer from the largest tier (capacity 1<<16, serving Get
// requests of 32769-65535 bytes) and defeated pooling for that entire size
// class. The bound is now 1<<16 so those buffers are recycled.
func (bp *bufferPool) Put(buf []byte) {
	capacity := cap(buf)
	if capacity == 0 || capacity > 1<<16 || !isPowerOfTwo(capacity) {
		return
	}

	// Restore full capacity before pooling so the next Get can reslice it.
	bp.pools[putIndex(capacity)].Put(buf[:capacity])
}

+ 311 - 0
core/pkg/util/bufferpool_test.go

@@ -0,0 +1,311 @@
+package util
+
import (
	"math"
	"math/bits"
	"strconv"
	"sync"
	"testing"
)
+
+// --- poolIndex / putIndex unit tests ---
+
// TestPoolIndex pins the tier chosen for a spread of request lengths,
// including exact powers of two and the values just around them.
func TestPoolIndex(t *testing.T) {
	cases := []struct {
		length int
		want   int
	}{
		{1, 0},
		{2, 1},
		{3, 2},
		{4, 2},
		{5, 3},
		{7, 3},
		{8, 3},
		{255, 8},
		{256, 8},
		{1023, 10},
		{1024, 10},
		{math.MaxUint16 - 50, 16},
	}
	for _, c := range cases {
		got := poolIndex(c.length)
		if got != c.want {
			t.Errorf("poolIndex(%d) = %d, want %d", c.length, got, c.want)
		}
	}
}

// TestAllocMinusOne checks that for lengths one below each power of two, the
// tier Get draws from (poolIndex) matches the tier Put would return the
// resulting capacity to (putIndex).
func TestAllocMinusOne(t *testing.T) {
	bp := newBufferPool()
	for i := 1; i <= 16; i++ {
		capacity := 1 << i
		length := capacity - 1
		if length <= 0 {
			continue
		}

		b := bp.Get(length)
		c := cap(b)

		pIndex := poolIndex(length)
		rIndex := putIndex(c)

		if pIndex != rIndex {
			t.Errorf("pIndex: %d != rIndex: %d\n", pIndex, rIndex)
		}

	}
}

func TestPutIndex(t *testing.T) {
	// putIndex must be the inverse of poolIndex for all power-of-two capacities
	// that Get hands out.
	for i := 1; i <= 16; i++ {
		cap := 1 << i
		got := putIndex(cap)
		if got != i {
			t.Errorf("putIndex(1<<%d = %d) = %d, want %d", i, cap, got, i)
		}
	}
}

// TestPoolIndexPutIndexRoundTrip exhaustively checks the whole pooled range.
func TestPoolIndexPutIndexRoundTrip(t *testing.T) {
	// For any requested length, the buffer Get returns has capacity 1<<poolIndex(length).
	// Confirm that putIndex maps that capacity back to the same pool slot.
	for length := 1; length <= math.MaxUint16; length++ {
		i := poolIndex(length)
		capacity := 1 << i
		j := putIndex(capacity)
		if i != j {
			t.Errorf("length=%d: poolIndex=%d, capacity=1<<%d=%d, putIndex=%d — round-trip broken",
				length, i, i, capacity, j)
		}
	}
}

// --- Get ---

func TestGetNilOnZeroOrNegative(t *testing.T) {
	bp := newBufferPool()
	for _, n := range []int{0, -1, -100} {
		if got := bp.Get(n); got != nil {
			t.Errorf("Get(%d) = %v, want nil", n, got)
		}
	}
}

func TestGetLengthIsExact(t *testing.T) {
	bp := newBufferPool()
	for _, n := range []int{1, 2, 3, 7, 8, 100, 1000, 65535, 65536} {
		buf := bp.Get(n)
		if len(buf) != n {
			t.Errorf("Get(%d): len = %d, want %d", n, len(buf), n)
		}
	}
}

func TestGetCapacityIsPowerOfTwo(t *testing.T) {
	bp := newBufferPool()
	for _, n := range []int{1, 2, 3, 4, 5, 100, 1000, 550, math.MaxUint16 - 100, math.MaxUint16} {
		buf := bp.Get(n)
		c := cap(buf)
		if c == 0 || !isPowerOfTwo(c) {
			t.Errorf("Get(%d): cap = %d, not a power of two", n, c)
		}
	}
}

func TestGetCapacityIsSmallestFittingPowerOfTwo(t *testing.T) {
	bp := newBufferPool()
	cases := []struct {
		n       int
		wantCap int
	}{
		{1, 1},
		{2, 2},
		{3, 4},
		{4, 4},
		{5, 8},
		{8, 8},
		{9, 16},
		{255, 256},
		{256, 256},
		{1024, 1024},
	}
	for _, c := range cases {
		buf := bp.Get(c.n)
		if cap(buf) != c.wantCap {
			t.Errorf("Get(%d): cap = %d, want %d", c.n, cap(buf), c.wantCap)
		}
	}
}

// TestGetOversizeFallback covers the direct-allocation path for lengths
// beyond the pooled range.
func TestGetOversizeFallback(t *testing.T) {
	bp := newBufferPool()
	n := math.MaxUint16 + 1
	buf := bp.Get(n)
	if len(buf) != n {
		t.Errorf("Get(MaxUint16+1): len = %d, want %d", len(buf), n)
	}
}
+
+// --- Put ---
+
func TestPutDropsZeroCapBuffer(t *testing.T) {
	// Put on a nil or zero-cap slice must not panic.
	bp := newBufferPool()
	bp.Put(nil)
	bp.Put([]byte{})
}

// --- Get / Put round-trip ---

func TestGetPutSamePool(t *testing.T) {
	// A buffer returned via Put must land in the same pool that Get draws from,
	// so the very next Get (with the same length) should reuse it.
	bp := newBufferPool()

	buf := bp.Get(100)
	ptr := &buf[0]
	bp.Put(buf)

	buf2 := bp.Get(100)
	if &buf2[0] != ptr {
		// sync.Pool may have GC'd the entry; this is not a hard failure but
		// we at minimum require length and capacity to be correct.
		if len(buf2) != 100 {
			t.Errorf("Get(100) after Put: len = %d, want 100", len(buf2))
		}
	}
}

func TestPutRestoresFullCapacity(t *testing.T) {
	// After Put, the pooled slice should have full capacity, not the resliced length.
	// We verify this by inspecting what comes out of the pool on the next Get.
	bp := newBufferPool()

	buf := bp.Get(10)  // len=10, cap=16
	bp.Put(buf)        // must put back with cap=16
	buf2 := bp.Get(15) // asks for 15 — still fits in cap=16 pool
	if cap(buf2) < 15 {
		t.Errorf("After Put(cap=16), Get(15): cap = %d, too small", cap(buf2))
	}
}

// TestIsPowerOfTwo checks the predicate for all pooled powers of two and a
// sample of non-powers.
func TestIsPowerOfTwo(t *testing.T) {
	for i := 0; i < 16; i++ {
		cap := 1 << i

		if !isPowerOfTwo(cap) {
			t.Fatalf("Failed at: i=%d, cap=%d\n", i, cap)
		}
	}

	for _, v := range []int{5, 17, 19, 31, 55} {
		if isPowerOfTwo(v) {
			t.Fatalf("Unexpected isPowerOfTwo: %d", v)
		}
	}
}

func TestPutNonPowerOfTwoCapIsDiscarded(t *testing.T) {
	// Buffers with non-power-of-two capacities (e.g. from outside the pool)
	// get silently dropped. Confirm no panic and pool still works after.
	bp := newBufferPool()
	value := make([]byte, 0, 17)
	bp.Put(value)

	buf := bp.Get(24)
	if len(buf) != 24 {
		t.Errorf("Get(24) after spurious Put: len = %d, want 24", len(buf))
	}
	if cap(buf) != 32 {
		t.Errorf("Get(24) after spurious Put: cap = %d, want 32", cap(buf))
	}
}

// --- Concurrency ---

// TestConcurrentGetPut hammers the pool from many goroutines; run with -race
// to catch data races.
func TestConcurrentGetPut(t *testing.T) {
	bp := newBufferPool()
	var wg sync.WaitGroup
	const goroutines = 64
	const iters = 1000

	for g := 0; g < goroutines; g++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for i := 0; i < iters; i++ {
				n := (id*iters + i) % 4096
				if n == 0 {
					n = 1
				}
				buf := bp.Get(n)
				if len(buf) != n {
					t.Errorf("concurrent Get(%d): len = %d", n, len(buf))
				}
				// Write to every byte to catch races under -race.
				for j := range buf {
					buf[j] = byte(j)
				}
				bp.Put(buf)
			}
		}(g)
	}
	wg.Wait()
}

// --- Edge cases at pool boundaries ---

func TestGetExactPowerOfTwo(t *testing.T) {
	// Exact powers of two are the boundary between two pools; confirm correct
	// bucket selection and full round-trip for each.
	bp := newBufferPool()
	for i := 0; i < 17; i++ {
		n := 1 << i
		buf := bp.Get(n)
		if len(buf) != n {
			t.Errorf("Get(1<<%d=%d): len = %d", i, n, len(buf))
		}
		expectedCap := 1 << (bits.Len16(uint16(n - 1)))
		if cap(buf) != expectedCap {
			t.Errorf("Get(1<<%d=%d): cap = %d, want %d", i, n, cap(buf), expectedCap)
		}
		bp.Put(buf)
	}
}

// NOTE(review): despite the name, this exercises math.MaxUint16 — consider
// renaming to TestGetMaxUint16.
func TestGetMaxInt16(t *testing.T) {
	i := poolIndex(math.MaxUint16)
	if i >= 17 {
		t.Errorf("poolIndex(MaxUint16) = %d, overflows pool array", i)
	}
}
+
+// --- Benchmarks ---
+
+func BenchmarkGetPut(b *testing.B) {
+	sizes := []int{64, 512, 4096, 65535}
+	for _, size := range sizes {
+		bp := newBufferPool()
+		b.Run("", func(b *testing.B) {
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				buf := bp.Get(size)
+				bp.Put(buf)
+			}
+		})
+	}
+}
+
+func BenchmarkGetPutParallel(b *testing.B) {
+	bp := newBufferPool()
+	b.ReportAllocs()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			buf := bp.Get(4096)
+			bp.Put(buf)
+		}
+	})
+}

+ 310 - 0
core/pkg/util/monitor/memory/helpers.go

@@ -0,0 +1,310 @@
+package memory
+
+import (
+	"fmt"
+	"math"
+	"slices"
+)
+
+//--------------------------------------------------------------------------
+//  Helper Types
+//--------------------------------------------------------------------------
+
// trackedValue maintains the state of an uninitialized value, a set value, and
// the previous value. The previous value is always used when the current value
// is unset. All values that are set become previous values when the current
// value changes, or Reset() is called.
type trackedValue struct {
	current  *float64 // nil while unset
	previous float64  // last value displaced by a change or Reset
}

// newTrackedValue returns a new trackedValue instance for tracking unset, set,
// and previous values for a float64.
func newTrackedValue() *trackedValue {
	return &trackedValue{}
}

// Value returns the current value when one is set; otherwise it falls back to
// the previous value.
func (tv *trackedValue) Value() float64 {
	if tv.current != nil {
		return *tv.current
	}
	return tv.previous
}

// IsSet returns `true` if the current value has been set.
func (tv *trackedValue) IsSet() bool {
	return tv.current != nil
}

// Set updates the current value if it differs from what is already stored and
// reports whether an update occurred. A displaced value becomes the previous
// value.
func (tv *trackedValue) Set(value float64) bool {
	switch {
	case tv.current == nil:
		tv.current = &value
		return true
	case *tv.current != value:
		tv.previous = *tv.current
		tv.current = &value
		return true
	default:
		return false
	}
}

// Reset moves a set current value into previous and marks current as unset.
// Calling Reset while already unset is a no-op.
func (tv *trackedValue) Reset() {
	if tv.current != nil {
		tv.previous = *tv.current
		tv.current = nil
	}
}

// Clear returns the tracker to its initial state: unset current, zero previous.
func (tv *trackedValue) Clear() {
	tv.current = nil
	tv.previous = 0.0
}
+
// The Cumulative Sum (CUSUM) is a statistical process control tool that plots
// the cumulative sums of deviations from a target mean to detect small,
// persistent shifts (0.5 to 2 sigma) in process performance quickly.
type cumulativeSum struct {
	slack float64 // K-factor: tolerated per-sample drift as a fraction of base
	sum   float64 // accumulated positive deviation beyond the slack
	base  float64 // calibrated baseline mean; 0 means "not calibrated yet"
}

// newCumulativeSum creates a new cumulativeSum instance with the provided slack.
func newCumulativeSum(slack float64) *cumulativeSum {
	return &cumulativeSum{slack: slack}
}

// Calibrate initializes the baseline for the CUSUM. This is generally the mean
// of the samples once there are enough to consider the sample set as "stable."
// Subsequent calls are ignored until Reset.
func (cs *cumulativeSum) Calibrate(mean float64) {
	if cs.base == 0.0 {
		cs.base = mean
	}
}

// Update folds a new sample into the running sum. Samples arriving before the
// baseline has been calibrated are ignored.
func (cs *cumulativeSum) Update(value float64) {
	if cs.base == 0.0 {
		return
	}

	deviation := (value - cs.base) - cs.base*cs.slack
	cs.sum = max(0, cs.sum+deviation)
}

// Sum returns the current CUSUM value.
func (cs *cumulativeSum) Sum() float64 {
	return cs.sum
}

// IsRecalibrationRequired reports whether the accumulated sum has surpassed
// base * thresholdMagnitude, signalling that a recalibration should be
// performed. It is always false before calibration.
func (cs *cumulativeSum) IsRecalibrationRequired(thresholdMagnitude float64) bool {
	if cs.base == 0.0 {
		return false
	}

	return cs.sum > cs.base*thresholdMagnitude
}

// Reset drops the baseline and accumulated sum, requiring a new Calibrate.
func (cs *cumulativeSum) Reset() {
	cs.base = 0.0
	cs.sum = 0.0
}
+
// exponentialMovingAverage is a helper type that tracks the current moving
// average value using a provided smoothing factor.
type exponentialMovingAverage struct {
	smoothing float64 // weight given to each new sample, in [0, 1]
	value     float64
	set       bool // false until the first sample seeds the average
}

// newExponentialMovingAverage creates a new exponential moving average
// instance using the provided smoothing factor.
func newExponentialMovingAverage(smoothing float64) *exponentialMovingAverage {
	return &exponentialMovingAverage{smoothing: smoothing}
}

// Update folds the provided sample into the moving average and returns the
// updated value. The first sample simply seeds the average.
func (ema *exponentialMovingAverage) Update(sample float64) float64 {
	if ema.set {
		ema.value = ema.smoothing*sample + (1.0-ema.smoothing)*ema.value
	} else {
		ema.value = sample
		ema.set = true
	}
	return ema.value
}

// Current returns the current moving average value.
func (ema *exponentialMovingAverage) Current() float64 {
	return ema.value
}

// Reset clears the moving average so the next sample seeds it again.
func (ema *exponentialMovingAverage) Reset() {
	ema.set = false
	ema.value = 0.0
}
+
+// rollingWindow is a ring buffer helper type for tracking a set capacity number of
+// the most recent values. It also provides helper methods for calculating mean,
+// stddev, and percentiles of the contained data.
+type rollingWindow struct {
+	capacity int
+	length   int
+	window   []float64
+	index    int
+}
+
+// creates a new rolling window instance with the provided static capacity.
+func newRollingWindow(capacity int) *rollingWindow {
+	if capacity <= 0 || capacity > (math.MaxInt/2) {
+		panic(fmt.Sprintf("RollingWindow capacity limited to range 1-%d", math.MaxInt/2))
+	}
+
+	return &rollingWindow{
+		capacity: capacity,
+		window:   make([]float64, capacity),
+		index:    0,
+	}
+}
+
+// Pushes a new value into the rolling window, dropping the oldest value if
+// the total length surpasses the capacity.
+func (rw *rollingWindow) Push(value float64) {
+	// advance index, handle overflow
+	index := rw.index % rw.capacity
+	rw.window[index] = value
+
+	rw.index = (rw.index + 1) % rw.capacity
+	rw.length = min(rw.length+1, rw.capacity)
+}
+
+// Clears the rolling window values
+func (rw *rollingWindow) Clear() {
+	rw.window = make([]float64, rw.capacity)
+	rw.index = 0
+	rw.length = 0
+}
+
+// The length of the rolling window. Will never be greater that the `Cap()`.
+func (rw *rollingWindow) Len() int {
+	return rw.length
+}
+
+// Cap returns the maximum capacity of the rolling window.
+func (rw *rollingWindow) Cap() int {
+	return rw.capacity
+}
+
+// Each iterates all values within the rolling window and calls `f` passing each value.
+// NOTE: Ordering is _not_ guaranteed!
+func (rw *rollingWindow) Each(f func(float64)) {
+	total := rw.Len()
+	for i := range total {
+		f(rw.window[i])
+	}
+}
+
+// Mean returns the average of the values in the window
+func (rw *rollingWindow) Mean() float64 {
+	length := rw.Len()
+	if length == 0 {
+		return 0.0
+	}
+
+	sum := 0.0
+	for i := range length {
+		sum += rw.window[i]
+	}
+	return sum / float64(length)
+}
+
+// MeanStdDev computes the mean and standard deviation of the window values.
+func (rw *rollingWindow) MeanStdDev() (mean float64, stddev float64) {
+	mean = rw.Mean()
+
+	length := rw.Len()
+	if length < 2 {
+		return mean, 0
+	}
+
+	variance := 0.0
+	for i := range length {
+		d := rw.window[i] - mean
+		variance += d * d
+	}
+
+	// sample variance (Bessel's correction)
+	variance /= float64(length - 1)
+	stddev = math.Sqrt(variance)
+	return
+}
+
+// Percentile computes the p-th percentile of the values currently stored in the
+// rolling window.
+func (rw *rollingWindow) Percentile(p float64) float64 {
+	length := rw.Len()
+	if length == 0 {
+		return 0
+	}
+
+	sorted := make([]float64, length)
+	for i := range length {
+		sorted[i] = rw.window[i]
+	}
+	slices.Sort(sorted)
+
+	if p <= 0 {
+		return sorted[0]
+	}
+	if p >= 100 {
+		return sorted[len(sorted)-1]
+	}
+
+	rank := (p / 100.0) * float64(len(sorted)-1)
+	lo := int(math.Floor(rank))
+	hi := int(math.Ceil(rank))
+	frac := rank - float64(lo)
+
+	return sorted[lo]*(1-frac) + sorted[hi]*frac
+}
+
+// IsConfidenceSatisfied checks the relative margin of error is within the provided
+// `marginPercent` threshold using the provided z-score.
+func (rw *rollingWindow) IsConfidenceSatisfied(z float64, marginPercent float64) bool {
+	mean, stddev := rw.MeanStdDev()
+	length := float64(rw.Len())
+	marginOfError := z * (stddev / math.Sqrt(length))
+	relative := marginOfError / mean
+
+	return relative <= marginPercent
+}

+ 418 - 0
core/pkg/util/monitor/memory/helpers_test.go

@@ -0,0 +1,418 @@
+package memory
+
+import (
+	"math"
+	"testing"
+)
+
// epsilon is the tolerance used for float comparisons throughout these tests.
const epsilon = 1e-9

// set is a minimal generic hash set used by the rolling-window tests.
type set[T comparable] struct {
	m map[T]struct{}
}

// newSet builds a set pre-populated with the given values.
func newSet[T comparable](values ...T) *set[T] {
	s := &set[T]{m: make(map[T]struct{}, len(values))}
	for _, v := range values {
		s.add(v)
	}
	return s
}

// add inserts value into the set.
func (s *set[T]) add(value T) {
	s.m[value] = struct{}{}
}

// has reports whether value is present in the set.
func (s *set[T]) has(value T) bool {
	_, ok := s.m[value]
	return ok
}

// remove deletes value from the set, if present.
func (s *set[T]) remove(value T) {
	delete(s.m, value)
}
+
+// -------------------------------------------------------------------------
+//  exponentialMovingAverage tests
+// -------------------------------------------------------------------------
+
// TestEMA_InitialState verifies the zero state before any samples arrive.
func TestEMA_InitialState(t *testing.T) {
	ema := newExponentialMovingAverage(0.5)
	if ema.set {
		t.Error("expected ema.set to be false on creation")
	}
	if ema.Current() != 0.0 {
		t.Errorf("expected initial Current() = 0.0, got %f", ema.Current())
	}
}

// TestEMA_FirstUpdateSetsValue verifies the first sample seeds the average.
func TestEMA_FirstUpdateSetsValue(t *testing.T) {
	ema := newExponentialMovingAverage(0.5)
	got := ema.Update(42.0)
	if got != 42.0 {
		t.Errorf("expected first Update() = 42.0, got %f", got)
	}
	if !ema.set {
		t.Error("expected ema.set to be true after first update")
	}
}

func TestEMA_SubsequentUpdates(t *testing.T) {
	// With smoothing=0.5: EMA(n) = 0.5*sample + 0.5*EMA(n-1)
	ema := newExponentialMovingAverage(0.5)
	ema.Update(10.0)        // value = 10
	got := ema.Update(20.0) // value = 0.5*20 + 0.5*10 = 15
	want := 15.0
	if math.Abs(got-want) > epsilon {
		t.Errorf("expected %f, got %f", want, got)
	}
}

func TestEMA_SmoothingZero(t *testing.T) {
	// smoothing=0 means the value never changes after the first sample
	ema := newExponentialMovingAverage(0.0)
	ema.Update(5.0)
	ema.Update(100.0)
	ema.Update(999.0)
	if ema.Current() != 5.0 {
		t.Errorf("expected Current() = 5.0, got %f", ema.Current())
	}
}

func TestEMA_SmoothingOne(t *testing.T) {
	// smoothing=1 means the value is always the latest sample
	ema := newExponentialMovingAverage(1.0)
	ema.Update(5.0)
	ema.Update(99.0)
	if ema.Current() != 99.0 {
		t.Errorf("expected Current() = 99.0, got %f", ema.Current())
	}
}

func TestEMA_Reset(t *testing.T) {
	ema := newExponentialMovingAverage(0.5)
	ema.Update(10.0)
	ema.Reset()
	if ema.set {
		t.Error("expected ema.set to be false after Reset()")
	}
	if ema.Current() != 0.0 {
		t.Errorf("expected Current() = 0.0 after Reset(), got %f", ema.Current())
	}
	// First update after reset should treat as a fresh start
	got := ema.Update(7.0)
	if got != 7.0 {
		t.Errorf("expected first Update() after Reset() = 7.0, got %f", got)
	}
}

// TestEMA_MultipleUpdates re-derives the expected EMA step by step and
// compares every intermediate value against Update's return.
func TestEMA_MultipleUpdates(t *testing.T) {
	smoothing := 0.3
	ema := newExponentialMovingAverage(smoothing)

	samples := []float64{10, 20, 30, 40, 50}
	want := samples[0]
	for i, s := range samples {
		got := ema.Update(s)
		if i == 0 {
			want = s
		} else {
			want = smoothing*s + (1-smoothing)*want
		}
		if math.Abs(got-want) > epsilon {
			t.Errorf("step %d: expected %f, got %f", i, want, got)
		}
	}
}

// -------------------------------------------------------------------------
//  rollingWindow tests
// -------------------------------------------------------------------------

// NOTE(review): the loop variable `cap` below shadows the builtin; harmless
// here, but worth renaming.
func TestRollingWindow_NewPanicsOnBadCapacity(t *testing.T) {
	cases := []int{0, -1, math.MaxInt}
	for _, cap := range cases {
		func() {
			defer func() {
				if r := recover(); r == nil {
					t.Errorf("expected panic for capacity %d", cap)
				}
			}()
			newRollingWindow(cap)
		}()
	}
}

func TestRollingWindow_InitialLen(t *testing.T) {
	rw := newRollingWindow(5)
	if rw.Len() != 0 {
		t.Errorf("expected Len() = 0, got %d", rw.Len())
	}
	if rw.Cap() != 5 {
		t.Errorf("expected Cap() = 5, got %d", rw.Cap())
	}
}

func TestRollingWindow_LenGrowsUpToCapacity(t *testing.T) {
	rw := newRollingWindow(3)
	rw.Push(1)
	if rw.Len() != 1 {
		t.Errorf("expected Len()=1, got %d", rw.Len())
	}
	rw.Push(2)
	if rw.Len() != 2 {
		t.Errorf("expected Len()=2, got %d", rw.Len())
	}
	rw.Push(3)
	if rw.Len() != 3 {
		t.Errorf("expected Len()=3, got %d", rw.Len())
	}
	// Pushing beyond capacity should not grow Len() past Cap()
	rw.Push(4)
	if rw.Len() != 3 {
		t.Errorf("expected Len()=3 after overflow push, got %d", rw.Len())
	}
}

func TestRollingWindow_Clear(t *testing.T) {
	rw := newRollingWindow(4)
	rw.Push(1)
	rw.Push(2)
	rw.Clear()
	if rw.Len() != 0 {
		t.Errorf("expected Len()=0 after Clear(), got %d", rw.Len())
	}
}

func TestRollingWindow_Mean_Empty(t *testing.T) {
	rw := newRollingWindow(4)
	if rw.Mean() != 0.0 {
		t.Errorf("expected Mean()=0 for empty window, got %f", rw.Mean())
	}
}

func TestRollingWindow_Mean_SingleValue(t *testing.T) {
	rw := newRollingWindow(4)
	rw.Push(7.0)
	if rw.Mean() != 7.0 {
		t.Errorf("expected Mean()=7.0, got %f", rw.Mean())
	}
}

func TestRollingWindow_Mean_MultipleValues(t *testing.T) {
	rw := newRollingWindow(5)
	for _, v := range []float64{1, 2, 3, 4, 5} {
		rw.Push(v)
	}
	want := 3.0
	if math.Abs(rw.Mean()-want) > epsilon {
		t.Errorf("expected Mean()=%f, got %f", want, rw.Mean())
	}
}

func TestRollingWindow_MeanStdDev_SingleValue(t *testing.T) {
	rw := newRollingWindow(4)
	rw.Push(10.0)
	mean, stddev := rw.MeanStdDev()
	if mean != 10.0 {
		t.Errorf("expected mean=10.0, got %f", mean)
	}
	if stddev != 0.0 {
		t.Errorf("expected stddev=0.0 for single value, got %f", stddev)
	}
}
+
+func TestRollingWindow_MeanStdDev_KnownValues(t *testing.T) {
+	rw := newRollingWindow(5)
+	for _, v := range []float64{2, 4, 4, 4, 5, 5, 7, 9} {
+		rw.Push(v)
+	}
+	// Window holds only the last 5: [5, 5, 7, 9, 9] — wait, cap=5.
+	// Pushes: index 0=2,1=4,2=4,3=4,4=5 -> wraps: index 0=5,1=7,2=9
+	// Use a simpler known case instead.
+	rw2 := newRollingWindow(4)
+	for _, v := range []float64{10, 20, 30, 40} {
+		rw2.Push(v)
+	}
+	mean, stddev := rw2.MeanStdDev()
+	wantMean := 25.0
+	// sample stddev of {10,20,30,40} = sqrt(((−15)²+(−5)²+(5)²+(15)²)/3) = sqrt(500/3)
+	wantStddev := math.Sqrt(500.0 / 3.0)
+	if math.Abs(mean-wantMean) > epsilon {
+		t.Errorf("expected mean=%f, got %f", wantMean, mean)
+	}
+	if math.Abs(stddev-wantStddev) > epsilon {
+		t.Errorf("expected stddev=%f, got %f", wantStddev, stddev)
+	}
+}
+
func TestRollingWindow_Percentile_Empty(t *testing.T) {
	rw := newRollingWindow(4)
	if rw.Percentile(50) != 0.0 {
		t.Errorf("expected 0.0 for empty window percentile, got %f", rw.Percentile(50))
	}
}

// TestRollingWindow_Percentile_BoundaryValues pins p0 to the minimum and p100
// to the maximum of the stored values.
func TestRollingWindow_Percentile_BoundaryValues(t *testing.T) {
	rw := newRollingWindow(5)
	for _, v := range []float64{3, 1, 4, 1, 5} {
		rw.Push(v)
	}
	if rw.Percentile(0) != 1.0 {
		t.Errorf("expected p0=1.0, got %f", rw.Percentile(0))
	}
	if rw.Percentile(100) != 5.0 {
		t.Errorf("expected p100=5.0, got %f", rw.Percentile(100))
	}
}

func TestRollingWindow_Percentile_Median(t *testing.T) {
	rw := newRollingWindow(5)
	for _, v := range []float64{1, 2, 3, 4, 5} {
		rw.Push(v)
	}
	got := rw.Percentile(50)
	want := 3.0
	if math.Abs(got-want) > epsilon {
		t.Errorf("expected p50=%f, got %f", want, got)
	}
}

func TestRollingWindow_Percentile_Interpolation(t *testing.T) {
	rw := newRollingWindow(4)
	for _, v := range []float64{0, 10, 20, 30} {
		rw.Push(v)
	}
	// rank = 0.25 * 3 = 0.75, lo=0(val=0), hi=1(val=10), frac=0.75 => 0*0.25 + 10*0.75 = 7.5
	got := rw.Percentile(25)
	want := 7.5
	if math.Abs(got-want) > epsilon {
		t.Errorf("expected p25=%f, got %f", want, got)
	}
}

func TestRollingWindow_IsConfidenceSatisfied(t *testing.T) {
	rw := newRollingWindow(100)
	// All the same value — stddev=0, margin=0, should always be satisfied
	for range 100 {
		rw.Push(50.0)
	}
	if !rw.IsConfidenceSatisfied(1.96, 0.05) {
		t.Error("expected confidence satisfied for zero-variance data")
	}
}

func TestRollingWindow_IsConfidenceSatisfied_HighVariance(t *testing.T) {
	rw := newRollingWindow(10)
	// High variance: alternating 1 and 1000
	for i := range 10 {
		if i%2 == 0 {
			rw.Push(1.0)
		} else {
			rw.Push(1000.0)
		}
	}
	// With high variance and small n, a tight margin should not be satisfied
	if rw.IsConfidenceSatisfied(1.96, 0.001) {
		t.Error("expected confidence NOT satisfied for high-variance data with tight margin")
	}
}

func TestRollingWindow_Each(t *testing.T) {
	rw := newRollingWindow(4)
	for _, v := range []float64{1, 2, 3, 4} {
		rw.Push(v)
	}
	sum := 0.0
	rw.Each(func(v float64) { sum += v })
	if math.Abs(sum-10.0) > epsilon {
		t.Errorf("expected Each() sum=10.0, got %f", sum)
	}
}

func TestRollingWindow_OverwritesOldestOnOverflow(t *testing.T) {
	rw := newRollingWindow(3)
	rw.Push(1)
	rw.Push(2)
	rw.Push(3)
	rw.Push(100) // Should evict 1, window = [2, 3, 100]
	mean := rw.Mean()
	want := (2.0 + 3.0 + 100.0) / 3.0
	if math.Abs(mean-want) > epsilon {
		t.Errorf("expected mean=%f after overflow, got %f", want, mean)
	}
}

// assertIndexLengthCap asserts the window's internal write index together
// with its observable Len()/Cap() in a single call.
func assertIndexLengthCap(t *testing.T, rw *rollingWindow, index int, length int, cap int) {
	t.Helper()

	if rw.index != index {
		t.Errorf("RollingWindow Index: %d. Expected %d", rw.index, index)
	}
	if rw.Len() != length {
		t.Errorf("RollingWindow Length: %d. Expected %d", rw.Len(), length)
	}
	if rw.Cap() != cap {
		t.Errorf("RollingWindow Capacity: %d. Expected %d", rw.Cap(), cap)
	}
}

// TestRollingWindow_BasicIndexingLengthCap walks the write index through two
// full wraps and verifies the retained value set after each refill.
func TestRollingWindow_BasicIndexingLengthCap(t *testing.T) {
	capacity := 3
	rw := newRollingWindow(capacity)

	assertIndexLengthCap(t, rw, 0, 0, 3)
	rw.Push(1)
	assertIndexLengthCap(t, rw, 1, 1, 3)
	rw.Push(2)
	assertIndexLengthCap(t, rw, 2, 2, 3)
	rw.Push(3)
	assertIndexLengthCap(t, rw, 0, 3, 3)

	rw.Push(1)
	assertIndexLengthCap(t, rw, 1, 3, 3)
	rw.Push(2)
	assertIndexLengthCap(t, rw, 2, 3, 3)
	rw.Push(3)
	assertIndexLengthCap(t, rw, 0, 3, 3)

	set := newSet(1, 2, 3)

	rw.Each(func(value float64) {
		v := int(value)
		if !set.has(v) {
			t.Errorf("Failed to find value: %d in set.\n", v)
		}

		set.remove(v)
	})

	// rewrite
	rw.Push(4)
	rw.Push(5)
	rw.Push(6)

	set = newSet(4, 5, 6)
	rw.Each(func(value float64) {
		v := int(value)
		if !set.has(v) {
			t.Errorf("Failed to find value: %d in set.\n", v)
		}

		set.remove(v)
	})
}

func TestRollingWindow_PartialCapacityMean(t *testing.T) {
	rw := newRollingWindow(10)
	for range 5 {
		rw.Push(5.0)
	}

	mean := rw.Mean()
	if mean != 5.0 {
		t.Errorf("Expected mean = 5.0. Got %f\n", mean)
	}
}

+ 226 - 0
core/pkg/util/monitor/memory/memorylimitstats.go

@@ -0,0 +1,226 @@
+package memory
+
+import "sync"
+
// MemoryLimitConfig contains configuration values used to calculate the soft
// memory limit based on heap usage over time.
type MemoryLimitConfig struct {
	// LimitRatio is the ratio applied to memory limit values calculated. This
	// is generally set to 90% of the proposed limit. ie: 0.9
	LimitRatio float64

	// MinSamples is the required number of samples that must be collected before
	// calculating a memory limit.
	MinSamples int

	// WindowSize is the total number of smoothed samples to maintain when calculating
	// the proposed limit. This is generally set based on the timescale in which
	// samples are added.
	WindowSize int

	// SmoothingFactor is a value between 0 and 1 which defines how weight importance
	// should be distributed between a previous average value and the current observation.
	SmoothingFactor float64

	// BreachWindowSize is the total number of recent raw samples that are maintained
	// and used for breach detection.
	BreachWindowSize int

	// BreachThreshold is a limit of raw samples, within the `BreachWindowSize`, allowed to
	// exceed the memory limit. If this threshold is reached, the samples are recalibrated.
	BreachThreshold int

	// CumulativeSumSlack, also known as the K-Factor (drift tolerance) in cumulative sum
	// control charts, is the allowable slack range in deviations. If the deviations exceed
	// the allowable slack, then they're used to calculate the sum. This is generally set
	// from 0.5 to 1.0 standard deviations to filter out process noise.
	CumulativeSumSlack float64

	// CumulativeSumThreshold is a scaler applied to the "baseline" mean (set once there are enough
	// samples to be considered "stable"). If the cumulative sum ever surpasses this baseline * threshold,
	// the samples will be recalibrated.
	CumulativeSumThreshold float64
}
+
+// DefaultMemoryLimitConfig creates the recommended values to use for detecting soft memory limit updates
+func DefaultMemoryLimitConfig() *MemoryLimitConfig {
+	return &MemoryLimitConfig{
+		LimitRatio:             0.90,
+		MinSamples:             30,
+		WindowSize:             60,
+		SmoothingFactor:        0.30,
+		BreachWindowSize:       10,
+		BreachThreshold:        3,
+		CumulativeSumSlack:     0.05,
+		CumulativeSumThreshold: 5.0,
+	}
+}
+
// MemoryLimitStats is a run-time memory statistics collector that maintains a soft memory limit
// value based on configurable input parameters. It is designed to adjust the soft limit based on
// meaningful changes to overall heap allocation, leveraging exponential moving average windows,
// confidence interval gates, breach detection, and cumulative sum control chart to detect meaningful
// deviations from the mean.
type MemoryLimitStats struct {
	// lock guards all fields below; every public method takes it.
	lock   sync.Mutex
	config *MemoryLimitConfig

	// exponential moving average calculation
	ema *exponentialMovingAverage

	// ring buffers for tracking exponential moving averages and raw samples:
	// window holds smoothed samples (sized WindowSize), raw holds raw samples
	// (sized MinSamples), breach holds raw samples for breach detection
	// (sized BreachWindowSize).
	window *rollingWindow
	raw    *rollingWindow
	breach *rollingWindow

	// cusum calculation for detecting positive shifts in memory usage
	cusum *cumulativeSum

	// tracked value storage for the soft memory limit proposal which
	// stores the previous limit as well as the current limit
	softLimit *trackedValue
}
+
+// NewMemoryLimitStats creates a new `MemoryLimitStats` instance with the provided
+// `MemoryLimitConfig`. If the provided config is `nil`, then the default configuration
+// values are used.
+func NewMemoryLimitStats(config *MemoryLimitConfig) *MemoryLimitStats {
+	if config == nil {
+		config = DefaultMemoryLimitConfig()
+	}
+
+	return &MemoryLimitStats{
+		config:    config,
+		ema:       newExponentialMovingAverage(config.SmoothingFactor),
+		window:    newRollingWindow(config.WindowSize),
+		raw:       newRollingWindow(config.MinSamples),
+		breach:    newRollingWindow(config.BreachWindowSize),
+		cusum:     newCumulativeSum(config.CumulativeSumSlack),
+		softLimit: newTrackedValue(),
+	}
+}
+
// Record ingests the total heap memory usage (in bytes), and returns
// (newSoftLimit, true) when the soft limit has been updated, or
// (currentSoftLimit, false) when no change occurred.
//
// A return value of (0, false) means the monitor is still collecting samples
// and no limit has been committed yet.
func (mls *MemoryLimitStats) Record(heapBytes uint64) (softLimit uint64, updated bool) {
	mls.lock.Lock()
	defer mls.lock.Unlock()

	// The raw sample feeds the EMA; the smoothed result drives the percentile
	// window while the raw value feeds the mean and breach windows.
	sample := float64(heapBytes)
	smoothed := mls.ema.Update(sample)

	mls.window.Push(smoothed)
	mls.raw.Push(sample)
	mls.breach.Push(sample)

	// Check that the minimum number of samples exist in the window before
	// calculating the memory limit
	totalSamples := mls.window.Len()
	if totalSamples < mls.config.MinSamples {
		return uint64(mls.softLimit.Value()), false
	}

	// NOTE: We could calculate the mean and stddev here, and determine if the data
	// NOTE: matches a confidence interval, but this might be too strict. See the
	// NOTE: method: mls.window.IsConfidenceSatisfied(...) method.

	// Pull the P99 sample from the smoothed sample window
	p99 := mls.window.Percentile(99)
	candidate := p99 * mls.config.LimitRatio

	// Ensure we've already set a soft limit before running breach
	// detection or CUSUM deviation tests.
	if mls.softLimit.IsSet() {
		if mls.isBreachDetected() {
			mls.recalibrate()
			return uint64(mls.softLimit.Value()), false
		}

		// update cumulative sum and check for recalibration
		mls.cusum.Update(sample)
		if mls.cusum.IsRecalibrationRequired(mls.config.CumulativeSumThreshold) {
			mls.recalibrate()
			return uint64(mls.softLimit.Value()), false
		}

		// this will only end up running once after the min samples threshold
		// is passed, and sets the baseline mean for the cusum calculations
		//
		// NOTE(review): this branch executes on every Record call once a
		// limit is set, so the "runs once" claim only holds if
		// cumulativeSum.Calibrate ignores calls after a baseline exists —
		// confirm against the cumulativeSum implementation.
		mean := mls.raw.Mean()
		mls.cusum.Calibrate(mean)
	}

	// update the soft limit to the candidate sample
	updated = mls.softLimit.Set(candidate)
	softLimit = uint64(mls.softLimit.Value())
	return
}
+
// SoftMemoryLimit returns the current soft limit without recording a sample.
// Returns 0 if the monitor is still collecting data samples.
func (mls *MemoryLimitStats) SoftMemoryLimit() uint64 {
	mls.lock.Lock()
	defer mls.lock.Unlock()

	return uint64(mls.softLimit.Value())
}
+
+// TotalSamples returns the total number of samples _currently_ being used to
+// calculate the memory limit. The samples will reset if a deviation threshold
+// was reached in order to re-establish stability in the data set.
+func (mls *MemoryLimitStats) TotalSamples() int {
+	mls.lock.Lock()
+	defer mls.lock.Unlock()
+
+	return mls.window.Len()
+}
+
+// Reset clears all state and samples collected.
+func (mls *MemoryLimitStats) Reset() {
+	mls.lock.Lock()
+	defer mls.lock.Unlock()
+
+	mls.window.Clear()
+	mls.raw.Clear()
+	mls.breach.Clear()
+	mls.softLimit.Clear()
+	mls.ema.Reset()
+	mls.cusum.Reset()
+}
+
+// isBreachDetected iterates through the breach sample window and tallies the
+// total number of samples that exceed the p99 smoothed memory usage sample.
+func (mls *MemoryLimitStats) isBreachDetected() bool {
+	if !mls.softLimit.IsSet() {
+		return false
+	}
+
+	// due to the nature of breach detection, we want to compare
+	// against the smoothed p99 sample, so unroll the ratio
+	p99 := mls.softLimit.Value() / mls.config.LimitRatio
+
+	// Tally the total number of recent raw samples that are
+	// greater than the p99 smoothed sample
+	count := 0
+	mls.breach.Each(func(value float64) {
+		if value > p99 {
+			count++
+		}
+	})
+
+	return count >= mls.config.BreachThreshold
+}
+
+// recalibrate dumps the existing samples and calculations, but will preserve
+// the previous softLimit value until a new soft limit is set.
+func (mls *MemoryLimitStats) recalibrate() {
+	mls.window.Clear()
+	mls.raw.Clear()
+	mls.breach.Clear()
+	mls.ema.Reset()
+	mls.cusum.Reset()
+	mls.softLimit.Reset()
+}

+ 141 - 0
core/pkg/util/monitor/memory/memorylimitstats_test.go

@@ -0,0 +1,141 @@
+package memory_test
+
+import (
+	"math/rand"
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/util/monitor/memory"
+)
+
// TestObservationMode verifies that no soft limit is committed while fewer
// than MinSamples observations have been recorded.
func TestObservationMode(t *testing.T) {
	config := memory.DefaultMemoryLimitConfig()
	m := memory.NewMemoryLimitStats(config)

	// Feed fewer than MinSamples — should never return updated=true.
	for i := 0; i < config.MinSamples-1; i++ {
		_, updated := m.Record(100 * 1024 * 1024) // 100 MiB
		if updated {
			t.Fatalf("sample %d: got updated=true before MinSamples reached", i)
		}
	}
	if got := m.SoftMemoryLimit(); got != 0 {
		t.Fatalf("expected SoftLimit 0 during observation, got %d", got)
	}
}
+
// TestLimitCommittedAfterMinSamples verifies that a perfectly stable
// allocation commits a soft limit once MinSamples is reached, and that the
// limit lands near LimitRatio (90%) of the stable value.
func TestLimitCommittedAfterMinSamples(t *testing.T) {
	config := memory.DefaultMemoryLimitConfig()
	m := memory.NewMemoryLimitStats(config)

	const alloc = 200 * 1024 * 1024 // 200 MiB, perfectly stable
	var lastLimit uint64
	var sawUpdate bool

	for i := 0; i < config.MinSamples+10; i++ {
		limit, updated := m.Record(alloc)
		if updated {
			sawUpdate = true
			lastLimit = limit
		}
	}

	if !sawUpdate {
		t.Fatal("expected at least one limit update after MinSamples")
	}

	// Soft limit should be ~90% of the stable allocation.
	expected := uint64(float64(alloc) * 0.90)
	// Manual abs of the signed difference (both operands are uint64).
	delta := int64(lastLimit) - int64(expected)
	if delta < 0 {
		delta = -delta
	}
	// Allow 1% tolerance.
	if delta > int64(expected)/100 {
		t.Fatalf("soft limit %d too far from expected %d (delta %d)", lastLimit, expected, delta)
	}
}
+
// TestElasticRecalibrationOnGrowth drives the monitor through a stable
// baseline, a breach-inducing spike, and a sustained higher level, and
// expects the committed soft limit to grow past the baseline limit.
func TestElasticRecalibrationOnGrowth(t *testing.T) {
	config := memory.DefaultMemoryLimitConfig()
	config.BreachWindowSize = 5
	config.BreachThreshold = 3
	m := memory.NewMemoryLimitStats(config)

	// Phase 1: stable at 100 MiB — establish a limit.
	for i := 0; i < config.MinSamples+20; i++ {
		m.Record(100 * 1024 * 1024)
	}
	limitBefore := m.SoftMemoryLimit()
	if limitBefore == 0 {
		t.Fatal("expected a non-zero limit after phase 1")
	}

	// Phase 2: spike to 500 MiB repeatedly — should trigger recalibration.
	for i := 0; i < config.BreachThreshold+1; i++ {
		m.Record(500 * 1024 * 1024)
	}

	// Phase 3: feed enough 800 MiB samples at the new level to re-commit.
	var recalibrated bool
	for i := 0; i < config.MinSamples+20; i++ {
		limit, updated := m.Record(800 * 1024 * 1024)
		if updated && limit > limitBefore {
			recalibrated = true
			break
		}
	}
	if !recalibrated {
		t.Fatal("expected the soft limit to grow after sustained high usage")
	}
}
+
+func TestReset(t *testing.T) {
+	config := memory.DefaultMemoryLimitConfig()
+	m := memory.NewMemoryLimitStats(config)
+
+	for i := 0; i < config.MinSamples+5; i++ {
+		m.Record(128 * 1024 * 1024)
+	}
+	if m.SoftMemoryLimit() == 0 {
+		t.Fatal("expected non-zero soft limit before reset")
+	}
+
+	m.Reset()
+
+	if m.SoftMemoryLimit() != 0 {
+		t.Fatal("expected zero soft limit after reset")
+	}
+	if m.TotalSamples() != 0 {
+		t.Fatal("expected zero sample count after reset")
+	}
+}
+
// TestNoisyInputStability feeds a seeded (deterministic) ±10% noisy signal
// around a 256 MiB base and asserts the final committed limit stays within a
// sane band of the base allocation.
func TestNoisyInputStability(t *testing.T) {
	config := memory.DefaultMemoryLimitConfig()
	m := memory.NewMemoryLimitStats(config)

	rng := rand.New(rand.NewSource(42))
	base := float64(256 * 1024 * 1024) // 256 MiB

	var limits []uint64
	for i := 0; i < 200; i++ {
		// ±10% noise around base
		noise := (rng.Float64()*0.2 - 0.1) * base
		_, _ = m.Record(uint64(base + noise))
		if l := m.SoftMemoryLimit(); l > 0 {
			limits = append(limits, l)
		}
	}

	if len(limits) == 0 {
		t.Fatal("expected at least one committed limit")
	}

	// The final limit should be in a sensible range: 75–95% of base.
	last := limits[len(limits)-1]
	lo := uint64(base * 0.75)
	hi := uint64(base * 0.95)
	if last < lo || last > hi {
		t.Fatalf("final limit %d outside expected range [%d, %d]", last, lo, hi)
	}
}

+ 123 - 0
core/pkg/util/monitor/memorylimiter.go

@@ -0,0 +1,123 @@
+package monitor
+
+import (
+	"fmt"
+	"math"
+	"runtime"
+	"runtime/debug"
+	"sync"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/core/pkg/util/atomic"
+	"github.com/opencost/opencost/core/pkg/util/monitor/memory"
+)
+
var (
	// once guards one-time construction of the package singleton.
	once sync.Once
	// memoryLimiter is the process-wide limiter returned by getMemoryLimiter.
	memoryLimiter *MemoryLimiter
)
+
// MemoryLimiter is a heap usage monitor for the go runtime which will attempt to
// dynamically set a GOMEMLIMIT value to best fit the heap usage. It will only
// adjust the GOMEMLIMIT if the usage analysis results in an increase, and won't
// try to "best fit" the current usage. It takes into account the initial GOMEMLIMIT
// value as the baseline.
type MemoryLimiter struct {
	// runState tracks the start/stop lifecycle of the sampling goroutine.
	runState atomic.AtomicRunState
	// monitor maintains the rolling statistics that propose the soft limit.
	monitor *memory.MemoryLimitStats
}
+
// Start begins collecting heap allocation samples for automatically adjusting the go soft memory limit
// for heap usage. It returns an error if the limiter is already running; the
// sampling loop runs in a background goroutine until Stop is called.
func (ml *MemoryLimiter) Start(interval time.Duration) error {
	// Wait for any prior run to fully wind down, then claim the run state.
	ml.runState.WaitForReset()

	if !ml.runState.Start() {
		return fmt.Errorf("memory limiter was already started")
	}

	// main limiter driver
	go func() {
		var memStats runtime.MemStats
		var prevLimit uint64

		// determine if mem limit was set prior by passing a negative
		// value to SetMemoryLimit, which will return the current value
		// without making any changes -- the default is MaxInt64
		goMemLimit := debug.SetMemoryLimit(-1)
		if goMemLimit == math.MaxInt64 {
			prevLimit = 0
		} else {
			prevLimit = uint64(goMemLimit)
		}

		// take initial heap measurement
		runtime.ReadMemStats(&memStats)
		ml.monitor.Record(memStats.HeapAlloc)

		for {
			select {
			case <-ml.runState.OnStop():
				ml.runState.Reset()
				return

			// NOTE(review): time.After allocates a new timer each
			// iteration; acceptable at second-scale intervals, but a
			// reusable Timer would avoid the churn if interval shrinks.
			case <-time.After(interval):
			}

			// in the event that someone updates the limit outside of this monitor
			// we want to make sure that we synchronize the correct value
			goMemLimit = debug.SetMemoryLimit(-1)
			if goMemLimit != math.MaxInt64 && goMemLimit != int64(prevLimit) {
				prevLimit = uint64(goMemLimit)
			}

			// record and determine if we should update the memory limit
			runtime.ReadMemStats(&memStats)
			if softLimit, updated := ml.monitor.Record(memStats.HeapAlloc); updated {
				// we only allow the limit to increase for now, as this best reflects a
				// max stable set of samples. Worth observation and potentially updating
				// in the future
				if softLimit != 0 && softLimit > prevLimit {
					prevLimit = softLimit
					log.Debugf("Updating Go Memory Limit: %dmb", int64(softLimit/1024.0/1024.0))
					debug.SetMemoryLimit(int64(softLimit))
				}
			}
		}
	}()

	return nil
}
+
// Stop halts the automatic memory limit adjustment. It returns an error when
// the limiter is already stopping or stopped.
func (ml *MemoryLimiter) Stop() error {
	if !ml.runState.Stop() {
		return fmt.Errorf("could not stop memory limiter - in the state of stopping or already stopped")
	}
	return nil
}
+
+// returns the singleton instance of the memory limiter
+func getMemoryLimiter() *MemoryLimiter {
+	once.Do(func() {
+		config := memory.DefaultMemoryLimitConfig()
+		memoryLimiter = &MemoryLimiter{
+			monitor: memory.NewMemoryLimitStats(config),
+		}
+	})
+
+	return memoryLimiter
+}
+
// DefaultMemoryLimiterSampleInterval is the sample interval in which the auto limiter
// gathers heap usage.
const DefaultMemoryLimiterSampleInterval = time.Second

// StartMemoryLimiter starts the singleton memory limiter using the default
// sample interval.
func StartMemoryLimiter() error {
	return getMemoryLimiter().Start(DefaultMemoryLimiterSampleInterval)
}

// StopMemoryLimiter stops the singleton memory limiter.
func StopMemoryLimiter() error {
	return getMemoryLimiter().Stop()
}

+ 160 - 0
core/pkg/util/stringutil/lrubank.go

@@ -0,0 +1,160 @@
+package stringutil
+
+import (
+	"container/heap"
+	"sync"
+	"time"
+)
+
// lruEntry pairs an interned string with the last time (unix millis) it was
// accessed.
type lruEntry struct {
	value string
	used  int64
}

// maxHeap implements heap.Interface over lruEntry with the most recently
// used entry at the root (newer = "larger").
type maxHeap []*lruEntry

func (h maxHeap) Len() int           { return len(h) }
func (h maxHeap) Less(i, j int) bool { return h[i].used > h[j].used } // newer = "larger"
func (h maxHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

// Push appends x; part of heap.Interface.
func (h *maxHeap) Push(x any) {
	*h = append(*h, x.(*lruEntry))
}

// Pop removes and returns the last element; part of heap.Interface.
func (h *maxHeap) Pop() any {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

// nOldest returns the n entries with the smallest `used` timestamps using a
// bounded max-heap (O(len(arr) * log n)). The returned slice is unordered.
// When n >= len(arr) the input slice itself is returned. The input slice is
// never reordered.
func nOldest(arr []*lruEntry, n int) []*lruEntry {
	if n <= 0 {
		return []*lruEntry{}
	}

	if n >= len(arr) {
		return arr
	}

	// BUG FIX: the previous implementation aliased arr[:n] directly, so
	// heap.Init/heap.Fix reordered the caller's slice as a side effect.
	// Copy the first n entries into a fresh heap instead.
	h := make(maxHeap, n)
	copy(h, arr[:n])
	heap.Init(&h)

	for _, entry := range arr[n:] {
		// The root is the newest of the current candidates; replace it
		// whenever an older entry is found, then re-heapify.
		if entry.used < h[0].used {
			h[0] = entry
			heap.Fix(&h, 0)
		}
	}

	return []*lruEntry(h)
}
+
// lruStringBank is a StringBank with a bounded capacity: a background
// goroutine (plus an opportunistic inline pass in LoadOrStore*) evicts the
// least recently used entries once the map exceeds capacity.
type lruStringBank struct {
	lock     sync.Mutex           // guards m and stop
	stop     chan struct{}        // closed by Stop() to halt the eviction goroutine
	m        map[string]*lruEntry // cached entries keyed by interned string
	capacity int                  // target number of retained entries
}
+
// NewLruStringBank creates a StringBank bounded to roughly `capacity`
// entries. A background goroutine runs eviction every evictionInterval;
// call Stop() on the concrete *lruStringBank (via type assertion) to
// terminate that goroutine, otherwise it lives for the process lifetime.
func NewLruStringBank(capacity int, evictionInterval time.Duration) StringBank {
	stop := make(chan struct{})
	bank := &lruStringBank{
		stop:     stop,
		m:        make(map[string]*lruEntry),
		capacity: capacity,
	}

	go func() {
		for {
			select {
			case <-stop:
				return
			case <-time.After(evictionInterval):
			}

			// need to take the lock during eviction
			bank.lock.Lock()
			evict(bank, capacity)
			bank.lock.Unlock()
		}
	}()

	return bank
}
+
// evict trims bank.m down to capacity by deleting the least recently used
// entries. The caller must already hold bank.lock.
//
// NOTE(review): deletion is keyed by old.value, which assumes every map key
// is content-equal to its entry's value (true for string interning where
// key == value); an entry stored under a differing key would never be
// deleted — confirm if usage ever changes.
func evict(bank *lruStringBank, capacity int) {
	if len(bank.m) <= capacity {
		return
	}

	// we collect a list of all lru entries so we can max heap the first n elements
	arr := make([]*lruEntry, 0, len(bank.m))
	for _, v := range bank.m {
		arr = append(arr, v)
	}

	oldest := nOldest(arr, len(bank.m)-capacity)
	for _, old := range oldest {
		delete(bank.m, old.value)
	}
}
+
+func (sb *lruStringBank) Stop() {
+	sb.lock.Lock()
+	defer sb.lock.Unlock()
+
+	if sb.stop != nil {
+		close(sb.stop)
+		sb.stop = nil
+	}
+}
+
// LoadOrStore returns the cached string for key (refreshing its recency) or
// stores value and returns it. The boolean reports whether the value was
// already present.
//
// NOTE(review): the entry is stored under `key` while evict() deletes by the
// entry's `value`; this is only consistent when key and value are
// content-equal (the interning use case) — confirm before passing differing
// key/value pairs.
func (sb *lruStringBank) LoadOrStore(key, value string) (string, bool) {
	sb.lock.Lock()

	if v, ok := sb.m[key]; ok {
		v.used = time.Now().UnixMilli()
		sb.lock.Unlock()
		return v.value, ok
	}

	sb.m[key] = &lruEntry{
		value: value,
		used:  time.Now().UnixMilli(),
	}
	// Opportunistic inline eviction once the map exceeds 1.5x capacity so
	// growth stays bounded between background eviction ticks.
	if len(sb.m) > (sb.capacity + (sb.capacity / 2)) {
		evict(sb, sb.capacity)
	}
	sb.lock.Unlock()
	return value, false
}
+
// LoadOrStoreFunc returns the cached string for key (refreshing its
// recency), or invokes f to produce the value to store on a miss. f is only
// called on a miss.
//
// NOTE(review): the new entry is stored under `value` (the string returned
// by f) while lookups use `key`; future hits therefore rely on f() returning
// a string content-equal to key — confirm for any non-interning caller.
func (sb *lruStringBank) LoadOrStoreFunc(key string, f func() string) (string, bool) {
	sb.lock.Lock()

	if v, ok := sb.m[key]; ok {
		v.used = time.Now().UnixMilli()
		sb.lock.Unlock()
		return v.value, ok
	}

	// create the key and value using the func (the key could be deallocated later)
	value := f()
	sb.m[value] = &lruEntry{
		value: value,
		used:  time.Now().UnixMilli(),
	}
	// Opportunistic inline eviction once the map exceeds 1.5x capacity.
	if len(sb.m) > (sb.capacity + (sb.capacity / 2)) {
		evict(sb, sb.capacity)
	}
	sb.lock.Unlock()
	return value, false
}
+
+func (sb *lruStringBank) Clear() {
+	sb.lock.Lock()
+	sb.m = make(map[string]*lruEntry)
+	sb.lock.Unlock()
+}

+ 408 - 0
core/pkg/util/stringutil/lrubank_test.go

@@ -0,0 +1,408 @@
+package stringutil
+
+import (
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+)
+
// TestBasicLruEvict stores three entries, refreshes "foo"'s recency, then
// adds a fourth and waits for the background eviction tick to remove the
// least recently used entry ("bar").
//
// NOTE: this test depends on wall-clock sleeps relative to the 2s eviction
// interval, so it is sensitive to scheduler delays.
func TestBasicLruEvict(t *testing.T) {
	lruBank := NewLruStringBank(3, 2*time.Second).(*lruStringBank)
	defer lruBank.Stop()

	lruBank.LoadOrStore("foo", "foo")
	time.Sleep(500 * time.Millisecond)
	lruBank.LoadOrStore("bar", "bar")
	time.Sleep(500 * time.Millisecond)
	lruBank.LoadOrStore("whaz", "whaz")
	time.Sleep(500 * time.Millisecond)
	// access foo, updating recency
	lruBank.LoadOrStore("foo", "foo")
	// should push bar out after eviction runs
	lruBank.LoadOrStore("test", "test")
	time.Sleep(time.Second)

	// Inspect the map under the lock; only "bar" must be gone.
	lruBank.lock.Lock()
	for _, v := range lruBank.m {
		t.Logf("Value: %s\n", v.value)
		if v.value == "bar" {
			t.Errorf("The 'bar' entry should've been replaced by 'test'")
		}
	}
	lruBank.lock.Unlock()
}
+
+// ---------------------------------------------------------------------------
+// LoadOrStore
+// ---------------------------------------------------------------------------
+
+// A stored value must be retrievable and LoadOrStore must signal the hit/miss
+// correctly via the boolean return.
+func TestLoadOrStore_MissAndHit(t *testing.T) {
+	bank := NewLruStringBank(10, time.Minute).(*lruStringBank)
+	defer bank.Stop()
+
+	v, loaded := bank.LoadOrStore("hello", "hello")
+	if loaded {
+		t.Errorf("first LoadOrStore: expected loaded=false, got true")
+	}
+	if v != "hello" {
+		t.Errorf("first LoadOrStore: expected value %q, got %q", "hello", v)
+	}
+
+	v, loaded = bank.LoadOrStore("hello", "world")
+	if !loaded {
+		t.Errorf("second LoadOrStore: expected loaded=true, got false")
+	}
+	// The original value must be returned on a hit, not the new candidate.
+	if v != "hello" {
+		t.Errorf("second LoadOrStore: expected cached value %q, got %q", "hello", v)
+	}
+}
+
// Hitting an existing entry must update its recency so it is not evicted ahead
// of entries that were never touched again.
//
// NOTE: relies on wall-clock sleeps relative to the 500ms eviction interval.
func TestLoadOrStore_HitUpdateRecency(t *testing.T) {
	bank := NewLruStringBank(2, 500*time.Millisecond).(*lruStringBank)
	defer bank.Stop()

	bank.LoadOrStore("old", "old")
	time.Sleep(100 * time.Millisecond)
	bank.LoadOrStore("keep", "keep")
	time.Sleep(100 * time.Millisecond)

	// Re-touch "old" so it becomes the most-recently-used.
	bank.LoadOrStore("old", "old")
	time.Sleep(100 * time.Millisecond)

	// Adding a third entry exceeds capacity; "keep" should be the oldest now.
	bank.LoadOrStore("new", "new")

	// Wait for the eviction goroutine.
	time.Sleep(600 * time.Millisecond)

	bank.lock.Lock()
	defer bank.lock.Unlock()

	if _, ok := bank.m["keep"]; ok {
		t.Error("expected 'keep' to be evicted but it is still present")
	}
	if _, ok := bank.m["old"]; !ok {
		t.Error("expected 'old' to survive eviction after its recency was refreshed")
	}
}
+
+// ---------------------------------------------------------------------------
+// LoadOrStoreFunc
+// ---------------------------------------------------------------------------
+
+// The factory function must only be called on a cache miss, not on a hit.
+func TestLoadOrStoreFunc_FactoryCalledOnMissOnly(t *testing.T) {
+	bank := NewLruStringBank(10, time.Minute).(*lruStringBank)
+	defer bank.Stop()
+	calls := 0
+
+	factory := func() string {
+		calls++
+		return "k"
+	}
+
+	bank.LoadOrStoreFunc("k", factory)
+	bank.LoadOrStoreFunc("k", factory)
+
+	if calls != 1 {
+		t.Errorf("factory should be called exactly once, got %d calls", calls)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Capacity / eviction
+// ---------------------------------------------------------------------------
+
// If the bank never exceeds capacity, nothing should be evicted.
func TestEviction_BelowCapacityNoEviction(t *testing.T) {
	const capacity = 5
	bank := NewLruStringBank(capacity, 200*time.Millisecond).(*lruStringBank)
	defer bank.Stop()

	// Fill to exactly capacity — never over it.
	for i := 0; i < capacity; i++ {
		bank.LoadOrStore(fmt.Sprintf("v%d", i), fmt.Sprintf("v%d", i))
	}

	// Wait several eviction cycles.
	time.Sleep(600 * time.Millisecond)

	bank.lock.Lock()
	defer bank.lock.Unlock()

	// Every entry must still be present after the eviction passes.
	if got := len(bank.m); got != capacity {
		t.Errorf("expected %d entries, got %d", capacity, got)
	}
}
+
// After eviction the map must be trimmed to at most capacity entries.
func TestEviction_ExceedCapacityTrimsToCapacity(t *testing.T) {
	const capacity = 3
	bank := NewLruStringBank(capacity, 350*time.Millisecond).(*lruStringBank)
	defer bank.Stop()

	// Exceed capacity with distinct recency timestamps per entry.
	for i := 0; i < capacity+3; i++ {
		bank.LoadOrStore(fmt.Sprintf("v%d", i), fmt.Sprintf("v%d", i))
		time.Sleep(20 * time.Millisecond) // ensure distinct timestamps
	}

	// Wait for eviction.
	time.Sleep(500 * time.Millisecond)

	bank.lock.Lock()
	defer bank.lock.Unlock()

	if got := len(bank.m); got > capacity {
		t.Errorf("expected at most %d entries after eviction, got %d", capacity, got)
	}
}
+
// The most-recently-used entries must survive eviction.
//
// NOTE: relies on wall-clock sleeps relative to the 300ms eviction interval.
func TestEviction_MRUSurvives(t *testing.T) {
	const capacity = 2
	bank := NewLruStringBank(capacity, 300*time.Millisecond).(*lruStringBank)
	defer bank.Stop()

	bank.LoadOrStore("evict1", "evict1")
	time.Sleep(50 * time.Millisecond)
	bank.LoadOrStore("evict2", "evict2")
	time.Sleep(50 * time.Millisecond)

	// These two are the most recent; they must survive.
	bank.LoadOrStore("keep1", "keep1")
	time.Sleep(50 * time.Millisecond)
	bank.LoadOrStore("keep2", "keep2")

	time.Sleep(500 * time.Millisecond)

	bank.lock.Lock()
	defer bank.lock.Unlock()

	for _, must := range []string{"keep1", "keep2"} {
		if _, ok := bank.m[must]; !ok {
			t.Errorf("expected %q to survive eviction", must)
		}
	}
}
+
+// ---------------------------------------------------------------------------
+// Clear
+// ---------------------------------------------------------------------------
+
+func TestClear_EmptiesMap(t *testing.T) {
+	bank := NewLruStringBank(10, time.Minute).(*lruStringBank)
+	defer bank.Stop()
+
+	for i := 0; i < 5; i++ {
+		bank.LoadOrStore(fmt.Sprintf("v%d", i), fmt.Sprintf("v%d", i))
+	}
+
+	bank.Clear()
+
+	bank.lock.Lock()
+	defer bank.lock.Unlock()
+
+	if len(bank.m) != 0 {
+		t.Errorf("expected empty map after Clear, got %d entries", len(bank.m))
+	}
+}
+
+// After a Clear, previously stored keys must not be found.
+func TestClear_PreviousKeysGone(t *testing.T) {
+	bank := NewLruStringBank(10, time.Minute).(*lruStringBank)
+	defer bank.Stop()
+
+	bank.LoadOrStore("hello", "world")
+	bank.Clear()
+
+	_, loaded := bank.LoadOrStore("hello", "new")
+	if loaded {
+		t.Error("expected key to be absent after Clear, but it was found")
+	}
+}
+
+// ---------------------------------------------------------------------------
+// nOldest helper
+// ---------------------------------------------------------------------------
+
// TestNOldest_ReturnsCorrectCount asserts that nOldest picks exactly the n
// entries with the smallest `used` timestamps.
func TestNOldest_ReturnsCorrectCount(t *testing.T) {
	now := time.Now()
	entries := []*lruEntry{
		{value: "a", used: now.Add(-4 * time.Second).UnixMilli()},
		{value: "b", used: now.Add(-3 * time.Second).UnixMilli()},
		{value: "c", used: now.Add(-2 * time.Second).UnixMilli()},
		{value: "d", used: now.Add(-1 * time.Second).UnixMilli()},
		{value: "e", used: now.UnixMilli()},
	}

	oldest := nOldest(entries, 2)
	if len(oldest) != 2 {
		t.Fatalf("expected 2 oldest entries, got %d", len(oldest))
	}

	// The result is unordered, so compare as a set.
	values := map[string]bool{}
	for _, e := range oldest {
		values[e.value] = true
	}
	for _, must := range []string{"a", "b"} {
		if !values[must] {
			t.Errorf("expected %q in oldest set, got %v", must, values)
		}
	}
}

// TestNOldest_NGreaterThanLen asserts the whole input is returned when n
// exceeds the number of entries.
func TestNOldest_NGreaterThanLen(t *testing.T) {
	now := time.Now()
	entries := []*lruEntry{
		{value: "x", used: now.UnixMilli()},
		{value: "y", used: now.Add(-time.Second).UnixMilli()},
	}

	result := nOldest(entries, 10)
	if len(result) != 2 {
		t.Errorf("expected all %d entries when n >= len, got %d", 2, len(result))
	}
}

// TestNOldest_NEqualsLen asserts the whole input is returned when n equals
// the number of entries.
func TestNOldest_NEqualsLen(t *testing.T) {
	now := time.Now()
	entries := []*lruEntry{
		{value: "x", used: now.UnixMilli()},
		{value: "y", used: now.Add(-time.Second).UnixMilli()},
	}

	result := nOldest(entries, 2)
	if len(result) != 2 {
		t.Errorf("expected 2 entries when n == len, got %d", len(result))
	}
}

// TestNOldest_NIsZero asserts that a non-positive n yields an empty result.
func TestNOldest_NIsZero(t *testing.T) {
	now := time.Now()
	entries := []*lruEntry{
		{value: "x", used: now.UnixMilli()},
	}

	result := nOldest(entries, 0)
	if len(result) != 0 {
		t.Errorf("expected 0 entries when n=0, got %d", len(result))
	}
}
+
+// ---------------------------------------------------------------------------
+// Concurrency
+// ---------------------------------------------------------------------------
+
+// Concurrent LoadOrStore calls must not race or panic.
+func TestConcurrentLoadOrStore(t *testing.T) {
+	bank := NewLruStringBank(50, 100*time.Millisecond).(*lruStringBank)
+	defer bank.Stop()
+
+	const goroutines = 20
+	const opsEach = 100
+
+	var wg sync.WaitGroup
+	for i := 0; i < goroutines; i++ {
+		g := i
+		wg.Go(func() {
+			for i := 0; i < opsEach; i++ {
+				key := fmt.Sprintf("k%d", (g*opsEach+i)%30)
+				bank.LoadOrStore(key, key)
+			}
+		})
+	}
+
+	waiter := func() chan struct{} {
+		st := make(chan struct{})
+
+		go func() {
+			wg.Wait()
+			close(st)
+		}()
+
+		return st
+	}
+
+	select {
+	case <-waiter():
+		t.Logf("Completed Successfully\n")
+	case <-time.After(10 * time.Second):
+		t.Logf("Timed out\n")
+	}
+}
+
+// Concurrent calls interleaved with eviction cycles must not deadlock or race.
+func TestConcurrentLoadOrStoreWithEviction(t *testing.T) {
+	bank := NewLruStringBank(5, 50*time.Millisecond).(*lruStringBank)
+	defer bank.Stop()
+
+	const goroutines = 10
+	const duration = 300 * time.Millisecond
+
+	var wg sync.WaitGroup
+
+	for i := 0; i < goroutines; i++ {
+		g := i
+		stop := time.After(duration)
+
+		wg.Go(func() {
+			for {
+				select {
+				case <-stop:
+					return
+				default:
+					key := fmt.Sprintf("g%d", g)
+					bank.LoadOrStore(key, key)
+				}
+			}
+		})
+	}
+
+	waiter := func() chan struct{} {
+		st := make(chan struct{})
+
+		go func() {
+			wg.Wait()
+			close(st)
+		}()
+
+		return st
+	}
+
+	select {
+	case <-waiter():
+		t.Logf("Completed Successfully\n")
+	case <-time.After(10 * time.Second):
+		t.Logf("Timed out\n")
+	}
+}
+
+// Concurrent Clear calls alongside reads/writes must not panic.
+func TestConcurrentClear(t *testing.T) {
+	bank := NewLruStringBank(10, time.Minute).(*lruStringBank)
+	defer bank.Stop()
+
+	var wg sync.WaitGroup
+	for i := 0; i < 5; i++ {
+		wg.Add(1)
+		go func(i int) {
+			defer wg.Done()
+			bank.LoadOrStore(fmt.Sprintf("k%d", i), "v")
+		}(i)
+	}
+	for i := 0; i < 3; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			bank.Clear()
+		}()
+	}
+
+	wg.Wait()
+}

+ 48 - 0
core/pkg/util/stringutil/mapbank.go

@@ -0,0 +1,48 @@
+package stringutil
+
+import "sync"
+
// stringBank is an unbounded, mutex-guarded string interning map; it is the
// default StringBank implementation. Entries live until Clear is called.
type stringBank struct {
	lock sync.Mutex
	m    map[string]string
}
+
// NewStringBank returns an unbounded, thread-safe StringBank backed by a
// plain map.
func NewStringBank() StringBank {
	return &stringBank{
		m: make(map[string]string),
	}
}
+
// LoadOrStore returns the previously interned string for key, or stores
// value and returns it. The boolean reports whether the value was already
// present.
//
// NOTE(review): on a miss the entry is stored under `value` (the retained
// string) rather than `key`; a future lookup only hits when key and value
// are content-equal, which holds for the interning use case — confirm for
// any other callers.
func (sb *stringBank) LoadOrStore(key, value string) (string, bool) {
	sb.lock.Lock()

	if v, ok := sb.m[key]; ok {
		sb.lock.Unlock()
		return v, ok
	}

	sb.m[value] = value
	sb.lock.Unlock()
	return value, false
}
+
// LoadOrStoreFunc returns the interned string for key, or invokes f to
// produce the value to store on a miss. f is only called on a miss.
//
// The entry is stored under the freshly produced value rather than key so
// the bank does not pin the caller's key allocation; this assumes f()
// returns a string content-equal to key.
func (sb *stringBank) LoadOrStoreFunc(key string, f func() string) (string, bool) {
	sb.lock.Lock()

	if v, ok := sb.m[key]; ok {
		sb.lock.Unlock()
		return v, ok
	}

	// create the key and value using the func (the key could be deallocated later)
	value := f()
	sb.m[value] = value
	sb.lock.Unlock()
	return value, false
}
+
+func (sb *stringBank) Clear() {
+	sb.lock.Lock()
+	sb.m = make(map[string]string)
+	sb.lock.Unlock()
+}

+ 17 - 0
core/pkg/util/stringutil/noopbank.go

@@ -0,0 +1,17 @@
+package stringutil
+
// noOpStringBank is a StringBank that performs no caching: values pass
// straight through and nothing is retained.
type noOpStringBank struct{}

// NewNoOpStringBank returns a StringBank that never stores anything; useful
// for disabling string interning entirely.
func NewNoOpStringBank() StringBank {
	return new(noOpStringBank)
}

// LoadOrStore returns value unchanged.
//
// NOTE(review): the boolean is hard-coded to true ("already present") even
// though nothing was stored — callers must not rely on it meaning a real
// cache hit.
func (nsb *noOpStringBank) LoadOrStore(key, value string) (string, bool) {
	return value, true
}

// LoadOrStoreFunc always invokes f and returns its result.
func (nsb *noOpStringBank) LoadOrStoreFunc(key string, f func() string) (string, bool) {
	return f(), true
}

// Clear is a no-op.
func (nsb *noOpStringBank) Clear() {}

+ 27 - 44
core/pkg/util/stringutil/stringutil.go

@@ -23,75 +23,58 @@ const (
 var alpha = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 var alphanumeric = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
 
-type stringBank struct {
-	lock sync.Mutex
-	m    map[string]string
+type StringBank interface {
+	LoadOrStore(key, value string) (string, bool)
+	LoadOrStoreFunc(key string, f func() string) (string, bool)
+	Clear()
 }
 
-func newStringBank() *stringBank {
-	return &stringBank{
-		m: make(map[string]string),
-	}
-}
+var (
+	lock sync.RWMutex
 
-func (sb *stringBank) LoadOrStore(key, value string) (string, bool) {
-	sb.lock.Lock()
-
-	if v, ok := sb.m[key]; ok {
-		sb.lock.Unlock()
-		return v, ok
-	}
+	// strings is the active StringBank implementation — by default an unbounded,
+	// thread-safe string cache, useful when handling a high frequency of
+	// dynamically allocated duplicate strings.
+	strings StringBank = NewStringBank()
+)
 
-	sb.m[key] = value
-	sb.lock.Unlock()
-	return value, false
+func init() {
+	rand.Seed(time.Now().UnixNano())
 }
 
-func (sb *stringBank) LoadOrStoreFunc(key string, f func() string) (string, bool) {
-	sb.lock.Lock()
+func UpdateStringBank(sb StringBank) {
+	lock.Lock()
+	defer lock.Unlock()
 
-	if v, ok := sb.m[key]; ok {
-		sb.lock.Unlock()
-		return v, ok
-	}
-
-	// create the key and value using the func (the key could be deallocated later)
-	value := f()
-	sb.m[value] = value
-	sb.lock.Unlock()
-	return value, false
-}
-
-func (sb *stringBank) Clear() {
-	sb.lock.Lock()
-	sb.m = make(map[string]string)
-	sb.lock.Unlock()
+	strings.Clear()
+	strings = sb
 }
 
-// stringBank is an unbounded string cache that is thread-safe. It is especially useful if
-// storing a large frequency of dynamically allocated duplicate strings.
-var strings = newStringBank() // sync.Map
+// GetStringBank returns the _current_ StringBank implementation. Note that the read-lock is
+// not held for the duration of usage, so the returned string bank could be swapped out
+// after being retrieved.
+func GetStringBank() StringBank {
+	lock.RLock()
+	defer lock.RUnlock()
 
-func init() {
-	rand.Seed(time.Now().UnixNano())
+	return strings
 }
 
 // Bank will return a non-copy of a string if it has been used before. Otherwise, it will store
 // the string as the unique instance.
 func Bank(s string) string {
-	ss, _ := strings.LoadOrStore(s, s)
+	ss, _ := GetStringBank().LoadOrStore(s, s)
 	return ss
 }
 
 // BankFunc will use the provided s string to check for an existing allocation of the string. However,
 // if no allocation exists, the f parameter will be used to create the string and store in the bank.
 func BankFunc(s string, f func() string) string {
-	ss, _ := strings.LoadOrStoreFunc(s, f)
+	ss, _ := GetStringBank().LoadOrStoreFunc(s, f)
 	return ss
 }
 
 func ClearBank() {
-	strings.Clear()
+	GetStringBank().Clear()
 }
 
 // RandSeq generates a pseudo-random alphabetic string of the given length

+ 105 - 4
core/pkg/util/stringutil/stringutil_test.go

@@ -6,6 +6,8 @@ import (
 	"strings"
 	"sync"
 	"testing"
+	"time"
+	"unsafe"
 
 	"github.com/opencost/opencost/core/pkg/util/stringutil"
 )
@@ -46,7 +48,7 @@ func copyString(s string) string {
 	return string([]byte(s))
 }
 
-func generateBenchData(totalStrings, totalUnique int) []string {
+func generateBenchData(totalStrings, totalUnique int) [][]byte {
 	randStrings := make([]string, 0, totalStrings)
 	r := rand.New(rand.NewSource(27644437))
 
@@ -69,7 +71,11 @@ func generateBenchData(totalStrings, totalUnique int) []string {
 	// shuffle the list of strings
 	r.Shuffle(totalStrings, func(i, j int) { randStrings[i], randStrings[j] = randStrings[j], randStrings[i] })
 
-	return randStrings
+	stringBytes := make([][]byte, 0, totalStrings)
+	for _, str := range randStrings {
+		stringBytes = append(stringBytes, []byte(str))
+	}
+	return stringBytes
 }
 
 func benchmarkStringBank(b *testing.B, bt bankTest, totalStrings, totalUnique int, useBankFunc bool) {
@@ -80,10 +86,16 @@ func benchmarkStringBank(b *testing.B, bt bankTest, totalStrings, totalUnique in
 		for i := 0; i < b.N; i++ {
 			b.StartTimer()
 			for bb := 0; bb < totalStrings; bb++ {
+				bytes := randStrings[bb]
+
 				if useBankFunc {
-					bt.BankFunc(randStrings[bb], func() string { return randStrings[bb] })
+					str := unsafe.String(unsafe.SliceData(bytes), len(bytes))
+
+					bt.BankFunc(str, func() string {
+						return string(bytes)
+					})
 				} else {
-					bt.Bank(randStrings[bb])
+					bt.Bank(string(bytes))
 				}
 			}
 			b.StopTimer()
@@ -153,3 +165,92 @@ func BenchmarkStringBankFunc25PercentDuplicate(b *testing.B) {
 func BenchmarkStringBankFuncNoDuplicate(b *testing.B) {
 	benchmarkStringBank(b, standardBankTest, 1_000_000, 1_000_000, true)
 }
+
+const LruCapacity = 500_000
+const LruEvictInterval = 5 * time.Second
+
// BenchmarkLruStringBankFunc90PercentDuplicate exercises BankFunc against the
// LRU-backed bank with 1M strings drawn from 100k unique values (~90% duplicates).
func BenchmarkLruStringBankFunc90PercentDuplicate(b *testing.B) {
	// Swap the global bank for an LRU bank; restore the previous bank afterwards.
	prevBank := stringutil.GetStringBank()
	defer func() {
		stringutil.UpdateStringBank(prevBank)
	}()

	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
	defer func() {
		// Stop the LRU eviction goroutine so it does not leak into later benchmarks.
		if lruBank, ok := sb.(interface{ Stop() }); ok {
			lruBank.Stop()
		}

	}()

	stringutil.UpdateStringBank(sb)
	benchmarkStringBank(b, standardBankTest, 1_000_000, 100_000, true)
}
+
// BenchmarkLruStringBankFunc75PercentDuplicate exercises BankFunc against the
// LRU-backed bank with 1M strings drawn from 250k unique values (~75% duplicates).
func BenchmarkLruStringBankFunc75PercentDuplicate(b *testing.B) {
	// Swap the global bank for an LRU bank; restore the previous bank afterwards.
	prevBank := stringutil.GetStringBank()
	defer func() {
		stringutil.UpdateStringBank(prevBank)
	}()

	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
	defer func() {
		// Stop the LRU eviction goroutine so it does not leak into later benchmarks.
		if lruBank, ok := sb.(interface{ Stop() }); ok {
			lruBank.Stop()
		}
	}()

	stringutil.UpdateStringBank(sb)
	benchmarkStringBank(b, standardBankTest, 1_000_000, 250_000, true)
}
+
+func BenchmarkLruStringBankFunc50PercentDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
+	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
+	defer func() {
+		if lruBank, ok := sb.(interface{ Stop() }); ok {
+			lruBank.Stop()
+		}
+	}()
+
+	stringutil.UpdateStringBank(sb)
+	benchmarkStringBank(b, standardBankTest, 1_000_000, 100_000, true)
+}
+
// BenchmarkLruStringBankFunc25PercentDuplicate exercises BankFunc against the
// LRU-backed bank with 1M strings drawn from 750k unique values (~25% duplicates).
func BenchmarkLruStringBankFunc25PercentDuplicate(b *testing.B) {
	// Swap the global bank for an LRU bank; restore the previous bank afterwards.
	prevBank := stringutil.GetStringBank()
	defer func() {
		stringutil.UpdateStringBank(prevBank)
	}()

	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
	defer func() {
		// Stop the LRU eviction goroutine so it does not leak into later benchmarks.
		if lruBank, ok := sb.(interface{ Stop() }); ok {
			lruBank.Stop()
		}
	}()

	stringutil.UpdateStringBank(sb)
	benchmarkStringBank(b, standardBankTest, 1_000_000, 750_000, true)
}
+
// BenchmarkLruStringBankFuncNoDuplicate exercises BankFunc against the
// LRU-backed bank with 1M fully unique strings (worst case: no cache hits).
func BenchmarkLruStringBankFuncNoDuplicate(b *testing.B) {
	// Swap the global bank for an LRU bank; restore the previous bank afterwards.
	prevBank := stringutil.GetStringBank()
	defer func() {
		stringutil.UpdateStringBank(prevBank)
	}()

	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
	defer func() {
		// Stop the LRU eviction goroutine so it does not leak into later benchmarks.
		if lruBank, ok := sb.(interface{ Stop() }); ok {
			lruBank.Stop()
		}
	}()

	stringutil.UpdateStringBank(sb)
	benchmarkStringBank(b, standardBankTest, 1_000_000, 1_000_000, true)
}

+ 37 - 0
docs/ROADMAP.md

@@ -0,0 +1,37 @@
+# OpenCost Roadmap
+
+This roadmap reflects the current priorities for the OpenCost project. It is reviewed quarterly and discussed in the biweekly [Working Group meetings](https://zoom-lfx.platform.linuxfoundation.org/meetings/opencost?view=list).
+
+## Current Focus Areas
+
+- **Cloud cost integration:** Connecting cloud billing data to the demo environment, cloud cost bug fixes, and multi-account support
+- **UI revamp:** Major frontend overhaul via LFX mentorship — new UI released, stabilizing before next core release
+- **OpenCost AI:** New sub-project for airgapped private cost models (CI/CD, testing models at scale, finding smallest viable model)
+- **First-class LLM cost support:** Design proposal for native LLM cost tracking in OpenCost core
+- **Integration test expansion:** Pod restart tests, network cost tests, resolving Prometheus-less (promless) vs Prometheus-backed test discrepancies
+- **Plugin ecosystem:** Snowflake, GitHub, and currency conversion plugins proposed; MongoDB reference implementation for currency support
+- **Helm chart signing:** Cryptographic signing of Helm charts (research in progress)
+- **Data persistence and export:** Mounting persistence for promless mode, potential S3 export for cost data
+- **Supply chain security:** Achieving [OpenSSF Best Practices](https://www.bestpractices.dev/projects/6219) Silver and Gold badges, cryptographically signed releases via Sigstore/cosign, SLSA build provenance, and SPDX license compliance across all source files
+- **Community growth:** EMEA/APAC meeting cadence, YouTube channel for meeting recordings, DigitalOcean cloud sponsorship for testing
+
+## Recent Milestones
+
+- New OpenCost UI released (v1.0 via LFX mentorship)
+- OpenCost AI sub-project introduced (first PR merged)
+- MCP server released in v1.118 with right-sizing recommendations
+- KubeModel 1.0 shipped (Fall 2025 LFX mentorship)
+- SBOM generation integrated across core and UI repos (SPDX + CycloneDX)
+- OpenSSF Scorecard integration
+- Community Maintainer role introduced
+- Gateway API deployed for infrastructure
+- Spot node testing enabled in integration test cluster
+- Copilot AI review bot enabled across repositories (provided by CNCF)
+- OpenCost Specification v0.1 published
+- Collector data source shipped (alternative to Prometheus)
+
+## How to Influence the Roadmap
+
+- Join the [OpenCost Working Group](https://zoom-lfx.platform.linuxfoundation.org/meetings/opencost?view=week) (biweekly, alternating between EMEA/APAC at 15:00 UTC and NA at 21:00 UTC)
+- Propose changes via [GitHub Issues](https://github.com/opencost/opencost/issues)
+- Discuss ideas in the [#opencost](https://cloud-native.slack.com/archives/C03D56FPD4G) channel on [CNCF Slack](https://slack.cncf.io/)

+ 1 - 1
go.mod

@@ -26,7 +26,7 @@ require (
 	github.com/aws/aws-sdk-go-v2/credentials v1.19.12
 	github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.22.6
 	github.com/aws/aws-sdk-go-v2/service/athena v1.57.1
-	github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0
+	github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0
 	github.com/aws/aws-sdk-go-v2/service/s3 v1.97.1
 	github.com/aws/aws-sdk-go-v2/service/sts v1.41.9
 	github.com/aws/smithy-go v1.24.2

+ 2 - 2
go.sum

@@ -114,8 +114,8 @@ github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.21 h1:SwGMTMLIlvDNyhMteQ6r8IJSBPl
 github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.21/go.mod h1:UUxgWxofmOdAMuqEsSppbDtGKLfR04HGsD0HXzvhI1k=
 github.com/aws/aws-sdk-go-v2/service/athena v1.57.1 h1:I+YmJvkeAN+r4mUlH2CODoYyAkoP0cQWGItPPkK60hk=
 github.com/aws/aws-sdk-go-v2/service/athena v1.57.1/go.mod h1:yZ507NVXolOco9hA2+mKH3ELnLEOZ/4mGqkrp2phNYs=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0 h1:dgdIaG/GCiXMo16HAdFwpjt9Vn34bD2WVH5SiZdwzUc=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0/go.mod h1:2dMnUs1QzlGzsm46i9oBHAxVHQp7b6qF7PljWcgVEVE=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0 h1:98Miqj16un1WLNyM1RjVDhXYumhqZrQfAeG8i4jPG6o=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0/go.mod h1:T6ndRfdhnXLIY5oKBHjYZDVj706los2zGdpThppquvA=
 github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 h1:5EniKhLZe4xzL7a+fU3C2tfUN4nWIqlLesfrjkuPFTY=
 github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI=
 github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.12 h1:qtJZ70afD3ISKWnoX3xB0J2otEqu3LqicRcDBqsj0hQ=

+ 140 - 91
pkg/carbon/carbonassets.go

@@ -3,7 +3,6 @@ package carbon
 import (
 	"embed"
 	"encoding/csv"
-	"fmt"
 	"strconv"
 	"strings"
 
@@ -15,7 +14,11 @@ import (
 //go:embed carbonlookupdata.csv
 var f embed.FS
 
-type carbonLookupKeyDisk struct {
+// averageRegionKey is the fallback region label used in the lookup CSV when a
+// specific (provider, region, instanceType) tuple cannot be matched.
+const averageRegionKey = "average-region"
+
+type carbonLookupKeyRegion struct {
 	provider string
 	region   string
 }
@@ -26,15 +29,13 @@ type carbonLookupKeyNode struct {
 	instanceType string
 }
 
-var carbonLookupNode map[carbonLookupKeyNode]float64
-var carbonLookupDisk map[carbonLookupKeyDisk]float64
-
-// Opencost does not build network types
-var carbonValidInstanceTypes map[string]string
-var carbonValidRegions map[string]string
+var (
+	carbonLookupNode    map[carbonLookupKeyNode]float64
+	carbonLookupDisk    map[carbonLookupKeyRegion]float64
+	carbonLookupNetwork map[carbonLookupKeyRegion]float64
+)
 
 func init() {
-
 	carbonData, err := f.ReadFile("carbonlookupdata.csv")
 	if err != nil {
 		log.Errorf("Error getting content of carbon lookup file: %s", err)
@@ -43,129 +44,177 @@ func init() {
 
 	reader := csv.NewReader(strings.NewReader(string(carbonData)))
 
-	// skip header
-	_, err = reader.Read()
-	if err != nil {
-		log.Errorf("Error reading carbon lookup data: %s", err)
+	if _, err := reader.Read(); err != nil {
+		log.Errorf("Error reading carbon lookup header: %s", err)
 		return
 	}
 
-	dat, err := reader.ReadAll()
+	rows, err := reader.ReadAll()
 	if err != nil {
 		log.Errorf("Error reading carbon lookup data: %s", err)
 		return
 	}
 
 	carbonLookupNode = make(map[carbonLookupKeyNode]float64)
-	carbonLookupDisk = make(map[carbonLookupKeyDisk]float64)
-
-	carbonValidInstanceTypes = make(map[string]string)
-	carbonValidRegions = make(map[string]string)
-
-	for _, carbonItem := range dat {
+	carbonLookupDisk = make(map[carbonLookupKeyRegion]float64)
+	carbonLookupNetwork = make(map[carbonLookupKeyRegion]float64)
 
-		if coeff, err := strconv.ParseFloat(carbonItem[5], 64); err != nil {
-
-			panic(fmt.Errorf("error setting up carbon lookup table: malformed carbon cost '%s'", carbonItem[5]))
-
-		} else {
-
-			provider := carbonItem[0]
-			region := carbonItem[1]
-			instanceType := carbonItem[2]
-			assetType := carbonItem[3]
+	for _, row := range rows {
+		// Skip blank records (e.g. a trailing newline in the CSV).
+		if len(row) == 0 || (len(row) == 1 && strings.TrimSpace(row[0]) == "") {
+			continue
+		}
+		if len(row) < 6 {
+			log.Warnf("carbon: skipping malformed lookup row %v", row)
+			continue
+		}
 
-			switch assetType {
-			case "Node":
-				carbonLookupNode[carbonLookupKeyNode{
-					provider:     provider,
-					region:       region,
-					instanceType: instanceType,
-				}] = coeff
-			case "Disk":
-				carbonLookupDisk[carbonLookupKeyDisk{
-					provider: provider,
-					region:   region,
-				}] = coeff
-			}
+		coeff, err := strconv.ParseFloat(row[5], 64)
+		if err != nil {
+			log.Warnf("carbon: skipping row with malformed carbon coefficient %q", row[5])
+			continue
+		}
 
-			carbonValidInstanceTypes[instanceType] = provider
-			carbonValidRegions[region] = provider
+		provider := row[0]
+		region := row[1]
+		instanceType := row[2]
+		assetType := row[3]
 
+		switch assetType {
+		case "Node":
+			carbonLookupNode[carbonLookupKeyNode{
+				provider:     provider,
+				region:       region,
+				instanceType: instanceType,
+			}] = coeff
+		case "Disk":
+			carbonLookupDisk[carbonLookupKeyRegion{
+				provider: provider,
+				region:   region,
+			}] = coeff
+		case "Network":
+			carbonLookupNetwork[carbonLookupKeyRegion{
+				provider: provider,
+				region:   region,
+			}] = coeff
 		}
-
 	}
-
 }
 
 type CarbonRow struct {
 	Co2e float64 `json:"co2e"`
 }
 
+// RelateCarbonAssets returns an estimated CO2e value for each asset in the set.
+// The returned value is in metric tonnes of CO2e, consistent with the units of
+// the embedded lookup table (tonnes CO2e per hour of asset runtime).
 func RelateCarbonAssets(as *opencost.AssetSet) (map[string]CarbonRow, error) {
-
-	res := make(map[string]CarbonRow)
+	res := make(map[string]CarbonRow, len(as.Assets))
 
 	for key, asset := range as.Assets {
-
-		// If no valid region, default to per-provider calculated average
-		region, _ := util.GetRegion(asset.GetLabels())
-		if _, ok := carbonValidRegions[region]; !ok {
-			region = "average-region"
-		}
-
-		// If no valid instance type, also default to per-provider calculated average
-		instanceType, _ := util.GetInstanceType(asset.GetLabels())
-		if _, ok := carbonValidInstanceTypes[instanceType]; !ok {
-			region = "average-region"
+		coeff := lookupCarbonCoeff(asset)
+		res[key] = CarbonRow{
+			Co2e: coeff * asset.Minutes() / 60,
 		}
+	}
 
-		provider := getProviderFromProviderID(asset.GetProperties().ProviderID)
+	return res, nil
+}
 
-		// If we're not able to parse the provider id, try to fetch the provider from the carbon data
-		if provider == "" && region != "average-region" {
-			provider = carbonValidRegions[region]
-		} else {
-			if asset.Type() == opencost.NodeAssetType || asset.Type() == opencost.DiskAssetType {
-				log.DedupedErrorf(10, "Cannot infer region information for asset '%s'", asset.GetProperties().ProviderID)
+// lookupCarbonCoeff resolves the carbon coefficient (tonnes CO2e per hour) for
+// the given asset, falling back to the provider-wide average-region value when
+// a specific region or instance type is not present in the lookup table.
+func lookupCarbonCoeff(asset opencost.Asset) float64 {
+	props := asset.GetProperties()
+	provider := resolveProvider(asset)
+	if provider == "" {
+		if isCarbonTrackedAsset(asset.Type()) {
+			providerID := ""
+			if props != nil {
+				providerID = props.ProviderID
 			}
+			log.DedupedWarningf(10, "carbon: cannot infer provider for asset %q", providerID)
 		}
+		return 0
+	}
 
-		var carbonCoeff float64
-		switch asset.Type() {
-		case opencost.NodeAssetType:
-			carbonCoeff = carbonLookupNode[carbonLookupKeyNode{
-				provider:     provider,
-				region:       region,
-				instanceType: instanceType,
-			}]
-		case opencost.DiskAssetType:
-			carbonCoeff = carbonLookupDisk[carbonLookupKeyDisk{
-				provider: provider,
-				region:   region,
-			}]
-		}
+	region, _ := util.GetRegion(asset.GetLabels())
+	instanceType, _ := util.GetInstanceType(asset.GetLabels())
 
-		res[key] = CarbonRow{
-			Co2e: carbonCoeff * asset.Minutes() / 60,
+	switch asset.Type() {
+	case opencost.NodeAssetType:
+		if coeff, ok := carbonLookupNode[carbonLookupKeyNode{provider, region, instanceType}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupNode[carbonLookupKeyNode{provider, averageRegionKey, ""}]; ok {
+			log.DedupedWarningf(10, "carbon: falling back to average-region for node (provider=%s region=%q instanceType=%q)", provider, region, instanceType)
+			return coeff
+		}
+	case opencost.DiskAssetType:
+		if coeff, ok := carbonLookupDisk[carbonLookupKeyRegion{provider, region}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupDisk[carbonLookupKeyRegion{provider, averageRegionKey}]; ok {
+			log.DedupedWarningf(10, "carbon: falling back to average-region for disk (provider=%s region=%q)", provider, region)
+			return coeff
 		}
+	case opencost.NetworkAssetType:
+		if coeff, ok := carbonLookupNetwork[carbonLookupKeyRegion{provider, region}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupNetwork[carbonLookupKeyRegion{provider, averageRegionKey}]; ok {
+			return coeff
+		}
+	}
+	return 0
+}
 
+func isCarbonTrackedAsset(t opencost.AssetType) bool {
+	switch t {
+	case opencost.NodeAssetType, opencost.DiskAssetType, opencost.NetworkAssetType:
+		return true
 	}
+	return false
+}
 
-	return res, nil
+// resolveProvider returns the canonical provider name for an asset. It prefers
+// the canonical Provider property populated by the cost model, falling back to
+// parsing the cloud provider ID when the property is missing.
+func resolveProvider(asset opencost.Asset) string {
+	props := asset.GetProperties()
+	if props == nil {
+		return ""
+	}
 
+	switch props.Provider {
+	case opencost.AWSProvider, opencost.GCPProvider, opencost.AzureProvider:
+		return props.Provider
+	}
+
+	return inferProviderFromProviderID(props.ProviderID)
 }
 
-func getProviderFromProviderID(providerid string) string {
+// inferProviderFromProviderID is a best-effort fallback that matches the
+// conventional shapes of Kubernetes Node `spec.providerID` values for the
+// cloud providers present in the embedded lookup data (AWS, GCP, Azure).
+//
+// Real-world formats:
+//   - AWS:   aws:///<availability-zone>/<instance-id>  (or raw "i-…")
+//   - GCP:   gce://<project>/<zone>/<instance-name>
+//   - Azure: azure:///subscriptions/<sub>/resourceGroups/<rg>/…
+func inferProviderFromProviderID(providerID string) string {
+	id := strings.ToLower(strings.TrimSpace(providerID))
+	if id == "" {
+		return ""
+	}
 
-	if strings.HasPrefix(providerid, "gke") {
-		return opencost.GCPProvider
-	} else if strings.HasPrefix(providerid, "i-") {
+	switch {
+	case strings.HasPrefix(id, "aws:"), strings.HasPrefix(id, "i-"):
 		return opencost.AWSProvider
-	} else if strings.HasPrefix(providerid, "azure") {
+	case strings.HasPrefix(id, "gce:"), strings.HasPrefix(id, "gke"):
+		return opencost.GCPProvider
+	case strings.HasPrefix(id, "azure:"):
 		return opencost.AzureProvider
 	}
-
 	return ""
-
 }

+ 254 - 0
pkg/carbon/carbonassets_test.go

@@ -0,0 +1,254 @@
+package carbon
+
+import (
+	"math"
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	v1 "k8s.io/api/core/v1"
+)
+
+const (
+	// Known-good row from carbonlookupdata.csv:
+	//   AWS,us-east-1,t4g.nano,Node,0.012788433076234564,4.84769853777516e-06
+	awsT4gNanoUSEast1Coeff = 4.84769853777516e-06
+
+	// AWS,average-region,,Node,0.186739186034359,7.278989705005508e-05
+	awsAvgRegionNodeCoeff = 7.278989705005508e-05
+
+	// AWS,us-east-1,,Network,0.001135,4.30243315e-7
+	awsUSEast1NetworkCoeff = 4.30243315e-7
+)
+
// floatEqual compares floats at a tolerance appropriate for the lookup table
// values, which are stored with full float64 precision in the CSV.
func floatEqual(a, b float64) bool {
	if a == b {
		return true
	}
	// Absolute floor for near-zero values plus a relative term scaled by the
	// larger magnitude of the two operands.
	absTol := 1e-18
	relTol := 1e-12 * math.Max(math.Abs(a), math.Abs(b))
	return math.Abs(a-b) <= absTol+relTol
}
+
// nodeWithLabels builds a single-node fixture spanning `minutes` of runtime
// with the given canonical provider, providerID, and optional region /
// instance-type labels (using the stable upstream Kubernetes label keys).
// Note: fractional minutes are truncated by the Duration conversion.
func nodeWithLabels(provider, providerID, region, instanceType string, minutes float64) *opencost.Node {
	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
	end := start.Add(time.Duration(minutes) * time.Minute)
	window := opencost.NewWindow(&start, &end)

	n := opencost.NewNode("node", "cluster", providerID, start, end, window)
	n.Properties.Provider = provider
	labels := opencost.AssetLabels{}
	if region != "" {
		labels[v1.LabelTopologyRegion] = region
	}
	if instanceType != "" {
		labels[v1.LabelInstanceTypeStable] = instanceType
	}
	n.Labels = labels
	return n
}
+
// diskWithLabels builds a single-disk fixture spanning `minutes` of runtime
// with the given canonical provider, providerID, and optional region label.
func diskWithLabels(provider, providerID, region string, minutes float64) *opencost.Disk {
	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
	end := start.Add(time.Duration(minutes) * time.Minute)
	window := opencost.NewWindow(&start, &end)

	d := opencost.NewDisk("disk", "cluster", providerID, start, end, window)
	d.Properties.Provider = provider
	if region != "" {
		d.Labels = opencost.AssetLabels{v1.LabelTopologyRegion: region}
	}
	return d
}
+
// networkWithLabels builds a single-network fixture spanning `minutes` of
// runtime with the given canonical provider, providerID, and optional region label.
func networkWithLabels(provider, providerID, region string, minutes float64) *opencost.Network {
	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
	end := start.Add(time.Duration(minutes) * time.Minute)
	window := opencost.NewWindow(&start, &end)

	nw := opencost.NewNetwork("network", "cluster", providerID, start, end, window)
	nw.Properties.Provider = provider
	if region != "" {
		nw.Labels = opencost.AssetLabels{v1.LabelTopologyRegion: region}
	}
	return nw
}
+
+func TestInferProviderFromProviderID(t *testing.T) {
+	cases := []struct {
+		name string
+		id   string
+		want string
+	}{
+		{"empty", "", ""},
+		{"aws standard", "aws:///us-east-1a/i-0abc123", opencost.AWSProvider},
+		{"aws raw instance", "i-0abc123", opencost.AWSProvider},
+		{"gce standard", "gce://my-project/us-central1-a/gke-node-1", opencost.GCPProvider},
+		{"legacy gke prefix", "gke-node-1", opencost.GCPProvider},
+		{"azure standard", "azure:///subscriptions/x/resourceGroups/y/providers/Microsoft.Compute/virtualMachines/z", opencost.AzureProvider},
+		{"unknown prefix", "something-else", ""},
+		{"whitespace and case", "  AWS:///eu-west-1a/i-xyz  ", opencost.AWSProvider},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := inferProviderFromProviderID(tc.id); got != tc.want {
+				t.Fatalf("inferProviderFromProviderID(%q) = %q, want %q", tc.id, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestResolveProvider_PrefersCanonicalProperty(t *testing.T) {
+	// ProviderID is a GCP-shaped string but the canonical property says AWS.
+	// Canonical property wins.
+	n := nodeWithLabels(opencost.AWSProvider, "gce://foo/bar/baz", "us-east-1", "t4g.nano", 60)
+	if got := resolveProvider(n); got != opencost.AWSProvider {
+		t.Fatalf("resolveProvider = %q, want %q", got, opencost.AWSProvider)
+	}
+}
+
+func TestResolveProvider_FallsBackToProviderID(t *testing.T) {
+	// No canonical Provider property — must fall back to parsing ProviderID.
+	n := nodeWithLabels("", "gce://my-project/us-central1-a/gke-node-1", "us-central1", "e2-standard-2", 60)
+	if got := resolveProvider(n); got != opencost.GCPProvider {
+		t.Fatalf("resolveProvider = %q, want %q", got, opencost.GCPProvider)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_ExactMatch(t *testing.T) {
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsT4gNanoUSEast1Coeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g", got, awsT4gNanoUSEast1Coeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenRegionUnknown(t *testing.T) {
+	// Region is garbage; instance type is fine. Should fall back to
+	// (AWS, average-region, "") instead of returning zero.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///xx/i-1", "not-a-real-region", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenInstanceTypeUnknown(t *testing.T) {
+	// Region is real; instance type is unknown. Previously returned 0 because
+	// only the region was reset. Must now fall back to average-region.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "future-xxlarge", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenBothUnknown(t *testing.T) {
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///xx/i-1", "not-a-real-region", "future-xxlarge", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_ZeroForUnknownProvider(t *testing.T) {
+	n := nodeWithLabels("", "some-unknown-id", "us-east-1", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); got != 0 {
+		t.Fatalf("lookupCarbonCoeff = %g, want 0", got)
+	}
+}
+
+func TestLookupCarbonCoeff_Disk_ExactMatch(t *testing.T) {
+	// The CSV contains several disk rows per (provider, region), one per
+	// disk type. They collide under a key of (provider, region), so we
+	// check the lookup against whatever value the table actually holds.
+	want, ok := carbonLookupDisk[carbonLookupKeyRegion{opencost.AWSProvider, "us-east-1"}]
+	if !ok || want == 0 {
+		t.Fatalf("expected AWS/us-east-1 disk coefficient to be loaded")
+	}
+	d := diskWithLabels(opencost.AWSProvider, "aws:///us-east-1a/vol-1", "us-east-1", 60)
+	if got := lookupCarbonCoeff(d); !floatEqual(got, want) {
+		t.Fatalf("lookupCarbonCoeff disk = %g, want %g", got, want)
+	}
+}
+
+func TestLookupCarbonCoeff_Disk_FallsBackWhenRegionUnknown(t *testing.T) {
+	d := diskWithLabels(opencost.AWSProvider, "aws:///xx/vol-1", "not-a-real-region", 60)
+	want, ok := carbonLookupDisk[carbonLookupKeyRegion{opencost.AWSProvider, averageRegionKey}]
+	if !ok {
+		t.Fatalf("expected AWS average-region disk coefficient to be loaded")
+	}
+	if got := lookupCarbonCoeff(d); !floatEqual(got, want) {
+		t.Fatalf("lookupCarbonCoeff disk fallback = %g, want %g", got, want)
+	}
+}
+
+func TestLookupCarbonCoeff_Network_Populated(t *testing.T) {
+	// Regression: Network rows were loaded but never consulted, so every
+	// Network asset produced 0 emissions.
+	nw := networkWithLabels(opencost.AWSProvider, "aws:///us-east-1a/net-1", "us-east-1", 60)
+	if got := lookupCarbonCoeff(nw); !floatEqual(got, awsUSEast1NetworkCoeff) {
+		t.Fatalf("lookupCarbonCoeff network = %g, want %g", got, awsUSEast1NetworkCoeff)
+	}
+}
+
+func TestRelateCarbonAssets_MinutesToHours(t *testing.T) {
+	// Coefficient is tonnes CO2e per hour, so 120 minutes should yield exactly
+	// twice the coefficient.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "t4g.nano", 120)
+	as := opencost.NewAssetSet(*n.Window.Start(), *n.Window.End(), n)
+
+	rows, err := RelateCarbonAssets(as)
+	if err != nil {
+		t.Fatalf("RelateCarbonAssets: %v", err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("got %d rows, want 1", len(rows))
+	}
+	var row CarbonRow
+	for _, r := range rows {
+		row = r
+	}
+	want := awsT4gNanoUSEast1Coeff * 2
+	if !floatEqual(row.Co2e, want) {
+		t.Fatalf("Co2e = %g, want %g", row.Co2e, want)
+	}
+}
+
+func TestRelateCarbonAssets_ZeroForUnknownProvider(t *testing.T) {
+	n := nodeWithLabels("", "totally-unknown", "us-east-1", "t4g.nano", 60)
+	as := opencost.NewAssetSet(*n.Window.Start(), *n.Window.End(), n)
+
+	rows, err := RelateCarbonAssets(as)
+	if err != nil {
+		t.Fatalf("RelateCarbonAssets: %v", err)
+	}
+	for _, r := range rows {
+		if r.Co2e != 0 {
+			t.Fatalf("Co2e = %g, want 0 for unknown provider", r.Co2e)
+		}
+	}
+}
+
+func TestLookupCarbonCoeff_NoPanicOnNilProperties(t *testing.T) {
+	// A bare Node with nil Properties must not panic — older code would
+	// dereference props.ProviderID in the log line after resolveProvider
+	// returned "" for nil properties.
+	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
+	end := start.Add(60 * time.Minute)
+	window := opencost.NewWindow(&start, &end)
+	n := opencost.NewNode("node", "cluster", "", start, end, window)
+	n.Properties = nil
+
+	if got := lookupCarbonCoeff(n); got != 0 {
+		t.Fatalf("lookupCarbonCoeff with nil properties = %g, want 0", got)
+	}
+}
+
+func TestLookupTables_LoadedAtInit(t *testing.T) {
+	if len(carbonLookupNode) == 0 {
+		t.Error("carbonLookupNode is empty — init did not populate node lookups")
+	}
+	if len(carbonLookupDisk) == 0 {
+		t.Error("carbonLookupDisk is empty — init did not populate disk lookups")
+	}
+	if len(carbonLookupNetwork) == 0 {
+		t.Error("carbonLookupNetwork is empty — init did not populate network lookups")
+	}
+}

+ 45 - 8
pkg/cloud/alibaba/provider.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"os"
 	"regexp"
 	"strconv"
@@ -60,8 +61,8 @@ const (
 )
 
 var (
-	// Regular expression to get the numerical value of PV suffix with GiB from *v1.PersistentVolume.
-	sizeRegEx = regexp.MustCompile("(.*?)Gi")
+	// sizeRegEx parses a PV capacity string into a numeric part and an optional binary SI suffix (Ki, Mi, Gi, Ti).
+	sizeRegEx = regexp.MustCompile(`^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti)?$`)
 )
 
 // Variable to keep track of instance families that fail in DescribePrice API due improper defaulting of systemDisk if the information is not available
@@ -1296,21 +1297,57 @@ func generateSlimK8sNodeFromV1Node(node *clustercache.Node) *SlimK8sNode {
 	return NewSlimK8sNode(instanceType, regionID, priceUnit, memorySizeInKiB, osType, providerID, instanceFamily, IsIoOptimized, systemDisk)
 }
 
-// getNumericalValueFromResourceQuantity returns the numericalValue of the resourceQuantity
-// An example is: 20Gi returns to 20. If any error occurs it returns the default value used in describePrice API which is 2000.
+// getNumericalValueFromResourceQuantity converts a Kubernetes PV capacity string (e.g. "20Gi", "48828125Ki")
+// into a whole GiB integer string, as required by the Alibaba DescribePrice API.
+// Returns ALIBABA_DEFAULT_DATADISK_SIZE if the quantity cannot be parsed.
 func getNumericalValueFromResourceQuantity(quantity string) (value string) {
-	// defaulting when any panic or empty string occurs.
 	defer func() {
-		log.Debugf("unable to determine the size of the PV so defaulting the size to %s", ALIBABA_DEFAULT_DATADISK_SIZE)
 		if err := recover(); err != nil {
+			log.Debugf("panic while parsing PV capacity %q, defaulting to %s: %v", quantity, ALIBABA_DEFAULT_DATADISK_SIZE, err)
 			value = ALIBABA_DEFAULT_DATADISK_SIZE
 		}
+		// Any early "return" with an empty value also falls back to the default.
 		if value == "" {
+			log.Debugf("unable to determine the size of the PV from quantity %q, defaulting to %s", quantity, ALIBABA_DEFAULT_DATADISK_SIZE)
 			value = ALIBABA_DEFAULT_DATADISK_SIZE
 		}
 	}()
-	res := sizeRegEx.FindAllStringSubmatch(quantity, 1)
-	value = res[0][1]
+
+	res := sizeRegEx.FindStringSubmatch(strings.TrimSpace(quantity))
+	// NOTE(review): sizeRegEx only matches binary suffixes (Ki/Mi/Gi/Ti); decimal
+	// suffixes (K/M/G/T) and milli ("m") quantities fail the match and fall back
+	// to the default size via the deferred handler — confirm this is intended.
+	if len(res) < 2 || res[1] == "" {
+		return
+	}
+
+	numericPart, err := strconv.ParseFloat(res[1], 64)
+	if err != nil || numericPart <= 0 {
+		return
+	}
+
+	unit := ""
+	if len(res) >= 3 {
+		unit = res[2]
+	}
+
+	// Convert the parsed amount into GiB.
+	var sizeInGiB float64
+	switch unit {
+	case "Ki":
+		sizeInGiB = numericPart / (1024 * 1024)
+	case "Mi":
+		sizeInGiB = numericPart / 1024
+	case "Gi":
+		sizeInGiB = numericPart
+	case "Ti":
+		sizeInGiB = numericPart * 1024
+	default:
+		// No suffix matched: interpret the bare number as bytes.
+		sizeInGiB = numericPart / (1024 * 1024 * 1024)
+	}
+
+	// ceil so we never underreport disk size to the DescribePrice API.
+	sizeInGiBInt := int64(math.Ceil(sizeInGiB))
+	if sizeInGiBInt <= 0 {
+		return
+	}
+
+	value = strconv.FormatInt(sizeInGiBInt, 10)
 	return
 }
 

+ 69 - 3
pkg/cloud/alibaba/provider_test.go

@@ -770,20 +770,86 @@ func TestGetNumericalValueFromResourceQuantity(t *testing.T) {
 			expectedValue:        "10",
 		},
 		{
-			name:                 "negative scenario: when inputResourceQuantity is Gi",
+			name:                 "negative scenario: when inputResourceQuantity is Gi (no numeric prefix)",
 			inputResourceQuanity: "Gi",
 			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
 		},
 		{
-			name:                 "negative scenario: when inputResourceQuantity is 10",
+			name:                 "edge case: when inputResourceQuantity is 10 (plain bytes, rounds up to 1 GiB)",
 			inputResourceQuanity: "10",
-			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
+			expectedValue:        "1",
 		},
 		{
 			name:                 "negative scenario: when inputResourceQuantity is empty string",
 			inputResourceQuanity: "",
 			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
 		},
+		{
+			name:                 "negative scenario: when inputResourceQuantity is non-numeric",
+			inputResourceQuanity: "abc",
+			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
+		},
+		{
+			// 48828125Ki / (1024*1024) = 46.875 GiB, rounds up to 47
+			name:                 "positive scenario: Ki unit - 48828125Ki",
+			inputResourceQuanity: "48828125Ki",
+			expectedValue:        "47",
+		},
+		{
+			name:                 "positive scenario: Ki unit - 1048576Ki (exactly 1 GiB)",
+			inputResourceQuanity: "1048576Ki",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Ki unit - 2097152Ki (exactly 2 GiB)",
+			inputResourceQuanity: "2097152Ki",
+			expectedValue:        "2",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 512Mi (rounds up to 1 GiB)",
+			inputResourceQuanity: "512Mi",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 1024Mi (exactly 1 GiB)",
+			inputResourceQuanity: "1024Mi",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 20480Mi (exactly 20 GiB)",
+			inputResourceQuanity: "20480Mi",
+			expectedValue:        "20",
+		},
+		{
+			name:                 "positive scenario: Gi unit - 20Gi",
+			inputResourceQuanity: "20Gi",
+			expectedValue:        "20",
+		},
+		{
+			name:                 "positive scenario: Gi unit - 100Gi",
+			inputResourceQuanity: "100Gi",
+			expectedValue:        "100",
+		},
+		{
+			name:                 "positive scenario: Ti unit - 1Ti",
+			inputResourceQuanity: "1Ti",
+			expectedValue:        "1024",
+		},
+		{
+			name:                 "positive scenario: Ti unit - 2Ti",
+			inputResourceQuanity: "2Ti",
+			expectedValue:        "2048",
+		},
+		{
+			name:                 "positive scenario: fractional Gi unit - 1.5Gi (rounds up to 2 GiB)",
+			inputResourceQuanity: "1.5Gi",
+			expectedValue:        "2",
+		},
+		{
+			name:                 "positive scenario: fractional Mi unit - 1536Mi (1.5 GiB, rounds up to 2 GiB)",
+			inputResourceQuanity: "1536Mi",
+			expectedValue:        "2",
+		},
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {

+ 124 - 24
pkg/cloud/aws/provider.go

@@ -53,6 +53,7 @@ const (
 
 	APIPricingSource              = "Public API"
 	SpotPricingSource             = "Spot Data Feed"
+	SpotPriceHistorySource        = "Spot Price History"
 	ReservedInstancePricingSource = "Savings Plan, Reserved Instance, and Out-Of-Cluster"
 	FargatePricingSource          = "Fargate"
 
@@ -96,11 +97,7 @@ func (aws *AWS) PricingSourceStatus() map[string]*models.PricingSource {
 		Enabled: true,
 	}
 
-	if !aws.SpotRefreshEnabled() {
-		sps.Available = false
-		sps.Error = "Spot instances not set up"
-		sps.Enabled = false
-	} else {
+	if aws.SpotFeedRefreshEnabled() {
 		sps.Error = ""
 		if aws.SpotPricingError != nil {
 			sps.Error = aws.SpotPricingError.Error()
@@ -112,9 +109,28 @@ func (aws *AWS) PricingSourceStatus() map[string]*models.PricingSource {
 		} else {
 			sps.Error = "No spot instances detected"
 		}
+	} else {
+		sps.Available = false
+		sps.Error = "Spot instances not set up"
+		sps.Enabled = false
 	}
 	sources[SpotPricingSource] = sps
 
+	sphs := &models.PricingSource{
+		Name:    SpotPriceHistorySource,
+		Enabled: true,
+	}
+	if aws.SpotPriceHistoryError != nil {
+		sphs.Error = aws.SpotPriceHistoryError.Error()
+		sphs.Available = false
+	} else if aws.SpotPriceHistoryCache == nil {
+		sphs.Error = "Not yet initialized"
+		sphs.Available = false
+	} else {
+		sphs.Available = true
+	}
+	sources[SpotPriceHistorySource] = sphs
+
 	rps := &models.PricingSource{
 		Name:    ReservedInstancePricingSource,
 		Enabled: true,
@@ -185,6 +201,8 @@ type AWS struct {
 	SpotRefreshRunning          bool
 	SpotPricingLock             sync.RWMutex
 	SpotPricingError            error
+	SpotPriceHistoryCache       *SpotPriceHistoryCache
+	SpotPriceHistoryError       error
 	RIPricingByInstanceID       map[string]*RIData
 	RIPricingError              error
 	RIDataRunning               bool
@@ -848,8 +866,8 @@ func (aws *AWS) getRegionPricing(nodeList []*clustercache.Node) (*http.Response,
 	return resp, pricingURL, err
 }
 
-// SpotRefreshEnabled determines whether the required configs to run the spot feed query have been set up
-func (aws *AWS) SpotRefreshEnabled() bool {
+// SpotFeedRefreshEnabled determines whether the required configs to run the spot feed query have been set up
+func (aws *AWS) SpotFeedRefreshEnabled() bool {
 	// Guard against nil receiver
 	if aws == nil {
 		return false
@@ -1019,28 +1037,36 @@ func (aws *AWS) DownloadPricingData() error {
 	}
 	log.Infof("Finished downloading \"%s\"", pricingURL)
 
-	if !aws.SpotRefreshEnabled() {
-		return nil
+	// Initialize a spot price history cache if not already initialized.
+	// Reset error to allow retrying on subsequent DownloadPricingData calls.
+	if aws.SpotPriceHistoryCache == nil {
+		aws.SpotPriceHistoryError = nil
+		aws.SpotPriceHistoryCache, aws.SpotPriceHistoryError = aws.initializeSpotPriceHistoryCache()
+		if aws.SpotPriceHistoryError != nil {
+			log.Errorf("Failed to initialize spot price history manager: %v", aws.SpotPriceHistoryError)
+		}
 	}
 
-	// Always run spot pricing refresh when performing download
-	aws.refreshSpotPricing(true)
+	if aws.SpotFeedRefreshEnabled() {
+		// Always run spot pricing refresh when performing download
+		aws.refreshSpotPricing(true)
 
-	// Only start a single refresh goroutine
-	if !aws.SpotRefreshRunning {
-		aws.SpotRefreshRunning = true
+		// Only start a single refresh goroutine
+		if !aws.SpotRefreshRunning {
+			aws.SpotRefreshRunning = true
 
-		go func() {
-			defer errs.HandlePanic()
+			go func() {
+				defer errs.HandlePanic()
 
-			for {
-				log.Infof("Spot Pricing Refresh scheduled in %.2f minutes.", SpotRefreshDuration.Minutes())
-				time.Sleep(SpotRefreshDuration)
+				for {
+					log.Infof("Spot Pricing Refresh scheduled in %.2f minutes.", SpotRefreshDuration.Minutes())
+					time.Sleep(SpotRefreshDuration)
 
-				// Reoccurring refresh checks update times
-				aws.refreshSpotPricing(false)
-			}
-		}()
+					// Reoccurring refresh checks update times
+					aws.refreshSpotPricing(false)
+				}
+			}()
+		}
 	}
 
 	return nil
@@ -1278,6 +1304,60 @@ func (aws *AWS) refreshSpotPricing(force bool) {
 	aws.SpotPricingByInstanceID = sp
 }
 
+// initializeSpotPriceHistoryCache builds a SpotPriceHistoryCache backed by an
+// AWS fetcher configured with this provider's credentials and cluster region.
+// It returns an error when credentials or the cluster region are unavailable;
+// the caller stores that error for surfacing via PricingSourceStatus.
+func (aws *AWS) initializeSpotPriceHistoryCache() (*SpotPriceHistoryCache, error) {
+	log.Info("Initializing AWS Spot Price History Manager")
+
+	// Get AWS access key for creating config
+	accessKey, err := aws.GetAWSAccessKey()
+	if err != nil {
+		return nil, fmt.Errorf("getting AWS access key for spot price history: %w", err)
+	}
+
+	// Use the cluster region to create the initial AWS config and credentials.
+	// The SpotPriceHistoryFetcher itself can query multiple regions by creating
+	// region-specific EC2 clients as needed.
+	if aws.ClusterRegion == "" {
+		return nil, fmt.Errorf("no cluster region configured")
+	}
+
+	// Create config for the cluster region
+	awsConfig, err := accessKey.CreateConfig(aws.ClusterRegion)
+	if err != nil {
+		return nil, fmt.Errorf("creating AWS config for spot price history: %w", err)
+	}
+
+	return NewSpotPriceHistoryCache(NewAWSSpotPriceHistoryFetcher(awsConfig)), nil
+}
+
+// spotPricingFromHistory looks up the most recent spot price for the node
+// described by k via the SpotPriceHistoryCache. It returns (entry, true) on
+// success, or (nil, false) when the cache is uninitialized, k is not an
+// *awsKey, required labels are missing, or the cache lookup fails.
+func (aws *AWS) spotPricingFromHistory(k models.Key) (*SpotPriceHistoryEntry, bool) {
+	if aws.SpotPriceHistoryCache == nil {
+		return nil, false
+	}
+
+	// Extract region, instance type, and availability zone from the key
+	awsKey, ok := k.(*awsKey)
+	if !ok {
+		log.DedupedWarningf(10, "Failed to cast key to awsKey for spot price history lookup: %s", k.ID())
+		return nil, false
+	}
+
+	region, regionOk := util.GetRegion(awsKey.Labels)
+	instanceType, instanceTypeOk := util.GetInstanceType(awsKey.Labels)
+	availabilityZone, availabilityZoneOk := util.GetZone(awsKey.Labels)
+	// Skip lookup if any required information is missing
+	if !regionOk || !instanceTypeOk || !availabilityZoneOk {
+		log.DedupedWarningf(10, "Missing required info for spot price history lookup (region: %s, instanceType: %s, zone: %s): %s", region, instanceType, availabilityZone, k.ID())
+		return nil, false
+	}
+
+	// GetSpotPrice serves cached entries (including negative/error entries)
+	// and refreshes stale ones on demand.
+	price, err := aws.SpotPriceHistoryCache.GetSpotPrice(region, instanceType, availabilityZone)
+	if err != nil {
+		log.DedupedWarningf(10, "Failed to get spot price history for instance %s: %s", k.ID(), err.Error())
+		return nil, false
+	}
+	return price, true
+}
+
 // Stubbed NetworkPricing for AWS. Pull directly from aws.json for now
 func (aws *AWS) NetworkPricing() (*models.Network, error) {
 	cpricing, err := aws.Config.GetCustomPricingData()
@@ -1403,9 +1483,29 @@ func (aws *AWS) createNode(terms *AWSProductTerms, usageType string, k models.Ke
 			UsageType:    PreemptibleType,
 		}, meta, nil
 	} else if aws.isPreemptible(key) { // Preemptible but we don't have any data in the pricing report.
-		log.DedupedWarningf(5, "Node %s marked preemptible but we have no data in spot feed", k.ID())
+		log.DedupedWarningf(5, "Node %s marked preemptible but no spot feed data available; falling back to other pricing sources", k.ID())
+
+		// Try to get spot pricing from DescribeSpotPriceHistory API
+		if historyEntry, ok := aws.spotPricingFromHistory(k); ok {
+			log.DedupedInfof(5, "Using spot price history data for node %s: $%f", k.ID(), historyEntry.SpotPrice)
+			spotHistoryCost := fmt.Sprintf("%f", historyEntry.SpotPrice)
+			meta.Source = SpotPriceHistorySource
+			return &models.Node{
+				Cost:         spotHistoryCost,
+				VCPU:         terms.VCpu,
+				RAM:          terms.Memory,
+				GPU:          terms.GPU,
+				Storage:      terms.Storage,
+				BaseCPUPrice: aws.BaseCPUPrice,
+				BaseRAMPrice: aws.BaseRAMPrice,
+				BaseGPUPrice: aws.BaseGPUPrice,
+				UsageType:    PreemptibleType,
+			}, meta, nil
+		}
+
 		if publicPricingFound {
 			// return public price if found
+			log.DedupedWarningf(5, "No spot price history available for %s, falling back to on-demand pricing", k.ID())
 			return &models.Node{
 				Cost:         cost,
 				VCPU:         terms.VCpu,

+ 377 - 7
pkg/cloud/aws/provider_test.go

@@ -2,12 +2,14 @@ package aws
 
 import (
 	"encoding/json"
+	"errors"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/pkg/cloud/models"
@@ -867,7 +869,7 @@ func (f *fakeProviderConfig) ConfigFileManager() *config.ConfigFileManager {
 	return nil
 }
 
-func TestAWS_SpotRefreshEnabled(t *testing.T) {
+func TestAWS_SpotFeedRefreshEnabled(t *testing.T) {
 	tests := []struct {
 		name                string
 		spotDataBucket      string
@@ -955,9 +957,9 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 				},
 			}
 
-			got := aws.SpotRefreshEnabled()
+			got := aws.SpotFeedRefreshEnabled()
 			if got != tt.want {
-				t.Errorf("AWS.SpotRefreshEnabled() = %v, want %v", got, tt.want)
+				t.Errorf("AWS.SpotFeedRefreshEnabled() = %v, want %v", got, tt.want)
 			}
 		})
 	}
@@ -971,10 +973,10 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 			Config:         nil, // nil Config should not cause panic
 		}
 
-		got := aws.SpotRefreshEnabled()
+		got := aws.SpotFeedRefreshEnabled()
 		want := true // Should fall back to field-based check
 		if got != want {
-			t.Errorf("AWS.SpotRefreshEnabled() with nil Config = %v, want %v", got, want)
+			t.Errorf("AWS.SpotFeedRefreshEnabled() with nil Config = %v, want %v", got, want)
 		}
 	})
 
@@ -986,10 +988,378 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 			Config:         nil, // nil Config should not cause panic
 		}
 
-		got := aws.SpotRefreshEnabled()
+		got := aws.SpotFeedRefreshEnabled()
 		want := false // No fields set, should return false
 		if got != want {
-			t.Errorf("AWS.SpotRefreshEnabled() with nil Config and no fields = %v, want %v", got, want)
+			t.Errorf("AWS.SpotFeedRefreshEnabled() with nil Config and no fields = %v, want %v", got, want)
+		}
+	})
+}
+
+// TestAWS_spotPricingFromHistory covers the guard paths of
+// spotPricingFromHistory (nil cache, missing region/instance-type/zone labels,
+// fetcher error) and the happy path where the mocked fetcher returns a price.
+func TestAWS_spotPricingFromHistory(t *testing.T) {
+	t.Run("nil cache returns false", func(t *testing.T) {
+		aws := &AWS{}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when cache is nil")
+		}
+	})
+
+	t.Run("missing region label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when region label is missing")
+		}
+	})
+
+	t.Run("missing instance type label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region": "us-east-1",
+				"topology.kubernetes.io/zone":   "us-east-1a",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when instance type label is missing")
+		}
+	})
+
+	t.Run("missing zone label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when zone label is missing")
+		}
+	})
+
+	t.Run("fetcher error returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("api error")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when fetcher returns error")
+		}
+	})
+
+	t.Run("successful lookup returns entry", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				if key.Region != "us-east-1" || key.InstanceType != "m5.large" || key.AvailabilityZone != "us-east-1a" {
+					t.Errorf("Unexpected key: %v", key)
+				}
+				return &SpotPriceHistoryEntry{
+					SpotPrice:   0.042,
+					Timestamp:   time.Now(),
+					RetrievedAt: time.Now(),
+				}, nil
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		entry, ok := aws.spotPricingFromHistory(key)
+		if !ok {
+			t.Fatal("Expected true for successful lookup")
+		}
+		if entry.SpotPrice != 0.042 {
+			t.Errorf("Expected spot price 0.042, got %f", entry.SpotPrice)
+		}
+	})
+}
+
+// TestAWS_createNode_spotHistoryFallback exercises the pricing-source fallback
+// chain in createNode for preemptible nodes with no spot-feed data:
+// spot price history first, then public on-demand pricing, then the
+// configured base spot CPU/RAM prices.
+func TestAWS_createNode_spotHistoryFallback(t *testing.T) {
+	// Helper to build AWSProductTerms with on-demand pricing
+	makeTerms := func(sku, offerTermCode, cost string) *AWSProductTerms {
+		priceKey := sku + "." + offerTermCode + "." + HourlyRateCode
+		return &AWSProductTerms{
+			Sku: sku,
+			OnDemand: &AWSOfferTerm{
+				Sku:           sku,
+				OfferTermCode: offerTermCode,
+				PriceDimensions: map[string]*AWSRateCode{
+					priceKey: {
+						Unit:         "Hrs",
+						PricePerUnit: AWSCurrencyCode{USD: cost},
+					},
+				},
+			},
+			VCpu:   "4",
+			Memory: "16",
+		}
+	}
+
+	t.Run("preemptible node uses spot history when available", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return &SpotPriceHistoryEntry{
+					SpotPrice:   0.035,
+					Timestamp:   time.Now(),
+					RetrievedAt: time.Now(),
+				}, nil
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		// Key with PreemptibleType suffix to trigger isPreemptible
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, meta, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.035000" {
+			t.Errorf("Expected spot history cost 0.035000, got %s", node.Cost)
+		}
+		if node.UsageType != PreemptibleType {
+			t.Errorf("Expected usage type %s, got %s", PreemptibleType, node.UsageType)
+		}
+		if meta.Source != SpotPriceHistorySource {
+			t.Errorf("Expected source %s, got %s", SpotPriceHistorySource, meta.Source)
+		}
+	})
+
+	t.Run("preemptible node falls back to on-demand when history unavailable", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("no data")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.096" {
+			t.Errorf("Expected on-demand cost 0.096, got %s", node.Cost)
+		}
+		if node.UsageType != PreemptibleType {
+			t.Errorf("Expected usage type %s, got %s", PreemptibleType, node.UsageType)
+		}
+	})
+
+	t.Run("preemptible node with nil cache falls back to on-demand", func(t *testing.T) {
+		aws := &AWS{
+			BaseCPUPrice: "0.04",
+			BaseRAMPrice: "0.01",
+			BaseGPUPrice: "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.096" {
+			t.Errorf("Expected on-demand cost 0.096, got %s", node.Cost)
+		}
+	})
+
+	t.Run("preemptible node uses base spot prices when no public pricing", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("no data")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+			BaseSpotCPUPrice:      "0.02",
+			BaseSpotRAMPrice:      "0.005",
+		}
+		// Terms without valid pricing dimensions
+		terms := &AWSProductTerms{
+			Sku: "SKU123",
+			OnDemand: &AWSOfferTerm{
+				Sku:             "SKU123",
+				OfferTermCode:   "JRTCKXETXF",
+				PriceDimensions: map[string]*AWSRateCode{},
+			},
+			VCpu:   "4",
+			Memory: "16",
+		}
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.VCPUCost != "0.02" {
+			t.Errorf("Expected base spot CPU price 0.02, got %s", node.VCPUCost)
+		}
+		if node.RAMCost != "0.005" {
+			t.Errorf("Expected base spot RAM price 0.005, got %s", node.RAMCost)
+		}
+	})
+}
+
+// TestAWS_PricingSourceStatus_spotPriceHistory checks the three reported
+// states of the Spot Price History pricing source: not yet initialized,
+// initialization error, and successfully initialized.
+func TestAWS_PricingSourceStatus_spotPriceHistory(t *testing.T) {
+	t.Run("not yet initialized", func(t *testing.T) {
+		aws := &AWS{
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs, ok := sources[SpotPriceHistorySource]
+		if !ok {
+			t.Fatal("Expected SpotPriceHistorySource in sources")
+		}
+		if sphs.Available {
+			t.Error("Expected Available=false when cache not initialized")
+		}
+		if sphs.Error != "Not yet initialized" {
+			t.Errorf("Expected 'Not yet initialized' error, got %q", sphs.Error)
+		}
+	})
+
+	t.Run("initialization error", func(t *testing.T) {
+		aws := &AWS{
+			SpotPriceHistoryError: errors.New("no cluster region configured"),
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs := sources[SpotPriceHistorySource]
+		if sphs.Available {
+			t.Error("Expected Available=false on error")
+		}
+		if sphs.Error != "no cluster region configured" {
+			t.Errorf("Expected error message, got %q", sphs.Error)
+		}
+	})
+
+	t.Run("successfully initialized", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs := sources[SpotPriceHistorySource]
+		if !sphs.Available {
+			t.Error("Expected Available=true when cache initialized")
 		}
 	})
 }

+ 200 - 0
pkg/cloud/aws/spotpricehistory.go

@@ -0,0 +1,200 @@
+package aws
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	awsSDK "github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2Types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+
+	"github.com/opencost/opencost/core/pkg/log"
+)
+
+// SpotPriceHistoryKey uniquely identifies a spot price lookup by region,
+// instance type, and availability zone.
+type SpotPriceHistoryKey struct {
+	Region           string
+	InstanceType     string
+	AvailabilityZone string
+}
+
+// String renders the key as "region/instanceType/availabilityZone" for use
+// in log and error messages.
+func (key SpotPriceHistoryKey) String() string {
+	return fmt.Sprintf("%s/%s/%s", key.Region, key.InstanceType, key.AvailabilityZone)
+}
+
+const (
+	// SpotPriceHistoryCacheAge is how long a cached entry — including a
+	// negative (error) entry — is considered fresh before being re-fetched.
+	SpotPriceHistoryCacheAge = 1 * time.Hour
+)
+
+// SpotPriceHistoryEntry holds a cached spot price from the DescribeSpotPriceHistory API.
+type SpotPriceHistoryEntry struct {
+	SpotPrice float64   // hourly spot price as parsed from the API response
+	Timestamp time.Time // timestamp AWS attached to this price record
+
+	RetrievedAt time.Time // when this entry was fetched; drives shouldRefresh
+	Error       error // Negative cache: the error from the last failed fetch, if any
+}
+
+// shouldRefresh reports whether the entry is older than SpotPriceHistoryCacheAge.
+func (spe SpotPriceHistoryEntry) shouldRefresh() bool {
+	return time.Since(spe.RetrievedAt) > SpotPriceHistoryCacheAge
+}
+
+// SpotPriceHistoryCache provides a thread-safe, on-demand cache for spot prices
+// retrieved via the DescribeSpotPriceHistory API. Entries are cached for
+// SpotPriceHistoryCacheAge and include negative caching for errors.
+//
+// Concurrent requests for the same key are coalesced: refreshRunning marks
+// keys with an in-flight fetch, and refreshCond (bound to mutex) wakes waiters
+// once the fetch completes, so each stale key is fetched at most once at a time.
+type SpotPriceHistoryCache struct {
+	cache          map[SpotPriceHistoryKey]*SpotPriceHistoryEntry
+	mutex          sync.Mutex
+	refreshRunning map[SpotPriceHistoryKey]bool
+	refreshCond    *sync.Cond
+
+	fetcher SpotPriceHistoryFetcher
+}
+
+// NewSpotPriceHistoryCache returns an empty cache that uses fetcher to
+// retrieve prices on demand.
+func NewSpotPriceHistoryCache(fetcher SpotPriceHistoryFetcher) *SpotPriceHistoryCache {
+	cache := &SpotPriceHistoryCache{
+		cache:          make(map[SpotPriceHistoryKey]*SpotPriceHistoryEntry),
+		refreshRunning: make(map[SpotPriceHistoryKey]bool),
+
+		fetcher: fetcher,
+	}
+	cache.refreshCond = sync.NewCond(&cache.mutex)
+	return cache
+}
+
+// GetSpotPrice returns the cached spot price for the given region, instance type,
+// and availability zone. If the cache entry is missing or stale, it fetches a
+// fresh value from the underlying SpotPriceHistoryFetcher.
+// Failed fetches are negative-cached: the error is stored on the entry and
+// returned to callers until the entry ages out of the cache.
+func (sph *SpotPriceHistoryCache) GetSpotPrice(region, instanceType, availabilityZone string) (*SpotPriceHistoryEntry, error) {
+	key := SpotPriceHistoryKey{
+		Region:           region,
+		InstanceType:     instanceType,
+		AvailabilityZone: availabilityZone,
+	}
+	sph.mutex.Lock()
+	// Wait for any in-flight refresh of this key so we read its fresh result.
+	for sph.refreshRunning[key] {
+		sph.refreshCond.Wait()
+	}
+	// Check if we have cached price. If so, return it.
+	entry, exists := sph.cache[key]
+	if exists && !entry.shouldRefresh() {
+		sph.mutex.Unlock()
+		return entry, entry.Error
+	}
+	// Either a cache entry does not exist or it is stale. Refresh it.
+	sph.refreshRunning[key] = true
+	sph.mutex.Unlock()
+
+	// Ensure refreshRunning is always cleared, even if the fetcher panics.
+	defer func() {
+		sph.mutex.Lock()
+		delete(sph.refreshRunning, key)
+		sph.refreshCond.Broadcast()
+		sph.mutex.Unlock()
+	}()
+
+	// Fetch the entry
+	entry, err := sph.fetcher.FetchSpotPrice(key)
+	if err != nil || entry == nil {
+		// If we fail to fetch or get a nil entry, create a negative cache entry.
+		if err == nil {
+			err = fmt.Errorf("fetcher returned nil entry for %s", key)
+		}
+		entry = &SpotPriceHistoryEntry{
+			RetrievedAt: time.Now(),
+			Error:       err,
+		}
+	} else {
+		// Normalize cache metadata so cache freshness does not depend on
+		// the fetcher setting these fields correctly.
+		entry.RetrievedAt = time.Now()
+		entry.Error = nil
+	}
+
+	// Store it into the cache
+	sph.mutex.Lock()
+	sph.cache[key] = entry
+	sph.mutex.Unlock()
+	return entry, entry.Error
+}
+
+// SpotPriceHistoryFetcher is the interface for fetching spot prices from the
+// DescribeSpotPriceHistory API (or a mock for testing).
+type SpotPriceHistoryFetcher interface {
+	FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error)
+}
+
+// AWSSpotPriceHistoryFetcher implements SpotPriceHistoryFetcher using the real
+// AWS EC2 DescribeSpotPriceHistory API. It maintains a pool of per-region
+// EC2 clients.
+type AWSSpotPriceHistoryFetcher struct {
+	awsConfig       awsSDK.Config // base config; copied per region in getEC2Client
+	ec2ClientsMutex sync.Mutex
+	ec2Clients      map[string]*ec2.Client // keyed by region; guarded by ec2ClientsMutex
+}
+
+// NewAWSSpotPriceHistoryFetcher returns a fetcher that derives per-region EC2
+// clients from the supplied base AWS config.
+func NewAWSSpotPriceHistoryFetcher(awsConfig awsSDK.Config) *AWSSpotPriceHistoryFetcher {
+	return &AWSSpotPriceHistoryFetcher{
+		awsConfig:  awsConfig,
+		ec2Clients: make(map[string]*ec2.Client),
+	}
+}
+
+// getEC2Client lazily creates and memoizes an EC2 client for the given region.
+// The base awsConfig is a value, so mutating the local copy's Region leaves
+// the shared base config untouched.
+func (a *AWSSpotPriceHistoryFetcher) getEC2Client(region string) *ec2.Client {
+	a.ec2ClientsMutex.Lock()
+	defer a.ec2ClientsMutex.Unlock()
+	if client, ok := a.ec2Clients[region]; ok {
+		return client
+	}
+	config := a.awsConfig
+	config.Region = region
+	client := ec2.NewFromConfig(config)
+	a.ec2Clients[region] = client
+	return client
+}
+
+// FetchSpotPrice queries DescribeSpotPriceHistory for the key's instance type
+// and availability zone (Linux/UNIX VPC prices only) with a 10s timeout, and
+// converts the first returned record into a SpotPriceHistoryEntry.
+func (a *AWSSpotPriceHistoryFetcher) FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+	log.Debugf("Retrieving spot price history for %s", key)
+	client := a.getEC2Client(key.Region)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	input := &ec2.DescribeSpotPriceHistoryInput{
+		InstanceTypes:    []ec2Types.InstanceType{ec2Types.InstanceType(key.InstanceType)},
+		AvailabilityZone: awsSDK.String(key.AvailabilityZone),
+		// Only retrieve Linux/UNIX (Amazon VPC) prices. The non-VPC
+		// "Linux/UNIX" variant was for EC2-Classic, which was fully retired in
+		// August 2023.
+		ProductDescriptions: []string{
+			"Linux/UNIX (Amazon VPC)",
+		},
+		// Only retrieve the latest price.
+		// NOTE(review): no StartTime/EndTime is set, so this assumes the API
+		// returns results newest-first; confirm against the
+		// DescribeSpotPriceHistory docs that MaxResults=1 yields the most
+		// recent price rather than an arbitrary page.
+		MaxResults: awsSDK.Int32(1),
+	}
+
+	resp, err := client.DescribeSpotPriceHistory(ctx, input)
+	if err != nil {
+		return nil, fmt.Errorf("describing spot price history for %s: %w", key, err)
+	}
+	if len(resp.SpotPriceHistory) == 0 {
+		return nil, fmt.Errorf("no spot price history found for %s", key)
+	}
+	spotPrice := resp.SpotPriceHistory[0]
+
+	if spotPrice.SpotPrice == nil || spotPrice.Timestamp == nil {
+		return nil, fmt.Errorf("missing required spot price history data for %s (SpotPrice=%v, Timestamp=%v)", key, spotPrice.SpotPrice, spotPrice.Timestamp)
+	}
+	price, err := strconv.ParseFloat(*spotPrice.SpotPrice, 64)
+	if err != nil {
+		return nil, fmt.Errorf("parsing spot price: %w", err)
+	}
+	return &SpotPriceHistoryEntry{
+		SpotPrice:   price,
+		Timestamp:   *spotPrice.Timestamp,
+		RetrievedAt: time.Now(),
+	}, nil
+}

+ 239 - 0
pkg/cloud/aws/spotpricehistory_test.go

@@ -0,0 +1,239 @@
+package aws
+
+import (
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// mockSpotPriceHistoryFetcher is a test double for the fetcher dependency of
+// SpotPriceHistoryCache; fetchFunc, when non-nil, overrides the response.
+type mockSpotPriceHistoryFetcher struct {
+	fetchFunc func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error)
+}
+
+// FetchSpotPrice delegates to fetchFunc when provided; otherwise it returns
+// a canned 0.05 entry stamped with the current time.
+func (m *mockSpotPriceHistoryFetcher) FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+	if m.fetchFunc != nil {
+		return m.fetchFunc(key)
+	}
+	return &SpotPriceHistoryEntry{
+		SpotPrice:   0.05,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now(),
+	}, nil
+}
+
+// TestSpotPriceHistoryCache_GetSpotPrice_CacheHit verifies that a fresh,
+// pre-seeded cache entry is served as-is. The seeded price (0.08) differs
+// from the mock's default (0.05), so getting 0.08 proves the fetcher was
+// not consulted.
+func TestSpotPriceHistoryCache_GetSpotPrice_CacheHit(t *testing.T) {
+	mockFetcher := &mockSpotPriceHistoryFetcher{}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	region := "us-west-2"
+	instanceType := "m5.large"
+	availabilityZone := "us-west-2a"
+
+	key := SpotPriceHistoryKey{
+		Region:           region,
+		InstanceType:     instanceType,
+		AvailabilityZone: availabilityZone,
+	}
+
+	// Seed the unexported cache map directly to simulate a prior fetch.
+	cachedEntry := &SpotPriceHistoryEntry{
+		SpotPrice:   0.08,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now(),
+	}
+	cache.cache[key] = cachedEntry
+
+	entry, err := cache.GetSpotPrice(region, instanceType, availabilityZone)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if entry.SpotPrice != 0.08 {
+		t.Errorf("Expected spot price 0.08, got %f", entry.SpotPrice)
+	}
+}
+
+// TestSpotPriceHistoryCache_GetSpotPrice_CacheMiss verifies that an
+// unpopulated key triggers a fetch and that the fetched value is returned.
+func TestSpotPriceHistoryCache_GetSpotPrice_CacheMiss(t *testing.T) {
+	fetchCalled := false
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCalled = true
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.12,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if !fetchCalled {
+		t.Error("Expected fetcher to be called for cache miss")
+	}
+	if entry.SpotPrice != 0.12 {
+		t.Errorf("Expected spot price 0.12, got %f", entry.SpotPrice)
+	}
+}
+
+// TestSpotPriceHistoryCache_GetSpotPrice_ConcurrentSameKey asserts that 10
+// concurrent lookups of the same key result in exactly one underlying fetch,
+// with every caller observing the fetched price.
+func TestSpotPriceHistoryCache_GetSpotPrice_ConcurrentSameKey(t *testing.T) {
+	var fetchCount atomic.Int32
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCount.Add(1)
+			// Simulate slow API call to increase chance of concurrent access
+			time.Sleep(50 * time.Millisecond)
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.07,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	const goroutines = 10
+	var wg sync.WaitGroup
+	wg.Add(goroutines)
+	for i := 0; i < goroutines; i++ {
+		go func() {
+			defer wg.Done()
+			entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+			if err != nil {
+				t.Errorf("Expected no error, got %v", err)
+			}
+			if entry.SpotPrice != 0.07 {
+				t.Errorf("Expected spot price 0.07, got %f", entry.SpotPrice)
+			}
+		}()
+	}
+	wg.Wait()
+
+	if count := fetchCount.Load(); count != 1 {
+		t.Errorf("Expected exactly 1 fetch call, got %d", count)
+	}
+}
+
+// TestSpotPriceHistoryCache_GetSpotPrice_StaleEntry verifies that an entry
+// retrieved two hours ago — beyond SpotPriceHistoryCacheAge per the
+// shouldRefresh table test — is refetched and the refreshed price returned.
+func TestSpotPriceHistoryCache_GetSpotPrice_StaleEntry(t *testing.T) {
+	fetchCalled := false
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCalled = true
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.15,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+
+	// Seed an entry whose RetrievedAt is old enough to require a refresh.
+	staleEntry := &SpotPriceHistoryEntry{
+		SpotPrice:   0.08,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now().Add(-2 * time.Hour),
+	}
+	cache.cache[key] = staleEntry
+
+	entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if !fetchCalled {
+		t.Error("Expected fetcher to be called for stale entry")
+	}
+	if entry.SpotPrice != 0.15 {
+		t.Errorf("Expected refreshed spot price 0.15, got %f", entry.SpotPrice)
+	}
+}
+
+// TestSpotPriceHistoryCache_GetSpotPrice_FetchError verifies that a fetch
+// failure is propagated to the caller and that the failure is also recorded
+// on the cached entry's Error field (negative caching).
+func TestSpotPriceHistoryCache_GetSpotPrice_FetchError(t *testing.T) {
+	expectedError := errors.New("fetch failed")
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			return nil, expectedError
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	_, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err == nil {
+		t.Error("Expected error from failed fetch")
+	}
+	if !errors.Is(err, expectedError) {
+		t.Errorf("Expected error %v, got %v", expectedError, err)
+	}
+
+	// The failed lookup should still leave an entry in the cache with the
+	// original error attached.
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+	cachedEntry := cache.cache[key]
+	if !errors.Is(cachedEntry.Error, expectedError) {
+		t.Errorf("Expected cached entry error %v, got %v", expectedError, cachedEntry.Error)
+	}
+}
+
+// TestSpotPriceHistoryEntry_shouldRefresh is a table-driven check of the
+// staleness predicate at ages on both sides of SpotPriceHistoryCacheAge.
+func TestSpotPriceHistoryEntry_shouldRefresh(t *testing.T) {
+	now := time.Now()
+
+	tests := []struct {
+		name        string
+		retrievedAt time.Time
+		expected    bool
+	}{
+		{
+			name:        "fresh entry",
+			retrievedAt: now,
+			expected:    false,
+		},
+		{
+			name:        "stale entry",
+			retrievedAt: now.Add(-2 * time.Hour),
+			expected:    true,
+		},
+		{
+			// One minute inside the cache-age window: still fresh.
+			name:        "borderline entry",
+			retrievedAt: now.Add(-SpotPriceHistoryCacheAge + 1*time.Minute),
+			expected:    false,
+		},
+		{
+			// One minute past the cache-age window: must refresh.
+			name:        "expired entry",
+			retrievedAt: now.Add(-SpotPriceHistoryCacheAge - 1*time.Minute),
+			expected:    true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			entry := SpotPriceHistoryEntry{
+				RetrievedAt: tt.retrievedAt,
+			}
+			if got := entry.shouldRefresh(); got != tt.expected {
+				t.Errorf("shouldRefresh() = %v, want %v", got, tt.expected)
+			}
+		})
+	}
+}
+
+// TestSpotPriceHistoryKey_String pins the "Region/InstanceType/AZ" format of
+// the key's String method.
+func TestSpotPriceHistoryKey_String(t *testing.T) {
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+	expected := "us-west-2/m5.large/us-west-2a"
+	if got := key.String(); got != expected {
+		t.Errorf("String() = %v, want %v", got, expected)
+	}
+}

+ 0 - 1
pkg/cloud/provider/providerconfig.go

@@ -88,7 +88,6 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*models.CustomPrici
 
 	// File Doesn't Exist
 	if !exists {
-		log.Infof("Could not find Custom Pricing file at path '%s'", pc.configFile.Path())
 		pc.customPricing = DefaultPricing()
 		// If config file is not present use the contents from mount models/ as pricing data
 		// in closed source rather than from DefaultPricing as first source of truth.

+ 4 - 0
pkg/cmd/costmodel/costmodel.go

@@ -89,6 +89,10 @@ func Execute(conf *Config) error {
 		}
 	} else if conf.MCPServerEnabled {
 		log.Warnf("MCP Server is enabled but Kubernetes is not available. MCP server requires Kubernetes to function.")
+	} else {
+		if value, exists := os.LookupEnv(env.MCPServerEnabledEnvVar); !exists || value == "" {
+			log.Infof("MCP server is now disabled by default. If you wish to use the MCP server, please set the %s environment variable to true.", env.MCPServerEnabledEnvVar)
+		}
 	}
 
 	apiutil.ApplyContainerDiagnosticEndpoints(router)

+ 9 - 0
pkg/costmodel/csv_export.go

@@ -183,6 +183,15 @@ func (e *csvExporter) writeCSVToWriter(ctx context.Context, w io.Writer, dates [
 				return data.alloc.Properties.Namespace
 			},
 		},
+		{
+			column: "Cluster",
+			value: func(data rowData) string {
+				if data.alloc.Properties == nil {
+					return ""
+				}
+				return data.alloc.Properties.Cluster
+			},
+		},
 		{
 			column: "ControllerKind",
 			value: func(data rowData) string {

+ 8 - 7
pkg/costmodel/csv_export_test.go

@@ -49,6 +49,7 @@ func Test_UpdateCSV(t *testing.T) {
 								},
 							}, // 2 PVBytes, 2 PVCost
 							Properties: &opencost.AllocationProperties{
+								Cluster:        "test-cluster",
 								Namespace:      "test-namespace",
 								Controller:     "test-controller-name",
 								ControllerKind: "test-controller-kind",
@@ -66,8 +67,8 @@ func Test_UpdateCSV(t *testing.T) {
 		assert.Len(t, model.ComputeAllocationCalls(), 1)
 		assert.Equal(t, time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].Start)
 		assert.Equal(t, time.Date(2021, 1, 2, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].End)
-		assert.Equal(t, `Date,Namespace,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost
-2021-01-01,test-namespace,test-controller-kind,test-controller-name,test-pod,test-container,0.1,0.2,0.4,0.5,11,10,2,2,0.3,0.6,0.9,2,0.8,4.6000000000000005
+		assert.Equal(t, `Date,Namespace,Cluster,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost
+2021-01-01,test-namespace,test-cluster,test-controller-kind,test-controller-name,test-pod,test-container,0.1,0.2,0.4,0.5,11,10,2,2,0.3,0.6,0.9,2,0.8,4.6000000000000005
 `, string(storage.Data))
 	})
 
@@ -101,8 +102,8 @@ func Test_UpdateCSV(t *testing.T) {
 		require.NoError(t, err)
 		// uploaded a single file with the data
 		assert.Len(t, model.ComputeAllocationCalls(), 1)
-		assert.Equal(t, `Date,Namespace,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost,Labels,Label_test-label1,Label_test-label2
-2021-01-01,test-namespace,test-controller-kind,test-controller-name,test-pod,test-container,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{""test-label1"":""test-value1"",""test-label2"":""test-value2""}",test-value1,test-value2
+		assert.Equal(t, `Date,Namespace,Cluster,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost,Labels,Label_test-label1,Label_test-label2
+2021-01-01,test-namespace,,test-controller-kind,test-controller-name,test-pod,test-container,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{""test-label1"":""test-value1"",""test-label2"":""test-value2""}",test-value1,test-value2
 `, string(storage.Data))
 	})
 
@@ -137,9 +138,9 @@ func Test_UpdateCSV(t *testing.T) {
 		// 2021-01-01 is already in the export file, so we only compute for 2021-01-02
 		assert.Equal(t, time.Date(2021, 1, 2, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].Start)
 		assert.Equal(t, time.Date(2021, 1, 3, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].End)
-		assert.Equal(t, `Date,Namespace,CPUCoreUsageAverage,CPUCoreRequestAverage,CPUCost,RAMBytesUsageAverage,RAMBytesRequestAverage,RAMCost,Label_app,ControllerKind,ControllerName,Pod,Container,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,NetworkCost,PVCost,GPUCost,TotalCost
-2021-01-01,test-namespace,0.1,0.2,0.3,0.4,0.5,0.6,app1,,,,,,,,,,,,
-2021-01-02,test-namespace,0,0,1,0,0,0,,,,,,0,0,0,0,0,0,0,1
+		assert.Equal(t, `Date,Namespace,CPUCoreUsageAverage,CPUCoreRequestAverage,CPUCost,RAMBytesUsageAverage,RAMBytesRequestAverage,RAMCost,Label_app,Cluster,ControllerKind,ControllerName,Pod,Container,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,NetworkCost,PVCost,GPUCost,TotalCost
+2021-01-01,test-namespace,0.1,0.2,0.3,0.4,0.5,0.6,app1,,,,,,,,,,,,,
+2021-01-02,test-namespace,0,0,1,0,0,0,,,,,,,0,0,0,0,0,0,0,1
 `, string(storage.Data))
 	})
 

+ 21 - 3
pkg/customcost/ingestor.go

@@ -21,6 +21,12 @@ import (
 	"github.com/opencost/opencost/pkg/env"
 )
 
+// pluginConnector abstracts *plugin.Client so buildSingleDomain can be tested with mocks.
+type pluginConnector interface {
+	// Client returns the protocol client used to dispense plugin implementations.
+	Client() (plugin.ClientProtocol, error)
+	// Kill terminates the underlying plugin process.
+	Kill()
+}
+
 // IngestorStatus includes diagnostic values for a given Ingestor
 type IngestorStatus struct {
 	Created     time.Time
@@ -65,14 +71,22 @@ type CustomCostIngestor struct {
 	isStopping   atomic.Bool
 	exitBuildCh  chan string
 	exitRunCh    chan string
-	plugins      map[string]*plugin.Client
+	plugins      map[string]pluginConnector
 	pluginsLock  sync.RWMutex
 	resolution   time.Duration
 	refreshRate  time.Duration
 }
 
-// NewIngestor is an initializer for ingestor
+// NewCustomCostIngestor is an initializer for ingestor. It wraps the concrete
+// *plugin.Client values in the pluginConnector interface and delegates to
+// newCustomCostIngestor, which holds the shared construction logic.
+func NewCustomCostIngestor(ingestorConfig *CustomCostIngestorConfig, repo Repository, plugins map[string]*plugin.Client, res time.Duration) (*CustomCostIngestor, error) {
+	// *plugin.Client satisfies pluginConnector, so each value is copied over
+	// unchanged; only the map's declared value type differs.
+	connectors := make(map[string]pluginConnector, len(plugins))
+	for k, v := range plugins {
+		connectors[k] = v
+	}
+	return newCustomCostIngestor(ingestorConfig, repo, connectors, res)
+}
+
+func newCustomCostIngestor(ingestorConfig *CustomCostIngestorConfig, repo Repository, plugins map[string]pluginConnector, res time.Duration) (*CustomCostIngestor, error) {
 	if repo == nil {
 		return nil, fmt.Errorf("CustomCost: NewCustomCostIngestor: repository connot be nil")
 	}
@@ -194,7 +208,11 @@ func (ing *CustomCostIngestor) buildSingleDomain(start, end time.Time, domain st
 		return
 	}
 
-	custCostSrc := raw.(ocplugin.CustomCostSource)
+	custCostSrc, ok := raw.(ocplugin.CustomCostSource)
+	if !ok {
+		log.Errorf("plugin %s returned invalid type: expected CustomCostSource, got %T", domain, raw)
+		return
+	}
 
 	custCostResps := custCostSrc.GetCustomCosts(req)
 	// loop through each customCostResponse, adding to repo

+ 95 - 5
pkg/customcost/ingestor_test.go

@@ -1,6 +1,7 @@
 package customcost
 
 import (
+	"fmt"
 	"os/exec"
 	"runtime"
 	"sync"
@@ -8,6 +9,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/go-plugin"
+	"github.com/opencost/opencost/core/pkg/opencost"
 )
 
 func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
@@ -25,7 +27,7 @@ func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
 	_, _ = client.Client()
 
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{
+		plugins: map[string]pluginConnector{
 			"test-plugin": client,
 		},
 	}
@@ -37,6 +39,7 @@ func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
 }
 
 func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
+	connectors := make(map[string]pluginConnector)
 	clients := make(map[string]*plugin.Client)
 	for _, name := range []string{"plugin-a", "plugin-b", "plugin-c"} {
 		cmd := exec.Command("sleep", "60")
@@ -50,10 +53,11 @@ func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
 			StartTimeout: 2 * time.Second,
 		})
 		_, _ = client.Client()
+		connectors[name] = client
 		clients[name] = client
 	}
 
-	ingestor := &CustomCostIngestor{plugins: clients}
+	ingestor := &CustomCostIngestor{plugins: connectors}
 	ingestor.Stop()
 
 	for name, client := range clients {
@@ -65,7 +69,7 @@ func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
 
 func TestIngestor_Stop_EmptyPluginsMap(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 	ingestor.Stop() // covers lock path with 0 iterations
 }
@@ -77,7 +81,7 @@ func TestIngestor_Stop_NilPluginsMap(t *testing.T) {
 
 func TestIngestor_Stop_AlreadyStopping(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 	ingestor.isStopping.Store(true) // atomic.Bool must use Store()!
 	ingestor.Stop()                 // should return immediately
@@ -85,7 +89,7 @@ func TestIngestor_Stop_AlreadyStopping(t *testing.T) {
 
 func TestIngestor_Stop_ConcurrentCalls(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 
 	var wg sync.WaitGroup
@@ -187,3 +191,89 @@ func TestIngestor_BuildWindow_WithPlugin(t *testing.T) {
 	// BuildWindow and buildSingleDomain; client.Client() fails fast (false exits)
 	ingestor.BuildWindow(now.Add(-time.Hour), now)
 }
+
+// mockClientProtocol implements plugin.ClientProtocol for testing.
+type mockClientProtocol struct {
+	dispenseResult interface{} // value returned by Dispense
+	dispenseErr    error       // error returned by Dispense, if set
+}
+
+// Dispense returns the configured result/error regardless of the plugin name.
+func (m *mockClientProtocol) Dispense(string) (interface{}, error) {
+	return m.dispenseResult, m.dispenseErr
+}
+func (m *mockClientProtocol) Ping() error  { return nil }
+func (m *mockClientProtocol) Close() error { return nil }
+
+// mockPluginConnector implements pluginConnector for testing.
+type mockPluginConnector struct {
+	protocol  plugin.ClientProtocol // protocol handed out by Client
+	clientErr error                 // error returned by Client, if set
+	killed    bool                  // records whether Kill was invoked
+}
+
+// Client returns the configured protocol, or clientErr when one is set.
+func (m *mockPluginConnector) Client() (plugin.ClientProtocol, error) {
+	if m.clientErr != nil {
+		return nil, m.clientErr
+	}
+	return m.protocol, nil
+}
+
+// Kill records the call so tests can assert shutdown behavior.
+func (m *mockPluginConnector) Kill() { m.killed = true }
+
+// TestBuildSingleDomain_InvalidPluginType_NoPanic is a regression test: a
+// plugin that dispenses a value of the wrong type must be logged and skipped
+// rather than causing a type-assertion panic.
+func TestBuildSingleDomain_InvalidPluginType_NoPanic(t *testing.T) {
+	mock := &mockPluginConnector{
+		protocol: &mockClientProtocol{
+			dispenseResult: "not a CustomCostSource", // wrong type
+		},
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"bad-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Before the fix this would panic; now it should log an error and return.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}
+
+// TestBuildSingleDomain_DispenseError covers the path where Dispense itself
+// fails; the build must continue without panicking.
+func TestBuildSingleDomain_DispenseError(t *testing.T) {
+	mock := &mockPluginConnector{
+		protocol: &mockClientProtocol{
+			dispenseErr: fmt.Errorf("dispense failed"),
+		},
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"err-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Should handle the error gracefully without panic.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}
+
+// TestBuildSingleDomain_ClientError covers the path where obtaining the
+// protocol client fails before Dispense is ever reached.
+func TestBuildSingleDomain_ClientError(t *testing.T) {
+	mock := &mockPluginConnector{
+		clientErr: fmt.Errorf("connection failed"),
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"fail-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Should handle the error gracefully without panic.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}

+ 7 - 8
pkg/customcost/pipelineservice_test.go

@@ -10,7 +10,6 @@ import (
 	"testing"
 	"time"
 
-	"github.com/hashicorp/go-plugin"
 	"github.com/opencost/opencost/core/pkg/log"
 	"github.com/opencost/opencost/core/pkg/util/timeutil"
 )
@@ -280,7 +279,7 @@ func TestPipelineService_Stop_WithNilIngestors(t *testing.T) {
 func TestPipelineService_Stop_PartialNilIngestors(t *testing.T) {
 	hourly := &CustomCostIngestor{
 		key:     "hourly",
-		plugins: make(map[string]*plugin.Client),
+		plugins: make(map[string]pluginConnector),
 	}
 
 	ps := &PipelineService{
@@ -298,11 +297,11 @@ func TestPipelineService_Stop_ShutdownLogging(t *testing.T) {
 	ps := &PipelineService{
 		hourlyIngestor: &CustomCostIngestor{
 			key:     "hourly",
-			plugins: make(map[string]*plugin.Client),
+			plugins: make(map[string]pluginConnector),
 		},
 		dailyIngestor: &CustomCostIngestor{
 			key:     "daily",
-			plugins: make(map[string]*plugin.Client),
+			plugins: make(map[string]pluginConnector),
 		},
 		domains: []string{},
 	}
@@ -324,8 +323,8 @@ func TestPipelineService_Stop_NilIngestors(t *testing.T) {
 }
 
 func TestPipelineService_Stop_WithIngestors(t *testing.T) {
-	hourly := &CustomCostIngestor{plugins: map[string]*plugin.Client{}}
-	daily := &CustomCostIngestor{plugins: map[string]*plugin.Client{}}
+	hourly := &CustomCostIngestor{plugins: map[string]pluginConnector{}}
+	daily := &CustomCostIngestor{plugins: map[string]pluginConnector{}}
 	ps := &PipelineService{
 		hourlyIngestor: hourly,
 		dailyIngestor:  daily,
@@ -335,14 +334,14 @@ func TestPipelineService_Stop_WithIngestors(t *testing.T) {
 
 func TestPipelineService_Stop_OnlyHourlyIngestor(t *testing.T) {
 	ps := &PipelineService{
-		hourlyIngestor: &CustomCostIngestor{plugins: map[string]*plugin.Client{}},
+		hourlyIngestor: &CustomCostIngestor{plugins: map[string]pluginConnector{}},
 	}
 	ps.Stop() // should not panic when dailyIngestor is nil
 }
 
 func TestPipelineService_Stop_OnlyDailyIngestor(t *testing.T) {
 	ps := &PipelineService{
-		dailyIngestor: &CustomCostIngestor{plugins: map[string]*plugin.Client{}},
+		dailyIngestor: &CustomCostIngestor{plugins: map[string]pluginConnector{}},
 	}
 	ps.Stop() // should not panic when hourlyIngestor is nil
 }

+ 1 - 1
pkg/env/costmodel.go

@@ -443,7 +443,7 @@ func GetOVHMonthlyNodepools() []string {
 // IsMCPServerEnabled returns the environment variable value for MCPServerEnabledEnvVar which represents
 // whether or not the MCP server is enabled.
 func IsMCPServerEnabled() bool {
-	return env.GetBool(MCPServerEnabledEnvVar, true)
+	return env.GetBool(MCPServerEnabledEnvVar, false)
 }
 
 // GetMCPHTTPPort returns the environment variable value for MCPHTTPPortEnvVar which represents

+ 22 - 0
pkg/env/costmodel_test.go

@@ -79,3 +79,25 @@ func TestGetKubernetesEnabled(t *testing.T) {
 	}
 
 }
+
+// TestIsMCPServerEnabled_DefaultFalse asserts the new default: the MCP
+// server is off when MCP_SERVER_ENABLED is absent from the environment.
+func TestIsMCPServerEnabled_DefaultFalse(t *testing.T) {
+	// Manual save/unset/restore is required here because t.Setenv can only
+	// set a variable, not remove it.
+	old, hadOld := os.LookupEnv("MCP_SERVER_ENABLED")
+	os.Unsetenv("MCP_SERVER_ENABLED")
+	t.Cleanup(func() {
+		if hadOld {
+			os.Setenv("MCP_SERVER_ENABLED", old)
+		} else {
+			os.Unsetenv("MCP_SERVER_ENABLED")
+		}
+	})
+	if got := IsMCPServerEnabled(); got {
+		t.Fatalf("expected false when MCP_SERVER_ENABLED is unset, got %v", got)
+	}
+}
+
+// TestIsMCPServerEnabled_True confirms the env var still opts the server in.
+func TestIsMCPServerEnabled_True(t *testing.T) {
+	t.Setenv("MCP_SERVER_ENABLED", "true")
+	if got := IsMCPServerEnabled(); !got {
+		t.Fatalf("expected true when env var set to true, got %v", got)
+	}
+}

Некоторые файлы не были показаны из-за большого количества измененных файлов