Просмотр исходного кода

Merge commit '50eb7e4355aef634b9c7fe9496248f0942fc13d8' into memory-tweaks-2

# Conflicts:
#	core/pkg/opencost/opencost_codecs.go
#	core/pkg/util/buffer.go
#	core/pkg/util/bufferhelper.go
#	core/pkg/util/bufferpool.go
#	core/pkg/util/monitor/memory/helpers.go
#	core/pkg/util/monitor/memory/helpers_test.go
#	core/pkg/util/monitor/memory/memorylimitstats.go
#	core/pkg/util/monitor/memory/memorylimitstats_test.go
#	core/pkg/util/stringutil/lrubank.go
#	core/pkg/util/stringutil/stringutil_test.go
Alex Meijer 4 недель назад
Родитель
Сommit
b982e7e727
39 измененных файлов с 3007 добавлено и 592 удалено
  1. 193 0
      .github/workflows/update-go-version.yaml
  2. 1 1
      CLAUDE.md
  3. 25 0
      CONTRIBUTING.md
  4. 0 61
      Dockerfile
  5. 0 30
      Dockerfile.debug
  6. 11 11
      README.md
  7. 461 160
      core/pkg/opencost/opencost_codecs.go
  8. 219 41
      core/pkg/opencost/opencost_codecs_test.go
  9. 35 38
      core/pkg/util/buffer.go
  10. 6 6
      core/pkg/util/buffer_test.go
  11. 3 3
      core/pkg/util/bufferhelper.go
  12. 26 27
      core/pkg/util/bufferpool.go
  13. 311 0
      core/pkg/util/bufferpool_test.go
  14. 6 17
      core/pkg/util/monitor/memory/helpers.go
  15. 20 22
      core/pkg/util/monitor/memory/helpers_test.go
  16. 7 8
      core/pkg/util/monitor/memory/memorylimitstats.go
  17. 1 6
      core/pkg/util/monitor/memory/memorylimitstats_test.go
  18. 1 0
      core/pkg/util/stringutil/lrubank.go
  19. 26 0
      core/pkg/util/stringutil/stringutil_test.go
  20. 37 0
      docs/ROADMAP.md
  21. 1 1
      go.mod
  22. 2 2
      go.sum
  23. 140 91
      pkg/carbon/carbonassets.go
  24. 254 0
      pkg/carbon/carbonassets_test.go
  25. 45 8
      pkg/cloud/alibaba/provider.go
  26. 69 3
      pkg/cloud/alibaba/provider_test.go
  27. 124 24
      pkg/cloud/aws/provider.go
  28. 377 7
      pkg/cloud/aws/provider_test.go
  29. 200 0
      pkg/cloud/aws/spotpricehistory.go
  30. 239 0
      pkg/cloud/aws/spotpricehistory_test.go
  31. 0 1
      pkg/cloud/provider/providerconfig.go
  32. 4 0
      pkg/cmd/costmodel/costmodel.go
  33. 9 0
      pkg/costmodel/csv_export.go
  34. 8 7
      pkg/costmodel/csv_export_test.go
  35. 21 3
      pkg/customcost/ingestor.go
  36. 95 5
      pkg/customcost/ingestor_test.go
  37. 7 8
      pkg/customcost/pipelineservice_test.go
  38. 1 1
      pkg/env/costmodel.go
  39. 22 0
      pkg/env/costmodel_test.go

+ 193 - 0
.github/workflows/update-go-version.yaml

@@ -0,0 +1,193 @@
+name: Update Go Version in "go.mod"
+
+on:
+  schedule:
+    - cron: '0 14 * * 1'  # Run every Monday at 2 PM UTC
+  workflow_dispatch:  # Allow manual triggering
+
+
+concurrency:
+  group: golang-ver-bump
+  cancel-in-progress: true
+
+jobs:
+  check-and-update-go:
+    name: Check and Update Go Version
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        repository: 'opencost/opencost'
+        ref: 'develop'
+        fetch-depth: 0
+        path: ./opencost 
+
+    - name: Setup Go
+      uses: actions/setup-go@v6
+      with:
+         go-version: 'stable'
+
+    - name: Check for existing open PRs
+      id: check-existing-prs
+      run: |
+        # Check if there are any open PRs with the go-version-update label
+        EXISTING_PRS=$(gh pr list --label "go-version-update" --state open --json number,title,url)
+        if [ "$(echo "$EXISTING_PRS" | jq 'length')" -gt 0 ]; then
+          echo "Found existing open PR(s) with go-version-update label:"
+          echo "$EXISTING_PRS" | jq -r '.[] | "  - #\(.number): \(.title) (\(.url))"'
+          echo "skip_update=true" >> $GITHUB_OUTPUT
+        else
+          echo "No existing open PRs found, proceeding with update check"
+          echo "skip_update=false" >> $GITHUB_OUTPUT
+        fi
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN}}
+        GH_REPO: ${{ github.repository }}
+
+    - name: Get current Go version from go.mod
+      id: current-version
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        CURRENT_VERSION=$(grep '^go ' go.mod | awk '{print $2}')
+        echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
+        echo "Current Go version: $CURRENT_VERSION"
+
+    - name: Get latest Go version
+      id: latest-version
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        # Get the latest Go version from golang.org/dl
+        LATEST_VERSION=$(curl -sS https://go.dev/VERSION?m=text | grep -o 'go[0-9]\+\.[0-9]\+\.[0-9]\+' | head -1)
+        echo "latest_version=$LATEST_VERSION" >> $GITHUB_OUTPUT
+        echo "Latest Go version: $LATEST_VERSION"
+
+    - name: Compare versions
+      id: version-check
+      working-directory: ./opencost
+      if: steps.check-existing-prs.outputs.skip_update == 'false'
+      run: |
+        CURRENT_VERSION="${{ steps.current-version.outputs.current_version }}"
+        LATEST_VERSION="${{ steps.latest-version.outputs.latest_version }}"
+
+        # Remove 'go' prefix for comparison
+        CURRENT_NUM=$(echo "$CURRENT_VERSION" | sed 's/^go//')
+        LATEST_NUM=$(echo "$LATEST_VERSION" | sed 's/^go//')
+
+        echo "Raw current version: $CURRENT_VERSION"
+        echo "Raw latest version: $LATEST_VERSION"
+        echo "Current version number: $CURRENT_NUM"
+        echo "Latest version number: $LATEST_NUM"
+
+        # Validate that both versions are non-empty and match an expected pattern (e.g., 1.22.1)
+        VERSION_REGEX='^[0-9]+\.[0-9]+(\.[0-9]+)?$'
+
+        if ! echo "$CURRENT_NUM" | grep -Eq "$VERSION_REGEX"; then
+          echo "Error: CURRENT_VERSION '$CURRENT_VERSION' (normalized: '$CURRENT_NUM') is invalid or empty."
+          exit 1
+        fi
+
+        if ! echo "$LATEST_NUM" | grep -Eq "$VERSION_REGEX"; then
+          echo "Error: LATEST_VERSION '$LATEST_VERSION' (normalized: '$LATEST_NUM') is invalid or empty."
+          exit 1
+        fi
+
+        if [ "$CURRENT_NUM" = "$LATEST_NUM" ]; then
+          echo "Go version is up to date"
+          echo "update_needed=false" >> $GITHUB_OUTPUT
+        else
+          # Use version-aware sorting to determine which version is greater
+          HIGHEST_VERSION=$(printf '%s\n' "$CURRENT_NUM" "$LATEST_NUM" | sort -V | tail -n1)
+
+          if [ "$HIGHEST_VERSION" = "$LATEST_NUM" ]; then
+            echo "Newer Go version available: $LATEST_VERSION (current: $CURRENT_VERSION)"
+            echo "update_needed=true" >> $GITHUB_OUTPUT
+            echo "new_version=$LATEST_VERSION" >> $GITHUB_OUTPUT
+          else
+            echo "Current Go version ($CURRENT_VERSION) is newer than or equal to latest reported version ($LATEST_VERSION); no update needed."
+            echo "update_needed=false" >> $GITHUB_OUTPUT
+          fi
+        fi
+
+    - name: Update go.mod with new Go version
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        NEW_VERSION="$(echo '${{ steps.version-check.outputs.new_version }}' | tr -d 'go')"
+        echo "Updating all go.mod files under $(pwd) to use Go version: $NEW_VERSION"
+
+        # Recursively update every go.mod we find under the working directory.
+        # (Skip vendor directories to avoid accidental edits of vendored content.)
+        while IFS= read -r go_mod_file; do
+          echo "Updating: $go_mod_file"
+          sed -i -E "s/^go[[:space:]].*/go ${NEW_VERSION}/" "$go_mod_file"
+        done < <(find . -name go.mod -type f -not -path "*/vendor/*")
+        
+
+    - name: Run go mod tidy
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        echo "Running go mod tidy for every module found under $(pwd)"
+
+        # Tidy once per module directory (dirname of each go.mod we find).
+        # Skip vendor directories to avoid editing vendored content.
+        mapfile -t tidy_dirs < <(find . -name go.mod -type f -not -path "*/vendor/*" -exec dirname {} \; | sort -u)
+        for d in "${tidy_dirs[@]}"; do
+          echo "Tidying: $d"
+          (cd "$d" && go mod tidy)
+        done
+
+    - name: Check for changes
+      id: changes
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true'
+      working-directory: ./opencost
+      run: |
+        if [ -z "$(git status --porcelain)" ]; then
+          echo "No changes detected, skipping PR creation"
+          echo "skip_pr=true" >> $GITHUB_OUTPUT
+        else
+          echo "Changes detected, will create PR" 
+          echo "skip_pr=false" >> $GITHUB_OUTPUT
+          # Generate a unique branch name with timestamp
+          branch_name="update-go-version-$(date +%Y%m%d-%H%M%S)"
+          echo "branch_name=$branch_name" >> $GITHUB_OUTPUT
+        fi
+
+    - name: Create Pull Request
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'true' && steps.changes.outputs.skip_pr == 'false'
+      id: create-pr
+      uses: peter-evans/create-pull-request@v8
+      with:
+        path: ./opencost
+        token: ${{ secrets.GITHUB_TOKEN }}
+        branch: ${{ steps.changes.outputs.branch_name }}
+        title: 'chore: update Go version to ${{ steps.version-check.outputs.new_version }}'
+        commit-message: 'chore: update Go version to ${{ steps.version-check.outputs.new_version }}'
+        delete-branch: true
+        add-paths: |
+          go.mod
+          go.sum
+          **/go.mod
+          **/go.sum
+        body: |
+          Automated PR to update Go version from ${{ steps.current-version.outputs.current_version }} to ${{ steps.version-check.outputs.new_version }}.
+          
+          This PR was automatically generated after detecting a newer Go version is available.
+        base: develop
+        labels: automated,go-version-update
+    
+
+    - name: Skip message - no update needed
+      if: steps.check-existing-prs.outputs.skip_update == 'false' && steps.version-check.outputs.update_needed == 'false'
+      run: |
+        echo "No Go version update needed. Current version is up to date."
+
+    - name: Skip message - existing PR
+      if: steps.check-existing-prs.outputs.skip_update == 'true'
+      run: |
+        echo "Skipping Go version update because an existing PR with go-version-update label is already open." 

+ 1 - 1
CLAUDE.md

@@ -144,7 +144,7 @@ just validate-protobuf
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `MCP_SERVER_ENABLED` | `true` | Enable MCP server |
+| `MCP_SERVER_ENABLED` | `false` | Enable MCP server |
 | `MCP_HTTP_PORT` | `8081` | MCP server HTTP port |
 
 ### Cloud Providers

+ 25 - 0
CONTRIBUTING.md

@@ -126,6 +126,31 @@ To run these tests:
 - Navigate to cost-model/test
 - Run `go test -timeout 700s` from the testing directory. The tests right now take about 10 minutes (600s) to run because they bring up and down pods and wait for Prometheus to scrape data about them.
 
+## Code Review Standards
+
+All pull requests must be reviewed before merging. The review process ensures:
+
+### What reviewers check:
+- **Correctness:** Does the code do what it claims?
+- **Tests:** Are new features and bug fixes covered by tests?
+- **Style:** Does the code follow Go conventions (`gofmt`, `go vet`)?
+- **Security:** Are inputs validated? Are credentials handled safely?
+- **Performance:** Are there obvious performance issues (unbounded allocations, N+1 queries)?
+
+### Review requirements:
+- At least one approval from a Committer or Maintainer is required
+- The reviewer must be a different person than the PR author
+- For security-sensitive changes, review by a Maintainer is required
+- Emergency fixes may bypass review with post-merge review required within 48 hours (per [GOVERNANCE.md](GOVERNANCE.md))
+
+## Regression Tests
+
+When fixing a bug, contributors SHOULD add a test that reproduces the bug before applying the fix. This ensures the bug does not recur. As a project-wide goal, at least 50% of bugs fixed in any six-month window should have corresponding regression tests. This is tracked by maintainers using issues labeled `bug` and measured during release reviews; it is an aspirational target for the project as a whole, not a requirement applied to individual contributors.
+
+## Finding Issues to Work On
+
+Look for issues labeled [`good first issue`](https://github.com/opencost/opencost/labels/good%20first%20issue) or [`help wanted`](https://github.com/opencost/opencost/labels/help%20wanted) for a curated list of tasks suitable for new contributors.
+
 ## Certificate of Origin
 
 By contributing to this project, you certify that your contribution was created in whole or in part by you and that you have the right to submit it under the open source license indicated in the project. In other words, please confirm that you, as a contributor, have the legal right to make the contribution. This is enforced on Pull Requests and requires `Signed-off-by` with the email address for the author in the commit message.

+ 0 - 61
Dockerfile

@@ -1,61 +0,0 @@
-FROM --platform=$BUILDPLATFORM golang:1.25.5-alpine3.21 AS build-env
-
-WORKDIR /app
-
-# This ensures that CGO is disabled for go test running AND for the build
-# step. This prevents a build failure when building an ARM64 image with
-# docker buildx. I believe this is because the ARM64 version of the
-# golang:latest image does not contain GCC, while the AMD64 version does.
-ARG CGO_ENABLED=0
-
-# Get dependencies - will also be cached if we won't change mod/sum
-COPY go.* .
-COPY core/go.* core/
-COPY modules/collector-source/go.* modules/collector-source/
-COPY modules/prometheus-source/go.* modules/prometheus-source/
-RUN go mod download
-
-ARG version=dev
-ARG	commit=HEAD
-
-ARG TARGETOS
-ARG TARGETARCH
-ENV GOOS=$TARGETOS
-ENV GOARCH=$TARGETARCH
-
-# COPY the source code as the last step
-COPY . .
-
-# Build the binary
-RUN set -e ;\
-    cd cmd/costmodel;\
-    go build -a -installsuffix cgo \
-    -ldflags \
-    "-X github.com/opencost/opencost/core/pkg/version.Version=${version} \
-    -X github.com/opencost/opencost/core/pkg/version.GitCommit=${commit}" \
-    -o /go/bin/app
-
-FROM alpine:latest
-
-LABEL org.opencontainers.image.description="Cross-cloud cost allocation models for Kubernetes workloads"
-LABEL org.opencontainers.image.documentation=https://opencost.io/docs/
-LABEL org.opencontainers.image.licenses=Apache-2.0
-LABEL org.opencontainers.image.source=https://github.com/opencost/opencost
-LABEL org.opencontainers.image.title=kubecost-cost-model
-LABEL org.opencontainers.image.url=https://opencost.io
-
-RUN apk add --update --no-cache ca-certificates
-
-COPY --from=build-env /go/bin/app /go/bin/app
-ADD --chmod=400 ./THIRD_PARTY_LICENSES.txt /THIRD_PARTY_LICENSES.txt
-ADD --chmod=500 ./configs/default.json /models/default.json
-ADD --chmod=500 ./configs/azure.json /models/azure.json
-ADD --chmod=500 ./configs/aws.json /models/aws.json
-ADD --chmod=500 ./configs/gcp.json /models/gcp.json
-ADD --chmod=500 ./configs/alibaba.json /models/alibaba.json
-ADD --chmod=500 ./configs/oracle.json /models/oracle.json
-ADD --chmod=500 ./configs/otc.json /models/otc.json
-RUN chown -R 1001:1001 /models
-
-USER 1001
-ENTRYPOINT ["/go/bin/app"]

+ 0 - 30
Dockerfile.debug

@@ -1,30 +0,0 @@
-# This dockerfile is for development purposes only; do not use this for production deployments
-FROM golang:alpine
-# The prebuilt binary path. This Dockerfile assumes the binary will be built
-# outside of Docker.
-ARG binary_path
-
-LABEL org.opencontainers.image.description="Cross-cloud cost allocation models for Kubernetes workloads"
-LABEL org.opencontainers.image.documentation=https://opencost.io/docs/
-LABEL org.opencontainers.image.licenses=Apache-2.0
-LABEL org.opencontainers.image.source=https://github.com/opencost/opencost
-LABEL org.opencontainers.image.title=kubecost-cost-model
-LABEL org.opencontainers.image.url=https://opencost.io
-
-WORKDIR /app
-RUN apk add --update --no-cache ca-certificates
-RUN go install github.com/go-delve/delve/cmd/dlv@latest
-
-ADD --chmod=644 ./THIRD_PARTY_LICENSES.txt /THIRD_PARTY_LICENSES.txt
-ADD --chmod=644 ./configs/default.json /models/default.json
-ADD --chmod=644 ./configs/azure.json /models/azure.json
-ADD --chmod=644 ./configs/aws.json /models/aws.json
-ADD --chmod=644 ./configs/gcp.json /models/gcp.json
-ADD --chmod=644 ./configs/alibaba.json /models/alibaba.json
-ADD --chmod=644 ./configs/oracle.json /models/oracle.json
-ADD --chmod=644 ./configs/otc.json /models/otc.json
-
-COPY ${binary_path} main
-
-ENTRYPOINT ["/go/bin/dlv exec --listen=:40000 --api-version=2 --headless=true --accept-multiclient --log --continue /app/main"]
-EXPOSE 9003 40000

+ 11 - 11
README.md

@@ -54,17 +54,17 @@ Note: The standalone Kubernetes manifest files have been removed. Please use Hel
 
 ## MCP Server
 
-The OpenCost MCP (Model Context Protocol) server provides AI agents with access to cost allocation and asset data through a standardized interface. The MCP server is **enabled by default** in all OpenCost deployments, runs on port 8081, and is **built into the Helm chart** for easy production deployment. Users have full control to disable it or configure custom ports and settings.
+The OpenCost MCP (Model Context Protocol) server provides AI agents with access to cost allocation and asset data through a standardized interface. The MCP server is **disabled by default** (opt-in) in all OpenCost deployments, runs on port 8081, and is **built into the Helm chart** for easy production deployment. Users have full control to enable it or configure custom ports and settings.
 
 ### Features
 
-- **Enabled by Default**: MCP server starts automatically with OpenCost
-- **Full User Control**: Easy to disable or configure port and settings
+- **Opt-in by Default**: MCP server is disabled by default to minimize the attack surface and must be explicitly enabled
+- **Full User Control**: Easy to enable or configure port and settings
 - **Allocation Queries**: Retrieve cost allocation data with filtering and aggregation
 - **Asset Queries**: Access detailed asset information including nodes, disks, load balancers, and more
 - **Cloud Cost Queries**: Query cloud cost data with provider, service, and region filtering
 - **HTTP Transport**: Uses HTTP for reliable communication with MCP clients
-- **Zero Configuration**: Works out of the box with default OpenCost deployment
+- **Simple Configuration**: Easy to enable using standard environment variables
 - **Helm Integration**: Built into the official Helm chart for production deployments
 
 ### Quick Start
@@ -105,19 +105,19 @@ opencost:
 helm repo add opencost https://opencost.github.io/opencost-helm-chart
 helm repo update
 
-# Deploy OpenCost with MCP server (enabled by default)
-helm install opencost opencost/opencost
+# Deploy OpenCost with MCP server enabled (opt-in)
+helm install opencost opencost/opencost --set opencost.mcp.enabled=true
 
 # Access MCP server via port forwarding (example)
 kubectl port-forward svc/opencost 8081:8081
 ```
 
-The MCP server is **enabled by default** in the Helm chart. For custom configuration:
+The MCP server is **disabled by default** in the Helm chart. For custom configuration:
 
 ```bash
-# Deploy with MCP server disabled
+# Deploy with MCP server enabled
 helm install opencost opencost/opencost \
-  --set opencost.mcp.enabled=false
+  --set opencost.mcp.enabled=true
 
 # Deploy with custom MCP port
 helm install opencost opencost/opencost \
@@ -132,8 +132,8 @@ helm install opencost opencost/opencost \
 
 | Configuration | Command | Description |
 |---------------|---------|-------------|
-| **Default** | `helm install opencost opencost/opencost` | MCP enabled on port 8081 |
-| **Disable** | `--set opencost.mcp.enabled=false` | Completely disable MCP server |
+| **Default** | `helm install opencost opencost/opencost` | MCP disabled by default |
+| **Enable** | `--set opencost.mcp.enabled=true` | Enable MCP server on port 8081 |
 | **Custom Port** | `--set opencost.mcp.port=9091` | Use different port |
 | **Debug Mode** | `--set opencost.mcp.extraEnv.MCP_LOG_LEVEL=debug` | Enable debug logging |
 

Разница между файлами не показана из-за своего большого размера
+ 461 - 160
core/pkg/opencost/opencost_codecs.go


+ 219 - 41
core/pkg/opencost/opencost_codecs_test.go

@@ -1,16 +1,52 @@
 package opencost
 
 import (
+	"bytes"
+	"io"
 	"testing"
 	"time"
 )
 
-func TestAllocation_BinaryEncoding(t *testing.T) {
+type UnmarshalFunc func(BingenUnmarshalable, []byte) error
+
+func RunAllocation_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAllocationSet_BinaryEncoding(t *testing.T) {
-	// TODO niko
+func RunAllocationSet_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
+	end := time.Now().UTC().Truncate(day)
+	start := end.Add(-day * 10)
+
+	for start.Before(end) {
+		set0 := GenerateMockAllocationSetClusterIdle(start)
+
+		bytes, err := set0.MarshalBinary()
+		if err != nil {
+			t.Fatalf("Failed to AllocationSet.MarshalBinary: %s", err)
+			return
+		}
+
+		set1 := new(AllocationSet)
+		err = unmarshal(set1, bytes)
+		if err != nil {
+			t.Fatalf("Failed to AllocationSet.UnmarshalBinary: %s", err)
+			return
+		}
+
+		for key, alloc := range set1.Allocations {
+			other, ok := set0.Allocations[key]
+			if !ok {
+				t.Fatalf("Failed to match Allocation for key: %s", key)
+				return
+			}
+
+			if !alloc.Equal(other) {
+				t.Fatalf("allocations for key: %s did not match", key)
+			}
+		}
+
+		start = start.Add(day)
+	}
 }
 
 func BenchmarkAllocationSetRange_BinaryEncoding(b *testing.B) {
@@ -78,7 +114,7 @@ func BenchmarkAllocationSetRange_BinaryEncoding(b *testing.B) {
 	}
 }
 
-func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
+func RunAllocationSetRange_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	endYesterday := time.Now().UTC().Truncate(day)
 	startYesterday := endYesterday.Add(-day)
 	startD2 := startYesterday
@@ -86,7 +122,6 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 	startD0 := startD1.Add(-day)
 
 	var asr0, asr1 *AllocationSetRange
-	var bs []byte
 	var err error
 
 	asr0 = NewAllocationSetRange(
@@ -94,20 +129,29 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 		GenerateMockAllocationSetClusterIdle(startD1),
 		GenerateMockAllocationSetClusterIdle(startD2),
 	)
-
-	bs, err = asr0.MarshalBinary()
-	if err != nil {
-		t.Fatalf("AllocationSetRange.Binary: unexpected error: %s", err)
-		return
+	asrSets0 := [][]byte{}
+	for _, as := range asr0.Allocations {
+		bytes, err := as.MarshalBinary()
+		if err != nil {
+			t.Fatalf("Failed to marshal allocation set into []byte: %s", err)
+			return
+		}
+		asrSets0 = append(asrSets0, bytes)
 	}
 
-	asr1 = &AllocationSetRange{}
-	err = asr1.UnmarshalBinary(bs)
-	if err != nil {
-		t.Fatalf("AllocationSetRange.Binary: unexpected error: %s", err)
-		return
+	asrSets1 := []*AllocationSet{}
+	for _, bytes := range asrSets0 {
+		allocSet := new(AllocationSet)
+		err = unmarshal(allocSet, bytes)
+		if err != nil {
+			t.Fatalf("AllocationSet.Binary: unexpected error: %s", err)
+			return
+		}
+		asrSets1 = append(asrSets1, allocSet)
 	}
 
+	asr1 = NewAllocationSetRange(asrSets1...)
+
 	if asr0.Length() != asr1.Length() {
 		t.Fatalf("AllocationSetRange.Binary: expected %d; found %d", asr0.Length(), asr1.Length())
 	}
@@ -143,7 +187,7 @@ func TestAllocationSetRange_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestAny_BinaryEncoding(t *testing.T) {
+func RunAny_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	start := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	end := start.Add(24 * time.Hour)
 	window := NewWindow(&start, &end)
@@ -167,7 +211,7 @@ func TestAny_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Any{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Any.Binary: unexpected error: %s", err)
 	}
@@ -192,15 +236,15 @@ func TestAny_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestAsset_BinaryEncoding(t *testing.T) {
+func RunAsset_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAssetSet_BinaryEncoding(t *testing.T) {
+func RunAssetSet_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	// TODO niko
 }
 
-func TestAssetSetRange_BinaryEncoding(t *testing.T) {
+func RunAssetSetRange_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	endYesterday := time.Now().UTC().Truncate(day)
 	startYesterday := endYesterday.Add(-day)
 	startD2 := startYesterday
@@ -224,7 +268,7 @@ func TestAssetSetRange_BinaryEncoding(t *testing.T) {
 	}
 
 	asr1 = &AssetSetRange{}
-	err = asr1.UnmarshalBinary(bs)
+	err = unmarshal(asr1, bs)
 	if err != nil {
 		t.Fatalf("AssetSetRange.Binary: unexpected error: %s", err)
 		return
@@ -263,7 +307,7 @@ func TestAssetSetRange_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestBreakdown_BinaryEncoding(t *testing.T) {
+func RunBreakdown_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var b0, b1 *Breakdown
 	var bs []byte
 	var err error
@@ -281,7 +325,7 @@ func TestBreakdown_BinaryEncoding(t *testing.T) {
 	}
 
 	b1 = &Breakdown{}
-	err = b1.UnmarshalBinary(bs)
+	err = unmarshal(b1, bs)
 	if err != nil {
 		t.Fatalf("Breakdown.Binary: unexpected error: %s", err)
 	}
@@ -300,7 +344,7 @@ func TestBreakdown_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestCloudAny_BinaryEncoding(t *testing.T) {
+func RunCloudAny_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -319,7 +363,7 @@ func TestCloudAny_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Cloud{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("CloudAny.Binary: unexpected error: %s", err)
 	}
@@ -329,7 +373,7 @@ func TestCloudAny_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestClusterManagement_BinaryEncoding(t *testing.T) {
+func RunClusterManagement_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -358,7 +402,7 @@ func TestClusterManagement_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestDisk_BinaryEncoding(t *testing.T) {
+func RunDisk_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -389,7 +433,7 @@ func TestDisk_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Disk{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Disk.Binary: unexpected error: %s", err)
 	}
@@ -399,7 +443,7 @@ func TestDisk_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestNode_BinaryEncoding(t *testing.T) {
+func RunNode_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -441,7 +485,7 @@ func TestNode_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &Node{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("Node.Binary: unexpected error: %s", err)
 	}
@@ -451,7 +495,7 @@ func TestNode_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestProperties_BinaryEncoding(t *testing.T) {
+func RunProperties_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var p0, p1 *AllocationProperties
 	var bs []byte
 	var err error
@@ -464,7 +508,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -501,7 +545,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -527,7 +571,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 
 	p1 = &AllocationProperties{}
-	err = p1.UnmarshalBinary(bs)
+	err = unmarshal(p1, bs)
 	if err != nil {
 		t.Fatalf("AllocationProperties.Binary: unexpected error: %s", err)
 	}
@@ -537,7 +581,7 @@ func TestProperties_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestShared_BinaryEncoding(t *testing.T) {
+func RunShared_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	ws := time.Date(2020, time.September, 16, 0, 0, 0, 0, time.UTC)
 	we := ws.Add(24 * time.Hour)
 	window := NewWindow(&ws, &we)
@@ -556,7 +600,7 @@ func TestShared_BinaryEncoding(t *testing.T) {
 	}
 
 	a1 = &SharedAsset{}
-	err = a1.UnmarshalBinary(bs)
+	err = unmarshal(a1, bs)
 	if err != nil {
 		t.Fatalf("SharedAsset.Binary: unexpected error: %s", err)
 	}
@@ -566,7 +610,7 @@ func TestShared_BinaryEncoding(t *testing.T) {
 	}
 }
 
-func TestWindow_BinaryEncoding(t *testing.T) {
+func RunWindow_BinaryEncodingTest(t *testing.T, unmarshal UnmarshalFunc) {
 	var w0, w1 Window
 	var bs []byte
 	var err error
@@ -578,7 +622,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -598,7 +642,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -618,7 +662,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -638,7 +682,7 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
 
-	err = w1.UnmarshalBinary(bs)
+	err = unmarshal(&w1, bs)
 	if err != nil {
 		t.Fatalf("Window.Binary: unexpected error: %s", err)
 	}
@@ -650,3 +694,137 @@ func TestWindow_BinaryEncoding(t *testing.T) {
 		t.Fatalf("Window.Binary: expected %v; found %v", w0.End(), w1.End())
 	}
 }
+
+type BingenUnmarshalable interface {
+	UnmarshalBinary([]byte) error
+	UnmarshalBinaryFromReader(io.Reader) error
+}
+
+func UnmarshalBingenBytes(value BingenUnmarshalable, b []byte) error {
+	return value.UnmarshalBinary(b)
+}
+
+func UnmarshalBingenReader(value BingenUnmarshalable, b []byte) error {
+	// convert bytes to reader in order to leverage io.Reader string table
+	reader := bytes.NewReader(b)
+	return value.UnmarshalBinaryFromReader(reader)
+}
+
+func RunAllOpencostBingenCodecTests(t *testing.T, unmarshal UnmarshalFunc) {
+	tests := []struct {
+		name string
+		f    func(*testing.T, UnmarshalFunc)
+	}{
+		{
+			name: "RunAllocation_BinaryEncodingTest",
+			f:    RunAllocation_BinaryEncodingTest,
+		},
+		{
+			name: "RunAllocationSet_BinaryEncodingTest",
+			f:    RunAllocationSet_BinaryEncodingTest,
+		},
+		{
+			name: "RunAllocationSetRange_BinaryEncodingTest",
+			f:    RunAllocationSetRange_BinaryEncodingTest,
+		},
+		{
+			name: "RunAny_BinaryEncodingTest",
+			f:    RunAny_BinaryEncodingTest,
+		},
+		{
+			name: "RunAsset_BinaryEncodingTest",
+			f:    RunAsset_BinaryEncodingTest,
+		},
+		{
+			name: "RunAssetSet_BinaryEncodingTest",
+			f:    RunAssetSet_BinaryEncodingTest,
+		},
+		{
+			name: "RunAssetSetRange_BinaryEncodingTest",
+			f:    RunAssetSetRange_BinaryEncodingTest,
+		},
+		{
+			name: "RunBreakdown_BinaryEncodingTest",
+			f:    RunBreakdown_BinaryEncodingTest,
+		},
+		{
+			name: "RunCloudAny_BinaryEncodingTest",
+			f:    RunCloudAny_BinaryEncodingTest,
+		},
+		{
+			name: "RunClusterManagement_BinaryEncodingTest",
+			f:    RunClusterManagement_BinaryEncodingTest,
+		},
+		{
+			name: "RunDisk_BinaryEncodingTest",
+			f:    RunDisk_BinaryEncodingTest,
+		},
+		{
+			name: "RunNode_BinaryEncodingTest",
+			f:    RunNode_BinaryEncodingTest,
+		},
+		{
+			name: "RunProperties_BinaryEncodingTest",
+			f:    RunProperties_BinaryEncodingTest,
+		},
+		{
+			name: "RunShared_BinaryEncodingTest",
+			f:    RunShared_BinaryEncodingTest,
+		},
+		{
+			name: "RunWindow_BinaryEncodingTest",
+			f:    RunWindow_BinaryEncodingTest,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(tt *testing.T) {
+			test.f(tt, unmarshal)
+		})
+	}
+}
+
+func TestOpencostBingenDefaultsWithBytes(t *testing.T) {
+	config := DefaultBingenConfiguration()
+	ConfigureBingen(config)
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenBytes)
+}
+
+func TestOpencostBingenFileStringTableEnabledWithBytes(t *testing.T) {
+	// This test _should_ still run the slice based string table because raw []byte
+	// data always uses the string slice table
+	config := DefaultBingenConfiguration()
+	config.FileBackedStringTableEnabled = true
+	config.FileBackedStringTableDir = t.TempDir()
+	ConfigureBingen(config)
+
+	// reset configuration to default on completion
+	defer ConfigureBingen(DefaultBingenConfiguration())
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenBytes)
+}
+
+func TestOpencostBingenDefaultsWithReader(t *testing.T) {
+	// This test _should_ still run the slice based string table because we haven't configured
+	// bingen to use the file string table
+	config := DefaultBingenConfiguration()
+	ConfigureBingen(config)
+
+	// we use the reader to unmarshal instead of []bytes
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenReader)
+}
+
+func TestOpencostBingenFileStringTableEnabledWithReader(t *testing.T) {
+	// This test _should_ use the file backed string table because we have enabled it AND
+	// we're using a reader
+	config := DefaultBingenConfiguration()
+	config.FileBackedStringTableEnabled = true
+	config.FileBackedStringTableDir = t.TempDir()
+	ConfigureBingen(config)
+
+	// reset configuration to default on completion
+	defer ConfigureBingen(DefaultBingenConfiguration())
+
+	RunAllOpencostBingenCodecTests(t, UnmarshalBingenReader)
+}

+ 35 - 38
core/pkg/util/buffer.go

@@ -157,7 +157,9 @@ func (b *Buffer) WriteBytes(bytes []byte) {
 	b.bw.Write(bytes)
 }
 
-// Bytes returns the unread portion of the underlying buffer storage.
+// Bytes returns the unread portion of the underlying buffer storage. If the buffer was
+// created with an `io.Reader`, then the remaining unread bytes are drained into a byte
+// slice and returned.
 func (b *Buffer) Bytes() []byte {
 	if b.bw != nil {
 		return b.bw.Bytes()
@@ -362,48 +364,13 @@ func (b *Buffer) ReadString() string {
 	return bytesToString(bytes)
 }
 
-// ReadStringBytes reads a uint16 length prefix and that many bytes as a new slice.
-// Unlike ReadString, this does not route through the global string bank.
-func (b *Buffer) ReadStringBytes() ([]byte, error) {
-	var l uint16
-	if b.bw != nil {
-		if err := readUint16(b.bw, &l); err != nil {
-			return nil, err
-		}
-		if l == 0 {
-			return []byte{}, nil
-		}
-		out := make([]byte, int(l))
-		if _, err := readFull(b.bw, out); err != nil {
-			return nil, err
-		}
-		return out, nil
-	}
-	if b.b == nil {
-		return nil, fmt.Errorf("buffer: ReadStringBytes on invalid buffer")
-	}
-	if err := readBuffUint16(b.b, &l); err != nil {
-		return nil, err
-	}
-	if l == 0 {
-		return []byte{}, nil
-	}
-	out := make([]byte, int(l))
-	if _, err := readBuffFull(b.b, out); err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
 // ReadBytes reads the specified length from the buffer and returns the byte slice.
 func (b *Buffer) ReadBytes(length int) []byte {
 	if b.bw != nil {
 		return b.bw.Next(length)
 	}
 
-	bytes := bytePool.Get(length)
-	defer bytePool.Put(bytes)
-
+	bytes := make([]byte, length)
 	_, err := readBuffFull(b.b, bytes)
 	if err != nil {
 		return bytes
@@ -412,6 +379,36 @@ func (b *Buffer) ReadBytes(length int) []byte {
 	return bytes
 }
 
+// bytesAsString converts a []byte into a string in place. Note that you should use this helper
+// when the []byte slice contains _only_ the string data and isn't part of a larger underlying array.
+// For example, a case where you should *not* use this helper:
+//
+//	func parseString(buffer *bytes.Buffer, length int) string {
+//	  bytes := buffer.Next(length)   // this extracts a sub-slice of the underlying byte array from pos->pos+length
+//
+//	  return bytesAsString(bytes)
+//	}
+//
+// Now both the []byte AND the value string are linked and neither can be GC'd until the other one is GC'd.
+// This is especially problematic if you drop the references to the byte array, as you're effectively requiring
+// 1024 bytes for an 11-byte string.
+//
+// An example where it _is_ ok, and recommended to drop the underlying []byte reference is the following:
+//
+//	func parseString(reader io.Reader, length int) string {
+//	  bytes := make([]byte, length)
+//	  io.ReadFull(reader, bytes)
+//
+//	  return bytesAsString(bytes)
+//	}
+//
+// In this case, we've create a byte array just big enough for the string, we extract the string data from the reader
+// and then cast the byte array in place to the string, and finally drop the byte array reference. This omits an additional
+// allocation if you were to use string(bytes)
+func bytesAsString(b []byte) string {
+	return unsafe.String(unsafe.SliceData(b), len(b))
+}
+
 // Conversion from byte slice to string
 func bytesToString(b []byte) string {
 	// This code will take the passed byte slice and cast it in-place into a string. By doing
@@ -423,7 +420,7 @@ func bytesToString(b []byte) string {
 	// cached string. If it does _not_ exist, then we use the passed func() string to allocate a new
 	// string and cache it. This will prevent us from allocating throw-away strings just to
 	// check our cache.
-	pinned := unsafe.String(unsafe.SliceData(b), len(b))
+	pinned := bytesAsString(b)
 
 	return stringutil.BankFunc(pinned, func() string {
 		return string(b)

+ 6 - 6
core/pkg/util/buffer_test.go

@@ -213,8 +213,8 @@ func TestBufferWriteReadInt64(t *testing.T) {
 func TestBufferBytes(t *testing.T) {
 	buf := NewBuffer()
 
-	buf.WriteInt(42)
-	buf.WriteFloat64(3.14)
+	buf.WriteInt(-42)
+	buf.WriteFloat64(-3.14)
 
 	unreadBytes := buf.Bytes()
 
@@ -223,11 +223,11 @@ func TestBufferBytes(t *testing.T) {
 	intVal := newBuf.ReadInt()
 	floatVal := newBuf.ReadFloat64()
 
-	if intVal != 42 {
-		t.Errorf("Expected int value to be 42, got %v", intVal)
+	if intVal != -42 {
+		t.Errorf("Expected int value to be -42, got %v", intVal)
 	}
-	if floatVal != 3.14 {
-		t.Errorf("Expected float value to be 3.14, got %v", floatVal)
+	if floatVal != -3.14 {
+		t.Errorf("Expected float value to be -3.14, got %v", floatVal)
 	}
 }
 

+ 3 - 3
core/pkg/util/bufferhelper.go

@@ -76,7 +76,7 @@ func readInt(r *bytes.Buffer, data *int) error {
 		return err
 	}
 
-	*data = int(order.Uint32(bs))
+	*data = int(int32(order.Uint32(bs)))
 	return nil
 }
 
@@ -246,7 +246,7 @@ func readBuffInt(r *bufio.Reader, data *int) error {
 		return err
 	}
 
-	*data = int(order.Uint32(bs))
+	*data = int(int32(order.Uint32(bs)))
 	return nil
 }
 
@@ -442,7 +442,7 @@ func writeInt(w *bytes.Buffer, data int) error {
 	var b [4]byte
 	bs := b[:]
 
-	binary.LittleEndian.PutUint32(bs, uint32(data))
+	binary.LittleEndian.PutUint32(bs, uint32(int32(data)))
 	_, err := w.Write(bs)
 	return err
 }

+ 26 - 27
core/pkg/util/bufferpool.go

@@ -6,14 +6,15 @@ import (
 	"sync"
 )
 
+// bufferPool holds "tiered" []byte `sync.Pool` instances by capacity up to math.MaxUint16
 type bufferPool struct {
-	pools [32]sync.Pool
+	pools [17]sync.Pool
 }
 
 func newBufferPool() *bufferPool {
 	bp := new(bufferPool)
 
-	for i := 0; i < 32; i++ {
+	for i := 0; i < 17; i++ {
 		length := 1 << i
 		bp.pools[i].New = func() any {
 			return make([]byte, length)
@@ -22,18 +23,23 @@ func newBufferPool() *bufferPool {
 	return bp
 }
 
-// index on the min number of bits required to store the byte data up to 32 bits.
-func nextIndex(length int) int {
-	return bits.Len32(uint32(length))
+// poolIndex returns the pool index for a buffer of the given size.
+func poolIndex(length int) int {
+	return bits.Len32(uint32(length - 1))
 }
 
-// the previous index for a provided length
-func prevIndex(length int) int {
-	next := nextIndex(length)
-	if uint32(length) == (1 << uint32(next)) {
-		return next
-	}
-	return next - 1
+// putIndex returns the pool index for returning a buffer with the given capacity.
+// It is the inverse of poolIndex: given a capacity that was originally handed out
+// by Get, it finds the pool that owns it.
+//
+// Because Get always returns buffers with capacity 1<<i, the capacity here will
+// always be a power of two. bits.Len32(1<<i) = i+1, so we subtract 1 to recover i.
+func putIndex(capacity int) int {
+	return bits.Len32(uint32(capacity)) - 1
+}
+
+func isPowerOfTwo(capacity int) bool {
+	return capacity&(capacity-1) == 0
 }
 
 func (bp *bufferPool) Get(length int) []byte {
@@ -41,29 +47,22 @@ func (bp *bufferPool) Get(length int) []byte {
 		return nil
 	}
 
-	// if it's beyond our pool bounds, just allocate and return
-	if length > math.MaxInt32 {
+	// Beyond our pool range: allocate directly
+	if length > math.MaxUint16 {
 		return make([]byte, length)
 	}
 
-	i := nextIndex(length)
-	if entry := bp.pools[i].Get(); entry != nil {
-		bytes := entry.([]byte)
-		bytes = bytes[:length]
-		return bytes
-	}
-
-	// should never get here, as there should always be an entry
-	// coming from the pool
-	return make([]byte, 1<<i)[:length]
+	i := poolIndex(length)
+	buf := bp.pools[i].Get().([]byte)
+	return buf[:length]
 }
 
 func (bp *bufferPool) Put(buf []byte) {
 	capacity := cap(buf)
-	if capacity == 0 || capacity > math.MaxInt32 {
+	if capacity == 0 || capacity > math.MaxUint16 || !isPowerOfTwo(capacity) {
 		return
 	}
 
-	i := prevIndex(capacity)
-	bp.pools[i].Put(buf)
+	i := putIndex(capacity)
+	bp.pools[i].Put(buf[:cap(buf)])
 }

+ 311 - 0
core/pkg/util/bufferpool_test.go

@@ -0,0 +1,311 @@
+package util
+
+import (
+	"math"
+	"math/bits"
+	"sync"
+	"testing"
+)
+
+// --- poolIndex / putIndex unit tests ---
+
+func TestPoolIndex(t *testing.T) {
+	cases := []struct {
+		length int
+		want   int
+	}{
+		{1, 0},
+		{2, 1},
+		{3, 2},
+		{4, 2},
+		{5, 3},
+		{7, 3},
+		{8, 3},
+		{255, 8},
+		{256, 8},
+		{1023, 10},
+		{1024, 10},
+		{math.MaxUint16 - 50, 16},
+	}
+	for _, c := range cases {
+		got := poolIndex(c.length)
+		if got != c.want {
+			t.Errorf("poolIndex(%d) = %d, want %d", c.length, got, c.want)
+		}
+	}
+}
+
+func TestAllocMinusOne(t *testing.T) {
+	bp := newBufferPool()
+	for i := 1; i <= 16; i++ {
+		capacity := 1 << i
+		length := capacity - 1
+		if length <= 0 {
+			continue
+		}
+
+		b := bp.Get(length)
+		c := cap(b)
+
+		pIndex := poolIndex(length)
+		rIndex := putIndex(c)
+
+		if pIndex != rIndex {
+			t.Errorf("pIndex: %d != rIndex: %d\n", pIndex, rIndex)
+		}
+
+	}
+}
+
+func TestPutIndex(t *testing.T) {
+	// putIndex must be the inverse of poolIndex for all power-of-two capacities
+	// that Get hands out.
+	for i := 1; i <= 16; i++ {
+		cap := 1 << i
+		got := putIndex(cap)
+		if got != i {
+			t.Errorf("putIndex(1<<%d = %d) = %d, want %d", i, cap, got, i)
+		}
+	}
+}
+
+func TestPoolIndexPutIndexRoundTrip(t *testing.T) {
+	// For any requested length, the buffer Get returns has capacity 1<<poolIndex(length).
+	// Confirm that putIndex maps that capacity back to the same pool slot.
+	for length := 1; length <= math.MaxUint16; length++ {
+		i := poolIndex(length)
+		capacity := 1 << i
+		j := putIndex(capacity)
+		if i != j {
+			t.Errorf("length=%d: poolIndex=%d, capacity=1<<%d=%d, putIndex=%d — round-trip broken",
+				length, i, i, capacity, j)
+		}
+	}
+}
+
+// --- Get ---
+
+func TestGetNilOnZeroOrNegative(t *testing.T) {
+	bp := newBufferPool()
+	for _, n := range []int{0, -1, -100} {
+		if got := bp.Get(n); got != nil {
+			t.Errorf("Get(%d) = %v, want nil", n, got)
+		}
+	}
+}
+
+func TestGetLengthIsExact(t *testing.T) {
+	bp := newBufferPool()
+	for _, n := range []int{1, 2, 3, 7, 8, 100, 1000, 65535, 65536} {
+		buf := bp.Get(n)
+		if len(buf) != n {
+			t.Errorf("Get(%d): len = %d, want %d", n, len(buf), n)
+		}
+	}
+}
+
+func TestGetCapacityIsPowerOfTwo(t *testing.T) {
+	bp := newBufferPool()
+	for _, n := range []int{1, 2, 3, 4, 5, 100, 1000, 550, math.MaxUint16 - 100, math.MaxUint16} {
+		buf := bp.Get(n)
+		c := cap(buf)
+		if c == 0 || !isPowerOfTwo(c) {
+			t.Errorf("Get(%d): cap = %d, not a power of two", n, c)
+		}
+	}
+}
+
+func TestGetCapacityIsSmallestFittingPowerOfTwo(t *testing.T) {
+	bp := newBufferPool()
+	cases := []struct {
+		n       int
+		wantCap int
+	}{
+		{1, 1},
+		{2, 2},
+		{3, 4},
+		{4, 4},
+		{5, 8},
+		{8, 8},
+		{9, 16},
+		{255, 256},
+		{256, 256},
+		{1024, 1024},
+	}
+	for _, c := range cases {
+		buf := bp.Get(c.n)
+		if cap(buf) != c.wantCap {
+			t.Errorf("Get(%d): cap = %d, want %d", c.n, cap(buf), c.wantCap)
+		}
+	}
+}
+
+func TestGetOversizeFallback(t *testing.T) {
+	bp := newBufferPool()
+	n := math.MaxUint16 + 1
+	buf := bp.Get(n)
+	if len(buf) != n {
+		t.Errorf("Get(MaxUint16+1): len = %d, want %d", len(buf), n)
+	}
+}
+
+// --- Put ---
+
+func TestPutDropsZeroCapBuffer(t *testing.T) {
+	// Put on a nil or zero-cap slice must not panic.
+	bp := newBufferPool()
+	bp.Put(nil)
+	bp.Put([]byte{})
+}
+
+// --- Get / Put round-trip ---
+
+func TestGetPutSamePool(t *testing.T) {
+	// A buffer returned via Put must land in the same pool that Get draws from,
+	// so the very next Get (with the same length) should reuse it.
+	bp := newBufferPool()
+
+	buf := bp.Get(100)
+	ptr := &buf[0]
+	bp.Put(buf)
+
+	buf2 := bp.Get(100)
+	if &buf2[0] != ptr {
+		// sync.Pool may have GC'd the entry; this is not a hard failure but
+		// we at minimum require length and capacity to be correct.
+		if len(buf2) != 100 {
+			t.Errorf("Get(100) after Put: len = %d, want 100", len(buf2))
+		}
+	}
+}
+
+func TestPutRestoresFullCapacity(t *testing.T) {
+	// After Put, the pooled slice should have full capacity, not the resliced length.
+	// We verify this by inspecting what comes out of the pool on the next Get.
+	bp := newBufferPool()
+
+	buf := bp.Get(10)  // len=10, cap=16
+	bp.Put(buf)        // must put back with cap=16
+	buf2 := bp.Get(15) // asks for 15 — still fits in cap=16 pool
+	if cap(buf2) < 15 {
+		t.Errorf("After Put(cap=16), Get(15): cap = %d, too small", cap(buf2))
+	}
+}
+
+func TestIsPowerOfTwo(t *testing.T) {
+	for i := 0; i < 16; i++ {
+		cap := 1 << i
+
+		if !isPowerOfTwo(cap) {
+			t.Fatalf("Failed at: i=%d, cap=%d\n", i, cap)
+		}
+	}
+
+	for _, v := range []int{5, 17, 19, 31, 55} {
+		if isPowerOfTwo(v) {
+			t.Fatalf("Unexpected isPowerOfTwo: %d", v)
+		}
+	}
+}
+
+func TestPutNonPowerOfTwoCapIsDiscarded(t *testing.T) {
+	// Buffers with non-power-of-two capacities (e.g. from outside the pool)
+	// get silently dropped. Confirm no panic and pool still works after.
+	bp := newBufferPool()
+	value := make([]byte, 0, 17)
+	bp.Put(value)
+
+	buf := bp.Get(24)
+	if len(buf) != 24 {
+		t.Errorf("Get(24) after spurious Put: len = %d, want 24", len(buf))
+	}
+	if cap(buf) != 32 {
+		t.Errorf("Get(24) after spurious Put: cap = %d, want 32", cap(buf))
+	}
+}
+
+// --- Concurrency ---
+
+func TestConcurrentGetPut(t *testing.T) {
+	bp := newBufferPool()
+	var wg sync.WaitGroup
+	const goroutines = 64
+	const iters = 1000
+
+	for g := 0; g < goroutines; g++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			for i := 0; i < iters; i++ {
+				n := (id*iters + i) % 4096
+				if n == 0 {
+					n = 1
+				}
+				buf := bp.Get(n)
+				if len(buf) != n {
+					t.Errorf("concurrent Get(%d): len = %d", n, len(buf))
+				}
+				// Write to every byte to catch races under -race.
+				for j := range buf {
+					buf[j] = byte(j)
+				}
+				bp.Put(buf)
+			}
+		}(g)
+	}
+	wg.Wait()
+}
+
+// --- Edge cases at pool boundaries ---
+
+func TestGetExactPowerOfTwo(t *testing.T) {
+	// Exact powers of two are the boundary between two pools; confirm correct
+	// bucket selection and full round-trip for each.
+	bp := newBufferPool()
+	for i := 0; i < 17; i++ {
+		n := 1 << i
+		buf := bp.Get(n)
+		if len(buf) != n {
+			t.Errorf("Get(1<<%d=%d): len = %d", i, n, len(buf))
+		}
+		expectedCap := 1 << (bits.Len16(uint16(n - 1)))
+		if cap(buf) != expectedCap {
+			t.Errorf("Get(1<<%d=%d): cap = %d, want %d", i, n, cap(buf), expectedCap)
+		}
+		bp.Put(buf)
+	}
+}
+
+func TestGetMaxInt16(t *testing.T) {
+	i := poolIndex(math.MaxUint16)
+	if i >= 17 {
+		t.Errorf("poolIndex(MaxUint16) = %d, overflows pool array", i)
+	}
+}
+
+// --- Benchmarks ---
+
+func BenchmarkGetPut(b *testing.B) {
+	sizes := []int{64, 512, 4096, 65535}
+	for _, size := range sizes {
+		bp := newBufferPool()
+		b.Run("", func(b *testing.B) {
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				buf := bp.Get(size)
+				bp.Put(buf)
+			}
+		})
+	}
+}
+
+func BenchmarkGetPutParallel(b *testing.B) {
+	bp := newBufferPool()
+	b.ReportAllocs()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			buf := bp.Get(4096)
+			bp.Put(buf)
+		}
+	})
+}

+ 6 - 17
core/pkg/util/monitor/memory/helpers.go

@@ -40,7 +40,7 @@ func (tv *trackedValue) IsSet() bool {
 	return tv.current != nil
 }
 
-// Set updates the current value if it is different. If the vaue is updated, `true` is returned.
+// Set updates the current value if it is different. If the value is updated, `true` is returned.
 // Otherwise, `false` is returned.
 func (tv *trackedValue) Set(value float64) bool {
 	if tv.current == nil {
@@ -179,6 +179,7 @@ func (ema *exponentialMovingAverage) Reset() {
 // stddev, and percentiles of the contained data.
 type rollingWindow struct {
 	capacity int
+	length   int
 	window   []float64
 	index    int
 }
@@ -201,34 +202,22 @@ func newRollingWindow(capacity int) *rollingWindow {
 func (rw *rollingWindow) Push(value float64) {
 	// advance index, handle overflow
 	index := rw.index % rw.capacity
-	if index < 0 {
-		index = -index
-	}
-
 	rw.window[index] = value
-	rw.index++
 
-	// if we have wrapped all the way back around to 0, just advance the index
-	// by capacity to ensure our Len() algorithm continues to function correctly
-	if rw.index == 0 {
-		rw.index = rw.capacity
-	}
+	rw.index = (rw.index + 1) % rw.capacity
+	rw.length = min(rw.length+1, rw.capacity)
 }
 
 // Clears the rolling window values
 func (rw *rollingWindow) Clear() {
 	rw.window = make([]float64, rw.capacity)
 	rw.index = 0
+	rw.length = 0
 }
 
 // The length of the rolling window. Will never be greater that the `Cap()`.
 func (rw *rollingWindow) Len() int {
-	index := rw.index
-	if index < 0 {
-		index = -index
-	}
-
-	return min(index, rw.capacity)
+	return rw.length
 }
 
 // Cap returns the maximum capacity of the rolling window.

+ 20 - 22
core/pkg/util/monitor/memory/helpers_test.go

@@ -345,39 +345,38 @@ func TestRollingWindow_OverwritesOldestOnOverflow(t *testing.T) {
 	}
 }
 
-func TestRollingWindow_InternalIndexOverflowWrapToZero(t *testing.T) {
-	capacity := 3
-	rw := newRollingWindow(capacity)
-
-	// Assuming we have pushed ((2 * MaxInt) - 3) elements
-	rw.index = -3
-
-	// cycle back to index=0, ensure our length is still valid
-	rw.Push(1)
-	rw.Push(2)
-	rw.Push(3)
+func assertIndexLengthCap(t *testing.T, rw *rollingWindow, index int, length int, cap int) {
+	t.Helper()
 
-	if rw.Len() != capacity {
-		t.Errorf("expected length=%d after overflowing internal index back to 0. Got %d\n", capacity, rw.Len())
+	if rw.index != index {
+		t.Errorf("RollingWindow Index: %d. Expected %d", rw.index, index)
 	}
-
-	// This is because we trick our length algorithm by adding back capacity if the next index == 0
-	if rw.index != capacity {
-		t.Errorf("expected internal index = %d after reaching 0. Got %d\n", capacity, rw.index)
+	if rw.Len() != length {
+		t.Errorf("RollingWindow Length: %d. Expected %d", rw.Len(), length)
+	}
+	if rw.Cap() != cap {
+		t.Errorf("RollingWindow Capacity: %d. Expected %d", rw.Cap(), cap)
 	}
 }
 
-func TestRollingWindow_InternalIndexOverflow(t *testing.T) {
+func TestRollingWindow_BasicIndexingLengthCap(t *testing.T) {
 	capacity := 3
 	rw := newRollingWindow(capacity)
 
-	// set index to the 0 position relative to max int
-	rw.index = math.MaxInt - 1
+	assertIndexLengthCap(t, rw, 0, 0, 3)
+	rw.Push(1)
+	assertIndexLengthCap(t, rw, 1, 1, 3)
+	rw.Push(2)
+	assertIndexLengthCap(t, rw, 2, 2, 3)
+	rw.Push(3)
+	assertIndexLengthCap(t, rw, 0, 3, 3)
 
-	// advance to MaxInt, advance to -MaxInt (overflow), andvance to -MaxInt + 1
 	rw.Push(1)
+	assertIndexLengthCap(t, rw, 1, 3, 3)
 	rw.Push(2)
+	assertIndexLengthCap(t, rw, 2, 3, 3)
 	rw.Push(3)
+	assertIndexLengthCap(t, rw, 0, 3, 3)
 
 	set := newSet(1, 2, 3)
 
@@ -404,7 +403,6 @@ func TestRollingWindow_InternalIndexOverflow(t *testing.T) {
 
 		set.remove(v)
 	})
-
 }
 
 func TestRollingWindow_PartialCapacityMean(t *testing.T) {

+ 7 - 8
core/pkg/util/monitor/memory/memorylimitstats.go

@@ -22,13 +22,12 @@ type MemoryLimitConfig struct {
 	// should be distributed between a previous average value and the current observation.
 	SmoothingFactor float64
 
-	// BreachWindowSize is the total number of recent raw samples are maintained used for
-	// breach detection. This occurs when the number of samples geather than the current
-	// memory limit exceeds a threshold.
+	// BreachWindowSize is the total number of recent raw samples are maintained/used for
+	// breach detection.
 	BreachWindowSize int
 
-	// BreachThreshold is a limit of raw samples allowed to exceed the memory limit. If this
-	// threshold is reached, the samples are recalibrated.
+	// BreachThreshold is a limit of raw samples, within the `BreachWindowSize`, allowed to
+	// exceed the memory limit. If this threshold is reached, the samples are recalibrated.
 	BreachThreshold int
 
 	// CumulativeSumSlack is also known as the K-Factor (drift tolerance) in cumulative sum control
@@ -43,7 +42,7 @@ type MemoryLimitConfig struct {
 	CumulativeSumThreshold float64
 }
 
-// DfaultMemoryLimitConfig creates the recommendded values to use for detecting soft memory limit updates
+// DefaultMemoryLimitConfig creates the recommended values to use for detecting soft memory limit updates
 func DefaultMemoryLimitConfig() *MemoryLimitConfig {
 	return &MemoryLimitConfig{
 		LimitRatio:             0.90,
@@ -59,7 +58,7 @@ func DefaultMemoryLimitConfig() *MemoryLimitConfig {
 
 // MemoryLimitStats is a run-time memory statistics collector that maintains a soft memory limit
 // value based on configurable input parameters. It is designed to adjust the soft limit based on
-// meaningful changes to overall heap allocation, leveraging expontential moving average windows,
+// meaningful changes to overall heap allocation, leveraging exponential moving average windows,
 // confidence interval gates, breach detection, and cumulative sum control chart to detect meaningful
 // deviations from the mean.
 type MemoryLimitStats struct {
@@ -118,7 +117,7 @@ func (mls *MemoryLimitStats) Record(heapBytes uint64) (softLimit uint64, updated
 	mls.raw.Push(sample)
 	mls.breach.Push(sample)
 
-	// Check that the minimum number of sammples exist in the window before
+	// Check that the minimum number of samples exist in the window before
 	// calculating the memory limit
 	totalSamples := mls.window.Len()
 	if totalSamples < mls.config.MinSamples {

+ 1 - 6
core/pkg/util/monitor/memory/memorylimitstats_test.go

@@ -1,11 +1,10 @@
 package memory_test
 
 import (
-	"fmt"
 	"math/rand"
 	"testing"
 
-	"github.com/mbolt35/bingen-file-loader/core/util/monitor/memory"
+	"github.com/opencost/opencost/core/pkg/util/monitor/memory"
 )
 
 func TestObservationMode(t *testing.T) {
@@ -71,15 +70,11 @@ func TestElasticRecalibrationOnGrowth(t *testing.T) {
 		t.Fatal("expected a non-zero limit after phase 1")
 	}
 
-	fmt.Printf("Before Breach...\n")
-
 	// Phase 2: spike to 300 MiB repeatedly — should trigger recalibration.
 	for i := 0; i < config.BreachThreshold+1; i++ {
 		m.Record(500 * 1024 * 1024)
 	}
 
-	fmt.Printf("Crossed Threshold here\n")
-
 	// Phase 3: feed enough samples at new level to re-commit.
 	var recalibrated bool
 	for i := 0; i < config.MinSamples+20; i++ {

+ 1 - 0
core/pkg/util/stringutil/lrubank.go

@@ -61,6 +61,7 @@ type lruStringBank struct {
 func NewLruStringBank(capacity int, evictionInterval time.Duration) StringBank {
 	stop := make(chan struct{})
 	bank := &lruStringBank{
+		stop:     stop,
 		m:        make(map[string]*lruEntry),
 		capacity: capacity,
 	}

+ 26 - 0
core/pkg/util/stringutil/stringutil_test.go

@@ -170,11 +170,17 @@ const LruCapacity = 500_000
 const LruEvictInterval = 5 * time.Second
 
 func BenchmarkLruStringBankFunc90PercentDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
 	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
 	defer func() {
 		if lruBank, ok := sb.(interface{ Stop() }); ok {
 			lruBank.Stop()
 		}
+
 	}()
 
 	stringutil.UpdateStringBank(sb)
@@ -182,6 +188,11 @@ func BenchmarkLruStringBankFunc90PercentDuplicate(b *testing.B) {
 }
 
 func BenchmarkLruStringBankFunc75PercentDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
 	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
 	defer func() {
 		if lruBank, ok := sb.(interface{ Stop() }); ok {
@@ -194,6 +205,11 @@ func BenchmarkLruStringBankFunc75PercentDuplicate(b *testing.B) {
 }
 
 func BenchmarkLruStringBankFunc50PercentDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
 	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
 	defer func() {
 		if lruBank, ok := sb.(interface{ Stop() }); ok {
@@ -206,6 +222,11 @@ func BenchmarkLruStringBankFunc50PercentDuplicate(b *testing.B) {
 }
 
 func BenchmarkLruStringBankFunc25PercentDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
 	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
 	defer func() {
 		if lruBank, ok := sb.(interface{ Stop() }); ok {
@@ -218,6 +239,11 @@ func BenchmarkLruStringBankFunc25PercentDuplicate(b *testing.B) {
 }
 
 func BenchmarkLruStringBankFuncNoDuplicate(b *testing.B) {
+	prevBank := stringutil.GetStringBank()
+	defer func() {
+		stringutil.UpdateStringBank(prevBank)
+	}()
+
 	sb := stringutil.NewLruStringBank(LruCapacity, LruEvictInterval)
 	defer func() {
 		if lruBank, ok := sb.(interface{ Stop() }); ok {

+ 37 - 0
docs/ROADMAP.md

@@ -0,0 +1,37 @@
+# OpenCost Roadmap
+
+This roadmap reflects the current priorities for the OpenCost project. It is reviewed quarterly and discussed in the biweekly [Working Group meetings](https://zoom-lfx.platform.linuxfoundation.org/meetings/opencost?view=list).
+
+## Current Focus Areas
+
+- **Cloud cost integration:** Connecting cloud billing data to the demo environment, cloud cost bug fixes, and multi-account support
+- **UI revamp:** Major frontend overhaul via LFX mentorship — new UI released, stabilizing before next core release
+- **OpenCost AI:** New sub-project for airgapped private cost models (CI/CD, testing models at scale, finding smallest viable model)
+- **First-class LLM cost support:** Design proposal for native LLM cost tracking in OpenCost core
+- **Integration test expansion:** Pod restart tests, network cost tests, resolving Prometheus-less (promless) vs Prometheus-backed test discrepancies
+- **Plugin ecosystem:** Snowflake, GitHub, and currency conversion plugins proposed; MongoDB reference implementation for currency support
+- **Helm chart signing:** Cryptographic signing of Helm charts (research in progress)
+- **Data persistence and export:** Mounting persistence for promless mode, potential S3 export for cost data
+- **Supply chain security:** Achieving [OpenSSF Best Practices](https://www.bestpractices.dev/projects/6219) Silver and Gold badges, cryptographically signed releases via Sigstore/cosign, SLSA build provenance, and SPDX license compliance across all source files
+- **Community growth:** EMEA/APAC meeting cadence, YouTube channel for meeting recordings, DigitalOcean cloud sponsorship for testing
+
+## Recent Milestones
+
+- New OpenCost UI released (v1.0 via LFX mentorship)
+- OpenCost AI sub-project introduced (first PR merged)
+- MCP server released in v1.118 with right-sizing recommendations
+- KubeModel 1.0 shipped (Fall 2025 LFX mentorship)
+- SBOM generation integrated across core and UI repos (SPDX + CycloneDX)
+- OpenSSF Scorecard integration
+- Community Maintainer role introduced
+- Gateway API deployed for infrastructure
+- Spot node testing enabled in integration test cluster
+- Copilot AI review bot enabled across repositories (provided by CNCF)
+- OpenCost Specification v0.1 published
+- Collector data source shipped (alternative to Prometheus)
+
+## How to Influence the Roadmap
+
+- Join the [OpenCost Working Group](https://zoom-lfx.platform.linuxfoundation.org/meetings/opencost?view=week) (biweekly, alternating between EMEA/APAC at 15:00 UTC and NA at 21:00 UTC)
+- Propose changes via [GitHub Issues](https://github.com/opencost/opencost/issues)
+- Discuss ideas in the [#opencost](https://cloud-native.slack.com/archives/C03D56FPD4G) channel on [CNCF Slack](https://slack.cncf.io/)

+ 1 - 1
go.mod

@@ -26,7 +26,7 @@ require (
 	github.com/aws/aws-sdk-go-v2/credentials v1.19.12
 	github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.22.6
 	github.com/aws/aws-sdk-go-v2/service/athena v1.57.1
-	github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0
+	github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0
 	github.com/aws/aws-sdk-go-v2/service/s3 v1.97.1
 	github.com/aws/aws-sdk-go-v2/service/sts v1.41.9
 	github.com/aws/smithy-go v1.24.2

+ 2 - 2
go.sum

@@ -114,8 +114,8 @@ github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.21 h1:SwGMTMLIlvDNyhMteQ6r8IJSBPl
 github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.21/go.mod h1:UUxgWxofmOdAMuqEsSppbDtGKLfR04HGsD0HXzvhI1k=
 github.com/aws/aws-sdk-go-v2/service/athena v1.57.1 h1:I+YmJvkeAN+r4mUlH2CODoYyAkoP0cQWGItPPkK60hk=
 github.com/aws/aws-sdk-go-v2/service/athena v1.57.1/go.mod h1:yZ507NVXolOco9hA2+mKH3ELnLEOZ/4mGqkrp2phNYs=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0 h1:dgdIaG/GCiXMo16HAdFwpjt9Vn34bD2WVH5SiZdwzUc=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.293.0/go.mod h1:2dMnUs1QzlGzsm46i9oBHAxVHQp7b6qF7PljWcgVEVE=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0 h1:98Miqj16un1WLNyM1RjVDhXYumhqZrQfAeG8i4jPG6o=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.296.0/go.mod h1:T6ndRfdhnXLIY5oKBHjYZDVj706los2zGdpThppquvA=
 github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 h1:5EniKhLZe4xzL7a+fU3C2tfUN4nWIqlLesfrjkuPFTY=
 github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI=
 github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.12 h1:qtJZ70afD3ISKWnoX3xB0J2otEqu3LqicRcDBqsj0hQ=

+ 140 - 91
pkg/carbon/carbonassets.go

@@ -3,7 +3,6 @@ package carbon
 import (
 	"embed"
 	"encoding/csv"
-	"fmt"
 	"strconv"
 	"strings"
 
@@ -15,7 +14,11 @@ import (
 //go:embed carbonlookupdata.csv
 var f embed.FS
 
-type carbonLookupKeyDisk struct {
+// averageRegionKey is the fallback region label used in the lookup CSV when a
+// specific (provider, region, instanceType) tuple cannot be matched.
+const averageRegionKey = "average-region"
+
+type carbonLookupKeyRegion struct {
 	provider string
 	region   string
 }
@@ -26,15 +29,13 @@ type carbonLookupKeyNode struct {
 	instanceType string
 }
 
-var carbonLookupNode map[carbonLookupKeyNode]float64
-var carbonLookupDisk map[carbonLookupKeyDisk]float64
-
-// Opencost does not build network types
-var carbonValidInstanceTypes map[string]string
-var carbonValidRegions map[string]string
+var (
+	carbonLookupNode    map[carbonLookupKeyNode]float64
+	carbonLookupDisk    map[carbonLookupKeyRegion]float64
+	carbonLookupNetwork map[carbonLookupKeyRegion]float64
+)
 
 func init() {
-
 	carbonData, err := f.ReadFile("carbonlookupdata.csv")
 	if err != nil {
 		log.Errorf("Error getting content of carbon lookup file: %s", err)
@@ -43,129 +44,177 @@ func init() {
 
 	reader := csv.NewReader(strings.NewReader(string(carbonData)))
 
-	// skip header
-	_, err = reader.Read()
-	if err != nil {
-		log.Errorf("Error reading carbon lookup data: %s", err)
+	if _, err := reader.Read(); err != nil {
+		log.Errorf("Error reading carbon lookup header: %s", err)
 		return
 	}
 
-	dat, err := reader.ReadAll()
+	rows, err := reader.ReadAll()
 	if err != nil {
 		log.Errorf("Error reading carbon lookup data: %s", err)
 		return
 	}
 
 	carbonLookupNode = make(map[carbonLookupKeyNode]float64)
-	carbonLookupDisk = make(map[carbonLookupKeyDisk]float64)
-
-	carbonValidInstanceTypes = make(map[string]string)
-	carbonValidRegions = make(map[string]string)
-
-	for _, carbonItem := range dat {
+	carbonLookupDisk = make(map[carbonLookupKeyRegion]float64)
+	carbonLookupNetwork = make(map[carbonLookupKeyRegion]float64)
 
-		if coeff, err := strconv.ParseFloat(carbonItem[5], 64); err != nil {
-
-			panic(fmt.Errorf("error setting up carbon lookup table: malformed carbon cost '%s'", carbonItem[5]))
-
-		} else {
-
-			provider := carbonItem[0]
-			region := carbonItem[1]
-			instanceType := carbonItem[2]
-			assetType := carbonItem[3]
+	for _, row := range rows {
+		// Skip blank records (e.g. a trailing newline in the CSV).
+		if len(row) == 0 || (len(row) == 1 && strings.TrimSpace(row[0]) == "") {
+			continue
+		}
+		if len(row) < 6 {
+			log.Warnf("carbon: skipping malformed lookup row %v", row)
+			continue
+		}
 
-			switch assetType {
-			case "Node":
-				carbonLookupNode[carbonLookupKeyNode{
-					provider:     provider,
-					region:       region,
-					instanceType: instanceType,
-				}] = coeff
-			case "Disk":
-				carbonLookupDisk[carbonLookupKeyDisk{
-					provider: provider,
-					region:   region,
-				}] = coeff
-			}
+		coeff, err := strconv.ParseFloat(row[5], 64)
+		if err != nil {
+			log.Warnf("carbon: skipping row with malformed carbon coefficient %q", row[5])
+			continue
+		}
 
-			carbonValidInstanceTypes[instanceType] = provider
-			carbonValidRegions[region] = provider
+		provider := row[0]
+		region := row[1]
+		instanceType := row[2]
+		assetType := row[3]
 
+		switch assetType {
+		case "Node":
+			carbonLookupNode[carbonLookupKeyNode{
+				provider:     provider,
+				region:       region,
+				instanceType: instanceType,
+			}] = coeff
+		case "Disk":
+			carbonLookupDisk[carbonLookupKeyRegion{
+				provider: provider,
+				region:   region,
+			}] = coeff
+		case "Network":
+			carbonLookupNetwork[carbonLookupKeyRegion{
+				provider: provider,
+				region:   region,
+			}] = coeff
 		}
-
 	}
-
 }
 
 type CarbonRow struct {
 	Co2e float64 `json:"co2e"`
 }
 
+// RelateCarbonAssets returns an estimated CO2e value for each asset in the set.
+// The returned value is in metric tonnes of CO2e, consistent with the units of
+// the embedded lookup table (tonnes CO2e per hour of asset runtime).
 func RelateCarbonAssets(as *opencost.AssetSet) (map[string]CarbonRow, error) {
-
-	res := make(map[string]CarbonRow)
+	res := make(map[string]CarbonRow, len(as.Assets))
 
 	for key, asset := range as.Assets {
-
-		// If no valid region, default to per-provider calculated average
-		region, _ := util.GetRegion(asset.GetLabels())
-		if _, ok := carbonValidRegions[region]; !ok {
-			region = "average-region"
-		}
-
-		// If no valid instance type, also default to per-provider calculated average
-		instanceType, _ := util.GetInstanceType(asset.GetLabels())
-		if _, ok := carbonValidInstanceTypes[instanceType]; !ok {
-			region = "average-region"
+		coeff := lookupCarbonCoeff(asset)
+		res[key] = CarbonRow{
+			Co2e: coeff * asset.Minutes() / 60,
 		}
+	}
 
-		provider := getProviderFromProviderID(asset.GetProperties().ProviderID)
+	return res, nil
+}
 
-		// If we're not able to parse the provider id, try to fetch the provider from the carbon data
-		if provider == "" && region != "average-region" {
-			provider = carbonValidRegions[region]
-		} else {
-			if asset.Type() == opencost.NodeAssetType || asset.Type() == opencost.DiskAssetType {
-				log.DedupedErrorf(10, "Cannot infer region information for asset '%s'", asset.GetProperties().ProviderID)
+// lookupCarbonCoeff resolves the carbon coefficient (tonnes CO2e per hour) for
+// the given asset, falling back to the provider-wide average-region value when
+// a specific region or instance type is not present in the lookup table.
+func lookupCarbonCoeff(asset opencost.Asset) float64 {
+	props := asset.GetProperties()
+	provider := resolveProvider(asset)
+	if provider == "" {
+		if isCarbonTrackedAsset(asset.Type()) {
+			providerID := ""
+			if props != nil {
+				providerID = props.ProviderID
 			}
+			log.DedupedWarningf(10, "carbon: cannot infer provider for asset %q", providerID)
 		}
+		return 0
+	}
 
-		var carbonCoeff float64
-		switch asset.Type() {
-		case opencost.NodeAssetType:
-			carbonCoeff = carbonLookupNode[carbonLookupKeyNode{
-				provider:     provider,
-				region:       region,
-				instanceType: instanceType,
-			}]
-		case opencost.DiskAssetType:
-			carbonCoeff = carbonLookupDisk[carbonLookupKeyDisk{
-				provider: provider,
-				region:   region,
-			}]
-		}
+	region, _ := util.GetRegion(asset.GetLabels())
+	instanceType, _ := util.GetInstanceType(asset.GetLabels())
 
-		res[key] = CarbonRow{
-			Co2e: carbonCoeff * asset.Minutes() / 60,
+	switch asset.Type() {
+	case opencost.NodeAssetType:
+		if coeff, ok := carbonLookupNode[carbonLookupKeyNode{provider, region, instanceType}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupNode[carbonLookupKeyNode{provider, averageRegionKey, ""}]; ok {
+			log.DedupedWarningf(10, "carbon: falling back to average-region for node (provider=%s region=%q instanceType=%q)", provider, region, instanceType)
+			return coeff
+		}
+	case opencost.DiskAssetType:
+		if coeff, ok := carbonLookupDisk[carbonLookupKeyRegion{provider, region}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupDisk[carbonLookupKeyRegion{provider, averageRegionKey}]; ok {
+			log.DedupedWarningf(10, "carbon: falling back to average-region for disk (provider=%s region=%q)", provider, region)
+			return coeff
 		}
+	case opencost.NetworkAssetType:
+		if coeff, ok := carbonLookupNetwork[carbonLookupKeyRegion{provider, region}]; ok {
+			return coeff
+		}
+		if coeff, ok := carbonLookupNetwork[carbonLookupKeyRegion{provider, averageRegionKey}]; ok {
+			return coeff
+		}
+	}
+	return 0
+}
 
+func isCarbonTrackedAsset(t opencost.AssetType) bool {
+	switch t {
+	case opencost.NodeAssetType, opencost.DiskAssetType, opencost.NetworkAssetType:
+		return true
 	}
+	return false
+}
 
-	return res, nil
+// resolveProvider returns the canonical provider name for an asset. It prefers
+// the canonical Provider property populated by the cost model, falling back to
+// parsing the cloud provider ID when the property is missing.
+func resolveProvider(asset opencost.Asset) string {
+	props := asset.GetProperties()
+	if props == nil {
+		return ""
+	}
 
+	switch props.Provider {
+	case opencost.AWSProvider, opencost.GCPProvider, opencost.AzureProvider:
+		return props.Provider
+	}
+
+	return inferProviderFromProviderID(props.ProviderID)
 }
 
-func getProviderFromProviderID(providerid string) string {
+// inferProviderFromProviderID is a best-effort fallback that matches the
+// conventional shapes of Kubernetes Node `spec.providerID` values for the
+// cloud providers present in the embedded lookup data (AWS, GCP, Azure).
+//
+// Real-world formats:
+//   - AWS:   aws:///<availability-zone>/<instance-id>  (or raw "i-…")
+//   - GCP:   gce://<project>/<zone>/<instance-name>
+//   - Azure: azure:///subscriptions/<sub>/resourceGroups/<rg>/…
+func inferProviderFromProviderID(providerID string) string {
+	id := strings.ToLower(strings.TrimSpace(providerID))
+	if id == "" {
+		return ""
+	}
 
-	if strings.HasPrefix(providerid, "gke") {
-		return opencost.GCPProvider
-	} else if strings.HasPrefix(providerid, "i-") {
+	switch {
+	case strings.HasPrefix(id, "aws:"), strings.HasPrefix(id, "i-"):
 		return opencost.AWSProvider
-	} else if strings.HasPrefix(providerid, "azure") {
+	case strings.HasPrefix(id, "gce:"), strings.HasPrefix(id, "gke"):
+		return opencost.GCPProvider
+	case strings.HasPrefix(id, "azure:"):
 		return opencost.AzureProvider
 	}
-
 	return ""
-
 }

+ 254 - 0
pkg/carbon/carbonassets_test.go

@@ -0,0 +1,254 @@
+package carbon
+
+import (
+	"math"
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	v1 "k8s.io/api/core/v1"
+)
+
+const (
+	// Known-good row from carbonlookupdata.csv:
+	//   AWS,us-east-1,t4g.nano,Node,0.012788433076234564,4.84769853777516e-06
+	awsT4gNanoUSEast1Coeff = 4.84769853777516e-06
+
+	// AWS,average-region,,Node,0.186739186034359,7.278989705005508e-05
+	awsAvgRegionNodeCoeff = 7.278989705005508e-05
+
+	// AWS,us-east-1,,Network,0.001135,4.30243315e-7
+	awsUSEast1NetworkCoeff = 4.30243315e-7
+)
+
+// floatEqual compares floats at a tolerance appropriate for the lookup table
+// values, which are stored with full float64 precision in the CSV.
+func floatEqual(a, b float64) bool {
+	if a == b {
+		return true
+	}
+	return math.Abs(a-b) <= 1e-18+1e-12*math.Max(math.Abs(a), math.Abs(b))
+}
+
+func nodeWithLabels(provider, providerID, region, instanceType string, minutes float64) *opencost.Node {
+	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
+	end := start.Add(time.Duration(minutes) * time.Minute)
+	window := opencost.NewWindow(&start, &end)
+
+	n := opencost.NewNode("node", "cluster", providerID, start, end, window)
+	n.Properties.Provider = provider
+	labels := opencost.AssetLabels{}
+	if region != "" {
+		labels[v1.LabelTopologyRegion] = region
+	}
+	if instanceType != "" {
+		labels[v1.LabelInstanceTypeStable] = instanceType
+	}
+	n.Labels = labels
+	return n
+}
+
+func diskWithLabels(provider, providerID, region string, minutes float64) *opencost.Disk {
+	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
+	end := start.Add(time.Duration(minutes) * time.Minute)
+	window := opencost.NewWindow(&start, &end)
+
+	d := opencost.NewDisk("disk", "cluster", providerID, start, end, window)
+	d.Properties.Provider = provider
+	if region != "" {
+		d.Labels = opencost.AssetLabels{v1.LabelTopologyRegion: region}
+	}
+	return d
+}
+
+func networkWithLabels(provider, providerID, region string, minutes float64) *opencost.Network {
+	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
+	end := start.Add(time.Duration(minutes) * time.Minute)
+	window := opencost.NewWindow(&start, &end)
+
+	nw := opencost.NewNetwork("network", "cluster", providerID, start, end, window)
+	nw.Properties.Provider = provider
+	if region != "" {
+		nw.Labels = opencost.AssetLabels{v1.LabelTopologyRegion: region}
+	}
+	return nw
+}
+
+func TestInferProviderFromProviderID(t *testing.T) {
+	cases := []struct {
+		name string
+		id   string
+		want string
+	}{
+		{"empty", "", ""},
+		{"aws standard", "aws:///us-east-1a/i-0abc123", opencost.AWSProvider},
+		{"aws raw instance", "i-0abc123", opencost.AWSProvider},
+		{"gce standard", "gce://my-project/us-central1-a/gke-node-1", opencost.GCPProvider},
+		{"legacy gke prefix", "gke-node-1", opencost.GCPProvider},
+		{"azure standard", "azure:///subscriptions/x/resourceGroups/y/providers/Microsoft.Compute/virtualMachines/z", opencost.AzureProvider},
+		{"unknown prefix", "something-else", ""},
+		{"whitespace and case", "  AWS:///eu-west-1a/i-xyz  ", opencost.AWSProvider},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := inferProviderFromProviderID(tc.id); got != tc.want {
+				t.Fatalf("inferProviderFromProviderID(%q) = %q, want %q", tc.id, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestResolveProvider_PrefersCanonicalProperty(t *testing.T) {
+	// ProviderID is a GCP-shaped string but the canonical property says AWS.
+	// Canonical property wins.
+	n := nodeWithLabels(opencost.AWSProvider, "gce://foo/bar/baz", "us-east-1", "t4g.nano", 60)
+	if got := resolveProvider(n); got != opencost.AWSProvider {
+		t.Fatalf("resolveProvider = %q, want %q", got, opencost.AWSProvider)
+	}
+}
+
+func TestResolveProvider_FallsBackToProviderID(t *testing.T) {
+	// No canonical Provider property — must fall back to parsing ProviderID.
+	n := nodeWithLabels("", "gce://my-project/us-central1-a/gke-node-1", "us-central1", "e2-standard-2", 60)
+	if got := resolveProvider(n); got != opencost.GCPProvider {
+		t.Fatalf("resolveProvider = %q, want %q", got, opencost.GCPProvider)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_ExactMatch(t *testing.T) {
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsT4gNanoUSEast1Coeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g", got, awsT4gNanoUSEast1Coeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenRegionUnknown(t *testing.T) {
+	// Region is garbage; instance type is fine. Should fall back to
+	// (AWS, average-region, "") instead of returning zero.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///xx/i-1", "not-a-real-region", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenInstanceTypeUnknown(t *testing.T) {
+	// Region is real; instance type is unknown. Previously returned 0 because
+	// only the region was reset. Must now fall back to average-region.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "future-xxlarge", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_FallsBackWhenBothUnknown(t *testing.T) {
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///xx/i-1", "not-a-real-region", "future-xxlarge", 60)
+	if got := lookupCarbonCoeff(n); !floatEqual(got, awsAvgRegionNodeCoeff) {
+		t.Fatalf("lookupCarbonCoeff = %g, want %g (average-region fallback)", got, awsAvgRegionNodeCoeff)
+	}
+}
+
+func TestLookupCarbonCoeff_Node_ZeroForUnknownProvider(t *testing.T) {
+	n := nodeWithLabels("", "some-unknown-id", "us-east-1", "t4g.nano", 60)
+	if got := lookupCarbonCoeff(n); got != 0 {
+		t.Fatalf("lookupCarbonCoeff = %g, want 0", got)
+	}
+}
+
+func TestLookupCarbonCoeff_Disk_ExactMatch(t *testing.T) {
+	// The CSV contains several disk rows per (provider, region), one per
+	// disk type. They collide under a key of (provider, region), so we
+	// check the lookup against whatever value the table actually holds.
+	want, ok := carbonLookupDisk[carbonLookupKeyRegion{opencost.AWSProvider, "us-east-1"}]
+	if !ok || want == 0 {
+		t.Fatalf("expected AWS/us-east-1 disk coefficient to be loaded")
+	}
+	d := diskWithLabels(opencost.AWSProvider, "aws:///us-east-1a/vol-1", "us-east-1", 60)
+	if got := lookupCarbonCoeff(d); !floatEqual(got, want) {
+		t.Fatalf("lookupCarbonCoeff disk = %g, want %g", got, want)
+	}
+}
+
+func TestLookupCarbonCoeff_Disk_FallsBackWhenRegionUnknown(t *testing.T) {
+	d := diskWithLabels(opencost.AWSProvider, "aws:///xx/vol-1", "not-a-real-region", 60)
+	want, ok := carbonLookupDisk[carbonLookupKeyRegion{opencost.AWSProvider, averageRegionKey}]
+	if !ok {
+		t.Fatalf("expected AWS average-region disk coefficient to be loaded")
+	}
+	if got := lookupCarbonCoeff(d); !floatEqual(got, want) {
+		t.Fatalf("lookupCarbonCoeff disk fallback = %g, want %g", got, want)
+	}
+}
+
+func TestLookupCarbonCoeff_Network_Populated(t *testing.T) {
+	// Regression: Network rows were loaded but never consulted, so every
+	// Network asset produced 0 emissions.
+	nw := networkWithLabels(opencost.AWSProvider, "aws:///us-east-1a/net-1", "us-east-1", 60)
+	if got := lookupCarbonCoeff(nw); !floatEqual(got, awsUSEast1NetworkCoeff) {
+		t.Fatalf("lookupCarbonCoeff network = %g, want %g", got, awsUSEast1NetworkCoeff)
+	}
+}
+
+func TestRelateCarbonAssets_MinutesToHours(t *testing.T) {
+	// Coefficient is tonnes CO2e per hour, so 120 minutes should yield exactly
+	// twice the coefficient.
+	n := nodeWithLabels(opencost.AWSProvider, "aws:///us-east-1a/i-1", "us-east-1", "t4g.nano", 120)
+	as := opencost.NewAssetSet(*n.Window.Start(), *n.Window.End(), n)
+
+	rows, err := RelateCarbonAssets(as)
+	if err != nil {
+		t.Fatalf("RelateCarbonAssets: %v", err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("got %d rows, want 1", len(rows))
+	}
+	var row CarbonRow
+	for _, r := range rows {
+		row = r
+	}
+	want := awsT4gNanoUSEast1Coeff * 2
+	if !floatEqual(row.Co2e, want) {
+		t.Fatalf("Co2e = %g, want %g", row.Co2e, want)
+	}
+}
+
+func TestRelateCarbonAssets_ZeroForUnknownProvider(t *testing.T) {
+	n := nodeWithLabels("", "totally-unknown", "us-east-1", "t4g.nano", 60)
+	as := opencost.NewAssetSet(*n.Window.Start(), *n.Window.End(), n)
+
+	rows, err := RelateCarbonAssets(as)
+	if err != nil {
+		t.Fatalf("RelateCarbonAssets: %v", err)
+	}
+	for _, r := range rows {
+		if r.Co2e != 0 {
+			t.Fatalf("Co2e = %g, want 0 for unknown provider", r.Co2e)
+		}
+	}
+}
+
+func TestLookupCarbonCoeff_NoPanicOnNilProperties(t *testing.T) {
+	// A bare Node with nil Properties must not panic — older code would
+	// dereference props.ProviderID in the log line after resolveProvider
+	// returned "" for nil properties.
+	start := time.Date(2026, time.April, 1, 0, 0, 0, 0, time.UTC)
+	end := start.Add(60 * time.Minute)
+	window := opencost.NewWindow(&start, &end)
+	n := opencost.NewNode("node", "cluster", "", start, end, window)
+	n.Properties = nil
+
+	if got := lookupCarbonCoeff(n); got != 0 {
+		t.Fatalf("lookupCarbonCoeff with nil properties = %g, want 0", got)
+	}
+}
+
+func TestLookupTables_LoadedAtInit(t *testing.T) {
+	if len(carbonLookupNode) == 0 {
+		t.Error("carbonLookupNode is empty — init did not populate node lookups")
+	}
+	if len(carbonLookupDisk) == 0 {
+		t.Error("carbonLookupDisk is empty — init did not populate disk lookups")
+	}
+	if len(carbonLookupNetwork) == 0 {
+		t.Error("carbonLookupNetwork is empty — init did not populate network lookups")
+	}
+}

+ 45 - 8
pkg/cloud/alibaba/provider.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"os"
 	"regexp"
 	"strconv"
@@ -60,8 +61,8 @@ const (
 )
 
 var (
-	// Regular expression to get the numerical value of PV suffix with GiB from *v1.PersistentVolume.
-	sizeRegEx = regexp.MustCompile("(.*?)Gi")
+	// sizeRegEx parses a PV capacity string into a numeric part and an optional binary SI suffix (Ki, Mi, Gi, Ti).
+	sizeRegEx = regexp.MustCompile(`^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti)?$`)
 )
 
 // Variable to keep track of instance families that fail in DescribePrice API due improper defaulting of systemDisk if the information is not available
@@ -1296,21 +1297,57 @@ func generateSlimK8sNodeFromV1Node(node *clustercache.Node) *SlimK8sNode {
 	return NewSlimK8sNode(instanceType, regionID, priceUnit, memorySizeInKiB, osType, providerID, instanceFamily, IsIoOptimized, systemDisk)
 }
 
-// getNumericalValueFromResourceQuantity returns the numericalValue of the resourceQuantity
-// An example is: 20Gi returns to 20. If any error occurs it returns the default value used in describePrice API which is 2000.
+// getNumericalValueFromResourceQuantity converts a Kubernetes PV capacity string (e.g. "20Gi", "48828125Ki")
+// into a whole GiB integer string, as required by the Alibaba DescribePrice API.
+// Returns ALIBABA_DEFAULT_DATADISK_SIZE if the quantity cannot be parsed.
 func getNumericalValueFromResourceQuantity(quantity string) (value string) {
-	// defaulting when any panic or empty string occurs.
 	defer func() {
-		log.Debugf("unable to determine the size of the PV so defaulting the size to %s", ALIBABA_DEFAULT_DATADISK_SIZE)
 		if err := recover(); err != nil {
+			log.Debugf("panic while parsing PV capacity %q, defaulting to %s: %v", quantity, ALIBABA_DEFAULT_DATADISK_SIZE, err)
 			value = ALIBABA_DEFAULT_DATADISK_SIZE
 		}
 		if value == "" {
+			log.Debugf("unable to determine the size of the PV from quantity %q, defaulting to %s", quantity, ALIBABA_DEFAULT_DATADISK_SIZE)
 			value = ALIBABA_DEFAULT_DATADISK_SIZE
 		}
 	}()
-	res := sizeRegEx.FindAllStringSubmatch(quantity, 1)
-	value = res[0][1]
+
+	res := sizeRegEx.FindStringSubmatch(strings.TrimSpace(quantity))
+	if len(res) < 2 || res[1] == "" {
+		return
+	}
+
+	numericPart, err := strconv.ParseFloat(res[1], 64)
+	if err != nil || numericPart <= 0 {
+		return
+	}
+
+	unit := ""
+	if len(res) >= 3 {
+		unit = res[2]
+	}
+
+	var sizeInGiB float64
+	switch unit {
+	case "Ki":
+		sizeInGiB = numericPart / (1024 * 1024)
+	case "Mi":
+		sizeInGiB = numericPart / 1024
+	case "Gi":
+		sizeInGiB = numericPart
+	case "Ti":
+		sizeInGiB = numericPart * 1024
+	default:
+		sizeInGiB = numericPart / (1024 * 1024 * 1024)
+	}
+
+	// ceil so we never underreport disk size to the DescribePrice API.
+	sizeInGiBInt := int64(math.Ceil(sizeInGiB))
+	if sizeInGiBInt <= 0 {
+		return
+	}
+
+	value = strconv.FormatInt(sizeInGiBInt, 10)
 	return
 }
 

+ 69 - 3
pkg/cloud/alibaba/provider_test.go

@@ -770,20 +770,86 @@ func TestGetNumericalValueFromResourceQuantity(t *testing.T) {
 			expectedValue:        "10",
 		},
 		{
-			name:                 "negative scenario: when inputResourceQuantity is Gi",
+			name:                 "negative scenario: when inputResourceQuantity is Gi (no numeric prefix)",
 			inputResourceQuanity: "Gi",
 			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
 		},
 		{
-			name:                 "negative scenario: when inputResourceQuantity is 10",
+			name:                 "edge case: when inputResourceQuantity is 10 (plain bytes, rounds up to 1 GiB)",
 			inputResourceQuanity: "10",
-			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
+			expectedValue:        "1",
 		},
 		{
 			name:                 "negative scenario: when inputResourceQuantity is empty string",
 			inputResourceQuanity: "",
 			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
 		},
+		{
+			name:                 "negative scenario: when inputResourceQuantity is non-numeric",
+			inputResourceQuanity: "abc",
+			expectedValue:        ALIBABA_DEFAULT_DATADISK_SIZE,
+		},
+		{
+			// 48828125Ki / (1024*1024) = 46.875 GiB, rounds up to 47
+			name:                 "positive scenario: Ki unit - 48828125Ki",
+			inputResourceQuanity: "48828125Ki",
+			expectedValue:        "47",
+		},
+		{
+			name:                 "positive scenario: Ki unit - 1048576Ki (exactly 1 GiB)",
+			inputResourceQuanity: "1048576Ki",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Ki unit - 2097152Ki (exactly 2 GiB)",
+			inputResourceQuanity: "2097152Ki",
+			expectedValue:        "2",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 512Mi (rounds up to 1 GiB)",
+			inputResourceQuanity: "512Mi",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 1024Mi (exactly 1 GiB)",
+			inputResourceQuanity: "1024Mi",
+			expectedValue:        "1",
+		},
+		{
+			name:                 "positive scenario: Mi unit - 20480Mi (exactly 20 GiB)",
+			inputResourceQuanity: "20480Mi",
+			expectedValue:        "20",
+		},
+		{
+			name:                 "positive scenario: Gi unit - 20Gi",
+			inputResourceQuanity: "20Gi",
+			expectedValue:        "20",
+		},
+		{
+			name:                 "positive scenario: Gi unit - 100Gi",
+			inputResourceQuanity: "100Gi",
+			expectedValue:        "100",
+		},
+		{
+			name:                 "positive scenario: Ti unit - 1Ti",
+			inputResourceQuanity: "1Ti",
+			expectedValue:        "1024",
+		},
+		{
+			name:                 "positive scenario: Ti unit - 2Ti",
+			inputResourceQuanity: "2Ti",
+			expectedValue:        "2048",
+		},
+		{
+			name:                 "positive scenario: fractional Gi unit - 1.5Gi (rounds up to 2 GiB)",
+			inputResourceQuanity: "1.5Gi",
+			expectedValue:        "2",
+		},
+		{
+			name:                 "positive scenario: fractional Mi unit - 1536Mi (1.5 GiB, rounds up to 2 GiB)",
+			inputResourceQuanity: "1536Mi",
+			expectedValue:        "2",
+		},
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {

+ 124 - 24
pkg/cloud/aws/provider.go

@@ -53,6 +53,7 @@ const (
 
 	APIPricingSource              = "Public API"
 	SpotPricingSource             = "Spot Data Feed"
+	SpotPriceHistorySource        = "Spot Price History"
 	ReservedInstancePricingSource = "Savings Plan, Reserved Instance, and Out-Of-Cluster"
 	FargatePricingSource          = "Fargate"
 
@@ -96,11 +97,7 @@ func (aws *AWS) PricingSourceStatus() map[string]*models.PricingSource {
 		Enabled: true,
 	}
 
-	if !aws.SpotRefreshEnabled() {
-		sps.Available = false
-		sps.Error = "Spot instances not set up"
-		sps.Enabled = false
-	} else {
+	if aws.SpotFeedRefreshEnabled() {
 		sps.Error = ""
 		if aws.SpotPricingError != nil {
 			sps.Error = aws.SpotPricingError.Error()
@@ -112,9 +109,28 @@ func (aws *AWS) PricingSourceStatus() map[string]*models.PricingSource {
 		} else {
 			sps.Error = "No spot instances detected"
 		}
+	} else {
+		sps.Available = false
+		sps.Error = "Spot instances not set up"
+		sps.Enabled = false
 	}
 	sources[SpotPricingSource] = sps
 
+	sphs := &models.PricingSource{
+		Name:    SpotPriceHistorySource,
+		Enabled: true,
+	}
+	if aws.SpotPriceHistoryError != nil {
+		sphs.Error = aws.SpotPriceHistoryError.Error()
+		sphs.Available = false
+	} else if aws.SpotPriceHistoryCache == nil {
+		sphs.Error = "Not yet initialized"
+		sphs.Available = false
+	} else {
+		sphs.Available = true
+	}
+	sources[SpotPriceHistorySource] = sphs
+
 	rps := &models.PricingSource{
 		Name:    ReservedInstancePricingSource,
 		Enabled: true,
@@ -185,6 +201,8 @@ type AWS struct {
 	SpotRefreshRunning          bool
 	SpotPricingLock             sync.RWMutex
 	SpotPricingError            error
+	SpotPriceHistoryCache       *SpotPriceHistoryCache
+	SpotPriceHistoryError       error
 	RIPricingByInstanceID       map[string]*RIData
 	RIPricingError              error
 	RIDataRunning               bool
@@ -848,8 +866,8 @@ func (aws *AWS) getRegionPricing(nodeList []*clustercache.Node) (*http.Response,
 	return resp, pricingURL, err
 }
 
-// SpotRefreshEnabled determines whether the required configs to run the spot feed query have been set up
-func (aws *AWS) SpotRefreshEnabled() bool {
+// SpotFeedRefreshEnabled determines whether the required configs to run the spot feed query have been set up
+func (aws *AWS) SpotFeedRefreshEnabled() bool {
 	// Guard against nil receiver
 	if aws == nil {
 		return false
@@ -1019,28 +1037,36 @@ func (aws *AWS) DownloadPricingData() error {
 	}
 	log.Infof("Finished downloading \"%s\"", pricingURL)
 
-	if !aws.SpotRefreshEnabled() {
-		return nil
+	// Initialize a spot price history cache if not already initialized.
+	// Reset error to allow retrying on subsequent DownloadPricingData calls.
+	if aws.SpotPriceHistoryCache == nil {
+		aws.SpotPriceHistoryError = nil
+		aws.SpotPriceHistoryCache, aws.SpotPriceHistoryError = aws.initializeSpotPriceHistoryCache()
+		if aws.SpotPriceHistoryError != nil {
+			log.Errorf("Failed to initialize spot price history manager: %v", aws.SpotPriceHistoryError)
+		}
 	}
 
-	// Always run spot pricing refresh when performing download
-	aws.refreshSpotPricing(true)
+	if aws.SpotFeedRefreshEnabled() {
+		// Always run spot pricing refresh when performing download
+		aws.refreshSpotPricing(true)
 
-	// Only start a single refresh goroutine
-	if !aws.SpotRefreshRunning {
-		aws.SpotRefreshRunning = true
+		// Only start a single refresh goroutine
+		if !aws.SpotRefreshRunning {
+			aws.SpotRefreshRunning = true
 
-		go func() {
-			defer errs.HandlePanic()
+			go func() {
+				defer errs.HandlePanic()
 
-			for {
-				log.Infof("Spot Pricing Refresh scheduled in %.2f minutes.", SpotRefreshDuration.Minutes())
-				time.Sleep(SpotRefreshDuration)
+				for {
+					log.Infof("Spot Pricing Refresh scheduled in %.2f minutes.", SpotRefreshDuration.Minutes())
+					time.Sleep(SpotRefreshDuration)
 
-				// Reoccurring refresh checks update times
-				aws.refreshSpotPricing(false)
-			}
-		}()
+					// Reoccurring refresh checks update times
+					aws.refreshSpotPricing(false)
+				}
+			}()
+		}
 	}
 
 	return nil
@@ -1278,6 +1304,60 @@ func (aws *AWS) refreshSpotPricing(force bool) {
 	aws.SpotPricingByInstanceID = sp
 }
 
+func (aws *AWS) initializeSpotPriceHistoryCache() (*SpotPriceHistoryCache, error) {
+	log.Info("Initializing AWS Spot Price History Manager")
+
+	// Get AWS access key for creating config
+	accessKey, err := aws.GetAWSAccessKey()
+	if err != nil {
+		return nil, fmt.Errorf("getting AWS access key for spot price history: %w", err)
+	}
+
+	// Use the cluster region to create the initial AWS config and credentials.
+	// The SpotPriceHistoryFetcher itself can query multiple regions by creating
+	// region-specific EC2 clients as needed.
+	if aws.ClusterRegion == "" {
+		return nil, fmt.Errorf("no cluster region configured")
+	}
+
+	// Create config for the cluster region
+	awsConfig, err := accessKey.CreateConfig(aws.ClusterRegion)
+	if err != nil {
+		return nil, fmt.Errorf("creating AWS config for spot price history: %w", err)
+	}
+
+	return NewSpotPriceHistoryCache(NewAWSSpotPriceHistoryFetcher(awsConfig)), nil
+}
+
+func (aws *AWS) spotPricingFromHistory(k models.Key) (*SpotPriceHistoryEntry, bool) {
+	if aws.SpotPriceHistoryCache == nil {
+		return nil, false
+	}
+
+	// Extract region, instance type, and availability zone from the key
+	awsKey, ok := k.(*awsKey)
+	if !ok {
+		log.DedupedWarningf(10, "Failed to cast key to awsKey for spot price history lookup: %s", k.ID())
+		return nil, false
+	}
+
+	region, regionOk := util.GetRegion(awsKey.Labels)
+	instanceType, instanceTypeOk := util.GetInstanceType(awsKey.Labels)
+	availabilityZone, availabilityZoneOk := util.GetZone(awsKey.Labels)
+	// Skip lookup if any required information is missing
+	if !regionOk || !instanceTypeOk || !availabilityZoneOk {
+		log.DedupedWarningf(10, "Missing required info for spot price history lookup (region: %s, instanceType: %s, zone: %s): %s", region, instanceType, availabilityZone, k.ID())
+		return nil, false
+	}
+
+	price, err := aws.SpotPriceHistoryCache.GetSpotPrice(region, instanceType, availabilityZone)
+	if err != nil {
+		log.DedupedWarningf(10, "Failed to get spot price history for instance %s: %s", k.ID(), err.Error())
+		return nil, false
+	}
+	return price, true
+}
+
 // Stubbed NetworkPricing for AWS. Pull directly from aws.json for now
 func (aws *AWS) NetworkPricing() (*models.Network, error) {
 	cpricing, err := aws.Config.GetCustomPricingData()
@@ -1403,9 +1483,29 @@ func (aws *AWS) createNode(terms *AWSProductTerms, usageType string, k models.Ke
 			UsageType:    PreemptibleType,
 		}, meta, nil
 	} else if aws.isPreemptible(key) { // Preemptible but we don't have any data in the pricing report.
-		log.DedupedWarningf(5, "Node %s marked preemptible but we have no data in spot feed", k.ID())
+		log.DedupedWarningf(5, "Node %s marked preemptible but no spot feed data available; falling back to other pricing sources", k.ID())
+
+		// Try to get spot pricing from DescribeSpotPriceHistory API
+		if historyEntry, ok := aws.spotPricingFromHistory(k); ok {
+			log.DedupedInfof(5, "Using spot price history data for node %s: $%f", k.ID(), historyEntry.SpotPrice)
+			spotHistoryCost := fmt.Sprintf("%f", historyEntry.SpotPrice)
+			meta.Source = SpotPriceHistorySource
+			return &models.Node{
+				Cost:         spotHistoryCost,
+				VCPU:         terms.VCpu,
+				RAM:          terms.Memory,
+				GPU:          terms.GPU,
+				Storage:      terms.Storage,
+				BaseCPUPrice: aws.BaseCPUPrice,
+				BaseRAMPrice: aws.BaseRAMPrice,
+				BaseGPUPrice: aws.BaseGPUPrice,
+				UsageType:    PreemptibleType,
+			}, meta, nil
+		}
+
 		if publicPricingFound {
 			// return public price if found
+			log.DedupedWarningf(5, "No spot price history available for %s, falling back to on-demand pricing", k.ID())
 			return &models.Node{
 				Cost:         cost,
 				VCPU:         terms.VCpu,

+ 377 - 7
pkg/cloud/aws/provider_test.go

@@ -2,12 +2,14 @@ package aws
 
 import (
 	"encoding/json"
+	"errors"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/pkg/cloud/models"
@@ -867,7 +869,7 @@ func (f *fakeProviderConfig) ConfigFileManager() *config.ConfigFileManager {
 	return nil
 }
 
-func TestAWS_SpotRefreshEnabled(t *testing.T) {
+func TestAWS_SpotFeedRefreshEnabled(t *testing.T) {
 	tests := []struct {
 		name                string
 		spotDataBucket      string
@@ -955,9 +957,9 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 				},
 			}
 
-			got := aws.SpotRefreshEnabled()
+			got := aws.SpotFeedRefreshEnabled()
 			if got != tt.want {
-				t.Errorf("AWS.SpotRefreshEnabled() = %v, want %v", got, tt.want)
+				t.Errorf("AWS.SpotFeedRefreshEnabled() = %v, want %v", got, tt.want)
 			}
 		})
 	}
@@ -971,10 +973,10 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 			Config:         nil, // nil Config should not cause panic
 		}
 
-		got := aws.SpotRefreshEnabled()
+		got := aws.SpotFeedRefreshEnabled()
 		want := true // Should fall back to field-based check
 		if got != want {
-			t.Errorf("AWS.SpotRefreshEnabled() with nil Config = %v, want %v", got, want)
+			t.Errorf("AWS.SpotFeedRefreshEnabled() with nil Config = %v, want %v", got, want)
 		}
 	})
 
@@ -986,10 +988,378 @@ func TestAWS_SpotRefreshEnabled(t *testing.T) {
 			Config:         nil, // nil Config should not cause panic
 		}
 
-		got := aws.SpotRefreshEnabled()
+		got := aws.SpotFeedRefreshEnabled()
 		want := false // No fields set, should return false
 		if got != want {
-			t.Errorf("AWS.SpotRefreshEnabled() with nil Config and no fields = %v, want %v", got, want)
+			t.Errorf("AWS.SpotFeedRefreshEnabled() with nil Config and no fields = %v, want %v", got, want)
+		}
+	})
+}
+
+func TestAWS_spotPricingFromHistory(t *testing.T) {
+	t.Run("nil cache returns false", func(t *testing.T) {
+		aws := &AWS{}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when cache is nil")
+		}
+	})
+
+	t.Run("missing region label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when region label is missing")
+		}
+	})
+
+	t.Run("missing instance type label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region": "us-east-1",
+				"topology.kubernetes.io/zone":   "us-east-1a",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when instance type label is missing")
+		}
+	})
+
+	t.Run("missing zone label returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when zone label is missing")
+		}
+	})
+
+	t.Run("fetcher error returns false", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("api error")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		_, ok := aws.spotPricingFromHistory(key)
+		if ok {
+			t.Error("Expected false when fetcher returns error")
+		}
+	})
+
+	t.Run("successful lookup returns entry", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				if key.Region != "us-east-1" || key.InstanceType != "m5.large" || key.AvailabilityZone != "us-east-1a" {
+					t.Errorf("Unexpected key: %v", key)
+				}
+				return &SpotPriceHistoryEntry{
+					SpotPrice:   0.042,
+					Timestamp:   time.Now(),
+					RetrievedAt: time.Now(),
+				}, nil
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+		}
+		key := &awsKey{
+			ProviderID: "aws:///us-east-1a/i-0123456789abcdef0",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+			},
+		}
+		entry, ok := aws.spotPricingFromHistory(key)
+		if !ok {
+			t.Fatal("Expected true for successful lookup")
+		}
+		if entry.SpotPrice != 0.042 {
+			t.Errorf("Expected spot price 0.042, got %f", entry.SpotPrice)
+		}
+	})
+}
+
+func TestAWS_createNode_spotHistoryFallback(t *testing.T) {
+	// Helper to build AWSProductTerms with on-demand pricing
+	makeTerms := func(sku, offerTermCode, cost string) *AWSProductTerms {
+		priceKey := sku + "." + offerTermCode + "." + HourlyRateCode
+		return &AWSProductTerms{
+			Sku: sku,
+			OnDemand: &AWSOfferTerm{
+				Sku:           sku,
+				OfferTermCode: offerTermCode,
+				PriceDimensions: map[string]*AWSRateCode{
+					priceKey: {
+						Unit:         "Hrs",
+						PricePerUnit: AWSCurrencyCode{USD: cost},
+					},
+				},
+			},
+			VCpu:   "4",
+			Memory: "16",
+		}
+	}
+
+	t.Run("preemptible node uses spot history when available", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return &SpotPriceHistoryEntry{
+					SpotPrice:   0.035,
+					Timestamp:   time.Now(),
+					RetrievedAt: time.Now(),
+				}, nil
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		// Key with PreemptibleType suffix to trigger isPreemptible
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, meta, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.035000" {
+			t.Errorf("Expected spot history cost 0.035000, got %s", node.Cost)
+		}
+		if node.UsageType != PreemptibleType {
+			t.Errorf("Expected usage type %s, got %s", PreemptibleType, node.UsageType)
+		}
+		if meta.Source != SpotPriceHistorySource {
+			t.Errorf("Expected source %s, got %s", SpotPriceHistorySource, meta.Source)
+		}
+	})
+
+	t.Run("preemptible node falls back to on-demand when history unavailable", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("no data")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.096" {
+			t.Errorf("Expected on-demand cost 0.096, got %s", node.Cost)
+		}
+		if node.UsageType != PreemptibleType {
+			t.Errorf("Expected usage type %s, got %s", PreemptibleType, node.UsageType)
+		}
+	})
+
+	t.Run("preemptible node with nil cache falls back to on-demand", func(t *testing.T) {
+		aws := &AWS{
+			BaseCPUPrice: "0.04",
+			BaseRAMPrice: "0.01",
+			BaseGPUPrice: "0.95",
+		}
+		terms := makeTerms("SKU123", "JRTCKXETXF", "0.096")
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.Cost != "0.096" {
+			t.Errorf("Expected on-demand cost 0.096, got %s", node.Cost)
+		}
+	})
+
+	t.Run("preemptible node uses base spot prices when no public pricing", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{
+			fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+				return nil, errors.New("no data")
+			},
+		}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			BaseCPUPrice:          "0.04",
+			BaseRAMPrice:          "0.01",
+			BaseGPUPrice:          "0.95",
+			BaseSpotCPUPrice:      "0.02",
+			BaseSpotRAMPrice:      "0.005",
+		}
+		// Terms without valid pricing dimensions
+		terms := &AWSProductTerms{
+			Sku: "SKU123",
+			OnDemand: &AWSOfferTerm{
+				Sku:             "SKU123",
+				OfferTermCode:   "JRTCKXETXF",
+				PriceDimensions: map[string]*AWSRateCode{},
+			},
+			VCpu:   "4",
+			Memory: "16",
+		}
+		key := &awsKey{
+			ProviderID:     "aws:///us-east-1a/i-0123456789abcdef0",
+			SpotLabelName:  "eks.amazonaws.com/capacityType",
+			SpotLabelValue: "SPOT",
+			Labels: map[string]string{
+				"topology.kubernetes.io/region":    "us-east-1",
+				"topology.kubernetes.io/zone":      "us-east-1a",
+				"node.kubernetes.io/instance-type": "m5.large",
+				"kubernetes.io/os":                 "linux",
+				"eks.amazonaws.com/capacityType":   "SPOT",
+			},
+		}
+
+		node, _, err := aws.createNode(terms, PreemptibleType, key)
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+		if node.VCPUCost != "0.02" {
+			t.Errorf("Expected base spot CPU price 0.02, got %s", node.VCPUCost)
+		}
+		if node.RAMCost != "0.005" {
+			t.Errorf("Expected base spot RAM price 0.005, got %s", node.RAMCost)
+		}
+	})
+}
+
+func TestAWS_PricingSourceStatus_spotPriceHistory(t *testing.T) {
+	t.Run("not yet initialized", func(t *testing.T) {
+		aws := &AWS{
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs, ok := sources[SpotPriceHistorySource]
+		if !ok {
+			t.Fatal("Expected SpotPriceHistorySource in sources")
+		}
+		if sphs.Available {
+			t.Error("Expected Available=false when cache not initialized")
+		}
+		if sphs.Error != "Not yet initialized" {
+			t.Errorf("Expected 'Not yet initialized' error, got %q", sphs.Error)
+		}
+	})
+
+	t.Run("initialization error", func(t *testing.T) {
+		aws := &AWS{
+			SpotPriceHistoryError: errors.New("no cluster region configured"),
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs := sources[SpotPriceHistorySource]
+		if sphs.Available {
+			t.Error("Expected Available=false on error")
+		}
+		if sphs.Error != "no cluster region configured" {
+			t.Errorf("Expected error message, got %q", sphs.Error)
+		}
+	})
+
+	t.Run("successfully initialized", func(t *testing.T) {
+		mockFetcher := &mockSpotPriceHistoryFetcher{}
+		aws := &AWS{
+			SpotPriceHistoryCache: NewSpotPriceHistoryCache(mockFetcher),
+			Config: &fakeProviderConfig{
+				customPricing: &models.CustomPricing{},
+			},
+		}
+		sources := aws.PricingSourceStatus()
+		sphs := sources[SpotPriceHistorySource]
+		if !sphs.Available {
+			t.Error("Expected Available=true when cache initialized")
 		}
 	})
 }

+ 200 - 0
pkg/cloud/aws/spotpricehistory.go

@@ -0,0 +1,200 @@
+package aws
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	awsSDK "github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2Types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+
+	"github.com/opencost/opencost/core/pkg/log"
+)
+
+// SpotPriceHistoryKey uniquely identifies a spot price lookup by region,
+// instance type, and availability zone.
+type SpotPriceHistoryKey struct {
+	Region           string
+	InstanceType     string
+	AvailabilityZone string
+}
+
+func (key SpotPriceHistoryKey) String() string {
+	return fmt.Sprintf("%s/%s/%s", key.Region, key.InstanceType, key.AvailabilityZone)
+}
+
+const (
+	SpotPriceHistoryCacheAge = 1 * time.Hour
+)
+
+// SpotPriceHistoryEntry holds a cached spot price from the DescribeSpotPriceHistory API.
+type SpotPriceHistoryEntry struct {
+	SpotPrice float64
+	Timestamp time.Time
+
+	RetrievedAt time.Time
+	Error       error // Negative cache
+}
+
+func (spe SpotPriceHistoryEntry) shouldRefresh() bool {
+	return time.Since(spe.RetrievedAt) > SpotPriceHistoryCacheAge
+}
+
+// SpotPriceHistoryCache provides a thread-safe, on-demand cache for spot prices
+// retrieved via the DescribeSpotPriceHistory API. Entries are cached for
+// SpotPriceHistoryCacheAge and include negative caching for errors.
+type SpotPriceHistoryCache struct {
+	cache          map[SpotPriceHistoryKey]*SpotPriceHistoryEntry
+	mutex          sync.Mutex
+	refreshRunning map[SpotPriceHistoryKey]bool
+	refreshCond    *sync.Cond
+
+	fetcher SpotPriceHistoryFetcher
+}
+
+func NewSpotPriceHistoryCache(fetcher SpotPriceHistoryFetcher) *SpotPriceHistoryCache {
+	cache := &SpotPriceHistoryCache{
+		cache:          make(map[SpotPriceHistoryKey]*SpotPriceHistoryEntry),
+		refreshRunning: make(map[SpotPriceHistoryKey]bool),
+
+		fetcher: fetcher,
+	}
+	cache.refreshCond = sync.NewCond(&cache.mutex)
+	return cache
+}
+
+// GetSpotPrice returns the cached spot price for the given region, instance type,
+// and availability zone. If the cache entry is missing or stale, it fetches a
+// fresh value from the underlying SpotPriceHistoryFetcher.
+func (sph *SpotPriceHistoryCache) GetSpotPrice(region, instanceType, availabilityZone string) (*SpotPriceHistoryEntry, error) {
+	key := SpotPriceHistoryKey{
+		Region:           region,
+		InstanceType:     instanceType,
+		AvailabilityZone: availabilityZone,
+	}
+	sph.mutex.Lock()
+	for sph.refreshRunning[key] {
+		sph.refreshCond.Wait()
+	}
+	// Check if we have cached price. If so, return it.
+	entry, exists := sph.cache[key]
+	if exists && !entry.shouldRefresh() {
+		sph.mutex.Unlock()
+		return entry, entry.Error
+	}
+	// Either a cache entry does not exist or it is stale. Refresh it.
+	sph.refreshRunning[key] = true
+	sph.mutex.Unlock()
+
+	// Ensure refreshRunning is always cleared, even if the fetcher panics.
+	defer func() {
+		sph.mutex.Lock()
+		delete(sph.refreshRunning, key)
+		sph.refreshCond.Broadcast()
+		sph.mutex.Unlock()
+	}()
+
+	// Fetch the entry
+	entry, err := sph.fetcher.FetchSpotPrice(key)
+	if err != nil || entry == nil {
+		// If we fail to fetch or get a nil entry, create a negative cache entry.
+		if err == nil {
+			err = fmt.Errorf("fetcher returned nil entry for %s", key)
+		}
+		entry = &SpotPriceHistoryEntry{
+			RetrievedAt: time.Now(),
+			Error:       err,
+		}
+	} else {
+		// Normalize cache metadata so cache freshness does not depend on
+		// the fetcher setting these fields correctly.
+		entry.RetrievedAt = time.Now()
+		entry.Error = nil
+	}
+
+	// Store it into the cache
+	sph.mutex.Lock()
+	sph.cache[key] = entry
+	sph.mutex.Unlock()
+	return entry, entry.Error
+}
+
+// SpotPriceHistoryFetcher is the interface for fetching spot prices from the
+// DescribeSpotPriceHistory API (or a mock for testing).
+type SpotPriceHistoryFetcher interface {
+	FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error)
+}
+
+// AWSSpotPriceHistoryFetcher implements SpotPriceHistoryFetcher using the real
+// AWS EC2 DescribeSpotPriceHistory API. It maintains a pool of per-region
+// EC2 clients.
+type AWSSpotPriceHistoryFetcher struct {
+	awsConfig       awsSDK.Config
+	ec2ClientsMutex sync.Mutex
+	ec2Clients      map[string]*ec2.Client
+}
+
+func NewAWSSpotPriceHistoryFetcher(awsConfig awsSDK.Config) *AWSSpotPriceHistoryFetcher {
+	return &AWSSpotPriceHistoryFetcher{
+		awsConfig:  awsConfig,
+		ec2Clients: make(map[string]*ec2.Client),
+	}
+}
+
+func (a *AWSSpotPriceHistoryFetcher) getEC2Client(region string) *ec2.Client {
+	a.ec2ClientsMutex.Lock()
+	defer a.ec2ClientsMutex.Unlock()
+	if client, ok := a.ec2Clients[region]; ok {
+		return client
+	}
+	config := a.awsConfig
+	config.Region = region
+	client := ec2.NewFromConfig(config)
+	a.ec2Clients[region] = client
+	return client
+}
+
+func (a *AWSSpotPriceHistoryFetcher) FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+	log.Debugf("Retrieving spot price history for %s", key)
+	client := a.getEC2Client(key.Region)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	input := &ec2.DescribeSpotPriceHistoryInput{
+		InstanceTypes:    []ec2Types.InstanceType{ec2Types.InstanceType(key.InstanceType)},
+		AvailabilityZone: awsSDK.String(key.AvailabilityZone),
+		// Only retrieve Linux/UNIX (Amazon VPC) prices. The non-VPC
+		// "Linux/UNIX" variant was for EC2-Classic, which was fully retired in
+		// August 2023.
+		ProductDescriptions: []string{
+			"Linux/UNIX (Amazon VPC)",
+		},
+		// Only retrieve the latest price.
+		MaxResults: awsSDK.Int32(1),
+	}
+
+	resp, err := client.DescribeSpotPriceHistory(ctx, input)
+	if err != nil {
+		return nil, fmt.Errorf("describing spot price history for %s: %w", key, err)
+	}
+	if len(resp.SpotPriceHistory) == 0 {
+		return nil, fmt.Errorf("no spot price history found for %s", key)
+	}
+	spotPrice := resp.SpotPriceHistory[0]
+
+	if spotPrice.SpotPrice == nil || spotPrice.Timestamp == nil {
+		return nil, fmt.Errorf("missing required spot price history data for %s (SpotPrice=%v, Timestamp=%v)", key, spotPrice.SpotPrice, spotPrice.Timestamp)
+	}
+	price, err := strconv.ParseFloat(*spotPrice.SpotPrice, 64)
+	if err != nil {
+		return nil, fmt.Errorf("parsing spot price: %w", err)
+	}
+	return &SpotPriceHistoryEntry{
+		SpotPrice:   price,
+		Timestamp:   *spotPrice.Timestamp,
+		RetrievedAt: time.Now(),
+	}, nil
+}

+ 239 - 0
pkg/cloud/aws/spotpricehistory_test.go

@@ -0,0 +1,239 @@
+package aws
+
+import (
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+type mockSpotPriceHistoryFetcher struct {
+	fetchFunc func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error)
+}
+
+func (m *mockSpotPriceHistoryFetcher) FetchSpotPrice(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+	if m.fetchFunc != nil {
+		return m.fetchFunc(key)
+	}
+	return &SpotPriceHistoryEntry{
+		SpotPrice:   0.05,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now(),
+	}, nil
+}
+
+func TestSpotPriceHistoryCache_GetSpotPrice_CacheHit(t *testing.T) {
+	mockFetcher := &mockSpotPriceHistoryFetcher{}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	region := "us-west-2"
+	instanceType := "m5.large"
+	availabilityZone := "us-west-2a"
+
+	key := SpotPriceHistoryKey{
+		Region:           region,
+		InstanceType:     instanceType,
+		AvailabilityZone: availabilityZone,
+	}
+
+	cachedEntry := &SpotPriceHistoryEntry{
+		SpotPrice:   0.08,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now(),
+	}
+	cache.cache[key] = cachedEntry
+
+	entry, err := cache.GetSpotPrice(region, instanceType, availabilityZone)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if entry.SpotPrice != 0.08 {
+		t.Errorf("Expected spot price 0.08, got %f", entry.SpotPrice)
+	}
+}
+
+func TestSpotPriceHistoryCache_GetSpotPrice_CacheMiss(t *testing.T) {
+	fetchCalled := false
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCalled = true
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.12,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if !fetchCalled {
+		t.Error("Expected fetcher to be called for cache miss")
+	}
+	if entry.SpotPrice != 0.12 {
+		t.Errorf("Expected spot price 0.12, got %f", entry.SpotPrice)
+	}
+}
+
+func TestSpotPriceHistoryCache_GetSpotPrice_ConcurrentSameKey(t *testing.T) {
+	var fetchCount atomic.Int32
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCount.Add(1)
+			// Simulate slow API call to increase chance of concurrent access
+			time.Sleep(50 * time.Millisecond)
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.07,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	const goroutines = 10
+	var wg sync.WaitGroup
+	wg.Add(goroutines)
+	for i := 0; i < goroutines; i++ {
+		go func() {
+			defer wg.Done()
+			entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+			if err != nil {
+				t.Errorf("Expected no error, got %v", err)
+			}
+			if entry.SpotPrice != 0.07 {
+				t.Errorf("Expected spot price 0.07, got %f", entry.SpotPrice)
+			}
+		}()
+	}
+	wg.Wait()
+
+	if count := fetchCount.Load(); count != 1 {
+		t.Errorf("Expected exactly 1 fetch call, got %d", count)
+	}
+}
+
+func TestSpotPriceHistoryCache_GetSpotPrice_StaleEntry(t *testing.T) {
+	fetchCalled := false
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			fetchCalled = true
+			return &SpotPriceHistoryEntry{
+				SpotPrice:   0.15,
+				Timestamp:   time.Now(),
+				RetrievedAt: time.Now(),
+			}, nil
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+
+	staleEntry := &SpotPriceHistoryEntry{
+		SpotPrice:   0.08,
+		Timestamp:   time.Now(),
+		RetrievedAt: time.Now().Add(-2 * time.Hour),
+	}
+	cache.cache[key] = staleEntry
+
+	entry, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if !fetchCalled {
+		t.Error("Expected fetcher to be called for stale entry")
+	}
+	if entry.SpotPrice != 0.15 {
+		t.Errorf("Expected refreshed spot price 0.15, got %f", entry.SpotPrice)
+	}
+}
+
+func TestSpotPriceHistoryCache_GetSpotPrice_FetchError(t *testing.T) {
+	expectedError := errors.New("fetch failed")
+	mockFetcher := &mockSpotPriceHistoryFetcher{
+		fetchFunc: func(key SpotPriceHistoryKey) (*SpotPriceHistoryEntry, error) {
+			return nil, expectedError
+		},
+	}
+	cache := NewSpotPriceHistoryCache(mockFetcher)
+
+	_, err := cache.GetSpotPrice("us-west-2", "m5.large", "us-west-2a")
+	if err == nil {
+		t.Error("Expected error from failed fetch")
+	}
+	if !errors.Is(err, expectedError) {
+		t.Errorf("Expected error %v, got %v", expectedError, err)
+	}
+
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+	cachedEntry := cache.cache[key]
+	if !errors.Is(cachedEntry.Error, expectedError) {
+		t.Errorf("Expected cached entry error %v, got %v", expectedError, cachedEntry.Error)
+	}
+}
+
+func TestSpotPriceHistoryEntry_shouldRefresh(t *testing.T) {
+	now := time.Now()
+
+	tests := []struct {
+		name        string
+		retrievedAt time.Time
+		expected    bool
+	}{
+		{
+			name:        "fresh entry",
+			retrievedAt: now,
+			expected:    false,
+		},
+		{
+			name:        "stale entry",
+			retrievedAt: now.Add(-2 * time.Hour),
+			expected:    true,
+		},
+		{
+			name:        "borderline entry",
+			retrievedAt: now.Add(-SpotPriceHistoryCacheAge + 1*time.Minute),
+			expected:    false,
+		},
+		{
+			name:        "expired entry",
+			retrievedAt: now.Add(-SpotPriceHistoryCacheAge - 1*time.Minute),
+			expected:    true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			entry := SpotPriceHistoryEntry{
+				RetrievedAt: tt.retrievedAt,
+			}
+			if got := entry.shouldRefresh(); got != tt.expected {
+				t.Errorf("shouldRefresh() = %v, want %v", got, tt.expected)
+			}
+		})
+	}
+}
+
+func TestSpotPriceHistoryKey_String(t *testing.T) {
+	key := SpotPriceHistoryKey{
+		Region:           "us-west-2",
+		InstanceType:     "m5.large",
+		AvailabilityZone: "us-west-2a",
+	}
+	expected := "us-west-2/m5.large/us-west-2a"
+	if got := key.String(); got != expected {
+		t.Errorf("String() = %v, want %v", got, expected)
+	}
+}

+ 0 - 1
pkg/cloud/provider/providerconfig.go

@@ -88,7 +88,6 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*models.CustomPrici
 
 	// File Doesn't Exist
 	if !exists {
-		log.Infof("Could not find Custom Pricing file at path '%s'", pc.configFile.Path())
 		pc.customPricing = DefaultPricing()
 		// If config file is not present use the contents from mount models/ as pricing data
 		// in closed source rather than from from  DefaultPricing as first source of truth.

+ 4 - 0
pkg/cmd/costmodel/costmodel.go

@@ -89,6 +89,10 @@ func Execute(conf *Config) error {
 		}
 	} else if conf.MCPServerEnabled {
 		log.Warnf("MCP Server is enabled but Kubernetes is not available. MCP server requires Kubernetes to function.")
+	} else {
+		if value, exists := os.LookupEnv(env.MCPServerEnabledEnvVar); !exists || value == "" {
+			log.Infof("MCP server is now disabled by default. If you wish to use the MCP server, please set the %s environment variable to true.", env.MCPServerEnabledEnvVar)
+		}
 	}
 
 	apiutil.ApplyContainerDiagnosticEndpoints(router)

+ 9 - 0
pkg/costmodel/csv_export.go

@@ -183,6 +183,15 @@ func (e *csvExporter) writeCSVToWriter(ctx context.Context, w io.Writer, dates [
 				return data.alloc.Properties.Namespace
 			},
 		},
+		{
+			column: "Cluster",
+			value: func(data rowData) string {
+				if data.alloc.Properties == nil {
+					return ""
+				}
+				return data.alloc.Properties.Cluster
+			},
+		},
 		{
 			column: "ControllerKind",
 			value: func(data rowData) string {

+ 8 - 7
pkg/costmodel/csv_export_test.go

@@ -49,6 +49,7 @@ func Test_UpdateCSV(t *testing.T) {
 								},
 							}, // 2 PVBytes, 2 PVCost
 							Properties: &opencost.AllocationProperties{
+								Cluster:        "test-cluster",
 								Namespace:      "test-namespace",
 								Controller:     "test-controller-name",
 								ControllerKind: "test-controller-kind",
@@ -66,8 +67,8 @@ func Test_UpdateCSV(t *testing.T) {
 		assert.Len(t, model.ComputeAllocationCalls(), 1)
 		assert.Equal(t, time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].Start)
 		assert.Equal(t, time.Date(2021, 1, 2, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].End)
-		assert.Equal(t, `Date,Namespace,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost
-2021-01-01,test-namespace,test-controller-kind,test-controller-name,test-pod,test-container,0.1,0.2,0.4,0.5,11,10,2,2,0.3,0.6,0.9,2,0.8,4.6000000000000005
+		assert.Equal(t, `Date,Namespace,Cluster,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost
+2021-01-01,test-namespace,test-cluster,test-controller-kind,test-controller-name,test-pod,test-container,0.1,0.2,0.4,0.5,11,10,2,2,0.3,0.6,0.9,2,0.8,4.6000000000000005
 `, string(storage.Data))
 	})
 
@@ -101,8 +102,8 @@ func Test_UpdateCSV(t *testing.T) {
 		require.NoError(t, err)
 		// uploaded a single file with the data
 		assert.Len(t, model.ComputeAllocationCalls(), 1)
-		assert.Equal(t, `Date,Namespace,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost,Labels,Label_test-label1,Label_test-label2
-2021-01-01,test-namespace,test-controller-kind,test-controller-name,test-pod,test-container,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{""test-label1"":""test-value1"",""test-label2"":""test-value2""}",test-value1,test-value2
+		assert.Equal(t, `Date,Namespace,Cluster,ControllerKind,ControllerName,Pod,Container,CPUCoreUsageAverage,CPUCoreRequestAverage,RAMBytesUsageAverage,RAMBytesRequestAverage,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,CPUCost,RAMCost,NetworkCost,PVCost,GPUCost,TotalCost,Labels,Label_test-label1,Label_test-label2
+2021-01-01,test-namespace,,test-controller-kind,test-controller-name,test-pod,test-container,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{""test-label1"":""test-value1"",""test-label2"":""test-value2""}",test-value1,test-value2
 `, string(storage.Data))
 	})
 
@@ -137,9 +138,9 @@ func Test_UpdateCSV(t *testing.T) {
 		// 2021-01-01 is already in the export file, so we only compute for 2021-01-02
 		assert.Equal(t, time.Date(2021, 1, 2, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].Start)
 		assert.Equal(t, time.Date(2021, 1, 3, 0, 0, 0, 0, time.UTC), model.ComputeAllocationCalls()[0].End)
-		assert.Equal(t, `Date,Namespace,CPUCoreUsageAverage,CPUCoreRequestAverage,CPUCost,RAMBytesUsageAverage,RAMBytesRequestAverage,RAMCost,Label_app,ControllerKind,ControllerName,Pod,Container,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,NetworkCost,PVCost,GPUCost,TotalCost
-2021-01-01,test-namespace,0.1,0.2,0.3,0.4,0.5,0.6,app1,,,,,,,,,,,,
-2021-01-02,test-namespace,0,0,1,0,0,0,,,,,,0,0,0,0,0,0,0,1
+		assert.Equal(t, `Date,Namespace,CPUCoreUsageAverage,CPUCoreRequestAverage,CPUCost,RAMBytesUsageAverage,RAMBytesRequestAverage,RAMCost,Label_app,Cluster,ControllerKind,ControllerName,Pod,Container,NetworkReceiveBytes,NetworkTransferBytes,GPUs,PVBytes,NetworkCost,PVCost,GPUCost,TotalCost
+2021-01-01,test-namespace,0.1,0.2,0.3,0.4,0.5,0.6,app1,,,,,,,,,,,,,
+2021-01-02,test-namespace,0,0,1,0,0,0,,,,,,,0,0,0,0,0,0,0,1
 `, string(storage.Data))
 	})
 

+ 21 - 3
pkg/customcost/ingestor.go

@@ -21,6 +21,12 @@ import (
 	"github.com/opencost/opencost/pkg/env"
 )
 
+// pluginConnector abstracts *plugin.Client so buildSingleDomain can be tested with mocks.
+type pluginConnector interface {
+	Client() (plugin.ClientProtocol, error)
+	Kill()
+}
+
 // IngestorStatus includes diagnostic values for a given Ingestor
 type IngestorStatus struct {
 	Created     time.Time
@@ -65,14 +71,22 @@ type CustomCostIngestor struct {
 	isStopping   atomic.Bool
 	exitBuildCh  chan string
 	exitRunCh    chan string
-	plugins      map[string]*plugin.Client
+	plugins      map[string]pluginConnector
 	pluginsLock  sync.RWMutex
 	resolution   time.Duration
 	refreshRate  time.Duration
 }
 
-// NewIngestor is an initializer for ingestor
+// NewCustomCostIngestor is an initializer for ingestor
 func NewCustomCostIngestor(ingestorConfig *CustomCostIngestorConfig, repo Repository, plugins map[string]*plugin.Client, res time.Duration) (*CustomCostIngestor, error) {
+	connectors := make(map[string]pluginConnector, len(plugins))
+	for k, v := range plugins {
+		connectors[k] = v
+	}
+	return newCustomCostIngestor(ingestorConfig, repo, connectors, res)
+}
+
+func newCustomCostIngestor(ingestorConfig *CustomCostIngestorConfig, repo Repository, plugins map[string]pluginConnector, res time.Duration) (*CustomCostIngestor, error) {
 	if repo == nil {
 		return nil, fmt.Errorf("CustomCost: NewCustomCostIngestor: repository connot be nil")
 	}
@@ -194,7 +208,11 @@ func (ing *CustomCostIngestor) buildSingleDomain(start, end time.Time, domain st
 		return
 	}
 
-	custCostSrc := raw.(ocplugin.CustomCostSource)
+	custCostSrc, ok := raw.(ocplugin.CustomCostSource)
+	if !ok {
+		log.Errorf("plugin %s returned invalid type: expected CustomCostSource, got %T", domain, raw)
+		return
+	}
 
 	custCostResps := custCostSrc.GetCustomCosts(req)
 	// loop through each customCostResponse, adding to repo

+ 95 - 5
pkg/customcost/ingestor_test.go

@@ -1,6 +1,7 @@
 package customcost
 
 import (
+	"fmt"
 	"os/exec"
 	"runtime"
 	"sync"
@@ -8,6 +9,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/go-plugin"
+	"github.com/opencost/opencost/core/pkg/opencost"
 )
 
 func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
@@ -25,7 +27,7 @@ func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
 	_, _ = client.Client()
 
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{
+		plugins: map[string]pluginConnector{
 			"test-plugin": client,
 		},
 	}
@@ -37,6 +39,7 @@ func TestIngestor_Stop_KillsPluginProcesses(t *testing.T) {
 }
 
 func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
+	connectors := make(map[string]pluginConnector)
 	clients := make(map[string]*plugin.Client)
 	for _, name := range []string{"plugin-a", "plugin-b", "plugin-c"} {
 		cmd := exec.Command("sleep", "60")
@@ -50,10 +53,11 @@ func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
 			StartTimeout: 2 * time.Second,
 		})
 		_, _ = client.Client()
+		connectors[name] = client
 		clients[name] = client
 	}
 
-	ingestor := &CustomCostIngestor{plugins: clients}
+	ingestor := &CustomCostIngestor{plugins: connectors}
 	ingestor.Stop()
 
 	for name, client := range clients {
@@ -65,7 +69,7 @@ func TestIngestor_Stop_MultiplePlugins(t *testing.T) {
 
 func TestIngestor_Stop_EmptyPluginsMap(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 	ingestor.Stop() // covers lock path with 0 iterations
 }
@@ -77,7 +81,7 @@ func TestIngestor_Stop_NilPluginsMap(t *testing.T) {
 
 func TestIngestor_Stop_AlreadyStopping(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 	ingestor.isStopping.Store(true) // atomic.Bool must use Store()!
 	ingestor.Stop()                 // should return immediately
@@ -85,7 +89,7 @@ func TestIngestor_Stop_AlreadyStopping(t *testing.T) {
 
 func TestIngestor_Stop_ConcurrentCalls(t *testing.T) {
 	ingestor := &CustomCostIngestor{
-		plugins: map[string]*plugin.Client{},
+		plugins: map[string]pluginConnector{},
 	}
 
 	var wg sync.WaitGroup
@@ -187,3 +191,89 @@ func TestIngestor_BuildWindow_WithPlugin(t *testing.T) {
 	// BuildWindow and buildSingleDomain; client.Client() fails fast (false exits)
 	ingestor.BuildWindow(now.Add(-time.Hour), now)
 }
+
+// mockClientProtocol implements plugin.ClientProtocol for testing.
+type mockClientProtocol struct {
+	dispenseResult interface{}
+	dispenseErr    error
+}
+
+func (m *mockClientProtocol) Dispense(string) (interface{}, error) {
+	return m.dispenseResult, m.dispenseErr
+}
+func (m *mockClientProtocol) Ping() error  { return nil }
+func (m *mockClientProtocol) Close() error { return nil }
+
+// mockPluginConnector implements pluginConnector for testing.
+type mockPluginConnector struct {
+	protocol  plugin.ClientProtocol
+	clientErr error
+	killed    bool
+}
+
+func (m *mockPluginConnector) Client() (plugin.ClientProtocol, error) {
+	if m.clientErr != nil {
+		return nil, m.clientErr
+	}
+	return m.protocol, nil
+}
+
+func (m *mockPluginConnector) Kill() { m.killed = true }
+
+func TestBuildSingleDomain_InvalidPluginType_NoPanic(t *testing.T) {
+	mock := &mockPluginConnector{
+		protocol: &mockClientProtocol{
+			dispenseResult: "not a CustomCostSource", // wrong type
+		},
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"bad-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Before the fix this would panic; now it should log an error and return.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}
+
+func TestBuildSingleDomain_DispenseError(t *testing.T) {
+	mock := &mockPluginConnector{
+		protocol: &mockClientProtocol{
+			dispenseErr: fmt.Errorf("dispense failed"),
+		},
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"err-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Should handle the error gracefully without panic.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}
+
+func TestBuildSingleDomain_ClientError(t *testing.T) {
+	mock := &mockPluginConnector{
+		clientErr: fmt.Errorf("connection failed"),
+	}
+
+	repo := NewMemoryRepository()
+	ingestor := &CustomCostIngestor{
+		plugins:    map[string]pluginConnector{"fail-plugin": mock},
+		resolution: time.Hour,
+		repo:       repo,
+		coverage:   map[string]opencost.Window{},
+	}
+
+	now := time.Now().UTC()
+	// Should handle the error gracefully without panic.
+	ingestor.BuildWindow(now.Add(-time.Hour), now)
+}

+ 7 - 8
pkg/customcost/pipelineservice_test.go

@@ -10,7 +10,6 @@ import (
 	"testing"
 	"time"
 
-	"github.com/hashicorp/go-plugin"
 	"github.com/opencost/opencost/core/pkg/log"
 	"github.com/opencost/opencost/core/pkg/util/timeutil"
 )
@@ -280,7 +279,7 @@ func TestPipelineService_Stop_WithNilIngestors(t *testing.T) {
 func TestPipelineService_Stop_PartialNilIngestors(t *testing.T) {
 	hourly := &CustomCostIngestor{
 		key:     "hourly",
-		plugins: make(map[string]*plugin.Client),
+		plugins: make(map[string]pluginConnector),
 	}
 
 	ps := &PipelineService{
@@ -298,11 +297,11 @@ func TestPipelineService_Stop_ShutdownLogging(t *testing.T) {
 	ps := &PipelineService{
 		hourlyIngestor: &CustomCostIngestor{
 			key:     "hourly",
-			plugins: make(map[string]*plugin.Client),
+			plugins: make(map[string]pluginConnector),
 		},
 		dailyIngestor: &CustomCostIngestor{
 			key:     "daily",
-			plugins: make(map[string]*plugin.Client),
+			plugins: make(map[string]pluginConnector),
 		},
 		domains: []string{},
 	}
@@ -324,8 +323,8 @@ func TestPipelineService_Stop_NilIngestors(t *testing.T) {
 }
 
 func TestPipelineService_Stop_WithIngestors(t *testing.T) {
-	hourly := &CustomCostIngestor{plugins: map[string]*plugin.Client{}}
-	daily := &CustomCostIngestor{plugins: map[string]*plugin.Client{}}
+	hourly := &CustomCostIngestor{plugins: map[string]pluginConnector{}}
+	daily := &CustomCostIngestor{plugins: map[string]pluginConnector{}}
 	ps := &PipelineService{
 		hourlyIngestor: hourly,
 		dailyIngestor:  daily,
@@ -335,14 +334,14 @@ func TestPipelineService_Stop_WithIngestors(t *testing.T) {
 
 func TestPipelineService_Stop_OnlyHourlyIngestor(t *testing.T) {
 	ps := &PipelineService{
-		hourlyIngestor: &CustomCostIngestor{plugins: map[string]*plugin.Client{}},
+		hourlyIngestor: &CustomCostIngestor{plugins: map[string]pluginConnector{}},
 	}
 	ps.Stop() // should not panic when dailyIngestor is nil
 }
 
 func TestPipelineService_Stop_OnlyDailyIngestor(t *testing.T) {
 	ps := &PipelineService{
-		dailyIngestor: &CustomCostIngestor{plugins: map[string]*plugin.Client{}},
+		dailyIngestor: &CustomCostIngestor{plugins: map[string]pluginConnector{}},
 	}
 	ps.Stop() // should not panic when hourlyIngestor is nil
 }

+ 1 - 1
pkg/env/costmodel.go

@@ -443,7 +443,7 @@ func GetOVHMonthlyNodepools() []string {
 // IsMCPServerEnabled returns the environment variable value for MCPServerEnabledEnvVar which represents
 // whether or not the MCP server is enabled.
 func IsMCPServerEnabled() bool {
-	return env.GetBool(MCPServerEnabledEnvVar, true)
+	return env.GetBool(MCPServerEnabledEnvVar, false)
 }
 
 // GetMCPHTTPPort returns the environment variable value for MCPHTTPPortEnvVar which represents

+ 22 - 0
pkg/env/costmodel_test.go

@@ -79,3 +79,25 @@ func TestGetKubernetesEnabled(t *testing.T) {
 	}
 
 }
+
+func TestIsMCPServerEnabled_DefaultFalse(t *testing.T) {
+	old, hadOld := os.LookupEnv("MCP_SERVER_ENABLED")
+	os.Unsetenv("MCP_SERVER_ENABLED")
+	t.Cleanup(func() {
+		if hadOld {
+			os.Setenv("MCP_SERVER_ENABLED", old)
+		} else {
+			os.Unsetenv("MCP_SERVER_ENABLED")
+		}
+	})
+	if got := IsMCPServerEnabled(); got {
+		t.Fatalf("expected false when MCP_SERVER_ENABLED is unset, got %v", got)
+	}
+}
+
+func TestIsMCPServerEnabled_True(t *testing.T) {
+	t.Setenv("MCP_SERVER_ENABLED", "true")
+	if got := IsMCPServerEnabled(); !got {
+		t.Fatalf("expected true when env var set to true, got %v", got)
+	}
+}

Некоторые файлы не были показаны из-за большого количества измененных файлов