Преглед изворни кода

AI Inference Costs v1 (#3845)

Signed-off-by: simanadler <sima@il.ibm.com>
Co-authored-by: Alex Meijer <ameijer@users.noreply.github.com>
simanadler пре 15 часа
родитељ
комит
67e53fad4d

+ 2 - 0
README.md

@@ -25,6 +25,7 @@ To see the full functionality of OpenCost you can view [OpenCost features](https
 - Easily export pricing data to Prometheus with /metrics endpoint ([learn more](https://www.opencost.io/docs/installation/prometheus))
 - Carbon costs for cloud resources
 - MCP support
+- **AI inference cost tracking** for [vLLM](https://vllm.ai/)-based deployments ([llm-d](https://llm-d.ai/) and compatible): cost per million tokens (input/output), KV cache-corrected pricing, shared infrastructure attribution, REST APIs and Prometheus metrics
 - Support for external costs like Datadog through [OpenCost Plugins](https://github.com/opencost/opencost-plugins)
 - Free and open source distribution ([Apache2 license](LICENSE))
 
@@ -51,6 +52,7 @@ Note: The standalone Kubernetes manifest files have been removed. Please use Hel
 - [CLI / kubectl cost](https://www.opencost.io/docs/integrations/kubectl-cost)
 - [Prometheus Metrics](https://www.opencost.io/docs/integrations/prometheus)
 - [User Interface](https://www.opencost.io/docs/installation/ui)
+- [AI Inference Cost Tracking](docs/inference-cost-tracking.md)
 
 ## MCP Server
 

+ 27 - 0
core/pkg/source/datasource.go

@@ -216,6 +216,14 @@ const (
 
 	// Data Coverage
 	QueryDataCoverage = "QueryDataCoverage"
+
+	// Inference Metrics
+	QueryInferencePromptTokens         = "QueryInferencePromptTokens"
+	QueryInferenceGenerationTokens     = "QueryInferenceGenerationTokens"
+	QueryInferenceInputProcessingTime  = "QueryInferenceInputProcessingTime"
+	QueryInferenceOutputProcessingTime = "QueryInferenceOutputProcessingTime"
+	QueryInferenceCachedTokens         = "QueryInferenceCachedTokens"
+	QueryInferenceCacheConfig          = "QueryInferenceCacheConfig"
 )
 
 type MetricsQuerier interface {
@@ -433,6 +441,25 @@ type MetricsQuerier interface {
 
 	// Data Coverage Query
 	QueryDataCoverage(limitDays int) (time.Time, time.Time, error)
+
+	// Inference Metrics (vLLM) - relevant when INFERENCE_COST_ENABLED is set to true
+	// QueryInferencePromptTokens returns prompt token counts by model_name and namespace
+	QueryInferencePromptTokens(start, end time.Time) *Future[InferenceTokensResult]
+
+	// QueryInferenceGenerationTokens returns generation token counts by model_name and namespace
+	QueryInferenceGenerationTokens(start, end time.Time) *Future[InferenceTokensResult]
+
+	// QueryInferenceInputProcessingTime returns input processing time in seconds by model_name and namespace
+	QueryInferenceInputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult]
+
+	// QueryInferenceOutputProcessingTime returns output processing time in seconds by model_name and namespace
+	QueryInferenceOutputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult]
+
+	// QueryInferenceCachedTokens returns KV cache hit counts by model_name and namespace
+	QueryInferenceCachedTokens(start, end time.Time) *Future[InferenceTokensResult]
+
+	// QueryInferenceCacheConfig returns cache configuration (prefix caching enabled) by model_name and namespace
+	QueryInferenceCacheConfig(t time.Time) *Future[InferenceCacheConfigResult]
 }
 
 type OpenCostDataSource interface {

+ 58 - 0
core/pkg/source/decoders.go

@@ -2134,6 +2134,64 @@ type DCGMDeviceContainerUsageResult struct {
 	Value     float64
 }
 
+// Inference Metrics Decoders
+
+func DecodeInferenceTokensResult(result *QueryResult) *InferenceTokensResult {
+	modelName, _ := result.GetString("model_name")
+	namespace, _ := result.GetString("namespace")
+	key := modelName + ":" + namespace
+	
+	// Get the value from the last vector point if available
+	var value float64
+	if len(result.Values) > 0 {
+		value = result.Values[len(result.Values)-1].Value
+	}
+
+	return &InferenceTokensResult{
+		Values: map[string]float64{
+			key: value,
+		},
+	}
+}
+
+func DecodeInferenceProcessingTimeResult(result *QueryResult) *InferenceProcessingTimeResult {
+	modelName, _ := result.GetString("model_name")
+	namespace, _ := result.GetString("namespace")
+	key := modelName + ":" + namespace
+	
+	// Get the value from the last vector point if available
+	var value float64
+	if len(result.Values) > 0 {
+		value = result.Values[len(result.Values)-1].Value
+	}
+
+	return &InferenceProcessingTimeResult{
+		Values: map[string]float64{
+			key: value,
+		},
+	}
+}
+
+func DecodeInferenceCacheConfigResult(result *QueryResult) *InferenceCacheConfigResult {
+	modelName, _ := result.GetString("model_name")
+	namespace, _ := result.GetString("namespace")
+	key := modelName + ":" + namespace
+	
+	// Get the value from the last vector point if available
+	var prefixCachingEnabled float64
+	if len(result.Values) > 0 {
+		prefixCachingEnabled = result.Values[len(result.Values)-1].Value
+	}
+
+	return &InferenceCacheConfigResult{
+		Configs: map[string]*InferenceCacheConfig{
+			key: {
+				PrefixCachingEnabled: prefixCachingEnabled > 0,
+			},
+		},
+	}
+}
+
 func DecodeDCGMDeviceContainerUsageResult(result *QueryResult) *DCGMDeviceContainerUsageResult {
 	uuid, _ := result.GetString(UUIDLabel)
 	podUID, _ := result.GetString(PodUIDLabel)

+ 178 - 0
core/pkg/source/decoders_test.go

@@ -0,0 +1,178 @@
+package source
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/util"
+	"github.com/stretchr/testify/require"
+)
+
+// vecs is a helper to build a slice of vectors from raw values. The timestamps
+// are monotonically increasing so that "last value" semantics are exercised.
+func vecs(values ...float64) []*util.Vector {
+	out := make([]*util.Vector, 0, len(values))
+	for i, v := range values {
+		out = append(out, &util.Vector{Timestamp: float64(i), Value: v})
+	}
+	return out
+}
+
+func TestDecodeInferenceTokensResult(t *testing.T) {
+	cases := []struct {
+		name        string
+		metric      map[string]any
+		values      []*util.Vector
+		expectedKey string
+		expectedVal float64
+	}{
+		{
+			name:        "uses last value with model and namespace",
+			metric:      map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:      vecs(10, 20, 42),
+			expectedKey: "llama3:default",
+			expectedVal: 42,
+		},
+		{
+			name:        "empty values defaults to zero",
+			metric:      map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:      vecs(),
+			expectedKey: "llama3:default",
+			expectedVal: 0,
+		},
+		{
+			name:        "single value",
+			metric:      map[string]any{"model_name": "mistral", "namespace": "team-a"},
+			values:      vecs(7),
+			expectedKey: "mistral:team-a",
+			expectedVal: 7,
+		},
+		{
+			name:        "missing labels yields separator-only key",
+			metric:      map[string]any{},
+			values:      vecs(5),
+			expectedKey: ":",
+			expectedVal: 5,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			result := NewQueryResult(tc.metric, tc.values, nil)
+
+			decoded := DecodeInferenceTokensResult(result)
+
+			require.NotNil(t, decoded)
+			require.Len(t, decoded.Values, 1)
+			val, ok := decoded.Values[tc.expectedKey]
+			require.True(t, ok, "expected key %q to be present", tc.expectedKey)
+			require.Equal(t, tc.expectedVal, val)
+		})
+	}
+}
+
+func TestDecodeInferenceProcessingTimeResult(t *testing.T) {
+	cases := []struct {
+		name        string
+		metric      map[string]any
+		values      []*util.Vector
+		expectedKey string
+		expectedVal float64
+	}{
+		{
+			name:        "uses last value with model and namespace",
+			metric:      map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:      vecs(1.5, 2.5, 3.75),
+			expectedKey: "llama3:default",
+			expectedVal: 3.75,
+		},
+		{
+			name:        "empty values defaults to zero",
+			metric:      map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:      vecs(),
+			expectedKey: "llama3:default",
+			expectedVal: 0,
+		},
+		{
+			name:        "missing labels yields separator-only key",
+			metric:      map[string]any{},
+			values:      vecs(9.9),
+			expectedKey: ":",
+			expectedVal: 9.9,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			result := NewQueryResult(tc.metric, tc.values, nil)
+
+			decoded := DecodeInferenceProcessingTimeResult(result)
+
+			require.NotNil(t, decoded)
+			require.Len(t, decoded.Values, 1)
+			val, ok := decoded.Values[tc.expectedKey]
+			require.True(t, ok, "expected key %q to be present", tc.expectedKey)
+			require.Equal(t, tc.expectedVal, val)
+		})
+	}
+}
+
+func TestDecodeInferenceCacheConfigResult(t *testing.T) {
+	cases := []struct {
+		name            string
+		metric          map[string]any
+		values          []*util.Vector
+		expectedKey     string
+		expectedEnabled bool
+	}{
+		{
+			name:            "positive value means prefix caching enabled",
+			metric:          map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:          vecs(1),
+			expectedKey:     "llama3:default",
+			expectedEnabled: true,
+		},
+		{
+			name:            "zero value means prefix caching disabled",
+			metric:          map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:          vecs(0),
+			expectedKey:     "llama3:default",
+			expectedEnabled: false,
+		},
+		{
+			name:            "negative value means prefix caching disabled",
+			metric:          map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:          vecs(-1),
+			expectedKey:     "llama3:default",
+			expectedEnabled: false,
+		},
+		{
+			name:            "uses last value to determine enabled",
+			metric:          map[string]any{"model_name": "mistral", "namespace": "team-a"},
+			values:          vecs(1, 0, 1),
+			expectedKey:     "mistral:team-a",
+			expectedEnabled: true,
+		},
+		{
+			name:            "empty values defaults to disabled",
+			metric:          map[string]any{"model_name": "llama3", "namespace": "default"},
+			values:          vecs(),
+			expectedKey:     "llama3:default",
+			expectedEnabled: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			result := NewQueryResult(tc.metric, tc.values, nil)
+
+			decoded := DecodeInferenceCacheConfigResult(result)
+
+			require.NotNil(t, decoded)
+			require.Len(t, decoded.Configs, 1)
+			config, ok := decoded.Configs[tc.expectedKey]
+			require.True(t, ok, "expected key %q to be present", tc.expectedKey)
+			require.NotNil(t, config)
+			require.Equal(t, tc.expectedEnabled, config.PrefixCachingEnabled)
+		})
+	}
+}

+ 28 - 0
core/pkg/source/inference_results.go

@@ -0,0 +1,28 @@
+package source
+
+// InferenceCacheConfig holds cache configuration for a model.
+type InferenceCacheConfig struct {
+	PrefixCachingEnabled bool
+}
+
+// InferenceTokensResult represents token counts.
+// Used for prompt tokens, generation tokens, and cached tokens.
+type InferenceTokensResult struct {
+	// Values maps "model_name:namespace" to token count
+	Values map[string]float64
+}
+
+// InferenceProcessingTimeResult represents processing time in seconds.
+// Used for input processing time and output processing time.
+type InferenceProcessingTimeResult struct {
+	// Values maps "model_name:namespace" to processing time in seconds
+	Values map[string]float64
+}
+
+// InferenceCacheConfigResult represents cache configuration.
+type InferenceCacheConfigResult struct {
+	// Configs maps "model_name:namespace" to cache configuration
+	Configs map[string]*InferenceCacheConfig
+}
+
+// Made with Bob

+ 40 - 0
core/pkg/source/mock.go

@@ -1028,6 +1028,46 @@ func (m *MockMetricsQuerier) QueryResourceQuotaStatusUsedRAMLimitMax(start, end
 
 // Data Coverage Query
 
+// Inference Metrics
+
+func (m *MockMetricsQuerier) QueryInferencePromptTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return getFutureFromOverride(m.overrides, QueryInferencePromptTokens, func() *Future[InferenceTokensResult] {
+		return m.noop.QueryInferencePromptTokens(start, end)
+	})
+}
+
+func (m *MockMetricsQuerier) QueryInferenceGenerationTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return getFutureFromOverride(m.overrides, QueryInferenceGenerationTokens, func() *Future[InferenceTokensResult] {
+		return m.noop.QueryInferenceGenerationTokens(start, end)
+	})
+}
+
+func (m *MockMetricsQuerier) QueryInferenceInputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	return getFutureFromOverride(m.overrides, QueryInferenceInputProcessingTime, func() *Future[InferenceProcessingTimeResult] {
+		return m.noop.QueryInferenceInputProcessingTime(start, end)
+	})
+}
+
+func (m *MockMetricsQuerier) QueryInferenceOutputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	return getFutureFromOverride(m.overrides, QueryInferenceOutputProcessingTime, func() *Future[InferenceProcessingTimeResult] {
+		return m.noop.QueryInferenceOutputProcessingTime(start, end)
+	})
+}
+
+func (m *MockMetricsQuerier) QueryInferenceCachedTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return getFutureFromOverride(m.overrides, QueryInferenceCachedTokens, func() *Future[InferenceTokensResult] {
+		return m.noop.QueryInferenceCachedTokens(start, end)
+	})
+}
+
+func (m *MockMetricsQuerier) QueryInferenceCacheConfig(t time.Time) *Future[InferenceCacheConfigResult] {
+	return getFutureFromOverride(m.overrides, QueryInferenceCacheConfig, func() *Future[InferenceCacheConfigResult] {
+		return m.noop.QueryInferenceCacheConfig(t)
+	})
+}
+
+// Data Coverage Query
+
 func (m *MockMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {
 	if v, ok := m.overrides[QueryDataCoverage]; ok {
 		if f, ok := v.(func(int) (time.Time, time.Time, error)); ok {

+ 26 - 0
core/pkg/source/noop.go

@@ -685,6 +685,32 @@ func (m *NoOpMetricsQuerier) QueryResourceQuotaStatusUsedRAMLimitMax(start, end
 	return newEmptyResult(DecodeResourceResult)
 }
 
+// Inference Metrics
+
+func (m *NoOpMetricsQuerier) QueryInferencePromptTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return newEmptyResult(DecodeInferenceTokensResult)
+}
+
+func (m *NoOpMetricsQuerier) QueryInferenceGenerationTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return newEmptyResult(DecodeInferenceTokensResult)
+}
+
+func (m *NoOpMetricsQuerier) QueryInferenceInputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	return newEmptyResult(DecodeInferenceProcessingTimeResult)
+}
+
+func (m *NoOpMetricsQuerier) QueryInferenceOutputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	return newEmptyResult(DecodeInferenceProcessingTimeResult)
+}
+
+func (m *NoOpMetricsQuerier) QueryInferenceCachedTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	return newEmptyResult(DecodeInferenceTokensResult)
+}
+
+func (m *NoOpMetricsQuerier) QueryInferenceCacheConfig(t time.Time) *Future[InferenceCacheConfigResult] {
+	return newEmptyResult(DecodeInferenceCacheConfigResult)
+}
+
 // Data Coverage Query
 
 func (m *NoOpMetricsQuerier) QueryDataCoverage(_ int) (time.Time, time.Time, error) {

+ 32 - 0
core/pkg/source/record.go

@@ -848,6 +848,38 @@ func (m *RecordMetricsQuerier) QueryResourceQuotaStatusUsedRAMLimitMax(start, en
 	return m.Querier.QueryResourceQuotaStatusUsedRAMLimitMax(start, end)
 }
 
+// Inference Metrics
+
+func (m *RecordMetricsQuerier) QueryInferencePromptTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	m.recordCall(QueryInferencePromptTokens)
+	return m.Querier.QueryInferencePromptTokens(start, end)
+}
+
+func (m *RecordMetricsQuerier) QueryInferenceGenerationTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	m.recordCall(QueryInferenceGenerationTokens)
+	return m.Querier.QueryInferenceGenerationTokens(start, end)
+}
+
+func (m *RecordMetricsQuerier) QueryInferenceInputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	m.recordCall(QueryInferenceInputProcessingTime)
+	return m.Querier.QueryInferenceInputProcessingTime(start, end)
+}
+
+func (m *RecordMetricsQuerier) QueryInferenceOutputProcessingTime(start, end time.Time) *Future[InferenceProcessingTimeResult] {
+	m.recordCall(QueryInferenceOutputProcessingTime)
+	return m.Querier.QueryInferenceOutputProcessingTime(start, end)
+}
+
+func (m *RecordMetricsQuerier) QueryInferenceCachedTokens(start, end time.Time) *Future[InferenceTokensResult] {
+	m.recordCall(QueryInferenceCachedTokens)
+	return m.Querier.QueryInferenceCachedTokens(start, end)
+}
+
+func (m *RecordMetricsQuerier) QueryInferenceCacheConfig(t time.Time) *Future[InferenceCacheConfigResult] {
+	m.recordCall(QueryInferenceCacheConfig)
+	return m.Querier.QueryInferenceCacheConfig(t)
+}
+
 // Data Coverage Query
 
 func (m *RecordMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {

+ 446 - 0
docs/inference-cost-tracking.md

@@ -0,0 +1,446 @@
+# AI Inference Cost Tracking
+
+OpenCost tracks infrastructure costs for AI inference workloads running [vLLM](https://vllm.ai/)-based models ([llm-d](https://llm-d.ai/) and compatible deployments). It exposes costs both as Prometheus metrics and as queryable REST API endpoints.
+
+## Overview
+
+The inference cost tracking feature:
+1. Collects token metrics from [vLLM](https://vllm.ai/) via Prometheus (`prompt_tokens_total`, `generation_tokens_total`, prefill/decode timing, KV cache hits)
+2. Collects infrastructure costs (GPU, CPU, RAM, shared infra) from OpenCost's allocation layer
+3. Calculates blended and differentiated (input/output) cost per million tokens under two cost bases: `allocation` and `usage`
+4. Exports inference cost Prometheus gauge metrics per model and namespace
+5. Serves two REST API endpoints for on-demand cost queries with filtering, aggregation, and time-series support
+
+## Enabling Inference Cost Tracking
+
+Set the following environment variable on the OpenCost deployment:
+
+```bash
+INFERENCE_COST_ENABLED=true
+```
+
+OpenCost reads `PROMETHEUS_SERVER_ENDPOINT` for both the core metrics and the [vLLM](https://vllm.ai/) metric queries, so no separate Prometheus configuration is needed.
+
+### Full Environment Variable Reference
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `INFERENCE_COST_ENABLED` | `false` | Enable inference cost tracking |
+| `INFERENCE_MODEL_LABEL` | `llm-d.ai/model` | Pod label whose value is the vLLM model name. **Must match the `model_name` label on vLLM Prometheus metrics.** See [Model label](#model-label) for details. |
+| `INFERENCE_SHARED_INFRA_LABEL` | `llm-d.ai/inference-shared` | Pod label key identifying shared infra pods (EPP, gateway). See [Shared infrastructure label](#shared-infrastructure-label) for details. |
+| `INFERENCE_SHARED_INFRA_LABEL_VALUE` | `true` | Label value that marks a pod as shared infra. See [Shared infrastructure label](#shared-infrastructure-label) for details. |
+| `INFERENCE_COLLECTION_INTERVAL` | `2m` | Background collection interval |
+
+### Kubernetes Deployment Example
+
+```yaml
+env:
+  - name: INFERENCE_COST_ENABLED
+    value: "true"
+  - name: INFERENCE_MODEL_LABEL
+    value: "llm-d.ai/model"
+```
+
+## Cost Bases
+
+OpenCost computes costs under two distinct bases, surfaced on every cost metric (via the `cost_basis` label) and on every API response:
+
+| Cost Basis | Label / Value | Description |
+|------------|---------------|-------------|
+| **Allocation** | `cost_basis=allocation` | `max(request, usage) × price` + idle share + shared infra share. **Reconciles to the infrastructure bill.** |
+| **Usage** | `cost_basis=usage` | Actual resource consumption only. Does **not** reconcile to the bill; idle and shared infrastructure costs are excluded. |
+
+Use `allocation` for chargeback/showback and bill reconciliation. Use `usage` for pure workload efficiency analysis.
+
+## Prometheus Metrics
+
+When `INFERENCE_COST_ENABLED=true`, OpenCost registers and emits inference gauge metrics every collection interval. All metrics carry `model_name`, `model_version`, `namespace`, and `workload_type` labels; the cost metrics (`llm_total_hourly_cost` and `llm_cost_per_million_tokens`) additionally carry `cost_basis`.
+
+The `workload_type` label is currently always set to `inference`. Future versions may support additional workload types such as `training` or `fine-tuning`.
+
+Note: aggregation by `pod`, `controller`, `controller_kind`, `container`, and `workload_type` is available via the [REST APIs](#rest-api-endpoints), although the only `workload_type` currently supported is `inference`.
+
+### `llm_total_hourly_cost`
+
+**Hourly infrastructure cost** attributed to a model.
+
+**Labels:** `model_name`, `model_version`, `namespace`, `cost_basis`, `workload_type`
+
+This is an instantaneous hourly rate ($/hour), not a cumulative counter.
+
+```promql
+# Current hourly cost for a model
+llm_total_hourly_cost{model_name="Qwen/Qwen3-32B", cost_basis="allocation"}
+
+# Estimated 24-hour cost if current rate continues (real-time projection)
+llm_total_hourly_cost{model_name="Qwen/Qwen3-32B", cost_basis="allocation"} * 24
+
+# Actual 24-hour cost based on historical average (more accurate for reporting)
+avg_over_time(llm_total_hourly_cost{model_name="Qwen/Qwen3-32B", cost_basis="allocation"}[24h]) * 24
+```
+
+### `llm_cost_per_million_tokens`
+
+**Cost per 1M tokens.** The `phase` label selects which rate a series reports:
+
+- **Without `phase` label (blended):** Combined cost for all tokens (prompt + generation)
+- **`phase=prompt`:** Cost per 1M delivered input tokens (uses `promptTokens` as denominator; see `llm_cache_savings_fraction` for KV cache utilization)
+- **`phase=generation`:** Cost per 1M output tokens
+
+**Labels:** `model_name`, `model_version`, `namespace`, `cost_basis`, `phase`, `allocation_method`, `workload_type`
+
+The `phase` label distinguishes between:
+- *(empty)* — Blended cost across all tokens
+- `prompt` — Input/prompt token cost
+- `generation` — Output/generation token cost
+
+The `allocation_method` label (present only when `phase` is set) indicates how the input/output split was calculated:
+
+| Value | Meaning |
+|-------|---------|
+| `compute_time` | Cost split proportionally by vLLM prefill/decode time. KV cache utilization is reported separately in `llm_cache_savings_fraction`. |
+| `prefix_caching_off` | Same time-based split but prefix caching is explicitly disabled on the vLLM instance — `llm_cache_savings_fraction` will be zero by configuration. |
+| `multiplier` | Fixed output/input cost ratio (vLLM timing metrics unavailable; default ratio 2.5×). |
+| *(empty)* | No tokens processed or total cost is zero (allocation join failed — see [Labeling Requirements](#labeling-requirements)). |
+
+```promql
+# Current blended cost per 1M tokens
+llm_cost_per_million_tokens{model_name="Qwen/Qwen3-32B", cost_basis="allocation", phase=""}
+
+# Input (prompt) cost per 1M delivered input tokens
+llm_cost_per_million_tokens{
+  model_name="Qwen/Qwen3-32B",
+  cost_basis="allocation",
+  phase="prompt",
+  allocation_method="compute_time"
+}
+
+# Output (generation) cost per 1M tokens
+llm_cost_per_million_tokens{
+  model_name="Qwen/Qwen3-32B",
+  cost_basis="allocation",
+  phase="generation"
+}
+
+# Output-to-input cost ratio (ignoring(phase) is required because the two sides differ only in the phase label)
+llm_cost_per_million_tokens{phase="generation"} / ignoring(phase) llm_cost_per_million_tokens{phase="prompt"}
+
+# Average blended cost over the past 24 hours
+avg_over_time(llm_cost_per_million_tokens{model_name="Qwen/Qwen3-32B", phase=""}[24h])
+
+# Compare models side-by-side (blended)
+llm_cost_per_million_tokens{cost_basis="allocation", phase=""}
+
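+# Sum input and output costs (select one cost_basis so allocation and usage series are not added together)
+sum by (model_name, namespace) (llm_cost_per_million_tokens{cost_basis="allocation", phase=~"prompt|generation"})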
+```
+
+### `llm_cache_savings_fraction`
+
+**Fraction of prompt tokens served from the KV cache** (range 0–1). A value of `0.9` means 90% of prompt tokens were cache hits and required no prefill computation.
+
+**Labels:** `model_name`, `model_version`, `namespace`, `workload_type`
+
+Zero when prefix caching is disabled (`allocation_method=prefix_caching_off` on `llm_cost_per_million_tokens`) or when no cache hits occurred in the window.
+
+```promql
+# Current cache hit fraction for a model
+llm_cache_savings_fraction{model_name="Qwen/Qwen3-32B"}
+
+# Models with less than 50% cache hit rate (potential tuning opportunity)
+llm_cache_savings_fraction < 0.5
+
+# Cache hit rate trend over 24 hours
+avg_over_time(llm_cache_savings_fraction{model_name="Qwen/Qwen3-32B"}[24h])
+
+# Compare cache utilization across all models
+sort_desc(llm_cache_savings_fraction)
+```
+
+### Example Alerting Rule
+
+```yaml
+groups:
+- name: inference_costs
+  rules:
+  - alert: HighInferenceCost
+    expr: llm_cost_per_million_tokens{cost_basis="allocation"} > 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High inference cost for {{ $labels.model_name }}"
+      description: "Model {{ $labels.model_name }} in {{ $labels.namespace }} costs ${{ $value }}/M tokens"
+```
+
+## REST API Endpoints
+
+Two HTTP endpoints are available when `INFERENCE_COST_ENABLED=true`. They compute costs on demand by querying Prometheus and the OpenCost allocation layer, consistent with how `/allocation` and `/assets` work.
+
+### `GET /inferenceCost/total`
+
+Returns a single aggregated `InferenceCostSet` covering the full requested window.
+
+**Query parameters:**
+
+| Parameter | Required | Description |
+|-----------|----------|-------------|
+| `window` | Yes | Time window: a named range (e.g. `7d`, `24h`) or an RFC3339 `start,end` pair (e.g. `2025-01-01T00:00:00Z,2025-01-02T00:00:00Z`) |
+| `costBasis` | No | `allocation` (default) or `usage` |
+| `aggregate` | No | Comma-separated dimensions: `model_name`, `model_version`, `namespace`, `cluster`, `pod`, `controller`, `controller_kind`, `container`, `workload_type` |
+| `accumulate` | No | Step size within the window: `hour`, `day`, `week`, `month` (results are then accumulated into one total) |
+| `filter` | No | `prop:value` pairs joined with `+` for AND logic, e.g. `namespace:default+model_name:llama3+workload_type:inference` |
+
+**Example:**
+
+```bash
+# Total inference costs for the last 7 days, broken down by model
+curl "http://localhost:9003/inferenceCost/total?window=7d&aggregate=model_name&costBasis=allocation"
+```
+
+**Response shape:**
+
+```json
+{
+  "data": {
+    "inferenceCosts": {
+      "Qwen/Qwen3-32B:llm-d-namespace": {
+        "properties": {
+          "modelName": "Qwen/Qwen3-32B"
+        },
+        "window": { "start": "...", "end": "..." },
+        "costBasis": "allocation",
+        "totalCost": 42.50,
+        "promptTokens": 12000000,
+        "generationTokens": 3000000,
+        "totalTokens": 15000000,
+        "costPerMillionTokens": 2.83,
+        "inputCost": 28.40,
+        "outputCost": 14.10,
+        "inputCostPerMillionTokens": 2.37,
+        "outputCostPerMillionTokens": 4.70,
+        "cacheSavingsFraction": 0.067,
+        "allocationMethod": "compute_time"
+      }
+    },
+    "window": { "start": "...", "end": "..." }
+  }
+}
+```
+
+### `GET /inferenceCost/timeseries`
+
+Returns an `InferenceCostSetRange` — one `InferenceCostSet` per step within the window.
+
+Same parameters as `/total`, but `accumulate` is **required** (it defines the step size).
+
+**Example:**
+
+```bash
+# Daily inference costs per model over the past 7 days
+curl "http://localhost:9003/inferenceCost/timeseries?window=7d&accumulate=day&aggregate=model_name"
+
+# Hourly costs for a specific namespace, usage basis
+curl "http://localhost:9003/inferenceCost/timeseries?window=24h&accumulate=hour&costBasis=usage&filter=namespace:llm-d-prod"
+```
+
+**Response shape:**
+
+```json
+{
+  "data": {
+    "inferenceCostSets": [
+      {
+        "inferenceCosts": { ... },
+        "window": { "start": "2025-01-01T00:00:00Z", "end": "2025-01-02T00:00:00Z" }
+      },
+      ...
+    ],
+    "window": { "start": "2025-01-01T00:00:00Z", "end": "2025-01-08T00:00:00Z" }
+  }
+}
+```
+
+## Labeling Requirements
+
+Correct pod labeling is critical for cost attribution. OpenCost joins infrastructure costs (from the Kubernetes allocation layer) with token metrics (from Prometheus) using the model name and namespace as the join key.
+
+### Model label
+
+Every vLLM inference pod **must** carry a label whose key matches `INFERENCE_MODEL_LABEL` (default: `llm-d.ai/model`) and whose **value exactly matches the `model_name` label on the vLLM Prometheus metrics**.
+
+```yaml
+# Pod spec
+metadata:
+  labels:
+    llm-d.ai/model: "Qwen3-32B"   # must match --served-model-name (or --model if --served-model-name is not set)
+```
+
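+`--served-model-name` controls the exact string vLLM uses as the `model_name` label in Prometheus metrics. Set the pod label to that same value. If `--served-model-name` is not set, vLLM uses the fully-qualified `--model` path (e.g. `Qwen/Qwen3-32B`) as `model_name`. Kubernetes label values cannot contain `/`, so in that case set the pod label to the short name (e.g. `Qwen3-32B`) and rely on OpenCost's automatic reconciliation of fully-qualified and short model names.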
+
+If this label is missing or the value differs from `model_name` in vLLM metrics, the allocation join fails: **token counts will appear in the API response but all cost fields will be zero** and `allocationMethod` will be empty.
+
+OpenCost attempts to reconcile fully-qualified model names (e.g. `org/model`) against short names (`model`) automatically, but the namespace must always match exactly.
+
+### Diagnosing a labeling mismatch
+
+```bash
+# Check what label value OpenCost sees in the allocation layer
+curl "localhost:9003/allocation?window=1h&aggregate=label:llm-d.ai/model&namespace=<ns>" \
+  | jq '.data[0] | keys'
+
+# Check what model_name vLLM is reporting in Prometheus
+curl -G "http://prometheus:9090/api/v1/query" \
+  --data-urlencode 'query=vllm:prompt_tokens_total{namespace="<ns>"}' \
+  | jq '.data.result[].metric.model_name'
+```
+
+If the values differ, update the pod label to match the vLLM `model_name`. OpenCost also logs a warning when it detects and auto-corrects a mismatch:
+
+```
+InferenceCost: remapping metric key "org/model:namespace" → "model:namespace" (model-name mismatch with allocation label)
+```
+
+### Shared infrastructure label
+
+Pods for shared infrastructure (EPP, gateway, routers) that serve multiple models should be labelled with `INFERENCE_SHARED_INFRA_LABEL` so their costs are distributed proportionally across all models rather than appearing as unattributed overhead:
+
+```yaml
+metadata:
+  labels:
+    llm-d.ai/inference-shared: "true"
+```
+
+## Architecture
+
+The feature is implemented in `pkg/inferencecost/` and consists of:
+
+| Component | File | Responsibility |
+|-----------|------|----------------|
+| **Collector** | `collector.go` | Queries the OpenCost allocation layer for infrastructure costs and Prometheus for [vLLM](https://vllm.ai/) token/timing/cache metrics |
+| **Calculator** | `calculator.go` | Computes blended and differentiated (input/output) cost-per-million-token rates for both cost bases |
+| **Exporter** | `exporter.go` | Registers and emits the `llm_*` Prometheus gauges |
+| **QueryService** | `queryservice.go` | Handles HTTP requests for `/inferenceCost/total` and `/inferenceCost/timeseries` |
+| **Runner** | `runner.go` | Drives periodic background collection for the Prometheus exporter |
+| **Types / API Types** | `types.go`, `apitypes.go` | Internal and HTTP-facing data models |
+
+Integration point: `pkg/cmd/costmodel/costmodel.go` registers the HTTP routes and initialises the collector, calculator, exporter, and runner when `INFERENCE_COST_ENABLED=true`.
+
+## Cost Calculation Methodology
+
+### Infrastructure Cost Collection
+
+The Collector fetches an `AllocationSet` from OpenCost's allocation layer for the requested window. It then attributes allocation costs to each model by matching pods that carry the `INFERENCE_MODEL_LABEL` pod label. Pods that carry `INFERENCE_SHARED_INFRA_LABEL=INFERENCE_SHARED_INFRA_LABEL_VALUE` (e.g. EPP, gateway) are treated as shared infrastructure: their costs are distributed across all models proportionally (by `AllocationTotalCost` weight) for `allocation` cost basis. For `usage` cost basis, shared infra costs are excluded.
+
+### Input/Output Cost Split
+
+OpenCost uses **compute-time based allocation** by default:
+
+1. Collects cumulative processing times from [vLLM](https://vllm.ai/):
+   - `vllm:request_prefill_time_seconds_sum` — total time spent on input (prefill)
+   - `vllm:request_time_per_output_token_seconds_sum` — total time spent on output (decode)
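+2. Allocates infrastructure cost proportionally to processing time: `InputCost = TotalCost × (PrefillTime / TotalTime)` and `OutputCost = TotalCost × (DecodeTime / TotalTime)`, where `TotalTime = PrefillTime + DecodeTime`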
+3. Calculates per-million rates using `PromptTokens` for input (all delivered prompt tokens) and `GenerationTokens` for output
+
+**KV cache savings** are reported in `cacheSavingsFraction` (`cachedTokens / promptTokens`, range 0–1). The dollar cost split already reflects cache savings implicitly — when the KV cache serves tokens without prefill work, prefill time is lower, so less cost is attributed to input. `cacheSavingsFraction` makes this benefit explicit and user-readable.
+
+`cacheSavingsFraction` is sourced directly from `vllm:prefix_cache_hits_total`, which reports cached **tokens** (not blocks). It is non-zero when cache hits were recorded in the window and the metric is available. `vllm:cache_config_info` is queried separately, but only to detect whether prefix caching is explicitly disabled — not for any arithmetic.
+
+**Fallback**: if [vLLM](https://vllm.ai/) timing metrics are unavailable, the Calculator falls back to a fixed multiplier (default 2.5×: output tokens cost 2.5× input tokens).
+
+The `allocationMethod` field records which path was taken for each result (see [allocation_method values](#llm_input_cost_per_million_tokens)).
+
+### Example Calculation
+
+```
+Model: Qwen/Qwen3-32B  |  Window: 1 hour
+
+Infrastructure (allocation basis):
+  AllocationTotalCost = $3.20 for the 1-hour window (GPU + shared infra share)
+
+Token metrics from vLLM:
+  PromptTokens = 12,000,000
+  GenerationTokens = 3,000,000
+  TotalTokens = 15,000,000
+  PrefillTime = 600s, DecodeTime = 600s  → each 50%
+
+KV cache:
+  CachedTokens = 800,000  (from vllm:prefix_cache_hits_total — token-level counter)
+  CacheSavingsFraction = 800,000 / 12,000,000 ≈ 0.067 (6.7%)
+
+Blended:              $3.20 / 15,000,000 × 1,000,000 = $0.213/M tokens
+Input (delivered):    ($3.20 × 0.5) / 12,000,000 × 1,000,000 = $0.133/M prompt tokens
+Output:               ($3.20 × 0.5) / 3,000,000 × 1,000,000 = $0.533/M output tokens
+Cache savings:        6.7% of prompt tokens served from KV cache
+```
+
+## Required [vLLM](https://vllm.ai/) Metrics
+
+| Metric | Required for |
+|--------|-------------|
+| `vllm:prompt_tokens_total` | Token counts, blended cost rate |
+| `vllm:generation_tokens_total` | Token counts, blended cost rate |
+| `vllm:request_prefill_time_seconds_sum` | Compute-time allocation (input/output split) |
+| `vllm:request_time_per_output_token_seconds_sum` | Compute-time allocation (input/output split) |
+| `vllm:prefix_cache_hits_total` | `cacheSavingsFraction` (token-level counter; optional) |
+| `vllm:cache_config_info` | `prefix_caching_off` detection (from `enable_prefix_caching` label; optional) |
+
+All metrics must carry `model_name` and `namespace` labels. Verify availability:
+
+```bash
+kubectl exec -n <namespace> <vllm-pod> -- curl -s localhost:8000/metrics | grep -E "prompt_tokens|generation_tokens|prefill_time|output_token"
+```
+
+## Troubleshooting
+
+### No metrics appearing
+
+1. Confirm `INFERENCE_COST_ENABLED=true` is set on the OpenCost pod
+2. Check OpenCost logs: `kubectl logs -n opencost deployment/opencost | grep -i inference`
+3. Verify Prometheus is reachable from OpenCost and [vLLM](https://vllm.ai/) metrics are present
+
+### Metrics show zero cost / `allocationMethod` is empty
+
+This means the allocation join failed — token data was found in Prometheus but no matching pod cost was found in the allocation layer. See [Labeling Requirements](#labeling-requirements).
+
+Quick diagnosis:
+```bash
+# What label values does the allocation layer see?
+curl "localhost:9003/allocation?window=1h&aggregate=label:llm-d.ai/model&namespace=<ns>" \
+  | jq '.data[0] | keys'
+
+# What model_name does vLLM report?
+curl "http://prometheus:9090/api/v1/query?query=vllm:prompt_tokens_total{namespace=\"<ns>\"}" \
+  | jq '.data.result[].metric.model_name'
+```
+
+If the values differ, update the pod label on the vLLM deployment to match.
+
+### `allocationMethod=multiplier` instead of `compute_time`
+
+[vLLM](https://vllm.ai/) timing metrics are missing or zero. Check:
+
+```bash
+kubectl exec -n <namespace> <vllm-pod> -- curl -s localhost:8000/metrics | grep prefill_time
+```
+
+### `cacheSavingsFraction` is zero but prefix caching is expected to be active
+
+One of the following:
+- **Prefix caching is disabled** on this vLLM instance (`enable_prefix_caching=false` in `vllm:cache_config_info`) — `allocationMethod` will be `prefix_caching_off`, which is accurate and expected
+- **No cache hits occurred** in this window despite prefix caching being enabled — normal for low-traffic or first-request windows
+- **`vllm:prefix_cache_hits_total` metric is missing or zero** — check that vLLM is emitting it. This metric reports cached tokens directly; if unavailable, `cacheSavingsFraction` will be zero.
+- **`vllm:cache_config_info` metric is missing** — this only affects `prefix_caching_off` detection, not `cacheSavingsFraction`. OpenCost logs a warning if the metric exists but the pod-label join fails:
+  ```
+  InferenceCost: vllm:cache_config_info exists in Prometheus but the join with
+  vllm:prompt_tokens_total produced no results — likely a pod-label mismatch
+  ```
+
+### Costs look too high
+
+- Check utilization: `costBasis=allocation` includes idle time. A GPU reserved for an hour but processing very few tokens will show a high $/M token rate. 
+- Check whether shared infra pods (EPP, gateway) are correctly labelled with `INFERENCE_SHARED_INFRA_LABEL`. Without this label their costs appear as unattributed allocation overhead.
+
+## Support
+
+- GitHub Issues: https://github.com/opencost/opencost/issues
+- Slack: [#opencost](https://cloud-native.slack.com/archives/C03D56FPD4G) on CNCF Slack

+ 69 - 1
docs/opencost.postman_collection.json

@@ -124,6 +124,74 @@
 				}
 			},
 			"response": []
+		},
+		{
+			"name": "inference costs total by model (requires INFERENCE_COST_ENABLED=true)",
+			"request": {
+				"method": "GET",
+				"header": [],
+				"url": {
+					"raw": "{{host}}/inferenceCost/total?window=7d&aggregate=model_name&costBasis=allocation",
+					"host": [
+						"{{host}}"
+					],
+					"path": [
+						"inferenceCost",
+						"total"
+					],
+					"query": [
+						{
+							"key": "window",
+							"value": "7d"
+						},
+						{
+							"key": "aggregate",
+							"value": "model_name"
+						},
+						{
+							"key": "costBasis",
+							"value": "allocation"
+						}
+					]
+				}
+			},
+			"response": []
+		},
+		{
+			"name": "inference costs daily timeseries by model (requires INFERENCE_COST_ENABLED=true)",
+			"request": {
+				"method": "GET",
+				"header": [],
+				"url": {
+					"raw": "{{host}}/inferenceCost/timeseries?window=7d&accumulate=day&aggregate=model_name&costBasis=allocation",
+					"host": [
+						"{{host}}"
+					],
+					"path": [
+						"inferenceCost",
+						"timeseries"
+					],
+					"query": [
+						{
+							"key": "window",
+							"value": "7d"
+						},
+						{
+							"key": "accumulate",
+							"value": "day"
+						},
+						{
+							"key": "aggregate",
+							"value": "model_name"
+						},
+						{
+							"key": "costBasis",
+							"value": "allocation"
+						}
+					]
+				}
+			},
+			"response": []
 		}
 	],
 	"event": [
@@ -153,4 +221,4 @@
 			"type": "default"
 		}
 	]
-}
+}

+ 338 - 0
docs/swagger.json

@@ -137,6 +137,170 @@
           }
         }
       }
+    },
+    "/inferenceCost/total": {
+      "get": {
+        "summary": "query for aggregated AI inference costs",
+        "description": "Returns a single aggregated InferenceCostSet covering the full requested window. Costs are broken down per model/namespace with blended and differentiated (input/output) cost-per-million-token rates under the chosen cost basis. Requires `INFERENCE_COST_ENABLED=true`.",
+        "parameters": [
+          {
+            "name": "window",
+            "in": "query",
+            "required": true,
+            "description": "Duration of time over which to query. Accepts durations like `7d`, `24h`, or RFC3339 date pairs like `2024-01-01T00:00:00Z,2024-01-02T00:00:00Z`.",
+            "schema": {
+              "type": "string"
+            },
+            "examples": {
+              "7days": {
+                "value": "7d"
+              },
+              "24hours": {
+                "value": "24h"
+              },
+              "range": {
+                "value": "2024-01-01T00:00:00Z,2024-01-08T00:00:00Z"
+              }
+            }
+          },
+          {
+            "name": "costBasis",
+            "in": "query",
+            "description": "`allocation` (default): max(request,usage) × price + idle + shared infra; reconciles to the infrastructure bill. `usage`: actual consumption only; idle and shared infra costs excluded; does not reconcile to the bill.",
+            "schema": {
+              "type": "string",
+              "enum": ["allocation", "usage"],
+              "default": "allocation"
+            }
+          },
+          {
+            "name": "aggregate",
+            "in": "query",
+            "description": "Comma-separated dimensions to aggregate by. Supported values: `model_name`, `model_version`, `namespace`, `cluster`, `pod`, `controller`, `controller_kind`, `container`, `workload_type`.",
+            "schema": {
+              "type": "string"
+            },
+            "example": "model_name"
+          },
+          {
+            "name": "accumulate",
+            "in": "query",
+            "description": "Step size used internally before accumulating into the total. Accepted values: `hour`, `day`, `week`, `month`. Optional for this endpoint.",
+            "schema": {
+              "type": "string",
+              "enum": ["hour", "day", "week", "month"]
+            }
+          },
+          {
+            "name": "filter",
+            "in": "query",
+            "description": "Filter results by property values. Format: `prop:value` for a single filter, `prop:value+prop:value` for AND logic. Supported properties: `model_name`, `model_version`, `namespace`, `cluster`, `pod`, `controller`, `controller_kind`, `container`, `workload_type`.",
+            "schema": {
+              "type": "string"
+            },
+            "example": "namespace:llm-d-prod+model_name:Qwen/Qwen3-32B"
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Success",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/InferenceCostSetResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "description": "Bad request — missing or invalid parameters"
+          },
+          "501": {
+            "description": "Inference cost tracking is not enabled (`INFERENCE_COST_ENABLED` is not set to `true`)"
+          }
+        }
+      }
+    },
+    "/inferenceCost/timeseries": {
+      "get": {
+        "summary": "query for AI inference costs as a time series",
+        "description": "Returns one InferenceCostSet per time step within the requested window. The `accumulate` parameter is required and defines the step size. All other parameters are identical to `/inferenceCost/total`. Requires `INFERENCE_COST_ENABLED=true`.",
+        "parameters": [
+          {
+            "name": "window",
+            "in": "query",
+            "required": true,
+            "description": "Duration of time over which to query. Accepts durations like `7d`, `24h`, or RFC3339 date pairs.",
+            "schema": {
+              "type": "string"
+            },
+            "examples": {
+              "7days": {
+                "value": "7d"
+              },
+              "range": {
+                "value": "2024-01-01T00:00:00Z,2024-01-08T00:00:00Z"
+              }
+            }
+          },
+          {
+            "name": "accumulate",
+            "in": "query",
+            "required": true,
+            "description": "Step size for each time-series data point. Required for this endpoint.",
+            "schema": {
+              "type": "string",
+              "enum": ["hour", "day", "week", "month"]
+            },
+            "example": "day"
+          },
+          {
+            "name": "costBasis",
+            "in": "query",
+            "description": "`allocation` (default) or `usage`. See `/inferenceCost/total` for details.",
+            "schema": {
+              "type": "string",
+              "enum": ["allocation", "usage"],
+              "default": "allocation"
+            }
+          },
+          {
+            "name": "aggregate",
+            "in": "query",
+            "description": "Comma-separated dimensions to aggregate by: `model_name`, `model_version`, `namespace`, `cluster`, `pod`, `controller`, `controller_kind`, `container`, `workload_type`.",
+            "schema": {
+              "type": "string"
+            },
+            "example": "model_name"
+          },
+          {
+            "name": "filter",
+            "in": "query",
+            "description": "Filter by property values. Format: `prop:value+prop:value` (AND logic). Supported properties: `model_name`, `model_version`, `namespace`, `cluster`, `pod`, `controller`, `controller_kind`, `container`, `workload_type`.",
+            "schema": {
+              "type": "string"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Success",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/InferenceCostSetRangeResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "description": "Bad request — missing or invalid parameters (including missing `accumulate`)"
+          },
+          "501": {
+            "description": "Inference cost tracking is not enabled (`INFERENCE_COST_ENABLED` is not set to `true`)"
+          }
+        }
+      }
     }
   },
   "components": {
@@ -313,6 +477,180 @@
             }
           }
         }
+      },
+      "Window": {
+        "type": "object",
+        "properties": {
+          "start": {
+            "type": "string",
+            "format": "date-time"
+          },
+          "end": {
+            "type": "string",
+            "format": "date-time"
+          }
+        }
+      },
+      "InferenceCostProperties": {
+        "type": "object",
+        "properties": {
+          "modelName": {
+            "type": "string",
+            "description": "Name of the AI model (e.g. `Qwen/Qwen3-32B`)."
+          },
+          "modelVersion": {
+            "type": "string",
+            "description": "Version of the model. Omitted when not available."
+          },
+          "namespace": {
+            "type": "string",
+            "description": "Kubernetes namespace where the model is deployed."
+          },
+          "cluster": {
+            "type": "string",
+            "description": "Cluster identifier. Omitted when not available."
+          },
+          "pod": {
+            "type": "string",
+            "description": "Kubernetes pod name. Omitted when not available."
+          },
+          "controller": {
+            "type": "string",
+            "description": "Kubernetes controller name (e.g. deployment, statefulset). Omitted when not available."
+          },
+          "controllerKind": {
+            "type": "string",
+            "description": "Kubernetes controller kind (e.g. Deployment, StatefulSet). Omitted when not available."
+          },
+          "container": {
+            "type": "string",
+            "description": "Container name within the pod. Omitted when not available."
+          },
+          "workloadType": {
+            "type": "string",
+            "description": "Type of workload generating these costs. Currently always 'inference'. Future values may include 'training', 'fine-tuning', etc.",
+            "example": "inference"
+          }
+        }
+      },
+      "InferenceCostResponse": {
+        "type": "object",
+        "description": "Inference cost data for a single model/namespace combination within a time window.",
+        "properties": {
+          "properties": {
+            "$ref": "#/components/schemas/InferenceCostProperties"
+          },
+          "window": {
+            "$ref": "#/components/schemas/Window"
+          },
+          "costBasis": {
+            "type": "string",
+            "enum": ["allocation", "usage"],
+            "description": "The cost basis used for this response. `allocation`: max(request,usage) × price + idle + shared infra; reconciles to the bill. `usage`: actual consumption only; idle and shared infra costs excluded; does not reconcile to the bill."
+          },
+          "totalCost": {
+            "type": "number",
+            "description": "Total infrastructure cost for the window under the chosen cost basis, in USD."
+          },
+          "promptTokens": {
+            "type": "number",
+            "description": "Total prompt (input) tokens processed in the window."
+          },
+          "generationTokens": {
+            "type": "number",
+            "description": "Total generation (output) tokens produced in the window."
+          },
+          "totalTokens": {
+            "type": "number",
+            "description": "Total tokens delivered (prompt + generation)."
+          },
+          "costPerMillionTokens": {
+            "type": "number",
+            "description": "Blended infrastructure cost per 1M delivered tokens (input + output combined), in USD."
+          },
+          "inputCost": {
+            "type": "number",
+            "description": "Portion of totalCost attributed to input (prompt) processing, in USD."
+          },
+          "outputCost": {
+            "type": "number",
+            "description": "Portion of totalCost attributed to output (generation) processing, in USD."
+          },
+          "inputCostPerMillionTokens": {
+            "type": "number",
+            "description": "Infrastructure cost per 1M delivered input (prompt) tokens, in USD. Uses promptTokens as the denominator so the rate is meaningful regardless of KV cache hit rate. See cacheSavingsFraction for cache utilization."
+          },
+          "outputCostPerMillionTokens": {
+            "type": "number",
+            "description": "Infrastructure cost per 1M output (generation) tokens, in USD."
+          },
+          "cacheSavingsFraction": {
+            "type": "number",
+            "description": "Fraction of prompt tokens served from the KV cache (range 0–1). Sourced from vllm:prefix_cache_hits_total, which reports cached tokens directly. Zero when prefix caching is disabled (see allocationMethod), when no cache hits occurred in the window, or when the metric is unavailable."
+          },
+          "allocationMethod": {
+            "type": "string",
+            "enum": ["compute_time", "prefix_caching_off", "multiplier"],
+            "description": "Method used to split costs between input and output tokens. `compute_time`: split proportionally by vLLM prefill/decode time; KV cache savings are reflected in cacheSavingsFraction. `prefix_caching_off`: same time-based split but prefix caching is explicitly disabled on the vLLM instance — cacheSavingsFraction will be zero by configuration. `multiplier`: fixed output/input ratio (fallback when timing metrics are unavailable)."
+          }
+        }
+      },
+      "InferenceCostSet": {
+        "type": "object",
+        "description": "A collection of InferenceCostResponse entries for a single time window, keyed by aggregation key (e.g. `modelName:namespace`).",
+        "properties": {
+          "inferenceCosts": {
+            "type": "object",
+            "additionalProperties": {
+              "$ref": "#/components/schemas/InferenceCostResponse"
+            }
+          },
+          "window": {
+            "$ref": "#/components/schemas/Window"
+          }
+        }
+      },
+      "InferenceCostSetResponse": {
+        "type": "object",
+        "description": "Response envelope for /inferenceCost/total.",
+        "properties": {
+          "code": {
+            "type": "integer"
+          },
+          "status": {
+            "type": "string"
+          },
+          "data": {
+            "$ref": "#/components/schemas/InferenceCostSet"
+          }
+        }
+      },
+      "InferenceCostSetRangeResponse": {
+        "type": "object",
+        "description": "Response envelope for /inferenceCost/timeseries.",
+        "properties": {
+          "code": {
+            "type": "integer"
+          },
+          "status": {
+            "type": "string"
+          },
+          "data": {
+            "type": "object",
+            "properties": {
+              "inferenceCostSets": {
+                "type": "array",
+                "description": "One InferenceCostSet per time step.",
+                "items": {
+                  "$ref": "#/components/schemas/InferenceCostSet"
+                }
+              },
+              "window": {
+                "$ref": "#/components/schemas/Window"
+              }
+            }
+          }
+        }
       }
     }
   }

+ 75 - 0
modules/collector-source/pkg/collector/metricsquerier.go

@@ -1,6 +1,7 @@
 package collector
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/opencost/opencost/core/pkg/source"
@@ -747,3 +748,77 @@ func (c *collectorMetricsQuerier) QueryResourceQuotaStatusUsedRAMLimitMax(start,
 func (c *collectorMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {
 	return c.collectorProvider.GetDailyDataCoverage(limitDays)
 }
+
+
+// Inference cost methods - not supported by collector source (only available via Prometheus)
+
+// QueryInferencePromptTokens is a stub for the collector source: the returned
+// future is pre-loaded with a single error result and the decoder only ever
+// yields an empty value map. The start and end arguments are not used.
+func (c *collectorMetricsQuerier) QueryInferencePromptTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferencePromptTokens",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceTokensResult {
+			return &source.InferenceTokensResult{Values: map[string]float64{}}
+		},
+		results,
+	)
+}
+
+// QueryInferenceGenerationTokens is a stub for the collector source: the
+// returned future is pre-loaded with a single error result and the decoder
+// only ever yields an empty value map. The start and end arguments are not used.
+func (c *collectorMetricsQuerier) QueryInferenceGenerationTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferenceGenerationTokens",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceTokensResult {
+			return &source.InferenceTokensResult{Values: map[string]float64{}}
+		},
+		results,
+	)
+}
+
+// QueryInferenceInputProcessingTime is a stub for the collector source: the
+// returned future is pre-loaded with a single error result and the decoder
+// only ever yields an empty value map. The start and end arguments are not used.
+func (c *collectorMetricsQuerier) QueryInferenceInputProcessingTime(start, end time.Time) *source.Future[source.InferenceProcessingTimeResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferenceInputProcessingTime",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceProcessingTimeResult {
+			return &source.InferenceProcessingTimeResult{Values: map[string]float64{}}
+		},
+		results,
+	)
+}
+
+// QueryInferenceOutputProcessingTime is a stub for the collector source: the
+// returned future is pre-loaded with a single error result and the decoder
+// only ever yields an empty value map. The start and end arguments are not used.
+func (c *collectorMetricsQuerier) QueryInferenceOutputProcessingTime(start, end time.Time) *source.Future[source.InferenceProcessingTimeResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferenceOutputProcessingTime",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceProcessingTimeResult {
+			return &source.InferenceProcessingTimeResult{Values: map[string]float64{}}
+		},
+		results,
+	)
+}
+
+// QueryInferenceCachedTokens is a stub for the collector source: the returned
+// future is pre-loaded with a single error result and the decoder only ever
+// yields an empty value map. The start and end arguments are not used.
+func (c *collectorMetricsQuerier) QueryInferenceCachedTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferenceCachedTokens",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceTokensResult {
+			return &source.InferenceTokensResult{Values: map[string]float64{}}
+		},
+		results,
+	)
+}
+
+// QueryInferenceCacheConfig is a stub for the collector source: the returned
+// future is pre-loaded with a single error result and the decoder only ever
+// yields an empty config map. The t argument is not used.
+func (c *collectorMetricsQuerier) QueryInferenceCacheConfig(t time.Time) *source.Future[source.InferenceCacheConfigResult] {
+	// Capacity 1 lets the one result be queued without a waiting receiver.
+	results := make(source.QueryResultsChan, 1)
+	results <- &source.QueryResults{
+		Query: "QueryInferenceCacheConfig",
+		Error: fmt.Errorf("inference metrics not supported by collector source"),
+	}
+
+	return source.NewFuture(
+		func(*source.QueryResult) *source.InferenceCacheConfigResult {
+			return &source.InferenceCacheConfigResult{Configs: map[string]*source.InferenceCacheConfig{}}
+		},
+		results,
+	)
+}

+ 366 - 0
modules/prometheus-source/pkg/prom/inference_queries.go

@@ -0,0 +1,366 @@
+package prom
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/core/pkg/source"
+	"github.com/opencost/opencost/core/pkg/util"
+)
+
+// QueryInferencePromptTokens implements MetricsQuerier.QueryInferencePromptTokens.
+//
+// It runs queryCounterDelta for vllm:prompt_tokens_total over [start, end] in
+// a background goroutine and hands back a future that decodes the outcome
+// with decodeInferenceTokensResult.
+func (pds *PrometheusMetricsQuerier) QueryInferencePromptTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if deltas, err := queryCounterDelta(ctx, "vllm:prompt_tokens_total", start, end); err != nil {
+			res.Error = err
+		} else {
+			res.Results = mapToQueryResults(deltas)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceTokensResult, out)
+}
+
+// QueryInferenceGenerationTokens implements MetricsQuerier.QueryInferenceGenerationTokens.
+//
+// It runs queryCounterDelta for vllm:generation_tokens_total over [start, end]
+// in a background goroutine and hands back a future that decodes the outcome
+// with decodeInferenceTokensResult.
+func (pds *PrometheusMetricsQuerier) QueryInferenceGenerationTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if deltas, err := queryCounterDelta(ctx, "vllm:generation_tokens_total", start, end); err != nil {
+			res.Error = err
+		} else {
+			res.Results = mapToQueryResults(deltas)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceTokensResult, out)
+}
+
+// QueryInferenceInputProcessingTime implements MetricsQuerier.QueryInferenceInputProcessingTime.
+//
+// It runs queryCounterDelta for vllm:request_prefill_time_seconds_sum over
+// [start, end] in a background goroutine and hands back a future that decodes
+// the outcome with decodeInferenceProcessingTimeResult.
+func (pds *PrometheusMetricsQuerier) QueryInferenceInputProcessingTime(start, end time.Time) *source.Future[source.InferenceProcessingTimeResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if deltas, err := queryCounterDelta(ctx, "vllm:request_prefill_time_seconds_sum", start, end); err != nil {
+			res.Error = err
+		} else {
+			res.Results = mapToQueryResults(deltas)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceProcessingTimeResult, out)
+}
+
+// QueryInferenceOutputProcessingTime implements MetricsQuerier.QueryInferenceOutputProcessingTime.
+//
+// It runs queryCounterDelta for vllm:request_time_per_output_token_seconds_sum
+// over [start, end] in a background goroutine and hands back a future that
+// decodes the outcome with decodeInferenceProcessingTimeResult.
+func (pds *PrometheusMetricsQuerier) QueryInferenceOutputProcessingTime(start, end time.Time) *source.Future[source.InferenceProcessingTimeResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if deltas, err := queryCounterDelta(ctx, "vllm:request_time_per_output_token_seconds_sum", start, end); err != nil {
+			res.Error = err
+		} else {
+			res.Results = mapToQueryResults(deltas)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceProcessingTimeResult, out)
+}
+
+// QueryInferenceCachedTokens implements MetricsQuerier.QueryInferenceCachedTokens.
+//
+// It runs queryCounterDelta for vllm:prefix_cache_hits_total over [start, end]
+// in a background goroutine and hands back a future that decodes the outcome
+// with decodeInferenceTokensResult.
+func (pds *PrometheusMetricsQuerier) QueryInferenceCachedTokens(start, end time.Time) *source.Future[source.InferenceTokensResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if deltas, err := queryCounterDelta(ctx, "vllm:prefix_cache_hits_total", start, end); err != nil {
+			res.Error = err
+		} else {
+			res.Results = mapToQueryResults(deltas)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceTokensResult, out)
+}
+
+// QueryInferenceCacheConfig implements MetricsQuerier.QueryInferenceCacheConfig.
+//
+// It runs queryCacheConfigs for time t in a background goroutine and hands
+// back a future that decodes the outcome with decodeInferenceCacheConfigResult.
+func (pds *PrometheusMetricsQuerier) QueryInferenceCacheConfig(t time.Time) *source.Future[source.InferenceCacheConfigResult] {
+	// Buffered (capacity 1) so the goroutine can deliver its single message
+	// without waiting for a receiver.
+	out := make(source.QueryResultsChan, 1)
+	ctx := pds.promContexts.NewNamedContext(ClusterContextName)
+
+	go func() {
+		res := &source.QueryResults{}
+		if cfgs, err := queryCacheConfigs(ctx, t); err != nil {
+			res.Error = err
+		} else {
+			res.Results = cacheConfigMapToQueryResults(cfgs)
+		}
+		out <- res
+	}()
+
+	return source.NewFuture(decodeInferenceCacheConfigResult, out)
+}
+
+// Decoder functions
+
+// decodeInferenceTokensResult turns one QueryResult into an
+// InferenceTokensResult holding a single entry: the string stored under "key"
+// mapped to the value of the first sample.
+//
+// A result without a "key" string, or without a usable first sample, yields
+// an empty (non-nil) Values map and a logged warning instead of a panic or an
+// entry filed under the empty key.
+func decodeInferenceTokensResult(result *source.QueryResult) *source.InferenceTokensResult {
+	decoded := &source.InferenceTokensResult{Values: make(map[string]float64, 1)}
+
+	key, err := result.GetString("key")
+	if err != nil {
+		log.Warnf("InferenceCost: token result has no \"key\" field: %v", err)
+		return decoded
+	}
+	// Indexing an empty slice or dereferencing a nil sample would panic.
+	if len(result.Values) == 0 || result.Values[0] == nil {
+		log.Warnf("InferenceCost: token result %q carries no sample value", key)
+		return decoded
+	}
+
+	decoded.Values[key] = result.Values[0].Value
+	return decoded
+}
+
+// decodeInferenceProcessingTimeResult turns one QueryResult into an
+// InferenceProcessingTimeResult holding a single entry: the string stored
+// under "key" mapped to the value of the first sample.
+//
+// A result without a "key" string, or without a usable first sample, yields
+// an empty (non-nil) Values map and a logged warning instead of a panic or an
+// entry filed under the empty key.
+func decodeInferenceProcessingTimeResult(result *source.QueryResult) *source.InferenceProcessingTimeResult {
+	decoded := &source.InferenceProcessingTimeResult{Values: make(map[string]float64, 1)}
+
+	key, err := result.GetString("key")
+	if err != nil {
+		log.Warnf("InferenceCost: processing-time result has no \"key\" field: %v", err)
+		return decoded
+	}
+	// Indexing an empty slice or dereferencing a nil sample would panic.
+	if len(result.Values) == 0 || result.Values[0] == nil {
+		log.Warnf("InferenceCost: processing-time result %q carries no sample value", key)
+		return decoded
+	}
+
+	decoded.Values[key] = result.Values[0].Value
+	return decoded
+}
+
+// decodeInferenceCacheConfigResult turns one QueryResult into an
+// InferenceCacheConfigResult holding a single entry: the string stored under
+// "key" mapped to a config whose PrefixCachingEnabled flag is true when the
+// first sample value is greater than zero.
+//
+// A result without a "key" string, or without a usable first sample, yields
+// an empty (non-nil) Configs map and a logged warning instead of a panic or
+// an entry filed under the empty key.
+func decodeInferenceCacheConfigResult(result *source.QueryResult) *source.InferenceCacheConfigResult {
+	decoded := &source.InferenceCacheConfigResult{
+		Configs: make(map[string]*source.InferenceCacheConfig, 1),
+	}
+
+	key, err := result.GetString("key")
+	if err != nil {
+		log.Warnf("InferenceCost: cache-config result has no \"key\" field: %v", err)
+		return decoded
+	}
+	// Indexing an empty slice or dereferencing a nil sample would panic.
+	if len(result.Values) == 0 || result.Values[0] == nil {
+		log.Warnf("InferenceCost: cache-config result %q carries no sample value", key)
+		return decoded
+	}
+
+	decoded.Configs[key] = &source.InferenceCacheConfig{
+		PrefixCachingEnabled: result.Values[0].Value > 0,
+	}
+	return decoded
+}
+
+// Helper functions
+
+// mapToQueryResults wraps every entry of values in its own QueryResult: the
+// map key is passed to NewQueryResult under "key" and the number becomes the
+// single sample. The output follows map iteration order, which Go leaves
+// unspecified.
+func mapToQueryResults(values map[string]float64) []*source.QueryResult {
+	out := make([]*source.QueryResult, len(values))
+	i := 0
+	for name, v := range values {
+		out[i] = source.NewQueryResult(
+			map[string]any{"key": name},
+			[]*util.Vector{{Value: v}},
+			nil,
+		)
+		i++
+	}
+	return out
+}
+
+// cacheConfigMapToQueryResults converts a map[string]*InferenceCacheConfig to []*QueryResult
+func cacheConfigMapToQueryResults(configs map[string]*source.InferenceCacheConfig) []*source.QueryResult {
+	results := make([]*source.QueryResult, 0, len(configs))
+	for key, config := range configs {
+		value := 0.0
+		if config.PrefixCachingEnabled {
+			value = 1.0
+		}
+		result := source.NewQueryResult(
+			map[string]any{"key": key},
+			[]*util.Vector{{Value: value}},
+			nil,
+		)
+		results = append(results, result)
+	}
+	return results
+}
+
+// queryCounterDelta returns the net increase of a monotonic counter metric
+// over [start, end], keyed by modelNamespaceKey(model_name, namespace).
+//
+// It uses the @ modifier to pin two instant queries to start and end,
+// then subtracts. This avoids the extrapolation inflation produced by
+// increase(metric[Xm]) when a series has fewer samples than the window
+// (e.g. a pod that restarted mid-window, or a sum across many replicas
+// where Prometheus extrapolates each series independently before summing).
+//
+// The two queries use different lookbacks:
+//   - end:   last_over_time(metric[<window>m] @ end) looks back over the whole
+//     window (rounded up to full minutes, minimum 2m), so a model that was
+//     active early in the window but idle at the end is still found.
+//   - start: last_over_time(metric[2m] @ start) only looks 2 minutes back, so a
+//     stale value from long before the window is not used as the baseline.
+//
+// end is clamped to time.Now() because a future @ timestamp returns no data.
+// Keys with no sample near start get a start value of 0 (treated as new).
+// A negative delta is treated as a counter reset and replaced by the end
+// value (post-reset activity). Both queries sum across series before the
+// subtraction, so a reset is only detected when it drives the summed delta
+// negative. Keys that appear only in the start query are not reported.
+func queryCounterDelta(ctx *Context, metric string, start, end time.Time) (map[string]float64, error) {
+	startUnix := start.Unix()
+	// Clamp end to now: last_over_time with a future @ timestamp returns no results.
+	effectiveEnd := end
+	if now := time.Now(); end.After(now) {
+		effectiveEnd = now
+	}
+	endUnix := effectiveEnd.Unix()
+
+	// The end lookback must span the full window. Round UP to whole minutes:
+	// truncating (e.g. 2m59s -> 2m) would leave the first part of the window
+	// outside the lookback and could miss a series whose last sample is there.
+	// Non-positive or tiny windows fall back to the 2m minimum.
+	windowDuration := effectiveEnd.Sub(start)
+	windowMinutes := int((windowDuration + time.Minute - 1) / time.Minute)
+	if windowMinutes < 2 {
+		windowMinutes = 2
+	}
+
+	// Query counter value at the end of the window.
+	endQuery := fmt.Sprintf(`sum by (model_name, namespace) (last_over_time(%s[%dm] @ %d))`, metric, windowMinutes, endUnix)
+	endVals, err := queryInstantMetric(ctx, endQuery, effectiveEnd)
+	if err != nil {
+		return nil, fmt.Errorf("end-of-window query for %s: %w", metric, err)
+	}
+
+	// Query counter value at the start of the window.
+	// Use a narrow 2m lookback here: we want the value just before the window
+	// opens, not a stale value from much earlier that would undercount the delta.
+	// The @ modifier pins the evaluation to start, so the query time passed
+	// to queryInstantMetric does not move the sample that is selected.
+	startQuery := fmt.Sprintf(`sum by (model_name, namespace) (last_over_time(%s[2m] @ %d))`, metric, startUnix)
+	startVals, err := queryInstantMetric(ctx, startQuery, effectiveEnd)
+	if err != nil {
+		return nil, fmt.Errorf("start-of-window query for %s: %w", metric, err)
+	}
+
+	// Delta = end - start. If negative (counter reset), use endVal as a
+	// lower bound to capture post-reset activity rather than reporting 0.
+	out := make(map[string]float64, len(endVals))
+	for key, endVal := range endVals {
+		// A key missing from startVals reads as 0, i.e. the series is new.
+		delta := endVal - startVals[key]
+		if delta < 0 {
+			// Counter reset detected: use endVal to capture post-reset activity
+			delta = endVal
+		}
+		out[key] = delta
+	}
+	return out, nil
+}
+
+// queryCacheConfigs reports, per modelNamespaceKey(model_name, namespace),
+// whether vLLM prefix caching is enabled at time t.
+//
+// vllm:cache_config_info carries the enable_prefix_caching label but no model
+// name, so it is joined on (namespace, pod) with vllm:prompt_tokens_total,
+// which does. Series without a model_name or without the
+// enable_prefix_caching label are skipped; an empty namespace is reported as
+// "unknown". The label is compared case-insensitively against "true". If
+// several series map to the same key, the last one in the result wins.
+//
+// When the join yields no entries at all, a second query checks whether
+// vllm:cache_config_info exists on its own and, if so, logs a warning that
+// points at a pod-label mismatch. Failures of that diagnostic query are
+// ignored.
+func queryCacheConfigs(ctx *Context, t time.Time) (map[string]*source.InferenceCacheConfig, error) {
+	joinQuery := `
+		max by (model_name, namespace, enable_prefix_caching) (
+			sum by (model_name, namespace, pod) (vllm:prompt_tokens_total)
+			* on (namespace, pod) group_left(enable_prefix_caching)
+			max by (namespace, pod, enable_prefix_caching) (vllm:cache_config_info)
+		)
+	`
+
+	raw, _, err := ctx.query(joinQuery, t)
+	if err != nil {
+		return nil, err
+	}
+
+	joined := NewQueryResults(joinQuery, raw, source.ClusterKeyWithDefaults(ctx.config.ClusterLabel))
+	if joined.Error != nil {
+		return nil, joined.Error
+	}
+
+	configs := make(map[string]*source.InferenceCacheConfig)
+	for _, series := range joined.Results {
+		model, modelErr := series.GetString("model_name")
+		if modelErr != nil || model == "" {
+			continue
+		}
+
+		ns, nsErr := series.GetString("namespace")
+		if nsErr != nil || ns == "" {
+			ns = "unknown"
+		}
+
+		flag, flagErr := series.GetString("enable_prefix_caching")
+		if flagErr != nil {
+			continue
+		}
+
+		configs[modelNamespaceKey(model, ns)] = &source.InferenceCacheConfig{
+			PrefixCachingEnabled: strings.EqualFold(flag, "true"),
+		}
+	}
+
+	// Happy path: the join produced data, no diagnostics needed.
+	if len(configs) > 0 {
+		return configs, nil
+	}
+
+	// The join was empty. Find out whether cache_config_info exists at all, so
+	// a join failure can be told apart from a metric that is simply absent.
+	diagQuery := `max by (namespace) (vllm:cache_config_info)`
+	diagRaw, _, diagErr := ctx.query(diagQuery, t)
+	if diagErr != nil {
+		return configs, nil
+	}
+
+	diag := NewQueryResults(diagQuery, diagRaw, source.ClusterKeyWithDefaults(ctx.config.ClusterLabel))
+	if diag.Error != nil || len(diag.Results) == 0 {
+		return configs, nil
+	}
+
+	log.Warnf("InferenceCost: vllm:cache_config_info exists in Prometheus but the join with "+
+		"vllm:prompt_tokens_total produced no results — likely a pod-label mismatch between "+
+		"the two metrics (check that both carry matching 'namespace' and 'pod' labels). "+
+		"prefix_caching_off detection will be disabled; allocation method will be 'compute_time'.")
+
+	return configs, nil
+}
+
+// queryInstantMetric evaluates query as a Prometheus instant query at time t
+// and flattens the answer into a map keyed by
+// modelNamespaceKey(model_name, namespace).
+//
+// Series without a model_name label, or without any sample, are dropped. A
+// missing or empty namespace label is reported as "unknown". Only the first
+// sample of each series is used.
+func queryInstantMetric(ctx *Context, query string, t time.Time) (map[string]float64, error) {
+	raw, _, err := ctx.query(query, t)
+	if err != nil {
+		return nil, err
+	}
+
+	parsed := NewQueryResults(query, raw, source.ClusterKeyWithDefaults(ctx.config.ClusterLabel))
+	if parsed.Error != nil {
+		return nil, parsed.Error
+	}
+
+	values := make(map[string]float64, len(parsed.Results))
+	for _, series := range parsed.Results {
+		model, modelErr := series.GetString("model_name")
+		if modelErr != nil || model == "" {
+			continue
+		}
+
+		ns, nsErr := series.GetString("namespace")
+		if nsErr != nil || ns == "" {
+			ns = "unknown"
+		}
+
+		if len(series.Values) > 0 {
+			values[modelNamespaceKey(model, ns)] = series.Values[0].Value
+		}
+	}
+	return values, nil
+}
+
+func modelNamespaceKey(modelName, namespace string) string {
+	return modelName + ":" + namespace
+}
+
+// Made with Bob

+ 3 - 1
pkg/cmd/costmodel/config.go

@@ -13,6 +13,7 @@ type Config struct {
 	CloudCostEnabled       bool
 	CustomCostEnabled      bool
 	MCPServerEnabled       bool
+	InferenceCostEnabled   bool
 }
 
 func DefaultConfig() *Config {
@@ -23,6 +24,7 @@ func DefaultConfig() *Config {
 		CloudCostEnabled:       env.IsCloudCostEnabled(),
 		MCPServerEnabled:       env.IsMCPServerEnabled(),
 		CustomCostEnabled:      env.IsCustomCostEnabled(),
+		InferenceCostEnabled:   env.IsInferenceCostEnabled(),
 	}
 }
 
@@ -30,7 +32,7 @@ func (c *Config) log() {
 	log.Infof("Kubernetes enabled: %t", c.KubernetesEnabled)
 	log.Infof("Carbon Estimates enabled: %t", c.CarbonEstimatesEnabled)
 	log.Infof("Cloud Costs enabled: %t", c.CloudCostEnabled)
-	log.Infof("Custom Costs enabled: %t", c.CustomCostEnabled)
 	log.Infof("MCP Server enabled: %t", c.MCPServerEnabled)
 	log.Infof("Custom Costs enabled: %t", c.CustomCostEnabled)
+	log.Infof("Inference Cost enabled: %t", c.InferenceCostEnabled)
 }

+ 46 - 0
pkg/cmd/costmodel/costmodel.go

@@ -24,6 +24,7 @@ import (
 	"github.com/opencost/opencost/pkg/costmodel"
 	"github.com/opencost/opencost/pkg/env"
 	"github.com/opencost/opencost/pkg/filemanager"
+	"github.com/opencost/opencost/pkg/inferencecost"
 	opencost_mcp "github.com/opencost/opencost/pkg/mcp"
 	"github.com/opencost/opencost/pkg/metrics"
 )
@@ -51,6 +52,18 @@ func Execute(conf *Config) error {
 			log.Errorf("couldn't start CSV export worker: %v", err)
 		}
 
+		// Register inference cost routes unconditionally so clients receive 501
+		// (not 404) when INFERENCE_COST_ENABLED=false. The QueryService nil-
+		// checks in each handler produce the 501 when qs is nil.
+		var inferenceCostQueryService *inferencecost.QueryService
+		if conf.InferenceCostEnabled {
+			if err := StartInferenceCostCollector(ctx, a, &inferenceCostQueryService); err != nil {
+				log.Errorf("Failed to start inference cost collector: %v", err)
+			}
+		}
+		router.GET("/inferenceCost/total", inferenceCostQueryService.GetInferenceCostTotalHandler())
+		router.GET("/inferenceCost/timeseries", inferenceCostQueryService.GetInferenceCostTimeseriesHandler())
+
 		// Register OpenCost Specific Endpoints
 		router.GET("/allocation", a.ComputeAllocationHandler)
 		router.GET("/allocation/summary", a.ComputeAllocationHandlerSummary)
@@ -147,6 +160,39 @@ func Execute(conf *Config) error {
 	}
 }
 
+// StartInferenceCostCollector wires up the inference cost pipeline. It builds
+// a collector from the default inferencecost config, the cost model and the
+// data source's metrics querier, registers a new exporter, and launches the
+// runner's Start loop in a background goroutine.
+//
+// On success *qs is set to a QueryService that shares the collector and
+// calculator with the runner, so the caller can register API routes on it.
+// If the collector cannot be created or the exporter fails to register, the
+// error is returned, *qs is left untouched and no goroutine is started;
+// logging the failure is left to the caller.
+func StartInferenceCostCollector(ctx context.Context, a *costmodel.Accesses, qs **inferencecost.QueryService) error {
+	config := inferencecost.DefaultConfig()
+
+	// The collector reads its metrics through the DataSource's MetricsQuerier.
+	collector, err := inferencecost.NewCollector(config, a.Model, a.DataSource.Metrics())
+	if err != nil {
+		return err
+	}
+
+	exporter := inferencecost.NewExporter()
+	err = exporter.Register()
+	if err != nil {
+		return err
+	}
+
+	calculator := inferencecost.NewCalculator(config)
+	runner := inferencecost.NewRunner(collector, calculator, exporter, config.CollectionInterval)
+
+	// The background runner and the API use the same collector and calculator.
+	// NOTE(review): sharing is stated to be safe because both paths are
+	// read-only — confirm against the inferencecost package.
+	*qs = inferencecost.NewQueryService(collector, calculator)
+
+	go runner.Start(ctx)
+	log.Infof("InferenceCost: collector started (interval=%s)", config.CollectionInterval)
+	return nil
+}
+
 func StartExportWorker(ctx context.Context, model costmodel.AllocationModel) error {
 	exportPath := env.GetExportCSVFile()
 	if exportPath == "" {

+ 34 - 0
pkg/env/costmodel.go

@@ -102,6 +102,13 @@ const (
 
 	// Metrics Emitter
 	MetricsEmitterQueryWindowEnvVar = "METRICS_EMITTER_QUERY_WINDOW"
+
+	// Inference Cost
+	InferenceCostEnabledEnvVar               = "INFERENCE_COST_ENABLED"
+	InferenceModelLabelEnvVar                = "INFERENCE_MODEL_LABEL"
+	InferenceSharedInfraLabelEnvVar          = "INFERENCE_SHARED_INFRA_LABEL"
+	InferenceSharedInfraLabelValueEnvVar     = "INFERENCE_SHARED_INFRA_LABEL_VALUE"
+	InferenceCollectionIntervalEnvVar        = "INFERENCE_COLLECTION_INTERVAL"
 )
 
 func GetGCPAuthSecretFilePath() string {
@@ -420,3 +427,30 @@ func GetMCPHTTPPort() int {
 func GetMetricsEmitterQueryWindow() time.Duration {
 	return env.GetDuration(MetricsEmitterQueryWindowEnvVar, 2*time.Minute)
 }
+
+// IsInferenceCostEnabled reports whether the inference cost collector is
+// enabled. It reads InferenceCostEnabledEnvVar via env.GetBool with a default
+// of false, so the feature is opt-in.
+func IsInferenceCostEnabled() bool {
+	return env.GetBool(InferenceCostEnabledEnvVar, false)
+}
+
+// GetInferenceModelLabel returns the Kubernetes pod label used to identify the
+// model name. It reads InferenceModelLabelEnvVar and defaults to
+// "llm-d.ai/model".
+func GetInferenceModelLabel() string {
+	return env.Get(InferenceModelLabelEnvVar, "llm-d.ai/model")
+}
+
+// GetInferenceSharedInfraLabel returns the label key identifying shared
+// inference infra pods. It reads InferenceSharedInfraLabelEnvVar and defaults
+// to "llm-d.ai/inference-shared".
+func GetInferenceSharedInfraLabel() string {
+	return env.Get(InferenceSharedInfraLabelEnvVar, "llm-d.ai/inference-shared")
+}
+
+// GetInferenceSharedInfraLabelValue returns the label value identifying shared
+// inference infra pods. It reads InferenceSharedInfraLabelValueEnvVar and
+// defaults to "true".
+func GetInferenceSharedInfraLabelValue() string {
+	return env.Get(InferenceSharedInfraLabelValueEnvVar, "true")
+}
+
+// GetInferenceCollectionInterval returns the time interval for inference cost
+// collection, read from InferenceCollectionIntervalEnvVar via env.GetDuration.
+// Default is 2 minutes to match the core metrics emitter query window.
+func GetInferenceCollectionInterval() time.Duration {
+	return env.GetDuration(InferenceCollectionIntervalEnvVar, 2*time.Minute)
+}
+

+ 297 - 0
pkg/inferencecost/aggregate.go

@@ -0,0 +1,297 @@
+package inferencecost
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+)
+
+// supportedAggregateProperties lists the InferenceCostProperties dimensions
+// that the collector actually populates in Phase 1. It is the allow-list for
+// both aggregation dimensions (aggKey) and filter properties (parseFilter);
+// a name that is absent from the map reads as false and is rejected.
+var supportedAggregateProperties = map[string]bool{
+	"model_name":      true,
+	"model_version":   true,
+	"namespace":       true,
+	"cluster":         true,
+	"pod":             true,
+	"controller":      true,
+	"controller_kind": true,
+	"container":       true,
+	"workload_type":   true,
+}
+
+// aggKey derives the aggregation map key for an InferenceCostResponse given
+// the requested aggregation dimensions. Returns an error if any dimension is
+// not in supportedAggregateProperties.
+func aggKey(props InferenceCostAPIProperties, aggregateBy []string) (string, error) {
+	if len(aggregateBy) == 0 {
+		// Use "model:namespace" as the natural key. ":" cannot appear in
+		// Kubernetes label values or namespace names, so this is unambiguous
+		// even when the model name contains "/" (e.g. "org/model").
+		ns := props.Namespace
+		if ns == "" {
+			ns = opencost.UnallocatedSuffix
+		}
+		return props.ModelName + ":" + ns, nil
+	}
+
+	parts := make([]string, 0, len(aggregateBy))
+	for _, dim := range aggregateBy {
+		if !supportedAggregateProperties[dim] {
+			return "", fmt.Errorf("unsupported aggregation dimension %q: supported dimensions are model_name, model_version, namespace, cluster, pod, controller, controller_kind, container, workload_type", dim)
+		}
+		var val string
+		switch dim {
+		case "model_name":
+			val = props.ModelName
+		case "model_version":
+			val = props.ModelVersion
+		case "namespace":
+			val = props.Namespace
+		case "cluster":
+			val = props.Cluster
+		case "pod":
+			val = props.Pod
+		case "controller":
+			val = props.Controller
+		case "controller_kind":
+			val = props.ControllerKind
+		case "container":
+			val = props.Container
+		case "workload_type":
+			val = props.WorkloadType
+		}
+		if val == "" {
+			val = opencost.UnallocatedSuffix
+		}
+		parts = append(parts, val)
+	}
+	return strings.Join(parts, "/"), nil
+}
+
+// addResponse folds src into dst. Costs and token counts are summed, then
+// every derived figure is recomputed from the new totals; rates are never
+// averaged, because two entries with different throughput cannot be averaged
+// meaningfully. A rate whose token denominator is not positive is reported
+// as 0.
+//
+// dst keeps its AllocationMethod only while every merged entry agrees with
+// it; the first mismatch clears it to "" to mark a mixed derivation.
+func addResponse(dst, src *InferenceCostResponse) {
+	// Additive fields.
+	dst.TotalCost += src.TotalCost
+	dst.InputCost += src.InputCost
+	dst.OutputCost += src.OutputCost
+	dst.PromptTokens += src.PromptTokens
+	dst.GenerationTokens += src.GenerationTokens
+	dst.TotalTokens += src.TotalTokens
+	dst.cachedTokens += src.cachedTokens
+
+	// perMillion converts a cost over a token count into cost per million
+	// tokens, yielding 0 when there are no tokens to divide by.
+	perMillion := func(cost, tokens float64) float64 {
+		if tokens > 0 {
+			return cost / tokens * 1_000_000
+		}
+		return 0
+	}
+
+	dst.CostPerMillionTokens = perMillion(dst.TotalCost, dst.TotalTokens)
+	dst.InputCostPerMillionTokens = perMillion(dst.InputCost, dst.PromptTokens)
+	dst.OutputCostPerMillionTokens = perMillion(dst.OutputCost, dst.GenerationTokens)
+
+	// The cached share of prompt tokens is capped at 1: see calculator.go for
+	// the full explanation of why cachedTokens can exceed promptTokens in
+	// high-reuse workloads.
+	dst.CacheSavingsFraction = 0
+	if dst.PromptTokens > 0 {
+		dst.CacheSavingsFraction = min(dst.cachedTokens/dst.PromptTokens, 1.0)
+	}
+
+	if src.AllocationMethod != dst.AllocationMethod {
+		dst.AllocationMethod = ""
+	}
+}
+
+// aggregate groups the InferenceCosts in s by the given dimensions, summing
+// all additive fields and recomputing derived rates (see addResponse). If
+// aggregateBy is empty the set is returned unchanged. An unsupported
+// dimension is reported as an error by aggKey, in which case s is left
+// untouched.
+//
+// Properties that are not part of the aggregation key are cleared on the
+// merged entry so the response accurately reflects what was grouped on.
+// For example, aggregating by "namespace" clears modelName so the result
+// doesn't show an arbitrary model name from whichever entry was seen first.
+// A property that IS a grouping dimension is always kept: grouping by
+// "model_version" alone keeps ModelVersion, and grouping by
+// "controller_kind" alone keeps ControllerKind (both used to be wiped, which
+// left the grouped-on value blank in the response). As before, grouping by
+// "model_name" keeps the ModelVersion of the first entry seen for the group.
+func (s *InferenceCostSet) aggregate(aggregateBy []string) error {
+	if len(aggregateBy) == 0 {
+		return nil
+	}
+
+	aggDims := make(map[string]bool, len(aggregateBy))
+	for _, d := range aggregateBy {
+		aggDims[d] = true
+	}
+
+	aggMap := make(map[string]*InferenceCostResponse, len(s.InferenceCosts))
+	for _, ic := range s.InferenceCosts {
+		key, err := aggKey(ic.Properties, aggregateBy)
+		if err != nil {
+			return err
+		}
+		if existing, ok := aggMap[key]; ok {
+			addResponse(existing, ic)
+			continue
+		}
+
+		// Clone so we don't mutate the original, then clear non-grouped
+		// properties so the response reflects only the aggregation key.
+		clone := *ic
+		props := &clone.Properties
+		if !aggDims["model_name"] {
+			props.ModelName = ""
+			// The version goes with the model name unless it is itself a
+			// grouping dimension.
+			if !aggDims["model_version"] {
+				props.ModelVersion = ""
+			}
+		}
+		if !aggDims["namespace"] {
+			props.Namespace = ""
+		}
+		if !aggDims["cluster"] {
+			props.Cluster = ""
+		}
+		if !aggDims["pod"] {
+			props.Pod = ""
+		}
+		if !aggDims["controller"] {
+			props.Controller = ""
+		}
+		if !aggDims["controller_kind"] {
+			props.ControllerKind = ""
+		}
+		if !aggDims["container"] {
+			props.Container = ""
+		}
+		if !aggDims["workload_type"] {
+			props.WorkloadType = ""
+		}
+		aggMap[key] = &clone
+	}
+
+	s.InferenceCosts = aggMap
+	return nil
+}
+
+// accumulate collapses a slice of InferenceCostSets into a single set whose
+// Window spans all of them. It sums additive fields across windows and
+// recomputes derived rates. It is used to produce the flat total for the
+// /total endpoint.
+func accumulate(sets []*InferenceCostSet) *InferenceCostSet {
+	if len(sets) == 0 {
+		return newInferenceCostSet(opencost.Window{})
+	}
+
+	// Determine the combined window.
+	first := sets[0]
+	combined := opencost.NewClosedWindow(*first.Window.Start(), *first.Window.End())
+	for _, s := range sets[1:] {
+		if s.Window.Start() != nil && s.Window.Start().Before(*combined.Start()) {
+			combined = opencost.NewClosedWindow(*s.Window.Start(), *combined.End())
+		}
+		if s.Window.End() != nil && s.Window.End().After(*combined.End()) {
+			combined = opencost.NewClosedWindow(*combined.Start(), *s.Window.End())
+		}
+	}
+
+	out := newInferenceCostSet(combined)
+	for _, s := range sets {
+		for key, ic := range s.InferenceCosts {
+			if existing, ok := out.InferenceCosts[key]; ok {
+				addResponse(existing, ic)
+			} else {
+				clone := *ic
+				// Update the window to the combined range.
+				clone.Window = combined
+				out.InferenceCosts[key] = &clone
+			}
+		}
+	}
+
+	return out
+}
+
+// --- Phase-1 minimal filter ---
+
+// filterSpec holds a single parsed property:value constraint, as produced by
+// parseFilter and evaluated by matchesFilter.
+type filterSpec struct {
+	// property is the dimension name; parseFilter only emits names found in
+	// supportedAggregateProperties.
+	property string
+	// value is the exact string the property must equal, with surrounding
+	// double-quotes already removed.
+	value    string
+}
+
+// parseFilter parses a Phase-1 filter string of the form
+//   prop:"value"[+prop:"value"]*
+// All terms are ANDed. Only dimensions in supportedAggregateProperties are
+// accepted. Values are unquoted if surrounded by double-quotes.
+//
+// This is intentionally minimal for Phase 1. Phase 2 can adopt the full
+// core/pkg/filter AST (with wildcard/OR support).
+func parseFilter(s string) ([]filterSpec, error) {
+	if s == "" {
+		return nil, nil
+	}
+	// "+" is the documented AND separator. HTTP clients that URL-encode query
+	// strings may transmit "+" as "%2B" (preserved) or as a literal "+" which
+	// some frameworks decode to a space before the handler sees it. Split on
+	// both to handle either case.
+	s = strings.ReplaceAll(s, " ", "+")
+	terms := strings.Split(s, "+")
+	specs := make([]filterSpec, 0, len(terms))
+	for _, term := range terms {
+		term = strings.TrimSpace(term)
+		if term == "" {
+			continue
+		}
+		idx := strings.IndexByte(term, ':')
+		if idx < 0 {
+			return nil, fmt.Errorf("invalid filter term %q: expected property:value", term)
+		}
+		prop := strings.TrimSpace(term[:idx])
+		if !supportedAggregateProperties[prop] {
+			return nil, fmt.Errorf("unsupported filter property %q: supported properties are model_name, model_version, namespace, cluster, pod, controller, controller_kind, container, workload_type", prop)
+		}
+		val := strings.TrimSpace(term[idx+1:])
+		// Strip surrounding double-quotes.
+		if len(val) >= 2 && val[0] == '"' && val[len(val)-1] == '"' {
+			val = val[1 : len(val)-1]
+		}
+		specs = append(specs, filterSpec{property: prop, value: val})
+	}
+	return specs, nil
+}
+
+// matchesFilter reports whether an InferenceCostResponse satisfies all filter
+// specs (AND semantics).
+func matchesFilter(ic *InferenceCostResponse, specs []filterSpec) bool {
+	for _, spec := range specs {
+		var actual string
+		switch spec.property {
+		case "model_name":
+			actual = ic.Properties.ModelName
+		case "model_version":
+			actual = ic.Properties.ModelVersion
+		case "namespace":
+			actual = ic.Properties.Namespace
+		case "cluster":
+			actual = ic.Properties.Cluster
+		case "pod":
+			actual = ic.Properties.Pod
+		case "controller":
+			actual = ic.Properties.Controller
+		case "controller_kind":
+			actual = ic.Properties.ControllerKind
+		case "container":
+			actual = ic.Properties.Container
+		case "workload_type":
+			actual = ic.Properties.WorkloadType
+		}
+		if actual != spec.value {
+			return false
+		}
+	}
+	return true
+}

+ 519 - 0
pkg/inferencecost/aggregate_test.go

@@ -0,0 +1,519 @@
+package inferencecost
+
+import (
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+)
+
+// makeWindow is a test helper that returns the closed opencost.Window
+// [start, end] via opencost.NewClosedWindow.
+func makeWindow(start, end time.Time) opencost.Window {
+	return opencost.NewClosedWindow(start, end)
+}
+
+// makeICR builds a minimal InferenceCostResponse for aggregation tests.
+func makeICR(modelName, ns string, totalCost, promptTokens, genTokens, inputCost, outputCost float64) *InferenceCostResponse {
+	totalTokens := promptTokens + genTokens
+	var cpmt, icpmt, ocpmt float64
+	if totalTokens > 0 {
+		cpmt = totalCost / totalTokens * 1_000_000
+	}
+	if promptTokens > 0 {
+		icpmt = inputCost / promptTokens * 1_000_000
+	}
+	if genTokens > 0 {
+		ocpmt = outputCost / genTokens * 1_000_000
+	}
+	return &InferenceCostResponse{
+		Properties: InferenceCostAPIProperties{
+			ModelName: modelName,
+			Namespace: ns,
+		},
+		CostBasis:                  CostBasisAllocation,
+		TotalCost:                  totalCost,
+		PromptTokens:               promptTokens,
+		GenerationTokens:           genTokens,
+		TotalTokens:                totalTokens,
+		CostPerMillionTokens:       cpmt,
+		InputCost:                  inputCost,
+		OutputCost:                 outputCost,
+		InputCostPerMillionTokens:  icpmt,
+		OutputCostPerMillionTokens: ocpmt,
+	}
+}
+
+// --- aggKey ---
+
+func TestAggKey_NoAggregation(t *testing.T) {
+	props := InferenceCostAPIProperties{ModelName: "llama", Namespace: "prod"}
+	key, err := aggKey(props, nil)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "llama:prod" {
+		t.Errorf("key = %q, want %q", key, "llama:prod")
+	}
+}
+
+func TestAggKey_NoAggregation_SlashInModelName(t *testing.T) {
+	// Model names like "org/model" must not produce an ambiguous key.
+	// ":" is the separator because it cannot appear in K8s label values or namespaces.
+	props := InferenceCostAPIProperties{ModelName: "Qwen/Qwen3-VL-2B-Instruct", Namespace: "test-epd-ec"}
+	key, err := aggKey(props, nil)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "Qwen/Qwen3-VL-2B-Instruct:test-epd-ec" {
+		t.Errorf("key = %q, want %q", key, "Qwen/Qwen3-VL-2B-Instruct:test-epd-ec")
+	}
+}
+
+func TestAggKey_ByModelName(t *testing.T) {
+	props := InferenceCostAPIProperties{ModelName: "llama", Namespace: "prod"}
+	key, err := aggKey(props, []string{"model_name"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "llama" {
+		t.Errorf("key = %q, want %q", key, "llama")
+	}
+}
+
+func TestAggKey_MultiDim(t *testing.T) {
+	props := InferenceCostAPIProperties{ModelName: "llama", Namespace: "prod"}
+	key, err := aggKey(props, []string{"model_name", "namespace"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "llama/prod" {
+		t.Errorf("key = %q, want %q", key, "llama/prod")
+	}
+}
+
+// TestAggKey_UnsupportedDimension checks that a dimension outside
+// supportedAggregateProperties is rejected with an error.
+func TestAggKey_UnsupportedDimension(t *testing.T) {
+	in := InferenceCostAPIProperties{ModelName: "llama", Namespace: "prod"}
+
+	if _, err := aggKey(in, []string{"product"}); err == nil {
+		t.Fatal("expected error for unsupported dimension, got nil")
+	}
+}
+
+// TestAggKey_EmptyValueFallsBackToUnallocated checks that an empty dimension
+// value is replaced by opencost.UnallocatedSuffix.
+func TestAggKey_EmptyValueFallsBackToUnallocated(t *testing.T) {
+	want := opencost.UnallocatedSuffix
+
+	// Namespace is left empty on purpose.
+	got, err := aggKey(InferenceCostAPIProperties{ModelName: "llama"}, []string{"namespace"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got != want {
+		t.Errorf("key = %q, want %q (unallocated suffix)", got, want)
+	}
+}
+
+func TestAggKey_ByPod(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName: "llama",
+		Namespace: "prod",
+		Pod:       "llama-pod-123",
+	}
+	key, err := aggKey(props, []string{"pod"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "llama-pod-123" {
+		t.Errorf("key = %q, want %q", key, "llama-pod-123")
+	}
+}
+
+func TestAggKey_ByController(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName:  "llama",
+		Namespace:  "prod",
+		Controller: "llama-deployment",
+	}
+	key, err := aggKey(props, []string{"controller"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "llama-deployment" {
+		t.Errorf("key = %q, want %q", key, "llama-deployment")
+	}
+}
+
+func TestAggKey_ByControllerKind(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName:      "llama",
+		Namespace:      "prod",
+		ControllerKind: "Deployment",
+	}
+	key, err := aggKey(props, []string{"controller_kind"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "Deployment" {
+		t.Errorf("key = %q, want %q", key, "Deployment")
+	}
+}
+
+func TestAggKey_ByContainer(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName: "llama",
+		Namespace: "prod",
+		Container: "vllm-container",
+	}
+	key, err := aggKey(props, []string{"container"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "vllm-container" {
+		t.Errorf("key = %q, want %q", key, "vllm-container")
+	}
+}
+
+func TestAggKey_MultiDimWithNewFields(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName:      "llama",
+		Namespace:      "prod",
+		Pod:            "llama-pod-123",
+		Controller:     "llama-deployment",
+		ControllerKind: "Deployment",
+		Container:      "vllm-container",
+	}
+	key, err := aggKey(props, []string{"namespace", "controller_kind", "container"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if key != "prod/Deployment/vllm-container" {
+		t.Errorf("key = %q, want %q", key, "prod/Deployment/vllm-container")
+	}
+}
+
+// TestAggKey_AllDimensions requests every dimension listed in
+// supportedAggregateProperties, including workload_type (previously left
+// out), and checks that all values appear in the key in request order.
+func TestAggKey_AllDimensions(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		ModelName:      "llama",
+		ModelVersion:   "v1.0",
+		Namespace:      "prod",
+		Cluster:        "cluster-1",
+		Pod:            "llama-pod-123",
+		Controller:     "llama-deployment",
+		ControllerKind: "Deployment",
+		Container:      "vllm-container",
+		WorkloadType:   "decode",
+	}
+	dims := []string{
+		"model_name", "model_version", "namespace", "cluster",
+		"pod", "controller", "controller_kind", "container",
+		"workload_type",
+	}
+	// Guard against this test silently falling behind the supported list.
+	if len(dims) != len(supportedAggregateProperties) {
+		t.Fatalf("test covers %d dimensions, supportedAggregateProperties has %d", len(dims), len(supportedAggregateProperties))
+	}
+
+	key, err := aggKey(props, dims)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	expected := "llama/v1.0/prod/cluster-1/llama-pod-123/llama-deployment/Deployment/vllm-container/decode"
+	if key != expected {
+		t.Errorf("key = %q, want %q", key, expected)
+	}
+}
+
+// --- aggregate ---
+
+// TestAggregate_ByModelName_SumsTokensAndRecomputesRates merges two entries
+// of the same model from different namespaces and checks that costs and
+// tokens are summed while the blended rate is recomputed from the sums.
+func TestAggregate_ByModelName_SumsTokensAndRecomputesRates(t *testing.T) {
+	end := time.Now().UTC()
+	set := newInferenceCostSet(makeWindow(end.Add(-time.Hour), end))
+
+	// Same model, two namespaces: grouping by model_name must merge them.
+	set.InferenceCosts["llama/ns1"] = makeICR("llama", "ns1", 100, 1_000_000, 500_000, 70, 30)
+	set.InferenceCosts["llama/ns2"] = makeICR("llama", "ns2", 200, 2_000_000, 1_000_000, 140, 60)
+
+	err := set.aggregate([]string{"model_name"})
+	if err != nil {
+		t.Fatalf("aggregate error: %v", err)
+	}
+	if n := len(set.InferenceCosts); n != 1 {
+		t.Fatalf("expected 1 entry after aggregation, got %d", n)
+	}
+
+	merged := set.InferenceCosts["llama"]
+	if merged == nil {
+		t.Fatal("expected key 'llama', not found")
+	}
+
+	if merged.TotalCost != 300 {
+		t.Errorf("TotalCost = %.2f, want 300", merged.TotalCost)
+	}
+	if merged.PromptTokens != 3_000_000 {
+		t.Errorf("PromptTokens = %.0f, want 3_000_000", merged.PromptTokens)
+	}
+	if merged.TotalTokens != 4_500_000 {
+		t.Errorf("TotalTokens = %.0f, want 4_500_000", merged.TotalTokens)
+	}
+
+	// The blended rate comes from the summed totals, not from averaging the
+	// two per-entry rates.
+	wantRate := 300.0 / 4_500_000 * 1_000_000
+	if !floatEq(merged.CostPerMillionTokens, wantRate) {
+		t.Errorf("CostPerMillionTokens = %.4f, want %.4f", merged.CostPerMillionTokens, wantRate)
+	}
+}
+
+// TestAggregate_NoAggregation_DoesNothing checks that aggregating by no
+// dimensions leaves the number of entries in the set unchanged.
+func TestAggregate_NoAggregation_DoesNothing(t *testing.T) {
+	end := time.Now().UTC()
+	set := newInferenceCostSet(makeWindow(end.Add(-time.Hour), end))
+	set.InferenceCosts["llama/prod"] = makeICR("llama", "prod", 100, 1_000_000, 500_000, 70, 30)
+	set.InferenceCosts["mistral/prod"] = makeICR("mistral", "prod", 50, 500_000, 250_000, 35, 15)
+
+	err := set.aggregate(nil)
+	if err != nil {
+		t.Fatalf("aggregate error: %v", err)
+	}
+	if n := len(set.InferenceCosts); n != 2 {
+		t.Errorf("expected 2 entries after no-op aggregation, got %d", n)
+	}
+}
+
+// --- accumulate ---
+
+// TestAccumulate_EmptySlice checks that accumulating nothing still returns a
+// usable, empty set.
+func TestAccumulate_EmptySlice(t *testing.T) {
+	got := accumulate(nil)
+	if got == nil {
+		t.Fatal("expected non-nil result for empty slice")
+	}
+	if n := len(got.InferenceCosts); n != 0 {
+		t.Errorf("expected 0 entries, got %d", n)
+	}
+}
+
+// TestAccumulate_SumsAcrossWindows accumulates the same key from two adjacent
+// one-hour windows and checks that costs and tokens are summed and that the
+// resulting window spans from the first window's start to the second
+// window's end.
+func TestAccumulate_SumsAcrossWindows(t *testing.T) {
+	now := time.Now().UTC()
+	win1 := makeWindow(now.Add(-2*time.Hour), now.Add(-1*time.Hour))
+	win2 := makeWindow(now.Add(-1*time.Hour), now)
+
+	set1 := newInferenceCostSet(win1)
+	set1.InferenceCosts["llama/prod"] = makeICR("llama", "prod", 100, 1_000_000, 500_000, 70, 30)
+
+	set2 := newInferenceCostSet(win2)
+	set2.InferenceCosts["llama/prod"] = makeICR("llama", "prod", 120, 1_200_000, 600_000, 84, 36)
+
+	out := accumulate([]*InferenceCostSet{set1, set2})
+
+	if len(out.InferenceCosts) != 1 {
+		t.Fatalf("expected 1 entry, got %d", len(out.InferenceCosts))
+	}
+	acc := out.InferenceCosts["llama/prod"]
+	if acc == nil {
+		// Fail cleanly instead of dereferencing a missing entry below.
+		t.Fatal("expected key 'llama/prod', not found")
+	}
+	if acc.TotalCost != 220 {
+		t.Errorf("TotalCost = %.2f, want 220", acc.TotalCost)
+	}
+	if acc.PromptTokens != 2_200_000 {
+		t.Errorf("PromptTokens = %.0f, want 2_200_000", acc.PromptTokens)
+	}
+
+	// Combined window should span from win1.Start to win2.End.
+	if out.Window.Start() == nil || out.Window.End() == nil {
+		t.Fatal("expected combined window to be non-nil")
+	}
+	if !out.Window.Start().Equal(*win1.Start()) {
+		t.Errorf("Window start = %v, want %v", *out.Window.Start(), *win1.Start())
+	}
+	if !out.Window.End().Equal(*win2.End()) {
+		t.Errorf("Window end = %v, want %v", *out.Window.End(), *win2.End())
+	}
+}
+
+// TestAccumulate_MultipleModels accumulates two sets where "llama/prod" is
+// present in both and "mistral/prod" only in the first, and checks that both
+// keys survive with the expected TotalCost.
+func TestAccumulate_MultipleModels(t *testing.T) {
+	now := time.Now().UTC()
+	win1 := makeWindow(now.Add(-2*time.Hour), now.Add(-1*time.Hour))
+	win2 := makeWindow(now.Add(-1*time.Hour), now)
+
+	set1 := newInferenceCostSet(win1)
+	set1.InferenceCosts["llama/prod"] = makeICR("llama", "prod", 100, 1_000_000, 500_000, 70, 30)
+	set1.InferenceCosts["mistral/prod"] = makeICR("mistral", "prod", 50, 500_000, 250_000, 35, 15)
+
+	set2 := newInferenceCostSet(win2)
+	set2.InferenceCosts["llama/prod"] = makeICR("llama", "prod", 120, 1_200_000, 600_000, 84, 36)
+	// mistral absent in set2
+
+	out := accumulate([]*InferenceCostSet{set1, set2})
+
+	if len(out.InferenceCosts) != 2 {
+		t.Errorf("expected 2 entries, got %d", len(out.InferenceCosts))
+	}
+
+	// The nil case is reported separately: formatting entry.TotalCost for a
+	// nil entry would panic and hide the real failure.
+	if llama := out.InferenceCosts["llama/prod"]; llama == nil {
+		t.Error("llama/prod entry missing from accumulated set")
+	} else if llama.TotalCost != 220 {
+		t.Errorf("llama TotalCost = %.2f, want 220", llama.TotalCost)
+	}
+
+	if mistral := out.InferenceCosts["mistral/prod"]; mistral == nil {
+		t.Error("mistral/prod entry missing from accumulated set")
+	} else if mistral.TotalCost != 50 {
+		t.Errorf("mistral TotalCost = %.2f, want 50", mistral.TotalCost)
+	}
+}
+
+// --- parseFilter ---
+
+// TestParseFilter_Empty checks that an empty filter string parses without
+// error and produces no specs.
+func TestParseFilter_Empty(t *testing.T) {
+	parsed, err := parseFilter("")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(parsed); n != 0 {
+		t.Errorf("expected 0 specs, got %d", n)
+	}
+}
+
+// TestParseFilter_SingleTerm checks that one property:"value" term becomes
+// exactly one spec carrying that property and the unquoted value.
+func TestParseFilter_SingleTerm(t *testing.T) {
+	parsed, err := parseFilter(`namespace:"llm-prod"`)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(parsed); n != 1 {
+		t.Fatalf("expected 1 spec, got %d", n)
+	}
+
+	propertyOK := parsed[0].property == "namespace"
+	valueOK := parsed[0].value == "llm-prod"
+	if !(propertyOK && valueOK) {
+		t.Errorf("spec = %+v, want {namespace, llm-prod}", parsed[0])
+	}
+}
+
+// TestParseFilter_MultiTermAnd checks that two terms joined with "+" parse
+// into two specs.
+func TestParseFilter_MultiTermAnd(t *testing.T) {
+	parsed, err := parseFilter(`namespace:"llm-prod"+model_name:"llama"`)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(parsed); n != 2 {
+		t.Fatalf("expected 2 specs, got %d", n)
+	}
+}
+
+// TestParseFilter_UnsupportedProperty checks that the "product" property is
+// rejected with an error.
+func TestParseFilter_UnsupportedProperty(t *testing.T) {
+	if _, err := parseFilter(`product:"team-a"`); err == nil {
+		t.Fatal("expected error for unsupported property, got nil")
+	}
+}
+
+// TestParseFilter_MissingColon checks that a term without a ":" separator is
+// rejected with an error.
+func TestParseFilter_MissingColon(t *testing.T) {
+	if _, err := parseFilter(`namespacellm-prod`); err == nil {
+		t.Fatal("expected error for missing colon, got nil")
+	}
+}
+
+// --- matchesFilter ---
+
+// TestMatchesFilter_NoSpecs checks that a nil spec list matches any response.
+func TestMatchesFilter_NoSpecs(t *testing.T) {
+	resp := &InferenceCostResponse{
+		Properties: InferenceCostAPIProperties{Namespace: "prod"},
+	}
+	if matched := matchesFilter(resp, nil); !matched {
+		t.Error("expected matchesFilter to return true for empty specs")
+	}
+}
+
+// TestMatchesFilter_Match checks that a namespace spec matches a response in
+// that namespace.
+func TestMatchesFilter_Match(t *testing.T) {
+	resp := &InferenceCostResponse{
+		Properties: InferenceCostAPIProperties{Namespace: "prod"},
+	}
+	filter := []filterSpec{{property: "namespace", value: "prod"}}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for matching spec")
+	}
+}
+
+// TestMatchesFilter_NoMatch checks that a namespace spec does not match a
+// response in a different namespace.
+func TestMatchesFilter_NoMatch(t *testing.T) {
+	resp := &InferenceCostResponse{
+		Properties: InferenceCostAPIProperties{Namespace: "prod"},
+	}
+	filter := []filterSpec{{property: "namespace", value: "staging"}}
+	if matched := matchesFilter(resp, filter); matched {
+		t.Error("expected matchesFilter to return false for non-matching spec")
+	}
+}
+
+
+// TestMatchesFilter_Pod checks that a pod spec matches a response whose Pod
+// property holds the same value.
+func TestMatchesFilter_Pod(t *testing.T) {
+	props := InferenceCostAPIProperties{Namespace: "prod", Pod: "llama-pod-123"}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{{property: "pod", value: "llama-pod-123"}}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for matching pod")
+	}
+}
+
+// TestMatchesFilter_Controller checks that a controller spec matches a
+// response whose Controller property holds the same value.
+func TestMatchesFilter_Controller(t *testing.T) {
+	props := InferenceCostAPIProperties{Namespace: "prod", Controller: "llama-deployment"}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{{property: "controller", value: "llama-deployment"}}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for matching controller")
+	}
+}
+
+// TestMatchesFilter_ControllerKind checks that a controller_kind spec matches
+// a response whose ControllerKind property holds the same value.
+func TestMatchesFilter_ControllerKind(t *testing.T) {
+	props := InferenceCostAPIProperties{Namespace: "prod", ControllerKind: "Deployment"}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{{property: "controller_kind", value: "Deployment"}}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for matching controller_kind")
+	}
+}
+
+// TestMatchesFilter_Container checks that a container spec matches a response
+// whose Container property holds the same value.
+func TestMatchesFilter_Container(t *testing.T) {
+	props := InferenceCostAPIProperties{Namespace: "prod", Container: "vllm-container"}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{{property: "container", value: "vllm-container"}}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for matching container")
+	}
+}
+
+// TestMatchesFilter_MultipleNewDimensions checks that a response matches when
+// every one of several specs (namespace, controller_kind, container) agrees
+// with its properties.
+func TestMatchesFilter_MultipleNewDimensions(t *testing.T) {
+	props := InferenceCostAPIProperties{
+		Namespace:      "prod",
+		Pod:            "llama-pod-123",
+		ControllerKind: "Deployment",
+		Container:      "vllm-container",
+	}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{
+		{property: "namespace", value: "prod"},
+		{property: "controller_kind", value: "Deployment"},
+		{property: "container", value: "vllm-container"},
+	}
+	if matched := matchesFilter(resp, filter); !matched {
+		t.Error("expected matchesFilter to return true for all matching specs")
+	}
+}
+
+// TestMatchesFilter_NewDimensionNoMatch checks that a pod spec naming a
+// different pod does not match the response.
+func TestMatchesFilter_NewDimensionNoMatch(t *testing.T) {
+	props := InferenceCostAPIProperties{Namespace: "prod", Pod: "llama-pod-123"}
+	resp := &InferenceCostResponse{Properties: props}
+
+	filter := []filterSpec{{property: "pod", value: "mistral-pod-456"}}
+	if matched := matchesFilter(resp, filter); matched {
+		t.Error("expected matchesFilter to return false for non-matching pod")
+	}
+}
+
+// TestParseFilter_NewDimensions checks that pod, controller and container
+// terms are accepted and returned in the order they appear in the filter.
+func TestParseFilter_NewDimensions(t *testing.T) {
+	got, err := parseFilter(`pod:"llama-pod-123"+controller:"llama-deployment"+container:"vllm"`)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(got) != 3 {
+		t.Fatalf("expected 3 specs, got %d", len(got))
+	}
+
+	want := []filterSpec{
+		{property: "pod", value: "llama-pod-123"},
+		{property: "controller", value: "llama-deployment"},
+		{property: "container", value: "vllm"},
+	}
+	for i := range want {
+		if got[i].property == want[i].property && got[i].value == want[i].value {
+			continue
+		}
+		t.Errorf("spec[%d] = %+v, want %+v", i, got[i], want[i])
+	}
+}

+ 142 - 0
pkg/inferencecost/apitypes.go

@@ -0,0 +1,142 @@
+package inferencecost
+
+import (
+	"github.com/opencost/opencost/core/pkg/opencost"
+)
+
+// InferenceCostResponse is the flat, per-cost-basis API representation of
+// inference costs for a single model/namespace in a time window. It is
+// projected from the internal InferenceCost struct (which stores costs keyed
+// by CostBasis) so that the JSON output matches the design doc shape exactly.
+//
+// Every exported field is serialized under the JSON name given in its tag;
+// the unexported cachedTokens field is never serialized.
+type InferenceCostResponse struct {
+	// Properties identifies what the costs belong to (model, namespace and
+	// the optional workload dimensions). Window is the time window covered.
+	Properties InferenceCostAPIProperties `json:"properties"`
+	Window     opencost.Window            `json:"window"`
+
+	// CostBasis identifies whether these costs are usage-based or
+	// allocation-based. Set from the QueryRequest.
+	CostBasis CostBasis `json:"costBasis"`
+
+	// Total infrastructure cost for the window under the chosen cost basis.
+	TotalCost float64 `json:"totalCost"`
+
+	// Token counts from vLLM metrics.
+	PromptTokens     float64 `json:"promptTokens"`
+	GenerationTokens float64 `json:"generationTokens"`
+	TotalTokens      float64 `json:"totalTokens"`
+
+	// Blended cost per 1M delivered tokens (input + output together).
+	CostPerMillionTokens float64 `json:"costPerMillionTokens"`
+
+	// Input/output cost split. InputCost and OutputCost sum to TotalCost if the cost basis is allocation.
+	InputCost  float64 `json:"inputCost"`
+	OutputCost float64 `json:"outputCost"`
+
+	// Per-million cost metrics for differentiated pricing.
+	// InputCostPerMillionTokens uses PromptTokens as the denominator (all delivered
+	// input tokens, including those served from KV cache).
+	InputCostPerMillionTokens  float64 `json:"inputCostPerMillionTokens"`
+	OutputCostPerMillionTokens float64 `json:"outputCostPerMillionTokens"`
+
+	// CacheSavingsFraction is the fraction of prompt tokens served from the KV
+	// cache (CachedTokens / PromptTokens, clamped to [0, 1]). Zero when prefix
+	// caching is disabled (see AllocationMethod) or when no cache hits occurred
+	// in the window.
+	//
+	// Note: in workloads with heavy prefix reuse (e.g. benchmarks with long
+	// shared system prompts), the raw ratio can exceed 1.0 because
+	// vllm:prefix_cache_hits_total counts tokens retrieved from cache per
+	// request — including prefixes established by earlier requests outside the
+	// current window — while vllm:prompt_tokens_total only counts new input
+	// tokens delivered in this window. The value is clamped to 1.0 in that case.
+	CacheSavingsFraction float64 `json:"cacheSavingsFraction"`
+
+	// cachedTokens is carried for aggregation recomputation of CacheSavingsFraction
+	// and is not included in the JSON output (it is unexported).
+	cachedTokens float64
+
+	// AllocationMethod records which input/output cost-split path was used.
+	// Informational; omitted from JSON when empty.
+	AllocationMethod AllocationMethod `json:"allocationMethod,omitempty"`
+}
+
+// InferenceCostAPIProperties is the JSON-facing properties struct for API
+// responses. It mirrors InferenceCostProperties but with explicit JSON tags
+// matching the design doc field names.
+//
+// ModelName, Namespace and WorkloadType are always emitted; the remaining
+// fields carry omitempty and are dropped from the JSON output when empty.
+type InferenceCostAPIProperties struct {
+	ModelName      string `json:"modelName"`
+	ModelVersion   string `json:"modelVersion,omitempty"`
+	Namespace      string `json:"namespace"`
+	Cluster        string `json:"cluster,omitempty"`
+	Pod            string `json:"pod,omitempty"`
+	Controller     string `json:"controller,omitempty"`
+	ControllerKind string `json:"controllerKind,omitempty"`
+	Container      string `json:"container,omitempty"`
+	WorkloadType   string `json:"workloadType"` // currently always "inference"
+}
+
+// InferenceCostSet holds a collection of InferenceCostResponses for a single
+// time window, keyed by aggregation key.
+//
+// InferenceCosts maps the aggregation key to its response; Window is the time
+// window the whole set covers.
+type InferenceCostSet struct {
+	InferenceCosts map[string]*InferenceCostResponse `json:"inferenceCosts"`
+	Window         opencost.Window                   `json:"window"`
+}
+
+// InferenceCostSetRange holds multiple InferenceCostSets covering a broader
+// time range. Used for the /timeseries endpoint.
+//
+// InferenceCostSets holds the individual sets; Window is the overall window
+// of the range.
+type InferenceCostSetRange struct {
+	InferenceCostSets []*InferenceCostSet `json:"inferenceCostSets"`
+	Window            opencost.Window     `json:"window"`
+}
+
+// newInferenceCostResponse projects a single InferenceCost into the flat
+// per-basis API response type for the given window.
+func newInferenceCostResponse(ic *InferenceCost, basis CostBasis, win opencost.Window) *InferenceCostResponse {
+	var totalCost float64
+	if basis == CostBasisUsage {
+		totalCost = ic.UsageTotalCost
+	} else {
+		totalCost = ic.AllocationTotalCost
+	}
+
+	cpmt := ic.CostPerMillionTokens[basis]
+	icpmt := ic.InputCostPerMillionTokens[basis]
+	ocpmt := ic.OutputCostPerMillionTokens[basis]
+	inputCost := ic.InputCost[basis]
+	outputCost := ic.OutputCost[basis]
+
+	return &InferenceCostResponse{
+		Properties: InferenceCostAPIProperties{
+			ModelName:      ic.Properties.ModelName,
+			ModelVersion:   ic.Properties.ModelVersion,
+			Namespace:      ic.Properties.Namespace,
+			Cluster:        ic.Properties.Cluster,
+			Pod:            ic.Properties.Pod,
+			Controller:     ic.Properties.Controller,
+			ControllerKind: ic.Properties.ControllerKind,
+			Container:      ic.Properties.Container,
+			WorkloadType:   ic.Properties.WorkloadType,
+		},
+		Window:                     win,
+		CostBasis:                  basis,
+		TotalCost:                  totalCost,
+		PromptTokens:               ic.PromptTokens,
+		GenerationTokens:           ic.GenerationTokens,
+		TotalTokens:                ic.TotalTokens,
+		CostPerMillionTokens:       cpmt,
+		InputCost:                  inputCost,
+		OutputCost:                 outputCost,
+		InputCostPerMillionTokens:  icpmt,
+		OutputCostPerMillionTokens: ocpmt,
+		CacheSavingsFraction:       ic.CacheSavingsFraction,
+		cachedTokens:               ic.CachedTokens,
+		AllocationMethod:           ic.AllocationMethod,
+	}
+}
+
+// newInferenceCostSet returns an InferenceCostSet for win whose InferenceCosts
+// map is allocated and empty, so callers can insert entries right away.
+func newInferenceCostSet(win opencost.Window) *InferenceCostSet {
+	set := new(InferenceCostSet)
+	set.Window = win
+	set.InferenceCosts = map[string]*InferenceCostResponse{}
+	return set
+}

+ 164 - 0
pkg/inferencecost/calculator.go

@@ -0,0 +1,164 @@
+package inferencecost
+
+import "github.com/opencost/opencost/core/pkg/log"
+
+// Calculator computes derived cost metrics for a slice of InferenceCost structs.
+type Calculator struct {
+	// config supplies the allocation mode and the output-token cost multiplier
+	// consulted when splitting cost between input and output.
+	config *Config
+}
+
+// NewCalculator returns a Calculator that reads its settings from config.
+// The pointer is stored as-is, not copied.
+func NewCalculator(config *Config) *Calculator {
+	calc := new(Calculator)
+	calc.config = config
+	return calc
+}
+
+// CalculateCosts fills in the derived cost fields of every element of metrics,
+// mutating each InferenceCost in place. Nothing is returned.
+func (c *Calculator) CalculateCosts(metrics []*InferenceCost) {
+	for i := range metrics {
+		c.calculateModelCosts(metrics[i])
+	}
+}
+
+// calculateModelCosts populates the derived fields of a single InferenceCost
+// in place: the blended cost per million tokens, the cache savings fraction,
+// and the input/output cost split.
+//
+// The per-basis maps are re-created on every call. When there are no tokens,
+// or both total costs are zero, AllocationMethod is cleared and no split is
+// computed. Otherwise the split uses compute time when the configured mode is
+// AllocationModeComputeTime and both processing times are positive, and the
+// multiplier method in every other case.
+func (c *Calculator) calculateModelCosts(m *InferenceCost) {
+	m.CostPerMillionTokens = make(map[CostBasis]float64)
+	m.InputCostPerMillionTokens = make(map[CostBasis]float64)
+	m.OutputCostPerMillionTokens = make(map[CostBasis]float64)
+	m.InputCost = make(map[CostBasis]float64)
+	m.OutputCost = make(map[CostBasis]float64)
+
+	// Usage cost requires evidence of actual token processing. Without tokens,
+	// the pod was provisioned but idle: there is no active compute to charge for.
+	if m.TotalTokens == 0 {
+		m.UsageTotalCost = 0
+	}
+
+	// Blended cost per million tokens (all delivered tokens, including cached).
+	// Uses TotalTokens — answers "average cost per delivered token".
+	if m.TotalTokens > 0 {
+		m.CostPerMillionTokens[CostBasisAllocation] = m.AllocationTotalCost / m.TotalTokens * 1_000_000
+		m.CostPerMillionTokens[CostBasisUsage] = m.UsageTotalCost / m.TotalTokens * 1_000_000
+	}
+
+	// Case 1: no tokens or no cost — allocation method not applicable.
+	if m.TotalTokens == 0 || (m.AllocationTotalCost == 0 && m.UsageTotalCost == 0) {
+		m.AllocationMethod = ""
+		return
+	}
+
+	// Cache savings fraction: fraction of prompt tokens served from KV cache.
+	// Clamped to [0, 1]: vllm:prefix_cache_hits_total counts tokens retrieved
+	// from cache per request, while vllm:prompt_tokens_total counts new input
+	// tokens. In workloads with heavy prefix reuse (e.g. benchmarks), cached
+	// tokens can exceed prompt tokens within a short window because cache hits
+	// reflect prefixes established by earlier requests, including those outside
+	// the current window. Values >1 before clamping indicate extreme cache reuse.
+	// The lower bound guards against a negative CachedTokens value leaking
+	// into the reported fraction.
+	if m.CacheConfigKnown && !m.PrefixCachingEnabled {
+		m.CacheSavingsFraction = 0
+	} else if m.PromptTokens > 0 {
+		m.CacheSavingsFraction = max(0, min(m.CachedTokens/m.PromptTokens, 1.0))
+	}
+
+	// Input/output split — choose the allocation method.
+	// Require both timing components to be present for compute-time allocation.
+	// One-sided timing data is treated as incomplete and falls back to multiplier.
+	computeTimeMode := c.config.AllocationMode == AllocationModeComputeTime
+	hasCompleteTimingData := m.InputProcessingTime > 0 && m.OutputProcessingTime > 0
+	if computeTimeMode && hasCompleteTimingData {
+		c.calculateComputeTimeSplit(m)
+		return
+	}
+
+	// Reaching this point in compute-time mode means the timing data was incomplete.
+	if computeTimeMode {
+		log.Debugf("InferenceCost: incomplete timing data for model %s/%s (input=%f output=%f), using multiplier fallback",
+			m.Properties.ModelName, m.Properties.Namespace, m.InputProcessingTime, m.OutputProcessingTime)
+	}
+	c.calculateMultiplierSplit(m)
+}
+
+// calculateComputeTimeSplit allocates costs proportionally by vLLM processing time.
+// Uses PromptTokens (delivered input tokens) as the input denominator.
+func (c *Calculator) calculateComputeTimeSplit(m *InferenceCost) {
+	totalTime := m.InputProcessingTime + m.OutputProcessingTime
+	if totalTime == 0 {
+		// Timing data present but both zero — fall back.
+		c.calculateMultiplierSplit(m)
+		return
+	}
+
+	inputFraction := m.InputProcessingTime / totalTime
+	outputFraction := 1 - inputFraction
+
+	// Determine allocation method based on cache config.
+	// Only set prefix_caching_off when the config was successfully retrieved
+	// and explicitly indicates caching is disabled — not when the metric is absent.
+	if m.CacheConfigKnown && !m.PrefixCachingEnabled {
+		m.AllocationMethod = AllocationMethodPrefixCachingOff
+	} else {
+		m.AllocationMethod = AllocationMethodComputeTime
+	}
+
+	for _, basis := range []CostBasis{CostBasisUsage, CostBasisAllocation} {
+		var totalCost float64
+		if basis == CostBasisUsage {
+			totalCost = m.UsageTotalCost
+		} else {
+			totalCost = m.AllocationTotalCost
+		}
+
+		inputCost := totalCost * inputFraction
+		outputCost := totalCost * outputFraction
+
+		m.InputCost[basis] = inputCost
+		m.OutputCost[basis] = outputCost
+
+		if m.PromptTokens > 0 {
+			m.InputCostPerMillionTokens[basis] = inputCost / m.PromptTokens * 1_000_000
+		}
+		if m.GenerationTokens > 0 {
+			m.OutputCostPerMillionTokens[basis] = outputCost / m.GenerationTokens * 1_000_000
+		}
+	}
+
+	log.Debugf("InferenceCost: compute-time split model=%s/%s input=%.1f%% output=%.1f%% method=%s",
+		m.Properties.ModelName, m.Properties.Namespace,
+		inputFraction*100, outputFraction*100, m.AllocationMethod)
+}
+
+// calculateMultiplierSplit allocates costs using a fixed output/input ratio.
+// Uses EffectiveInputTokens for cost allocation; InputCostPerMillionTokens uses PromptTokens as denominator.
+func (c *Calculator) calculateMultiplierSplit(m *InferenceCost) {
+	m.AllocationMethod = AllocationMethodMultiplier
+
+	multiplier := c.config.OutputTokenCostMultiplier
+	if multiplier <= 0 {
+		multiplier = defaultOutputTokenCostMultiplier
+	}
+
+	// weightedTokens based on effective input tokens (cache-corrected).
+	weightedTokens := m.EffectiveInputTokens + m.GenerationTokens*multiplier
+	if weightedTokens == 0 {
+		return
+	}
+
+	for _, basis := range []CostBasis{CostBasisUsage, CostBasisAllocation} {
+		var totalCost float64
+		if basis == CostBasisUsage {
+			totalCost = m.UsageTotalCost
+		} else {
+			totalCost = m.AllocationTotalCost
+		}
+
+		inputCostPerToken := totalCost / weightedTokens
+
+		inputCost := inputCostPerToken * m.EffectiveInputTokens
+		outputCost := inputCostPerToken * multiplier * m.GenerationTokens
+
+		m.InputCost[basis] = inputCost
+		m.OutputCost[basis] = outputCost
+
+		if m.PromptTokens > 0 {
+			m.InputCostPerMillionTokens[basis] = inputCost / m.PromptTokens * 1_000_000
+		}
+		if m.GenerationTokens > 0 {
+			m.OutputCostPerMillionTokens[basis] = outputCost / m.GenerationTokens * 1_000_000
+		}
+	}
+}

+ 331 - 0
pkg/inferencecost/calculator_test.go

@@ -0,0 +1,331 @@
+package inferencecost
+
+import (
+	"math"
+	"testing"
+)
+
+// defaultConfig returns a fresh Config in compute-time allocation mode with an
+// output token cost multiplier of 2.5.
+func defaultConfig() *Config {
+	cfg := new(Config)
+	cfg.AllocationMode = AllocationModeComputeTime
+	cfg.OutputTokenCostMultiplier = 2.5
+	return cfg
+}
+
+// newCalc is shorthand for NewCalculator in these tests.
+func newCalc(cfg *Config) *Calculator {
+	return NewCalculator(cfg)
+}
+
+// floatEq reports whether a and b are within an absolute tolerance of 1e-9
+// of each other.
+func floatEq(a, b float64) bool {
+	diff := math.Abs(a - b)
+	return diff < 1e-9
+}
+
+// ---- blended per-million-tokens ----
+
+// TestCalculator_BlendedCostPerMillionTokens checks the blended rate for both
+// cost bases: with exactly one million total tokens the per-million rate
+// equals the total cost.
+func TestCalculator_BlendedCostPerMillionTokens(t *testing.T) {
+	// No timing data is supplied, so the split uses the multiplier fallback.
+	ic := &InferenceCost{
+		AllocationTotalCost:  4.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         800_000,
+		GenerationTokens:     200_000,
+		TotalTokens:          1_000_000,
+		EffectiveInputTokens: 800_000,
+	}
+	newCalc(defaultConfig()).CalculateCosts([]*InferenceCost{ic})
+
+	if got := ic.CostPerMillionTokens[CostBasisAllocation]; !floatEq(got, 4.0) {
+		t.Errorf("allocation blended want 4.0 got %f", got)
+	}
+	if got := ic.CostPerMillionTokens[CostBasisUsage]; !floatEq(got, 1.0) {
+		t.Errorf("usage blended want 1.0 got %f", got)
+	}
+}
+
+// TestCalculator_BlendedZeroTokens checks that no allocation-basis blended
+// rate is produced when the token counts are zero.
+func TestCalculator_BlendedZeroTokens(t *testing.T) {
+	ic := &InferenceCost{AllocationTotalCost: 1.0, UsageTotalCost: 0.5}
+	calc := newCalc(defaultConfig())
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if got := ic.CostPerMillionTokens[CostBasisAllocation]; got != 0 {
+		t.Error("expected zero blended cost when TotalTokens == 0")
+	}
+}
+
+// ---- compute-time split ----
+
+// TestCalculator_ComputeTimeSplit_BothBases checks the compute-time split for
+// a 70/30 input/output processing-time ratio under both cost bases.
+func TestCalculator_ComputeTimeSplit_BothBases(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  4.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         600_000,
+		GenerationTokens:     400_000,
+		TotalTokens:          1_000_000,
+		EffectiveInputTokens: 600_000, // no cache correction
+		InputProcessingTime:  70.0,
+		OutputProcessingTime: 30.0,
+	}
+	newCalc(defaultConfig()).CalculateCosts([]*InferenceCost{ic})
+
+	// Input share is 0.7 and output share 0.3 of each basis' total cost
+	// (usage total 1.0, allocation total 4.0).
+	wantUsageInput := 0.7 / 600_000 * 1_000_000
+	wantUsageOutput := 0.3 / 400_000 * 1_000_000
+	wantAllocInput := (4.0 * 0.7) / 600_000 * 1_000_000
+	wantAllocOutput := (4.0 * 0.3) / 400_000 * 1_000_000
+
+	if got := ic.InputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantUsageInput) {
+		t.Errorf("usage input want %f got %f", wantUsageInput, got)
+	}
+	if got := ic.OutputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantUsageOutput) {
+		t.Errorf("usage output want %f got %f", wantUsageOutput, got)
+	}
+	if got := ic.InputCostPerMillionTokens[CostBasisAllocation]; !floatEq(got, wantAllocInput) {
+		t.Errorf("alloc input want %f got %f", wantAllocInput, got)
+	}
+	if got := ic.OutputCostPerMillionTokens[CostBasisAllocation]; !floatEq(got, wantAllocOutput) {
+		t.Errorf("alloc output want %f got %f", wantAllocOutput, got)
+	}
+	if method := ic.AllocationMethod; method != AllocationMethodComputeTime {
+		t.Errorf("expected compute_time (no block size), got %s", method)
+	}
+}
+
+func TestCalculator_ComputeTimeSplit_InputOutputSumToTotal(t *testing.T) {
+	cfg := defaultConfig()
+	m := &InferenceCost{
+		AllocationTotalCost:  10.0,
+		UsageTotalCost:       3.0,
+		PromptTokens:         500_000,
+		GenerationTokens:     500_000,
+		TotalTokens:          1_000_000,
+		EffectiveInputTokens: 500_000,
+		InputProcessingTime:  60.0,
+		OutputProcessingTime: 40.0,
+	}
+	newCalc(cfg).CalculateCosts([]*InferenceCost{m})
+
+	// input_cost + output_cost must equal total for each basis.
+	// Back-compute dollar amounts from the per-million rates using PromptTokens.
+	for _, basis := range []CostBasis{CostBasisUsage, CostBasisAllocation} {
+		var totalCost float64
+		if basis == CostBasisUsage {
+			totalCost = m.UsageTotalCost
+		} else {
+			totalCost = m.AllocationTotalCost
+		}
+		inputCost := m.InputCostPerMillionTokens[basis] / 1_000_000 * m.PromptTokens
+		outputCost := m.OutputCostPerMillionTokens[basis] / 1_000_000 * m.GenerationTokens
+		if !floatEq(inputCost+outputCost, totalCost) {
+			t.Errorf("basis=%s: input+output=%.6f want %.6f", basis, inputCost+outputCost, totalCost)
+		}
+	}
+}
+
+// ---- KV cache savings fraction ----
+
+// TestCalculator_CacheSavingsFraction checks that with prefix caching enabled
+// and 8 of 20 prompt tokens cached, CacheSavingsFraction is 0.4, the input
+// rate still divides by PromptTokens, and the method stays compute_time.
+func TestCalculator_CacheSavingsFraction(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  1.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         20,
+		GenerationTokens:     10,
+		TotalTokens:          30,
+		CachedTokens:         8,
+		CacheConfigKnown:     true,
+		PrefixCachingEnabled: true,
+		EffectiveInputTokens: 12,
+		InputProcessingTime:  60,
+		OutputProcessingTime: 40,
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	// The denominator is PromptTokens (20), not EffectiveInputTokens.
+	wantInputCPM := (1.0 * 0.6 / 20) * 1_000_000
+	if got := ic.InputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantInputCPM) {
+		t.Errorf("input CPM want %f got %f", wantInputCPM, got)
+	}
+	// 8 cached out of 20 prompt tokens.
+	if !floatEq(ic.CacheSavingsFraction, 0.4) {
+		t.Errorf("CacheSavingsFraction want 0.4 got %f", ic.CacheSavingsFraction)
+	}
+	// Cache hits do not change the reported method.
+	if method := ic.AllocationMethod; method != AllocationMethodComputeTime {
+		t.Errorf("expected compute_time, got %s", method)
+	}
+}
+
+// TestCalculator_CacheCorrection_Disabled_WhenConfigUnknown checks that an
+// unknown cache config (CacheConfigKnown=false, standing in for
+// vllm:cache_config_info being unavailable) with no cached tokens yields the
+// compute_time method and a zero CacheSavingsFraction.
+func TestCalculator_CacheCorrection_Disabled_WhenConfigUnknown(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  1.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         20,
+		GenerationTokens:     10,
+		TotalTokens:          30,
+		CachedTokens:         0,
+		CacheConfigKnown:     false,
+		EffectiveInputTokens: 20,
+		InputProcessingTime:  60,
+		OutputProcessingTime: 40,
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if method := ic.AllocationMethod; method != AllocationMethodComputeTime {
+		t.Errorf("expected compute_time when cache config unknown, got %s", method)
+	}
+	wantInput := (1.0 * 0.6 / 20) * 1_000_000
+	if got := ic.InputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantInput) {
+		t.Errorf("want %f got %f", wantInput, got)
+	}
+	if fraction := ic.CacheSavingsFraction; fraction != 0 {
+		t.Errorf("CacheSavingsFraction want 0 when config unknown, got %f", fraction)
+	}
+}
+
+// TestCalculator_PrefixCachingOff_WhenConfigKnownAndDisabled checks that a
+// known cache config with prefix caching disabled is reported as
+// prefix_caching_off.
+func TestCalculator_PrefixCachingOff_WhenConfigKnownAndDisabled(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  1.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         100,
+		GenerationTokens:     50,
+		TotalTokens:          150,
+		CachedTokens:         0,
+		CacheConfigKnown:     true,
+		PrefixCachingEnabled: false, // explicitly disabled
+		EffectiveInputTokens: 100,
+		InputProcessingTime:  70,
+		OutputProcessingTime: 30,
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if method := ic.AllocationMethod; method != AllocationMethodPrefixCachingOff {
+		t.Errorf("expected prefix_caching_off, got %s", method)
+	}
+}
+
+// TestCalculator_CacheCorrection_Disabled_WhenNoCacheHits checks that enabled
+// prefix caching with zero cached tokens in the window is still reported as
+// compute_time.
+func TestCalculator_CacheCorrection_Disabled_WhenNoCacheHits(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  1.0,
+		UsageTotalCost:       1.0,
+		PromptTokens:         100,
+		GenerationTokens:     50,
+		TotalTokens:          150,
+		CachedTokens:         0, // no hits in this window
+		CacheConfigKnown:     true,
+		PrefixCachingEnabled: true, // caching is on, just no hits occurred
+		EffectiveInputTokens: 100,
+		InputProcessingTime:  70,
+		OutputProcessingTime: 30,
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if method := ic.AllocationMethod; method != AllocationMethodComputeTime {
+		t.Errorf("expected compute_time when prefix caching enabled but no hits in window, got %s", method)
+	}
+}
+
+// ---- multiplier fallback ----
+
+// TestCalculator_MultiplierFallback_BothBases checks that, without timing
+// data, compute-time mode falls back to the multiplier method and prices
+// output tokens at 2.5x the input rate under both cost bases.
+func TestCalculator_MultiplierFallback_BothBases(t *testing.T) {
+	// Processing times are left at zero to force the fallback.
+	ic := &InferenceCost{
+		AllocationTotalCost:  5.0,
+		UsageTotalCost:       2.0,
+		PromptTokens:         800_000,
+		GenerationTokens:     200_000,
+		TotalTokens:          1_000_000,
+		EffectiveInputTokens: 800_000,
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if method := ic.AllocationMethod; method != AllocationMethodMultiplier {
+		t.Errorf("expected multiplier method, got %s", method)
+	}
+
+	// Weighted tokens: 800000 input + 200000 output * 2.5 = 1300000.
+	// The input rate per token is totalCost / weighted for each basis.
+	cases := []struct {
+		basis     CostBasis
+		totalCost float64
+	}{
+		{CostBasisUsage, 2.0},
+		{CostBasisAllocation, 5.0},
+	}
+	for _, tc := range cases {
+		weighted := 800_000.0 + 200_000.0*2.5
+		wantInput := (tc.totalCost / weighted) * 1_000_000
+		wantOutput := wantInput * 2.5
+		if got := ic.InputCostPerMillionTokens[tc.basis]; !floatEq(got, wantInput) {
+			t.Errorf("basis=%s input want %f got %f", tc.basis, wantInput, got)
+		}
+		if got := ic.OutputCostPerMillionTokens[tc.basis]; !floatEq(got, wantOutput) {
+			t.Errorf("basis=%s output want %f got %f", tc.basis, wantOutput, got)
+		}
+	}
+}
+
+// TestCalculator_MultiplierFallback_ZeroTokens checks that, in multiplier
+// mode with all token counts left at zero, no per-million input or output
+// cost is produced for either cost basis.
+func TestCalculator_MultiplierFallback_ZeroTokens(t *testing.T) {
+	// EffectiveInputTokens and GenerationTokens are both 0.
+	m := &InferenceCost{AllocationTotalCost: 1.0, UsageTotalCost: 0.5}
+	cfg := &Config{AllocationMode: AllocationModeMultiplier, OutputTokenCostMultiplier: 2.5}
+	newCalc(cfg).CalculateCosts([]*InferenceCost{m})
+
+	// Assert every basis/direction combination, not just a sample of them.
+	for _, basis := range []CostBasis{CostBasisUsage, CostBasisAllocation} {
+		if got := m.InputCostPerMillionTokens[basis]; got != 0 {
+			t.Errorf("basis=%s: expected zero input cost per million tokens when tokens are zero, got %f", basis, got)
+		}
+		if got := m.OutputCostPerMillionTokens[basis]; got != 0 {
+			t.Errorf("basis=%s: expected zero output cost per million tokens when tokens are zero, got %f", basis, got)
+		}
+	}
+}
+
+// TestCalculator_IncompleteTimingData_FallsBackToMultiplier checks that
+// one-sided timing data (input time set, output time zero) in compute-time
+// mode is treated as incomplete and produces the multiplier split.
+func TestCalculator_IncompleteTimingData_FallsBackToMultiplier(t *testing.T) {
+	ic := &InferenceCost{
+		AllocationTotalCost:  5.0,
+		UsageTotalCost:       2.0,
+		PromptTokens:         800_000,
+		GenerationTokens:     200_000,
+		TotalTokens:          1_000_000,
+		EffectiveInputTokens: 800_000,
+		InputProcessingTime:  60,
+		OutputProcessingTime: 0, // incomplete timing data
+	}
+	calc := newCalc(&Config{
+		AllocationMode:            AllocationModeComputeTime,
+		OutputTokenCostMultiplier: 2.5,
+	})
+	calc.CalculateCosts([]*InferenceCost{ic})
+
+	if method := ic.AllocationMethod; method != AllocationMethodMultiplier {
+		t.Fatalf("expected multiplier fallback for incomplete timing data, got %s", method)
+	}
+
+	// Expected usage-basis rates under the 2.5x multiplier.
+	weighted := 800_000.0 + 200_000.0*2.5
+	wantInput := (2.0 / weighted) * 1_000_000
+	wantOutput := wantInput * 2.5
+
+	if got := ic.InputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantInput) {
+		t.Errorf("usage input want %f got %f", wantInput, got)
+	}
+	if got := ic.OutputCostPerMillionTokens[CostBasisUsage]; !floatEq(got, wantOutput) {
+		t.Errorf("usage output want %f got %f", wantOutput, got)
+	}
+}

+ 606 - 0
pkg/inferencecost/collector.go

@@ -0,0 +1,606 @@
+package inferencecost
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/filter/allocation"
+	"github.com/opencost/opencost/core/pkg/filter/ops"
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/core/pkg/opencost"
+	"github.com/opencost/opencost/core/pkg/source"
+)
+
+// AllocationQuerier is the slice of the cost model that the Collector calls to
+// obtain allocation data. It is an interface so tests can supply a fake.
+type AllocationQuerier interface {
+	// ComputeAllocation returns the AllocationSet for the start..end window, or an error.
+	ComputeAllocation(start, end time.Time) (*opencost.AllocationSet, error)
+}
+
+// Collector joins per-model infrastructure costs from the OpenCost allocation
+// layer with the token, timing, and cache metrics returned by the data source.
+type Collector struct {
+	allocationQuerier AllocationQuerier     // queried via ComputeAllocation for cost data
+	metricsQuerier    source.MetricsQuerier // queried via the QueryInference* methods
+	config            *Config               // supplies ModelLabel and the shared-infra label/value
+}
+
+// NewCollector builds a Collector wired to the given configuration, allocation
+// querier, and metrics querier. The arguments are stored as-is without
+// validation, and the returned error is currently always nil.
+func NewCollector(config *Config, querier AllocationQuerier, metricsQuerier source.MetricsQuerier) (*Collector, error) {
+	collector := new(Collector)
+	collector.config = config
+	collector.allocationQuerier = querier
+	collector.metricsQuerier = metricsQuerier
+	return collector, nil
+}
+
+// CollectMetrics gathers allocation costs and inference metrics for the window
+// start..end and returns one InferenceCost per model/namespace key. Picking the
+// window is the caller's responsibility (e.g. the runner uses now-interval..now;
+// the API uses the request window).
+//
+// Allocation costs, prompt tokens, and generation tokens are required: a failure
+// in any of them aborts collection with a wrapped error. Processing-time,
+// cached-token, and cache-config queries are best-effort: on failure a warning
+// is logged and the corresponding map stays empty.
+func (c *Collector) CollectMetrics(ctx context.Context, start, end time.Time) ([]*InferenceCost, error) {
+	// Infrastructure costs come from the OpenCost allocation layer.
+	allocCosts, err := c.queryAllocationCosts(ctx, start, end)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query allocation costs: %w", err)
+	}
+	log.Infof("InferenceCost: collected allocation costs for %d model/namespace combinations", len(allocCosts))
+
+	// Issue every metric query up front; each call returns a future that is
+	// awaited below.
+	mq := c.metricsQuerier
+	promptFuture := mq.QueryInferencePromptTokens(start, end)
+	generationFuture := mq.QueryInferenceGenerationTokens(start, end)
+	inputTimeFuture := mq.QueryInferenceInputProcessingTime(start, end)
+	outputTimeFuture := mq.QueryInferenceOutputProcessingTime(start, end)
+	cachedFuture := mq.QueryInferenceCachedTokens(start, end)
+	cacheConfigFuture := mq.QueryInferenceCacheConfig(end)
+
+	// Required: prompt and generation token counts.
+	promptResults, err := promptFuture.Await()
+	if err != nil {
+		return nil, fmt.Errorf("failed to query prompt tokens: %w", err)
+	}
+	promptTokens := mergeTokenResults(promptResults)
+
+	generationResults, err := generationFuture.Await()
+	if err != nil {
+		return nil, fmt.Errorf("failed to query generation tokens: %w", err)
+	}
+	generationTokens := mergeTokenResults(generationResults)
+
+	// Optional: processing times. An empty map is passed on when unavailable.
+	inputTimes := make(map[string]float64)
+	if res, awaitErr := inputTimeFuture.Await(); awaitErr == nil {
+		inputTimes = mergeProcessingTimeResults(res)
+	} else {
+		log.Warnf("InferenceCost: failed to query input processing time (will use multiplier fallback): %v", awaitErr)
+	}
+
+	outputTimes := make(map[string]float64)
+	if res, awaitErr := outputTimeFuture.Await(); awaitErr == nil {
+		outputTimes = mergeProcessingTimeResults(res)
+	} else {
+		log.Warnf("InferenceCost: failed to query output processing time (will use multiplier fallback): %v", awaitErr)
+	}
+
+	// Optional: KV cache hits.
+	cached := make(map[string]float64)
+	if res, awaitErr := cachedFuture.Await(); awaitErr == nil {
+		cached = mergeTokenResults(res)
+	} else {
+		log.Warnf("InferenceCost: failed to query KV cache hits (cacheSavingsFraction will be zero): %v", awaitErr)
+	}
+
+	// Optional: KV cache config (only the prefix-caching flag is kept).
+	configs := make(map[string]*cacheConfig)
+	if res, awaitErr := cacheConfigFuture.Await(); awaitErr == nil {
+		configs = mergeCacheConfigResults(res)
+	} else {
+		log.Warnf("InferenceCost: failed to query cache config (prefix_caching_off detection disabled): %v", awaitErr)
+	}
+
+	combined := c.combineMetrics(allocCosts, promptTokens, generationTokens,
+		inputTimes, outputTimes, cached, configs, start, end)
+	return combined, nil
+}
+
+// cacheConfig is the per-model KV cache configuration taken from vllm:cache_config_info.
+type cacheConfig struct {
+	prefixCachingEnabled bool // copied from the data source's PrefixCachingEnabled field
+}
+
+// allocationResult holds the two cost figures and metadata kept per model/namespace key.
+type allocationResult struct {
+	allocationTotalCost float64 // summed TotalCost() from the idle-sharing aggregation
+	usageTotalCost      float64 // summed TotalCost() from the ShareNone aggregation
+	namespace           string  // Properties.Namespace of the aggregated allocation
+	cluster             string  // Properties.Cluster of the aggregated allocation
+	pod                 string  // first non-empty Properties.Pod seen for the key
+	controller          string  // first non-empty Properties.Controller seen for the key
+	controllerKind      string  // first non-empty Properties.ControllerKind seen for the key
+	container           string  // first non-empty Properties.Container seen for the key
+}
+
+// queryAllocationCosts calls the OpenCost allocation layer twice: once with
+// idle sharing (allocation costs) and once without (usage costs), then merges
+// the two per model/namespace key.
+//
+// Allocation costs are meant to reconcile to the bill, while usage costs are
+// meant to reflect active compute only. Two calls were chosen over a single
+// call with idle/shared costs deducted afterwards so that core aggregation
+// logic is not duplicated here, at the price of running the query twice.
+//
+// It returns an error if either underlying query fails. Nil entries in either
+// intermediate map are skipped, so every value in the returned map is non-nil.
+func (c *Collector) queryAllocationCosts(ctx context.Context, start, end time.Time) (map[string]*allocationResult, error) {
+	// Query 1: Allocation costs with idle sharing (reconciles to bill)
+	allocationCosts, err := c.queryAllocationCostsWithIdle(ctx, start, end)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query allocation costs with idle: %w", err)
+	}
+
+	// Query 2: Usage costs without idle sharing (active compute only)
+	usageCosts, err := c.queryAllocationCostsWithoutIdle(ctx, start, end)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query usage costs without idle: %w", err)
+	}
+
+	// Start from the allocation-cost entries. Each one is copied so the
+	// pod/controller/container metadata is retained without aliasing the
+	// intermediate map.
+	results := make(map[string]*allocationResult, len(allocationCosts))
+	for key, allocResult := range allocationCosts {
+		if allocResult == nil {
+			continue
+		}
+		copied := *allocResult
+		copied.usageTotalCost = 0 // Will be filled from usageCosts
+		results[key] = &copied
+	}
+
+	// Fill in usage costs from the second query.
+	for key, usageResult := range usageCosts {
+		// Same nil guard as the allocation loop above; a nil entry stored here
+		// would also crash the debug loop below.
+		if usageResult == nil {
+			continue
+		}
+		if result, exists := results[key]; exists {
+			result.usageTotalCost = usageResult.usageTotalCost
+			continue
+		}
+		// Model exists in usage query but not allocation query (shouldn't happen)
+		log.Warnf("InferenceCost: model %s has usage cost but no allocation cost", key)
+		results[key] = usageResult
+	}
+
+	// Log the usage share of each allocation; skipped when the allocation
+	// cost is not positive to avoid dividing by zero.
+	for key, result := range results {
+		modelName, namespace := parseKey(key)
+		if result.allocationTotalCost > 0 {
+			log.Debugf("InferenceCost: model=%s ns=%s alloc=$%.4f usage=$%.4f (%.1f%% of alloc)",
+				modelName, namespace, result.allocationTotalCost, result.usageTotalCost,
+				(result.usageTotalCost/result.allocationTotalCost)*100)
+		}
+	}
+	return results, nil
+}
+
+// queryAllocationCostsWithIdle computes allocations for the window, aggregates
+// them by the configured model label with weighted idle and shared-cost
+// distribution, and extracts the totals as allocation costs.
+func (c *Collector) queryAllocationCostsWithIdle(ctx context.Context, start, end time.Time) (map[string]*allocationResult, error) {
+	set, err := c.allocationQuerier.ComputeAllocation(start, end)
+	if err != nil {
+		return nil, err
+	}
+
+	cfg := c.config
+	opts := &opencost.AllocationAggregationOptions{
+		ShareIdle:  opencost.ShareWeighted,
+		ShareSplit: opencost.ShareWeighted,
+		// Matches shared-infrastructure allocations by label. Per the original
+		// author's note, this moves them into the share set to be distributed
+		// among the other allocations rather than landing in __unallocated__.
+		Share: ops.Eq(
+			ops.WithKey(allocation.FieldLabel, cfg.SharedInfraLabel),
+			cfg.SharedInfraLabelValue,
+		),
+		SharedLabels: map[string][]string{cfg.SharedInfraLabel: {cfg.SharedInfraLabelValue}},
+	}
+
+	err = set.AggregateBy([]string{"label:" + cfg.ModelLabel}, opts)
+	if err != nil {
+		return nil, fmt.Errorf("AggregateBy label:%s: %w", cfg.ModelLabel, err)
+	}
+
+	return c.extractAllocationResults(set, true)
+}
+
+// queryAllocationCostsWithoutIdle computes allocations for the window,
+// aggregates them by the configured model label with idle and shared-cost
+// distribution turned off, and extracts the totals as usage costs.
+func (c *Collector) queryAllocationCostsWithoutIdle(ctx context.Context, start, end time.Time) (map[string]*allocationResult, error) {
+	set, err := c.allocationQuerier.ComputeAllocation(start, end)
+	if err != nil {
+		return nil, err
+	}
+
+	cfg := c.config
+	opts := &opencost.AllocationAggregationOptions{
+		ShareIdle:  opencost.ShareNone,
+		ShareSplit: opencost.ShareNone,
+		// The Share filter is still supplied even though nothing is split
+		// (ShareSplit: ShareNone). Per the original author's note, it separates
+		// shared-infrastructure allocations from regular ones so they do not
+		// aggregate into __unallocated__.
+		Share: ops.Eq(
+			ops.WithKey(allocation.FieldLabel, cfg.SharedInfraLabel),
+			cfg.SharedInfraLabelValue,
+		),
+		SharedLabels: map[string][]string{cfg.SharedInfraLabel: {cfg.SharedInfraLabelValue}},
+	}
+
+	err = set.AggregateBy([]string{"label:" + cfg.ModelLabel}, opts)
+	if err != nil {
+		return nil, fmt.Errorf("AggregateBy label:%s: %w", cfg.ModelLabel, err)
+	}
+
+	return c.extractAllocationResults(set, false)
+}
+
+// extractAllocationResults extracts cost data from an AllocationSet.
+func (c *Collector) extractAllocationResults(as *opencost.AllocationSet, isAllocationCost bool) (map[string]*allocationResult, error) {
+	results := make(map[string]*allocationResult)
+	for name, alloc := range as.Allocations {
+		if alloc == nil {
+			continue
+		}
+		// Skip the synthetic __idle__ and __unallocated__ entries.
+		if strings.HasPrefix(name, "__") {
+			continue
+		}
+
+		modelName := extractModelName(alloc, c.config.ModelLabel)
+		if modelName == "" {
+			continue
+		}
+
+		namespace := ""
+		cluster := ""
+		pod := ""
+		controller := ""
+		controllerKind := ""
+		container := ""
+		
+		if alloc.Properties != nil {
+			namespace = alloc.Properties.Namespace
+			cluster = alloc.Properties.Cluster
+			pod = alloc.Properties.Pod
+			controller = alloc.Properties.Controller
+			controllerKind = alloc.Properties.ControllerKind
+			container = alloc.Properties.Container
+		}
+
+		key := modelNamespaceKey(modelName, namespace)
+
+		// Accumulate costs for the same model/namespace key
+		existing, exists := results[key]
+		if !exists {
+			existing = &allocationResult{
+				namespace:      namespace,
+				cluster:        cluster,
+				pod:            pod,
+				controller:     controller,
+				controllerKind: controllerKind,
+				container:      container,
+			}
+			results[key] = existing
+		}
+
+		if isAllocationCost {
+			// For allocation cost: use TotalCost() which includes idle and shared
+			existing.allocationTotalCost += alloc.TotalCost()
+		} else {
+			// For usage cost: use TotalCost() from the ShareNone query (no idle)
+			existing.usageTotalCost += alloc.TotalCost()
+		}
+		
+		// When aggregating multiple allocations, preserve the first non-empty values
+		// for pod, controller, and container. This provides representative values
+		// when costs are aggregated across multiple pods/containers.
+		if existing.pod == "" && pod != "" {
+			existing.pod = pod
+		}
+		if existing.controller == "" && controller != "" {
+			existing.controller = controller
+		}
+		if existing.controllerKind == "" && controllerKind != "" {
+			existing.controllerKind = controllerKind
+		}
+		if existing.container == "" && container != "" {
+			existing.container = container
+		}
+	}
+
+	return results, nil
+}
+
+// extractModelName returns the model name for an aggregated allocation, or ""
+// when alloc is nil. After AggregateBy("label:<key>") the allocation's Name is
+// the label value, so the label-key argument is unused.
+func extractModelName(alloc *opencost.Allocation, _ string) string {
+	if alloc != nil {
+		return alloc.Name
+	}
+	return ""
+}
+
+// canonicalModelName drops any org/vendor prefix from a model name, returning
+// only the text after the last "/". Names without a "/" are returned unchanged.
+// Examples:
+//   - "MiniMaxAI/MiniMax-M2.7" -> "MiniMax-M2.7"
+//   - "google/gemma-4-31B" -> "gemma-4-31B"
+func canonicalModelName(modelName string) string {
+	slash := strings.LastIndexByte(modelName, '/')
+	if slash < 0 {
+		return modelName
+	}
+	return modelName[slash+1:]
+}
+
+// reconcileTokenKeys re-keys metric entries whose model name differs from the
+// allocation-backed model key for the same namespace only by an org/vendor
+// prefix.
+//
+// Two common mismatch examples:
+//  1. Fully-qualified vLLM model name vs short allocation label:
+//     "google/gemma-4-31B:llm-d-pic" -> "gemma-4-31B:llm-d-pic"
+//  2. Fully-qualified vLLM model name vs short allocation label with a
+//     different vendor/org prefix:
+//     "MiniMaxAI/MiniMax-M2.7:llm-d-pic" -> "MiniMax-M2.7:llm-d-pic"
+//
+// Target selection per short name and namespace:
+//   - an allocation key already in short form always wins;
+//   - otherwise, among several prefixed allocation keys, the lexicographically
+//     smallest is used so the result does not depend on map iteration order.
+//
+// A metric key equal to its chosen target is kept as-is, and keys with no
+// matching allocation are kept unchanged. Values of several metric keys that
+// map to the same target are summed. A warning is logged for every remapped
+// key so the mismatch is auditable.
+//
+// Returns the reconciled map and the set of original keys that were remapped
+// (callers exclude those later).
+func reconcileTokenKeys(tokens map[string]float64, allocCosts map[string]*allocationResult) (map[string]float64, map[string]struct{}) {
+	// Index: shortName:namespace -> allocation key to remap onto.
+	shortIndex := make(map[string]string, len(allocCosts))
+	for allocKey := range allocCosts {
+		modelName, namespace := parseKey(allocKey)
+		shortName := canonicalModelName(modelName)
+		shortKey := modelNamespaceKey(shortName, namespace)
+
+		if existing, found := shortIndex[shortKey]; found {
+			existingModelName, _ := parseKey(existing)
+			if existingModelName == shortName {
+				// A short-form allocation key is already indexed; keep it.
+				continue
+			}
+			if modelName != shortName && existing < allocKey {
+				// Both candidates carry a prefix: keep the smaller key so the
+				// choice is deterministic.
+				continue
+			}
+		}
+		shortIndex[shortKey] = allocKey
+	}
+
+	out := make(map[string]float64, len(tokens))
+	remappedKeys := make(map[string]struct{})
+
+	for k, v := range tokens {
+		modelName, namespace := parseKey(k)
+		shortName := canonicalModelName(modelName)
+		shortKey := modelNamespaceKey(shortName, namespace)
+
+		if allocKey, found := shortIndex[shortKey]; found && k != allocKey {
+			log.Warnf("InferenceCost: remapping metric key %q → %q (model-name mismatch with allocation label)", k, allocKey)
+			out[allocKey] += v
+			remappedKeys[k] = struct{}{}
+			continue
+		}
+
+		// += rather than = so a value already remapped onto this key is kept.
+		out[k] += v
+	}
+	return out, remappedKeys
+}
+
+// reconcileCacheConfigKeys re-keys a cacheConfig map so that entries whose
+// model name differs from an allocation key only by an org/vendor prefix end
+// up under the allocation key.
+//
+// Target selection per short name and namespace prefers an allocation key that
+// is already in short form; among several prefixed allocation keys the
+// lexicographically smallest is used, so the choice does not depend on map
+// iteration order. When a config already exists under the target key itself,
+// that exact entry is kept and the remapped one is dropped.
+//
+// Returns the reconciled map and the set of original keys that were remapped.
+func reconcileCacheConfigKeys(configs map[string]*cacheConfig, allocCosts map[string]*allocationResult) (map[string]*cacheConfig, map[string]struct{}) {
+	// Index: shortName:namespace -> allocation key to remap onto.
+	shortIndex := make(map[string]string, len(allocCosts))
+	for allocKey := range allocCosts {
+		modelName, namespace := parseKey(allocKey)
+		shortName := canonicalModelName(modelName)
+		shortKey := modelNamespaceKey(shortName, namespace)
+
+		if existing, found := shortIndex[shortKey]; found {
+			existingModelName, _ := parseKey(existing)
+			if existingModelName == shortName {
+				// A short-form allocation key is already indexed; keep it.
+				continue
+			}
+			if modelName != shortName && existing < allocKey {
+				// Both candidates carry a prefix: keep the smaller key.
+				continue
+			}
+		}
+		shortIndex[shortKey] = allocKey
+	}
+
+	out := make(map[string]*cacheConfig, len(configs))
+	remappedKeys := make(map[string]struct{})
+
+	for k, v := range configs {
+		modelName, namespace := parseKey(k)
+		shortName := canonicalModelName(modelName)
+		shortKey := modelNamespaceKey(shortName, namespace)
+
+		if allocKey, found := shortIndex[shortKey]; found && k != allocKey {
+			log.Warnf("InferenceCost: remapping cache config key %q → %q (model-name mismatch with allocation label)", k, allocKey)
+			// An entry stored under the target key itself takes precedence; it
+			// is written by its own iteration of this loop.
+			if _, hasExact := configs[allocKey]; !hasExact {
+				out[allocKey] = v
+			}
+			remappedKeys[k] = struct{}{}
+			continue
+		}
+
+		out[k] = v
+	}
+	return out, remappedKeys
+}
+
+// combineMetrics joins all data sources into InferenceCost structs.
+func (c *Collector) combineMetrics(
+	allocCosts map[string]*allocationResult,
+	promptTokens, generationTokens,
+	inputProcessingTime, outputProcessingTime,
+	cachedTokens map[string]float64,
+	cacheConfigs map[string]*cacheConfig,
+	start, end time.Time,
+) []*InferenceCost {
+
+	// Reconcile token map keys against allocation keys to handle the case where
+	// vLLM reports a fully-qualified model name (e.g. "org/model") but the K8s
+	// pod label uses only the short name ("model"). Re-keying fires only when a
+	// mismatch is detected; keys that already match are left unchanged.
+	// Track which keys were remapped so we can exclude them from final results.
+	var remappedKeys map[string]struct{}
+	promptTokens, remappedKeys = reconcileTokenKeys(promptTokens, allocCosts)
+
+	var remapped map[string]struct{}
+	generationTokens, remapped = reconcileTokenKeys(generationTokens, allocCosts)
+	for k := range remapped {
+		remappedKeys[k] = struct{}{}
+	}
+
+	inputProcessingTime, remapped = reconcileTokenKeys(inputProcessingTime, allocCosts)
+	for k := range remapped {
+		remappedKeys[k] = struct{}{}
+	}
+
+	outputProcessingTime, remapped = reconcileTokenKeys(outputProcessingTime, allocCosts)
+	for k := range remapped {
+		remappedKeys[k] = struct{}{}
+	}
+
+	cachedTokens, remapped = reconcileTokenKeys(cachedTokens, allocCosts)
+	for k := range remapped {
+		remappedKeys[k] = struct{}{}
+	}
+
+	cacheConfigs, remapped = reconcileCacheConfigKeys(cacheConfigs, allocCosts)
+	for k := range remapped {
+		remappedKeys[k] = struct{}{}
+	}
+
+	// Union of all keys across sources.
+	// Include timing/cache maps as well so models that only appear in those
+	// sources are not dropped before cost calculation.
+	keys := make(map[string]struct{})
+	for k := range allocCosts {
+		keys[k] = struct{}{}
+	}
+	for k := range promptTokens {
+		keys[k] = struct{}{}
+	}
+	for k := range generationTokens {
+		keys[k] = struct{}{}
+	}
+	for k := range inputProcessingTime {
+		keys[k] = struct{}{}
+	}
+	for k := range outputProcessingTime {
+		keys[k] = struct{}{}
+	}
+	for k := range cachedTokens {
+		keys[k] = struct{}{}
+	}
+	for k := range cacheConfigs {
+		keys[k] = struct{}{}
+	}
+
+	results := make([]*InferenceCost, 0, len(keys))
+	for key := range keys {
+		// Skip keys that were remapped to avoid duplicate series
+		if _, wasRemapped := remappedKeys[key]; wasRemapped {
+			continue
+		}
+
+		modelName, namespace := parseKey(key)
+
+		cfg := cacheConfigs[key]
+		var prefixCachingEnabled, cacheConfigKnown bool
+		if cfg != nil {
+			prefixCachingEnabled = cfg.prefixCachingEnabled
+			cacheConfigKnown = true
+		}
+
+		ic := &InferenceCost{
+			Properties: InferenceCostProperties{
+				ModelName:    modelName,
+				Namespace:    namespace,
+				WorkloadType: "inference",
+			},
+			PromptTokens:         promptTokens[key],
+			GenerationTokens:     generationTokens[key],
+			InputProcessingTime:  inputProcessingTime[key],
+			OutputProcessingTime: outputProcessingTime[key],
+			CachedTokens:         cachedTokens[key],
+			PrefixCachingEnabled: prefixCachingEnabled,
+			CacheConfigKnown:     cacheConfigKnown,
+			Timestamp:            end,
+		}
+		ic.Window.Start = start
+		ic.Window.End = end
+
+		if ar, ok := allocCosts[key]; ok {
+			ic.AllocationTotalCost = ar.allocationTotalCost
+			ic.UsageTotalCost = ar.usageTotalCost
+			ic.Properties.Cluster = ar.cluster
+			ic.Properties.Pod = ar.pod
+			ic.Properties.Controller = ar.controller
+			ic.Properties.ControllerKind = ar.controllerKind
+			ic.Properties.Container = ar.container
+			if namespace == "" {
+				ic.Properties.Namespace = ar.namespace
+			}
+		}
+
+		ic.TotalTokens = ic.PromptTokens + ic.GenerationTokens
+		ic.EffectiveInputTokens = ic.PromptTokens - ic.CachedTokens
+		if ic.EffectiveInputTokens < 0 {
+			ic.EffectiveInputTokens = 0
+		}
+
+		results = append(results, ic)
+	}
+	return results
+}
+
+// modelNamespaceKey builds the "model:namespace" string used as the map key
+// throughout the collector.
+func modelNamespaceKey(modelName, namespace string) string {
+	return strings.Join([]string{modelName, namespace}, ":")
+}
+
+// parseKey splits a "model:namespace" key into its two parts.
+//
+// The split happens at the LAST colon: Kubernetes namespace names cannot
+// contain ":", whereas a model name may, so splitting there keeps parseKey the
+// inverse of modelNamespaceKey for such names. A key without any colon is
+// returned whole as the model name with the namespace "unknown".
+func parseKey(key string) (modelName, namespace string) {
+	idx := strings.LastIndexByte(key, ':')
+	if idx < 0 {
+		return key, "unknown"
+	}
+	return key[:idx], key[idx+1:]
+}
+
+
+// mergeTokenResults flattens a slice of InferenceTokensResult into one map.
+// Nil results are skipped. When the same key appears in more than one result,
+// the value from the later result replaces the earlier one.
+func mergeTokenResults(results []*source.InferenceTokensResult) map[string]float64 {
+	merged := make(map[string]float64)
+	for _, result := range results {
+		if result == nil {
+			continue
+		}
+		for k, v := range result.Values {
+			merged[k] = v
+		}
+	}
+	return merged
+}
+
+// mergeProcessingTimeResults flattens a slice of InferenceProcessingTimeResult
+// into one map. Nil results are skipped. When the same key appears in more
+// than one result, the value from the later result replaces the earlier one.
+func mergeProcessingTimeResults(results []*source.InferenceProcessingTimeResult) map[string]float64 {
+	merged := make(map[string]float64)
+	for _, result := range results {
+		if result == nil {
+			continue
+		}
+		for k, v := range result.Values {
+			merged[k] = v
+		}
+	}
+	return merged
+}
+
+// mergeCacheConfigResults flattens a slice of InferenceCacheConfigResult into
+// one map of cacheConfig, keeping only the prefix-caching flag. Nil results
+// and nil config entries are skipped. When the same key appears in more than
+// one result, the later one wins.
+func mergeCacheConfigResults(results []*source.InferenceCacheConfigResult) map[string]*cacheConfig {
+	merged := make(map[string]*cacheConfig)
+	for _, result := range results {
+		if result == nil {
+			continue
+		}
+		for k, v := range result.Configs {
+			if v == nil {
+				// No flag to read; leave the key absent so its config counts
+				// as unknown.
+				continue
+			}
+			merged[k] = &cacheConfig{prefixCachingEnabled: v.PrefixCachingEnabled}
+		}
+	}
+	return merged
+}

+ 455 - 0
pkg/inferencecost/collector_test.go

@@ -0,0 +1,455 @@
+package inferencecost
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	"github.com/opencost/opencost/core/pkg/source"
+)
+
+// mockQuerier is a test double for AllocationQuerier.
+type mockQuerier struct {
+	set *opencost.AllocationSet // returned on every call when sets is empty
+	err error                   // when non-nil, returned instead of any set
+	// For dual-query tests: sets are handed out in order, then the last one repeats.
+	callCount int                       // number of entries of sets handed out so far
+	sets      []*opencost.AllocationSet // per-call return values; takes priority over set
+}
+
+// ComputeAllocation ignores the window and returns, in priority order: the
+// configured error; the next unused entry of sets (repeating the last entry
+// once they run out); or the single configured set.
+func (m *mockQuerier) ComputeAllocation(start, end time.Time) (*opencost.AllocationSet, error) {
+	switch {
+	case m.err != nil:
+		return nil, m.err
+	case len(m.sets) == 0:
+		// No sequence configured: always the single set.
+		return m.set, nil
+	case m.callCount >= len(m.sets):
+		// Sequence exhausted: keep returning the final entry.
+		return m.sets[len(m.sets)-1], nil
+	}
+
+	next := m.sets[m.callCount]
+	m.callCount++
+	return next, nil
+}
+
+// newMockMetricsQuerierWithInferenceMetrics returns a MockMetricsQuerier with
+// an override registered for each inference query whose argument is non-nil.
+// A nil argument registers no override for that query.
+func newMockMetricsQuerierWithInferenceMetrics(
+	promptTokens map[string]float64,
+	generationTokens map[string]float64,
+	inputTime map[string]float64,
+	outputTime map[string]float64,
+	cachedTokens map[string]float64,
+	cacheConfigs map[string]*source.InferenceCacheConfig,
+) *source.MockMetricsQuerier {
+	mq := source.NewMockMetricsQuerier()
+
+	// Token counts.
+	if promptTokens != nil {
+		override := []*source.InferenceTokensResult{{Values: promptTokens}}
+		mq.SetOverride(source.QueryInferencePromptTokens, override)
+	}
+	if generationTokens != nil {
+		override := []*source.InferenceTokensResult{{Values: generationTokens}}
+		mq.SetOverride(source.QueryInferenceGenerationTokens, override)
+	}
+
+	// Processing times.
+	if inputTime != nil {
+		override := []*source.InferenceProcessingTimeResult{{Values: inputTime}}
+		mq.SetOverride(source.QueryInferenceInputProcessingTime, override)
+	}
+	if outputTime != nil {
+		override := []*source.InferenceProcessingTimeResult{{Values: outputTime}}
+		mq.SetOverride(source.QueryInferenceOutputProcessingTime, override)
+	}
+
+	// KV cache hits and configuration.
+	if cachedTokens != nil {
+		override := []*source.InferenceTokensResult{{Values: cachedTokens}}
+		mq.SetOverride(source.QueryInferenceCachedTokens, override)
+	}
+	if cacheConfigs != nil {
+		override := []*source.InferenceCacheConfigResult{{Configs: cacheConfigs}}
+		mq.SetOverride(source.QueryInferenceCacheConfig, override)
+	}
+
+	return mq
+}
+
+// makeAllocation builds an Allocation test fixture with the given name,
+// resource costs, idle costs, labels, and namespace.
+func makeAllocation(name string, gpuCost, cpuCost, ramCost, gpuCostIdle, cpuCostIdle, ramCostIdle float64, labels map[string]string, namespace string) *opencost.Allocation {
+	props := &opencost.AllocationProperties{
+		Namespace: namespace,
+		Labels:    opencost.AllocationLabels(labels),
+	}
+	return &opencost.Allocation{
+		Name:       name,
+		Properties: props,
+		GPUCost:    gpuCost,
+		CPUCost:    cpuCost,
+		RAMCost:    ramCost,
+		// Idle fields stored directly — they are added into TotalCost by OpenCost
+		// when idle is distributed via ShareWeighted.
+		GPUCostIdle: gpuCostIdle,
+		CPUCostIdle: cpuCostIdle,
+		RAMCostIdle: ramCostIdle,
+	}
+}
+
+// baseConfig returns the Config shared by the collector tests: compute-time
+// allocation with the llm-d model and shared-infrastructure labels.
+func baseConfig() *Config {
+	cfg := new(Config)
+	cfg.PrometheusURL = "http://fake-prometheus:9090"
+	cfg.CollectionInterval = 5 * time.Minute
+	cfg.ModelLabel = "llm-d.ai/model"
+	cfg.SharedInfraLabel = "llm-d.ai/inference-shared"
+	cfg.SharedInfraLabelValue = "true"
+	cfg.AllocationMode = AllocationModeComputeTime
+	cfg.OutputTokenCostMultiplier = 2.5
+	return cfg
+}
+
+// TestCollector_ExtractAllocationResults feeds extractAllocationResults one
+// allocation at a time and checks that the total lands in allocationTotalCost
+// when isAllocationCost is true and in usageTotalCost when it is false, with
+// the other field left at zero.
+func TestCollector_ExtractAllocationResults(t *testing.T) {
+	now := time.Now()
+	collector := &Collector{config: baseConfig()}
+
+	// singleSet wraps one llama-3/llm-prod allocation with the given resource
+	// costs in a five-minute AllocationSet ending now.
+	singleSet := func(gpu, cpu, ram float64) *opencost.AllocationSet {
+		set := opencost.NewAllocationSet(now.Add(-5*time.Minute), now)
+		set.Set(&opencost.Allocation{
+			Name:    "llama-3",
+			GPUCost: gpu,
+			CPUCost: cpu,
+			RAMCost: ram,
+			Properties: &opencost.AllocationProperties{
+				Namespace: "llm-prod",
+			},
+		})
+		return set
+	}
+	key := modelNamespaceKey("llama-3", "llm-prod")
+
+	// Allocation-cost extraction (the "with idle" query).
+	allocResults, err := collector.extractAllocationResults(singleSet(3.0, 0.5, 0.5), true)
+	if err != nil {
+		t.Fatalf("extractAllocationResults (allocation) failed: %v", err)
+	}
+	allocEntry, found := allocResults[key]
+	if !found {
+		t.Fatal("expected allocation result for llama-3/llm-prod")
+	}
+	if !floatEq(allocEntry.allocationTotalCost, 4.0) {
+		t.Errorf("allocationTotalCost want 4.0 got %f", allocEntry.allocationTotalCost)
+	}
+	if allocEntry.usageTotalCost != 0 {
+		t.Errorf("usageTotalCost should be 0 in allocation query, got %f", allocEntry.usageTotalCost)
+	}
+
+	// Usage-cost extraction (the "without idle" query).
+	usageResults, err := collector.extractAllocationResults(singleSet(2.0, 0.3, 0.3), false)
+	if err != nil {
+		t.Fatalf("extractAllocationResults (usage) failed: %v", err)
+	}
+	usageEntry, found := usageResults[key]
+	if !found {
+		t.Fatal("expected usage result for llama-3/llm-prod")
+	}
+	if !floatEq(usageEntry.usageTotalCost, 2.6) {
+		t.Errorf("usageTotalCost want 2.6 got %f", usageEntry.usageTotalCost)
+	}
+	if usageEntry.allocationTotalCost != 0 {
+		t.Errorf("allocationTotalCost should be 0 in usage query, got %f", usageEntry.allocationTotalCost)
+	}
+}
+
+// TestCollector_UsageCost_ExcludesIdle verifies the mathematical relationship
+// between allocation and usage costs when idle is present.
+func TestCollector_UsageCost_ExcludesIdle(t *testing.T) {
+	// With ShareWeighted: AllocationTotalCost = 4.0 (GPU 3.0 + CPU 0.5 + RAM 0.5)
+	// With ShareNone: UsageCost = 2.6 (excludes idle: 1.0 + 0.2 + 0.2 = 1.4)
+
+	allocTotal := 4.0
+	idleGPU, idleCPU, idleRAM := 1.0, 0.2, 0.2
+	expectedUsageCost := allocTotal - (idleGPU + idleCPU + idleRAM)
+
+	if !floatEq(expectedUsageCost, 2.6) {
+		t.Errorf("expected usage cost 2.6 got %f", expectedUsageCost)
+	}
+	if expectedUsageCost >= allocTotal {
+		t.Error("usage cost should be less than allocation cost when idle is present")
+	}
+}
+
+// TestCollector_CombineMetrics_DerivesCachedTokens verifies that combineMetrics
+// passes CachedTokens through directly and derives EffectiveInputTokens correctly.
+func TestCollector_CombineMetrics_DerivesCachedTokens(t *testing.T) {
+	cfg := baseConfig()
+
+	allocCosts := map[string]*allocationResult{
+		"llama-3:llm-prod": {allocationTotalCost: 4.0, usageTotalCost: 2.6, namespace: "llm-prod"},
+	}
+	promptTokens := map[string]float64{"llama-3:llm-prod": 20}
+	genTokens := map[string]float64{"llama-3:llm-prod": 10}
+	inputTime := map[string]float64{}
+	outputTime := map[string]float64{}
+	// vllm:prefix_cache_hits_total reports tokens directly (not blocks).
+	cachedTokens := map[string]float64{"llama-3:llm-prod": 8}
+	cacheConfigs := map[string]*cacheConfig{"llama-3:llm-prod": {prefixCachingEnabled: true}}
+
+	c := &Collector{config: cfg}
+	now := time.Now()
+	results := c.combineMetrics(allocCosts, promptTokens, genTokens, inputTime, outputTime, cachedTokens, cacheConfigs, now.Add(-1*time.Hour), now)
+
+	if len(results) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(results))
+	}
+	m := results[0]
+	if !floatEq(m.CachedTokens, 8) {
+		t.Errorf("CachedTokens want 8 got %f", m.CachedTokens)
+	}
+	if !floatEq(m.EffectiveInputTokens, 12) {
+		t.Errorf("EffectiveInputTokens want 12 got %f", m.EffectiveInputTokens)
+	}
+}
+
+// TestCollector_CombineMetrics_NoCacheHits_FallsBackToPromptTokens verifies that
+// EffectiveInputTokens equals PromptTokens when no cache hits are reported.
+func TestCollector_CombineMetrics_NoCacheHits_FallsBackToPromptTokens(t *testing.T) {
+	cfg := baseConfig()
+
+	allocCosts := map[string]*allocationResult{
+		"llama-3:llm-prod": {allocationTotalCost: 1.0, usageTotalCost: 1.0, namespace: "llm-prod"},
+	}
+	promptTokens := map[string]float64{"llama-3:llm-prod": 1000}
+	genTokens := map[string]float64{"llama-3:llm-prod": 500}
+	// cachedTokens map is empty — simulates metric being unavailable
+	cacheHits := map[string]float64{}
+	cacheConfigs := map[string]*cacheConfig{"llama-3:llm-prod": {prefixCachingEnabled: true}}
+
+	c := &Collector{config: cfg}
+	now := time.Now()
+	results := c.combineMetrics(allocCosts, promptTokens, genTokens,
+		map[string]float64{}, map[string]float64{}, cacheHits, cacheConfigs, now.Add(-1*time.Hour), now)
+
+	if len(results) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(results))
+	}
+	m := results[0]
+	if !floatEq(m.EffectiveInputTokens, 1000) {
+		t.Errorf("EffectiveInputTokens should fall back to PromptTokens=1000, got %f", m.EffectiveInputTokens)
+	}
+}
+
+// TestReconcileTokenKeys_OrgPrefixMismatch checks that metric keys carrying a
+// fully-qualified org/model name are moved under the short-name allocation
+// key and recorded as remapped, while keys that already match an allocation
+// key, or that have no allocation at all, pass through untouched.
+func TestReconcileTokenKeys_OrgPrefixMismatch(t *testing.T) {
+	allocCosts := map[string]*allocationResult{
+		"MiniMax-M2.7:llm-d-pic": {allocationTotalCost: 489.0, namespace: "llm-d-pic"},
+		"gpt-oss-120b:dolev-inf": {allocationTotalCost: 453.0, namespace: "dolev-inf"},
+		// This alloc key already has a slash and no short-name alternative.
+		"meta-llama/Llama-3:prod": {allocationTotalCost: 10.0, namespace: "prod"},
+	}
+
+	tokens := map[string]float64{
+		// Mismatch: vLLM uses full org/model, alloc uses short name.
+		"MiniMaxAI/MiniMax-M2.7:llm-d-pic": 4316.0,
+		"openai/gpt-oss-120b:dolev-inf":    4773.0,
+		// Already matches alloc key — should pass through unchanged.
+		"meta-llama/Llama-3:prod": 1000.0,
+		// No alloc entry at all — should pass through unchanged.
+		"unknown-org/new-model:some-ns": 99.0,
+	}
+
+	out, remappedKeys := reconcileTokenKeys(tokens, allocCosts)
+
+	// expectValue reports format (which takes the value and presence flag)
+	// when key is missing from out or holds a different value.
+	expectValue := func(key string, want float64, format string) {
+		t.Helper()
+		if v, ok := out[key]; !ok || !floatEq(v, want) {
+			t.Errorf(format, v, ok)
+		}
+	}
+
+	// Remapped entries should appear under the short-name alloc keys.
+	expectValue("MiniMax-M2.7:llm-d-pic", 4316.0, "MiniMax-M2.7:llm-d-pic want 4316.0 got %v (ok=%v)")
+	expectValue("gpt-oss-120b:dolev-inf", 4773.0, "gpt-oss-120b:dolev-inf want 4773.0 got %v (ok=%v)")
+
+	// Original org-prefixed keys must be gone.
+	if _, present := out["MiniMaxAI/MiniMax-M2.7:llm-d-pic"]; present {
+		t.Error("org-prefixed key MiniMaxAI/MiniMax-M2.7:llm-d-pic should have been removed")
+	}
+	if _, present := out["openai/gpt-oss-120b:dolev-inf"]; present {
+		t.Error("org-prefixed key openai/gpt-oss-120b:dolev-inf should have been removed")
+	}
+
+	// Verify remapped keys are tracked.
+	if _, tracked := remappedKeys["MiniMaxAI/MiniMax-M2.7:llm-d-pic"]; !tracked {
+		t.Error("MiniMaxAI/MiniMax-M2.7:llm-d-pic should be in remappedKeys")
+	}
+	if _, tracked := remappedKeys["openai/gpt-oss-120b:dolev-inf"]; !tracked {
+		t.Error("openai/gpt-oss-120b:dolev-inf should be in remappedKeys")
+	}
+
+	// Keys that already matched or had no alloc entry pass through unchanged.
+	expectValue("meta-llama/Llama-3:prod", 1000.0, "meta-llama/Llama-3:prod want 1000.0 got %v (ok=%v)")
+	expectValue("unknown-org/new-model:some-ns", 99.0, "unknown-org/new-model:some-ns want 99.0 got %v (ok=%v)")
+}
+
+func TestReconcileTokenKeys_PrefersShortAllocationKeyWhenBothFormsExist(t *testing.T) {
+	allocCosts := map[string]*allocationResult{
+		"gemma-4-31B:llm-d-pic":        {allocationTotalCost: 10.0, namespace: "llm-d-pic"},
+		"google/gemma-4-31B:llm-d-pic": {allocationTotalCost: 1.0, namespace: "llm-d-pic"},
+	}
+
+	tokens := map[string]float64{
+		"google/gemma-4-31B:llm-d-pic": 123.0,
+	}
+
+	out, remappedKeys := reconcileTokenKeys(tokens, allocCosts)
+
+	if v, ok := out["gemma-4-31B:llm-d-pic"]; !ok || !floatEq(v, 123.0) {
+		t.Errorf("gemma-4-31B:llm-d-pic want 123.0 got %v (ok=%v)", v, ok)
+	}
+	if _, ok := out["google/gemma-4-31B:llm-d-pic"]; ok {
+		t.Error("google/gemma-4-31B:llm-d-pic should have been folded into gemma-4-31B:llm-d-pic")
+	}
+	if _, ok := remappedKeys["google/gemma-4-31B:llm-d-pic"]; !ok {
+		t.Error("google/gemma-4-31B:llm-d-pic should be in remappedKeys")
+	}
+}
+
+// TestQueryCounterDelta_Formula pins the counter-delta arithmetic: the delta is
+// endVal - startVal, and a negative delta (counter reset) is replaced by endVal
+// so that post-reset activity is still counted.
+//
+// NOTE(review): the formula is re-implemented inline below; this test does not
+// call the collector's own delta code, so it documents the intended arithmetic
+// rather than exercising production code. Consider invoking the real helper.
+func TestQueryCounterDelta_Formula(t *testing.T) {
+	tests := []struct {
+		name     string
+		endVal   float64
+		startVal float64
+		want     float64
+	}{
+		{name: "normal increase", endVal: 1000, startVal: 200, want: 800},
+		{name: "no activity", endVal: 500, startVal: 500, want: 0},
+		{name: "counter reset uses endVal", endVal: 100, startVal: 900, want: 100},
+		{name: "new pod (no start sample)", endVal: 400, startVal: 0, want: 400},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			delta := tt.endVal - tt.startVal
+			if delta < 0 {
+				// Counter reset: everything observed at the end happened after the reset.
+				delta = tt.endVal
+			}
+			// Use the file's float comparison helper, like the other tests here.
+			if !floatEq(delta, tt.want) {
+				t.Errorf("delta = %v, want %v", delta, tt.want)
+			}
+		})
+	}
+}
+
+// TestReconcileTokenKeys_NoMismatch verifies that when all token keys directly
+// match allocation keys, no re-keying occurs and no entries are dropped.
+func TestReconcileTokenKeys_NoMismatch(t *testing.T) {
+	allocCosts := map[string]*allocationResult{
+		"llama-3:prod": {allocationTotalCost: 1.0},
+	}
+	tokens := map[string]float64{
+		"llama-3:prod": 500.0,
+	}
+
+	out, remappedKeys := reconcileTokenKeys(tokens, allocCosts)
+	if v, ok := out["llama-3:prod"]; !ok || !floatEq(v, 500.0) {
+		t.Errorf("want llama-3:prod=500.0 got %v (ok=%v)", v, ok)
+	}
+	if len(out) != 1 {
+		t.Errorf("expected 1 entry, got %d", len(out))
+	}
+	if len(remappedKeys) != 0 {
+		t.Errorf("expected no remapped keys, got %d", len(remappedKeys))
+	}
+}
+
+// TestCollector_CollectMetrics_EmptyMetrics checks the no-data path: with a
+// metrics querier that returns nothing, CollectMetrics is expected to succeed
+// and yield zero results instead of failing.
+func TestCollector_CollectMetrics_EmptyMetrics(t *testing.T) {
+	cfg := baseConfig()
+
+	now := time.Now()
+	allocQuerier := &mockQuerier{
+		set: opencost.NewAllocationSet(now.Add(-5*time.Minute), now),
+	}
+
+	// The stock mock metrics querier; the test relies on it yielding no data.
+	emptyMetrics := source.NewMockMetricsQuerier()
+
+	collector, err := NewCollector(cfg, allocQuerier, emptyMetrics)
+	if err != nil {
+		t.Fatalf("NewCollector returned unexpected error: %v", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+
+	windowEnd := time.Now()
+	windowStart := windowEnd.Add(-5 * time.Minute)
+
+	// Empty metrics must produce an empty, error-free result.
+	results, err := collector.CollectMetrics(ctx, windowStart, windowEnd)
+	if err != nil {
+		t.Errorf("unexpected error with empty metrics: %v", err)
+	}
+	if n := len(results); n != 0 {
+		t.Errorf("expected 0 results with empty metrics, got %d", n)
+	}
+}
+
+func TestCollector_CombineMetrics_IncludesTimingOnlyKeysInUnion(t *testing.T) {
+	cfg := baseConfig()
+	c := &Collector{config: cfg}
+
+	allocCosts := map[string]*allocationResult{}
+	promptTokens := map[string]float64{}
+	genTokens := map[string]float64{}
+	inputTime := map[string]float64{"timing-only:ns1": 60}
+	outputTime := map[string]float64{"timing-only:ns1": 40}
+	cacheHits := map[string]float64{"timing-only:ns1": 2}
+	cacheConfigs := map[string]*cacheConfig{}
+
+	now := time.Now()
+	results := c.combineMetrics(allocCosts, promptTokens, genTokens, inputTime, outputTime, cacheHits, cacheConfigs, now.Add(-1*time.Hour), now)
+
+	if len(results) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(results))
+	}
+
+	m := results[0]
+	if m.Properties.ModelName != "timing-only" || m.Properties.Namespace != "ns1" {
+		t.Fatalf("unexpected properties: model=%s namespace=%s", m.Properties.ModelName, m.Properties.Namespace)
+	}
+	if !floatEq(m.InputProcessingTime, 60) {
+		t.Errorf("InputProcessingTime want 60 got %f", m.InputProcessingTime)
+	}
+	if !floatEq(m.OutputProcessingTime, 40) {
+		t.Errorf("OutputProcessingTime want 40 got %f", m.OutputProcessingTime)
+	}
+	if !floatEq(m.CachedTokens, 2) {
+		t.Errorf("CachedTokens want 2 got %f", m.CachedTokens)
+	}
+}

+ 15 - 0
pkg/inferencecost/env.go

@@ -0,0 +1,15 @@
+package inferencecost
+
+import (
+	"time"
+
+	coreenv "github.com/opencost/opencost/core/pkg/env"
+	"github.com/opencost/opencost/pkg/env"
+)
+
+// isInferenceCostEnabled forwards to env.IsInferenceCostEnabled.
+func isInferenceCostEnabled() bool {
+	return env.IsInferenceCostEnabled()
+}
+
+// getPrometheusURL reads PROMETHEUS_SERVER_ENDPOINT, defaulting to "".
+func getPrometheusURL() string {
+	return coreenv.Get("PROMETHEUS_SERVER_ENDPOINT", "")
+}
+
+// getModelLabel forwards to env.GetInferenceModelLabel.
+func getModelLabel() string {
+	return env.GetInferenceModelLabel()
+}
+
+// getSharedInfraLabel forwards to env.GetInferenceSharedInfraLabel.
+func getSharedInfraLabel() string {
+	return env.GetInferenceSharedInfraLabel()
+}
+
+// getSharedInfraLabelValue forwards to env.GetInferenceSharedInfraLabelValue.
+func getSharedInfraLabelValue() string {
+	return env.GetInferenceSharedInfraLabelValue()
+}
+
+// getCollectionInterval forwards to env.GetInferenceCollectionInterval.
+func getCollectionInterval() time.Duration {
+	return env.GetInferenceCollectionInterval()
+}

+ 137 - 0
pkg/inferencecost/exporter.go

@@ -0,0 +1,137 @@
+package inferencecost
+
+import (
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Exporter owns the llm_* Prometheus gauge vectors and publishes inference
+// cost values through them.
+type Exporter struct {
+	// totalCost backs llm_total_hourly_cost.
+	totalCost *prometheus.GaugeVec
+	// costPerMillionTokens backs llm_cost_per_million_tokens.
+	costPerMillionTokens *prometheus.GaugeVec
+	// cacheSavingsFraction backs llm_cache_savings_fraction.
+	cacheSavingsFraction *prometheus.GaugeVec
+}
+
+// NewExporter builds an Exporter whose three gauge vectors are constructed but
+// not yet registered with any Prometheus registry.
+func NewExporter() *Exporter {
+	totalCostOpts := prometheus.GaugeOpts{
+		Name: "llm_total_hourly_cost",
+		Help: "Hourly infrastructure cost attributed to an LLM model. " +
+			"cost_basis=allocation reconciles to the infrastructure bill (includes idle and shared infra costs). " +
+			"cost_basis=usage reflects active compute only; idle and shared infra costs are excluded and it does NOT reconcile to the bill.",
+	}
+	totalCostLabels := []string{"model_name", "model_version", "namespace", "cost_basis", "workload_type"}
+
+	perMillionOpts := prometheus.GaugeOpts{
+		Name: "llm_cost_per_million_tokens",
+		Help: "Infrastructure cost per 1M tokens. " +
+			"Without phase label: blended cost (input + output combined). " +
+			"phase=prompt: cost per 1M delivered input tokens (promptTokens denominator; see llm_cache_savings_fraction for KV cache utilization). " +
+			"phase=generation: cost per 1M output tokens. " +
+			"cost_basis=allocation includes idle and shared infra; reconciles to bill. " +
+			"cost_basis=usage reflects active compute only; idle and shared infra costs excluded; does NOT reconcile to bill. " +
+			"allocation_method=compute_time: split proportionally by vLLM prefill/decode time; KV cache savings in llm_cache_savings_fraction. " +
+			"allocation_method=prefix_caching_off: same time-based split; prefix caching explicitly disabled on vLLM instance. " +
+			"allocation_method=multiplier: fixed output/input ratio used (timing metrics unavailable).",
+	}
+	perMillionLabels := []string{"model_name", "model_version", "namespace", "cost_basis", "phase", "allocation_method", "workload_type"}
+
+	cacheSavingsOpts := prometheus.GaugeOpts{
+		Name: "llm_cache_savings_fraction",
+		Help: "Fraction of prompt tokens served from the KV cache (range 0–1). " +
+			"A value of 0.9 means 90% of prompt tokens were cache hits. " +
+			"Zero when prefix caching is disabled (allocation_method=prefix_caching_off) or when no cache hits occurred in the window.",
+	}
+	cacheSavingsLabels := []string{"model_name", "model_version", "namespace", "workload_type"}
+
+	exporter := &Exporter{
+		totalCost:            prometheus.NewGaugeVec(totalCostOpts, totalCostLabels),
+		costPerMillionTokens: prometheus.NewGaugeVec(perMillionOpts, perMillionLabels),
+		cacheSavingsFraction: prometheus.NewGaugeVec(cacheSavingsOpts, cacheSavingsLabels),
+	}
+	return exporter
+}
+
+// Register registers all gauge vectors with the default Prometheus registry.
+//
+// It returns the first registration error encountered (e.g. when called
+// twice). Registration is all-or-nothing: if a later vector fails, the vectors
+// registered earlier in this call are unregistered again, so a failed Register
+// never leaves the default registry exposing only part of the llm_* metrics.
+func (e *Exporter) Register() error {
+	collectors := []prometheus.Collector{
+		e.totalCost,
+		e.costPerMillionTokens,
+		e.cacheSavingsFraction,
+	}
+	for i, c := range collectors {
+		if err := prometheus.Register(c); err != nil {
+			// Roll back only what this call registered; collectors[:i] all
+			// succeeded above, so they are safe to remove.
+			for _, done := range collectors[:i] {
+				prometheus.Unregister(done)
+			}
+			return err
+		}
+	}
+	return nil
+}
+
+// Export sets gauge values for all metrics derived from the given InferenceCost slice.
+//
+// All three gauge vectors are reset first so that series for models absent
+// from this call do not persist. For each entry, total cost is normalized to
+// an hourly rate using the entry's window duration, and per-million-token
+// costs are exported for both cost bases (blended, prompt and generation).
+// Nil entries and entries with a non-positive window duration are skipped.
+func (e *Exporter) Export(metrics []*InferenceCost) {
+	e.totalCost.Reset()
+	e.costPerMillionTokens.Reset()
+	e.cacheSavingsFraction.Reset()
+
+	for _, m := range metrics {
+		if m == nil {
+			// A nil entry carries no data; skipping it avoids a nil dereference below.
+			continue
+		}
+
+		// Empty version / workload type are exported as the literal "unknown".
+		version := m.Properties.ModelVersion
+		if version == "" {
+			version = "unknown"
+		}
+		method := string(m.AllocationMethod)
+		workloadType := m.Properties.WorkloadType
+		if workloadType == "" {
+			workloadType = "unknown"
+		}
+
+		// Calculate window duration in hours for normalization to hourly rate
+		windowDuration := m.Window.End.Sub(m.Window.Start)
+		windowHours := windowDuration.Hours()
+		if windowHours <= 0 {
+			// Avoid division by zero; skip this metric if window is invalid
+			log.Warnf("InferenceCost: skipping export for model=%s ns=%s (invalid window duration: %v)",
+				m.Properties.ModelName, m.Properties.Namespace, windowDuration)
+			continue
+		}
+
+		for _, basis := range []CostBasis{CostBasisUsage, CostBasisAllocation} {
+			basisStr := string(basis)
+
+			// Normalize total cost to hourly rate: totalCost / windowHours
+			hourlyCost := totalCostForBasis(m, basis) / windowHours
+			e.totalCost.WithLabelValues(
+				m.Properties.ModelName, version, m.Properties.Namespace, basisStr, workloadType,
+			).Set(hourlyCost)
+
+			// Blended cost (empty phase and allocation_method label values)
+			e.costPerMillionTokens.WithLabelValues(
+				m.Properties.ModelName, version, m.Properties.Namespace, basisStr, "", "", workloadType,
+			).Set(m.CostPerMillionTokens[basis])
+
+			// Input cost (phase=prompt)
+			e.costPerMillionTokens.WithLabelValues(
+				m.Properties.ModelName, version, m.Properties.Namespace, basisStr, "prompt", method, workloadType,
+			).Set(m.InputCostPerMillionTokens[basis])
+
+			// Output cost (phase=generation)
+			e.costPerMillionTokens.WithLabelValues(
+				m.Properties.ModelName, version, m.Properties.Namespace, basisStr, "generation", method, workloadType,
+			).Set(m.OutputCostPerMillionTokens[basis])
+		}
+
+		e.cacheSavingsFraction.WithLabelValues(
+			m.Properties.ModelName, version, m.Properties.Namespace, workloadType,
+		).Set(m.CacheSavingsFraction)
+
+		log.Debugf("InferenceCost: exported model=%s ns=%s alloc_total=$%.4f usage_total=$%.4f method=%s workload_type=%s",
+			m.Properties.ModelName, m.Properties.Namespace,
+			m.AllocationTotalCost, m.UsageTotalCost, method, workloadType)
+	}
+}
+
+// totalCostForBasis picks the total cost that corresponds to basis:
+// UsageTotalCost for CostBasisUsage, AllocationTotalCost for anything else.
+func totalCostForBasis(m *InferenceCost, basis CostBasis) float64 {
+	switch basis {
+	case CostBasisUsage:
+		return m.UsageTotalCost
+	default:
+		return m.AllocationTotalCost
+	}
+}

+ 271 - 0
pkg/inferencecost/exporter_test.go

@@ -0,0 +1,271 @@
+package inferencecost
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+)
+
+// newTestExporter returns a new Exporter together with a private Prometheus
+// registry on which its three gauge vectors are registered, keeping tests
+// independent of each other and of the default registry.
+func newTestExporter(t *testing.T) (*Exporter, *prometheus.Registry) {
+	t.Helper()
+
+	exporter := NewExporter()
+	registry := prometheus.NewRegistry()
+
+	vectors := []prometheus.Collector{
+		exporter.totalCost,
+		exporter.costPerMillionTokens,
+		exporter.cacheSavingsFraction,
+	}
+	for i := range vectors {
+		err := registry.Register(vectors[i])
+		if err != nil {
+			t.Fatalf("failed to register collector: %v", err)
+		}
+	}
+
+	return exporter, registry
+}
+
+// sampleMetric builds a fully populated InferenceCost fixture that uses the
+// given allocation method. Its window is exactly one hour ending now, so the
+// total costs equal their hourly rates (4.0 over one hour = 4.0/hour).
+func sampleMetric(method AllocationMethod) *InferenceCost {
+	windowEnd := time.Now()
+
+	props := InferenceCostProperties{
+		ModelName:    "meta-llama/Llama-3.1-8B",
+		ModelVersion: "v1",
+		Namespace:    "llm-prod",
+	}
+	blended := map[CostBasis]float64{
+		CostBasisAllocation: 4.0,
+		CostBasisUsage:      1.0,
+	}
+	input := map[CostBasis]float64{
+		CostBasisAllocation: 3.5,
+		CostBasisUsage:      0.875,
+	}
+	output := map[CostBasis]float64{
+		CostBasisAllocation: 7.0,
+		CostBasisUsage:      1.75,
+	}
+
+	fixture := &InferenceCost{
+		Properties:                 props,
+		AllocationMethod:           method,
+		AllocationTotalCost:        4.0,
+		UsageTotalCost:             1.0,
+		TotalTokens:                1_000_000,
+		EffectiveInputTokens:       800_000,
+		GenerationTokens:           200_000,
+		CostPerMillionTokens:       blended,
+		InputCostPerMillionTokens:  input,
+		OutputCostPerMillionTokens: output,
+		Timestamp:                  windowEnd,
+	}
+	fixture.Window.Start = windowEnd.Add(-1 * time.Hour)
+	fixture.Window.End = windowEnd
+	return fixture
+}
+
+// TestExporter_MetricNames checks that the three llm_* metric families are
+// gathered after an export and that no family uses the opencost_inference prefix.
+func TestExporter_MetricNames(t *testing.T) {
+	e, reg := newTestExporter(t)
+	e.Export([]*InferenceCost{sampleMetric(AllocationMethodComputeTime)})
+
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("gather: %v", err)
+	}
+
+	// Keep the ordered list for error output and a set for membership tests.
+	names := make([]string, 0, len(mfs))
+	seen := make(map[string]bool, len(mfs))
+	for _, mf := range mfs {
+		n := mf.GetName()
+		names = append(names, n)
+		seen[n] = true
+	}
+
+	for _, want := range []string{
+		"llm_total_hourly_cost",
+		"llm_cost_per_million_tokens",
+		"llm_cache_savings_fraction",
+	} {
+		if !seen[want] {
+			t.Errorf("metric %q not found; registered names: %v", want, names)
+		}
+	}
+
+	for _, n := range names {
+		if !strings.HasPrefix(n, "opencost_inference") {
+			continue
+		}
+		t.Errorf("found deprecated metric name %q — should be llm_*", n)
+	}
+}
+
+// TestExporter_TwoCostBasisSeriesPerModel verifies that llm_total_hourly_cost produces
+// two series (usage + allocation) and llm_cost_per_million_tokens produces
+// six series (2 cost bases × 3 phase values: blended/"", prompt, generation).
+// The test fails, rather than passing vacuously, when gathering errors out or
+// when the llm_total_hourly_cost family is missing from the registry.
+func TestExporter_TwoCostBasisSeriesPerModel(t *testing.T) {
+	e, reg := newTestExporter(t)
+	e.Export([]*InferenceCost{sampleMetric(AllocationMethodComputeTime)})
+
+	// llm_total_hourly_cost should have 2 series (usage + allocation)
+	count := testutil.CollectAndCount(e.totalCost)
+	if count != 2 {
+		t.Errorf("llm_total_hourly_cost: expected 2 series (usage+allocation), got %d", count)
+	}
+
+	// llm_cost_per_million_tokens should have 6 series:
+	// 2 cost bases × 3 phases (blended/"", prompt, generation)
+	count = testutil.CollectAndCount(e.costPerMillionTokens)
+	if count != 6 {
+		t.Errorf("llm_cost_per_million_tokens: expected 6 series (2 bases × 3 phases), got %d", count)
+	}
+
+	// Verify both cost_basis values are present for llm_total_hourly_cost.
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("gather: %v", err)
+	}
+	found := false
+	for _, mf := range mfs {
+		if mf.GetName() != "llm_total_hourly_cost" {
+			continue
+		}
+		found = true
+		bases := make(map[string]bool)
+		for _, m := range mf.GetMetric() {
+			for _, lp := range m.GetLabel() {
+				if lp.GetName() == "cost_basis" {
+					bases[lp.GetValue()] = true
+				}
+			}
+		}
+		if !bases["usage"] {
+			t.Error("llm_total_hourly_cost missing cost_basis=usage series")
+		}
+		if !bases["allocation"] {
+			t.Error("llm_total_hourly_cost missing cost_basis=allocation series")
+		}
+	}
+	if !found {
+		t.Error("llm_total_hourly_cost metric not found")
+	}
+}
+
+// TestExporter_PhaseLabelsAndAllocationMethod verifies that
+// llm_cost_per_million_tokens has the correct phase labels and allocation_method
+// for every allocation method. A fresh exporter and registry are used per
+// method. The test fails, rather than passing vacuously, when gathering errors
+// out or when the metric family is missing.
+func TestExporter_PhaseLabelsAndAllocationMethod(t *testing.T) {
+	for _, method := range []AllocationMethod{
+		AllocationMethodComputeTime,
+		AllocationMethodPrefixCachingOff,
+		AllocationMethodMultiplier,
+	} {
+		e, reg := newTestExporter(t)
+		e.Export([]*InferenceCost{sampleMetric(method)})
+
+		mfs, err := reg.Gather()
+		if err != nil {
+			t.Fatalf("method=%s: gather: %v", method, err)
+		}
+		found := false
+		for _, mf := range mfs {
+			name := mf.GetName()
+			if name != "llm_cost_per_million_tokens" {
+				continue
+			}
+			found = true
+			bases := make(map[string]bool)
+			phases := make(map[string]bool)
+			methods := make(map[string]bool)
+			for _, m := range mf.GetMetric() {
+				for _, lp := range m.GetLabel() {
+					switch lp.GetName() {
+					case "cost_basis":
+						bases[lp.GetValue()] = true
+					case "phase":
+						phases[lp.GetValue()] = true
+					case "allocation_method":
+						methods[lp.GetValue()] = true
+					}
+				}
+			}
+			if !bases["usage"] || !bases["allocation"] {
+				t.Errorf("%s method=%s: missing cost_basis label values, got %v", name, method, bases)
+			}
+			// Should have 3 phase values: "" (blended), "prompt", "generation"
+			if !phases[""] || !phases["prompt"] || !phases["generation"] {
+				t.Errorf("%s method=%s: expected phases [\"\", \"prompt\", \"generation\"], got %v", name, method, phases)
+			}
+			// allocation_method should be present (for phase=prompt and phase=generation)
+			// and empty (for blended phase="")
+			if !methods[string(method)] || !methods[""] {
+				t.Errorf("%s: expected allocation_method values [%s, \"\"], got %v", name, method, methods)
+			}
+		}
+		if !found {
+			t.Errorf("method=%s: llm_cost_per_million_tokens metric not found", method)
+		}
+	}
+}
+
+// TestExporter_HelpStringsContainReconciliationNote verifies that usage-basis
+// metrics document that they do not reconcile to the bill. Both cost metric
+// families must be present; a missing family or a gather error fails the test
+// instead of letting it pass without checking anything.
+func TestExporter_HelpStringsContainReconciliationNote(t *testing.T) {
+	e, reg := newTestExporter(t)
+	e.Export([]*InferenceCost{sampleMetric(AllocationMethodComputeTime)})
+
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("gather: %v", err)
+	}
+	reconciliationKeyword := "does NOT reconcile"
+
+	checked := make(map[string]bool, 2)
+	for _, mf := range mfs {
+		name := mf.GetName()
+		if name != "llm_total_hourly_cost" && name != "llm_cost_per_million_tokens" {
+			continue
+		}
+		checked[name] = true
+		help := mf.GetHelp()
+		if !strings.Contains(help, reconciliationKeyword) {
+			t.Errorf("%s Help string should mention reconciliation, got: %q", name, help)
+		}
+	}
+	for _, name := range []string{"llm_total_hourly_cost", "llm_cost_per_million_tokens"} {
+		if !checked[name] {
+			t.Errorf("%s metric not found", name)
+		}
+	}
+}
+
+// TestExporter_CacheSavingsFraction verifies that llm_cache_savings_fraction is
+// exported as a single series whose value equals InferenceCost.CacheSavingsFraction.
+// A gather error or a missing metric family fails the test.
+func TestExporter_CacheSavingsFraction(t *testing.T) {
+	e, reg := newTestExporter(t)
+	ic := sampleMetric(AllocationMethodComputeTime)
+	ic.CacheSavingsFraction = 0.4
+	e.Export([]*InferenceCost{ic})
+
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("gather: %v", err)
+	}
+	for _, mf := range mfs {
+		if mf.GetName() != "llm_cache_savings_fraction" {
+			continue
+		}
+		if len(mf.GetMetric()) != 1 {
+			t.Fatalf("expected 1 series for llm_cache_savings_fraction, got %d", len(mf.GetMetric()))
+		}
+		val := mf.GetMetric()[0].GetGauge().GetValue()
+		if !floatEq(val, 0.4) {
+			t.Errorf("llm_cache_savings_fraction want 0.4 got %f", val)
+		}
+		// Family found and checked; nothing else to inspect.
+		return
+	}
+	t.Error("llm_cache_savings_fraction metric not found")
+}
+
+// TestExporter_Values verifies that exported llm_total_hourly_cost gauge values
+// match the InferenceCost total-cost fields (the fixture window is one hour, so
+// hourly rate equals total cost). The test fails when gathering errors out or
+// when either cost_basis series is missing, so it cannot pass vacuously.
+func TestExporter_Values(t *testing.T) {
+	e, reg := newTestExporter(t)
+	ic := sampleMetric(AllocationMethodComputeTime)
+	e.Export([]*InferenceCost{ic})
+
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("gather: %v", err)
+	}
+	seenBases := make(map[string]bool, 2)
+	for _, mf := range mfs {
+		if mf.GetName() != "llm_total_hourly_cost" {
+			continue
+		}
+		for _, m := range mf.GetMetric() {
+			var basis string
+			for _, lp := range m.GetLabel() {
+				if lp.GetName() == "cost_basis" {
+					basis = lp.GetValue()
+				}
+			}
+			seenBases[basis] = true
+			val := m.GetGauge().GetValue()
+			switch basis {
+			case "allocation":
+				if !floatEq(val, ic.AllocationTotalCost) {
+					t.Errorf("llm_total_hourly_cost allocation want %f got %f", ic.AllocationTotalCost, val)
+				}
+			case "usage":
+				if !floatEq(val, ic.UsageTotalCost) {
+					t.Errorf("llm_total_hourly_cost usage want %f got %f", ic.UsageTotalCost, val)
+				}
+			}
+		}
+	}
+	for _, basis := range []string{"allocation", "usage"} {
+		if !seenBases[basis] {
+			t.Errorf("llm_total_hourly_cost cost_basis=%s series not found", basis)
+		}
+	}
+}

+ 178 - 0
pkg/inferencecost/queryservice.go

@@ -0,0 +1,178 @@
+package inferencecost
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/julienschmidt/httprouter"
+	"github.com/opencost/opencost/core/pkg/opencost"
+	proto "github.com/opencost/opencost/core/pkg/protocol"
+	"github.com/opencost/opencost/core/pkg/util/httputil"
+)
+
+// protocol is the shared HTTP response helper for this package's handlers
+// (the same pattern is used in pkg/costmodel/router.go).
+var protocol = proto.HTTP()
+
+// QueryService serves the /inferenceCost HTTP endpoints. Nothing is stored:
+// every request drives the Collector and the Calculator to compute inference
+// costs on demand, in the same spirit as OpenCost's /allocation and /assets
+// APIs.
+//
+// Cost note: per sub-window a request triggers two ComputeAllocation calls,
+// one with idle sharing (allocation costs) and one without (usage costs).
+// /allocation likewise recomputes from Prometheus on each request and keeps
+// no result cache.
+type QueryService struct {
+	// collector gathers the raw per-window inference metrics.
+	collector *Collector
+	// calculator derives cost figures from the collected metrics.
+	calculator *Calculator
+}
+
+// NewQueryService wires a Collector and a Calculator into a QueryService.
+// Neither argument is validated here; handlers built from a service with a nil
+// collector or calculator answer with 501 Not Implemented.
+func NewQueryService(c *Collector, calc *Calculator) *QueryService {
+	svc := new(QueryService)
+	svc.collector = c
+	svc.calculator = calc
+	return svc
+}
+
+// GetInferenceCostTotalHandler builds the httprouter handler that serves
+// GET /inferenceCost/total: all step-windows of the query are accumulated
+// into a single total set before being written.
+//
+// Query parameters:
+//   - window (required): RFC3339 "start,end" or named range
+//   - costBasis: "usage" or "allocation" (default: allocation)
+//   - aggregate: comma-separated dimensions (model_name, model_version,
+//     namespace, cluster, pod, controller, controller_kind, container,
+//     workload_type)
+//   - accumulate: hour, day, week, month (optional; sub-window step size)
+//   - filter: prop:value[+prop:value] (optional; Phase-1 minimal, AND semantics)
+//
+// Responses: 501 when the service is not wired up, 400 for unparsable
+// parameters, 500 when the query itself fails.
+func (qs *QueryService) GetInferenceCostTotalHandler() func(http.ResponseWriter, *http.Request, httprouter.Params) {
+	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
+		ready := qs != nil && qs.collector != nil && qs.calculator != nil
+		if !ready {
+			http.Error(w, "InferenceCost query service not available", http.StatusNotImplemented)
+			return
+		}
+
+		params := httputil.NewQueryParams(r.URL.Query())
+		queryReq, parseErr := ParseInferenceCostRequest(params)
+		if parseErr != nil {
+			protocol.WriteError(w, protocol.BadRequest(parseErr.Error()))
+			return
+		}
+
+		costRange, queryErr := qs.query(r.Context(), queryReq)
+		if queryErr != nil {
+			msg := fmt.Sprintf("InferenceCost query failed: %s", queryErr)
+			protocol.WriteError(w, protocol.InternalServerError(msg))
+			return
+		}
+
+		// Collapse the per-step sets into one total before responding.
+		protocol.WriteData(w, accumulate(costRange.InferenceCostSets))
+	}
+}
+
+// GetInferenceCostTimeseriesHandler builds the httprouter handler that serves
+// GET /inferenceCost/timeseries: the per-step sets are written as-is, without
+// accumulation.
+//
+// Unlike /total, the 'accumulate' parameter is mandatory here because it sets
+// the step size of the series; every other parameter matches /total.
+//
+// Responses: 501 when the service is not wired up, 400 for unparsable
+// parameters, 500 when the query itself fails.
+func (qs *QueryService) GetInferenceCostTimeseriesHandler() func(http.ResponseWriter, *http.Request, httprouter.Params) {
+	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
+		ready := qs != nil && qs.collector != nil && qs.calculator != nil
+		if !ready {
+			http.Error(w, "InferenceCost query service not available", http.StatusNotImplemented)
+			return
+		}
+
+		params := httputil.NewQueryParams(r.URL.Query())
+		queryReq, parseErr := ParseInferenceCostTimeseriesRequest(params)
+		if parseErr != nil {
+			protocol.WriteError(w, protocol.BadRequest(parseErr.Error()))
+			return
+		}
+
+		costRange, queryErr := qs.query(r.Context(), queryReq)
+		if queryErr != nil {
+			msg := fmt.Sprintf("InferenceCost query failed: %s", queryErr)
+			protocol.WriteError(w, protocol.InternalServerError(msg))
+			return
+		}
+
+		protocol.WriteData(w, costRange)
+	}
+}
+
+// query drives the on-demand compute loop for a QueryRequest:
+//  1. Split [Start, End] into Step-sized sub-windows (the last one is
+//     truncated at End).
+//  2. For each sub-window: collect metrics from Prometheus + allocation layer,
+//     calculate costs, project to the requested cost basis, apply filter, aggregate.
+//  3. Return all sub-window sets in an InferenceCostSetRange.
+//
+// The caller decides whether to accumulate the range into a single total (/total)
+// or return the per-step sets (/timeseries).
+//
+// A non-positive req.Step is treated as "one step covering the whole window";
+// without that guard the loop below would never advance. The context is
+// checked before each sub-window so a cancelled request stops issuing further
+// work. The first sub-window error aborts the query and is returned wrapped
+// with the failing window's bounds.
+func (qs *QueryService) query(ctx context.Context, req *QueryRequest) (*InferenceCostSetRange, error) {
+	overallWindow := opencost.NewClosedWindow(req.Start, req.End)
+	sets := make([]*InferenceCostSet, 0)
+
+	step := req.Step
+	if step <= 0 {
+		// Only reachable when Start is before End matters: the loop below does
+		// not run at all otherwise, so this span is positive whenever it is used.
+		step = req.End.Sub(req.Start)
+	}
+
+	stepStart := req.Start
+	for stepStart.Before(req.End) {
+		if err := ctx.Err(); err != nil {
+			return nil, fmt.Errorf("inference cost query interrupted: %w", err)
+		}
+
+		stepEnd := stepStart.Add(step)
+		if stepEnd.After(req.End) {
+			stepEnd = req.End
+		}
+
+		stepWindow := opencost.NewClosedWindow(stepStart, stepEnd)
+		set, err := qs.computeStep(ctx, req, stepWindow)
+		if err != nil {
+			return nil, fmt.Errorf("computing window [%s, %s]: %w", stepStart.Format(time.RFC3339), stepEnd.Format(time.RFC3339), err)
+		}
+		sets = append(sets, set)
+
+		stepStart = stepEnd
+	}
+
+	return &InferenceCostSetRange{
+		InferenceCostSets: sets,
+		Window:            overallWindow,
+	}, nil
+}
+
+// computeStep produces the InferenceCostSet for one sub-window: it collects
+// metrics, lets the calculator derive costs, converts each entry to a response
+// in the requested cost basis, drops entries rejected by the filter, and
+// finally aggregates the set by req.AggregateBy.
+func (qs *QueryService) computeStep(ctx context.Context, req *QueryRequest, win opencost.Window) (*InferenceCostSet, error) {
+	// Raw metrics for exactly this sub-window.
+	from, to := *win.Start(), *win.End()
+	collected, err := qs.collector.CollectMetrics(ctx, from, to)
+	if err != nil {
+		return nil, fmt.Errorf("collecting metrics: %w", err)
+	}
+
+	// Derived figures: per-million rates and the input/output split.
+	qs.calculator.CalculateCosts(collected)
+
+	result := newInferenceCostSet(win)
+	for _, cost := range collected {
+		entry := newInferenceCostResponse(cost, req.CostBasis, win)
+
+		// Phase-1 filter: AND semantics over the supported properties.
+		if matchesFilter(entry, req.Filter) {
+			// Store under the natural key (nil aggregate list); aggregate()
+			// re-keys later when AggregateBy is set.
+			naturalKey, keyErr := aggKey(entry.Properties, nil)
+			if keyErr != nil {
+				return nil, keyErr
+			}
+			result.InferenceCosts[naturalKey] = entry
+		}
+	}
+
+	// Roll the entries up into the requested dimensions.
+	if aggErr := result.aggregate(req.AggregateBy); aggErr != nil {
+		return nil, fmt.Errorf("aggregating results: %w", aggErr)
+	}
+
+	return result, nil
+}

+ 132 - 0
pkg/inferencecost/queryservice_helper.go

@@ -0,0 +1,132 @@
+package inferencecost
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	"github.com/opencost/opencost/core/pkg/util/mapper"
+	"github.com/opencost/opencost/core/pkg/util/timeutil"
+)
+
+// QueryRequest is the parsed, validated form of an inference cost query.
+type QueryRequest struct {
+	// Start and End bound the requested window.
+	Start time.Time
+	End   time.Time
+	// CostBasis selects usage or allocation costs.
+	CostBasis CostBasis
+	// AggregateBy lists the dimensions to aggregate results by.
+	AggregateBy []string
+	// Accumulate is the parsed 'accumulate' option.
+	Accumulate opencost.AccumulateOption
+	// Filter holds the parsed 'filter' specs.
+	Filter []filterSpec
+	// Step is the sub-window size derived from Accumulate.
+	Step time.Duration
+}
+
+// ParseInferenceCostRequest turns the shared HTTP query parameters into a
+// QueryRequest, following the approach of
+// customcost.ParseCustomCostTotalRequest.
+//
+// Parameters:
+//   - window (required): RFC3339 "start,end" or a named range; must be closed.
+//   - costBasis (optional): "usage" or "allocation"; defaults to allocation.
+//   - aggregate (optional): comma-separated list of supported dimensions.
+//   - accumulate (optional): parsed with opencost.ParseAccumulate.
+//   - filter (optional): parsed with parseFilter.
+//
+// Step is derived from the accumulate option; without one it spans the whole
+// window. An error is returned for a missing, invalid or open window, an
+// unknown costBasis, an unsupported aggregate dimension, or an invalid filter.
+func ParseInferenceCostRequest(qp mapper.PrimitiveMap) (*QueryRequest, error) {
+	// window: mandatory, and must have both ends.
+	rawWindow := qp.Get("window", "")
+	if rawWindow == "" {
+		return nil, fmt.Errorf("missing required 'window' parameter")
+	}
+	window, err := opencost.ParseWindowUTC(rawWindow)
+	if err != nil {
+		return nil, fmt.Errorf("invalid 'window' parameter: %w", err)
+	}
+	if window.IsOpen() {
+		return nil, fmt.Errorf("invalid 'window' parameter: window must have both start and end: %s", rawWindow)
+	}
+
+	// costBasis: allocation unless stated otherwise.
+	rawBasis := qp.Get("costBasis", string(CostBasisAllocation))
+	var basis CostBasis
+	if rawBasis == string(CostBasisUsage) {
+		basis = CostBasisUsage
+	} else if rawBasis == string(CostBasisAllocation) {
+		basis = CostBasisAllocation
+	} else {
+		return nil, fmt.Errorf("invalid 'costBasis' parameter %q: must be %q or %q", rawBasis, CostBasisUsage, CostBasisAllocation)
+	}
+
+	// aggregate: every requested dimension must be supported.
+	dimensions := qp.GetList("aggregate", ",")
+	for i := range dimensions {
+		if supportedAggregateProperties[dimensions[i]] {
+			continue
+		}
+		return nil, fmt.Errorf("unsupported 'aggregate' dimension %q: supported dimensions are model_name, model_version, namespace, cluster, pod, controller, controller_kind, container, workload_type", dimensions[i])
+	}
+
+	// accumulate: an absent value yields the "none" option.
+	accumulateOpt := opencost.ParseAccumulate(qp.Get("accumulate", ""))
+
+	// filter: optional.
+	filters, err := parseFilter(qp.Get("filter", ""))
+	if err != nil {
+		return nil, fmt.Errorf("invalid 'filter' parameter: %w", err)
+	}
+
+	start, end := *window.Start(), *window.End()
+
+	parsed := &QueryRequest{
+		Start:       start,
+		End:         end,
+		CostBasis:   basis,
+		AggregateBy: dimensions,
+		Accumulate:  accumulateOpt,
+		Filter:      filters,
+		// Without accumulation the step covers the full window (one data point).
+		Step: stepFromAccumulate(accumulateOpt, start, end),
+	}
+	return parsed, nil
+}
+
+// ParseInferenceCostTimeseriesRequest parses like ParseInferenceCostRequest and
+// additionally insists on a usable 'accumulate' parameter, because a
+// timeseries needs a defined step. It fails when the parameter is absent or
+// when it parses to opencost.AccumulateOptionNone.
+func ParseInferenceCostTimeseriesRequest(qp mapper.PrimitiveMap) (*QueryRequest, error) {
+	parsed, err := ParseInferenceCostRequest(qp)
+	if err != nil {
+		return nil, err
+	}
+
+	switch {
+	case qp.Get("accumulate", "") == "":
+		return nil, fmt.Errorf("missing required 'accumulate' parameter for timeseries endpoint: must be one of hour, day, week, month")
+	case parsed.Accumulate == opencost.AccumulateOptionNone:
+		return nil, fmt.Errorf("invalid 'accumulate' parameter for timeseries endpoint: must be one of hour, day, week, month")
+	}
+	return parsed, nil
+}
+
+// stepFromAccumulate maps an AccumulateOption to its step duration: one hour,
+// one day, one week, or 30 days for month. Any other option (including
+// AccumulateOptionNone) yields the full end-start span so the query loop makes
+// a single pass; if that span is not positive, one day is returned instead.
+//
+// The helper duplicates the package-private resolveStepFromQuery /
+// resolveStepForAccumulate logic of pkg/costmodel on purpose, so this domain
+// package stays self-contained.
+func stepFromAccumulate(opt opencost.AccumulateOption, start, end time.Time) time.Duration {
+	switch opt {
+	case opencost.AccumulateOptionHour:
+		return time.Hour
+	case opencost.AccumulateOptionDay:
+		return timeutil.Day
+	case opencost.AccumulateOptionWeek:
+		return timeutil.Week
+	case opencost.AccumulateOptionMonth:
+		// A month is stepped as 30 days, consistent with how OpenCost's
+		// CustomCost and CloudCost handle monthly accumulation.
+		return timeutil.Day * 30
+	}
+
+	// No accumulation: one window over the whole request range.
+	if span := end.Sub(start); span > 0 {
+		return span
+	}
+	return timeutil.Day
+}

+ 171 - 0
pkg/inferencecost/queryservice_helper_test.go

@@ -0,0 +1,171 @@
+package inferencecost
+
+import (
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	"github.com/opencost/opencost/core/pkg/util/httputil"
+)
+
+// makeQP wraps a flat key/value map in an httputil.QueryParams, giving every
+// key a single-element value list, so parser tests need no HTTP request.
+func makeQP(pairs map[string]string) httputil.QueryParams {
+	values := make(map[string][]string, len(pairs))
+	for key := range pairs {
+		values[key] = []string{pairs[key]}
+	}
+	return httputil.NewQueryParams(values)
+}
+
+// validWindowStr returns a well-formed RFC3339 window string covering [yesterday, now].
+func validWindowStr() string {
+	now := time.Now().UTC().Truncate(time.Hour)
+	start := now.Add(-24 * time.Hour)
+	return start.Format(time.RFC3339) + "," + now.Format(time.RFC3339)
+}
+
+// --- ParseInferenceCostRequest ---
+
+// TestParseInferenceCostRequest_MissingWindow checks that parsing fails when
+// no query parameters (and therefore no window) are supplied.
+func TestParseInferenceCostRequest_MissingWindow(t *testing.T) {
+	noParams := makeQP(map[string]string{})
+	if _, err := ParseInferenceCostRequest(noParams); err == nil {
+		t.Fatal("expected error for missing window, got nil")
+	}
+}
+
+// TestParseInferenceCostRequest_InvalidWindow checks that a window value that
+// is not a parseable window is rejected.
+func TestParseInferenceCostRequest_InvalidWindow(t *testing.T) {
+	params := makeQP(map[string]string{"window": "not-a-window"})
+	if _, err := ParseInferenceCostRequest(params); err == nil {
+		t.Fatal("expected error for invalid window, got nil")
+	}
+}
+
+// TestParseInferenceCostRequest_DefaultBasis checks that omitting costBasis
+// yields CostBasisAllocation.
+func TestParseInferenceCostRequest_DefaultBasis(t *testing.T) {
+	params := makeQP(map[string]string{"window": validWindowStr()})
+	req, err := ParseInferenceCostRequest(params)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := req.CostBasis; got != CostBasisAllocation {
+		t.Errorf("default CostBasis = %q, want %q", got, CostBasisAllocation)
+	}
+}
+
+// TestParseInferenceCostRequest_UsageBasis checks that costBasis=usage is
+// parsed into CostBasisUsage.
+func TestParseInferenceCostRequest_UsageBasis(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":    validWindowStr(),
+		"costBasis": "usage",
+	})
+	req, err := ParseInferenceCostRequest(params)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := req.CostBasis; got != CostBasisUsage {
+		t.Errorf("CostBasis = %q, want %q", got, CostBasisUsage)
+	}
+}
+
+// TestParseInferenceCostRequest_InvalidCostBasis checks that an unknown
+// costBasis value is rejected.
+func TestParseInferenceCostRequest_InvalidCostBasis(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":    validWindowStr(),
+		"costBasis": "bogus",
+	})
+	if _, err := ParseInferenceCostRequest(params); err == nil {
+		t.Fatal("expected error for invalid costBasis, got nil")
+	}
+}
+
+// TestParseInferenceCostRequest_ValidAggregation checks that a two-entry
+// comma-separated aggregate list produces two AggregateBy entries.
+func TestParseInferenceCostRequest_ValidAggregation(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":    validWindowStr(),
+		"aggregate": "model_name,namespace",
+	})
+	req, err := ParseInferenceCostRequest(params)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(req.AggregateBy); n != 2 {
+		t.Errorf("AggregateBy len = %d, want 2", n)
+	}
+}
+
+// TestParseInferenceCostRequest_UnsupportedAggregateDimension checks that the
+// aggregate dimension "product" is rejected.
+func TestParseInferenceCostRequest_UnsupportedAggregateDimension(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":    validWindowStr(),
+		"aggregate": "product",
+	})
+	if _, err := ParseInferenceCostRequest(params); err == nil {
+		t.Fatal("expected error for unsupported aggregate dimension, got nil")
+	}
+}
+
+// TestParseInferenceCostRequest_ValidFilter checks that a single
+// property:"value" filter is parsed into exactly one filter spec carrying that
+// property and value.
+func TestParseInferenceCostRequest_ValidFilter(t *testing.T) {
+	req, err := ParseInferenceCostRequest(makeQP(map[string]string{
+		"window": validWindowStr(),
+		"filter": `namespace:"llm-prod"`,
+	}))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	// Fatal rather than Error: the index below would panic on an empty slice
+	// and hide the real failure behind a runtime error.
+	if len(req.Filter) != 1 {
+		t.Fatalf("Filter len = %d, want 1", len(req.Filter))
+	}
+	if req.Filter[0].property != "namespace" || req.Filter[0].value != "llm-prod" {
+		t.Errorf("Filter[0] = %+v, want {namespace, llm-prod}", req.Filter[0])
+	}
+}
+
+// TestParseInferenceCostRequest_InvalidFilterProperty checks that a filter on
+// the property "product" is rejected.
+func TestParseInferenceCostRequest_InvalidFilterProperty(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window": validWindowStr(),
+		"filter": `product:"team-a"`,
+	})
+	if _, err := ParseInferenceCostRequest(params); err == nil {
+		t.Fatal("expected error for unsupported filter property, got nil")
+	}
+}
+
+// TestParseInferenceCostRequest_AccumulateDay checks that accumulate=day sets
+// both the Accumulate option and a 24-hour Step.
+func TestParseInferenceCostRequest_AccumulateDay(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":     validWindowStr(),
+		"accumulate": "day",
+	})
+	req, err := ParseInferenceCostRequest(params)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got, want := req.Accumulate, opencost.AccumulateOptionDay; got != want {
+		t.Errorf("Accumulate = %q, want %q", got, want)
+	}
+	if wantStep := 24 * time.Hour; req.Step != wantStep {
+		t.Errorf("Step = %s, want 24h", req.Step)
+	}
+}
+
+// --- ParseInferenceCostTimeseriesRequest ---
+
+// TestParseInferenceCostTimeseriesRequest_MissingAccumulate checks that the
+// timeseries parser rejects a request that has a window but no accumulate.
+func TestParseInferenceCostTimeseriesRequest_MissingAccumulate(t *testing.T) {
+	params := makeQP(map[string]string{"window": validWindowStr()})
+	if _, err := ParseInferenceCostTimeseriesRequest(params); err == nil {
+		t.Fatal("expected error when accumulate is missing for timeseries, got nil")
+	}
+}
+
+// TestParseInferenceCostTimeseriesRequest_Valid checks that accumulate=hour is
+// accepted by the timeseries parser and yields a one-hour Step.
+func TestParseInferenceCostTimeseriesRequest_Valid(t *testing.T) {
+	params := makeQP(map[string]string{
+		"window":     validWindowStr(),
+		"accumulate": "hour",
+	})
+	req, err := ParseInferenceCostTimeseriesRequest(params)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got, want := req.Accumulate, opencost.AccumulateOptionHour; got != want {
+		t.Errorf("Accumulate = %q, want %q", got, want)
+	}
+	if got := req.Step; got != time.Hour {
+		t.Errorf("Step = %s, want 1h", got)
+	}
+}

+ 331 - 0
pkg/inferencecost/queryservice_test.go

@@ -0,0 +1,331 @@
+package inferencecost
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/opencost"
+	"github.com/opencost/opencost/core/pkg/source"
+)
+
+// --- Fake Prometheus server ---
+
+// promSample is one series (model_name/namespace label pair plus value) in a fake Prometheus response.
+type promSample struct {
+	modelName string  // emitted as the "model_name" metric label by promInstantJSON
+	namespace string  // emitted as the "namespace" metric label by promInstantJSON
+	value     float64 // sample value; promInstantJSON renders it with %f
+}
+
+// promInstantJSON renders samples as a Prometheus API v1 instant-query
+// response body: a "success" envelope around a "vector" result in which every
+// sample carries its model_name and namespace labels and a fixed timestamp of 1.
+func promInstantJSON(samples []promSample) string {
+	type vectorEntry struct {
+		Metric map[string]string `json:"metric"`
+		Value  [2]interface{}    `json:"value"`
+	}
+
+	// Sized up front; an empty input still marshals as [] rather than null.
+	entries := make([]vectorEntry, len(samples))
+	for i := range samples {
+		labels := map[string]string{
+			"model_name": samples[i].modelName,
+			"namespace":  samples[i].namespace,
+		}
+		rendered := fmt.Sprintf("%f", samples[i].value)
+		entries[i] = vectorEntry{
+			Metric: labels,
+			Value:  [2]interface{}{1.0, rendered},
+		}
+	}
+
+	envelope := map[string]interface{}{
+		"status": "success",
+		"data": map[string]interface{}{
+			"resultType": "vector",
+			"result":     entries,
+		},
+	}
+	// The marshal error is ignored, as before: the payload is built only from
+	// strings, maps and a numeric constant.
+	encoded, _ := json.Marshal(envelope)
+	return string(encoded)
+}
+
+// newFakePromServer starts an *httptest.Server that answers every request,
+// whatever its path or method, with HTTP 200 and the instant-query JSON built
+// from samples. The caller is responsible for closing the server.
+func newFakePromServer(t *testing.T, samples []promSample) *httptest.Server {
+	t.Helper()
+	payload := promInstantJSON(samples)
+	respond := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte(payload)) //nolint:errcheck
+	}
+	return httptest.NewServer(http.HandlerFunc(respond))
+}
+
+// --- HTTP handler tests (do not require real Prometheus / allocation data) ---
+
+// TestQueryService_NilCollector_Returns501 checks that both the total and the
+// timeseries handlers answer 501 when the QueryService has no collector or
+// calculator. The same request is reused for both handlers.
+func TestQueryService_NilCollector_Returns501(t *testing.T) {
+	svc := &QueryService{collector: nil, calculator: nil}
+	totalHandler := svc.GetInferenceCostTotalHandler()
+
+	req, _ := http.NewRequest(http.MethodGet, "/inferenceCost/total?window=2024-01-01T00:00:00Z,2024-01-02T00:00:00Z", nil)
+	totalRec := httptest.NewRecorder()
+	totalHandler(totalRec, req, nil)
+
+	if got := totalRec.Code; got != http.StatusNotImplemented {
+		t.Errorf("total handler: status = %d, want %d (501)", got, http.StatusNotImplemented)
+	}
+
+	seriesHandler := svc.GetInferenceCostTimeseriesHandler()
+	seriesRec := httptest.NewRecorder()
+	seriesHandler(seriesRec, req, nil)
+	if got := seriesRec.Code; got != http.StatusNotImplemented {
+		t.Errorf("timeseries handler: status = %d, want %d (501)", got, http.StatusNotImplemented)
+	}
+}
+
+// TestQueryService_MissingWindow_Returns400 checks that the total handler
+// answers 400 when the request carries no window parameter.
+func TestQueryService_MissingWindow_Returns400(t *testing.T) {
+	cfg := baseConfig()
+	allocations := &mockQuerier{set: opencost.NewAllocationSet(time.Now().Add(-time.Hour), time.Now())}
+	metrics := source.NewMockMetricsQuerier()
+	collector, err := NewCollector(cfg, allocations, metrics)
+	if err != nil {
+		t.Fatalf("NewCollector: %v", err)
+	}
+	svc := NewQueryService(collector, NewCalculator(cfg))
+
+	totalHandler := svc.GetInferenceCostTotalHandler()
+	// Deliberately no query string: the window is what is under test.
+	req, _ := http.NewRequest(http.MethodGet, "/inferenceCost/total", nil)
+	rec := httptest.NewRecorder()
+	totalHandler(rec, req, nil)
+
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Errorf("status = %d, want 400", got)
+	}
+}
+
+// TestQueryService_InvalidCostBasis_Returns400 checks that the total handler
+// answers 400 when costBasis holds an unknown value.
+func TestQueryService_InvalidCostBasis_Returns400(t *testing.T) {
+	cfg := baseConfig()
+	querier := &mockQuerier{set: opencost.NewAllocationSet(time.Now().Add(-time.Hour), time.Now())}
+	metricsQuerier := source.NewMockMetricsQuerier()
+	// Fail fast on a construction error instead of continuing with a nil
+	// collector, which would show up as a confusing status mismatch below.
+	collector, err := NewCollector(cfg, querier, metricsQuerier)
+	if err != nil {
+		t.Fatalf("NewCollector: %v", err)
+	}
+	qs := NewQueryService(collector, NewCalculator(cfg))
+
+	handler := qs.GetInferenceCostTotalHandler()
+	req, err := http.NewRequest(http.MethodGet, "/inferenceCost/total?window=2024-01-01T00:00:00Z,2024-01-02T00:00:00Z&costBasis=bogus", nil)
+	if err != nil {
+		t.Fatalf("NewRequest: %v", err)
+	}
+	rr := httptest.NewRecorder()
+	handler(rr, req, nil)
+
+	if rr.Code != http.StatusBadRequest {
+		t.Errorf("status = %d, want 400 for invalid costBasis", rr.Code)
+	}
+}
+
+// TestQueryService_TimeseriesMissingAccumulate_Returns400 checks that the
+// timeseries handler answers 400 when the request has a window but no
+// accumulate parameter.
+func TestQueryService_TimeseriesMissingAccumulate_Returns400(t *testing.T) {
+	srv := newFakePromServer(t, nil)
+	defer srv.Close()
+	cfg := baseConfig()
+	cfg.PrometheusURL = srv.URL
+	querier := &mockQuerier{set: opencost.NewAllocationSet(time.Now().Add(-time.Hour), time.Now())}
+	metricsQuerier := source.NewMockMetricsQuerier()
+	// Fail fast on a construction error instead of continuing with a nil
+	// collector, which would show up as a confusing status mismatch below.
+	collector, err := NewCollector(cfg, querier, metricsQuerier)
+	if err != nil {
+		t.Fatalf("NewCollector: %v", err)
+	}
+	qs := NewQueryService(collector, NewCalculator(cfg))
+
+	handler := qs.GetInferenceCostTimeseriesHandler()
+	req, err := http.NewRequest(http.MethodGet, "/inferenceCost/timeseries?window=2024-01-01T00:00:00Z,2024-01-02T00:00:00Z", nil)
+	if err != nil {
+		t.Fatalf("NewRequest: %v", err)
+	}
+	rr := httptest.NewRecorder()
+	handler(rr, req, nil)
+
+	if rr.Code != http.StatusBadRequest {
+		t.Errorf("status = %d, want 400 for missing accumulate", rr.Code)
+	}
+}
+
+// --- query() unit tests using pre-built InferenceCost structs ---
+//
+// computeStep calls CollectMetrics → CalculateCosts → project/filter/aggregate.
+// We can test the projection and aggregation layers directly by using
+// combineMetrics with pre-built allocationResult and token maps (bypassing the
+// AggregateBy path in the real collector which requires a live K8s label setup).
+
+// TestQuery_SingleStep_ReturnsOneSet checks that a one-hour range queried at a
+// one-hour step produces exactly one InferenceCostSet.
+func TestQuery_SingleStep_ReturnsOneSet(t *testing.T) {
+	svc, srv := newQueryServiceWithDirectMetrics(t, "llama-3", "llm-prod", 10.0, 1_000_000)
+	defer srv.Close()
+
+	end := time.Now().UTC()
+	result, err := svc.query(context.Background(), &QueryRequest{
+		Start:     end.Add(-time.Hour),
+		End:       end,
+		CostBasis: CostBasisAllocation,
+		Step:      time.Hour,
+	})
+	if err != nil {
+		t.Fatalf("query error: %v", err)
+	}
+	if n := len(result.InferenceCostSets); n != 1 {
+		t.Errorf("expected 1 set for a single step, got %d", n)
+	}
+}
+
+// TestQuery_TwoSteps_ReturnsTwoSets checks that a two-hour range queried at a
+// one-hour step produces exactly two InferenceCostSets.
+func TestQuery_TwoSteps_ReturnsTwoSets(t *testing.T) {
+	svc, srv := newQueryServiceWithDirectMetrics(t, "llama-3", "llm-prod", 5.0, 500_000)
+	defer srv.Close()
+
+	end := time.Now().UTC().Truncate(time.Hour)
+	result, err := svc.query(context.Background(), &QueryRequest{
+		Start:      end.Add(-2 * time.Hour),
+		End:        end,
+		CostBasis:  CostBasisAllocation,
+		Accumulate: opencost.AccumulateOptionHour,
+		Step:       time.Hour,
+	})
+	if err != nil {
+		t.Fatalf("query error: %v", err)
+	}
+	if n := len(result.InferenceCostSets); n != 2 {
+		t.Errorf("expected 2 sets for 2-hour range at 1-hour step, got %d", n)
+	}
+}
+
+// TestQuery_Filter_ExcludesNonMatchingNamespace checks that filtering on a
+// namespace other than the one the fixture provides leaves no entries in the
+// accumulated result.
+func TestQuery_Filter_ExcludesNonMatchingNamespace(t *testing.T) {
+	qs, srv := newQueryServiceWithDirectMetrics(t, "llama-3", "llm-prod", 10.0, 1_000_000)
+	defer srv.Close()
+
+	// A filter that failed to parse must stop the test: the assertions below
+	// are only meaningful when the filter is really applied.
+	filterSpecs, err := parseFilter(`namespace:"other-ns"`)
+	if err != nil {
+		t.Fatalf("parseFilter: %v", err)
+	}
+	now := time.Now().UTC()
+	req := &QueryRequest{
+		Start:     now.Add(-time.Hour),
+		End:       now,
+		CostBasis: CostBasisAllocation,
+		Step:      time.Hour,
+		Filter:    filterSpecs,
+	}
+
+	setRange, err := qs.query(context.Background(), req)
+	if err != nil {
+		t.Fatalf("query error: %v", err)
+	}
+	total := accumulate(setRange.InferenceCostSets)
+	if len(total.InferenceCosts) != 0 {
+		t.Errorf("expected 0 entries after filtering to non-matching namespace, got %d", len(total.InferenceCosts))
+	}
+}
+
+// TestQuery_Filter_MatchingNamespace checks that filtering on the namespace
+// the fixture provides keeps at least one entry in the accumulated result.
+func TestQuery_Filter_MatchingNamespace(t *testing.T) {
+	qs, srv := newQueryServiceWithDirectMetrics(t, "llama-3", "llm-prod", 10.0, 1_000_000)
+	defer srv.Close()
+
+	// Without this check a parse failure would leave filterSpecs empty and the
+	// test could pass without any filter having been applied.
+	filterSpecs, err := parseFilter(`namespace:"llm-prod"`)
+	if err != nil {
+		t.Fatalf("parseFilter: %v", err)
+	}
+	now := time.Now().UTC()
+	req := &QueryRequest{
+		Start:     now.Add(-time.Hour),
+		End:       now,
+		CostBasis: CostBasisAllocation,
+		Step:      time.Hour,
+		Filter:    filterSpecs,
+	}
+
+	setRange, err := qs.query(context.Background(), req)
+	if err != nil {
+		t.Fatalf("query error: %v", err)
+	}
+	total := accumulate(setRange.InferenceCostSets)
+	if len(total.InferenceCosts) == 0 {
+		t.Error("expected at least one entry after filtering to matching namespace, got 0")
+	}
+}
+
+// TestQuery_UsageBasis_LowerThanAllocation runs the same one-hour query under
+// both cost bases and checks that the summed usage-basis TotalCost does not
+// exceed the summed allocation-basis TotalCost.
+func TestQuery_UsageBasis_LowerThanAllocation(t *testing.T) {
+	// Usage costs should be <= allocation costs (no idle sharing).
+	// The fake allocation querier returns fixed costs; the multiplier calculator
+	// sets both basis costs from the allocationResult directly.
+	qs, srv := newQueryServiceWithDirectMetrics(t, "llama-3", "llm-prod", 10.0, 1_000_000)
+	defer srv.Close()
+
+	end := time.Now().UTC()
+
+	// totalFor sums TotalCost over the accumulated result for one basis.
+	totalFor := func(basis CostBasis) float64 {
+		setRange, err := qs.query(context.Background(), &QueryRequest{
+			Start:     end.Add(-time.Hour),
+			End:       end,
+			CostBasis: basis,
+			Step:      time.Hour,
+		})
+		if err != nil {
+			t.Fatalf("query error: %v", err)
+		}
+		combined := accumulate(setRange.InferenceCostSets)
+		var sum float64
+		for _, ic := range combined.InferenceCosts {
+			sum += ic.TotalCost
+		}
+		return sum
+	}
+
+	allocCost := totalFor(CostBasisAllocation)
+	usageCost := totalFor(CostBasisUsage)
+
+	// With ShareNone for usage (default config), usage cost ≤ allocation cost.
+	if usageCost > allocCost {
+		t.Errorf("usageCost (%.4f) > allocCost (%.4f): unexpected", usageCost, allocCost)
+	}
+}
+
+// newQueryServiceWithDirectMetrics builds a QueryService backed by a fake
+// Prometheus server and an allocation querier that returns a pre-built
+// AllocationSet with the model label set in its sanitized form.
+//
+// OpenCost's AggregateBy("label:llm-d.ai/model") sanitizes the label key
+// via promutil.SanitizeLabelName before lookup, converting non-alphanumeric
+// characters to underscores. So we must store the label under the sanitized
+// key "llm_d_ai_model" for the allocation to be found and named correctly
+// after aggregation.
+//
+// The fake server reports tokenCount for the modelName/namespace series and
+// the single allocation carries allocCost as its GPU cost. On success the
+// caller owns the returned server and must Close it; if the collector cannot
+// be built, the server is closed here before the test is failed.
+func newQueryServiceWithDirectMetrics(t *testing.T, modelName, namespace string, allocCost, tokenCount float64) (*QueryService, *httptest.Server) {
+	t.Helper()
+
+	samples := []promSample{
+		{modelName: modelName, namespace: namespace, value: tokenCount},
+	}
+	srv := newFakePromServer(t, samples)
+
+	cfg := baseConfig()
+	cfg.PrometheusURL = srv.URL
+	cfg.AllocationMode = AllocationModeMultiplier
+	cfg.OutputTokenCostMultiplier = 2.5
+	// cfg.ModelLabel = "llm-d.ai/model" (default from baseConfig)
+
+	// The sanitized form of "llm-d.ai/model" that AggregateBy uses for lookup.
+	sanitizedLabelKey := "llm_d_ai_model"
+
+	now := time.Now()
+	allocations := opencost.NewAllocationSet(now.Add(-time.Hour), now)
+	allocations.Set(&opencost.Allocation{
+		Name:    modelName,
+		GPUCost: allocCost,
+		Properties: &opencost.AllocationProperties{
+			Namespace: namespace,
+			Labels: opencost.AllocationLabels{
+				sanitizedLabelKey: modelName,
+			},
+		},
+	})
+	querier := &mockQuerier{set: allocations}
+
+	metricsQuerier := source.NewMockMetricsQuerier()
+	collector, err := NewCollector(cfg, querier, metricsQuerier)
+	if err != nil {
+		// t.Fatalf never returns, so the caller's deferred Close would never
+		// be registered; release the listener here.
+		srv.Close()
+		t.Fatalf("NewCollector: %v", err)
+	}
+	calculator := NewCalculator(cfg)
+	return NewQueryService(collector, calculator), srv
+}

+ 59 - 0
pkg/inferencecost/runner.go

@@ -0,0 +1,59 @@
+package inferencecost
+
+import (
+	"context"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/log"
+)
+
+// Runner periodically drives the collect → calculate → export pipeline, one pass per interval.
+type Runner struct {
+	collector  *Collector    // supplies the metrics for each pass (CollectMetrics)
+	calculator *Calculator   // computes costs on the collected metrics (CalculateCosts)
+	exporter   *Exporter     // publishes the processed metrics (Export)
+	interval   time.Duration // ticker period in Start; also the look-back window of each pass
+}
+
+// NewRunner assembles a Runner from its three pipeline stages and the interval
+// between passes. The exporter must already be registered with Prometheus
+// (call exporter.Register() before NewRunner).
+func NewRunner(collector *Collector, calculator *Calculator, exporter *Exporter, interval time.Duration) *Runner {
+	r := new(Runner)
+	r.collector = collector
+	r.calculator = calculator
+	r.exporter = exporter
+	r.interval = interval
+	return r
+}
+
+// Start runs one collection pass immediately and then one per interval tick,
+// blocking until ctx is cancelled.
+//
+// A non-positive interval is rejected up front with an error log and no pass
+// is run: time.NewTicker panics on such a value, and runOnce would be handed
+// an empty or inverted look-back window.
+func (r *Runner) Start(ctx context.Context) {
+	if r.interval <= 0 {
+		log.Errorf("InferenceCost: collection loop not started: invalid interval %s", r.interval)
+		return
+	}
+
+	log.Infof("InferenceCost: starting collection loop (interval=%s)", r.interval)
+	r.runOnce(ctx)
+
+	ticker := time.NewTicker(r.interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			log.Infof("InferenceCost: collection loop stopped")
+			return
+		case <-ticker.C:
+			r.runOnce(ctx)
+		}
+	}
+}
+
+// runOnce performs a single collect → calculate → export pass over the window
+// that ends now and reaches back one interval. A collection error is logged
+// and the pass is abandoned without exporting anything.
+func (r *Runner) runOnce(ctx context.Context) {
+	windowEnd := time.Now()
+	windowStart := windowEnd.Add(-r.interval)
+
+	collected, err := r.collector.CollectMetrics(ctx, windowStart, windowEnd)
+	if err != nil {
+		log.Errorf("InferenceCost: collection failed: %v", err)
+		return
+	}
+
+	r.calculator.CalculateCosts(collected)
+	r.exporter.Export(collected)
+	log.Debugf("InferenceCost: exported metrics for %d models", len(collected))
+}

+ 174 - 0
pkg/inferencecost/types.go

@@ -0,0 +1,174 @@
+package inferencecost
+
+import "time"
+
+// CostBasis names the costing model (usage or allocation); it also keys the per-basis maps on InferenceCost.
+type CostBasis string
+
+const (
+	// CostBasisUsage measures actual resource consumption only.
+	// It does not reconcile to the infrastructure bill — idle and waste are unattributed.
+	CostBasisUsage CostBasis = "usage"
+
+	// CostBasisAllocation measures max(request,usage) × runtime × price plus idle
+	// (ShareWeighted) and shared infra (EPP, gateway). It reconciles to the bill.
+	CostBasisAllocation CostBasis = "allocation"
+)
+
+// AllocationMethod records which method produced the input/output cost split.
+type AllocationMethod string
+
+const (
+	// AllocationMethodComputeTime splits costs proportionally by vLLM prefill and
+	// decode time. inputCostPerMillionTokens uses PromptTokens as the denominator
+	// (delivered tokens); see cacheSavingsFraction for KV cache benefit.
+	AllocationMethodComputeTime AllocationMethod = "compute_time"
+
+	// AllocationMethodPrefixCachingOff splits by input/output token time; prefix
+	// caching is explicitly disabled on the vLLM instance. cacheSavingsFraction
+	// is then zero by configuration, not because no cache hits occurred.
+	AllocationMethodPrefixCachingOff AllocationMethod = "prefix_caching_off"
+
+	// AllocationMethodMultiplier splits using a fixed output/input cost multiplier
+	// (used when vLLM timing metrics are unavailable).
+	AllocationMethodMultiplier AllocationMethod = "multiplier"
+)
+
+// InferenceCostProperties is the set of identifying dimensions of one inference cost entity.
+type InferenceCostProperties struct {
+	ModelName      string
+	ModelVersion   string
+	Namespace      string
+	Cluster        string
+	Pod            string
+	Controller     string
+	ControllerKind string
+	Container      string
+	WorkloadType   string // NOTE(review): the set of valid values is not visible here — confirm
+}
+
+// InferenceCost holds all cost data for a single model/namespace over a
+// collection interval: identity, window, raw costs, token counts and derived metrics.
+type InferenceCost struct {
+	Properties InferenceCostProperties
+
+	// Window is the time range over which these costs were collected.
+	// Used to normalize total costs to hourly rates for Prometheus metrics.
+	Window struct {
+		Start time.Time
+		End   time.Time
+	}
+
+	// Costs from OpenCost allocation layer.
+	// AllocationTotalCost = max(request,usage)×price + idle share + shared infra share.
+	AllocationTotalCost float64
+	// UsageTotalCost = actual_usage×price only; does not reconcile to bill.
+	UsageTotalCost float64
+
+	// Token counts from vLLM Prometheus metrics.
+	PromptTokens     float64
+	GenerationTokens float64
+	TotalTokens      float64 // presumably PromptTokens + GenerationTokens — TODO confirm
+
+	// Processing times from vLLM Prometheus metrics (seconds in collection window).
+	InputProcessingTime  float64
+	OutputProcessingTime float64
+
+	// KV cache data from vLLM Prometheus metrics.
+	// CachedTokens is the number of prompt tokens served from the KV cache,
+	// sourced directly from vllm:prefix_cache_hits_total (token-level counter).
+	// Zero when the metric is unavailable.
+	CachedTokens float64
+	// PrefixCachingEnabled reflects the enable_prefix_caching label from
+	// vllm:cache_config_info. False when the metric is absent.
+	PrefixCachingEnabled bool
+	// CacheConfigKnown is true when vllm:cache_config_info was successfully
+	// joined for this model. When false, PrefixCachingEnabled is meaningless.
+	CacheConfigKnown bool
+
+	// EffectiveInputTokens is PromptTokens - CachedTokens when cache correction
+	// is enabled, otherwise equals PromptTokens.
+	EffectiveInputTokens float64
+
+	// CacheSavingsFraction is CachedTokens / PromptTokens, clamped to [0, 1].
+	// Represents the fraction of prompt tokens served from the KV cache. Zero
+	// when PromptTokens is zero or prefix caching is disabled. The raw ratio can
+	// exceed 1 in high-reuse workloads; see apitypes.go for the full explanation.
+	CacheSavingsFraction float64
+
+	// InputCost and OutputCost are the dollar amounts attributed to input and
+	// output processing respectively, keyed by CostBasis.
+	InputCost  map[CostBasis]float64
+	OutputCost map[CostBasis]float64
+
+	// AllocationMethod records which input/output split path was used.
+	AllocationMethod AllocationMethod
+
+	// Derived cost-per-million-token metrics, keyed by CostBasis.
+	// Blended (input+output together), using TotalTokens as denominator.
+	CostPerMillionTokens map[CostBasis]float64
+	// Per-million input tokens, using PromptTokens as denominator.
+	InputCostPerMillionTokens map[CostBasis]float64
+	// Per-million output tokens, using GenerationTokens as denominator.
+	OutputCostPerMillionTokens map[CostBasis]float64
+
+	Timestamp time.Time // NOTE(review): undocumented; presumably when this record was produced — confirm
+}
+
+// Config holds configuration for the inference cost collector; see DefaultConfig for how it is populated.
+type Config struct {
+	// PrometheusURL is the Prometheus server endpoint for vLLM metric queries.
+	PrometheusURL string
+
+	// CollectionInterval is how often metrics are collected.
+	// Configurable via INFERENCE_COLLECTION_INTERVAL environment variable.
+	// Default is 2 minutes to match the core metrics emitter query window.
+	CollectionInterval time.Duration
+
+	// Enabled controls whether the inference cost collector runs.
+	Enabled bool
+
+	// ModelLabel is the Kubernetes pod label whose value equals the vLLM
+	// model_name metric label. Used to aggregate allocation costs by model.
+	ModelLabel string
+
+	// SharedInfraLabel and SharedInfraLabelValue identify shared inference
+	// infrastructure pods (EPP, gateway) that lack ModelLabel.
+	SharedInfraLabel      string
+	SharedInfraLabelValue string
+
+	// AllocationMode controls the input/output split method.
+	// AllocationModeComputeTime ("compute_time") uses vLLM timing metrics (preferred).
+	// AllocationModeMultiplier ("multiplier") uses a fixed ratio (fallback).
+	AllocationMode string
+
+	// OutputTokenCostMultiplier is the output/input cost ratio used when
+	// AllocationMode is "multiplier". DefaultConfig sets it to 2.5.
+	OutputTokenCostMultiplier float64
+
+}
+
+const (
+	AllocationModeComputeTime = "compute_time" // Config.AllocationMode value: split by vLLM timing metrics
+	AllocationModeMultiplier  = "multiplier"   // Config.AllocationMode value: split by a fixed ratio
+
+	defaultOutputTokenCostMultiplier = 2.5                         // OutputTokenCostMultiplier set by DefaultConfig
+	defaultModelLabel                = "llm-d.ai/model"            // presumably the fallback used by getModelLabel — confirm
+	defaultSharedInfraLabel          = "llm-d.ai/inference-shared" // presumably the fallback used by getSharedInfraLabel — confirm
+	defaultSharedInfraLabelValue     = "true"                      // presumably the fallback used by getSharedInfraLabelValue — confirm
+)
+
+// DefaultConfig builds a Config whose Prometheus URL, collection interval,
+// enabled flag and label settings come from the package's getter helpers,
+// while AllocationMode and OutputTokenCostMultiplier are fixed to
+// AllocationModeComputeTime and defaultOutputTokenCostMultiplier.
+// Callers should check Enabled before starting the collector.
+func DefaultConfig() *Config {
+	cfg := new(Config)
+	cfg.PrometheusURL = getPrometheusURL()
+	cfg.CollectionInterval = getCollectionInterval()
+	cfg.Enabled = isInferenceCostEnabled()
+	cfg.ModelLabel = getModelLabel()
+	cfg.SharedInfraLabel = getSharedInfraLabel()
+	cfg.SharedInfraLabelValue = getSharedInfraLabelValue()
+	cfg.AllocationMode = AllocationModeComputeTime
+	cfg.OutputTokenCostMultiplier = defaultOutputTokenCostMultiplier
+	return cfg
+}