100 changes: 90 additions & 10 deletions pkg/plugins/gateway/algorithms/pd_disaggregation.go
@@ -56,24 +56,27 @@ const (

	defaultMaxRequest             float64 = 32
	defaultMaxTokenThroughputDiff float64 = 2048
	defaultRemotePrefillThreshold         = 512
)

var (
	prefillRequestTimeout         int     = utils.LoadEnvInt("AIBRIX_PREFILL_REQUEST_TIMEOUT", defaultPrefillRequestTimeout)
	aibrixDecodeMaxRequest        float64 = utils.LoadEnvFloat("AIBRIX_DECODE_MAX_REQUEST", defaultMaxRequest)
	aibrixDecodeMaxThroughputDiff float64 = utils.LoadEnvFloat("AIBRIX_DECODE_MAX_THROUGHPUT", defaultMaxTokenThroughputDiff)
	remotePrefillThreshold                = utils.LoadEnvInt("AIBRIX_REMOTE_PREFILL_THRESHOLD", defaultRemotePrefillThreshold)
)

func init() {
	Register(RouterPD, NewPDRouter)
}

type pdRouter struct {
-	cache                 cache.Cache
-	tokenizer             tokenizer.Tokenizer
-	prefixCacheIndexer    *prefixcacheindexer.PrefixHashTable
-	prefillRequestTracker *PrefillRequestTracker
-	httpClient            *http.Client
+	cache                  cache.Cache
+	tokenizer              tokenizer.Tokenizer
+	prefixCacheIndexer     *prefixcacheindexer.PrefixHashTable
+	prefillRequestTracker  *PrefillRequestTracker
+	httpClient             *http.Client
+	remotePrefillThreshold int
}

// PrefillRequestTracker manages prefill-specific request counts
@@ -110,11 +113,12 @@ func NewPDRouter() (types.Router, error) {
}

	return pdRouter{
-		cache:                 c,
-		tokenizer:             tokenizerObj,
-		prefixCacheIndexer:    prefixcacheindexer.NewPrefixHashTable(),
-		prefillRequestTracker: NewPrefillRequestTracker(),
-		httpClient:            httpClient,
+		cache:                  c,
+		tokenizer:              tokenizerObj,
+		prefixCacheIndexer:     prefixcacheindexer.NewPrefixHashTable(),
+		prefillRequestTracker:  NewPrefillRequestTracker(),
+		httpClient:             httpClient,
+		remotePrefillThreshold: remotePrefillThreshold,
	}, nil
}

@@ -126,13 +130,89 @@ func NewPrefillRequestTracker() *PrefillRequestTracker {
}
}

// Route determines which pod should handle the incoming LLM request.
// It supports two routing strategies:
//
// 1. **Short-Prompt Optimization (Decode-Only)**:
// - If the input prompt length (in tokens) is ≤ AIBRIX_REMOTE_PREFILL_THRESHOLD (default = 512 tokens),
// the request is routed directly to a decode pod.
// - The decode pod performs *both* prefill (KV cache computation) and decoding locally,
[Review thread]
Collaborator: How will the gateway know that the decode pod supports both the prefill and decode roles? vLLM supports it, but I'm not sure every engine does.
Author: Oh, that's a good question. I'll check whether sglang supports it, and if not, I'll add a condition limiting this to the vLLM engine.
Author: Checked with the sglang community: sgl-project/sglang#14284
// bypassing the remote prefill step entirely.
// - This reduces latency for short prompts by eliminating inter-pod communication.
//
// 2. **Standard Two-Stage Pipeline (Prefill+Decode)**:
// - For longer prompts, the system uses the traditional split architecture:
// a) A prefill pod computes the KV cache.
// b) The result is sent to a decode pod for autoregressive generation.
//
// Route decision flow:
/*
┌───────────────────────┐
│    Client Request     │
│   (e.g., "Hello!")    │
└──────────┬────────────┘
           │
           ▼
┌───────────────────────┐
│   pdRouter.Route()    │
│  (Tokenize & Decide)  │
└──────────┬────────────┘
           │
┌──────────┴────────────┬───────────────────────────────┐
│                       │                               │
│ Is token count        │ Yes                           │ No
│ ≤ threshold?          ▼                               ▼
│            ┌─────────────────────┐      ┌─────────────────────┐
│            │     Decode Pod      │      │     Prefill Pod     │
│            │  (Local Prefill +   │      │  (Remote Prefill,   │
│            │  Decode in one pod) │      │  KV Cache Compute)  │
│            └──────────┬──────────┘      └──────────┬──────────┘
│                       │                            │
│                       │                            ▼
│                       │                 ┌─────────────────────┐
│                       │                 │     Decode Pod      │
│                       │                 │  (Generation Only)  │
│                       │                 └──────────┬──────────┘
│                       │                            │
└───────────────────────┼────────────────────────────┘
                        │
                        ▼
┌───────────────────────┐
│     Response to       │
│       Client          │
└───────────────────────┘
*/
func (r pdRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
	// Validate engine consistency across all prefill pods
	llmEngine, err := validateAndGetLLMEngine(readyPodList.All())
	if err != nil {
		return "", fmt.Errorf("engine validation failed for request %s: %w", ctx.RequestID, err)
	}

	// NOTE: Short-prompt optimization (bypassing remote prefill) is currently ONLY supported for vLLM.
	// Other engines (e.g., SGLang) do not support performing prefill+decode in a decode-only pod,
	// so they must always go through the full prefill → decode pipeline.

[Review thread]
Author: add the comment "ONLY supported for vLLM" for now.
Collaborator: In vLLM it supports multiple roles, but it is not necessary that the user configures kv_both. We will need a flag to guide the gateway.
Collaborator: Can you also run a benchmark to compare TTFT/TPOT/E2E P90/P99?
Author: > In vLLM it supports multiple roles, but it is not necessary that the user configures kv_both. Will need a flag to guide the gateway.
Sorry, I don't know much about this part. Could you explain it in more detail or give me more relevant info?
	if r.remotePrefillThreshold > 0 && llmEngine == VLLMEngine {
		tokens, err := r.tokenizer.TokenizeInputText(ctx.Message)
		if err != nil {
			klog.Warningf("Tokenization for short-prompt check failed, falling back to standard routing: %v", err)
		} else if len(tokens) <= r.remotePrefillThreshold {
			klog.InfoS("Short prompt detected: bypassing remote prefill",
				"request_id", ctx.RequestID,
				"token_count", len(tokens),
				"threshold", r.remotePrefillThreshold)
			// Short-prompt optimization (decode-only path).
			_, decodePod, err := r.filterPrefillDecodePods(ctx, readyPodList.All())
			if err != nil {
				// klog has no structured Warning variant, so format the fields explicitly.
				klog.Warningf("Failed to select decode pod for direct inference; falling back to prefill-decode flow, request_id: %s, error: %v",
					ctx.RequestID, err)
			} else if decodePod != nil {
				ctx.SetTargetPod(decodePod)
				return ctx.TargetAddress(), nil
			}
[Review thread on lines +205 to +212]
Contributor (severity: high):

The current implementation for the short-prompt optimization path reuses filterPrefillDecodePods, which is designed to find a pair of prefill and decode pods. This creates a bug: if no prefill pods are available, filterPrefillDecodePods will return an error, causing the short-prompt path to fail and fall back to the standard prefill-decode flow. The standard flow will then also fail for the same reason (no prefill pods).

The short-prompt path should be able to select a decode pod independently of prefill pods. A new test case like the one below would fail with the current implementation:

{
    name: "short prompt with only decode pods available: should succeed",
    readyPods: []*v1.Pod{
        // No prefill pod!
        {ObjectMeta: metav1.ObjectMeta{
            Labels: map[string]string{PDRoleSetIdentifier: "test", PDRoleIdentifier: "decode"},
            Name:   "decode-1",
        }, Status: v1.PodStatus{
            PodIP:      "127.0.0.2",
            Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}},
        }},
    },
    message:              "hi",
    shortPromptThreshold: 2048,
    expectPrefillCall:    false,
    serverCode:           http.StatusOK,
    llmEngine:            VLLMEngine,
    expectError:          false, // This would fail
    expectTargetAddr:     "127.0.0.2:8000",
},

I recommend refactoring the pod selection logic. A new function, say selectDecodePod, should be created to encapsulate the logic for selecting only a decode pod. This would involve extracting the decode-pod-specific selection logic from filterPrefillDecodePods. The Route function can then be updated to call this new function for the short-prompt path.

Suggested change:
-			_, decodePod, err := r.filterPrefillDecodePods(ctx, readyPodList.All())
-			if err != nil {
-				klog.Warning("Failed to select decode pod for direct inference; falling back to prefill-decode flow",
-					"request_id", ctx.RequestID, "error", err)
-			} else if decodePod != nil {
-				ctx.SetTargetPod(decodePod)
-				return ctx.TargetAddress(), nil
-			}
+			decodePod, err := r.selectDecodePod(ctx, readyPodList.All())
+			if err != nil {
+				klog.Warning("Failed to select decode pod for direct inference; falling back to prefill-decode flow",
+					"request_id", ctx.RequestID, "error", err)
+			} else {
+				ctx.SetTargetPod(decodePod)
+				return ctx.TargetAddress(), nil
+			}

Author (@googs1025, Nov 25, 2025): I think we can ignore this comment, because filterPrefillDecodePods checks both the prefill and decode pods; this makes sense for every request round.
}
}

	prefillPod, decodePod, err := r.filterPrefillDecodePods(ctx, readyPodList.All())
	if err != nil {
		return "", fmt.Errorf("failed to filter prefill/decode pods for request %s: %w", ctx.RequestID, err)