('/workspace/source/test/e2e/explainer/test_art_explainer.py', 38, 'Skipped: ODH does not support art explainer at the moment')('/workspace/source/test/e2e/predictor/test_grpc.py', 35, 'Skipped: Not testable in ODH at the moment')('/workspace/source/test/e2e/predictor/test_torchserve.py', 34, 'Skipped: ODH does not support torchserve at the moment')test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KServe is a', service_name=... {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.auth @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="auth-enabled-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, ], id="auth-enabled-default", ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_auth_enabled_requires_token(test_case: TestCase): # noqa: F811 """ Test that when auth is enabled (default): - Requests WITH valid token succeed - Requests WITHOUT token are rejected (401/403) """ inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name sa_name = f"{service_name}-test-sa" test_failed = False # Enable auth for this test if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "true" try: # Create LLMInferenceService create_llmisvc(kserve_client, test_case.llm_service) > wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) llmisvc/test_llm_auth.py:275: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90>, {'api_version': 'serving.kserve.io/v1alpha1', 'kin...enable-a18fd8e2'}, {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]}, 'status': None}, 900) kwargs = {}, func_name = 'wait_for_llm_isvc_ready' timestamp_start = '2026-06-15T06:03:08.552722', start_time = 1781503388.5530953 duration = 900.3913719654083, timestamp_end = '2026-06-15T06:18:08.944471' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90> given = {'api_version': 'serving.kserve.io/v1alpha1', 'kind': 'LLMInferenceService', 'metadata': {'annotations': {'security....-auth-enable-a18fd8e2'}, {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]}, 'status': None} timeout_seconds = 900 @log_execution def wait_for_llm_isvc_ready( kserve_client: KServeClient, given: V1alpha1LLMInferenceService, timeout_seconds: int = 900, ) -> str: def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) return True > return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0) llmisvc/test_llm_inference_service.py:1115: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922fbafc0> timeout = 900, interval = 1.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: > raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) E AssertionError: Missing true conditions: {'WorkloadsReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}] llmisvc/test_llm_inference_service.py:1110: AssertionErrortest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m... {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) print(f"{prefix} Waiting for model response from {service_name}") > wait_for_model_response( kserve_client, test_case, test_case.wait_timeout, extra_headers=test_case.extra_headers, ) llmisvc/test_llm_inference_service.py:727: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f7425d76390>, TestCase(base_refs=['router-managed', 'workload-llm... {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]}, 'status': None}, model_name='facebook/opt-125m'), 900) kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}} func_name = 'wait_for_model_response' timestamp_start = '2026-06-15T06:07:11.714988', start_time = 1781503631.7154357 duration = 1102.550819158554, timestamp_end = '2026-06-15T06:25:34.266259' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7425d76390> test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m... {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]}, 'status': None}, model_name='facebook/opt-125m') timeout_seconds = 900 extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'} @log_execution def wait_for_model_response( kserve_client: KServeClient, test_case: TestCase, # noqa: F811 timeout_seconds: int = 900, extra_headers: Optional[Dict[str, str]] = None, ) -> str: def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) > response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0) llmisvc/test_llm_inference_service.py:1030: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e9a020> timeout = 900, interval = 5.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response > raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) E AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request llmisvc/test_llm_inference_service.py:1026: AssertionErrortest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout... {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) print(f"{prefix} Waiting for model response from {service_name}") > wait_for_model_response( kserve_client, test_case, test_case.wait_timeout, extra_headers=test_case.extra_headers, ) llmisvc/test_llm_inference_service.py:727: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0>, TestCase(base_refs=['router-managed', 'workload-llm... {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]}, 'status': None}, model_name='facebook/opt-125m'), 900) kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}} func_name = 'wait_for_model_response' timestamp_start = '2026-06-15T06:26:15.484944', start_time = 1781504775.4854898 duration = 1102.5697031021118, timestamp_end = '2026-06-15T06:44:38.055197' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0> test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout... {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]}, 'status': None}, model_name='facebook/opt-125m') timeout_seconds = 900 extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'} @log_execution def wait_for_model_response( kserve_client: KServeClient, test_case: TestCase, # noqa: F811 timeout_seconds: int = 900, extra_headers: Optional[Dict[str, str]] = None, ) -> str: def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) > response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0) llmisvc/test_llm_inference_service.py:1030: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e99760> timeout = 900, interval = 5.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response > raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) E AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request llmisvc/test_llm_inference_service.py:1026: AssertionErrortest_case = TestCase(base_refs=['router-with-refs', 'scheduler-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KSer... {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") > wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) llmisvc/test_llm_inference_service.py:723: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f1922bed290>, {'api_version': 'serving.kserve.io/v1alpha1', 'kin...-with-ec5d4bfa'}, {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]}, 'status': None}, 900) kwargs = {}, func_name = 'wait_for_llm_isvc_ready' timestamp_start = '2026-06-15T06:30:03.487652', start_time = 1781505003.4879394 duration = 901.1005208492279, timestamp_end = '2026-06-15T06:45:04.588475' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f1922bed290> given = {'api_version': 'serving.kserve.io/v1alpha1', 'kind': 'LLMInferenceService', 'metadata': {'annotations': {'security....router-with-ec5d4bfa'}, {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]}, 'status': None} timeout_seconds = 900 @log_execution def wait_for_llm_isvc_ready( kserve_client: KServeClient, given: V1alpha1LLMInferenceService, timeout_seconds: int = 900, ) -> str: def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) return True > return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0) llmisvc/test_llm_inference_service.py:1115: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922fb9ee0> timeout = 900, interval = 1.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: > raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) E AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:30:54Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'status': 'True', 'type': 'WorkloadsReady'}] llmisvc/test_llm_inference_service.py:1110: AssertionErrortest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt='KServe is a', ...opt-125m-with-lora-hf-a7886ead'}]}, 'status': None}, model_name='publishers/kserve-ci-e2e-test/models/lora-adapter-1') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") > wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) llmisvc/test_llm_inference_service.py:723: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f7425babd10>, {'api_version': 'serving.kserve.io/v1alpha1', 'kin...vc-mod-495991f8'}, {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]}, 'status': None}, 900) kwargs = {}, func_name = 'wait_for_llm_isvc_ready' timestamp_start = '2026-06-15T06:44:39.921048', start_time = 1781505879.9213114 duration = 900.540575504303, timestamp_end = '2026-06-15T06:59:40.461894' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7425babd10> given = {'api_version': 'serving.kserve.io/v1alpha1', 'kind': 'LLMInferenceService', 'metadata': {'annotations': {'security....-llmisvc-mod-495991f8'}, {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]}, 'status': None} timeout_seconds = 900 @log_execution def wait_for_llm_isvc_ready( kserve_client: KServeClient, given: V1alpha1LLMInferenceService, timeout_seconds: int = 900, ) -> str: def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) return True > return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0) llmisvc/test_llm_inference_service.py:1115: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f7425e99d00> timeout = 900, interval = 1.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: > raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) E AssertionError: Missing true conditions: {'Ready', 'WorkloadsReady'}, expected {'Ready', 'RouterReady', 'WorkloadsReady'}, got [{'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:16Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}] llmisvc/test_llm_inference_service.py:1110: AssertionErrortest_case = TestCase(base_refs=['router-with-refs-pd', 'scheduler-managed', 'workload-pd-cpu', 'model-fb-opt-125m'], prompt='You a... {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") > wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) llmisvc/test_llm_inference_service.py:723: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f192340b750>, {'api_version': 'serving.kserve.io/v1alpha1', 'kin...h-ref-d1f07093'}, {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]}, 'status': None}, 900) kwargs = {}, func_name = 'wait_for_llm_isvc_ready' timestamp_start = '2026-06-15T06:52:14.786869', start_time = 1781506334.7871306 duration = 900.4981956481934, timestamp_end = '2026-06-15T07:07:15.285340' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f192340b750> given = {'api_version': 'serving.kserve.io/v1alpha1', 'kind': 'LLMInferenceService', 'metadata': {'annotations': {'security....er-with-ref-d1f07093'}, {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]}, 'status': None} timeout_seconds = 900 @log_execution def wait_for_llm_isvc_ready( kserve_client: KServeClient, given: V1alpha1LLMInferenceService, timeout_seconds: int = 900, ) -> str: def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) return True > return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0) llmisvc/test_llm_inference_service.py:1115: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922f6b240> timeout = 900, interval = 1.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def assert_llm_isvc_ready(): out = get_llmisvc( kserve_client, given.metadata.name, given.metadata.namespace, given.api_version.split("/")[1], ) if "status" not in out: raise AssertionError("No status found in LLM inference service") status = out["status"] if "conditions" not in status: raise AssertionError("No conditions found in status") expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"} got_true_conditions = set() conditions = status["conditions"] for condition in conditions: if condition.get("status") == "True": got_true_conditions.add(condition.get("type")) missing_conditions = expected_true_conditions - got_true_conditions if missing_conditions: > raise AssertionError( f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}" ) E AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-pd-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:54:37Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PrefillWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:53:00Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'status': 'True', 'type': 'WorkloadsReady'}] llmisvc/test_llm_inference_service.py:1110: AssertionErrortest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n... {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]}, 'status': None}, model_name='facebook/opt-125m') @pytest.mark.llminferenceservice @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "test_case", [ pytest.param( TestCase( base_refs=[ "router-with-gateway-ref", "router-with-managed-route", "model-fb-opt-125m", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=create_response_assertion(with_field="choices"), expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="custom-route-timeout-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs", "scheduler-managed", "workload-single-cpu", "model-fb-opt-125m", ], prompt="KServe is a", service_name="router-with-refs-test", expected_gateway=ROUTER_GATEWAYS[0], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[0]], routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-custom-route-timeout-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="custom-route-timeout-pd-test", response_assertion=assert_200_with_choices, ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-with-refs-pd", "scheduler-managed", "workload-pd-cpu", "model-fb-opt-125m", ], prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", service_name="router-with-refs-pd-test", response_assertion=assert_200_with_choices, expected_gateway=ROUTER_GATEWAYS[1], before_test=[ lambda: create_router_resources( gateways=[ROUTER_GATEWAYS[1]], routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]], ) ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.custom_gateway, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-dp-ep-gpu", "workload-dp-ep-prefill-gpu", "model-deepseek-v2-lite", ], prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically " "where the compute plane (P) and the data plane (D) are independently deployed and managed for a " "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the " "fundamental challenges of network latency and data consistency, elaborate on the advanced " "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: " "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to " "evolve to support optimal performance and minimize inter-plane communication overhead, especially for " "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically " "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: " "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) " "and their applicability in balancing performance and data integrity across a globally distributed data plane. " "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, " "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. " "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently " "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, " "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). " "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on " "workload patterns and data locality, potentially involving live migration strategies. " "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter " "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), " "fine-grained access control to data at rest and in motion, and identity management across disaggregated " "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) " "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: " "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and " "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) " "would be essential? How would incident response and troubleshooting differ in this disaggregated environment " "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across " "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries " "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) " "where the benefits of P/D disaggregation would strongly outweigh its complexities. " "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions " "directly interacting with object storage, in-memory disaggregation) that could further drive or " "transform P/D disaggregation in cloud computing.", max_tokens=2000, ), marks=[ pytest.mark.cluster_gpu, pytest.mark.cluster_nvidia, pytest.mark.cluster_nvidia_roce, ], ), pytest.param( TestCase( base_refs=[ "router-no-scheduler", "workload-single-cpu", "model-fb-opt-125m", ], prompt="What is KServe?", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.no_scheduler, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "workload-simulated-dp-ep-cpu", "model-fb-opt-125m", ], prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, " "but without the resources requirements for DP+EP (GPUs and ROCe/IB).", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node], ), # Scheduler config tests pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-inline-config", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-inline-config-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Chat completions endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], model_name="Qwen/Qwen2.5-0.5B-Instruct", endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=create_response_assertion(with_field="choices"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-configmap-ref", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-configmap-ref-test", before_test=[create_scheduler_configmap], after_test=[delete_scheduler_configmap], ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-replicas", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-ha-replicas-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-custom-template", "workload-llmd-simulator", ], prompt="KServe is a", service_name="scheduler-custom-template-test", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), # Precise prefix KV cache routing test pytest.param( TestCase( base_refs=[ "router-managed", "scheduler-with-precise-prefix-cache-inline-config", "workload-llmd-simulator-kvcache", ], prompt="KServe is a", service_name="precise-prefix-cache-test", ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Models endpoint coverage pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/models", response_assertion=create_response_assertion(with_field="data"), ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/completions", prompt="KServe is a", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions pytest.param( TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches("facebook/opt-125m"), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, peers=[ TestCase( base_refs=[ "router-managed", "workload-llmd-simulator", "model-qwen2.5-0.5b", ], endpoint="/v1/chat/completions", prompt="What is KServe?", payload_formatter=chat_completions_payload, response_assertion=assert_model_field_matches( "Qwen/Qwen2.5-0.5B-Instruct" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct", }, ), ], ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator, pytest.mark.model_routing, ], ), # Model-based routing via X-Gateway-Model-Name header — LoRA adapter pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/completions", prompt="KServe is a", model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", payload_formatter=completions_payload, response_assertion=assert_model_field_matches( f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1" ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA) pytest.param( TestCase( base_refs=[ "router-managed", "workload-single-cpu", "model-fb-opt-125m-with-lora-hf", ], endpoint="/v1/models", response_assertion=assert_models_contains( "facebook/opt-125m", f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", "lora-adapter-1", f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1", ), url_getter=get_model_routing_url, extra_headers={ MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m", }, ), marks=[ pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.model_routing, pytest.mark.lora, ], ), ], indirect=["test_case"], ids=generate_test_id, ) @log_execution def test_llm_inference_service(test_case: TestCase): # noqa: F811 inject_k8s_proxy() kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config"), client_configuration=client.Configuration(), ) service_name = test_case.llm_service.metadata.name if not test_case.llm_service.metadata.annotations: test_case.llm_service.metadata.annotations = {} test_case.llm_service.metadata.annotations[ "security.opendatahub.io/enable-auth" ] = "false" prefix = test_case.log_prefix test_failed = False try: print(f"{prefix} Creating LLMInferenceService {service_name}") create_llmisvc(kserve_client, test_case.llm_service) print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready") wait_for_llm_isvc_ready( kserve_client, test_case.llm_service, test_case.wait_timeout ) print(f"{prefix} Waiting for model response from {service_name}") > wait_for_model_response( kserve_client, test_case, test_case.wait_timeout, extra_headers=test_case.extra_headers, ) llmisvc/test_llm_inference_service.py:727: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ args = (<kserve.api.kserve_client.KServeClient object at 0x7f7424fee210>, TestCase(base_refs=['router-managed', 'workload-sin... {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]}, 'status': None}, model_name='facebook/opt-125m'), 900) kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}} func_name = 'wait_for_model_response' timestamp_start = '2026-06-15T07:01:51.337397', start_time = 1781506911.3376548 duration = 900.2222678661346, timestamp_end = '2026-06-15T07:16:51.559924' @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ timestamp_start = datetime.now().isoformat() logger.info( f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}" ) start_time = time.time() try: > result = func(*args, **kwargs) llmisvc/logging.py:40: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ kserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7424fee210> test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n... {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]}, 'status': None}, model_name='facebook/opt-125m') timeout_seconds = 900 extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'} @log_execution def wait_for_model_response( kserve_client: KServeClient, test_case: TestCase, # noqa: F811 timeout_seconds: int = 900, extra_headers: Optional[Dict[str, str]] = None, ) -> str: def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) > response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0) llmisvc/test_llm_inference_service.py:1030: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ assertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e9bec0> timeout = 900, interval = 5.0 def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout last_msg = None while True: try: > return assertion_fn() llmisvc/test_llm_inference_service.py:1126: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def get_successful_response(): try: if test_case.url_getter: service_url = test_case.url_getter(kserve_client, test_case.llm_service) else: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"❌ Failed to get service URL: {e}") from e model_url = service_url + test_case.endpoint headers = {"Content-Type": "application/json"} if extra_headers: headers.update(extra_headers) if test_case.payload_formatter is not None: test_payload = test_case.payload_formatter(test_case) elif test_case.prompt is not None: test_payload = { "model": test_case.model_name if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers else extra_headers[MODEL_ROUTING_HEADER], "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } else: test_payload = None logger.info(f"Calling LLM service at {model_url} with payload {test_payload}") try: if test_payload is not None: response = post_with_retry( model_url, headers=headers, json_data=test_payload, timeout=test_case.response_timeout, ) else: response = get_with_retry( model_url, headers=headers, timeout=test_case.response_timeout, ) except Exception as e: logger.error(f"❌ Failed to call model: {e}") raise AssertionError(f"❌ Failed to call model: {e}") from e logger.info(f"Model response is {response.status_code}: {response.text[:500]}") if 200 <= response.status_code < 300: return response > raise AssertionError( f"Service returned {response.status_code}: {response.text}" ) E AssertionError: Service returned 401: llmisvc/test_llm_inference_service.py:1026: AssertionError