<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="7" skipped="3" tests="38" time="5514.903" timestamp="2026-06-15T06:01:42.410561" hostname="kserve-group-test-vgn9g-e2e-llm-inference-service-pod"><testcase classname="" name="explainer.test_art_explainer" time="0.000"><skipped message="collection skipped">('/workspace/source/test/e2e/explainer/test_art_explainer.py', 38, 'Skipped: ODH does not support art explainer at the moment')</skipped></testcase><testcase classname="" name="predictor.test_grpc" time="0.000"><skipped message="collection skipped">('/workspace/source/test/e2e/predictor/test_grpc.py', 35, 'Skipped: Not testable in ODH at the moment')</skipped></testcase><testcase classname="" name="predictor.test_torchserve" time="0.000"><skipped message="collection skipped">('/workspace/source/test/e2e/predictor/test_torchserve.py', 34, 'Skipped: ODH does not support torchserve at the moment')</skipped></testcase><testcase classname="llmisvc.test_gateway_section_name" name="test_gateway_section_name_propagation[cluster_single_node-cluster_cpu-with-section-name]" time="33.854" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-replicas-workload-llmd-simulator]" time="100.584" /><testcase classname="llmisvc.test_gateway_section_name" name="test_gateway_section_name_propagation[cluster_single_node-cluster_cpu-without-section-name]" time="49.566" /><testcase classname="llmisvc.test_llm_auth" name="test_llm_auth_enabled_requires_token[cluster_cpu-cluster_single_node-auth-enabled-default]" time="902.539"><failure message="AssertionError: Missing true conditions: {'WorkloadsReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]">test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KServe is a', service_name=...               {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.auth
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="auth-enabled-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                ],
                id="auth-enabled-default",
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_auth_enabled_requires_token(test_case: TestCase):  # noqa: F811
        """
        Test that when auth is enabled (default):
        - Requests WITH valid token succeed
        - Requests WITHOUT token are rejected (401/403)
        """
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        sa_name = f"{service_name}-test-sa"
        test_failed = False
    
        # Enable auth for this test
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "true"
    
        try:
            # Create LLMInferenceService
            create_llmisvc(kserve_client, test_case.llm_service)
&gt;           wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )

llmisvc/test_llm_auth.py:275: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90&gt;, {'api_version': 'serving.kserve.io/v1alpha1',
 'kin...enable-a18fd8e2'},
                       {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},
 'status': None}, 900)
kwargs = {}, func_name = 'wait_for_llm_isvc_ready'
timestamp_start = '2026-06-15T06:03:08.552722', start_time = 1781503388.5530953
duration = 900.3913719654083, timestamp_end = '2026-06-15T06:18:08.944471'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90&gt;
given = {'api_version': 'serving.kserve.io/v1alpha1',
 'kind': 'LLMInferenceService',
 'metadata': {'annotations': {'security....-auth-enable-a18fd8e2'},
                       {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},
 'status': None}
timeout_seconds = 900

    @log_execution
    def wait_for_llm_isvc_ready(
        kserve_client: KServeClient,
        given: V1alpha1LLMInferenceService,
        timeout_seconds: int = 900,
    ) -&gt; str:
        def assert_llm_isvc_ready():
            out = get_llmisvc(
                kserve_client,
                given.metadata.name,
                given.metadata.namespace,
                given.api_version.split("/")[1],
            )
    
            if "status" not in out:
                raise AssertionError("No status found in LLM inference service")
    
            status = out["status"]
            if "conditions" not in status:
                raise AssertionError("No conditions found in status")
    
            expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
            got_true_conditions = set()
    
            conditions = status["conditions"]
    
            for condition in conditions:
                if condition.get("status") == "True":
                    got_true_conditions.add(condition.get("type"))
    
            missing_conditions = expected_true_conditions - got_true_conditions
            if missing_conditions:
                raise AssertionError(
                    f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
                )
            return True
    
&gt;       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)

llmisvc/test_llm_inference_service.py:1115: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_llm_isvc_ready.&lt;locals&gt;.assert_llm_isvc_ready at 0x7f1922fbafc0&gt;
timeout = 900, interval = 1.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def assert_llm_isvc_ready():
        out = get_llmisvc(
            kserve_client,
            given.metadata.name,
            given.metadata.namespace,
            given.api_version.split("/")[1],
        )
    
        if "status" not in out:
            raise AssertionError("No status found in LLM inference service")
    
        status = out["status"]
        if "conditions" not in status:
            raise AssertionError("No conditions found in status")
    
        expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
        got_true_conditions = set()
    
        conditions = status["conditions"]
    
        for condition in conditions:
            if condition.get("status") == "True":
                got_true_conditions.add(condition.get("type"))
    
        missing_conditions = expected_true_conditions - got_true_conditions
        if missing_conditions:
&gt;           raise AssertionError(
                f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
            )
E           AssertionError: Missing true conditions: {'WorkloadsReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]

llmisvc/test_llm_inference_service.py:1110: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-custom-template-workload-llmd-simulator]" time="61.546" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-precise-prefix-cache-inline-config-workload-llmd-simulator-kvcache]" time="63.789" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator0]" time="60.638" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator1]" time="1144.559"><failure message="AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request">test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m...              {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
            wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )
            print(f"{prefix} Waiting for model response from {service_name}")
&gt;           wait_for_model_response(
                kserve_client,
                test_case,
                test_case.wait_timeout,
                extra_headers=test_case.extra_headers,
            )

llmisvc/test_llm_inference_service.py:727: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f7425d76390&gt;, TestCase(base_refs=['router-managed', 'workload-llm...        {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},
 'status': None}, model_name='facebook/opt-125m'), 900)
kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}
func_name = 'wait_for_model_response'
timestamp_start = '2026-06-15T06:07:11.714988', start_time = 1781503631.7154357
duration = 1102.550819158554, timestamp_end = '2026-06-15T06:25:34.266259'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f7425d76390&gt;
test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m...              {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},
 'status': None}, model_name='facebook/opt-125m')
timeout_seconds = 900
extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}

    @log_execution
    def wait_for_model_response(
        kserve_client: KServeClient,
        test_case: TestCase,  # noqa: F811
        timeout_seconds: int = 900,
        extra_headers: Optional[Dict[str, str]] = None,
    ) -&gt; str:
        def get_successful_response():
            try:
                if test_case.url_getter:
                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)
                else:
                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)
            except Exception as e:
                raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
            model_url = service_url + test_case.endpoint
    
            headers = {"Content-Type": "application/json"}
            if extra_headers:
                headers.update(extra_headers)
    
            if test_case.payload_formatter is not None:
                test_payload = test_case.payload_formatter(test_case)
            elif test_case.prompt is not None:
                test_payload = {
                    "model": test_case.model_name
                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                    else extra_headers[MODEL_ROUTING_HEADER],
                    "prompt": test_case.prompt,
                    "max_tokens": test_case.max_tokens,
                }
            else:
                test_payload = None
    
            logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
            try:
                if test_payload is not None:
                    response = post_with_retry(
                        model_url,
                        headers=headers,
                        json_data=test_payload,
                        timeout=test_case.response_timeout,
                    )
                else:
                    response = get_with_retry(
                        model_url,
                        headers=headers,
                        timeout=test_case.response_timeout,
                    )
            except Exception as e:
                logger.error(f"❌ Failed to call model: {e}")
                raise AssertionError(f"❌ Failed to call model: {e}") from e
    
            logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
            if 200 &lt;= response.status_code &lt; 300:
                return response
            raise AssertionError(
                f"Service returned {response.status_code}: {response.text}"
            )
    
&gt;       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)

llmisvc/test_llm_inference_service.py:1030: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_model_response.&lt;locals&gt;.get_successful_response at 0x7f7425e9a020&gt;
timeout = 900, interval = 5.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def get_successful_response():
        try:
            if test_case.url_getter:
                service_url = test_case.url_getter(kserve_client, test_case.llm_service)
            else:
                service_url = get_llm_service_url(kserve_client, test_case.llm_service)
        except Exception as e:
            raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
        model_url = service_url + test_case.endpoint
    
        headers = {"Content-Type": "application/json"}
        if extra_headers:
            headers.update(extra_headers)
    
        if test_case.payload_formatter is not None:
            test_payload = test_case.payload_formatter(test_case)
        elif test_case.prompt is not None:
            test_payload = {
                "model": test_case.model_name
                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                else extra_headers[MODEL_ROUTING_HEADER],
                "prompt": test_case.prompt,
                "max_tokens": test_case.max_tokens,
            }
        else:
            test_payload = None
    
        logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
        try:
            if test_payload is not None:
                response = post_with_retry(
                    model_url,
                    headers=headers,
                    json_data=test_payload,
                    timeout=test_case.response_timeout,
                )
            else:
                response = get_with_retry(
                    model_url,
                    headers=headers,
                    timeout=test_case.response_timeout,
                )
        except Exception as e:
            logger.error(f"❌ Failed to call model: {e}")
            raise AssertionError(f"❌ Failed to call model: {e}") from e
    
        logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
        if 200 &lt;= response.status_code &lt; 300:
            return response
&gt;       raise AssertionError(
            f"Service returned {response.status_code}: {response.text}"
        )
E       AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request

llmisvc/test_llm_inference_service.py:1026: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_auth" name="test_llm_auth_invalid_token_rejected[cluster_cpu-cluster_single_node-auth-invalid-token]" time="176.855" /><testcase classname="llmisvc.test_llm_auth" name="test_llm_auth_disabled_no_token_required[cluster_cpu-cluster_single_node-auth-disabled]" time="157.075" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-gateway-ref-router-with-managed-route-model-fb-opt-125m-workload-llmd-simulator]" time="54.966" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]" time="172.118" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator2]" time="1143.560"><failure message="AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request">test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout...              {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
            wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )
            print(f"{prefix} Waiting for model response from {service_name}")
&gt;           wait_for_model_response(
                kserve_client,
                test_case,
                test_case.wait_timeout,
                extra_headers=test_case.extra_headers,
            )

llmisvc/test_llm_inference_service.py:727: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0&gt;, TestCase(base_refs=['router-managed', 'workload-llm...        {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},
 'status': None}, model_name='facebook/opt-125m'), 900)
kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}
func_name = 'wait_for_model_response'
timestamp_start = '2026-06-15T06:26:15.484944', start_time = 1781504775.4854898
duration = 1102.5697031021118, timestamp_end = '2026-06-15T06:44:38.055197'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0&gt;
test_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout...              {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},
 'status': None}, model_name='facebook/opt-125m')
timeout_seconds = 900
extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}

    @log_execution
    def wait_for_model_response(
        kserve_client: KServeClient,
        test_case: TestCase,  # noqa: F811
        timeout_seconds: int = 900,
        extra_headers: Optional[Dict[str, str]] = None,
    ) -&gt; str:
        def get_successful_response():
            try:
                if test_case.url_getter:
                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)
                else:
                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)
            except Exception as e:
                raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
            model_url = service_url + test_case.endpoint
    
            headers = {"Content-Type": "application/json"}
            if extra_headers:
                headers.update(extra_headers)
    
            if test_case.payload_formatter is not None:
                test_payload = test_case.payload_formatter(test_case)
            elif test_case.prompt is not None:
                test_payload = {
                    "model": test_case.model_name
                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                    else extra_headers[MODEL_ROUTING_HEADER],
                    "prompt": test_case.prompt,
                    "max_tokens": test_case.max_tokens,
                }
            else:
                test_payload = None
    
            logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
            try:
                if test_payload is not None:
                    response = post_with_retry(
                        model_url,
                        headers=headers,
                        json_data=test_payload,
                        timeout=test_case.response_timeout,
                    )
                else:
                    response = get_with_retry(
                        model_url,
                        headers=headers,
                        timeout=test_case.response_timeout,
                    )
            except Exception as e:
                logger.error(f"❌ Failed to call model: {e}")
                raise AssertionError(f"❌ Failed to call model: {e}") from e
    
            logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
            if 200 &lt;= response.status_code &lt; 300:
                return response
            raise AssertionError(
                f"Service returned {response.status_code}: {response.text}"
            )
    
&gt;       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)

llmisvc/test_llm_inference_service.py:1030: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_model_response.&lt;locals&gt;.get_successful_response at 0x7f7425e99760&gt;
timeout = 900, interval = 5.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def get_successful_response():
        try:
            if test_case.url_getter:
                service_url = test_case.url_getter(kserve_client, test_case.llm_service)
            else:
                service_url = get_llm_service_url(kserve_client, test_case.llm_service)
        except Exception as e:
            raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
        model_url = service_url + test_case.endpoint
    
        headers = {"Content-Type": "application/json"}
        if extra_headers:
            headers.update(extra_headers)
    
        if test_case.payload_formatter is not None:
            test_payload = test_case.payload_formatter(test_case)
        elif test_case.prompt is not None:
            test_payload = {
                "model": test_case.model_name
                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                else extra_headers[MODEL_ROUTING_HEADER],
                "prompt": test_case.prompt,
                "max_tokens": test_case.max_tokens,
            }
        else:
            test_payload = None
    
        logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
        try:
            if test_payload is not None:
                response = post_with_retry(
                    model_url,
                    headers=headers,
                    json_data=test_payload,
                    timeout=test_case.response_timeout,
                )
            else:
                response = get_with_retry(
                    model_url,
                    headers=headers,
                    timeout=test_case.response_timeout,
                )
        except Exception as e:
            logger.error(f"❌ Failed to call model: {e}")
            raise AssertionError(f"❌ Failed to call model: {e}") from e
    
        logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
        if 200 &lt;= response.status_code &lt; 300:
            return response
&gt;       raise AssertionError(
            f"Service returned {response.status_code}: {response.text}"
        )
E       AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request

llmisvc/test_llm_inference_service.py:1026: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-custom-route-timeout-scheduler-managed-workload-single-cpu-model-fb-opt-125m]" time="148.014" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-refs-scheduler-managed-workload-single-cpu-model-fb-opt-125m]" time="906.242"><failure message="AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:30:54Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'status': 'True', 'type': 'WorkloadsReady'}]">test_case = TestCase(base_refs=['router-with-refs', 'scheduler-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KSer...              {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
&gt;           wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )

llmisvc/test_llm_inference_service.py:723: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f1922bed290&gt;, {'api_version': 'serving.kserve.io/v1alpha1',
 'kin...-with-ec5d4bfa'},
                       {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},
 'status': None}, 900)
kwargs = {}, func_name = 'wait_for_llm_isvc_ready'
timestamp_start = '2026-06-15T06:30:03.487652', start_time = 1781505003.4879394
duration = 901.1005208492279, timestamp_end = '2026-06-15T06:45:04.588475'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f1922bed290&gt;
given = {'api_version': 'serving.kserve.io/v1alpha1',
 'kind': 'LLMInferenceService',
 'metadata': {'annotations': {'security....router-with-ec5d4bfa'},
                       {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},
 'status': None}
timeout_seconds = 900

    @log_execution
    def wait_for_llm_isvc_ready(
        kserve_client: KServeClient,
        given: V1alpha1LLMInferenceService,
        timeout_seconds: int = 900,
    ) -&gt; str:
        def assert_llm_isvc_ready():
            out = get_llmisvc(
                kserve_client,
                given.metadata.name,
                given.metadata.namespace,
                given.api_version.split("/")[1],
            )
    
            if "status" not in out:
                raise AssertionError("No status found in LLM inference service")
    
            status = out["status"]
            if "conditions" not in status:
                raise AssertionError("No conditions found in status")
    
            expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
            got_true_conditions = set()
    
            conditions = status["conditions"]
    
            for condition in conditions:
                if condition.get("status") == "True":
                    got_true_conditions.add(condition.get("type"))
    
            missing_conditions = expected_true_conditions - got_true_conditions
            if missing_conditions:
                raise AssertionError(
                    f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
                )
            return True
    
&gt;       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)

llmisvc/test_llm_inference_service.py:1115: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_llm_isvc_ready.&lt;locals&gt;.assert_llm_isvc_ready at 0x7f1922fb9ee0&gt;
timeout = 900, interval = 1.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def assert_llm_isvc_ready():
        out = get_llmisvc(
            kserve_client,
            given.metadata.name,
            given.metadata.namespace,
            given.api_version.split("/")[1],
        )
    
        if "status" not in out:
            raise AssertionError("No status found in LLM inference service")
    
        status = out["status"]
        if "conditions" not in status:
            raise AssertionError("No conditions found in status")
    
        expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
        got_true_conditions = set()
    
        conditions = status["conditions"]
    
        for condition in conditions:
            if condition.get("status") == "True":
                got_true_conditions.add(condition.get("type"))
    
        missing_conditions = expected_true_conditions - got_true_conditions
        if missing_conditions:
&gt;           raise AssertionError(
                f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
            )
E           AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:30:54Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'status': 'True', 'type': 'WorkloadsReady'}]

llmisvc/test_llm_inference_service.py:1110: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf0]" time="902.771"><failure message="AssertionError: Missing true conditions: {'Ready', 'WorkloadsReady'}, expected {'Ready', 'RouterReady', 'WorkloadsReady'}, got [{'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:16Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]">test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt='KServe is a', ...opt-125m-with-lora-hf-a7886ead'}]},
 'status': None}, model_name='publishers/kserve-ci-e2e-test/models/lora-adapter-1')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
&gt;           wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )

llmisvc/test_llm_inference_service.py:723: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f7425babd10&gt;, {'api_version': 'serving.kserve.io/v1alpha1',
 'kin...vc-mod-495991f8'},
                       {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]},
 'status': None}, 900)
kwargs = {}, func_name = 'wait_for_llm_isvc_ready'
timestamp_start = '2026-06-15T06:44:39.921048', start_time = 1781505879.9213114
duration = 900.540575504303, timestamp_end = '2026-06-15T06:59:40.461894'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f7425babd10&gt;
given = {'api_version': 'serving.kserve.io/v1alpha1',
 'kind': 'LLMInferenceService',
 'metadata': {'annotations': {'security....-llmisvc-mod-495991f8'},
                       {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]},
 'status': None}
timeout_seconds = 900

    @log_execution
    def wait_for_llm_isvc_ready(
        kserve_client: KServeClient,
        given: V1alpha1LLMInferenceService,
        timeout_seconds: int = 900,
    ) -&gt; str:
        def assert_llm_isvc_ready():
            out = get_llmisvc(
                kserve_client,
                given.metadata.name,
                given.metadata.namespace,
                given.api_version.split("/")[1],
            )
    
            if "status" not in out:
                raise AssertionError("No status found in LLM inference service")
    
            status = out["status"]
            if "conditions" not in status:
                raise AssertionError("No conditions found in status")
    
            expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
            got_true_conditions = set()
    
            conditions = status["conditions"]
    
            for condition in conditions:
                if condition.get("status") == "True":
                    got_true_conditions.add(condition.get("type"))
    
            missing_conditions = expected_true_conditions - got_true_conditions
            if missing_conditions:
                raise AssertionError(
                    f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
                )
            return True
    
&gt;       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)

llmisvc/test_llm_inference_service.py:1115: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_llm_isvc_ready.&lt;locals&gt;.assert_llm_isvc_ready at 0x7f7425e99d00&gt;
timeout = 900, interval = 1.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def assert_llm_isvc_ready():
        out = get_llmisvc(
            kserve_client,
            given.metadata.name,
            given.metadata.namespace,
            given.api_version.split("/")[1],
        )
    
        if "status" not in out:
            raise AssertionError("No status found in LLM inference service")
    
        status = out["status"]
        if "conditions" not in status:
            raise AssertionError("No conditions found in status")
    
        expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
        got_true_conditions = set()
    
        conditions = status["conditions"]
    
        for condition in conditions:
            if condition.get("status") == "True":
                got_true_conditions.add(condition.get("type"))
    
        missing_conditions = expected_true_conditions - got_true_conditions
        if missing_conditions:
&gt;           raise AssertionError(
                f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
            )
E           AssertionError: Missing true conditions: {'Ready', 'WorkloadsReady'}, expected {'Ready', 'RouterReady', 'WorkloadsReady'}, got [{'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:16Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]

llmisvc/test_llm_inference_service.py:1110: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-pd-cpu-model-fb-opt-125m]" time="221.863" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-custom-route-timeout-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]" time="203.477" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-refs-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]" time="905.696"><failure message="AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-pd-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:54:37Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PrefillWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: &quot;False&quot; (reason &quot;InvalidKind&quot;, message &quot;referencing unsupported backendRef: group \\&quot;inference.networking.x-k8s.io\\&quot; kind \\&quot;InferencePool\\&quot;&quot;)]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:53:00Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'status': 'True', 'type': 'WorkloadsReady'}]">test_case = TestCase(base_refs=['router-with-refs-pd', 'scheduler-managed', 'workload-pd-cpu', 'model-fb-opt-125m'], prompt='You a...              {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
&gt;           wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )

llmisvc/test_llm_inference_service.py:723: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f192340b750&gt;, {'api_version': 'serving.kserve.io/v1alpha1',
 'kin...h-ref-d1f07093'},
                       {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},
 'status': None}, 900)
kwargs = {}, func_name = 'wait_for_llm_isvc_ready'
timestamp_start = '2026-06-15T06:52:14.786869', start_time = 1781506334.7871306
duration = 900.4981956481934, timestamp_end = '2026-06-15T07:07:15.285340'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f192340b750&gt;
given = {'api_version': 'serving.kserve.io/v1alpha1',
 'kind': 'LLMInferenceService',
 'metadata': {'annotations': {'security....er-with-ref-d1f07093'},
                       {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},
 'status': None}
timeout_seconds = 900

    @log_execution
    def wait_for_llm_isvc_ready(
        kserve_client: KServeClient,
        given: V1alpha1LLMInferenceService,
        timeout_seconds: int = 900,
    ) -&gt; str:
        def assert_llm_isvc_ready():
            out = get_llmisvc(
                kserve_client,
                given.metadata.name,
                given.metadata.namespace,
                given.api_version.split("/")[1],
            )
    
            if "status" not in out:
                raise AssertionError("No status found in LLM inference service")
    
            status = out["status"]
            if "conditions" not in status:
                raise AssertionError("No conditions found in status")
    
            expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
            got_true_conditions = set()
    
            conditions = status["conditions"]
    
            for condition in conditions:
                if condition.get("status") == "True":
                    got_true_conditions.add(condition.get("type"))
    
            missing_conditions = expected_true_conditions - got_true_conditions
            if missing_conditions:
                raise AssertionError(
                    f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
                )
            return True
    
&gt;       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)

llmisvc/test_llm_inference_service.py:1115: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_llm_isvc_ready.&lt;locals&gt;.assert_llm_isvc_ready at 0x7f1922f6b240&gt;
timeout = 900, interval = 1.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def assert_llm_isvc_ready():
        out = get_llmisvc(
            kserve_client,
            given.metadata.name,
            given.metadata.namespace,
            given.api_version.split("/")[1],
        )
    
        if "status" not in out:
            raise AssertionError("No status found in LLM inference service")
    
        status = out["status"]
        if "conditions" not in status:
            raise AssertionError("No conditions found in status")
    
        expected_true_conditions = {"Ready", "WorkloadsReady", "RouterReady"}
        got_true_conditions = set()
    
        conditions = status["conditions"]
    
        for condition in conditions:
            if condition.get("status") == "True":
                got_true_conditions.add(condition.get("type"))
    
        missing_conditions = expected_true_conditions - got_true_conditions
        if missing_conditions:
&gt;           raise AssertionError(
                f"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}"
            )
E           AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-pd-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:54:37Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PrefillWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: "False" (reason "InvalidKind", message "referencing unsupported backendRef: group \\"inference.networking.x-k8s.io\\" kind \\"InferencePool\\"")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:53:00Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'status': 'True', 'type': 'WorkloadsReady'}]

llmisvc/test_llm_inference_service.py:1110: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf1]" time="1030.931"><failure message="AssertionError: Service returned 401:">test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n...               {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},
 'status': None}, model_name='facebook/opt-125m')

    @pytest.mark.llminferenceservice
    @pytest.mark.asyncio(loop_scope="session")
    @pytest.mark.parametrize(
        "test_case",
        [
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-gateway-ref",
                        "router-with-managed-route",
                        "model-fb-opt-125m",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="custom-route-timeout-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs",
                        "scheduler-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="KServe is a",
                    service_name="router-with-refs-test",
                    expected_gateway=ROUTER_GATEWAYS[0],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[0]],
                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-custom-route-timeout-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="custom-route-timeout-pd-test",
                    response_assertion=assert_200_with_choices,
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-with-refs-pd",
                        "scheduler-managed",
                        "workload-pd-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. "
                    "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. "
                    "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.",
                    service_name="router-with-refs-pd-test",
                    response_assertion=assert_200_with_choices,
                    expected_gateway=ROUTER_GATEWAYS[1],
                    before_test=[
                        lambda: create_router_resources(
                            gateways=[ROUTER_GATEWAYS[1]],
                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],
                        )
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.custom_gateway,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-dp-ep-gpu",
                        "workload-dp-ep-prefill-gpu",
                        "model-deepseek-v2-lite",
                    ],
                    prompt="Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically "
                    "where the compute plane (P) and the data plane (D) are independently deployed and managed for a "
                    "geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the "
                    "fundamental challenges of network latency and data consistency, elaborate on the advanced "
                    "considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: "
                    "How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to "
                    "evolve to support optimal performance and minimize inter-plane communication overhead, especially for "
                    "synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically "
                    "optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: "
                    "Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) "
                    "and their applicability in balancing performance and data integrity across a globally distributed data plane. "
                    "Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, "
                    "intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. "
                    "3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently "
                    "manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, "
                    "cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). "
                    "Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on "
                    "workload patterns and data locality, potentially involving live migration strategies. "
                    "4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter "
                    "challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), "
                    "fine-grained access control to data at rest and in motion, and identity management across disaggregated "
                    "components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) "
                    "concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: "
                    "Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and "
                    "data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) "
                    "would be essential? How would incident response and troubleshooting differ in this disaggregated environment "
                    "compared to traditional integrated systems? Consider the challenges of pinpointing root causes across "
                    "independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries "
                    "or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) "
                    "where the benefits of P/D disaggregation would strongly outweigh its complexities. "
                    "Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions "
                    "directly interacting with object storage, in-memory disaggregation) that could further drive or "
                    "transform P/D disaggregation in cloud computing.",
                    max_tokens=2000,
                ),
                marks=[
                    pytest.mark.cluster_gpu,
                    pytest.mark.cluster_nvidia,
                    pytest.mark.cluster_nvidia_roce,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-no-scheduler",
                        "workload-single-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="What is KServe?",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.no_scheduler,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-simulated-dp-ep-cpu",
                        "model-fb-opt-125m",
                    ],
                    prompt="This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, "
                    "but without the resources requirements for DP+EP (GPUs and ROCe/IB).",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],
            ),
            # Scheduler config tests
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-inline-config",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-inline-config-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Chat completions endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                        "model-qwen2.5-0.5b",
                    ],
                    model_name="Qwen/Qwen2.5-0.5B-Instruct",
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=create_response_assertion(with_field="choices"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-configmap-ref",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-configmap-ref-test",
                    before_test=[create_scheduler_configmap],
                    after_test=[delete_scheduler_configmap],
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-replicas",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-ha-replicas-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-custom-template",
                        "workload-llmd-simulator",
                    ],
                    prompt="KServe is a",
                    service_name="scheduler-custom-template-test",
                ),
                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],
            ),
            # Precise prefix KV cache routing test
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "scheduler-with-precise-prefix-cache-inline-config",
                        "workload-llmd-simulator-kvcache",
                    ],
                    prompt="KServe is a",
                    service_name="precise-prefix-cache-test",
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Models endpoint coverage
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/models",
                    response_assertion=create_response_assertion(with_field="data"),
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/completions",
                            prompt="KServe is a",
                            payload_formatter=completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/chat/completions
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-llmd-simulator",
                    ],
                    endpoint="/v1/chat/completions",
                    prompt="What is KServe?",
                    payload_formatter=chat_completions_payload,
                    response_assertion=assert_model_field_matches("facebook/opt-125m"),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                    peers=[
                        TestCase(
                            base_refs=[
                                "router-managed",
                                "workload-llmd-simulator",
                                "model-qwen2.5-0.5b",
                            ],
                            endpoint="/v1/chat/completions",
                            prompt="What is KServe?",
                            payload_formatter=chat_completions_payload,
                            response_assertion=assert_model_field_matches(
                                "Qwen/Qwen2.5-0.5B-Instruct"
                            ),
                            url_getter=get_model_routing_url,
                            extra_headers={
                                MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct",
                            },
                        ),
                    ],
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.llmd_simulator,
                    pytest.mark.model_routing,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — LoRA adapter
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/completions",
                    prompt="KServe is a",
                    model_name=f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    payload_formatter=completions_payload,
                    response_assertion=assert_model_field_matches(
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1"
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
            # Model-based routing via X-Gateway-Model-Name header — /v1/models (base + LoRA)
            pytest.param(
                TestCase(
                    base_refs=[
                        "router-managed",
                        "workload-single-cpu",
                        "model-fb-opt-125m-with-lora-hf",
                    ],
                    endpoint="/v1/models",
                    response_assertion=assert_models_contains(
                        "facebook/opt-125m",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                        "lora-adapter-1",
                        f"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1",
                    ),
                    url_getter=get_model_routing_url,
                    extra_headers={
                        MODEL_ROUTING_HEADER: f"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m",
                    },
                ),
                marks=[
                    pytest.mark.cluster_cpu,
                    pytest.mark.cluster_single_node,
                    pytest.mark.model_routing,
                    pytest.mark.lora,
                ],
            ),
        ],
        indirect=["test_case"],
        ids=generate_test_id,
    )
    @log_execution
    def test_llm_inference_service(test_case: TestCase):  # noqa: F811
        inject_k8s_proxy()
    
        kserve_client = KServeClient(
            config_file=os.environ.get("KUBECONFIG", "~/.kube/config"),
            client_configuration=client.Configuration(),
        )
    
        service_name = test_case.llm_service.metadata.name
        if not test_case.llm_service.metadata.annotations:
            test_case.llm_service.metadata.annotations = {}
    
        test_case.llm_service.metadata.annotations[
            "security.opendatahub.io/enable-auth"
        ] = "false"
        prefix = test_case.log_prefix
    
        test_failed = False
        try:
            print(f"{prefix} Creating LLMInferenceService {service_name}")
            create_llmisvc(kserve_client, test_case.llm_service)
            print(f"{prefix} Waiting for LLMInferenceService {service_name} to be ready")
            wait_for_llm_isvc_ready(
                kserve_client, test_case.llm_service, test_case.wait_timeout
            )
            print(f"{prefix} Waiting for model response from {service_name}")
&gt;           wait_for_model_response(
                kserve_client,
                test_case,
                test_case.wait_timeout,
                extra_headers=test_case.extra_headers,
            )

llmisvc/test_llm_inference_service.py:727: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (&lt;kserve.api.kserve_client.KServeClient object at 0x7f7424fee210&gt;, TestCase(base_refs=['router-managed', 'workload-sin...         {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},
 'status': None}, model_name='facebook/opt-125m'), 900)
kwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}
func_name = 'wait_for_model_response'
timestamp_start = '2026-06-15T07:01:51.337397', start_time = 1781506911.3376548
duration = 900.2222678661346, timestamp_end = '2026-06-15T07:16:51.559924'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
    
        timestamp_start = datetime.now().isoformat()
        logger.info(
            f"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}"
        )
        start_time = time.time()
    
        try:
&gt;           result = func(*args, **kwargs)

llmisvc/logging.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

kserve_client = &lt;kserve.api.kserve_client.KServeClient object at 0x7f7424fee210&gt;
test_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n...               {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},
 'status': None}, model_name='facebook/opt-125m')
timeout_seconds = 900
extra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}

    @log_execution
    def wait_for_model_response(
        kserve_client: KServeClient,
        test_case: TestCase,  # noqa: F811
        timeout_seconds: int = 900,
        extra_headers: Optional[Dict[str, str]] = None,
    ) -&gt; str:
        def get_successful_response():
            try:
                if test_case.url_getter:
                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)
                else:
                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)
            except Exception as e:
                raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
            model_url = service_url + test_case.endpoint
    
            headers = {"Content-Type": "application/json"}
            if extra_headers:
                headers.update(extra_headers)
    
            if test_case.payload_formatter is not None:
                test_payload = test_case.payload_formatter(test_case)
            elif test_case.prompt is not None:
                test_payload = {
                    "model": test_case.model_name
                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                    else extra_headers[MODEL_ROUTING_HEADER],
                    "prompt": test_case.prompt,
                    "max_tokens": test_case.max_tokens,
                }
            else:
                test_payload = None
    
            logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
            try:
                if test_payload is not None:
                    response = post_with_retry(
                        model_url,
                        headers=headers,
                        json_data=test_payload,
                        timeout=test_case.response_timeout,
                    )
                else:
                    response = get_with_retry(
                        model_url,
                        headers=headers,
                        timeout=test_case.response_timeout,
                    )
            except Exception as e:
                logger.error(f"❌ Failed to call model: {e}")
                raise AssertionError(f"❌ Failed to call model: {e}") from e
    
            logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
            if 200 &lt;= response.status_code &lt; 300:
                return response
            raise AssertionError(
                f"Service returned {response.status_code}: {response.text}"
            )
    
&gt;       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)

llmisvc/test_llm_inference_service.py:1030: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

assertion_fn = &lt;function wait_for_model_response.&lt;locals&gt;.get_successful_response at 0x7f7425e9bec0&gt;
timeout = 900, interval = 5.0

    def wait_for(
        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1
    ) -&gt; Any:
        """Wait for the assertion to succeed within timeout."""
        deadline = time.time() + timeout
        last_msg = None
        while True:
            try:
&gt;               return assertion_fn()

llmisvc/test_llm_inference_service.py:1126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def get_successful_response():
        try:
            if test_case.url_getter:
                service_url = test_case.url_getter(kserve_client, test_case.llm_service)
            else:
                service_url = get_llm_service_url(kserve_client, test_case.llm_service)
        except Exception as e:
            raise AssertionError(f"❌ Failed to get service URL: {e}") from e
    
        model_url = service_url + test_case.endpoint
    
        headers = {"Content-Type": "application/json"}
        if extra_headers:
            headers.update(extra_headers)
    
        if test_case.payload_formatter is not None:
            test_payload = test_case.payload_formatter(test_case)
        elif test_case.prompt is not None:
            test_payload = {
                "model": test_case.model_name
                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers
                else extra_headers[MODEL_ROUTING_HEADER],
                "prompt": test_case.prompt,
                "max_tokens": test_case.max_tokens,
            }
        else:
            test_payload = None
    
        logger.info(f"Calling LLM service at {model_url} with payload {test_payload}")
        try:
            if test_payload is not None:
                response = post_with_retry(
                    model_url,
                    headers=headers,
                    json_data=test_payload,
                    timeout=test_case.response_timeout,
                )
            else:
                response = get_with_retry(
                    model_url,
                    headers=headers,
                    timeout=test_case.response_timeout,
                )
        except Exception as e:
            logger.error(f"❌ Failed to call model: {e}")
            raise AssertionError(f"❌ Failed to call model: {e}") from e
    
        logger.info(f"Model response is {response.status_code}: {response.text[:500]}")
    
        if 200 &lt;= response.status_code &lt; 300:
            return response
&gt;       raise AssertionError(
            f"Service returned {response.status_code}: {response.text}"
        )
E       AssertionError: Service returned 401:

llmisvc/test_llm_inference_service.py:1026: AssertionError</failure></testcase><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-no-scheduler-workload-single-cpu-model-fb-opt-125m]" time="181.490" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_multi_node-router-managed-workload-simulated-dp-ep-cpu-model-fb-opt-125m]" time="471.643" /><testcase classname="llmisvc.test_llm_inference_service_conversion.TestLLMInferenceServiceConversion" name="test_v1alpha1_to_v1alpha2_conversion" time="0.254" /><testcase classname="llmisvc.test_llm_inference_service_conversion.TestLLMInferenceServiceConversion" name="test_v1alpha2_to_v1alpha1_conversion" time="0.098" /><testcase classname="llmisvc.test_llm_inference_service_conversion.TestLLMInferenceServiceConversion" name="test_criticality_preservation_via_annotations" time="0.184" /><testcase classname="llmisvc.test_llm_inference_service_conversion.TestLLMInferenceServiceConversion" name="test_lora_criticality_preservation" time="0.208" /><testcase classname="llmisvc.test_llm_inference_service_conversion.TestLLMInferenceServiceConversion" name="test_round_trip_conversion_preserves_fields" time="0.141" /><testcase classname="llmisvc.test_llm_inference_service_stop" name="test_llm_stop_feature[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]" time="375.423" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-inline-config-workload-llmd-simulator]" time="68.308" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator-model-qwen2.5-0.5b]" time="108.420" /><testcase classname="llmisvc.test_llm_inference_service" name="test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-configmap-ref-workload-llmd-simulator]" time="66.107" /><testcase classname="llmisvc.test_llm_lora_adapters" name="test_llm_with_lora_adapters[cluster_cpu-single-lora-adapter-hf]" time="144.527" /><testcase classname="llmisvc.test_llm_lora_adapters" name="test_llm_with_lora_adapters[cluster_cpu-multiple-lora-adapters]" time="154.199" /><testcase classname="llmisvc.test_prestop_hook" name="test_prestop_hook[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]" time="239.959" /><testcase classname="llmisvc.test_storage_version_migration.TestStorageVersionMigration" name="test_storage_version_migration_after_simulated_upgrade" time="88.454" /></testsuite></testsuites>