{"created": 1781508817.0107176, "duration": 5514.600154399872, "exitcode": 2, "root": "/workspace/source/test/e2e", "environment": {}, "summary": {"passed": 28, "failed": 7, "total": 35, "collected": 35}, "collectors": [{"nodeid": "explainer/test_art_explainer.py", "outcome": "skipped", "result": [], "longrepr": "('/workspace/source/test/e2e/explainer/test_art_explainer.py', 38, 'Skipped: ODH does not support art explainer at the moment')"}, {"nodeid": "predictor/test_grpc.py", "outcome": "skipped", "result": [], "longrepr": "('/workspace/source/test/e2e/predictor/test_grpc.py', 35, 'Skipped: Not testable in ODH at the moment')"}, {"nodeid": "predictor/test_torchserve.py", "outcome": "skipped", "result": [], "longrepr": "('/workspace/source/test/e2e/predictor/test_torchserve.py', 34, 'Skipped: ODH does not support torchserve at the moment')"}], "tests": [{"nodeid": "llmisvc/test_gateway_section_name.py::test_gateway_section_name_propagation[cluster_single_node-cluster_cpu-with-section-name]", "lineno": 131, "outcome": "passed", "keywords": ["test_gateway_section_name_propagation[with-section-name]", "parametrize", "llmd_simulator", "cluster_single_node", "cluster_cpu", "llminferenceservice", "pytestmark", "with-section-name", "test_gateway_section_name.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.001252792000741465, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 33.85204442699978, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0008737319985812064, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-replicas-workload-llmd-simulator]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-scheduler-with-replicas-workload-llmd-simulator]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-scheduler-with-replicas-workload-llmd-simulator", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.2465630910010077, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 100.33356320599705, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.003601180000259774, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_gateway_section_name.py::test_gateway_section_name_propagation[cluster_single_node-cluster_cpu-without-section-name]", "lineno": 131, "outcome": "passed", "keywords": ["test_gateway_section_name_propagation[without-section-name]", "parametrize", "llmd_simulator", "cluster_single_node", "cluster_cpu", "llminferenceservice", "pytestmark", "without-section-name", "test_gateway_section_name.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.000390069999411935, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 49.56539492899901, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0005860740020580124, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_auth.py::test_llm_auth_enabled_requires_token[cluster_cpu-cluster_single_node-auth-enabled-default]", "lineno": 221, "outcome": "failed", "keywords": ["test_llm_auth_enabled_requires_token[auth-enabled-default]", "parametrize", "auth", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "auth-enabled-default", "test_llm_auth.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.4054372450009396, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 902.130330410997, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError: Missing true conditions: {'WorkloadsReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]"}, "traceback": [{"path": "llmisvc/test_llm_auth.py", "lineno": 275, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1115, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError"}], "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KServe is a', service_name=...               {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.auth\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"auth-enabled-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                ],\n                id=\"auth-enabled-default\",\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_auth_enabled_requires_token(test_case: TestCase):  # noqa: F811\n        \"\"\"\n        Test that when auth is enabled (default):\n        - Requests WITH valid token succeed\n        - Requests WITHOUT token are rejected (401/403)\n        \"\"\"\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        sa_name = f\"{service_name}-test-sa\"\n        test_failed = False\n    \n        # Enable auth for this test\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"true\"\n    \n        try:\n            # Create LLMInferenceService\n            create_llmisvc(kserve_client, test_case.llm_service)\n>           wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n\nllmisvc/test_llm_auth.py:275: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90>, {'api_version': 'serving.kserve.io/v1alpha1',\n 'kin...enable-a18fd8e2'},\n                       {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},\n 'status': None}, 900)\nkwargs = {}, func_name = 'wait_for_llm_isvc_ready'\ntimestamp_start = '2026-06-15T06:03:08.552722', start_time = 1781503388.5530953\nduration = 900.3913719654083, timestamp_end = '2026-06-15T06:18:08.944471'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f1922e8cc90>\ngiven = {'api_version': 'serving.kserve.io/v1alpha1',\n 'kind': 'LLMInferenceService',\n 'metadata': {'annotations': {'security....-auth-enable-a18fd8e2'},\n                       {'name': 'model-fb-opt-125m-auth-enabled-89f54b63'}]},\n 'status': None}\ntimeout_seconds = 900\n\n    @log_execution\n    def wait_for_llm_isvc_ready(\n        kserve_client: KServeClient,\n        given: V1alpha1LLMInferenceService,\n        timeout_seconds: int = 900,\n    ) -> str:\n        def assert_llm_isvc_ready():\n            out = get_llmisvc(\n                kserve_client,\n                given.metadata.name,\n                given.metadata.namespace,\n                given.api_version.split(\"/\")[1],\n            )\n    \n            if \"status\" not in out:\n                raise AssertionError(\"No status found in LLM inference service\")\n    \n            status = out[\"status\"]\n            if \"conditions\" not in status:\n                raise AssertionError(\"No conditions found in status\")\n    \n            expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n            got_true_conditions = set()\n    \n            conditions = status[\"conditions\"]\n    \n            for condition in conditions:\n                if condition.get(\"status\") == \"True\":\n                    got_true_conditions.add(condition.get(\"type\"))\n    \n            missing_conditions = expected_true_conditions - got_true_conditions\n            if missing_conditions:\n                raise AssertionError(\n                    f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n                )\n            return True\n    \n>       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)\n\nllmisvc/test_llm_inference_service.py:1115: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922fbafc0>\ntimeout = 900, interval = 1.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def assert_llm_isvc_ready():\n        out = get_llmisvc(\n            kserve_client,\n            given.metadata.name,\n            given.metadata.namespace,\n            given.api_version.split(\"/\")[1],\n        )\n    \n        if \"status\" not in out:\n            raise AssertionError(\"No status found in LLM inference service\")\n    \n        status = out[\"status\"]\n        if \"conditions\" not in status:\n            raise AssertionError(\"No conditions found in status\")\n    \n        expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n        got_true_conditions = set()\n    \n        conditions = status[\"conditions\"]\n    \n        for condition in conditions:\n            if condition.get(\"status\") == \"True\":\n                got_true_conditions.add(condition.get(\"type\"))\n    \n        missing_conditions = expected_true_conditions - got_true_conditions\n        if missing_conditions:\n>           raise AssertionError(\n                f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n            )\nE           AssertionError: Missing true conditions: {'WorkloadsReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:03:44Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:03:17Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]\n\nllmisvc/test_llm_inference_service.py:1110: AssertionError"}, "teardown": {"duration": 0.003153519002808025, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-custom-template-workload-llmd-simulator]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-scheduler-with-custom-template-workload-llmd-simulator]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-scheduler-with-custom-template-workload-llmd-simulator", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.33688229299878003, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 61.20490736799911, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.004286047002096893, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-precise-prefix-cache-inline-config-workload-llmd-simulator-kvcache]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-scheduler-with-precise-prefix-cache-inline-config-workload-llmd-simulator-kvcache]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "__wrapped__", "pytestmark", "router-managed-scheduler-with-precise-prefix-cache-inline-config-workload-llmd-simulator-kvcache", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.12322452700027497, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 63.663346779001586, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.002617376001580851, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator0]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-workload-llmd-simulator0]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "__wrapped__", "pytestmark", "router-managed-workload-llmd-simulator0", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.09751723299996229, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 60.53752650400202, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.003010996002558386, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator1]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-managed-workload-llmd-simulator1]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "model_routing", "__wrapped__", "pytestmark", "router-managed-workload-llmd-simulator1", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.17846723899856443, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 1144.3786262499998, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 727, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1030, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError"}], "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m...              {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n            wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n            print(f\"{prefix} Waiting for model response from {service_name}\")\n>           wait_for_model_response(\n                kserve_client,\n                test_case,\n                test_case.wait_timeout,\n                extra_headers=test_case.extra_headers,\n            )\n\nllmisvc/test_llm_inference_service.py:727: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f7425d76390>, TestCase(base_refs=['router-managed', 'workload-llm...        {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},\n 'status': None}, model_name='facebook/opt-125m'), 900)\nkwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}\nfunc_name = 'wait_for_model_response'\ntimestamp_start = '2026-06-15T06:07:11.714988', start_time = 1781503631.7154357\nduration = 1102.550819158554, timestamp_end = '2026-06-15T06:25:34.266259'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7425d76390>\ntest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='KServe is a', service_name='llmisvc-router-m...              {'name': 'workload-llmd-simulator-llmisvc-8461fd55'}]},\n 'status': None}, model_name='facebook/opt-125m')\ntimeout_seconds = 900\nextra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}\n\n    @log_execution\n    def wait_for_model_response(\n        kserve_client: KServeClient,\n        test_case: TestCase,  # noqa: F811\n        timeout_seconds: int = 900,\n        extra_headers: Optional[Dict[str, str]] = None,\n    ) -> str:\n        def get_successful_response():\n            try:\n                if test_case.url_getter:\n                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n                else:\n                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n            except Exception as e:\n                raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n            model_url = service_url + test_case.endpoint\n    \n            headers = {\"Content-Type\": \"application/json\"}\n            if extra_headers:\n                headers.update(extra_headers)\n    \n            if test_case.payload_formatter is not None:\n                test_payload = test_case.payload_formatter(test_case)\n            elif test_case.prompt is not None:\n                test_payload = {\n                    \"model\": test_case.model_name\n                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                    else extra_headers[MODEL_ROUTING_HEADER],\n                    \"prompt\": test_case.prompt,\n                    \"max_tokens\": test_case.max_tokens,\n                }\n            else:\n                test_payload = None\n    \n            logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n            try:\n                if test_payload is not None:\n                    response = post_with_retry(\n                        model_url,\n                        headers=headers,\n                        json_data=test_payload,\n                        timeout=test_case.response_timeout,\n                    )\n                else:\n                    response = get_with_retry(\n                        model_url,\n                        headers=headers,\n                        timeout=test_case.response_timeout,\n                    )\n            except Exception as e:\n                logger.error(f\"\u274c Failed to call model: {e}\")\n                raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n            logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n            if 200 <= response.status_code < 300:\n                return response\n            raise AssertionError(\n                f\"Service returned {response.status_code}: {response.text}\"\n            )\n    \n>       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)\n\nllmisvc/test_llm_inference_service.py:1030: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e9a020>\ntimeout = 900, interval = 5.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def get_successful_response():\n        try:\n            if test_case.url_getter:\n                service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n            else:\n                service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n        except Exception as e:\n            raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n        model_url = service_url + test_case.endpoint\n    \n        headers = {\"Content-Type\": \"application/json\"}\n        if extra_headers:\n            headers.update(extra_headers)\n    \n        if test_case.payload_formatter is not None:\n            test_payload = test_case.payload_formatter(test_case)\n        elif test_case.prompt is not None:\n            test_payload = {\n                \"model\": test_case.model_name\n                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                else extra_headers[MODEL_ROUTING_HEADER],\n                \"prompt\": test_case.prompt,\n                \"max_tokens\": test_case.max_tokens,\n            }\n        else:\n            test_payload = None\n    \n        logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n        try:\n            if test_payload is not None:\n                response = post_with_retry(\n                    model_url,\n                    headers=headers,\n                    json_data=test_payload,\n                    timeout=test_case.response_timeout,\n                )\n            else:\n                response = get_with_retry(\n                    model_url,\n                    headers=headers,\n                    timeout=test_case.response_timeout,\n                )\n        except Exception as e:\n            logger.error(f\"\u274c Failed to call model: {e}\")\n            raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n        logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n        if 200 <= response.status_code < 300:\n            return response\n>       raise AssertionError(\n            f\"Service returned {response.status_code}: {response.text}\"\n        )\nE       AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request\n\nllmisvc/test_llm_inference_service.py:1026: AssertionError"}, "teardown": {"duration": 0.0018814970026141964, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_auth.py::test_llm_auth_invalid_token_rejected[cluster_cpu-cluster_single_node-auth-invalid-token]", "lineno": 380, "outcome": "passed", "keywords": ["test_llm_auth_invalid_token_rejected[auth-invalid-token]", "parametrize", "auth", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "auth-invalid-token", "test_llm_auth.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.14247033900028327, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 176.709795707, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0028477690029831138, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_auth.py::test_llm_auth_disabled_no_token_required[cluster_cpu-cluster_single_node-auth-disabled]", "lineno": 511, "outcome": "passed", "keywords": ["test_llm_auth_disabled_no_token_required[auth-disabled]", "parametrize", "auth", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "auth-disabled", "test_llm_auth.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.12864859199908096, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 156.9434969190006, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0028306700005487073, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-gateway-ref-router-with-managed-route-model-fb-opt-125m-workload-llmd-simulator]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-with-gateway-ref-router-with-managed-route-model-fb-opt-125m-workload-llmd-simulator]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "custom_gateway", "__wrapped__", "pytestmark", "router-with-gateway-ref-router-with-managed-route-model-fb-opt-125m-workload-llmd-simulator", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.7404222280019894, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 54.22375963599916, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0016966630028036889, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-workload-single-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.12040557799991802, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 171.99445789899983, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0031874889973551035, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator2]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-managed-workload-llmd-simulator2]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "model_routing", "__wrapped__", "pytestmark", "router-managed-workload-llmd-simulator2", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.1611487349982781, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 1143.396674976997, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 727, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1030, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError"}], "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout...              {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n            wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n            print(f\"{prefix} Waiting for model response from {service_name}\")\n>           wait_for_model_response(\n                kserve_client,\n                test_case,\n                test_case.wait_timeout,\n                extra_headers=test_case.extra_headers,\n            )\n\nllmisvc/test_llm_inference_service.py:727: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0>, TestCase(base_refs=['router-managed', 'workload-llm...        {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},\n 'status': None}, model_name='facebook/opt-125m'), 900)\nkwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}\nfunc_name = 'wait_for_model_response'\ntimestamp_start = '2026-06-15T06:26:15.484944', start_time = 1781504775.4854898\nduration = 1102.5697031021118, timestamp_end = '2026-06-15T06:44:38.055197'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7426f541d0>\ntest_case = TestCase(base_refs=['router-managed', 'workload-llmd-simulator'], prompt='What is KServe?', service_name='llmisvc-rout...              {'name': 'workload-llmd-simulator-llmisvc-53a6ad30'}]},\n 'status': None}, model_name='facebook/opt-125m')\ntimeout_seconds = 900\nextra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}\n\n    @log_execution\n    def wait_for_model_response(\n        kserve_client: KServeClient,\n        test_case: TestCase,  # noqa: F811\n        timeout_seconds: int = 900,\n        extra_headers: Optional[Dict[str, str]] = None,\n    ) -> str:\n        def get_successful_response():\n            try:\n                if test_case.url_getter:\n                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n                else:\n                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n            except Exception as e:\n                raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n            model_url = service_url + test_case.endpoint\n    \n            headers = {\"Content-Type\": \"application/json\"}\n            if extra_headers:\n                headers.update(extra_headers)\n    \n            if test_case.payload_formatter is not None:\n                test_payload = test_case.payload_formatter(test_case)\n            elif test_case.prompt is not None:\n                test_payload = {\n                    \"model\": test_case.model_name\n                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                    else extra_headers[MODEL_ROUTING_HEADER],\n                    \"prompt\": test_case.prompt,\n                    \"max_tokens\": test_case.max_tokens,\n                }\n            else:\n                test_payload = None\n    \n            logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n            try:\n                if test_payload is not None:\n                    response = post_with_retry(\n                        model_url,\n                        headers=headers,\n                        json_data=test_payload,\n                        timeout=test_case.response_timeout,\n                    )\n                else:\n                    response = get_with_retry(\n                        model_url,\n                        headers=headers,\n                        timeout=test_case.response_timeout,\n                    )\n            except Exception as e:\n                logger.error(f\"\u274c Failed to call model: {e}\")\n                raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n            logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n            if 200 <= response.status_code < 300:\n                return response\n            raise AssertionError(\n                f\"Service returned {response.status_code}: {response.text}\"\n            )\n    \n>       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)\n\nllmisvc/test_llm_inference_service.py:1030: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e99760>\ntimeout = 900, interval = 5.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def get_successful_response():\n        try:\n            if test_case.url_getter:\n                service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n            else:\n                service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n        except Exception as e:\n            raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n        model_url = service_url + test_case.endpoint\n    \n        headers = {\"Content-Type\": \"application/json\"}\n        if extra_headers:\n            headers.update(extra_headers)\n    \n        if test_case.payload_formatter is not None:\n            test_payload = test_case.payload_formatter(test_case)\n        elif test_case.prompt is not None:\n            test_payload = {\n                \"model\": test_case.model_name\n                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                else extra_headers[MODEL_ROUTING_HEADER],\n                \"prompt\": test_case.prompt,\n                \"max_tokens\": test_case.max_tokens,\n            }\n        else:\n            test_payload = None\n    \n        logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n        try:\n            if test_payload is not None:\n                response = post_with_retry(\n                    model_url,\n                    headers=headers,\n                    json_data=test_payload,\n                    timeout=test_case.response_timeout,\n                )\n            else:\n                response = get_with_retry(\n                    model_url,\n                    headers=headers,\n                    timeout=test_case.response_timeout,\n                )\n        except Exception as e:\n            logger.error(f\"\u274c Failed to call model: {e}\")\n            raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n        logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n        if 200 <= response.status_code < 300:\n            return response\n>       raise AssertionError(\n            f\"Service returned {response.status_code}: {response.text}\"\n        )\nE       AssertionError: Service returned 503: inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request\n\nllmisvc/test_llm_inference_service.py:1026: AssertionError"}, "teardown": {"duration": 0.0017117419993155636, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-custom-route-timeout-scheduler-managed-workload-single-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-custom-route-timeout-scheduler-managed-workload-single-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-custom-route-timeout-scheduler-managed-workload-single-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.15262392499789712, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 147.85951695199765, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0018754869997792412, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-refs-scheduler-managed-workload-single-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-with-refs-scheduler-managed-workload-single-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "custom_gateway", "__wrapped__", "pytestmark", "router-with-refs-scheduler-managed-workload-single-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 2.321730748000846, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 903.9187642770012, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:30:54Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'status': 'True', 'type': 'WorkloadsReady'}]"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 723, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1115, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError"}], "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-with-refs', 'scheduler-managed', 'workload-single-cpu', 'model-fb-opt-125m'], prompt='KSer...              {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n>           wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n\nllmisvc/test_llm_inference_service.py:723: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f1922bed290>, {'api_version': 'serving.kserve.io/v1alpha1',\n 'kin...-with-ec5d4bfa'},\n                       {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},\n 'status': None}, 900)\nkwargs = {}, func_name = 'wait_for_llm_isvc_ready'\ntimestamp_start = '2026-06-15T06:30:03.487652', start_time = 1781505003.4879394\nduration = 901.1005208492279, timestamp_end = '2026-06-15T06:45:04.588475'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f1922bed290>\ngiven = {'api_version': 'serving.kserve.io/v1alpha1',\n 'kind': 'LLMInferenceService',\n 'metadata': {'annotations': {'security....router-with-ec5d4bfa'},\n                       {'name': 'model-fb-opt-125m-router-with-r-6d64416a'}]},\n 'status': None}\ntimeout_seconds = 900\n\n    @log_execution\n    def wait_for_llm_isvc_ready(\n        kserve_client: KServeClient,\n        given: V1alpha1LLMInferenceService,\n        timeout_seconds: int = 900,\n    ) -> str:\n        def assert_llm_isvc_ready():\n            out = get_llmisvc(\n                kserve_client,\n                given.metadata.name,\n                given.metadata.namespace,\n                given.api_version.split(\"/\")[1],\n            )\n    \n            if \"status\" not in out:\n                raise AssertionError(\"No status found in LLM inference service\")\n    \n            status = out[\"status\"]\n            if \"conditions\" not in status:\n                raise AssertionError(\"No conditions found in status\")\n    \n            expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n            got_true_conditions = set()\n    \n            conditions = status[\"conditions\"]\n    \n            for condition in conditions:\n                if condition.get(\"status\") == \"True\":\n                    got_true_conditions.add(condition.get(\"type\"))\n    \n            missing_conditions = expected_true_conditions - got_true_conditions\n            if missing_conditions:\n                raise AssertionError(\n                    f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n                )\n            return True\n    \n>       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)\n\nllmisvc/test_llm_inference_service.py:1115: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922fb9ee0>\ntimeout = 900, interval = 1.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def assert_llm_isvc_ready():\n        out = get_llmisvc(\n            kserve_client,\n            given.metadata.name,\n            given.metadata.namespace,\n            given.api_version.split(\"/\")[1],\n        )\n    \n        if \"status\" not in out:\n            raise AssertionError(\"No status found in LLM inference service\")\n    \n        status = out[\"status\"]\n        if \"conditions\" not in status:\n            raise AssertionError(\"No conditions found in status\")\n    \n        expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n        got_true_conditions = set()\n    \n        conditions = status[\"conditions\"]\n    \n        for condition in conditions:\n            if condition.get(\"status\") == \"True\":\n                got_true_conditions.add(condition.get(\"type\"))\n    \n        missing_conditions = expected_true_conditions - got_true_conditions\n        if missing_conditions:\n>           raise AssertionError(\n                f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n            )\nE           AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:30:21Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-1: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:30:54Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:32:23Z', 'status': 'True', 'type': 'WorkloadsReady'}]\n\nllmisvc/test_llm_inference_service.py:1110: AssertionError"}, "teardown": {"duration": 0.0015322580002248287, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf0]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf0]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "model_routing", "lora", "__wrapped__", "pytestmark", "router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf0", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.2121409479987051, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 902.5550018219983, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError: Missing true conditions: {'Ready', 'WorkloadsReady'}, expected {'Ready', 'RouterReady', 'WorkloadsReady'}, got [{'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:16Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 723, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1115, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError"}], "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt='KServe is a', ...opt-125m-with-lora-hf-a7886ead'}]},\n 'status': None}, model_name='publishers/kserve-ci-e2e-test/models/lora-adapter-1')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n>           wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n\nllmisvc/test_llm_inference_service.py:723: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f7425babd10>, {'api_version': 'serving.kserve.io/v1alpha1',\n 'kin...vc-mod-495991f8'},\n                       {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]},\n 'status': None}, 900)\nkwargs = {}, func_name = 'wait_for_llm_isvc_ready'\ntimestamp_start = '2026-06-15T06:44:39.921048', start_time = 1781505879.9213114\nduration = 900.540575504303, timestamp_end = '2026-06-15T06:59:40.461894'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7425babd10>\ngiven = {'api_version': 'serving.kserve.io/v1alpha1',\n 'kind': 'LLMInferenceService',\n 'metadata': {'annotations': {'security....-llmisvc-mod-495991f8'},\n                       {'name': 'model-fb-opt-125m-with-lora-hf-a7886ead'}]},\n 'status': None}\ntimeout_seconds = 900\n\n    @log_execution\n    def wait_for_llm_isvc_ready(\n        kserve_client: KServeClient,\n        given: V1alpha1LLMInferenceService,\n        timeout_seconds: int = 900,\n    ) -> str:\n        def assert_llm_isvc_ready():\n            out = get_llmisvc(\n                kserve_client,\n                given.metadata.name,\n                given.metadata.namespace,\n                given.api_version.split(\"/\")[1],\n            )\n    \n            if \"status\" not in out:\n                raise AssertionError(\"No status found in LLM inference service\")\n    \n            status = out[\"status\"]\n            if \"conditions\" not in status:\n                raise AssertionError(\"No conditions found in status\")\n    \n            expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n            got_true_conditions = set()\n    \n            conditions = status[\"conditions\"]\n    \n            for condition in conditions:\n                if condition.get(\"status\") == \"True\":\n                    got_true_conditions.add(condition.get(\"type\"))\n    \n            missing_conditions = expected_true_conditions - got_true_conditions\n            if missing_conditions:\n                raise AssertionError(\n                    f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n                )\n            return True\n    \n>       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)\n\nllmisvc/test_llm_inference_service.py:1115: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f7425e99d00>\ntimeout = 900, interval = 1.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def assert_llm_isvc_ready():\n        out = get_llmisvc(\n            kserve_client,\n            given.metadata.name,\n            given.metadata.namespace,\n            given.api_version.split(\"/\")[1],\n        )\n    \n        if \"status\" not in out:\n            raise AssertionError(\"No status found in LLM inference service\")\n    \n        status = out[\"status\"]\n        if \"conditions\" not in status:\n            raise AssertionError(\"No conditions found in status\")\n    \n        expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n        got_true_conditions = set()\n    \n        conditions = status[\"conditions\"]\n    \n        for condition in conditions:\n            if condition.get(\"status\") == \"True\":\n                got_true_conditions.add(condition.get(\"type\"))\n    \n        missing_conditions = expected_true_conditions - got_true_conditions\n        if missing_conditions:\n>           raise AssertionError(\n                f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n            )\nE           AssertionError: Missing true conditions: {'Ready', 'WorkloadsReady'}, expected {'Ready', 'RouterReady', 'WorkloadsReady'}, got [{'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'severity': 'Info', 'status': 'True', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'severity': 'Info', 'status': 'False', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:16Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'status': 'True', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:45:52Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:45:40Z', 'message': 'Deployment does not have minimum availability.', 'reason': 'MinimumReplicasUnavailable', 'status': 'False', 'type': 'WorkloadsReady'}]\n\nllmisvc/test_llm_inference_service.py:1110: AssertionError"}, "teardown": {"duration": 0.0034667449981498066, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-pd-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-workload-pd-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-workload-pd-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.35954627499813796, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 221.4993557739981, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.003605027999583399, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-custom-route-timeout-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-custom-route-timeout-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-custom-route-timeout-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.20046824800010654, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 203.27413807899939, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0028695500004687347, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-with-refs-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-with-refs-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "custom_gateway", "__wrapped__", "pytestmark", "router-with-refs-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 2.6804630330007058, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 903.0143166149974, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-pd-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:54:37Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PrefillWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:53:00Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'status': 'True', 'type': 'WorkloadsReady'}]"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 723, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1115, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1110, "message": "AssertionError"}], "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-with-refs-pd', 'scheduler-managed', 'workload-pd-cpu', 'model-fb-opt-125m'], prompt='You a...              {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n>           wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n\nllmisvc/test_llm_inference_service.py:723: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f192340b750>, {'api_version': 'serving.kserve.io/v1alpha1',\n 'kin...h-ref-d1f07093'},\n                       {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},\n 'status': None}, 900)\nkwargs = {}, func_name = 'wait_for_llm_isvc_ready'\ntimestamp_start = '2026-06-15T06:52:14.786869', start_time = 1781506334.7871306\nduration = 900.4981956481934, timestamp_end = '2026-06-15T07:07:15.285340'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f192340b750>\ngiven = {'api_version': 'serving.kserve.io/v1alpha1',\n 'kind': 'LLMInferenceService',\n 'metadata': {'annotations': {'security....er-with-ref-d1f07093'},\n                       {'name': 'model-fb-opt-125m-router-with-r-c22ea8a0'}]},\n 'status': None}\ntimeout_seconds = 900\n\n    @log_execution\n    def wait_for_llm_isvc_ready(\n        kserve_client: KServeClient,\n        given: V1alpha1LLMInferenceService,\n        timeout_seconds: int = 900,\n    ) -> str:\n        def assert_llm_isvc_ready():\n            out = get_llmisvc(\n                kserve_client,\n                given.metadata.name,\n                given.metadata.namespace,\n                given.api_version.split(\"/\")[1],\n            )\n    \n            if \"status\" not in out:\n                raise AssertionError(\"No status found in LLM inference service\")\n    \n            status = out[\"status\"]\n            if \"conditions\" not in status:\n                raise AssertionError(\"No conditions found in status\")\n    \n            expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n            got_true_conditions = set()\n    \n            conditions = status[\"conditions\"]\n    \n            for condition in conditions:\n                if condition.get(\"status\") == \"True\":\n                    got_true_conditions.add(condition.get(\"type\"))\n    \n            missing_conditions = expected_true_conditions - got_true_conditions\n            if missing_conditions:\n                raise AssertionError(\n                    f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n                )\n            return True\n    \n>       return wait_for(assert_llm_isvc_ready, timeout=timeout_seconds, interval=1.0)\n\nllmisvc/test_llm_inference_service.py:1115: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_llm_isvc_ready.<locals>.assert_llm_isvc_ready at 0x7f1922f6b240>\ntimeout = 900, interval = 1.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def assert_llm_isvc_ready():\n        out = get_llmisvc(\n            kserve_client,\n            given.metadata.name,\n            given.metadata.namespace,\n            given.api_version.split(\"/\")[1],\n        )\n    \n        if \"status\" not in out:\n            raise AssertionError(\"No status found in LLM inference service\")\n    \n        status = out[\"status\"]\n        if \"conditions\" not in status:\n            raise AssertionError(\"No conditions found in status\")\n    \n        expected_true_conditions = {\"Ready\", \"WorkloadsReady\", \"RouterReady\"}\n        got_true_conditions = set()\n    \n        conditions = status[\"conditions\"]\n    \n        for condition in conditions:\n            if condition.get(\"status\") == \"True\":\n                got_true_conditions.add(condition.get(\"type\"))\n    \n        missing_conditions = expected_true_conditions - got_true_conditions\n        if missing_conditions:\n>           raise AssertionError(\n                f\"Missing true conditions: {missing_conditions}, expected {expected_true_conditions}, got {conditions}\"\n            )\nE           AssertionError: Missing true conditions: {'RouterReady', 'Ready'}, expected {'RouterReady', 'WorkloadsReady', 'Ready'}, got [{'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'GatewaysReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'severity': 'Info', 'status': 'False', 'type': 'HTTPRoutesReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'Inference Pool kserve-ci-e2e-test/router-with-refs-pd-test-inference-pool exists but no Gateway controller has accepted it yet', 'reason': 'WaitingForGateway', 'severity': 'Info', 'status': 'False', 'type': 'InferencePoolReady'}, {'lastTransitionTime': '2026-06-15T06:54:37Z', 'severity': 'Info', 'status': 'True', 'type': 'MainWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'severity': 'Info', 'status': 'True', 'type': 'PrefillWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'severity': 'Info', 'status': 'True', 'type': 'PresetsCombined'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'Ready'}, {'lastTransitionTime': '2026-06-15T06:52:32Z', 'message': 'The following HTTPRoutes are not ready: [kserve-ci-e2e-test/router-route-3: \"False\" (reason \"InvalidKind\", message \"referencing unsupported backendRef: group \\\\\"inference.networking.x-k8s.io\\\\\" kind \\\\\"InferencePool\\\\\"\")]', 'reason': 'HTTPRoutesNotReady', 'status': 'False', 'type': 'RouterReady'}, {'lastTransitionTime': '2026-06-15T06:53:00Z', 'severity': 'Info', 'status': 'True', 'type': 'SchedulerWorkloadReady'}, {'lastTransitionTime': '2026-06-15T06:55:17Z', 'status': 'True', 'type': 'WorkloadsReady'}]\n\nllmisvc/test_llm_inference_service.py:1110: AssertionError"}, "teardown": {"duration": 0.0012388100003590807, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf1]", "lineno": 243, "outcome": "failed", "keywords": ["test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf1]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "model_routing", "lora", "__wrapped__", "pytestmark", "router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf1", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.1376244399980351, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 1030.7907219180015, "outcome": "failed", "crash": {"path": "/workspace/source/test/e2e/llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError: Service returned 401:"}, "traceback": [{"path": "llmisvc/test_llm_inference_service.py", "lineno": 727, "message": ""}, {"path": "llmisvc/logging.py", "lineno": 40, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1030, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1126, "message": ""}, {"path": "llmisvc/test_llm_inference_service.py", "lineno": 1026, "message": "AssertionError"}], "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python\n\ntest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n...               {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},\n 'status': None}, model_name='facebook/opt-125m')\n\n    @pytest.mark.llminferenceservice\n    @pytest.mark.asyncio(loop_scope=\"session\")\n    @pytest.mark.parametrize(\n        \"test_case\",\n        [\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-gateway-ref\",\n                        \"router-with-managed-route\",\n                        \"model-fb-opt-125m\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"custom-route-timeout-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs\",\n                        \"scheduler-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"router-with-refs-test\",\n                    expected_gateway=ROUTER_GATEWAYS[0],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[0]],\n                            routes=[ROUTER_ROUTES[0], ROUTER_ROUTES[1]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\"router-managed\", \"workload-pd-cpu\", \"model-fb-opt-125m\"],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-custom-route-timeout-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"custom-route-timeout-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-with-refs-pd\",\n                        \"scheduler-managed\",\n                        \"workload-pd-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. \"\n                    \"Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. \"\n                    \"Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.\",\n                    service_name=\"router-with-refs-pd-test\",\n                    response_assertion=assert_200_with_choices,\n                    expected_gateway=ROUTER_GATEWAYS[1],\n                    before_test=[\n                        lambda: create_router_resources(\n                            gateways=[ROUTER_GATEWAYS[1]],\n                            routes=[ROUTER_ROUTES[2], ROUTER_ROUTES[3]],\n                        )\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.custom_gateway,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-dp-ep-gpu\",\n                        \"workload-dp-ep-prefill-gpu\",\n                        \"model-deepseek-v2-lite\",\n                    ],\n                    prompt=\"Delve into the multifaceted implications of a fully disaggregated cloud architecture, specifically \"\n                    \"where the compute plane (P) and the data plane (D) are independently deployed and managed for a \"\n                    \"geographically distributed, high-throughput, low-latency microservices ecosystem. Beyond the \"\n                    \"fundamental challenges of network latency and data consistency, elaborate on the advanced \"\n                    \"considerations and trade-offs inherent in such a setup: 1. Network Architecture and Protocols: \"\n                    \"How would the network fabric and underlying protocols (e.g., RDMA, custom transport layers) need to \"\n                    \"evolve to support optimal performance and minimize inter-plane communication overhead, especially for \"\n                    \"synchronous operations? Discuss the role of network programmability (e.g., SDN, P4) in dynamically \"\n                    \"optimizing routing and traffic flow between P and D. 2. Advanced Data Consistency and Durability: \"\n                    \"Explore sophisticated data consistency models (e.g., causal consistency, strong eventual consistency) \"\n                    \"and their applicability in balancing performance and data integrity across a globally distributed data plane. \"\n                    \"Detail strategies for ensuring data durability and fault tolerance, including multi-region replication, \"\n                    \"intelligent partitioning, and recovery mechanisms in the event of partial or full plane failures. \"\n                    \"3. Dynamic Resource Orchestration and Cost Optimization: Analyze how an orchestration layer would intelligently \"\n                    \"manage the independent scaling of compute (P) and data (D) resources, considering fluctuating workloads, \"\n                    \"cost efficiency, and performance targets (e.g., using predictive analytics for resource provisioning). \"\n                    \"Discuss mechanisms for dynamically reallocating compute nodes to different data partitions based on \"\n                    \"workload patterns and data locality, potentially involving live migration strategies. \"\n                    \"4. Security and Compliance in a Distributed Landscape: Address the enhanced security perimeter \"\n                    \"challenges, including securing communication channels between P and D (encryption in transit, mutual TLS), \"\n                    \"fine-grained access control to data at rest and in motion, and identity management across disaggregated \"\n                    \"components. Discuss how such an architecture impacts compliance with regulatory frameworks (e.g., GDPR, HIPAA) \"\n                    \"concerning data sovereignty, privacy, and auditability. 5. Operational Complexity and Observability: \"\n                    \"Examine the increased complexity in monitoring, logging, and tracing across highly decoupled compute and \"\n                    \"data planes. What specialized tooling and practices (e.g., distributed tracing with OpenTelemetry, advanced AIOps) \"\n                    \"would be essential? How would incident response and troubleshooting differ in this disaggregated environment \"\n                    \"compared to traditional integrated systems? Consider the challenges of pinpointing root causes across \"\n                    \"independent failures. 6. Real-world Applicability and Future Trends: Identify specific industries \"\n                    \"or use cases (e.g., high-frequency trading, IoT edge processing, large language model inference) \"\n                    \"where the benefits of P/D disaggregation would strongly outweigh its complexities. \"\n                    \"Conclude by speculating on emerging technologies or paradigms (e.g., serverless compute functions \"\n                    \"directly interacting with object storage, in-memory disaggregation) that could further drive or \"\n                    \"transform P/D disaggregation in cloud computing.\",\n                    max_tokens=2000,\n                ),\n                marks=[\n                    pytest.mark.cluster_gpu,\n                    pytest.mark.cluster_nvidia,\n                    pytest.mark.cluster_nvidia_roce,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-no-scheduler\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"What is KServe?\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.no_scheduler,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-simulated-dp-ep-cpu\",\n                        \"model-fb-opt-125m\",\n                    ],\n                    prompt=\"This test simulates DP+EP that can run on CPU, the idea is to test the LWS-based deployment, \"\n                    \"but without the resources requirements for DP+EP (GPUs and ROCe/IB).\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_multi_node],\n            ),\n            # Scheduler config tests\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-inline-config\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-inline-config-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Chat completions endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                        \"model-qwen2.5-0.5b\",\n                    ],\n                    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=create_response_assertion(with_field=\"choices\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-configmap-ref\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-configmap-ref-test\",\n                    before_test=[create_scheduler_configmap],\n                    after_test=[delete_scheduler_configmap],\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-replicas\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-ha-replicas-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-custom-template\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"scheduler-custom-template-test\",\n                ),\n                marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node],\n            ),\n            # Precise prefix KV cache routing test\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"scheduler-with-precise-prefix-cache-inline-config\",\n                        \"workload-llmd-simulator-kvcache\",\n                    ],\n                    prompt=\"KServe is a\",\n                    service_name=\"precise-prefix-cache-test\",\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Models endpoint coverage\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=create_response_assertion(with_field=\"data\"),\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/completions\",\n                            prompt=\"KServe is a\",\n                            payload_formatter=completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/chat/completions\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-llmd-simulator\",\n                    ],\n                    endpoint=\"/v1/chat/completions\",\n                    prompt=\"What is KServe?\",\n                    payload_formatter=chat_completions_payload,\n                    response_assertion=assert_model_field_matches(\"facebook/opt-125m\"),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                    peers=[\n                        TestCase(\n                            base_refs=[\n                                \"router-managed\",\n                                \"workload-llmd-simulator\",\n                                \"model-qwen2.5-0.5b\",\n                            ],\n                            endpoint=\"/v1/chat/completions\",\n                            prompt=\"What is KServe?\",\n                            payload_formatter=chat_completions_payload,\n                            response_assertion=assert_model_field_matches(\n                                \"Qwen/Qwen2.5-0.5B-Instruct\"\n                            ),\n                            url_getter=get_model_routing_url,\n                            extra_headers={\n                                MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/Qwen/Qwen2.5-0.5B-Instruct\",\n                            },\n                        ),\n                    ],\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.llmd_simulator,\n                    pytest.mark.model_routing,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 LoRA adapter\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/completions\",\n                    prompt=\"KServe is a\",\n                    model_name=f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    payload_formatter=completions_payload,\n                    response_assertion=assert_model_field_matches(\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\"\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n            # Model-based routing via X-Gateway-Model-Name header \u2014 /v1/models (base + LoRA)\n            pytest.param(\n                TestCase(\n                    base_refs=[\n                        \"router-managed\",\n                        \"workload-single-cpu\",\n                        \"model-fb-opt-125m-with-lora-hf\",\n                    ],\n                    endpoint=\"/v1/models\",\n                    response_assertion=assert_models_contains(\n                        \"facebook/opt-125m\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                        \"lora-adapter-1\",\n                        f\"publishers/{KSERVE_TEST_NAMESPACE}/models/lora-adapter-1\",\n                    ),\n                    url_getter=get_model_routing_url,\n                    extra_headers={\n                        MODEL_ROUTING_HEADER: f\"publishers/{KSERVE_TEST_NAMESPACE}/models/facebook/opt-125m\",\n                    },\n                ),\n                marks=[\n                    pytest.mark.cluster_cpu,\n                    pytest.mark.cluster_single_node,\n                    pytest.mark.model_routing,\n                    pytest.mark.lora,\n                ],\n            ),\n        ],\n        indirect=[\"test_case\"],\n        ids=generate_test_id,\n    )\n    @log_execution\n    def test_llm_inference_service(test_case: TestCase):  # noqa: F811\n        inject_k8s_proxy()\n    \n        kserve_client = KServeClient(\n            config_file=os.environ.get(\"KUBECONFIG\", \"~/.kube/config\"),\n            client_configuration=client.Configuration(),\n        )\n    \n        service_name = test_case.llm_service.metadata.name\n        if not test_case.llm_service.metadata.annotations:\n            test_case.llm_service.metadata.annotations = {}\n    \n        test_case.llm_service.metadata.annotations[\n            \"security.opendatahub.io/enable-auth\"\n        ] = \"false\"\n        prefix = test_case.log_prefix\n    \n        test_failed = False\n        try:\n            print(f\"{prefix} Creating LLMInferenceService {service_name}\")\n            create_llmisvc(kserve_client, test_case.llm_service)\n            print(f\"{prefix} Waiting for LLMInferenceService {service_name} to be ready\")\n            wait_for_llm_isvc_ready(\n                kserve_client, test_case.llm_service, test_case.wait_timeout\n            )\n            print(f\"{prefix} Waiting for model response from {service_name}\")\n>           wait_for_model_response(\n                kserve_client,\n                test_case,\n                test_case.wait_timeout,\n                extra_headers=test_case.extra_headers,\n            )\n\nllmisvc/test_llm_inference_service.py:727: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nargs = (<kserve.api.kserve_client.KServeClient object at 0x7f7424fee210>, TestCase(base_refs=['router-managed', 'workload-sin...         {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},\n 'status': None}, model_name='facebook/opt-125m'), 900)\nkwargs = {'extra_headers': {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}}\nfunc_name = 'wait_for_model_response'\ntimestamp_start = '2026-06-15T07:01:51.337397', start_time = 1781506911.3376548\nduration = 900.2222678661346, timestamp_end = '2026-06-15T07:16:51.559924'\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        func_name = func.__name__\n    \n        timestamp_start = datetime.now().isoformat()\n        logger.info(\n            f\"[{func_name}] [{timestamp_start}] start - args={args}, kwargs={kwargs}\"\n        )\n        start_time = time.time()\n    \n        try:\n>           result = func(*args, **kwargs)\n\nllmisvc/logging.py:40: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nkserve_client = <kserve.api.kserve_client.KServeClient object at 0x7f7424fee210>\ntest_case = TestCase(base_refs=['router-managed', 'workload-single-cpu', 'model-fb-opt-125m-with-lora-hf'], prompt=None, service_n...               {'name': 'model-fb-opt-125m-with-lora-hf-c0d503b0'}]},\n 'status': None}, model_name='facebook/opt-125m')\ntimeout_seconds = 900\nextra_headers = {'X-Gateway-Model-Name': 'publishers/kserve-ci-e2e-test/models/facebook/opt-125m'}\n\n    @log_execution\n    def wait_for_model_response(\n        kserve_client: KServeClient,\n        test_case: TestCase,  # noqa: F811\n        timeout_seconds: int = 900,\n        extra_headers: Optional[Dict[str, str]] = None,\n    ) -> str:\n        def get_successful_response():\n            try:\n                if test_case.url_getter:\n                    service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n                else:\n                    service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n            except Exception as e:\n                raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n            model_url = service_url + test_case.endpoint\n    \n            headers = {\"Content-Type\": \"application/json\"}\n            if extra_headers:\n                headers.update(extra_headers)\n    \n            if test_case.payload_formatter is not None:\n                test_payload = test_case.payload_formatter(test_case)\n            elif test_case.prompt is not None:\n                test_payload = {\n                    \"model\": test_case.model_name\n                    if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                    else extra_headers[MODEL_ROUTING_HEADER],\n                    \"prompt\": test_case.prompt,\n                    \"max_tokens\": test_case.max_tokens,\n                }\n            else:\n                test_payload = None\n    \n            logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n            try:\n                if test_payload is not None:\n                    response = post_with_retry(\n                        model_url,\n                        headers=headers,\n                        json_data=test_payload,\n                        timeout=test_case.response_timeout,\n                    )\n                else:\n                    response = get_with_retry(\n                        model_url,\n                        headers=headers,\n                        timeout=test_case.response_timeout,\n                    )\n            except Exception as e:\n                logger.error(f\"\u274c Failed to call model: {e}\")\n                raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n            logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n            if 200 <= response.status_code < 300:\n                return response\n            raise AssertionError(\n                f\"Service returned {response.status_code}: {response.text}\"\n            )\n    \n>       response = wait_for(get_successful_response, timeout=timeout_seconds, interval=5.0)\n\nllmisvc/test_llm_inference_service.py:1030: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nassertion_fn = <function wait_for_model_response.<locals>.get_successful_response at 0x7f7425e9bec0>\ntimeout = 900, interval = 5.0\n\n    def wait_for(\n        assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1\n    ) -> Any:\n        \"\"\"Wait for the assertion to succeed within timeout.\"\"\"\n        deadline = time.time() + timeout\n        last_msg = None\n        while True:\n            try:\n>               return assertion_fn()\n\nllmisvc/test_llm_inference_service.py:1126: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n    def get_successful_response():\n        try:\n            if test_case.url_getter:\n                service_url = test_case.url_getter(kserve_client, test_case.llm_service)\n            else:\n                service_url = get_llm_service_url(kserve_client, test_case.llm_service)\n        except Exception as e:\n            raise AssertionError(f\"\u274c Failed to get service URL: {e}\") from e\n    \n        model_url = service_url + test_case.endpoint\n    \n        headers = {\"Content-Type\": \"application/json\"}\n        if extra_headers:\n            headers.update(extra_headers)\n    \n        if test_case.payload_formatter is not None:\n            test_payload = test_case.payload_formatter(test_case)\n        elif test_case.prompt is not None:\n            test_payload = {\n                \"model\": test_case.model_name\n                if not extra_headers or MODEL_ROUTING_HEADER not in extra_headers\n                else extra_headers[MODEL_ROUTING_HEADER],\n                \"prompt\": test_case.prompt,\n                \"max_tokens\": test_case.max_tokens,\n            }\n        else:\n            test_payload = None\n    \n        logger.info(f\"Calling LLM service at {model_url} with payload {test_payload}\")\n        try:\n            if test_payload is not None:\n                response = post_with_retry(\n                    model_url,\n                    headers=headers,\n                    json_data=test_payload,\n                    timeout=test_case.response_timeout,\n                )\n            else:\n                response = get_with_retry(\n                    model_url,\n                    headers=headers,\n                    timeout=test_case.response_timeout,\n                )\n        except Exception as e:\n            logger.error(f\"\u274c Failed to call model: {e}\")\n            raise AssertionError(f\"\u274c Failed to call model: {e}\") from e\n    \n        logger.info(f\"Model response is {response.status_code}: {response.text[:500]}\")\n    \n        if 200 <= response.status_code < 300:\n            return response\n>       raise AssertionError(\n            f\"Service returned {response.status_code}: {response.text}\"\n        )\nE       AssertionError: Service returned 401:\n\nllmisvc/test_llm_inference_service.py:1026: AssertionError"}, "teardown": {"duration": 0.0029410419992927928, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-no-scheduler-workload-single-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-no-scheduler-workload-single-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "no_scheduler", "__wrapped__", "pytestmark", "router-no-scheduler-workload-single-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.12427426000067499, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 181.36366722800085, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.001785504002327798, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_multi_node-router-managed-workload-simulated-dp-ep-cpu-model-fb-opt-125m]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-workload-simulated-dp-ep-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_multi_node", "__wrapped__", "pytestmark", "router-managed-workload-simulated-dp-ep-cpu-model-fb-opt-125m", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.295195495000371, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 471.34285894300046, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.00458838299891795, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_conversion.py::TestLLMInferenceServiceConversion::test_v1alpha1_to_v1alpha2_conversion", "lineno": 212, "outcome": "passed", "keywords": ["test_v1alpha1_to_v1alpha2_conversion", "cluster_single_node", "cluster_cpu", "pytestmark", "TestLLMInferenceServiceConversion", "conversion", "llminferenceservice", "test_llm_inference_service_conversion.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.01314402200296172, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 0.1529069870011881, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.08745175300282426, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_conversion.py::TestLLMInferenceServiceConversion::test_v1alpha2_to_v1alpha1_conversion", "lineno": 303, "outcome": "passed", "keywords": ["test_v1alpha2_to_v1alpha1_conversion", "cluster_single_node", "cluster_cpu", "pytestmark", "TestLLMInferenceServiceConversion", "conversion", "llminferenceservice", "test_llm_inference_service_conversion.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.012359443000605097, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 0.06860955099909916, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.01720435199968051, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_conversion.py::TestLLMInferenceServiceConversion::test_criticality_preservation_via_annotations", "lineno": 394, "outcome": "passed", "keywords": ["test_criticality_preservation_via_annotations", "cluster_single_node", "cluster_cpu", "pytestmark", "TestLLMInferenceServiceConversion", "conversion", "llminferenceservice", "test_llm_inference_service_conversion.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.01234769199800212, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 0.08722625700102071, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.08491751199835562, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_conversion.py::TestLLMInferenceServiceConversion::test_lora_criticality_preservation", "lineno": 531, "outcome": "passed", "keywords": ["test_lora_criticality_preservation", "cluster_single_node", "cluster_cpu", "pytestmark", "TestLLMInferenceServiceConversion", "conversion", "llminferenceservice", "test_llm_inference_service_conversion.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.012818023998988792, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 0.11612528499972541, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.07905235700309277, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_conversion.py::TestLLMInferenceServiceConversion::test_round_trip_conversion_preserves_fields", "lineno": 680, "outcome": "passed", "keywords": ["test_round_trip_conversion_preserves_fields", "cluster_single_node", "cluster_cpu", "pytestmark", "TestLLMInferenceServiceConversion", "conversion", "llminferenceservice", "test_llm_inference_service_conversion.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.012759673001710325, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 0.09482947400101693, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.03356874299788615, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service_stop.py::test_llm_stop_feature[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]", "lineno": 39, "outcome": "passed", "keywords": ["test_llm_stop_feature[router-managed-workload-single-cpu-model-fb-opt-125m]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-workload-single-cpu-model-fb-opt-125m", "test_llm_inference_service_stop.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.34058851599911577, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 375.080517391998, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.002200173999881372, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-inline-config-workload-llmd-simulator]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-scheduler-with-inline-config-workload-llmd-simulator]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-scheduler-with-inline-config-workload-llmd-simulator", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.11318085899983998, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 68.1934034320002, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0016046700002334546, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-workload-llmd-simulator-model-qwen2.5-0.5b]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-workload-llmd-simulator-model-qwen2.5-0.5b]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "llmd_simulator", "__wrapped__", "pytestmark", "router-managed-workload-llmd-simulator-model-qwen2.5-0.5b", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.1132654110006115, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 108.30343687800269, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0032449799982714467, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_inference_service.py::test_llm_inference_service[cluster_cpu-cluster_single_node-router-managed-scheduler-with-configmap-ref-workload-llmd-simulator]", "lineno": 243, "outcome": "passed", "keywords": ["test_llm_inference_service[router-managed-scheduler-with-configmap-ref-workload-llmd-simulator]", "parametrize", "asyncio", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-scheduler-with-configmap-ref-workload-llmd-simulator", "test_llm_inference_service.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.3457274039974436, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 65.71168280899656, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.04984318000060739, "outcome": "passed", "longrepr": "[gw0] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_lora_adapters.py::test_llm_with_lora_adapters[cluster_cpu-single-lora-adapter-hf]", "lineno": 203, "outcome": "passed", "keywords": ["test_llm_with_lora_adapters[single-lora-adapter-hf]", "parametrize", "llminferenceservice", "cluster_cpu", "lora", "__wrapped__", "pytestmark", "single-lora-adapter-hf", "test_llm_lora_adapters.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.0003277379983046558, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 144.52490560099977, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.002230674002930755, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_llm_lora_adapters.py::test_llm_with_lora_adapters[cluster_cpu-multiple-lora-adapters]", "lineno": 203, "outcome": "passed", "keywords": ["test_llm_with_lora_adapters[multiple-lora-adapters]", "parametrize", "llminferenceservice", "cluster_cpu", "lora", "__wrapped__", "pytestmark", "multiple-lora-adapters", "test_llm_lora_adapters.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.0003602190008678008, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 154.19665131299917, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.0023378070000035223, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_prestop_hook.py::test_prestop_hook[cluster_cpu-cluster_single_node-router-managed-workload-single-cpu-model-fb-opt-125m]", "lineno": 40, "outcome": "passed", "keywords": ["test_prestop_hook[router-managed-workload-single-cpu-model-fb-opt-125m]", "parametrize", "llminferenceservice", "cluster_cpu", "cluster_single_node", "__wrapped__", "pytestmark", "router-managed-workload-single-cpu-model-fb-opt-125m", "test_prestop_hook.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.11404272499930812, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 239.83955090900054, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 0.005761889999121195, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}, {"nodeid": "llmisvc/test_storage_version_migration.py::TestStorageVersionMigration::test_storage_version_migration_after_simulated_upgrade", "lineno": 113, "outcome": "passed", "keywords": ["test_storage_version_migration_after_simulated_upgrade", "cluster_single_node", "cluster_cpu", "pytestmark", "TestStorageVersionMigration", "conversion", "llminferenceservice", "test_storage_version_migration.py", "llmisvc/__init__.py", "e2e"], "setup": {"duration": 0.013436447003186913, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "call": {"duration": 85.64989905299808, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}, "teardown": {"duration": 2.7904278640016855, "outcome": "passed", "longrepr": "[gw1] linux -- Python 3.11.13 /workspace/source/python/kserve/.venv/bin/python"}}], "warnings": [{"message": "The event_loop fixture provided by pytest-asyncio has been redefined in\n/workspace/source/test/e2e/conftest.py:43\nReplacing the event_loop fixture with a custom implementation is deprecated\nand will lead to errors in the future.\nIf you want to request an asyncio event loop with a scope other than function\nscope, use the \"scope\" argument to the asyncio mark when marking the tests.\nIf you want to return different types of event loops, use the event_loop_policy\nfixture.\n", "category": "DeprecationWarning", "when": "runtest", "filename": "/workspace/source/python/kserve/.venv/lib64/python3.11/site-packages/pytest_asyncio/plugin.py", "lineno": 761}, {"message": "The test <Function test_llm_inference_service[router-managed-scheduler-with-replicas-workload-llmd-simulator]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-scheduler-with-custom-template-workload-llmd-simulator]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-scheduler-with-precise-prefix-cache-inline-config-workload-llmd-simulator-kvcache]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-llmd-simulator0]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The event_loop fixture provided by pytest-asyncio has been redefined in\n/workspace/source/test/e2e/conftest.py:43\nReplacing the event_loop fixture with a custom implementation is deprecated\nand will lead to errors in the future.\nIf you want to request an asyncio event loop with a scope other than function\nscope, use the \"scope\" argument to the asyncio mark when marking the tests.\nIf you want to return different types of event loops, use the event_loop_policy\nfixture.\n", "category": "DeprecationWarning", "when": "runtest", "filename": "/workspace/source/python/kserve/.venv/lib64/python3.11/site-packages/pytest_asyncio/plugin.py", "lineno": 761}, {"message": "The test <Function test_llm_inference_service[router-with-gateway-ref-router-with-managed-route-model-fb-opt-125m-workload-llmd-simulator]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-llmd-simulator1]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-custom-route-timeout-scheduler-managed-workload-single-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-llmd-simulator2]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-with-refs-scheduler-managed-workload-single-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-pd-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-custom-route-timeout-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf0]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-with-refs-pd-scheduler-managed-workload-pd-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-no-scheduler-workload-single-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-single-cpu-model-fb-opt-125m-with-lora-hf1]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-simulated-dp-ep-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-scheduler-with-inline-config-workload-llmd-simulator]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-workload-llmd-simulator-model-qwen2.5-0.5b]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_inference_service[router-managed-scheduler-with-configmap-ref-workload-llmd-simulator]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service.py", "lineno": 244}, {"message": "The test <Function test_llm_stop_feature[router-managed-workload-single-cpu-model-fb-opt-125m]> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.", "category": "PytestWarning", "when": "runtest", "filename": "llmisvc/test_llm_inference_service_stop.py", "lineno": 40}]}