---
# CustomResourceDefinition for the InferenceModel kind in the
# inference.networking.x-k8s.io API group; one version (v1alpha2) is declared.
# NOTE(review): metadata and status in this document carry fields such as uid,
# resourceVersion and managedFields, so this looks like an export of a live
# object rather than a hand-written manifest — confirm before applying it.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
# Object metadata: name, annotations and labels, plus creationTimestamp,
# generation, managedFields, resourceVersion and uid.
# NOTE(review): the latter group is presumably assigned by the API server —
# confirm whether it should be kept if this file is ever applied as a manifest.
metadata:
  annotations:
    # presumably the controller-gen release that generated the schema — verify
    controller-gen.kubebuilder.io/version: v0.16.1
    # NOTE(review): the internal.config.kubernetes.io/previous* annotations look
    # like tooling-internal bookkeeping — confirm they are meant to be persisted.
    internal.config.kubernetes.io/previousKinds: CustomResourceDefinition
    internal.config.kubernetes.io/previousNames: inferencemodels.inference.networking.x-k8s.io
    internal.config.kubernetes.io/previousNamespaces: _non_namespaceable_
  creationTimestamp: "2026-04-22T16:50:13Z"
  generation: 1
  labels:
    app.kubernetes.io/part-of: kserve
    app.opendatahub.io/kserve: "true"
    platform.opendatahub.io/part-of: platform
  managedFields:
  # Entry 1: manager "platform.opendatahub.io" used operation Apply and is
  # recorded for the annotations, the labels, and spec.group, spec.names,
  # spec.scope and spec.versions.
  - apiVersion: apiextensions.k8s.io/v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          f:controller-gen.kubebuilder.io/version: {}
          f:internal.config.kubernetes.io/previousKinds: {}
          f:internal.config.kubernetes.io/previousNames: {}
          f:internal.config.kubernetes.io/previousNamespaces: {}
        f:labels:
          f:app.kubernetes.io/part-of: {}
          f:app.opendatahub.io/kserve: {}
          f:platform.opendatahub.io/part-of: {}
      f:spec:
        f:group: {}
        f:names:
          f:kind: {}
          f:listKind: {}
          f:plural: {}
          f:singular: {}
        f:scope: {}
        f:versions: {}
    manager: platform.opendatahub.io
    operation: Apply
    time: "2026-04-22T16:50:13Z"
  # Entry 2: manager "kube-apiserver" used operation Update on the status
  # subresource and is recorded for status.acceptedNames and the Established /
  # NamesAccepted conditions.
  - apiVersion: apiextensions.k8s.io/v1
    fieldsType: FieldsV1
    fieldsV1:
      f:status:
        f:acceptedNames:
          f:kind: {}
          f:listKind: {}
          f:plural: {}
          f:singular: {}
        f:conditions:
          k:{"type":"Established"}:
            .: {}
            f:lastTransitionTime: {}
            f:message: {}
            f:reason: {}
            f:status: {}
            f:type: {}
          k:{"type":"NamesAccepted"}:
            .: {}
            f:lastTransitionTime: {}
            f:message: {}
            f:reason: {}
            f:status: {}
            f:type: {}
    manager: kube-apiserver
    operation: Update
    subresource: status
    time: "2026-04-22T16:50:14Z"
  # The name is "<plural>.<group>" as declared under spec.names / spec.group.
  name: inferencemodels.inference.networking.x-k8s.io
  resourceVersion: "20118"
  uid: f7c3e783-bebf-4705-b4f9-84b5d13257c2
# Definition of the InferenceModel resource: API group, names, scope, and the
# v1alpha2 version with its printer columns and OpenAPI v3 validation schema.
spec:
  conversion:
    strategy: None
  group: inference.networking.x-k8s.io
  names:
    kind: InferenceModel
    listKind: InferenceModelList
    plural: inferencemodels
    singular: inferencemodel
  scope: Namespaced
  versions:
  # Extra table columns; each one is read from the object at the given jsonPath.
  - additionalPrinterColumns:
    - jsonPath: .spec.modelName
      name: Model Name
      type: string
    - jsonPath: .spec.poolRef.name
      name: Inference Pool
      type: string
    - jsonPath: .spec.criticality
      name: Criticality
      type: string
    - jsonPath: .metadata.creationTimestamp
      name: Age
      type: date
    # The only version declared in this list.
    name: v1alpha2
    schema:
      openAPIV3Schema:
        description: InferenceModel is the Schema for the InferenceModels API.
        properties:
          apiVersion:
            description: |-
              APIVersion defines the versioned schema of this representation of an object.
              Servers should convert recognized schemas to the latest internal value, and
              may reject unrecognized values.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
            type: string
          kind:
            description: |-
              Kind is a string value representing the REST resource this object represents.
              Servers may infer this from the endpoint the client submits requests to.
              Cannot be updated.
              In CamelCase.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
            type: string
          metadata:
            type: object
          spec:
            description: |-
              InferenceModelSpec represents the desired state of a specific model use case. This resource is
              managed by the "Inference Workload Owner" persona.

              The Inference Workload Owner persona is someone that trains, verifies, and
              leverages a large language model from a model frontend, drives the lifecycle
              and rollout of new versions of those models, and defines the specific
              performance and latency goals for the model. These workloads are
              expected to operate within an InferencePool sharing compute capacity with other
              InferenceModels, defined by the Inference Platform Admin.

              InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
              if the name is reused, an error will be shown on the status of a
              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
              creation timestamp, will be selected to remain valid. In the event of a race
              condition, one will be selected at random.
            properties:
              # Optional (not in the required list) string limited to the three
              # enum values; the schema declares no default for it.
              criticality:
                description: |-
                  Criticality defines how important it is to serve the model compared to other models referencing the same pool.
                  Criticality impacts how traffic is handled in resource constrained situations. It handles this by
                  queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
                  fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
                  and the proportionality of fairness will be configurable.

                  Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
                  Any implementations that may consume this field may treat an unset value as the 'Standard' range.
                enum:
                - Critical
                - Standard
                - Sheddable
                type: string
              # Required string of at most 256 characters; no minLength is set.
              # NOTE(review): the description says a name can be reserved by
              # "setting the weight to zero", but targetModels[].weight declares
              # minimum: 1 in this same schema — confirm which one is intended.
              modelName:
                description: |-
                  ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
                  ModelNames must be unique for a referencing InferencePool
                  (names can be reused for a different pool in the same cluster).
                  The modelName with the oldest creation timestamp is retained, and the incoming
                  InferenceModel's Ready status is set to false with a corresponding reason.
                  In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
                  Names can be reserved without an underlying model configured in the pool.
                  This can be done by specifying a target model and setting the weight to zero,
                  an error will be returned specifying that no valid target model is found.
                maxLength: 256
                type: string
                # The rule compares the new value with oldSelf, so a change to
                # an existing modelName fails with the message below.
                x-kubernetes-validations:
                - message: modelName is immutable
                  rule: self == oldSelf
              # Reference to the pool: only name is required; group and kind
              # declare defaults (inference.networking.x-k8s.io / InferencePool).
              poolRef:
                description: PoolRef is a reference to the inference pool, the pool
                  must exist in the same namespace.
                properties:
                  group:
                    default: inference.networking.x-k8s.io
                    description: Group is the group of the referent.
                    maxLength: 253
                    pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$
                    type: string
                  kind:
                    default: InferencePool
                    description: Kind is kind of the referent. For example "InferencePool".
                    maxLength: 63
                    minLength: 1
                    pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$
                    type: string
                  name:
                    description: Name is the name of the referent.
                    maxLength: 253
                    minLength: 1
                    type: string
                required:
                - name
                type: object
              # Optional list of objects with a required name and an optional
              # integer weight.
              targetModels:
                description: |-
                  TargetModels allow multiple versions of a model for traffic splitting.
                  If not specified, the target model name is defaulted to the modelName parameter.
                  modelName is often in reference to a LoRA adapter.
                items:
                  description: |-
                    TargetModel represents a deployed model or a LoRA adapter. The
                    Name field is expected to match the name of the LoRA adapter
                    (or base model) as it is registered within the model server. Inference
                    Gateway assumes that the model exists on the model server and it's the
                    responsibility of the user to validate a correct match. Should a model fail
                    to exist at request time, the error is processed by the Inference Gateway
                    and emitted on the appropriate InferenceModel object.
                  properties:
                    name:
                      description: Name is the name of the adapter or base model,
                        as expected by the ModelServer.
                      maxLength: 253
                      type: string
                    weight:
                      description: |-
                        Weight is used to determine the proportion of traffic that should be
                        sent to this model when multiple target models are specified.

                        Weight defines the proportion of requests forwarded to the specified
                        model. This is computed as weight/(sum of all weights in this
                        TargetModels list). For non-zero values, there may be some epsilon from
                        the exact proportion defined here depending on the precision an
                        implementation supports. Weight is not a percentage and the sum of
                        weights does not need to equal 100.

                        If a weight is set for any targetModel, it must be set for all targetModels.
                        Conversely weights are optional, so long as ALL targetModels do not specify a weight.
                      format: int32
                      maximum: 1000000
                      # NOTE(review): a weight of 0 is below this bound, which
                      # conflicts with the "setting the weight to zero" wording
                      # in the modelName description — confirm the intent.
                      minimum: 1
                      type: integer
                  required:
                  - name
                  type: object
                # At most 10 entries; the rule below passes only when every
                # entry has a weight or no entry has one.
                maxItems: 10
                type: array
                x-kubernetes-validations:
                - message: Weights should be set for all models, or none of the models.
                  rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
            # Every other spec property above is optional.
            required:
            - modelName
            - poolRef
            type: object
          status:
            description: InferenceModelStatus defines the observed state of InferenceModel
            properties:
              # Default value is a single condition: type Ready, status Unknown,
              # reason Pending.
              # NOTE(review): the description lists only "Accepted" as a known
              # type while the default uses "Ready" — confirm which is current.
              conditions:
                default:
                - lastTransitionTime: "1970-01-01T00:00:00Z"
                  message: Waiting for controller
                  reason: Pending
                  status: Unknown
                  type: Ready
                description: |-
                  Conditions track the state of the InferenceModel.

                  Known condition types are:

                  * "Accepted"
                items:
                  description: Condition contains details for one aspect of the current
                    state of this API Resource.
                  properties:
                    lastTransitionTime:
                      description: |-
                        lastTransitionTime is the last time the condition transitioned from one status to another.
                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
                      format: date-time
                      type: string
                    message:
                      description: |-
                        message is a human readable message indicating details about the transition.
                        This may be an empty string.
                      maxLength: 32768
                      type: string
                    observedGeneration:
                      description: |-
                        observedGeneration represents the .metadata.generation that the condition was set based upon.
                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
                        with respect to the current state of the instance.
                      format: int64
                      minimum: 0
                      type: integer
                    reason:
                      description: |-
                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
                        Producers of specific condition types may define expected values and meanings for this field,
                        and whether the values are considered a guaranteed API.
                        The value should be a CamelCase string.
                        This field may not be empty.
                      maxLength: 1024
                      minLength: 1
                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
                      type: string
                    status:
                      description: status of the condition, one of True, False, Unknown.
                      enum:
                      - "True"
                      - "False"
                      - Unknown
                      type: string
                    type:
                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
                      maxLength: 316
                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
                      type: string
                  required:
                  - lastTransitionTime
                  - message
                  - reason
                  - status
                  - type
                  type: object
                # At most 8 conditions; the list is declared as a map keyed by
                # the "type" field.
                maxItems: 8
                type: array
                x-kubernetes-list-map-keys:
                - type
                x-kubernetes-list-type: map
            type: object
        type: object
    # v1alpha2 is marked both served and storage, and declares a status
    # subresource.
    served: true
    storage: true
    subresources:
      status: {}
# Observed state of the CRD: the accepted names, two conditions (NamesAccepted
# and Established, both "True"), and v1alpha2 as the only stored version.
# NOTE(review): metadata.managedFields records kube-apiserver as the writer of
# this section via the status subresource, so it is presumably not meant to be
# authored by hand — confirm before keeping it in an applied manifest.
status:
  acceptedNames:
    kind: InferenceModel
    listKind: InferenceModelList
    plural: inferencemodels
    singular: inferencemodel
  conditions:
  - lastTransitionTime: "2026-04-22T16:50:13Z"
    message: no conflicts found
    reason: NoConflicts
    status: "True"
    type: NamesAccepted
  - lastTransitionTime: "2026-04-22T16:50:14Z"
    message: the initial names have been accepted
    reason: InitialNamesAccepted
    status: "True"
    type: Established
  storedVersions:
  - v1alpha2