InftyAI
diff --git a/‎Makefile‎
Lines changed: 18 additions & 2 deletions b/‎Makefile‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎chart/.helmignore‎
Lines changed: 23 additions & 0 deletions b/‎chart/.helmignore‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎chart/Chart.yaml‎
Lines changed: 21 additions & 0 deletions b/‎chart/Chart.yaml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎chart/crds/backendruntime-crd.yaml‎
Lines changed: 314 additions & 0 deletions b/‎chart/crds/backendruntime-crd.yaml‎
Lines changed: 314 additions & 0 deletions
diff --git a/‎chart/crds/leaderworkerset.x-k8s.io_leaderworkersets.yaml‎
Lines changed: 16015 additions & 0 deletions b/‎chart/crds/leaderworkerset.x-k8s.io_leaderworkersets.yaml‎
Lines changed: 16015 additions & 0 deletions
diff --git a/‎chart/crds/openmodel-crd.yaml‎
Lines changed: 236 additions & 0 deletions b/‎chart/crds/openmodel-crd.yaml‎
Lines changed: 236 additions & 0 deletions
@@ -185,7 +185,7 @@ image-build:
 		--build-arg CGO_ENABLED=$(CGO_ENABLED) \
 		$(IMAGE_BUILD_EXTRA_OPTS) ./
 image-load: IMAGE_BUILD_EXTRA_OPTS=--load
-image-load: image-load
+image-load: image-build
 image-push: IMAGE_BUILD_EXTRA_OPTS=--push
 image-push: image-build
 
@@ -284,4 +284,20 @@ artifacts: kustomize
 	if [ -d artifacts ]; then rm -rf artifacts; fi
 	mkdir -p artifacts
 	$(KUSTOMIZE) build config/default -o artifacts/manifests.yaml
-	@$(call clean-manifests)
+	@$(call clean-manifests)
+
+HELMIFY ?= $(LOCALBIN)/helmify
+HELMIFY_VERSION ?= latest
+
+.PHONY: helmify
+helmify: $(HELMIFY) ## Download helmify locally if necessary (HELMIFY_VERSION selects the release, default: latest).
+$(HELMIFY): $(LOCALBIN)
+	test -s $(LOCALBIN)/helmify || GOBIN=$(LOCALBIN) go install github.com/arttor/helmify/cmd/helmify@$(HELMIFY_VERSION)
+
+.PHONY: helm
+helm: manifests kustomize helmify ## Generate the Helm chart in ./chart from the kustomize output; does not touch any cluster.
+	$(KUSTOMIZE) build config/default | $(HELMIFY) -crd-dir
+
+.PHONY: helm-install
+helm-install: helm ## Install or upgrade the generated chart into llmaz-system, creating the namespace if it is missing.
+	helm upgrade --install llmaz ./chart --namespace llmaz-system --create-namespace -f ./chart/values.global.yaml
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
@@ -0,0 +1,21 @@
+apiVersion: v2
+name: llmaz
+description: A Helm chart for llmaz
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.0.1
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.0.6"
@@ -0,0 +1,236 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.14.0
+  name: openmodels.llmaz.io
+spec:
+  conversion:
+    strategy: Webhook
+    webhook:
+      clientConfig:
+        service:
+          name: llmaz-webhook-service
+          namespace: llmaz-system
+          path: /convert
+      conversionReviewVersions:
+      - v1
+  group: llmaz.io
+  names:
+    kind: OpenModel
+    listKind: OpenModelList
+    plural: openmodels
+    shortNames:
+    - om
+    singular: openmodel
+  scope: Cluster
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: OpenModel is the Schema for the open models API
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: ModelSpec defines the desired state of Model
+            properties:
+              familyName:
+                description: |-
+                  FamilyName represents the model type, like llama2, which will be auto injected
+                  to the labels with the key of `llmaz.io/model-family-name`.
+                type: string
+              inferenceFlavors:
+                description: |-
+                  InferenceFlavors represents the accelerator requirements to serve the model.
+                  Flavors are fungible following the priority represented by the slice order.
+                items:
+                  description: |-
+                    Flavor defines the accelerator requirements for a model and the necessary parameters
+                    in autoscaling. Right now, it will be used in two places:
+                    - Pod scheduling with node selectors specified.
+                    - Cluster autoscaling with essential parameters provided.
+                  properties:
+                    name:
+                      description: Name represents the flavor name, which will be
+                        used in model claim.
+                      type: string
+                    nodeSelector:
+                      additionalProperties:
+                        type: string
+                      description: |-
+                        NodeSelector represents the node candidates for Pod placements, if a node doesn't
+                        meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
+                        If nodeSelector is empty, it means every node is a candidate.
+                      type: object
+                    params:
+                      additionalProperties:
+                        type: string
+                      description: |-
+                        Params stores other useful parameters and will be consumed by the autoscaling components
+                        like cluster-autoscaler, Karpenter.
+                        E.g. when scaling up nodes with 8x Nvidia A100, the parameter can be injected with
+                        instance-type: p4d.24xlarge for AWS.
+                      type: object
+                    requests:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
+                        When GPU number is greater than 8, like 32, then multi-host inference is enabled and
+                        32/8=4 hosts will be grouped as a unit, each host will have a resource request as
+                        nvidia.com/gpu: 8. This may change in the future if the GPU number limit is broken.
+                        Not recommended to set the cpu and memory usage here.
+                        If using playground, you can define the cpu/mem usage at backendConfig.
+                        If using service, you can define the cpu/mem at the container resources.
+                        Note: if you define the same accelerator requests at playground/service as well,
+                        the requests here will be overridden.
+                      type: object
+                  required:
+                  - name
+                  type: object
+                maxItems: 8
+                type: array
+              source:
+                description: |-
+                  Source represents the source of the model, there're several ways to load
+                  the model such as loading from huggingface, OCI registry, s3, host path and so on.
+                properties:
+                  modelHub:
+                    description: ModelHub represents the model registry for model
+                      downloads.
+                    properties:
+                      filename:
+                        description: |-
+                          Filename refers to a specified model file rather than the whole repo.
+                          This is helpful to download a specified GGUF model rather than downloading
+                          the whole repo which includes all kinds of quantized models.
+                          TODO: this is only supported with Huggingface, add support for ModelScope
+                          in the near future.
+                        type: string
+                      modelID:
+                        description: |-
+                          ModelID refers to the model identifier on model hub,
+                          such as meta-llama/Meta-Llama-3-8B.
+                        type: string
+                      name:
+                        default: Huggingface
+                        description: Name refers to the model registry, such as huggingface.
+                        enum:
+                        - Huggingface
+                        - ModelScope
+                        type: string
+                      revision:
+                        description: |-
+                          Revision refers to a Git revision id which can be a branch name, a tag, or a commit hash.
+                          Most of the time, you don't need to specify it.
+                        type: string
+                    type: object
+                  uri:
+                    description: |-
+                      URI represents various kinds of model sources following the URI protocol, e.g.:
+                      - OSS: oss://<bucket>.<endpoint>/<path-to-your-model>
+                    type: string
+                type: object
+            required:
+            - familyName
+            - source
+            type: object
+          status:
+            description: ModelStatus defines the observed state of Model
+            properties:
+              conditions:
+                description: Conditions represents the Inference condition.
+                items:
+                  description: "Condition contains details for one aspect of the current
+                    state of this API Resource.\n---\nThis struct is intended for
+                    direct use as an array at the field path .status.conditions.  For
+                    example,\n\n\n\ttype FooStatus struct{\n\t    // Represents the
+                    observations of a foo's current state.\n\t    // Known .status.conditions.type
+                    are: \"Available\", \"Progressing\", and \"Degraded\"\n\t    //
+                    +patchMergeKey=type\n\t    // +patchStrategy=merge\n\t    // +listType=map\n\t
+                    \   // +listMapKey=type\n\t    Conditions []metav1.Condition `json:\"conditions,omitempty\"
+                    patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t
+                    \   // other fields\n\t}"
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: |-
+                        type of condition in CamelCase or in foo.example.com/CamelCase.
+                        ---
+                        Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be
+                        useful (see .node.status.conditions), the ability to deconflict is important.
+                        The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                type: array
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}