feat: add gpuResource config

2023-04-12 10:09:30 +08:00 · 2023-04-12 10:09:30 +08:00 · db6c521068
parent 0abcf133de
commit db6c521068
6 changed files with 96 additions and 6 deletions
--- a/Chart.yaml
+++ b/Chart.yaml
@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
+version: 0.1.1

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
--- a/README.md
+++ b/README.md
@ -0,0 +1,42 @@
+# Helm Chart for AILab demo API
+
+> 注：所有Demo能力需要部署在`default`命名空间下，才能使用`keda-http-add-on`代理服务。
+
+## 快速使用
+
+1、指定Demo能力镜像和服务端口
+
+```shell
+helm upgrade -n default --install hf-gan --set image.repository="artifacts.iflytek.com/docker-private/atp/aicloud/hf_fully_body_anime_gan" \
+--set image.tag="1fc7e9b8c0" --set image.pullPolicy=Always --set service.port=7860 .
+```
+
+2、指定Demo能力GPU规格
+
+> GPU机器需要包含`gpushare`（值为`true`）、`gputype`（大写，如`T4`）和`gpudrive`（驱动版本，如`515`）等标签。
+
+```shell
+helm upgrade -n default --install hf-gan --set image.repository="artifacts.iflytek.com/docker-private/atp/aicloud/hf_fully_body_anime_gan" \
+--set image.tag="1fc7e9b8c0" --set image.pullPolicy=Always --set service.port=7860 \
+--set gpuResource.enable=true,gpuResource.type="{t4,p4}",gpuResource.driverVersion="{515}" .
+```
+
+3、查看渲染后的部署清单
+
+```shell
+helm template demo01 --set gpuResource.enable=true --set gpuResource.type="{t4}",gpuResource.driverVersion="{515,418}" .
+```
+
+## 部署清单
+
+### 资源列表
+
+- ServiceAccount
+- Deployment
+- Ingress
+- Service
+- HTTPScaledObject
+
+## Values
+
+查看文件[values.yaml](values.yaml)。
--- a/templates/_helpers.tpl
+++ b/templates/_helpers.tpl
@ -67,3 +67,43 @@ Create the url path of service
 {{- define "demo-chart.path" -}}
 {{ .Values.pathPrefix }}/{{ .Release.Name }}
 {{- end }}
+
+{{/* Create the GPUNodeAffinity of service. gpuResource.enable=true: require gpushare=true nodes,
+optionally narrowed by gputype (upper-cased) and gpudrive. Otherwise: prefer nodes without gpushare=true.
+List items are stringified and quoted because node selector values must be YAML strings. */}}
+{{- define "demo-chart.GPUNodeAffinity" -}}
+{{- if .Values.gpuResource.enable -}}
+nodeAffinity:
+  requiredDuringSchedulingIgnoredDuringExecution:
+    nodeSelectorTerms:
+    - matchExpressions:
+      - key: gpushare
+        operator: In
+        values: ["true"]
+{{- if .Values.gpuResource.type }}
+      - key: gputype
+        operator: In
+        values:
+        {{- range .Values.gpuResource.type }}
+        - {{ . | toString | upper | quote }}
+        {{- end }}
+{{- end }}
+{{- if .Values.gpuResource.driverVersion }}
+      - key: gpudrive
+        operator: In
+        values:
+        {{- range .Values.gpuResource.driverVersion }}
+        - {{ . | toString | quote }}
+        {{- end }}
+{{- end }}
+{{- else -}}
+nodeAffinity:
+  preferredDuringSchedulingIgnoredDuringExecution:
+  - preference:
+      matchExpressions:
+      - key: gpushare
+        operator: NotIn
+        values: ["true"]
+    weight: 100
+{{- end }}
+{{- end }}
--- a/templates/deployment.yaml
+++ b/templates/deployment.yaml
@ -49,10 +49,8 @@ spec:
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
-      {{- with .Values.affinity }}
      affinity:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
+        {{- include "demo-chart.GPUNodeAffinity" . | nindent 8 }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
--- a/templates/ingress.yaml
+++ b/templates/ingress.yaml
@ -10,7 +10,7 @@ spec:
    - host: {{ .Values.host }}
      http:
        paths:
-          - path: {{ include "demo-chart.path" . }}
+          - path: {{ include "demo-chart.path" . }}/
            pathType: Prefix
            backend:
              service:
--- a/values.yaml
+++ b/values.yaml
@ -67,6 +67,15 @@ resources: {}
  #   cpu: 100m
  #   memory: 128Mi

+# GPU scheduling configuration, rendered into the Deployment's node affinity
+gpuResource:
+  # When true, pods must run on nodes labelled gpushare=true; when false, such nodes are avoided if possible
+  enable: false
+  # GPU types matched against the node label gputype (upper-cased by the template), e.g. T4, P4, A100
+  type: []
+  # GPU driver versions matched against the node label gpudrive, e.g. 418, 440, 515
+  driverVersion: []
+
 autoscaling:
  http:
    minReplicas: 0
@ -76,4 +85,5 @@ nodeSelector: {}

 tolerations: []

-affinity: {}
+# Temporarily not configurable: the Deployment's affinity is rendered from gpuResource instead
+# affinity: {}