NeMo Microservices Helm Chart
For the deployment guide, see Admin Setup in the NeMo Microservices documentation.
Values
The following is the complete values.yaml file for the NeMo Microservices Helm Chart.
All configuration options are documented inline with comments.
# Default values for NeMo Microservices Platform Helm chart
## Helm global configuration settings
global:
  # -- A value needed for bitnamilegacy images used in postgresql subcharts
  security:
    allowInsecureImages: true
# -- Override for the name template
nameOverride: ""
# -- Override for the fullname template
fullnameOverride: ""
# -- Your NVIDIA GPU Cloud (NGC) API key authenticates and enables pulling images from the NGC container registry. The existing secret overrides this key if you provide one to the `existingSecret` key.
ngcAPIKey: YOUR-NGC-API-KEY
# -- Environment variables that will be applied to every deployment pod. Uses a simple key value map structure like MY_ENV_VAR: the-key and works with valueFrom as well.
env: {}
# -- You can use an existing Kubernetes secret for communicating with the NGC API for downloading models. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string.
existingSecret: ngc-api
# -- You can specify an existing Kubernetes image pull secret for pulling images from the NGC container registry. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string.
existingImagePullSecret: nvcrimagepullsecret
# -- List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment.
# The default is an empty list (not a map) so that the documented list type matches the value and user-supplied lists merge without a table/non-table type conflict.
additionalImagePullSecrets: []
# -- RBAC configuration settings for optional dependencies
rbac:
  # -- Specifies whether to enable the core Controller to have RBAC permissions to Volcano for scheduling distributed jobs.
  volcanoEnabled: true
  # -- Specifies whether to enable the core Controller to have RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs.
  k8sNimOperatorEnabled: true
# -- Multi-node networking configuration for distributed GPU training.
# These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations.
#
# Requirements:
# - Kyverno policy engine must be installed in your cluster (required for multi-node networking)
# - Kyverno is NOT included as a subchart dependency and must be installed separately
#
# To install Kyverno:
# helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0
#
# Documentation: https://kyverno.io/docs/installation/
# Helm chart: https://kyverno.github.io/kyverno/
#
# Note: Only enable ONE cloud provider per cluster deployment.
multinodeNetworking:
  # -- AWS-specific configuration for EFA device injection
  aws:
    # -- Enable AWS-specific Kyverno policy for EFA device injection
    enabled: false
    # -- Number of EFA devices to request per GPU (typically 1 or 4)
    efaDevicesPerGPU: 1
  # -- Azure-specific configuration for InfiniBand/RDMA
  azure:
    # -- Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration
    enabled: false
    # -- Number of RDMA devices to request per GPU
    rdmaDevicesPerGPU: 1
    # -- RDMA device plugin resource name
    rdmaDeviceName: "hca_shared_devices_a"
  # -- GCP-specific configuration for TCP-X/TCP-XO
  gcp:
    # -- Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration
    enabled: false
  # -- OCI-specific configuration for InfiniBand/SR-IOV
  oci:
    # -- Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration
    enabled: false
    # -- Number of RDMA devices (mlnxnics) to request per GPU
    rdmaDevicesPerGPU: 8
## Optional dependencies configuration. For production deployments, it is recommended to use existing installations of these dependencies.
k8s-nim-operator:
  # -- Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html).
  # If you are using an existing NIM Operator installation, set this to false.
  enabled: true
  nfd:
    nodeFeatureRules:
      # -- Specifies whether to enable device ID feature rules.
      deviceID: false
# -- Local PostgreSQL configuration for the NeMo Platform.
# @default -- This object has the following default values for the PostgreSQL configuration.
postgresql:
  # -- Whether to install the default PostgreSQL Helm chart. If enabled, the NeMo Platform Helm chart uses the [PostgreSQL Helm chart from Bitnami](https://github.com/bitnami/charts/blob/main/bitnami/postgresql/values.yaml) to create a PostgreSQL database.
  # It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform.
  # If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section.
  enabled: true
  image:
    repository: bitnamilegacy/postgresql
  volumePermissions:
    image:
      repository: bitnamilegacy/os-shell
  metrics:
    image:
      repository: bitnamilegacy/postgres-exporter
  auth:
    enablePostgresUser: true
    # Getting-started credentials committed in plain text; override them
    # (or, presumably, point `existingSecret` at a pre-created secret — confirm
    # against the subchart) for anything beyond a local trial.
    username: nemo
    password: nemo
    database: nemoplatform
    existingSecret: ""
  architecture: standalone
  service:
    ports:
      postgresql: 5432
  persistence:
    enabled: true
    size: 5Gi
# -- External PostgreSQL configuration settings. These values are only used when postgresql.enabled is set to false.
# @default -- This object has the following default values for the external PostgreSQL configuration.
externalDatabase:
  # -- External database host address.
  host: localhost
  # -- External database port number.
  port: 5432
  # -- Database username.
  user: nemo
  # -- Database name.
  database: nemoplatform
  # -- Name of an existing secret resource containing the database credentials.
  existingSecret: ""
  # -- Key within the existing secret that holds the database password.
  existingSecretPasswordKey: ""
  # -- URI secret configuration for external database.
  # @default -- This object has the following default values for the URI secret configuration.
  uriSecret:
    # -- Name of the URI secret.
    name: ""
    # -- Key in the URI secret containing the database URI.
    key: ""
# -- Specifies whether to enable authentication on the platform.
# This value is substituted into `auth.enabled` by the `basePlatformConfig` template.
enableAuth: false
# -- Platform-wide configuration settings.
# Set configuration here to apply custom, structured configuration across all services.
# It is applied after the base platform config has been evaluated for templates, so it can add or override YAML elements in the evaluated platform config.
# Prefer this section over `basePlatformConfig` unless you need templating features.
platformConfig: {}
# NOTE(review): everything after the `|` below is the body of this block scalar and must be indented beneath the key; the indentation appears to have been lost in this copy — restore it from the chart source before use.
# NOTE(review): the `local_v1` entry in the `secrets` section carries a literal key `value` checked into this file; presumably meant for evaluation only — confirm, and override it for real deployments.
# -- Base platform configuration settings, written as a YAML string that contains Helm template expressions (for example `.Values.api.service.port`, `.Values.enableAuth`, `.Chart.AppVersion`).
# @default -- This object has the following default values for the base platform configuration.
basePlatformConfig: |
# -- platform is the service discovery configuration for services across the platform
platform:
# -- control_plane specifies the type of control plane the platform is running on.
# Always set to 'kubernetes' for NeMo Microservices Platform when deploying with Helm.
control_plane: kubernetes
# Base URLs for various platform services
base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
entitystore_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
jobs_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
models_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
files_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
secrets_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
inference_gateway_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
# Image configuration for launching containers via the platform
image_registry: nvcr.io/nvidia/nemo-microservices
image_tag: {{ .Chart.AppVersion | quote }}
image_pull_secrets:
{{ include "nemo-common.imagepullsecrets" . | nindent 8 }}
studio:
# -- platform_base_url is the base URL used to access the platform.
# This is the URL that NeMo Studio will use in the browser to communicate with the platform backend services.
# An empty string means the Studio UI will reference its own host for API calls.
platform_base_url: ""
auth:
enabled: {{ .Values.enableAuth }}
# TODO - allow using an external OPA server
policy_decision_point_provider: embedded
policy_decision_point_base_url: "http://localhost:8080"
policy_data_refresh_interval: 5
bundle_cache_seconds: 5
admin_email: "admin@example.com"
# -- service is the common configuration for service settings on the platform
service:
host: "0.0.0.0"
port: {{ toString .Values.api.service.port }}
log_format: json
# -- entities is the configuration specific to entity management on the platform
entities:
backend: sqlalchemy
# -- jobs is the configuration specific to executing jobs on the platform
jobs:
# -- executor_defaults is the default configuration applied to all executor profiles
executor_defaults:
kubernetes_job:
launcher_image: {{ include "nmp-core.image" . | quote }}
storage:
pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
volcano_job:
launcher_image: {{ include "nmp-core.image" . | quote }}
storage:
pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
{{- if include "nemo-platform.multinodeNetworkingEnabled" . }}
# Enable multi-node networking (triggers Kyverno policies for cloud-specific configuration)
enable_multi_node_networking: true
{{- end }}
# -- secrets is the configuration specific to storing secrets on the platform
secrets:
encryption:
current_provider: local_v1
providers:
secret_key:
local_v1:
value: "f4NPSp39YN5oWTwZ3iDX/L3PTvEH8qFvUs1noC/jWuo="
# -- models is the configuration specific to model management on the platform
models:
controller:
backends:
k8s-nim-operator:
enabled: true
# -- inference_gateway is the configuration specific to inference request routing
inference_gateway: {}
# -- files is the configuration specific to file management on the platform
files:
default_storage_config:
type: local
path: /vol/files
# -- auditor is the configuration specific to the Auditor service
auditor: {}
# -- data_designer is the configuration specific to the Data Designer service
data_designer:
model_provider_registry:
default: "mock"
providers:
- name: "mock"
endpoint: "http://localhost:8000"
# -- customizer is the configuration specific to the Customizer service
customizer: {}
# -- evaluator is the configuration specific to the Evaluator service
evaluator: {}
# -- guardrails is the configuration specific to the Guardrails service
guardrails: {}
# -- Ingress configuration. The default rule routes `/` to the platform API service.
ingress:
  # -- Specifies whether to enable the ingress.
  enabled: false
  # -- Annotations for the ingress resource.
  annotations: {}
  # -- The ingress class to use if your cluster has more than one class.
  className: ""
  # -- TLS configurations.
  tls: []
  hosts:
    # -- Hostname used by ingress. If blank, use path-only routing.
    - name: ""
      # Note these paths only work as is with Nginx
      paths:
        - path: /
          pathType: Prefix
          # `service` and `port` are template strings that reference the API
          # service name helper and `api.service.port`.
          service: '{{ include "nmp-api.api-servicename" . }}'
          port: "{{ .Values.api.service.port }}"
# -- Gateway API HTTP Route configuration. The default rule routes `/` to the platform API service.
httpRoute:
  # -- Specifies whether to enable a Gateway API HTTP Route for the service.
  enabled: false
  # -- Extra labels for the HTTP Route object.
  labels: {}
  # -- Extra annotations for the HTTP Route object.
  annotations: {}
  # -- A list of Gateways to enable this route on. This is required if httpRoute.enabled is true.
  parentRefs: []
  # -- If this has a specific hostname, add the name or names here in an array.
  hostnames: []
  # -- Path matches to route queries.
  # NOTE(review): nesting of `backends` and `filters` inside each rule is reconstructed from context — confirm against the chart template.
  pathRules:
    - matches:
        - path: /
          type: PathPrefix
      backends:
        - service: '{{ include "nmp-api.api-servicename" . }}'
          port: "{{ .Values.api.service.port }}"
      # -- This is a list of filters for the objects, such as CORS settings.
      filters: []
# -- OpenTelemetry configuration settings for all services.
# @default -- This object has the following default values for the OpenTelemetry configuration.
telemetry:
  # -- Disable OpenTelemetry instrumentation and exporting for all services.
  OTEL_SDK_DISABLED: false
  # -- The OpenTelemetry grpc collector endpoint to export traces and metrics to.
  OTEL_EXPORTER_OTLP_ENDPOINT: ""
  # -- Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint.
  OTEL_EXPORTER_OTLP_INSECURE: true
  # -- The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export.
  OTEL_TRACES_EXPORTER: "none"
  # -- The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export.
  OTEL_METRICS_EXPORTER: "none"
  # -- The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  OTEL_EXPORTER_OTLP_TRACES_INSECURE: true
  # -- The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  OTEL_EXPORTER_OTLP_METRICS_INSECURE: true
# -- Pod security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# Empty by default. This value is also rendered into `pod_security_context` of the job executor defaults in `basePlatformConfig`.
podSecurityContext: {}
# -- Container security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# Empty by default.
securityContext: {}
# -- API configuration settings for the NeMo Platform API microservice
# @default -- This object has the following default values for the API configuration.
api:
  # -- Specifies whether to enable the NeMo Platform API.
  enabled: true
  # -- Container image configuration for the NeMo Platform API microservice.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The repository where the NeMo Platform API image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-api
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    tag: ""
  # -- OpenTelemetry configuration overrides for the NeMo Platform API microservice.
  telemetry: {}
  # -- Number of replicas for the API service.
  replicaCount: 1
  # -- Additional arguments to pass to the Platform API service.
  extraArgs: []
  # -- Service account configuration for the API service.
  # @default -- This object has the following default values for the service account configuration.
  serviceAccount:
    # -- Specifies whether a service account should be created.
    create: true
    # -- Automatically mount a ServiceAccount's API credentials.
    automount: true
    # -- Annotations to add to the service account.
    annotations: {}
    # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
    name: ""
  # -- Annotations to add to the API service deployment.
  annotations: {}
  # -- Annotations to add to the API service pod.
  podAnnotations: {}
  # -- Labels for the API service pod.
  podLabels: {}
  # -- Pod-level security context settings for the API service.
  # @default -- This object has the following default values for the pod security context.
  podSecurityContext:
    # -- The file system group ID to use for all containers.
    fsGroup: 1000
  # -- Container-level security context settings for the API service.
  securityContext: {}
  # -- Service configuration for the API service.
  # @default -- This object has the following default values for the service configuration.
  service:
    # -- The Kubernetes service type to create.
    type: ClusterIP
    # -- The port number to expose for the service. Also used to build the platform service URLs in `basePlatformConfig`.
    port: 8080
  # -- Kubernetes deployment resources configuration for the API service.
  resources: {}
  # -- Startup probe configuration for the API service.
  # @default -- This object has the following default values for the startup probe configuration.
  startupProbe:
    # -- The HTTP GET request to use for the startup probe.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the startup probe.
    periodSeconds: 10
    # -- The timeout in seconds for the startup probe.
    timeoutSeconds: 5
    # -- The failure threshold for the startup probe.
    failureThreshold: 24
  # -- Liveness probe configuration for the API service.
  # @default -- This object has the following default values for the liveness probe configuration.
  livenessProbe:
    # -- The HTTP GET request to use for the liveness probe.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the liveness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the liveness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the liveness probe.
    failureThreshold: 3
  # -- Readiness probe configuration for the API service.
  # @default -- This object has the following default values for the readiness probe configuration.
  readinessProbe:
    # -- The HTTP GET request to use for the readiness probe.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the readiness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the readiness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the readiness probe.
    failureThreshold: 3
  # -- Specifies autoscaling configurations for the deployment.
  autoscaling:
    # -- Whether to enable horizontal pod autoscaler.
    enabled: false
    # -- The minimum number of replicas for the deployment.
    minReplicas: 1
    # -- The maximum number of replicas for the deployment.
    maxReplicas: 10
    # -- The target CPU utilization percentage.
    targetCPUUtilizationPercentage: 80
    # targetMemoryUtilizationPercentage: 80
  # -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
  env: {}
  # -- Node selector configuration for the API service.
  nodeSelector: {}
  # -- Affinity configuration for the API service.
  affinity: {}
  # -- Tolerations configuration for the API service.
  tolerations: []
  # -- ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}
# -- NeMo Core microservice configuration settings
# @default -- This object has the following default values for the NeMo Core microservice configuration.
core:
  # -- Specifies whether to enable the NeMo Core microservice.
  enabled: true
  # -- Container image configuration for the NeMo Core microservice.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The repository where the NeMo Core image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-core
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    tag: ""
  # -- Persistent storage configuration for job-scoped storage.
  storage:
    # -- If set, pods will mount this persistent volume for job-scoped storage
    # and we will not create a new persistent volume claim.
    existingPersistentVolumeName: ""
    # -- Which storageClass to use when creating a new persistent volume claim.
    # Leaving as empty string will use the cluster's default storageClass.
    storageClass: ""
    # -- accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure
    # multiple job pods can write to the volume concurrently.
    accessModes:
      - ReadWriteMany
    # -- size of the persistent volume claim used for persistent storage
    size: 200Gi
    # -- volumePermissionsImage is the image used to set permissions on the volume.
    # No tag is given, so the image reference resolves to `latest`; pin a tag or digest for reproducible deployments.
    volumePermissionsImage: "busybox"
  # -- OpenTelemetry configuration overrides for the NeMo Core microservice.
  telemetry: {}
  # -- Configuration for the Core Controller service.
  # @default -- This object has the following default values for the controller configuration.
  controller:
    # -- Service account configuration for the controller service.
    # @default -- This object has the following default values for the service account configuration.
    serviceAccount:
      # -- Specifies whether a service account should be created.
      create: true
      # -- Automatically mount a ServiceAccount's API credentials.
      automount: true
      # -- Annotations to add to the service account.
      annotations: {}
      # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
      name: ""
    # -- Additional arguments to pass to the Core Controller service.
    extraArgs: []
    # -- Service configuration for the controller service. This only configures a headless service for DNS resolution.
    # @default -- This object has the following default values for the service configuration.
    service:
      # -- The port for the service.
      port: 8080
    # -- Annotations to add to the controller service deployment.
    annotations: {}
    # -- Annotations to add to the controller service pod.
    podAnnotations: {}
    # -- Labels for the controller service pod.
    podLabels: {}
    # -- Pod-level security context settings for the controller service.
    # @default -- This object has the following default values for the pod security context.
    podSecurityContext:
      # -- The file system group ID to use for all containers.
      fsGroup: 1000
    # -- Container-level security context settings for the controller service.
    securityContext: {}
    # -- Kubernetes deployment resources configuration for the controller service.
    resources: {}
    # -- Liveness probe configuration for the controller service.
    # @default -- This object has the following default values for the liveness probe configuration.
    livenessProbe:
      # -- The HTTP GET request to use for the liveness probe.
      httpGet:
        path: /health
        port: http
      # -- The frequency in seconds to perform the liveness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the liveness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the liveness probe.
      failureThreshold: 3
    # -- Readiness probe configuration for the controller service.
    # @default -- This object has the following default values for the readiness probe configuration.
    readinessProbe:
      # -- The HTTP GET request to use for the readiness probe.
      httpGet:
        path: /health
        port: http
      # -- The frequency in seconds to perform the readiness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the readiness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the readiness probe.
      failureThreshold: 3
    # -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
    env: {}
    # -- Node selector configuration for the controller service.
    nodeSelector: {}
    # -- Affinity configuration for the controller service.
    affinity: {}
    # -- Tolerations configuration for the controller service.
    tolerations: []
    # NOTE(review): placement of `serviceMonitor` under `core.controller` (rather than directly under `core`) is reconstructed from context — confirm against the chart templates.
    # -- ServiceMonitor configuration for Prometheus Operator
    serviceMonitor:
      # -- Enable ServiceMonitor resources for Prometheus Operator
      enabled: false
      # -- Scrape interval for the ServiceMonitor
      interval: "30s"
      # -- Scheme to use for scraping metrics (http or https)
      scheme: "http"
      # -- Additional labels to add to the ServiceMonitor
      labels: {}
      # -- Additional annotations to add to the ServiceMonitor
      annotations: {}