NeMo Microservices Helm Chart

Type: application

For the deployment guide, see Admin Setup in the NeMo Microservices documentation.

Values

The following is the complete values.yaml file for the NeMo Microservices Helm Chart. All configuration options are documented inline with comments.

# Default values for NeMo Microservices Platform Helm chart

## Helm global configuration settings

global:
  # -- Security settings exposed to subcharts through Helm `global` values.
  security:
    # -- A value needed for bitnamilegacy images used in postgresql subcharts
    # (see the `bitnamilegacy/*` repositories configured under `postgresql`).
    allowInsecureImages: true

# -- Override for the chart name used by the name template.
nameOverride: ""
# -- Override for the full name used by the fullname template.
fullnameOverride: ""

# -- Your NVIDIA GPU Cloud (NGC) API key, used to authenticate and pull images from the NGC container registry. If `existingSecret` is set, that secret takes precedence over this key.
ngcAPIKey: YOUR-NGC-API-KEY

# -- Environment variables applied to every deployment pod. This is a key/value map, for example `MY_ENV_VAR: the-value`; a `valueFrom` object is also accepted as the value.
env: {}

# -- Name of an existing Kubernetes secret used to communicate with the NGC API for downloading models. Set this to an empty string to have the chart generate the secret from the `ngcAPIKey` value.
# NOTE: the default is non-empty, so `ngcAPIKey` is only used for this secret when you set this value to "".
existingSecret: ngc-api

# -- Name of an existing Kubernetes image pull secret for pulling images from the NGC container registry. Set this to an empty string to have the chart generate the secret from the `ngcAPIKey` value.
# NOTE: the default is non-empty, so `ngcAPIKey` is only used for this secret when you set this value to "".
existingImagePullSecret: nvcrimagepullsecret

# -- List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment.
# The default is an empty list so that its type matches the documented list shape; a map default makes Helm
# emit a "cannot overwrite table with non table" warning when a list is supplied as an override.
# NOTE(review): the element format (plain secret names vs. `name:` objects) is defined by the chart templates — confirm there.
additionalImagePullSecrets: []

# -- RBAC configuration settings for optional dependencies
rbac:
  # -- Specifies whether the core Controller is granted RBAC permissions to Volcano for scheduling distributed jobs.
  volcanoEnabled: true
  # -- Specifies whether the core Controller is granted RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs.
  k8sNimOperatorEnabled: true

# -- Multi-node networking configuration for distributed GPU training.
# These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations.
#
# Requirements:
# - Kyverno policy engine must be installed in your cluster (required for multi-node networking)
# - Kyverno is NOT included as a subchart dependency and must be installed separately
#
# To install Kyverno:
#   helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0
#
# Documentation: https://kyverno.io/docs/installation/
# Helm chart: https://kyverno.github.io/kyverno/
#
# Note: Only enable ONE cloud provider per cluster deployment.
#
# `basePlatformConfig` adds `enable_multi_node_networking: true` to the Volcano job executor defaults when the
# `nemo-platform.multinodeNetworkingEnabled` helper is truthy; presumably that helper is driven by the
# `enabled` flags below — the helper is not shown in this file, so confirm in the chart templates.
multinodeNetworking:
  # -- AWS-specific configuration for EFA device injection
  aws:
    # -- Enable AWS-specific Kyverno policy for EFA device injection
    enabled: false
    # -- Number of EFA devices to request per GPU (typically 1 or 4)
    efaDevicesPerGPU: 1

  # -- Azure-specific configuration for InfiniBand/RDMA
  azure:
    # -- Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration
    enabled: false
    # -- Number of RDMA devices to request per GPU
    rdmaDevicesPerGPU: 1
    # -- RDMA device plugin resource name
    rdmaDeviceName: "hca_shared_devices_a"

  # -- GCP-specific configuration for TCP-X/TCP-XO
  gcp:
    # -- Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration
    enabled: false

  # -- OCI-specific configuration for InfiniBand/SR-IOV
  oci:
    # -- Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration
    enabled: false
    # -- Number of RDMA devices (mlnxnics) to request per GPU
    # NOTE(review): this default (8) differs from the per-GPU default of 1 used for AWS and Azure — confirm it is intended as a per-GPU count.
    rdmaDevicesPerGPU: 8

## Optional dependencies configuration. For production deployments, it is recommended to use existing installations of these dependencies.
## Related settings in this file: `rbac.k8sNimOperatorEnabled` and `models.controller.backends.k8s-nim-operator` in `basePlatformConfig`.
k8s-nim-operator:
  # -- Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html).
  # If you are using an existing NIM Operator installation, set this to false.
  enabled: true
  # -- `nfd` settings passed to the NIM Operator subchart (presumably Node Feature Discovery — confirm against the subchart's values).
  nfd:
    nodeFeatureRules:
      # -- Specifies whether to enable device ID feature rules.
      deviceID: false


# -- Local PostgreSQL configuration for the NeMo Platform.
# @default -- This object has the following default values for the PostgreSQL configuration.
postgresql:
  # -- Whether to install the default PostgreSQL Helm chart. If enabled, the NeMo Platform Helm chart uses the [PostgreSQL Helm chart from Bitnami](https://github.com/bitnami/charts/blob/main/bitnami/postgresql/values.yaml) to create a PostgreSQL database.
  # It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform.
  # If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section.
  enabled: true
  # The three image overrides below point the subchart at `bitnamilegacy` repositories;
  # `global.security.allowInsecureImages` is set to true in this file for these images.
  image:
    repository: bitnamilegacy/postgresql
  volumePermissions:
    image:
      repository: bitnamilegacy/os-shell
  metrics:
    image:
      repository: bitnamilegacy/postgres-exporter
  # Default database name and credentials for the bundled database.
  # NOTE(review): the username and password are plain-text getting-started defaults — change them
  # (or supply credentials through `existingSecret`; see the Bitnami chart documentation) outside of evaluation setups.
  auth:
    enablePostgresUser: true
    username: nemo
    password: nemo
    database: nemoplatform
    existingSecret: ""
  architecture: standalone
  service:
    ports:
      postgresql: 5432
  # Persistent storage for the bundled database (enabled, 5Gi by default).
  persistence:
    enabled: true
    size: 5Gi

# -- External PostgreSQL configuration settings. These values are only used when postgresql.enabled is set to false.
# @default -- This object has the following default values for the external PostgreSQL configuration.
externalDatabase:
  # -- External database host address.
  host: localhost
  # -- External database port number.
  port: 5432
  # -- Database username.
  user: nemo
  # -- Database name.
  database: nemoplatform
  # -- Name of an existing secret resource containing the database credentials.
  existingSecret: ""
  # -- Name of the key within `existingSecret` that holds the database password.
  existingSecretPasswordKey: ""
  # -- URI secret configuration for external database.
  # @default -- This object has the following default values for the URI secret configuration.
  uriSecret:
    # -- Name of the URI secret.
    name: ""
    # -- Key in the URI secret containing the database URI.
    key: ""

# -- Specifies whether to enable authentication on the platform.
# This value is rendered into `auth.enabled` in `basePlatformConfig`.
enableAuth: false

# -- Platform-wide configuration settings
# Set values here to apply custom, structured configuration across all services.
# This section is applied after `basePlatformConfig` has been evaluated for templates, so entries here add to or override the YAML elements of the evaluated platform config.
# Prefer this section over `basePlatformConfig` unless you need templating features.
platformConfig: {}

# -- Base platform configuration settings
# This value is a multi-line string that contains Helm template expressions (`{{ ... }}`); it is evaluated for
# templates before `platformConfig` is applied. Prefer `platformConfig` for plain overrides.
# NOTE(review): the defaults below embed a fixed `secrets.encryption` key value and placeholder values
# (`auth.admin_email`, the `mock` Data Designer provider) — review these before any production use.
# @default -- This object has the following default values for the base platform configuration.
basePlatformConfig: |
  # -- platform is the service discovery configuration for services across the platform
  platform:
    # -- control_plane specifies the type of control plane the platform is running on.
    # Always set to 'kubernetes' for NeMo Microservices Platform when deploying with Helm.
    control_plane: kubernetes

    # Base URLs for various platform services
    base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    entitystore_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    jobs_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    models_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    files_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    secrets_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
    inference_gateway_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"

    # Image configuration for launching containers via the platform
    image_registry: nvcr.io/nvidia/nemo-microservices
    image_tag: {{ .Chart.AppVersion | quote }}
    image_pull_secrets:
      {{ include "nemo-common.imagepullsecrets" . | nindent 8 }}

  studio:
    # -- platform_base_url is the base URL used to access the platform.
    # This is the URL that NeMo Studio will use in the browser to communicate with the platform backend services.
    # An empty string means the Studio UI will reference its own host for API calls.
    platform_base_url: ""

  auth:
    enabled: {{ .Values.enableAuth }}
    # TODO - allow using an external OPA server
    policy_decision_point_provider: embedded
    policy_decision_point_base_url: "http://localhost:8080"
    policy_data_refresh_interval: 5
    bundle_cache_seconds: 5
    admin_email: "admin@example.com"

  # -- service is the common configuration for service settings on the platform
  service:
    host: "0.0.0.0"
    port: {{ toString .Values.api.service.port }}
    log_format: json

  # -- entities is the configuration specific to entity management on the platform
  entities:
    backend: sqlalchemy

  # -- jobs is the configuration specific to executing jobs on the platform
  jobs:
    # -- executor_defaults is the default configuration applied to all executor profiles
    executor_defaults:
      kubernetes_job:
        launcher_image: {{ include "nmp-core.image" . | quote }}
        storage:
          pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
          volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
        pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
      volcano_job:
        launcher_image: {{ include "nmp-core.image" . | quote }}
        storage:
          pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
          volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
        pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
        {{- if include "nemo-platform.multinodeNetworkingEnabled" . }}
        # Enable multi-node networking (triggers Kyverno policies for cloud-specific configuration)
        enable_multi_node_networking: true
        {{- end }}

  # -- secrets is the configuration specific to storing secrets on the platform
  secrets:
    encryption:
      current_provider: local_v1
      providers:
        secret_key:
          local_v1:
            value: "f4NPSp39YN5oWTwZ3iDX/L3PTvEH8qFvUs1noC/jWuo="

  # -- models is the configuration specific to model management on the platform
  models:
    controller:
      backends:
        k8s-nim-operator:
          enabled: true

  # -- inference_gateway is the configuration specific to inference request routing
  inference_gateway: {}

  # -- files is the configuration specific to file management on the platform
  files:
    default_storage_config:
      type: local
      path: /vol/files

  # -- auditor is the configuration specific to the Auditor service
  auditor: {}

  # -- data_designer is the configuration specific to the Data Designer service
  data_designer:
    model_provider_registry:
      default: "mock"
      providers:
        - name: "mock"
          endpoint: "http://localhost:8000"

  # -- customizer is the configuration specific to the Customizer service
  customizer: {}

  # -- evaluator is the configuration specific to the Evaluator service
  evaluator: {}

  # -- guardrails is the configuration specific to the Guardrails service
  guardrails: {}


# -- Ingress configuration. By default, a single rule routes `/` to the platform API service.
ingress:
  # -- Specifies whether to enable the ingress.
  enabled: false
  # -- Annotations for the ingress resource.
  annotations: {}
  # -- The ingress class to use if your cluster has more than one class.
  className: ""
  # -- TLS configurations.
  tls: []
  # -- Host rules for the ingress. The `service` and `port` values contain Helm template expressions;
  # presumably the chart renders them with `tpl` — confirm in the ingress template.
  hosts:
    # -- Hostname used by ingress. If blank, use path-only routing.
    - name: ""
      # Note: these paths only work as-is with Nginx.
      paths:
        - path: /
          pathType: Prefix
          service: '{{ include "nmp-api.api-servicename" . }}'
          port: "{{ .Values.api.service.port }}"

# -- Gateway API HTTPRoute configuration. By default, a single rule routes the `/` path prefix to the platform API service.
httpRoute:
  # -- Specifies whether to enable a Gateway API HTTP Route for the service.
  enabled: false
  # -- Extra labels for the HTTP Route object.
  labels: {}
  # -- Extra annotations for the HTTP Route object.
  annotations: {}
  # -- A list of Gateways to enable this route on. This is required if httpRoute.enabled is true.
  parentRefs: []
  # -- If this has a specific hostname, add the name or names here in an array.
  hostnames: []
  # -- Path matches to route queries.
  pathRules:
    - matches:
        - path: /
          type: PathPrefix
      backends:
        - service: '{{ include "nmp-api.api-servicename" . }}'
          port: "{{ .Values.api.service.port }}"
  # -- This is a list of filters for the objects, such as CORS settings.
  filters: []

# -- OpenTelemetry configuration settings for all services.
# Individual services can override these through their own `telemetry` section (for example `api.telemetry`, `core.telemetry`).
# @default -- This object has the following default values for the OpenTelemetry configuration.
telemetry:
  # -- Disable OpenTelemetry instrumentation and exporting for all services.
  OTEL_SDK_DISABLED: false
  # -- The OpenTelemetry grpc collector endpoint to export traces and metrics to.
  OTEL_EXPORTER_OTLP_ENDPOINT: ""
  # -- Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint.
  OTEL_EXPORTER_OTLP_INSECURE: true
  # -- The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export.
  OTEL_TRACES_EXPORTER: "none"
  # -- The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export.
  OTEL_METRICS_EXPORTER: "none"
  # -- The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  # NOTE(review): this is explicitly set to true here, so the documented fallback would not apply with the defaults — confirm this is intended.
  OTEL_EXPORTER_OTLP_TRACES_INSECURE: true
  # -- The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  # NOTE(review): this is explicitly set to true here, so the documented fallback would not apply with the defaults — confirm this is intended.
  OTEL_EXPORTER_OTLP_METRICS_INSECURE: true

# -- Pod security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# This value is also rendered into `pod_security_context` of the `kubernetes_job` and `volcano_job` executor defaults in `basePlatformConfig`.
# @default -- Empty; no pod security context settings are set at this level by default.
podSecurityContext: {}

# -- Container security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# @default -- Empty; no container security context settings are set at this level by default.
securityContext: {}

# -- API configuration settings for the NeMo Platform API microservice
# @default -- This object has the following default values for the API configuration.
api:
  # -- Specifies whether to enable the NeMo Platform API.
  enabled: true

  # -- Container image configuration for the NeMo Platform API microservice.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The registry where the NeMo Platform API image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-api
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    # NOTE(review): presumably an empty tag falls back to the chart's appVersion — confirm in the chart templates.
    tag: ""

  # -- OpenTelemetry configuration overrides for the NeMo Platform API microservice.
  telemetry: {}

  # -- Number of replicas for the API service.
  replicaCount: 1
  # -- Additional arguments to pass to the Platform API service.
  extraArgs: []
  # -- Service account configuration for the API service.
  # @default -- This object has the following default values for the service account configuration.
  serviceAccount:
    # -- Specifies whether a service account should be created.
    create: true
    # -- Automatically mount a ServiceAccount's API credentials.
    automount: true
    # -- Annotations to add to the service account.
    annotations: {}
    # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
    name: ""
  # -- Annotations to add to the API service deployment.
  annotations: {}
  # -- Annotations to add to the API service pod.
  podAnnotations: {}
  # -- Labels for the API service pod.
  podLabels: {}
  # -- Pod-level security context settings for the API service.
  # @default -- This object has the following default values for the pod security context.
  podSecurityContext:
    # -- The file system group ID to use for all containers.
    fsGroup: 1000
  # -- Container-level security context settings for the API service.
  securityContext: {}
  # -- Service configuration for the API service.
  # @default -- This object has the following default values for the service configuration.
  service:
    # -- The Kubernetes service type to create.
    type: ClusterIP
    # -- The port number to expose for the service. This port is also used to build the platform service URLs in `basePlatformConfig`.
    port: 8080
  # -- Kubernetes deployment resources configuration for the API service.
  resources: {}

  # -- Startup probe configuration for the API service.
  # With the defaults below, the container is given up to 24 x 10s = 240s to start before the probe fails it.
  # @default -- This object has the following default values for the startup probe configuration.
  startupProbe:
    # -- The HTTP GET request to use for the startup probe.
    # `http` is a named port; presumably it is declared on the container in the deployment template.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the startup probe.
    periodSeconds: 10
    # -- The timeout in seconds for the startup probe.
    timeoutSeconds: 5
    # -- The failure threshold for the startup probe.
    failureThreshold: 24

  # -- Liveness probe configuration for the API service.
  # @default -- This object has the following default values for the liveness probe configuration.
  livenessProbe:
    # -- The HTTP GET request to use for the liveness probe.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the liveness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the liveness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the liveness probe.
    failureThreshold: 3

  # -- Readiness probe configuration for the API service.
  # @default -- This object has the following default values for the readiness probe configuration.
  readinessProbe:
    # -- The HTTP GET request to use for the readiness probe.
    httpGet:
      path: /health
      port: http
    # -- The frequency in seconds to perform the readiness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the readiness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the readiness probe.
    failureThreshold: 3

  # -- Specifies autoscaling configurations for the deployment.
  autoscaling:
    # -- Whether to enable horizontal pod autoscaler.
    enabled: false
    # -- The minimum number of replicas for the deployment.
    minReplicas: 1
    # -- The maximum number of replicas for the deployment.
    maxReplicas: 10
    # -- The target CPU utilization percentage.
    targetCPUUtilizationPercentage: 80
    # Optional memory-based target; left commented out by default.
    # targetMemoryUtilizationPercentage: 80

  # -- Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
  env: {}
  # -- Node selector configuration for the API service.
  nodeSelector: {}
  # -- Affinity configuration for the API service.
  affinity: {}
  # -- Tolerations configuration for the API service.
  tolerations: []

  # -- ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}

# -- NeMo Core microservice configuration settings
# @default -- This object has the following default values for the NeMo Core microservice configuration.
core:
  # -- Specifies whether to enable the NeMo Core microservice.
  enabled: true

  # -- Container image configuration for the NeMo Core microservice.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The registry where the NeMo Core image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-core
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    # NOTE(review): presumably an empty tag falls back to the chart's appVersion — confirm in the chart templates.
    tag: ""

  # -- Persistent storage configuration for job-scoped storage shared by job pods.
  storage:
    # -- If set, pods will mount this persistent volume for job-scoped storage
    # and we will not create a new persistent volume claim.
    existingPersistentVolumeName: ""
    # -- Which storageClass to use when creating a new persistent volume claim.
    # Leaving as empty string will use the cluster's default storageClass.
    storageClass: ""
    # -- accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure
    # multiple job pods can write to the volume concurrently.
    accessModes:
      - ReadWriteMany
    # -- size of the persistent volume claim used for persistent storage
    size: 200Gi
    # -- volumePermissionsImage is the image used to set permissions on the volume.
    # It is rendered into `volume_permissions_image` of the job executor defaults in `basePlatformConfig`.
    # NOTE(review): the default has no registry or tag — consider pinning a fully qualified image for reproducible deployments.
    volumePermissionsImage: "busybox"

  # -- OpenTelemetry configuration overrides for the NeMo Core microservice.
  telemetry: {}

  # -- Configuration for the NeMo Core controller service.
  # @default -- This object has the following default values for the controller configuration.
  controller:
    # -- Service account configuration for the controller service.
    # @default -- This object has the following default values for the service account configuration.
    serviceAccount:
      # -- Specifies whether a service account should be created.
      create: true
      # -- Automatically mount a ServiceAccount's API credentials.
      automount: true
      # -- Annotations to add to the service account.
      annotations: {}
      # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
      name: ""
    # -- Additional arguments to pass to the Core Controller service.
    extraArgs: []

    # -- Service configuration for the controller service. This only configures a headless service for DNS resolution.
    # @default -- This object has the following default values for the service configuration.
    service:
      # -- The port for the service.
      port: 8080
    # -- Annotations to add to the controller service deployment.
    annotations: {}
    # -- Annotations to add to the controller service pod.
    podAnnotations: {}
    # -- Labels for the controller service pod.
    podLabels: {}
    # -- Pod-level security context settings for the controller service.
    # @default -- This object has the following default values for the pod security context.
    podSecurityContext:
      # -- The file system group ID to use for all containers.
      fsGroup: 1000
    # -- Container-level security context settings for the controller service.
    securityContext: {}
    # -- Kubernetes deployment resources configuration for the controller service.
    resources: {}
    # -- Liveness probe configuration for the controller service.
    # @default -- This object has the following default values for the liveness probe configuration.
    livenessProbe:
      # -- The HTTP GET request to use for the liveness probe.
      httpGet:
        path: /health
        port: http
      # -- The frequency in seconds to perform the liveness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the liveness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the liveness probe.
      failureThreshold: 3

    # -- Readiness probe configuration for the controller service.
    # @default -- This object has the following default values for the readiness probe configuration.
    readinessProbe:
      # -- The HTTP GET request to use for the readiness probe.
      httpGet:
        path: /health
        port: http
      # -- The frequency in seconds to perform the readiness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the readiness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the readiness probe.
      failureThreshold: 3
    # -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
    env: {}
    # -- Node selector configuration for the controller service.
    nodeSelector: {}
    # -- Affinity configuration for the controller service.
    affinity: {}
    # -- Tolerations configuration for the controller service.
    tolerations: []

  # -- ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}