Deploy Models#
Deploy models from NGC, HuggingFace, or Customizer checkpoints. Register external providers like OpenAI or NVIDIA Build.
Tip
If you are deploying models locally using the quickstart environment (Docker), see GPU Configuration for information on configuring GPU resources. This ensures model deployments and jobs coordinate GPU allocation to prevent resource conflicts.
# Configure CLI (if not already done)
nmp config set --base-url "$NMP_BASE_URL" --workspace default
import os
from nemo_microservices import NeMoMicroservices
sdk = NeMoMicroservices(
base_url=os.environ["NMP_BASE_URL"],
workspace="default"
)
Add External Providers#
Register external inference APIs like NVIDIA Build or OpenAI.
NVIDIA Build#
# Store API key
echo "$NVIDIA_API_KEY" | nmp secrets create --name "nvidia-api-key" --from-file -
# Create provider
nmp inference providers create \
--name "build" \
--host-url "https://integrate.api.nvidia.com" \
--api-key-secret-name "nvidia-api-key"
nmp wait inference provider build
# Test via interactive chat
nmp chat nvidia/llama-3.3-nemotron-super-49b-v1 "Hello!" \
--provider build
# Store API key
sdk.secrets.create(
name="nvidia-api-key",
data=os.environ["NVIDIA_API_KEY"]
)
# Create provider
provider = sdk.inference.providers.create(
name="build",
host_url="https://integrate.api.nvidia.com",
api_key_secret_name="nvidia-api-key"
)
sdk.models.wait_for_provider("build")
# Use via provider routing
response = sdk.inference.gateway.provider.post(
"v1/chat/completions",
name="build",
body={
"model": "meta/llama-3.1-8b-instruct",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
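The gateway proxies the OpenAI-compatible chat completions payload. A minimal sketch for inspecting the result (the return type of post() and the payload shape noted below are assumptions; verify against your SDK version):
# Inspect the proxied payload; in the OpenAI-compatible schema the reply text
# typically sits under choices[0].message.content (assumed response shape)
print(response)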
OpenAI#
# Store API key
echo "$OPENAI_API_KEY" | nmp secrets create --name "openai-api-key" --from-file -
# Create provider with enabled models
nmp inference providers create \
--name "openai" \
--host-url "https://api.openai.com/v1" \
--api-key-secret-name "openai-api-key" \
--enabled-models "gpt-4" \
--enabled-models "gpt-3.5-turbo"
nmp wait inference provider openai
# Test via interactive chat
nmp chat gpt-4 "Hello!" \
--provider openai
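# Store API key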
sdk.secrets.create(
name="openai-api-key",
data=os.environ["OPENAI_API_KEY"]
)
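# Create provider with enabled models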
provider = sdk.inference.providers.create(
name="openai",
host_url="https://api.openai.com/v1",
api_key_secret_name="openai-api-key",
enabled_models=["gpt-4", "gpt-3.5-turbo"]
)
sdk.models.wait_for_provider("openai")
# Use via provider routing
response = sdk.inference.gateway.provider.post(
"v1/chat/completions",
name="openai",
body={
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
Deploy from NGC#
Deploy pre-built NIM containers from NGC.
Deploy Llama 3.2 1B#
nmp inference deployment-configs create \
--name "llama-3-2-1b-config" \
--nim-deployment '{
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_name": "meta/llama-3.2-1b-instruct"
}'
nmp inference deployments create \
--name "llama-3-2-1b-deployment" \
--config "llama-3-2-1b-config"
nmp wait inference deployment llama-3-2-1b-deployment
nmp chat meta/llama-3.2-1b-instruct "Hello!" \
--provider llama-3-2-1b-deployment \
--max-tokens 100
config = sdk.inference.deployment_configs.create(
name="llama-3-2-1b-config",
nim_deployment={
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_name": "meta/llama-3.2-1b-instruct"
}
)
deployment = sdk.inference.deployments.create(
name="llama-3-2-1b-deployment",
config="llama-3-2-1b-config"
)
sdk.models.wait_for_status(
deployment_name="llama-3-2-1b-deployment",
desired_status="READY"
)
response = sdk.inference.gateway.provider.post(
"v1/chat/completions",
name="llama-3-2-1b-deployment",
body={
"model": "meta/llama-3.2-1b-instruct",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
Deploy NeMoGuard Jailbreak Detection#
Deploy classification NIMs such as NeMoGuard for content safety. These NIMs expose a /v1/classify endpoint instead of chat completions.
nmp inference deployment-configs create \
--name "nemoguard-jailbreak-config" \
--nim-deployment '{
"gpu": 1,
"image_name": "nvcr.io/nim/nvidia/nemoguard-jailbreak-detect",
"image_tag": "1.10.1"
}'
nmp inference deployments create \
--name "nemoguard-jailbreak-deployment" \
--config "nemoguard-jailbreak-config"
nmp wait inference deployment nemoguard-jailbreak-deployment
nmp inference gateway provider post v1/classify \
--name "nemoguard-jailbreak-deployment" \
--body '{"input": "Tell me about vacation spots in Hawaii."}'
config = sdk.inference.deployment_configs.create(
name="nemoguard-jailbreak-config",
nim_deployment={
"gpu": 1,
"image_name": "nvcr.io/nim/nvidia/nemoguard-jailbreak-detect",
"image_tag": "1.10.1"
}
)
deployment = sdk.inference.deployments.create(
name="nemoguard-jailbreak-deployment",
config="nemoguard-jailbreak-config"
)
sdk.models.wait_for_status(
deployment_name="nemoguard-jailbreak-deployment",
desired_status="READY"
)
response = sdk.inference.gateway.provider.post(
"v1/classify",
name="nemoguard-jailbreak-deployment",
body={"input": "Tell me about vacation spots in Hawaii."}
)
Deploy from HuggingFace#
Deploy models from HuggingFace using the multi-LLM NIM. Omit image_name to auto-select the multi-LLM image.
echo "$HF_TOKEN" | nmp secrets create --name "hf-token-secret" --from-file -
nmp inference deployment-configs create \
--name "qwen-config" \
--nim-deployment '{
"model_name": "Qwen/Qwen2.5-1.5B-Instruct",
"gpu": 1
}'
nmp inference deployments create \
--name "qwen-deployment" \
--config "qwen-config" \
--hf-token-secret-name "hf-token-secret"
nmp wait inference deployment qwen-deployment
nmp chat Qwen/Qwen2.5-1.5B-Instruct "Hello!" \
--provider qwen-deployment \
--max-tokens 100
sdk.secrets.create(
name="hf-token-secret",
data=os.environ["HF_TOKEN"]
)
config = sdk.inference.deployment_configs.create(
name="qwen-config",
nim_deployment={
"model_name": "Qwen/Qwen2.5-1.5B-Instruct",
"gpu": 1
}
)
deployment = sdk.inference.deployments.create(
name="qwen-deployment",
config="qwen-config",
hf_token_secret_name="hf-token-secret"
)
sdk.models.wait_for_status(
deployment_name="qwen-deployment",
desired_status="READY"
)
response = sdk.inference.gateway.provider.post(
"v1/chat/completions",
name="qwen-deployment",
body={
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
Deploy from Customizer Weights#
Option 1: LoRA Adapters#
Deploy a base model with LoRA support. Adapters are loaded automatically for model entities whose peft.base_model matches the deployed base model.
nmp inference deployment-configs create \
--name "llama-lora-config" \
--nim-deployment '{
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_name": "meta/llama-3.2-1b-instruct",
"lora_enabled": true
}'
nmp inference deployments create \
--name "llama-lora-deployment" \
--config "llama-lora-config"
nmp wait inference deployment llama-lora-deployment
nmp chat "customized/my-llama-lora@cust-abc123" "Hello!" \
--provider llama-lora-deployment \
--max-tokens 100
config = sdk.inference.deployment_configs.create(
name="llama-lora-config",
nim_deployment={
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_name": "meta/llama-3.2-1b-instruct",
"lora_enabled": True
}
)
deployment = sdk.inference.deployments.create(
name="llama-lora-deployment",
config="llama-lora-config"
)
sdk.models.wait_for_status(
deployment_name="llama-lora-deployment",
desired_status="READY"
)
response = sdk.inference.gateway.model.post(
"v1/chat/completions",
name="customized/my-llama-lora@cust-abc123", # output_model from Customizer job
body={
"model": "customized/my-llama-lora@cust-abc123",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
Option 2: Full Fine-Tuned Models (SFT)#
Reference the model entity created by Customizer using model_namespace and model_name.
nmp inference deployment-configs create \
--name "sft-config" \
--nim-deployment '{
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_namespace": "customized",
"model_name": "my-sft-llama"
}'
nmp inference deployments create \
--name "sft-deployment" \
--config "sft-config"
nmp wait inference deployment sft-deployment
config = sdk.inference.deployment_configs.create(
name="sft-config",
nim_deployment={
"gpu": 1,
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.6",
"model_namespace": "customized", # From Customizer output_model
"model_name": "my-sft-llama"
}
)
deployment = sdk.inference.deployments.create(
name="sft-deployment",
config="sft-config"
)
sdk.models.wait_for_status(
deployment_name="sft-deployment",
desired_status="READY"
)
Deployment Cleanup#
# Note: Deleting the deployment will free up its GPU(s) when complete
nmp inference deployments delete <deployment-name>
nmp wait inference deployment <deployment-name> --status DELETED
nmp inference deployment-configs delete <config-name>
# For external providers
nmp inference providers delete <provider-name>
nmp secrets delete <secret-name>
# Note: Deleting the deployment will free up its GPU(s) when complete
sdk.inference.deployments.delete(name="<deployment-name>")
sdk.models.wait_for_status(
deployment_name="<deployment-name>",
desired_status="DELETED"
)
sdk.inference.deployment_configs.delete(name="<config-name>")
# For external providers
sdk.inference.providers.delete(name="<provider-name>")
sdk.secrets.delete(name="<secret-name>")
Multi-GPU Deployments#
For larger models requiring multiple GPUs, parallelism configuration depends on the NIM type.
Parallelism Strategies#
Tensor Parallel (TP): splits each layer's weights across GPUs → best for latency
Pipeline Parallel (PP): splits the model by depth, placing groups of layers on different GPUs → best for throughput
Formula:
gpu = tp_size × pp_size
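As a quick arithmetic check of the formula, here is a tiny sketch using the GPU counts from the examples in this section:
def required_gpus(tp_size: int, pp_size: int) -> int:
    # gpu field in the deployment config = tensor parallel size x pipeline parallel size
    return tp_size * pp_size

print(required_gpus(2, 1))  # 2 GPUs: the Qwen2.5-14B example below (TP=2, PP=1)
print(required_gpus(2, 2))  # 4 GPUs: the custom parallelism example below (TP=2, PP=2)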
Model-Specific NIMs#
Model-specific NIMs (e.g., nvcr.io/nim/meta/llama-3.1-70b-instruct) have TP/PP settings derived from manifest profiles in the container. Configure enough GPUs and the NIM selects the appropriate profile automatically.
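For example, a model-specific NIM can simply be granted more GPUs and left to pick a matching profile; no parallelism environment variables are required. A sketch assuming a 4-GPU deployment of the Llama 3.1 70B NIM named above (the config name is illustrative and the image tag is a placeholder; check NGC for a current tag):
config = sdk.inference.deployment_configs.create(
    name="llama-3-1-70b-config",  # illustrative name
    nim_deployment={
        "gpu": 4,  # the NIM selects a TP/PP profile that fits 4 GPUs from its built-in manifest
        "image_name": "nvcr.io/nim/meta/llama-3.1-70b-instruct",
        "image_tag": "<tag>",  # placeholder: pick a current tag from NGC
        "model_name": "meta/llama-3.1-70b-instruct"
    }
)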
Multi-LLM NIM#
The multi-LLM NIM (nvcr.io/nim/nvidia/llm-nim) requires explicit parallelism configuration via environment variables (NIM_TENSOR_PARALLEL_SIZE, NIM_PIPELINE_PARALLEL_SIZE). By default, it uses all GPUs for tensor parallelism (TP=gpu, PP=1).
This example deploys Qwen2.5-14B-Instruct across 2 GPUs using tensor parallelism.
# Create a fileset pointing to the HuggingFace model
# Qwen models are public — token_secret is optional
nmp filesets create \
--name "qwen-2-5-14b" \
--storage '{
"type": "huggingface",
"repo_id": "Qwen/Qwen2.5-14B-Instruct",
"repo_type": "model"
}'
# Register a model entity referencing the fileset
nmp models create \
--name "qwen-2-5-14b" \
--artifact '{"files_url": "hf://default/qwen-2-5-14b"}'
# Create deployment config with 2 GPUs (TP=2 by default)
nmp inference deployment-configs create \
--name "qwen-14b-config" \
--nim-deployment '{
"model_name": "default/qwen-2-5-14b",
"gpu": 2
}'
# Deploy
nmp inference deployments create \
--name "qwen-14b-deployment" \
--config "qwen-14b-config"
nmp wait inference deployment qwen-14b-deployment
nmp chat qwen-2-5-14b "Hello!" \
--max-tokens 100
# Create a fileset pointing to the HuggingFace model
# Qwen models are public — token_secret is optional
sdk.filesets.create(
name="qwen-2-5-14b",
storage={
"type": "huggingface",
"repo_id": "Qwen/Qwen2.5-14B-Instruct",
"repo_type": "model"
}
)
# Register a model entity referencing the fileset
sdk.models.create(
name="qwen-2-5-14b",
artifact={"files_url": "hf://default/qwen-2-5-14b"}
)
# Create deployment config with 2 GPUs (TP=2 by default)
config = sdk.inference.deployment_configs.create(
name="qwen-14b-config",
nim_deployment={
"model_name": "default/qwen-2-5-14b",
"gpu": 2
}
)
# Deploy
deployment = sdk.inference.deployments.create(
name="qwen-14b-deployment",
config="qwen-14b-config"
)
sdk.models.wait_for_status(
deployment_name="qwen-14b-deployment",
desired_status="READY"
)
response = sdk.inference.gateway.model.post(
"v1/chat/completions",
name="qwen-2-5-14b",
body={
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}
)
# NIM sets NIM_TENSOR_PARALLEL_SIZE=2 automatically
Custom Parallelism Configuration#
For larger models requiring more GPUs, you can configure a specific TP/PP split using additional_envs. The formula still applies: gpu = NIM_TENSOR_PARALLEL_SIZE × NIM_PIPELINE_PARALLEL_SIZE.
nmp inference deployment-configs create \
--name "multi-gpu-custom-config" \
--nim-deployment '{
"model_name": "default/qwen-2-5-14b",
"gpu": 4,
"additional_envs": {
"NIM_TENSOR_PARALLEL_SIZE": "2",
"NIM_PIPELINE_PARALLEL_SIZE": "2"
}
}'
config = sdk.inference.deployment_configs.create(
name="multi-gpu-custom-config",
nim_deployment={
"model_name": "default/qwen-2-5-14b",
"gpu": 4,
"additional_envs": {
"NIM_TENSOR_PARALLEL_SIZE": "2",
"NIM_PIPELINE_PARALLEL_SIZE": "2"
}
}
)
Tip
Choosing a parallelism strategy (the splits below assume an 8-GPU deployment):
TP=8, PP=1 (the multi-LLM NIM default on 8 GPUs): Lowest latency, best for real-time applications
TP=4, PP=2: Balanced latency and throughput
TP=2, PP=4: Highest throughput, best for batch processing
For custom models, match deployment parallelism to training parallelism for optimal performance.
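For instance, the balanced TP=4, PP=2 split maps onto an 8-GPU config using the same additional_envs pattern shown above (a sketch; the config name is illustrative and the model is reused from the earlier example purely for illustration):
config = sdk.inference.deployment_configs.create(
    name="multi-gpu-balanced-config",  # illustrative name
    nim_deployment={
        "model_name": "default/qwen-2-5-14b",
        "gpu": 8,  # 8 = NIM_TENSOR_PARALLEL_SIZE (4) x NIM_PIPELINE_PARALLEL_SIZE (2)
        "additional_envs": {
            "NIM_TENSOR_PARALLEL_SIZE": "4",
            "NIM_PIPELINE_PARALLEL_SIZE": "2"
        }
    }
)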