Emergency Triage LLM Evaluation
Problem: Accurately labeling Emergency Severity Index (ESI) levels from nurse triage notes is critical for clinical research and model development, but real-world data is often sensitive and access is restricted. Additionally, obtaining high-quality human annotations is costly and slow, making it difficult to build large, diverse datasets for robust evaluation.
Opportunity: Synthetic data generation offers a scalable, privacy-preserving alternative. By simulating realistic triage notes and ESI labels, we can create rich datasets without exposing patient information or relying on scarce human annotators. This enables rapid iteration, benchmarking, and model improvement in a domain where data scarcity is a major bottleneck.
Use case: Predict ESI levels from synthetic nurse triage notes using LLMs
Goal: Evaluate model accuracy and the quality/complexity of generated notes across a range of clinical scenarios
Pipeline: Synthetic data ➜ LLM-as-a-Judge scoring ➜ Filtering ➜ Evaluation
┌───────────────────────────────┐      ┌─────────────────────────────┐
│  NeMo Data Designer           │      │  NeMo Evaluator             │
│  +------------------------+   │      │  +-----------------------+  │
│  | Nurse Triage Note 📝   |───┼─────▶│  | LLM predicts ESI 🔍🤖 |  │
│  +------------------------+   │      │  +-----------------------+  │
│              |                │      │             |               │
│              v                │      │             v               │
│  +------------------------+   │      │  +-----------------------+  │
│  | Ground Truth (ESI) ✅  |───┼─────▶│  | Predicted ESI 🏷️      |  │
│  +------------------------+   │      │  +-----------------------+  │
└───────────────────────────────┘      │             |               │
                                       │             v               │
                                       │  +-----------------------+  │
                                       │  | Metrics 📊            |  │
                                       │  | (Accuracy)            |  │
                                       │  +-----------------------+  │
                                       └─────────────────────────────┘
What happens below:
🏗️ Generate realistic, privacy-safe triage notes, score their quality with LLM-as-a-Judge, and filter for high-signal examples using Data Designer.
⬆️ Upload dataset to a datastore (HF-compatible)
📈 Run Evaluator to compute ESI classification accuracy per model and complexity level
Tip: Run the cells in order. You can re-run preview/generation to explore different scenarios and difficulty levels.
Step 1: 🎨 NeMo Data Designer
from nemo_microservices.data_designer.essentials import (
DataDesignerConfigBuilder,
NeMoDataDesignerClient,
ModelConfig,
InferenceParameters,
SamplerColumnConfig,
SamplerType,
CategorySamplerParams,
SubcategorySamplerParams,
PersonSamplerParams,
LLMTextColumnConfig,
LLMJudgeColumnConfig,
Score,
)
data_designer_client = NeMoDataDesignerClient(
base_url="http://localhost:8080",
)
MODEL_PROVIDER = "nvidiabuild"
MODEL_ALIAS_GENERATOR = "content_generator"
MODEL_ID_GENERATOR = "deepseek-ai/deepseek-r1-distill-qwen-14b"
MODEL_ALIAS_JUDGE = "judge"
MODEL_ID_JUDGE = "openai/gpt-oss-120b"
model_configs = [
ModelConfig(
provider=MODEL_PROVIDER,
alias=MODEL_ALIAS_GENERATOR,
model=MODEL_ID_GENERATOR,
inference_parameters=InferenceParameters(
max_tokens=8000,
temperature=0.7,
top_p=0.95,
)
),
ModelConfig(
provider=MODEL_PROVIDER,
alias=MODEL_ALIAS_JUDGE,
model=MODEL_ID_JUDGE,
inference_parameters=InferenceParameters(
max_tokens=4096,
temperature=0.1,
top_p=0.95,
)
)
]
config_builder = DataDesignerConfigBuilder(model_configs=model_configs)
🎲 Sampler columns
# ESI levels
ESI_LEVELS = [
"ESI 1: Resuscitation",
"ESI 2: Emergency",
"ESI 3: Urgent",
"ESI 4: Less Urgent",
"ESI 5: Non-urgent",
]
# Unique record ID
config_builder.add_column(
SamplerColumnConfig(
name="record_id",
sampler_type=SamplerType.UUID,
params={"short_form": True, "uppercase": True}
)
)
# ESI level (balanced sampling)
config_builder.add_column(
SamplerColumnConfig(
name="esi_level_description",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=ESI_LEVELS,
),
)
)
# Clinical scenario (conditioned on ESI level)
config_builder.add_column(
SamplerColumnConfig(
name="clinical_scenario",
sampler_type=SamplerType.SUBCATEGORY,
params=SubcategorySamplerParams(
category="esi_level_description",
values={
ESI_LEVELS[0]: [
"Cardiac arrest",
"Unresponsive with no pulse",
"Severe respiratory distress",
"Major trauma with signs of shock",
"Suspected narcotic overdose with shallow respirations",
],
ESI_LEVELS[1]: [
"Crushing substernal chest pain radiating to the left arm",
"Sudden onset of facial droop and arm weakness",
"New onset confusion in an elderly patient",
"Active suicidal ideation with a plan",
"High-speed motor vehicle accident",
"Severe abdominal pain in a patient with a history of aortic aneurysm",
],
ESI_LEVELS[2]: [
"Abdominal pain with fever and nausea",
"High fever with a productive cough and history of COPD",
"Displaced fracture with visible deformity",
"Asthma attack, responsive to initial treatment",
"Vaginal bleeding in a pregnant patient",
"Head injury with brief loss of consciousness",
],
ESI_LEVELS[3]: [
"Simple laceration requiring sutures",
"Twisted ankle, unable to bear weight",
"Sore throat with fever",
"Symptoms of a urinary tract infection",
"Painful ear with fever in a child",
],
ESI_LEVELS[4]: [
"Request for a prescription refill",
"Suture removal",
"Minor rash present for several days",
"Common cold symptoms",
"Follow-up for a minor wound check",
],
},
),
)
)
# Synthetic patient info
config_builder.add_column(
SamplerColumnConfig(
name="patient",
sampler_type=SamplerType.PERSON,
params=PersonSamplerParams(age_range=[18, 70]),
)
)
# Triage note writing style (captures range from poor to best quality notes)
config_builder.add_column(
SamplerColumnConfig(
name="writing_style",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=["Draft", "Adequate", "Polished"]
),
)
)
🦜 LLM-generated columns
# LLM-generated triage note
config_builder.add_column(
LLMTextColumnConfig(
name="content",
prompt=(
"You are an experienced triage nurse in a busy Emergency Department writing a draft note. "
"Write a realistic, concise triage note in a telegraphic style using common medical abbreviations. "
"The note is for a {{ patient.age }} y/o {{ 'M' if patient.sex == 'Male' else 'F' }}. "
"Triage classification: '{{ esi_level_description }}'. "
"Reason for visit: '{{ clinical_scenario }}'. "
"Desired writing style: '{{ writing_style }}'. "
"Structure the note with 'CC:' and 'HPI:'. "
"Adjust the style and level of clinical detail based on the 'writing_style': "
"- Draft: Use minimal structure, brief statements, and omit some details; clinical indicators may be less clear. "
"- Adequate: Use complete sentences, include all relevant clinical indicators, but avoid excessive detail. "
"- Polished: Be thorough, precise, and clear; include nuanced or subtle signs and show strong clinical reasoning. "
"Also, adjust level of detail based on urgency (ESI 1 is always brief). "
"Respond with ONLY the note text, starting with 'CC:'."
),
model_alias=MODEL_ALIAS_GENERATOR,
)
)
⚖️ LLM-as-a-Judge Evaluation Step
# Rubric: clinical coherence
clinical_coherence_rubric = Score(
name="Clinical Coherence",
description="Evaluates how well the clinical details in the triage note align with the assigned ESI level and scenario.",
options={
"5": "Note is perfectly aligned with the ESI level and scenario; details are clinically plausible and specific.",
"4": "Note is well-aligned, with only minor details that might be slightly inconsistent.",
"3": "Note is generally consistent, but some key clinical indicators are missing or don't fully match the ESI level.",
"2": "Note shows significant inconsistency between the clinical details and the assigned ESI level.",
"1": "Note is clinically incoherent and does not reflect the assigned ESI level or scenario at all."
}
)
# Rubric: ESI level complexity (reduced to 3 levels: Simple, Moderate, Complex)
esi_level_complexity_rubric = Score(
name="ESI Level Complexity",
description="Evaluates how difficult it is to infer the correct ESI level from the note. Higher scores indicate greater complexity, which is desirable for creating a challenging dataset.",
options={
"Complex": "Note contains subtle or conflicting information, requiring clinical reasoning to distinguish between ESI levels.",
"Moderate": "Note requires some clinical inference; indicators are present but not always immediately obvious.",
"Simple": "Note uses clear, direct, or textbook indicators that make the ESI level obvious."
}
)
# LLM judge: triage note quality
EVAL_TRIAGE_NOTE_PROMPT = """\
You are an expert ER physician responsible for quality control. Your task is to evaluate a synthetic triage note for its realism and complexity.
**Triage Situation:**
- ESI Level: '{{ esi_level_description }}'
- Clinical Scenario: '{{ clinical_scenario }}'
- Desired Writing Style: '{{ writing_style }}'
- Patient: {{ patient.age }}-year-old {{ patient.sex }}
**Generated Triage Note:**
"{{ content }}"
Take a deep breath and carefully evaluate the "Generated Triage Note". Assess its clinical coherence with the situation and how well it matches the desired complexity. The goal is to create a challenging dataset, so higher complexity scores are desirable.
"""
config_builder.add_column(
LLMJudgeColumnConfig(
name="triage_note_quality",
model_alias=MODEL_ALIAS_JUDGE,
prompt=EVAL_TRIAGE_NOTE_PROMPT,
scores=[clinical_coherence_rubric, esi_level_complexity_rubric],
)
)
🧪 Generate & Preview
Tip: Re-run preview to cycle examples; adjust prompts, temperatures, or scenarios to tune realism and difficulty.
preview = data_designer_client.preview(config_builder, num_records=10)
# Run this cell multiple times to cycle through the 10 preview records.
preview.display_sample_record()
# The preview dataset is available as a pandas DataFrame.
preview.dataset
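If you want a quick tabular view of how the judge scored the preview records, a small helper can flatten the scores. This is a sketch: it assumes each triage_note_quality value is a dict (or its string repr) keyed by rubric name, the same structure that the filtering step below relies on.
import ast

# Hypothetical helper: flatten the judge output into {rubric: score} pairs.
def judge_scores(value):
    d = ast.literal_eval(value) if isinstance(value, str) else value
    return {name: entry.get("score") for name, entry in d.items()}

preview.dataset["triage_note_quality"].apply(judge_scores).head()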
🚀 Scale Up Generations
Once satisfied with the preview results, scale up to generate the full dataset.
# Submit batch job
job_results = data_designer_client.create(config_builder, num_records=100)
job_results.wait_until_done()
dataset = job_results.load_dataset()
print("\nGenerated dataset shape:", dataset.shape)
dataset.head()
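Because the ESI level comes from a balanced category sampler, each level should appear roughly num_records / 5 times. A quick sanity check:
# Expect ~20 records per ESI level (and ~33 per writing style) for 100 records
print(dataset["esi_level_description"].value_counts())
print(dataset["writing_style"].value_counts())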
🧹 Refinement [Optional]
Filter the generated dataset to retain only higher-quality triage notes:
Keeps only notes with a Clinical Coherence score ≥ 3 (as judged by the LLM).
Retrieves the ESI level complexity directly from the LLM judge column (triage_note_quality).
import ast
from rich import print
def filter_by_scores(df, min_coherence=3, samples_per_complexity=100):
indices = []
for idx, k in enumerate(df['triage_note_quality']):
# If k is a string, parse it to dict
if isinstance(k, str):
try:
k_dict = ast.literal_eval(k)
except Exception:
continue
else:
k_dict = k
try:
coherence_score = int(k_dict['Clinical Coherence']['score'])
if coherence_score >= min_coherence:
indices.append(idx)
except Exception:
continue
    filtered_df = df.iloc[indices]
    # Keep only the needed columns; .copy() avoids SettingWithCopyWarning below
    filtered_df = filtered_df[["esi_level_description", "content", "triage_note_quality"]].copy()
    filtered_df['esi_level_complexity'] = filtered_df['triage_note_quality'].apply(
        lambda k: (ast.literal_eval(k) if isinstance(k, str) else k).get('ESI Level Complexity', {}).get('score')
    )
    filtered_df.drop(columns=['triage_note_quality'], inplace=True)
    percent_kept = 100 * len(filtered_df) / len(df) if len(df) > 0 else 0
    print(f"Kept {len(filtered_df)} of {len(df)} records ({percent_kept:.1f}%) after coherence filtering")
# Sample up to N per complexity
sampled_df = (
filtered_df
.groupby('esi_level_complexity', group_keys=False)
.apply(lambda x: x.sample(min(len(x), samples_per_complexity), random_state=42))
.reset_index(drop=True)
)
print(f"Sampled {len(sampled_df)} records total, {samples_per_complexity} (or less) per complexity level.")
return sampled_df
filtered_df = filter_by_scores(dataset, samples_per_complexity=100)
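Before moving on, check how the judge distributed the complexity labels, since the per-complexity evaluation in Step 2 depends on all three levels being populated:
# How many notes survived the filter at each complexity level
print(filtered_df["esi_level_complexity"].value_counts())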
👀 Inspect results
def show_example_triage_notes(filtered_df, num_examples=5):
from rich.console import Console
from rich.panel import Panel
from rich.text import Text
console = Console()
examples = filtered_df.sample(num_examples)
console.print(f"[italic]Showing last {num_examples} filtered triage notes:[/italic]\n")
for idx, row in examples.iterrows():
esi_level = str(row.get("esi_level_description", ""))
esi_level_complexity = str(row.get("esi_level_complexity", ""))
content = str(row.get("content", ""))
# Use blue for the complexity level
panel_title = f"ESI Level: {esi_level} [bold][blue]({esi_level_complexity})[/blue][/bold]"
panel = Panel(
Text(content, style="green"),
title=panel_title,
border_style="cyan",
expand=False,
padding=(1, 2),
)
console.print(panel)
console.print() # Extra newline for separation
# Show a few randomly sampled records using rich
show_example_triage_notes(filtered_df, num_examples=3)
Step 2: 📊 NeMo Evaluator
We evaluate the model on filtered triage notes to see if it predicts the correct ESI level.
Dataset: HF-compatible JSONL served by the datastore
Task: Completion with structured output
{ "esi_level_description": "..." }Metric: String containment check against ground-truth ESI
import os
from huggingface_hub import HfApi
from nemo_microservices import NeMoMicroservices
# Service endpoint for the NeMo Evaluator (change if running elsewhere)
BASE_URL = "http://localhost:8080"
# Initialize NeMoMicroservices client (does not trigger any action yet)
client = NeMoMicroservices(base_url=BASE_URL)
# Namespace for organizing datasets within Hugging Face Hub
NAMESPACE = "triage-eval"
# Split the filtered dataframe into different complexity levels
df_complexities = {
"simple": filtered_df[filtered_df["esi_level_complexity"] == "Simple"],
"moderate": filtered_df[filtered_df["esi_level_complexity"] == "Moderate"],
"complex": filtered_df[filtered_df["esi_level_complexity"] == "Complex"]
}
# Hugging Face Hub endpoint for local server (set up in your datastore container)
HF_ENDPOINT = "http://localhost:3000/v1/hf" # Exposed from: 0.0.0.0:3000->3000/tcp
# Initialize the Hugging Face Hub API client
hf_api = HfApi(endpoint=HF_ENDPOINT, token=os.environ["HF_TOKEN"])
# Create a dict to store files_url for each complexity level
files_url_dict = {}
# Loop over each complexity level, preparing, saving, and uploading evaluation datasets
for level, df in df_complexities.items():
DATASET_NAME = f"nurse-triage-notes-{level}"
repo_id = f"{NAMESPACE}/{DATASET_NAME}"
# Create (or get) the dataset repo for this complexity level
hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
file_name = f"dataset_{level}.jsonl"
df.to_json(file_name, orient="records", lines=True)
print(f"Dataset prepared with {len(df)} samples for complexity '{level.capitalize()}'")
# Upload the dataset to the Hugging Face Hub
result = hf_api.upload_file(
path_or_fileobj=file_name,
path_in_repo=file_name,
repo_id=repo_id,
repo_type="dataset",
revision="main",
commit_message=f"Eval dataset in {repo_id} for {level.capitalize()}"
)
print(f"Dataset uploaded: {result}") # Print result with the uploaded file URL/info
# Construct files_url and store it for this complexity level
files_url = f"hf://datasets/{NAMESPACE}/{DATASET_NAME}"
files_url_dict[level] = files_url
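As an optional sanity check, list the files now stored in each dataset repo on the datastore (uses HfApi.list_repo_files; the exact output depends on your local setup):
# Optional: confirm each repo contains its JSONL file
for level, url in files_url_dict.items():
    repo_id = f"{NAMESPACE}/nurse-triage-notes-{level}"
    files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    print(f"{url} -> {files}")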
🧪 Evaluator Flow
This section defines the evaluation configuration used to assess model performance on triage note classification using a custom evaluator.
EVALUATOR_CONFIG = {
"eval_config": {
"type": "custom",
"params": {
"parallelism": 8
},
"tasks": {
"triage_classification": {
"type": "completion",
"params": {
"template": {
"messages": [
{
"role": "system",
"content": (
"You are an expert ER triage nurse. Your task is to classify the following triage note into one of the five Emergency Severity Index (ESI) levels."
f" The possible levels are: {', '.join([repr(level) for level in ESI_LEVELS])}."
" Carefully analyze the clinical details in the triage note, focusing on patient acuity, resource needs, and risk of rapid deterioration."
" Respond with only the selected ESI level description, exactly matching one of the listed possibilities. Do not provide extra text or explanation."
)
},
{
"role": "user",
"content": (
"Triage Note: {{item.content}}\n"
"Classify the ESI level for this note based on the provided definitions."
" Respond in JSON format only: { \"esi_level_description\": \"...\" }"
)
}
],
}
},
"metrics": {
"accuracy": {
"type": "string-check",
"params": {
"check": [
"{{sample.output_text}}",
"contains",
"{{item.esi_level_description}}"
]
}
}
},
"dataset": {
"files_url": None
}
}
}
},
"target_config": {
"type": "model",
"model": {
"api_endpoint": {
"url": None,
"model_id": None
}
}
}
}
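The files_url and api_endpoint fields are deliberately left as None placeholders; the evaluation loop below fills them in per model and per complexity level. A small guard (a sketch, not part of the Evaluator API) can catch an accidentally incomplete config before submission:
# Hypothetical guard: fail fast if a placeholder was left unfilled
def assert_config_complete(cfg):
    task = cfg["eval_config"]["tasks"]["triage_classification"]
    assert task["dataset"]["files_url"], "dataset files_url is not set"
    endpoint = cfg["target_config"]["model"]["api_endpoint"]
    assert endpoint["url"] and endpoint["model_id"], "target endpoint is not set"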
🔍 Model evaluation loop and configuration
This section compares multiple models (A/B testing) on the triage note classification task across each complexity level (Simple, Moderate, Complex).
The models evaluated are:
Qwen3-8B (Qwen/Qwen3-8B)
Nemotron Nano 9B v2 (nvidia/nvidia-nemotron-nano-9b-v2)
For each complexity level, the accuracy score for each model is printed, allowing for side-by-side evaluation of how each model performs at every complexity.
import time
import copy
import pandas as pd
# This code assumes EVALUATOR_CONFIG is available in the notebook scope
# NOTE: example endpoints; replace these URLs with the addresses where your models are served
MODEL_SPECS = [
{
"name": "Qwen3-8B",
"model_id": "Qwen/Qwen3-8B",
"url": "http://204.52.25.18:8000/v1/completions"
},
{
"name": "Nemotron Nano 9B v2",
"model_id": "nvidia/nvidia-nemotron-nano-9b-v2",
"url": "http://204.52.25.18:8001/v1/completions"
}
]
COMPLEXITIES = ["simple", "moderate", "complex"]
def run_evaluation(client, namespace, evaluator_config, model_spec, complexity, files_url_dict):
"""
Populates the evaluator_config, filling in the files_url and endpoint, then runs evaluation.
Returns accuracy for the given model+complexity.
"""
# Work with a deepcopy of the config for isolation
config = copy.deepcopy(evaluator_config)
# Set the dataset URL for the current complexity
config['eval_config']['tasks']['triage_classification']['dataset']['files_url'] = files_url_dict[complexity]
# Set the API endpoint and model_id for this model
config['target_config']['model']['api_endpoint']['url'] = model_spec['url']
config['target_config']['model']['api_endpoint']['model_id'] = model_spec['model_id']
# Submit evaluation job
job = client.evaluation.jobs.create(
namespace=namespace,
# here, pass through the two parts
target=config['target_config'],
config=config['eval_config']
)
print(f"Submitted evaluation job for model '{model_spec['name']}' on complexity '{complexity.capitalize()}' (job id: {job.id})")
    # Poll until the job completes, printing progress only when it changes
    last_progress = -1
    while True:
        time.sleep(3)
        progress = client.evaluation.jobs.status(job.id).progress or 0
        if progress != last_progress and progress < 100:
            print(f"  ⏳ Job {job.id} is {progress}% done")
            last_progress = progress
        if progress >= 100:
            break
print(f" ✔️ Job done for model '{model_spec['name']}' on complexity '{complexity.capitalize()}'")
# Fetch results and extract accuracy
results = client.evaluation.jobs.results(job.id)
accuracy_value = results.tasks['triage_classification'].metrics['accuracy'].scores['string-check'].value
return accuracy_value
results_dict = {model_spec['name']: {} for model_spec in MODEL_SPECS}
print("Starting evaluation jobs (per model, per complexity)...")
for complexity in COMPLEXITIES:
for spec in MODEL_SPECS:
accuracy = run_evaluation(client, NAMESPACE, EVALUATOR_CONFIG, spec, complexity, files_url_dict)
results_dict[spec['name']][complexity.capitalize()] = 100 * accuracy # Store as percentage
print(f" --> DONE: {spec['name']}, {complexity.capitalize()} (Accuracy: {100*accuracy:.2f}%)\n")
df_results = pd.DataFrame(results_dict).T
df_results = df_results[[c.capitalize() for c in COMPLEXITIES]]
print("\nModel Accuracy Table (%):")
display(df_results.style.format("{:.2f}"))
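Optionally, visualize the accuracy table as a grouped bar chart (a sketch; assumes matplotlib is installed in the notebook environment):
import matplotlib.pyplot as plt

# One group of bars per model, one bar per complexity level
ax = df_results.plot(kind="bar", figsize=(8, 4), rot=0)
ax.set_ylabel("Accuracy (%)")
ax.set_title("ESI classification accuracy by model and note complexity")
plt.tight_layout()
plt.show()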