Skip to main content
Failure Analysis (FA) classifies your AI outputs into failure buckets and subcategories, scores severity, and explains root causes — so you can fix the actual problem, not just observe metrics.
New to Valiqor? Start with the 5-minute quickstart to see FA in action before diving into this full guide.

Two analysis modes

Pass your existing AI inputs/outputs directly — no tracing required.
from valiqor import ValiqorClient

client = ValiqorClient(api_key="vq_...", project_name="my-app")

result = client.failure_analysis.run(
    dataset=[
        {
            "input": "What is the capital of France?",
            "output": "The capital of France is Berlin.",
            "context": ["The capital of France is Paris."],
        },
        {
            "input": "Summarize the climate report.",
            "output": "The report discusses economic trends in Asia.",
            "context": ["Global temperatures rose 1.1°C since pre-industrial times."],
        },
    ]
)
Each item in the dataset requires:
  • input (str) — the user prompt/query
  • output (str) — the model’s response
  • context (list[str], optional) — retrieved documents / reference passages
  • tool_calls (list[dict], optional) — tool invocations

Full run() signature

result = client.failure_analysis.run(
    # Source — provide one of:
    trace_id=None,               # Trace ID for full trace analysis
    dataset=None,                # List of {input, output, context} dicts

    # Configuration
    project_name=None,           # Overrides client-level project_name
    feature_kind=None,           # "rag", "agent", "agentic_rag", "generic_llm"
    run_eval=True,               # Run evaluation metrics alongside FA
    run_security=True,           # Run security audit alongside FA
    run_scan=True,               # Attach AST scan data if available

    # Filtering
    mandatory_eval_metrics=None, # Eval metrics that must run
    mandatory_security_categories=None,  # Security categories that must run
    subcategories=None,          # Filter to specific failure subcategories
    buckets=None,                # Filter to specific failure buckets

    # BYOK
    openai_api_key=None,         # Your OpenAI key for LLM judges
)
run() is transparent-async — if the backend returns HTTP 202, the SDK auto-polls until complete. For explicit async control, use run_async() instead.

Async analysis

For large datasets or when you want non-blocking execution:
# Start async — always returns a handle
handle = client.failure_analysis.run_async(
    dataset=large_dataset,
    project_name="my-app",
)

# Option 1: Wait with progress callback
result = handle.wait(
    on_progress=lambda s: print(f"{s.progress_percent:.0f}% ({s.current_item}/{s.total_items})")
)

# Option 2: Poll manually
while handle.is_running():
    status = handle.status()
    print(f"Status: {status.status} — {status.progress_percent:.0f}%")
    import time; time.sleep(2)

result = handle.result()

# Option 3: Cancel
handle.cancel()
See the Async & Batch guide for more details.

Reading results

Summary

summary = result.summary

print(f"Items analyzed:      {summary.total_items}")
print(f"Items with failures: {summary.items_with_failures}")
print(f"Total failures:      {summary.total_failures_detected}")
print(f"Overall severity:    {summary.overall_severity}/5")
print(f"Primary failure:     {summary.primary_failure_name}")
print(f"Should alert:        {summary.should_alert}")
print(f"Should gate CI:      {summary.should_gate_ci}")

Failure tags

for tag in result.failure_tags:
    if tag.decision == "fail":
        print(f"❌ [{tag.bucket_name}] {tag.subcategory_name}")
        print(f"   Severity: {tag.severity}/5  Confidence: {tag.confidence}")
        print(f"   Detector: {tag.detector_type_used}")
        if tag.judge_rationale:
            print(f"   Rationale: {tag.judge_rationale}")
        print(f"   Item index: {tag.item_index}")

Linked results

FA can run eval and security alongside analysis:
# Eval metrics from the same run
print(f"Eval run ID: {result.eval_run_id}")
print(f"Eval metrics: {result.eval_metrics}")

# Security flags — values are "fail" or "pass" per S-category
print(f"Security batch: {result.security_batch_id}")
print(f"Security flags: {result.security_flags}")
# Example output: {'S1': 'pass', 'S9': 'fail', 'S10': 'pass', ...}

Filtering by bucket or subcategory

Focus analysis on specific failure types:
# Only check hallucination-related failures
result = client.failure_analysis.run(
    dataset=my_data,
    buckets=["hallucination"],
)

# Only check specific subcategories
result = client.failure_analysis.run(
    dataset=my_data,
    subcategories=["entity_fabrication", "contradicts_source"],
)

Feature kind hints

Tell FA what kind of app you’re analyzing for better classification:
# RAG application
result = client.failure_analysis.run(
    dataset=my_data,
    feature_kind="rag",
)

# Agent with tool calls
result = client.failure_analysis.run(
    dataset=agent_data,
    feature_kind="agent",
)
| Feature Kind    | Best for                            |
| --------------- | ----------------------------------- |
| `"rag"`         | Retrieval-augmented generation apps |
| `"agent"`       | Tool-using agents                   |
| `"agentic_rag"` | Agents with retrieval steps         |
| `"generic_llm"` | Simple chat / completion apps       |

Browsing the failure taxonomy

# Get full taxonomy (buckets + subcategories)
taxonomy = client.failure_analysis.get_taxonomy()

for bucket in taxonomy:
    print(f"\n{bucket['name']} ({bucket['id']})")
    for sub in bucket.get("subcategories", []):
        print(f"  - {sub['name']} ({sub['scope']}, {sub['detector_type']})")
# Get subcategories for a specific bucket
subs = client.failure_analysis.get_subcategories(
    bucket_id="hallucination",
    scope="rag",           # Filter by scope
    detector_type="llm_judge",  # Filter by detector
)
# Deep-dive into a bucket
details = client.failure_analysis.get_bucket_details(bucket_id="hallucination")

Run history and analytics

# List past runs
runs = client.failure_analysis.list_runs(
    project_name="my-app",
    limit=10,
    offset=0,
)

# Count runs
count = client.failure_analysis.count_runs(project_name="my-app")

# Get original inputs for a run
inputs = client.failure_analysis.get_run_inputs(run_id="run_abc123")

# Get tags for a run
tags = client.failure_analysis.get_tags(run_id="run_abc123")
# Aggregated insights across runs
insights = client.failure_analysis.get_insights(project_name="my-app")

# Time-series trends
trends = client.failure_analysis.get_trends(
    project_name="my-app",
    period="7d",  # or "30d", "90d"
)

# Security insights from FA runs
sec_insights = client.failure_analysis.get_security_insights(
    project_name="my-app"
)

Playground (single-item)

Quick single-item analysis for testing — rate-limited to 10/day, 2/min:
result = client.failure_analysis.playground(
    input_text="What medications interact with warfarin?",
    output_text="Warfarin has no known drug interactions.",
    context=["Warfarin interacts with aspirin, ibuprofen, and many antibiotics."],
)

CI/CD gating

result = client.failure_analysis.run(dataset=test_cases)

if result.summary.should_gate_ci:
    print("❌ Critical failures — blocking deployment")
    exit(1)

if result.summary.needs_human_review:
    print("⚠️ Uncertain results — flagged for human review")

CLI

# Run FA from a dataset file
valiqor fa run --dataset test_data.json --project-name my-app

# Run FA from a trace
valiqor fa run --trace-id tr_abc123

# Check status
valiqor fa status --run-id run_xyz

# Get results
valiqor fa result --run-id run_xyz --output results.json

# List past runs
valiqor fa list --project-name my-app --limit 10