Overview

ValiqorEvalClient runs quality evaluations on your AI application’s inputs and outputs. It supports both synchronous and asynchronous evaluation, heuristic and LLM-based metrics, trace-based evaluation, and result analytics. Access it via the unified client:
from valiqor import ValiqorClient

client = ValiqorClient(api_key="your-api-key")
eval_client = client.eval
Or use it standalone:
from valiqor.eval import ValiqorEvalClient

eval_client = ValiqorEvalClient(api_key="your-api-key")
Supports the context manager protocol (with ValiqorEvalClient(...) as ec:); see the sketch below.
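A minimal sketch of context manager usage; the with block is documented above, while the dataset contents and metric choice are illustrative:
from valiqor.eval import ValiqorEvalClient

# Use the client as a context manager so it is cleaned up on exit.
with ValiqorEvalClient(api_key="your-api-key") as ec:
    result = ec.evaluate(
        dataset=[{"input": "Hi", "output": "Hello! How can I help?"}],
        metrics=["coherence"],
    )
    print(result.overall_score)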

Constructor

ValiqorEvalClient(
    api_key: Optional[str] = None,
    project_name: Optional[str] = None,
    base_url: Optional[str] = None,
    timeout: int = 300,
    openai_api_key: Optional[str] = None,
    _config: Optional[Dict[str, Any]] = None,
)
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| api_key | Optional[str] | None | Valiqor API key. Falls back to env/config. |
| project_name | Optional[str] | None | Default project name. |
| base_url | Optional[str] | None | Backend URL override. |
| timeout | int | 300 | Request timeout in seconds. |
| openai_api_key | Optional[str] | None | OpenAI key for LLM-based metrics. Falls back to VALIQOR_OPENAI_API_KEY. |

Core Methods

evaluate()

Run a synchronous evaluation on a dataset. Auto-polls if the backend returns an async response.
def evaluate(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| dataset | List[Dict] | required | Items with input and output, plus optional context and expected fields. |
| metrics | List[str] | required | Metric keys to evaluate. See available metrics. |
| project_name | Optional[str] | None | Override project name for this run. |
| run_name | Optional[str] | None | Custom name for this evaluation run. |
| metadata | Optional[Dict] | None | Arbitrary metadata to attach to the run. |
| openai_api_key | Optional[str] | None | Override OpenAI key for this run. |
Returns: EvaluationResult
result = client.eval.evaluate(
    dataset=[
        {"input": "What is RAG?", "output": "RAG stands for Retrieval-Augmented Generation.", "context": "RAG is a technique that augments LLMs with retrieved documents."},
        {"input": "Explain caching", "output": "Caching stores data for faster access.", "expected": "Caching is storing data temporarily for quick retrieval."}
    ],
    metrics=["answer_relevance", "factual_accuracy", "coherence"]
)
print(f"Overall: {result.overall_score}")
print(f"Scores: {result.aggregate_scores}")

evaluate_trace()

Evaluate a trace dict (not a trace_id). The client parses messages and spans from the trace to build the evaluation dataset.
def evaluate_trace(
    self,
    trace: Dict[str, Any],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
| Parameter | Type | Description |
| --- | --- | --- |
| trace | Dict[str, Any] | A trace dict containing messages and/or spans. |
| metrics | List[str] | Metric keys to evaluate. |
trace_data = client.traces.get_full_trace("trace-id-123")
result = client.eval.evaluate_trace(
    trace=trace_data.to_dict(),
    metrics=["task_adherence", "response_completeness"]
)

evaluate_async()

Start an asynchronous evaluation. Always returns a JobHandle regardless of dataset size.
def evaluate_async(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> JobHandle
Returns: JobHandle
handle = client.eval.evaluate_async(
    dataset=large_dataset,
    metrics=["hallucination", "coherence"]
)

# Wait with progress callback
handle.wait(on_progress=lambda s: print(f"{s.progress_percent}%"))

# Get result
result = handle.result()

Result Retrieval

get_run()

Retrieve an evaluation result by run ID.
def get_run(self, run_id: str) -> EvaluationResult

get_run_result()

Alias for get_run(). Fetch a completed evaluation result.
def get_run_result(self, run_id: str) -> EvaluationResult
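For example (the run ID is illustrative):
# Fetch a previously completed evaluation by its run ID.
result = client.eval.get_run("run-id-123")
print(result.overall_score)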

get_run_metrics()

Get per-metric scores for a run.
def get_run_metrics(self, run_id: str) -> List[RunMetric]
metrics = client.eval.get_run_metrics("run-id-123")
for m in metrics:
    print(f"{m.display_name}: {m.score}")

get_run_items()

Get paginated per-item evaluation details.
def get_run_items(
    self,
    run_id: str,
    page: int = 1,
    page_size: int = 20,
) -> EvalItemsPage
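A sketch of paging through all items; the EvalItemsPage attributes used (items, total_pages) are assumptions, since the page model's fields are not documented here:
page_num = 1
while True:
    page = client.eval.get_run_items("run-id-123", page=page_num, page_size=20)
    for item in page.items:            # assumed attribute
        print(item)
    if page_num >= page.total_pages:   # assumed attribute
        break
    page_num += 1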

get_item_detail()

Get full detail for a single evaluated item including per-metric scores and explanations.
def get_item_detail(self, run_id: str, item_id: str) -> EvalItemDetail
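A brief sketch; the EvalItemDetail attributes shown (metric_scores entries with score and explanation) are assumptions based on the description above:
detail = client.eval.get_item_detail("run-id-123", "item-id-456")
for ms in detail.metric_scores:        # assumed attribute
    print(ms.score, ms.explanation)    # assumed fields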

Analytics

get_trends()

Get evaluation score trends over time for a project.
def get_trends(
    self,
    project_id: str,
    period: str = "last_7_days",
    metrics: Optional[List[str]] = None,
) -> List[EvalTrendPoint]
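For example (the project ID is illustrative; EvalTrendPoint fields are not documented here, so the points are printed whole):
points = client.eval.get_trends(
    project_id="proj-id-123",
    period="last_7_days",
    metrics=["coherence", "fluency"],
)
for point in points:
    print(point)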

compare_runs()

Compare scores across multiple evaluation runs.
def compare_runs(self, run_ids: List[str]) -> EvalRunComparison
comparison = client.eval.compare_runs(["run-1", "run-2", "run-3"])
for score in comparison.overall_scores:
    print(score)

Project & Metric Management

list_projects()

def list_projects(self) -> List[ProjectInfo]

create_project()

def create_project(
    self,
    name: str,
    key: Optional[str] = None,
    model_name: Optional[str] = None,
) -> ProjectInfo
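For example, creating a project using only the documented parameters (values are illustrative):
project = client.eval.create_project(
    name="chatbot-evals",
    key="chatbot-evals",    # optional key
    model_name="gpt-4o",    # optional model label
)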

list_metric_templates()

List all available metric templates.
def list_metric_templates(self) -> List[MetricInfo]

list_project_metrics()

List metrics configured for a project.
def list_project_metrics(
    self,
    project_name: Optional[str] = None,
) -> List[MetricInfo]
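A sketch of listing templates alongside a project's configured metrics; MetricInfo fields are not documented here, so the objects are printed whole:
templates = client.eval.list_metric_templates()
print(f"{len(templates)} metric templates available")

for metric in client.eval.list_project_metrics(project_name="chatbot-evals"):
    print(metric)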

add_project_metric()

Add a metric to a project’s configuration.
def add_project_metric(
    self,
    metric_key: str,
    display_name: str,
    project_name: Optional[str] = None,
    template_id: Optional[str] = None,
    value_type: str = "numeric",
    default_weight: float = 1.0,
) -> MetricInfo
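For example, adding a metric using the documented parameters (the project name is illustrative):
metric = client.eval.add_project_metric(
    metric_key="coherence",
    display_name="Coherence",
    project_name="chatbot-evals",
    value_type="numeric",
    default_weight=1.0,
)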

get_project_stats()

def get_project_stats(
    self,
    project_name: Optional[str] = None,
) -> Dict[str, Any]

Job Management

get_job_status()

Check the status of an async evaluation job.
def get_job_status(self, job_id: str) -> JobStatus

cancel_job()

Cancel a running async job.
def cancel_job(self, job_id: str) -> CancelResponse
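A sketch of manual job management as an alternative to handle.wait(); the handle's job_id attribute and any JobStatus fields beyond progress_percent (shown earlier) are assumptions:
handle = client.eval.evaluate_async(dataset=large_dataset, metrics=["coherence"])

# Poll once instead of blocking.
status = client.eval.get_job_status(handle.job_id)   # job_id attribute assumed
print(status.progress_percent)

# Abort the job if the result is no longer needed.
client.eval.cancel_job(handle.job_id)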

Available Metrics

Heuristic Metrics

| Key | Description |
| --- | --- |
| contains | Check if output contains expected content. |
| equals | Exact match comparison. |
| levenshtein | String similarity via Levenshtein distance. |
| regex_match | Regex pattern matching on output. |
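Heuristic metrics compare the output against the item's expected field, so no OpenAI key should be needed. A minimal sketch with illustrative data:
result = client.eval.evaluate(
    dataset=[
        {"input": "Capital of France?", "output": "The capital is Paris.", "expected": "Paris"},
    ],
    metrics=["contains", "equals", "levenshtein"],
)
print(result.aggregate_scores)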

LLM-Based Metrics

| Key | Description |
| --- | --- |
| hallucination | Detects fabricated information not supported by context. |
| answer_relevance | Measures how relevant the answer is to the question. |
| context_precision | Evaluates if retrieved context is precise and focused. |
| context_recall | Measures if all relevant context was retrieved. |
| coherence | Evaluates logical flow and consistency. |
| fluency | Measures language quality and readability. |
| factual_accuracy | Verifies factual correctness of the output. |
| task_adherence | Checks if the output follows the task instructions. |
| response_completeness | Evaluates if the response fully addresses the query. |
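LLM-based metrics need an OpenAI key, supplied via the constructor, the VALIQOR_OPENAI_API_KEY environment variable, or per run. A sketch of a context-grounded run; per the hallucination description above, each item carries a context field:
result = client.eval.evaluate(
    dataset=[
        {
            "input": "What is RAG?",
            "output": "RAG combines retrieval with generation.",
            "context": "RAG augments LLMs with retrieved documents.",
        },
    ],
    metrics=["hallucination", "context_precision", "context_recall"],
    openai_api_key="sk-...",  # optional per-run override
)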