Skip to main content

Overview

ValiqorEvalClient runs quality evaluations on your AI application’s inputs and outputs. It supports both synchronous and asynchronous evaluation, heuristic and LLM-based metrics, trace-based evaluation, and result analytics. Access it via the unified client:
from valiqor import ValiqorClient

client = ValiqorClient(api_key="your-api-key")
eval_client = client.eval
Or use it standalone:
from valiqor.eval import ValiqorEvalClient

eval_client = ValiqorEvalClient(api_key="your-api-key")
Supports context manager protocol: with ValiqorEvalClient(...) as ec:

Constructor

ValiqorEvalClient(
    api_key: Optional[str] = None,
    project_name: Optional[str] = None,
    base_url: Optional[str] = None,
    timeout: int = 300,
    openai_api_key: Optional[str] = None,
    _config: Optional[Dict[str, Any]] = None,
)
Parameter | Type | Default | Description
api_key | Optional[str] | None | Valiqor API key. Falls back to env/config.
project_name | Optional[str] | None | Default project name.
base_url | Optional[str] | None | Backend URL override.
timeout | int | 300 | Request timeout in seconds.
openai_api_key | Optional[str] | None | OpenAI key for LLM-based metrics. Falls back to VALIQOR_OPENAI_API_KEY.

Core Methods

evaluate()

Run a synchronous evaluation on a dataset. Auto-polls if the backend returns an async response.
def evaluate(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
Parameter | Type | Default | Description
dataset | List[Dict] | required | List of items with input, output, and optionally context, expected.
metrics | List[str] | required | Metric keys to evaluate. See available metrics.
project_name | Optional[str] | None | Override project name for this run.
run_name | Optional[str] | None | Custom name for this evaluation run.
metadata | Optional[Dict] | None | Arbitrary metadata to attach to the run.
openai_api_key | Optional[str] | None | Override OpenAI key for this run.
Returns: EvaluationResult
result = client.eval.evaluate(
    dataset=[
        {"input": "What is RAG?", "output": "RAG stands for Retrieval-Augmented Generation.", "context": "RAG is a technique that augments LLMs with retrieved documents."},
        {"input": "Explain caching", "output": "Caching stores data for faster access.", "expected": "Caching is storing data temporarily for quick retrieval."}
    ],
    metrics=["answer_relevance", "factual_accuracy", "coherence"]
)
print(f"Overall: {result.overall_score}")
print(f"Scores: {result.aggregate_scores}")

evaluate_trace()

Evaluate a trace dict (not a trace_id). The client parses messages and spans from the trace to build the evaluation dataset.
def evaluate_trace(
    self,
    trace: Dict[str, Any],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
Parameter | Type | Description
trace | Dict[str, Any] | A trace dict containing messages and/or spans.
metrics | List[str] | Metric keys to evaluate.
trace_data = client.traces.get_full_trace("trace-id-123")
result = client.eval.evaluate_trace(
    trace=trace_data.to_dict(),
    metrics=["task_adherence", "response_completeness"]
)

evaluate_async()

Start an asynchronous evaluation. Always returns a JobHandle regardless of dataset size.
def evaluate_async(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> JobHandle
Returns: JobHandle
handle = client.eval.evaluate_async(
    dataset=large_dataset,
    metrics=["hallucination", "coherence"]
)

# Wait with progress callback
handle.wait(on_progress=lambda s: print(f"{s.progress_percent}%"))

# Get result
result = handle.result()

Result Retrieval

get_run()

Retrieve an evaluation result by run ID.
def get_run(self, run_id: str) -> EvaluationResult

get_run_result()

Alias for get_run(). Fetch a completed evaluation result.
def get_run_result(self, run_id: str) -> EvaluationResult

get_run_metrics()

Get per-metric scores for a run.
def get_run_metrics(self, run_id: str) -> List[RunMetric]
metrics = client.eval.get_run_metrics("run-id-123")
for m in metrics:
    print(f"{m.display_name}: {m.score}")

get_run_items()

Get paginated per-item evaluation details.
def get_run_items(
    self,
    run_id: str,
    page: int = 1,
    page_size: int = 20,
) -> EvalItemsPage

get_item_detail()

Get full detail for a single evaluated item including per-metric scores and explanations.
def get_item_detail(self, run_id: str, item_id: str) -> EvalItemDetail

Analytics

get_trends()

Get evaluation score trends over time for a project.
def get_trends(
    self,
    project_id: str,
    period: str = "last_7_days",
    metrics: Optional[List[str]] = None,
) -> List[EvalTrendPoint]

compare_runs()

Compare scores across multiple evaluation runs.
def compare_runs(self, run_ids: List[str]) -> EvalRunComparison
comparison = client.eval.compare_runs(["run-1", "run-2", "run-3"])
for score in comparison.overall_scores:
    print(score)

Project & Metric Management

list_projects()

def list_projects(self) -> List[ProjectInfo]

create_project()

def create_project(
    self,
    name: str,
    key: Optional[str] = None,
    model_name: Optional[str] = None,
) -> ProjectInfo

list_metric_templates()

List all available metric templates.
def list_metric_templates(self) -> List[MetricInfo]

list_project_metrics()

List metrics configured for a project.
def list_project_metrics(
    self,
    project_name: Optional[str] = None,
) -> List[MetricInfo]

add_project_metric()

Add a metric to a project’s configuration.
def add_project_metric(
    self,
    metric_key: str,
    display_name: str,
    project_name: Optional[str] = None,
    template_id: Optional[str] = None,
    value_type: str = "numeric",
    default_weight: float = 1.0,
) -> MetricInfo

get_project_stats()

def get_project_stats(
    self,
    project_name: Optional[str] = None,
) -> Dict[str, Any]

Job Management

get_job_status()

Check the status of an async evaluation job.
def get_job_status(self, job_id: str) -> JobStatus

cancel_job()

Cancel a running async job.
def cancel_job(self, job_id: str) -> CancelResponse

Available Metrics

Heuristic Metrics

Key | Description
contains | Check if output contains expected content.
equals | Exact match comparison.
levenshtein | String similarity via Levenshtein distance.
regex_match | Regex pattern matching on output.

LLM-Based Metrics

Key | Description
hallucination | Detects fabricated information not supported by context.
answer_relevance | Measures how relevant the answer is to the question.
context_precision | Evaluates if retrieved context is precise and focused.
context_recall | Measures if all relevant context was retrieved.
coherence | Evaluates logical flow and consistency.
fluency | Measures language quality and readability.
factual_accuracy | Verifies factual correctness of the output.
task_adherence | Checks if the output follows the task instructions.
response_completeness | Evaluates if the response fully addresses the query.