Overview
ValiqorEvalClient runs quality evaluations on your AI application’s inputs and outputs. It supports both synchronous and asynchronous evaluation, heuristic and LLM-based metrics, trace-based evaluation, and result analytics.
Access it via the unified client:
from valiqor import ValiqorClient
client = ValiqorClient(api_key="your-api-key")
eval_client = client.eval
Or use it standalone:
from valiqor.eval import ValiqorEvalClient
eval_client = ValiqorEvalClient(api_key="your-api-key")
Supports context manager protocol: with ValiqorEvalClient(...) as ec:
Constructor
ValiqorEvalClient(
api_key: Optional[str] = None,
project_name: Optional[str] = None,
base_url: Optional[str] = None,
timeout: int = 300,
openai_api_key: Optional[str] = None,
_config: Optional[Dict[str, Any]] = None,
)
| Parameter | Type | Default | Description |
|---|---|---|---|
| api_key | Optional[str] | None | Valiqor API key. Falls back to env/config. |
| project_name | Optional[str] | None | Default project name. |
| base_url | Optional[str] | None | Backend URL override. |
| timeout | int | 300 | Request timeout in seconds. |
| openai_api_key | Optional[str] | None | OpenAI key for LLM-based metrics. Falls back to VALIQOR_OPENAI_API_KEY. |
Core Methods
evaluate()
Run a synchronous evaluation on a dataset. Auto-polls if the backend returns an async response.
def evaluate(
self,
dataset: List[Dict[str, Any]],
metrics: List[str],
project_name: Optional[str] = None,
run_name: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
openai_api_key: Optional[str] = None,
) -> EvaluationResult
| Parameter | Type | Default | Description |
|---|---|---|---|
| dataset | List[Dict] | — | List of items with input, output, and optionally context, expected. |
| metrics | List[str] | — | Metric keys to evaluate. See available metrics. |
| project_name | Optional[str] | None | Override project name for this run. |
| run_name | Optional[str] | None | Custom name for this evaluation run. |
| metadata | Optional[Dict] | None | Arbitrary metadata to attach to the run. |
| openai_api_key | Optional[str] | None | Override OpenAI key for this run. |
Returns: EvaluationResult
result = client.eval.evaluate(
dataset=[
{"input": "What is RAG?", "output": "RAG stands for Retrieval-Augmented Generation.", "context": "RAG is a technique that augments LLMs with retrieved documents."},
{"input": "Explain caching", "output": "Caching stores data for faster access.", "expected": "Caching is storing data temporarily for quick retrieval."}
],
metrics=["answer_relevance", "factual_accuracy", "coherence"]
)
print(f"Overall: {result.overall_score}")
print(f"Scores: {result.aggregate_scores}")
evaluate_trace()
Evaluate a trace dict (not a trace_id). The client parses messages and spans from the trace to build the evaluation dataset.
def evaluate_trace(
self,
trace: Dict[str, Any],
metrics: List[str],
project_name: Optional[str] = None,
run_name: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
openai_api_key: Optional[str] = None,
) -> EvaluationResult
| Parameter | Type | Description |
|---|---|---|
| trace | Dict[str, Any] | A trace dict containing messages and/or spans. |
| metrics | List[str] | Metric keys to evaluate. |
trace_data = client.traces.get_full_trace("trace-id-123")
result = client.eval.evaluate_trace(
trace=trace_data.to_dict(),
metrics=["task_adherence", "response_completeness"]
)
evaluate_async()
Start an asynchronous evaluation. Always returns a JobHandle regardless of dataset size.
def evaluate_async(
self,
dataset: List[Dict[str, Any]],
metrics: List[str],
project_name: Optional[str] = None,
run_name: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
openai_api_key: Optional[str] = None,
) -> JobHandle
Returns: JobHandle
handle = client.eval.evaluate_async(
dataset=large_dataset,
metrics=["hallucination", "coherence"]
)
# Wait with progress callback
handle.wait(on_progress=lambda s: print(f"{s.progress_percent}%"))
# Get result
result = handle.result()
Result Retrieval
get_run()
Retrieve an evaluation result by run ID.
def get_run(self, run_id: str) -> EvaluationResult
get_run_result()
Alias for get_run(). Fetch a completed evaluation result.
def get_run_result(self, run_id: str) -> EvaluationResult
get_run_metrics()
Get per-metric scores for a run.
def get_run_metrics(self, run_id: str) -> List[RunMetric]
metrics = client.eval.get_run_metrics("run-id-123")
for m in metrics:
print(f"{m.display_name}: {m.score}")
get_run_items()
Get paginated per-item evaluation details.
def get_run_items(
self,
run_id: str,
page: int = 1,
page_size: int = 20,
) -> EvalItemsPage
get_item_detail()
Get full detail for a single evaluated item including per-metric scores and explanations.
def get_item_detail(self, run_id: str, item_id: str) -> EvalItemDetail
Analytics
get_trends()
Get evaluation score trends over time for a project.
def get_trends(
self,
project_id: str,
period: str = "last_7_days",
metrics: Optional[List[str]] = None,
) -> List[EvalTrendPoint]
compare_runs()
Compare scores across multiple evaluation runs.
def compare_runs(self, run_ids: List[str]) -> EvalRunComparison
comparison = client.eval.compare_runs(["run-1", "run-2", "run-3"])
for score in comparison.overall_scores:
print(score)
Project & Metric Management
list_projects()
def list_projects(self) -> List[ProjectInfo]
create_project()
def create_project(
self,
name: str,
key: Optional[str] = None,
model_name: Optional[str] = None,
) -> ProjectInfo
list_metric_templates()
List all available metric templates.
def list_metric_templates(self) -> List[MetricInfo]
list_project_metrics()
List metrics configured for a project.
def list_project_metrics(
self,
project_name: Optional[str] = None,
) -> List[MetricInfo]
add_project_metric()
Add a metric to a project’s configuration.
def add_project_metric(
self,
metric_key: str,
display_name: str,
project_name: Optional[str] = None,
template_id: Optional[str] = None,
value_type: str = "numeric",
default_weight: float = 1.0,
) -> MetricInfo
get_project_stats()
def get_project_stats(
self,
project_name: Optional[str] = None,
) -> Dict[str, Any]
Job Management
get_job_status()
Check the status of an async evaluation job.
def get_job_status(self, job_id: str) -> JobStatus
cancel_job()
Cancel a running async job.
def cancel_job(self, job_id: str) -> CancelResponse
Available Metrics
Heuristic Metrics
| Key | Description |
|---|---|
| contains | Check if output contains expected content. |
| equals | Exact match comparison. |
| levenshtein | String similarity via Levenshtein distance. |
| regex_match | Regex pattern matching on output. |
LLM-Based Metrics
| Key | Description |
|---|---|
| hallucination | Detects fabricated information not supported by context. |
| answer_relevance | Measures how relevant the answer is to the question. |
| context_precision | Evaluates if retrieved context is precise and focused. |
| context_recall | Measures if all relevant context was retrieved. |
| coherence | Evaluates logical flow and consistency. |
| fluency | Measures language quality and readability. |
| factual_accuracy | Verifies factual correctness of the output. |
| task_adherence | Checks if the output follows the task instructions. |
| response_completeness | Evaluates if the response fully addresses the query. |