## Documentation Index

Fetch the complete documentation index at: https://docs.valiqor.com/llms.txt

Use this file to discover all available pages before exploring further.
## Overview

`ValiqorEvalClient` runs quality evaluations on your AI application’s inputs and outputs. It supports synchronous and asynchronous evaluation, heuristic and LLM-based metrics, trace-based evaluation, and result analytics.

Access it via the unified client:

```python
from valiqor import ValiqorClient

client = ValiqorClient(api_key="your-api-key")
eval_client = client.eval
```

Or use it standalone:

```python
from valiqor.eval import ValiqorEvalClient

eval_client = ValiqorEvalClient(api_key="your-api-key")
```

The client supports the context manager protocol: `with ValiqorEvalClient(...) as ec:`.
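A minimal context-manager sketch; that exiting the block cleans up the client's underlying resources is an assumption, only the protocol itself is documented above:

```python
from valiqor.eval import ValiqorEvalClient

# Exiting the block closes the client; the exact cleanup behavior
# is an assumption, not stated in this reference.
with ValiqorEvalClient(api_key="your-api-key") as ec:
    result = ec.evaluate(
        dataset=[{"input": "Hi", "output": "Hello!"}],
        metrics=["fluency"],
    )
    print(result.overall_score)
```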
## Constructor

```python
ValiqorEvalClient(
    api_key: Optional[str] = None,
    project_name: Optional[str] = None,
    base_url: Optional[str] = None,
    timeout: int = 300,
    openai_api_key: Optional[str] = None,
    _config: Optional[Dict[str, Any]] = None,
)
```
| Parameter | Type | Default | Description |
|---|---|---|---|
| api_key | Optional[str] | None | Valiqor API key. Falls back to env/config. |
| project_name | Optional[str] | None | Default project name. |
| base_url | Optional[str] | None | Backend URL override. |
| timeout | int | 300 | Request timeout in seconds. |
| openai_api_key | Optional[str] | None | OpenAI key for LLM-based metrics. Falls back to VALIQOR_OPENAI_API_KEY. |
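A construction sketch with explicit options; every value shown is an illustrative placeholder:

```python
from valiqor.eval import ValiqorEvalClient

# All values are placeholders; api_key may also come from env/config.
eval_client = ValiqorEvalClient(
    api_key="your-api-key",
    project_name="support-bot",
    base_url="https://api.valiqor.example",  # placeholder override
    timeout=120,  # seconds
)
```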
## Core Methods

### evaluate()

Run a synchronous evaluation on a dataset. Auto-polls if the backend returns an async response.

```python
def evaluate(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
```
| Parameter | Type | Default | Description |
|---|---|---|---|
| dataset | List[Dict] | — | List of items with input, output, and optionally context, expected. |
| metrics | List[str] | — | Metric keys to evaluate. See available metrics. |
| project_name | Optional[str] | None | Override project name for this run. |
| run_name | Optional[str] | None | Custom name for this evaluation run. |
| metadata | Optional[Dict] | None | Arbitrary metadata to attach to the run. |
| openai_api_key | Optional[str] | None | Override OpenAI key for this run. |
Returns: EvaluationResult

```python
result = client.eval.evaluate(
    dataset=[
        {
            "input": "What is RAG?",
            "output": "RAG stands for Retrieval-Augmented Generation.",
            "context": "RAG is a technique that augments LLMs with retrieved documents.",
        },
        {
            "input": "Explain caching",
            "output": "Caching stores data for faster access.",
            "expected": "Caching is storing data temporarily for quick retrieval.",
        },
    ],
    metrics=["answer_relevance", "factual_accuracy", "coherence"],
)

print(f"Overall: {result.overall_score}")
print(f"Scores: {result.aggregate_scores}")
```
### evaluate_trace()

Evaluate a trace dict (not a trace_id). The client parses messages and spans from the trace to build the evaluation dataset.

```python
def evaluate_trace(
    self,
    trace: Dict[str, Any],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> EvaluationResult
```
| Parameter | Type | Description |
|---|---|---|
| trace | Dict[str, Any] | A trace dict containing messages and/or spans. |
| metrics | List[str] | Metric keys to evaluate. |
```python
trace_data = client.traces.get_full_trace("trace-id-123")

result = client.eval.evaluate_trace(
    trace=trace_data.to_dict(),
    metrics=["task_adherence", "response_completeness"],
)
```
### evaluate_async()

Start an asynchronous evaluation. Always returns a JobHandle, regardless of dataset size.

```python
def evaluate_async(
    self,
    dataset: List[Dict[str, Any]],
    metrics: List[str],
    project_name: Optional[str] = None,
    run_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    openai_api_key: Optional[str] = None,
) -> JobHandle
```
Returns: JobHandle

```python
handle = client.eval.evaluate_async(
    dataset=large_dataset,
    metrics=["hallucination", "coherence"],
)

# Wait with a progress callback
handle.wait(on_progress=lambda s: print(f"{s.progress_percent}%"))

# Get the result
result = handle.result()
```
## Result Retrieval

### get_run()

Retrieve an evaluation result by run ID.

```python
def get_run(self, run_id: str) -> EvaluationResult
```

### get_run_result()

Alias for get_run(). Fetch a completed evaluation result.

```python
def get_run_result(self, run_id: str) -> EvaluationResult
```
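A retrieval sketch; the run ID is a placeholder:

```python
# "run-id-123" is a placeholder for a real run ID.
result = client.eval.get_run("run-id-123")
print(result.overall_score)
```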
### get_run_metrics()

Get per-metric scores for a run.

```python
def get_run_metrics(self, run_id: str) -> List[RunMetric]
```

```python
metrics = client.eval.get_run_metrics("run-id-123")
for m in metrics:
    print(f"{m.display_name}: {m.score}")
```
### get_run_items()

Get paginated per-item evaluation details.

```python
def get_run_items(
    self,
    run_id: str,
    page: int = 1,
    page_size: int = 20,
) -> EvalItemsPage
```
### get_item_detail()

Get full detail for a single evaluated item including per-metric scores and explanations.

```python
def get_item_detail(self, run_id: str, item_id: str) -> EvalItemDetail
```
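A paging sketch combining the two calls. The `items` attribute on EvalItemsPage and the per-item `id` field are assumptions about the return types, not fields confirmed by this reference:

```python
# "run-id-123" is a placeholder run ID.
page = client.eval.get_run_items("run-id-123", page=1, page_size=20)

# ASSUMPTION: EvalItemsPage exposes `items`, and each item has an `id`.
for item in page.items:
    detail = client.eval.get_item_detail("run-id-123", item.id)
    print(detail)
```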
## Analytics

### get_trends()

Get evaluation score trends over time for a project.

```python
def get_trends(
    self,
    project_id: str,
    period: str = "last_7_days",
    metrics: Optional[List[str]] = None,
) -> List[EvalTrendPoint]
```
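A trends sketch; the project ID is a placeholder, and since EvalTrendPoint's fields are not specified above, the example just prints each point:

```python
# "proj-123" is a placeholder project ID.
trends = client.eval.get_trends(
    project_id="proj-123",
    period="last_7_days",
    metrics=["coherence", "hallucination"],
)

# EvalTrendPoint fields are not documented here, so print points as-is.
for point in trends:
    print(point)
```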
### compare_runs()

Compare scores across multiple evaluation runs.

```python
def compare_runs(self, run_ids: List[str]) -> EvalRunComparison
```

```python
comparison = client.eval.compare_runs(["run-1", "run-2", "run-3"])
for score in comparison.overall_scores:
    print(score)
```
## Project & Metric Management

### list_projects()

List the projects available to this client.

```python
def list_projects(self) -> List[ProjectInfo]
```

### create_project()

Create a new project.

```python
def create_project(
    self,
    name: str,
    key: Optional[str] = None,
    model_name: Optional[str] = None,
) -> ProjectInfo
```
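A creation sketch; every argument value here is an illustrative placeholder:

```python
# All values are placeholders.
project = client.eval.create_project(
    name="Support Bot Evals",
    key="support-bot",    # optional project key
    model_name="gpt-4o",  # optional; placeholder model name
)
print(project)
```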
### list_metric_templates()

List all available metric templates.

```python
def list_metric_templates(self) -> List[MetricInfo]
```

### list_project_metrics()

List metrics configured for a project.

```python
def list_project_metrics(
    self,
    project_name: Optional[str] = None,
) -> List[MetricInfo]
```
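A discovery sketch; the project name is a placeholder, and since MetricInfo's fields are not specified above, the example prints the objects directly:

```python
# Browse global templates, then the metrics configured for one project.
templates = client.eval.list_metric_templates()
configured = client.eval.list_project_metrics(project_name="support-bot")

# MetricInfo fields are not documented here, so print objects as-is.
for metric in templates:
    print(metric)
```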
### add_project_metric()

Add a metric to a project’s configuration.

```python
def add_project_metric(
    self,
    metric_key: str,
    display_name: str,
    project_name: Optional[str] = None,
    template_id: Optional[str] = None,
    value_type: str = "numeric",
    default_weight: float = 1.0,
) -> MetricInfo
```
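A configuration sketch using a metric key from the tables below; the project name is a placeholder:

```python
# Attach the coherence metric to a project's configuration.
metric = client.eval.add_project_metric(
    metric_key="coherence",
    display_name="Coherence",
    project_name="support-bot",  # placeholder
    value_type="numeric",
    default_weight=1.0,
)
```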
### get_project_stats()

Get aggregate statistics for a project.

```python
def get_project_stats(
    self,
    project_name: Optional[str] = None,
) -> Dict[str, Any]
```
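The return value is a plain dict whose keys are not specified above, so a sketch simply prints it:

```python
# Keys of the returned dict are backend-defined and not documented here.
stats = client.eval.get_project_stats(project_name="support-bot")
print(stats)
```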
## Job Management

### get_job_status()

Check the status of an async evaluation job.

```python
def get_job_status(self, job_id: str) -> JobStatus
```

### cancel_job()

Cancel a running async job.

```python
def cancel_job(self, job_id: str) -> CancelResponse
```
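A manual status-check sketch as an alternative to handle.wait(). The job ID is a placeholder, and progress_percent is borrowed from the evaluate_async() progress callback; whether JobStatus exposes the same field is an assumption:

```python
# "job-id-123" is a placeholder job ID.
status = client.eval.get_job_status("job-id-123")

# ASSUMPTION: JobStatus has progress_percent like the object passed to
# the wait() progress callback; this is not confirmed above.
print(f"{status.progress_percent}%")

# Cancel the job if it is no longer needed.
client.eval.cancel_job("job-id-123")
```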
## Available Metrics

### Heuristic Metrics

| Key | Description |
|---|---|
| contains | Check if output contains expected content. |
| equals | Exact match comparison. |
| levenshtein | String similarity via Levenshtein distance. |
| regex_match | Regex pattern matching on output. |
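Heuristic metrics compare output against a reference, so dataset items should carry an expected field; that these metrics read expected is implied by their descriptions rather than stated outright. A sketch (regex_match is omitted because this reference does not say where its pattern is supplied):

```python
# Heuristic metrics score output against each item's expected value.
result = client.eval.evaluate(
    dataset=[
        {
            "input": "What is 2 + 2?",
            "output": "2 + 2 equals 4.",
            "expected": "4",
        },
    ],
    metrics=["contains", "levenshtein"],
)
print(result.aggregate_scores)
```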
### LLM-Based Metrics

| Key | Description |
|---|---|
| hallucination | Detects fabricated information not supported by context. |
| answer_relevance | Measures how relevant the answer is to the question. |
| context_precision | Evaluates if retrieved context is precise and focused. |
| context_recall | Measures if all relevant context was retrieved. |
| coherence | Evaluates logical flow and consistency. |
| fluency | Measures language quality and readability. |
| factual_accuracy | Verifies factual correctness of the output. |
| task_adherence | Checks if the output follows the task instructions. |
| response_completeness | Evaluates if the response fully addresses the query. |