Autoblocks provides a set of evaluators that can be used out of the box. These evaluators are designed to be easily integrated into your test suite and can help you get started with testing your AI-powered applications.
Each evaluator below lists the custom properties and methods that need to be implemented to use the evaluator in your test suite.
You must set the id property, which is a unique identifier for the evaluator.
All of the code snippets can be run by following the instructions in the Quick Start guide.
Logic Based
Is Equals
The IsEquals evaluator checks if the expected output equals the actual output.
Scores 1 if equal, 0 otherwise.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| test_case_mapper | Yes | Callable[[BaseTestCase], str] | Map your test case to a string for comparison. |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to a string for comparison. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseIsEquals
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_output: str

    def hash(self) -> str:
        return md5(self.input)


class IsEquals(BaseIsEquals[TestCase, str]):
    id = "is-equals"

    def test_case_mapper(self, test_case: TestCase) -> str:
        return test_case.expected_output

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            expected_output="hello world",
        ),
        TestCase(
            input="hi world",
            expected_output="hello world",
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[IsEquals()],
)
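The two mappers are also a convenient place to normalize values before they are compared. A minimal sketch, reusing the TestCase defined above, that ignores case and surrounding whitespace (the evaluator id is illustrative):

class IsEqualsIgnoreCase(BaseIsEquals[TestCase, str]):
    id = "is-equals-ignore-case"

    def test_case_mapper(self, test_case: TestCase) -> str:
        # Normalize the expected value before comparison.
        return test_case.expected_output.strip().lower()

    def output_mapper(self, output: str) -> str:
        # Apply the same normalization to the actual output.
        return output.strip().lower()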
Is Valid JSON
The IsValidJSON evaluator checks if the output is valid JSON.
Scores 1 if it is valid, 0 otherwise.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check is valid JSON. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseIsValidJSON
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)


class IsValidJSON(BaseIsValidJSON[TestCase, str]):
    id = "is-valid-json"

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
        ),
        TestCase(
            input='{"hello": "world"}',
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[IsValidJSON()],
)
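If your test function returns a structured object rather than a raw string, output_mapper is where you select the string to validate. A minimal sketch that builds on the snippet above; the LLMOutput dataclass is hypothetical:

@dataclass
class LLMOutput:
    raw_response: str  # hypothetical: the text the model returned, expected to be JSON


class ResponseIsValidJSON(BaseIsValidJSON[TestCase, LLMOutput]):
    id = "response-is-valid-json"

    def output_mapper(self, output: LLMOutput) -> str:
        # Validate only the raw model response, not the whole output object.
        return output.raw_response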
Has All Substrings
The HasAllSubstrings evaluator checks if the output contains all the expected substrings.
Scores 1 if all substrings are present, 0 otherwise.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| test_case_mapper | Yes | Callable[[BaseTestCase], list[str]] | Map your test case to a list of strings to check for in the output. |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to a string for comparison. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseHasAllSubstrings
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_substrings: list[str]

    def hash(self) -> str:
        return md5(self.input)


class HasAllSubstrings(BaseHasAllSubstrings[TestCase, str]):
    id = "has-all-substrings"

    def test_case_mapper(self, test_case: TestCase) -> list[str]:
        return test_case.expected_substrings

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            expected_substrings=["hello", "world"],
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[HasAllSubstrings()],
)
Assertions (Rubric/Rules)
The Assertions evaluator enables you to define a set of assertions or rules that your output must satisfy.
Individual assertions can be marked as not required; if a non-required assertion fails, the evaluation can still pass.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| evaluate_assertions | Yes | Callable[[BaseTestCase, Any], Optional[List[Assertion]]] | Implement your logic to evaluate the assertions. |
from typing import Optional
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAssertions
from autoblocks.testing.models import Assertion
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCaseCriterion:
    criterion: str
    required: bool


@dataclass
class TestCase(BaseTestCase):
    input: str
    assertions: Optional[list[TestCaseCriterion]] = None

    def hash(self) -> str:
        return md5(self.input)


class AssertionsEvaluator(BaseAssertions[TestCase, str]):
    id = "assertions"

    def evaluate_assertions(self, test_case: TestCase, output: str) -> list[Assertion]:
        if test_case.assertions is None:
            return []
        result = []
        for assertion in test_case.assertions:
            result.append(
                Assertion(
                    criterion=assertion.criterion,
                    passed=assertion.criterion in output,
                    required=assertion.required,
                )
            )
        return result


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            assertions=[
                TestCaseCriterion(criterion="hello", required=True),
                TestCaseCriterion(criterion="world", required=True),
                TestCaseCriterion(criterion="hi", required=False),
            ],
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[AssertionsEvaluator()],
)
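Because evaluate_assertions is plain Python, criteria are not limited to substring checks; any rule you can express in code can be reported as an Assertion. A minimal sketch that reuses the TestCase above (the length limit and criteria are illustrative):

class OutputRules(BaseAssertions[TestCase, str]):
    id = "output-rules"

    def evaluate_assertions(self, test_case: TestCase, output: str) -> list[Assertion]:
        return [
            # Hard requirement: a failure here fails the evaluation.
            Assertion(
                criterion="response is at most 200 characters",
                passed=len(output) <= 200,
                required=True,
            ),
            # Soft requirement: a failure here does not fail the evaluation.
            Assertion(
                criterion="response repeats the input verbatim",
                passed=test_case.input in output,
                required=False,
            ),
        ]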
LLM Judges
Custom LLM Judge
The CustomLLMJudge evaluator enables you to define custom evaluation criteria using an LLM judge.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to evaluate. |
| model | No | str | The OpenAI model to use. Defaults to "gpt-4o". |
| num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0. |
| example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseCustomLLMJudge
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)


class CustomLLMJudge(BaseCustomLLMJudge[TestCase, str]):
    id = "custom-llm-judge"

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!",
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[CustomLLMJudge()],
)
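The optional properties from the table can be set as class attributes alongside id, in the same way the required ones are. A minimal sketch, assuming you want a different OpenAI model and a few recent evaluation overrides included as examples (the values are illustrative):

class CustomLLMJudgeWithOverrides(BaseCustomLLMJudge[TestCase, str]):
    id = "custom-llm-judge-with-overrides"
    model = "gpt-4o-mini"  # illustrative; defaults to "gpt-4o"
    num_overrides = 3  # include up to three recent evaluation overrides as examples

    def output_mapper(self, output: str) -> str:
        return output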
Accuracy
The Accuracy evaluator checks if the output is accurate compared to an expected output.
Scores 1 if accurate, 0.5 if somewhat accurate, 0 if inaccurate.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for accuracy. |
| model | No | str | The OpenAI model to use. Defaults to "gpt-4o". |
| num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0. |
| example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAccuracy
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_output: str

    def hash(self) -> str:
        return md5(self.input)


class Accuracy(BaseAccuracy[TestCase, str]):
    id = "accuracy"

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="What is the capital of France?",
            expected_output="The capital of France is Paris.",
        ),
        TestCase(
            input="What is the capital of France?",
            expected_output="Paris is the capital of France.",
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[Accuracy()],
)
NSFW
The NSFW evaluator checks if the output is safe for work.
Scores 1 if safe, 0 otherwise.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for NSFW content. |
| model | No | str | The OpenAI model to use. Defaults to "gpt-4o". |
| num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0. |
| example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseNSFW
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)


class NSFW(BaseNSFW[TestCase, str]):
    id = "nsfw"

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="Explicit content here",
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[NSFW()],
)
Toxicity
The Toxicity evaluator checks if the output is not toxic.
Scores 1 if it is not toxic, 0 otherwise.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for toxicity. |
| model | No | str | The OpenAI model to use. Defaults to "gpt-4o". |
| num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0. |
| example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output. |
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseToxicity
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)


class Toxicity(BaseToxicity[TestCase, str]):
    id = "toxicity"

    def output_mapper(self, output: str) -> str:
        return output


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!",
        ),
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[Toxicity()],
)
Ragas
Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines.
We have built wrappers around the metrics to make integration with Autoblocks seamless.
Available Ragas evaluators:
- BaseRagasLLMContextPrecisionWithReference uses an LLM to measure the proportion of relevant chunks in the retrieved_contexts.
- BaseRagasNonLLMContextPrecisionWithReference measures the proportion of relevant chunks in the retrieved_contexts without using an LLM.
- BaseRagasLLMContextRecall evaluates the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth.
- BaseRagasNonLLMContextRecall uses non-LLM string comparison metrics to determine whether a retrieved context is relevant.
- BaseRagasContextEntitiesRecall measures recall of the retrieved context based on the number of entities present in both ground_truths and contexts, relative to the number of entities present in ground_truths alone.
- BaseRagasNoiseSensitivity measures how often a system makes errors by providing incorrect responses when using either relevant or irrelevant retrieved documents.
- BaseRagasResponseRelevancy assesses how pertinent the generated answer is to the given prompt.
- BaseRagasFaithfulness measures the factual consistency of the generated answer against the given context.
- BaseRagasFactualCorrectness compares the factual accuracy of the generated response with the reference, measuring the extent to which the response aligns with it.
- BaseRagasSemanticSimilarity measures the semantic resemblance between the generated answer and the ground truth.
The Ragas evaluators are only available in the Python SDK. You must install Ragas (pip install ragas) before using these evaluators.
Our wrappers require Ragas version 0.2 or later.
| Name | Required | Type | Description |
| --- | --- | --- | --- |
| id | Yes | str | The unique identifier for the evaluator. |
| threshold | No | Threshold | The threshold used to determine whether the evaluation passes or fails. |
| llm | No | Any | Custom LLM for the evaluation. Required for any Ragas evaluator that uses an LLM. Read more: https://docs.ragas.io/en/stable/howtos/customizations/customize_models/ |
| embeddings | No | Any | Custom embeddings model for the evaluation. Required for any Ragas evaluator that uses embeddings. Read more: https://docs.ragas.io/en/stable/howtos/customizations/customize_models/ |
| mode | No | str | Only applicable to the BaseRagasFactualCorrectness evaluator. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness |
| atomicity | No | str | Only applicable to the BaseRagasFactualCorrectness evaluator. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness |
| focus | No | str | Only applicable to the BaseRagasNoiseSensitivity and BaseRagasFaithfulness evaluators. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/noise_sensitivity and https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness |
| user_input_mapper | No | Callable[[TestCaseType, OutputType], str] | Map your test case or output to the user input passed to Ragas. |
| response_mapper | No | Callable[[OutputType], str] | Map your output to the response passed to Ragas. |
| reference_mapper | No | Callable[[TestCaseType], str] | Map your test case to the reference passed to Ragas. |
| retrieved_contexts_mapper | No | Callable[[TestCaseType, OutputType], list[str]] | Map your test case and output to the retrieved contexts passed to Ragas. |
| reference_contexts_mapper | No | Callable[[TestCaseType], list[str]] | Map your test case to the reference contexts passed to Ragas. |
Individual Ragas evaluators require different parameters.
You can find sample implementations for each of the Ragas evaluators here.
from dataclasses import dataclass

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper  # type: ignore[import-untyped]
from ragas.llms import LangchainLLMWrapper  # type: ignore[import-untyped]

from autoblocks.testing.evaluators import BaseRagasResponseRelevancy
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import Threshold
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5


@dataclass
class TestCase(BaseTestCase):
    question: str
    expected_answer: str

    def hash(self) -> str:
        return md5(self.question)


@dataclass
class Output:
    answer: str
    contexts: list[str]


# You can use any of the Ragas evaluators listed here:
# https://docs.autoblocks.ai/testing/offline-evaluations#out-of-box-evaluators-ragas
class ResponseRelevancy(BaseRagasResponseRelevancy[TestCase, Output]):
    id = "response-relevancy"
    threshold = Threshold(gte=1)
    llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
    embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    def user_input_mapper(self, test_case: TestCase, output: Output) -> str:
        return test_case.question

    def response_mapper(self, output: Output) -> str:
        return output.answer

    def retrieved_contexts_mapper(self, test_case: TestCase, output: Output) -> list[str]:
        return output.contexts


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            question="How tall is the Eiffel Tower?",
            expected_answer="300 meters",
        )
    ],
    fn=lambda test_case: Output(
        answer="300 meters",
        contexts=["The Eiffel tower stands 300 meters tall."],
    ),
    evaluators=[ResponseRelevancy()],
)
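As noted above, individual Ragas evaluators require different parameters; a non-LLM metric, for example, needs neither llm nor embeddings. A sketch for BaseRagasNonLLMContextPrecisionWithReference that reuses the Output dataclass and imports from the example above, assuming the test case is extended with annotated reference contexts (the exact mapper set each wrapper expects is shown in the linked sample implementations):

from autoblocks.testing.evaluators import BaseRagasNonLLMContextPrecisionWithReference


@dataclass
class TestCaseWithReferences(BaseTestCase):
    question: str
    reference_contexts: list[str]  # hypothetical: annotated reference contexts for the question

    def hash(self) -> str:
        return md5(self.question)


class ContextPrecision(BaseRagasNonLLMContextPrecisionWithReference[TestCaseWithReferences, Output]):
    id = "context-precision"
    threshold = Threshold(gte=0.5)  # illustrative pass/fail cutoff

    def retrieved_contexts_mapper(self, test_case: TestCaseWithReferences, output: Output) -> list[str]:
        return output.contexts

    def reference_contexts_mapper(self, test_case: TestCaseWithReferences) -> list[str]:
        return test_case.reference_contexts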