Autoblocks provides a set of evaluators that can be used out of the box. These evaluators are designed to be easily integrated into your test suite and can help you get started with testing your AI-powered applications.

Each evaluator below lists the properties and methods you need to implement in order to use it in your test suite. Every evaluator must set the id property, a unique identifier for the evaluator.

All of the code snippets can be run by following the instructions in the Quick Start guide.

Logic Based

Is Equals

The IsEquals evaluator checks if the expected output equals the actual output.

Scores 1 if equal, 0 otherwise.

Name | Required | Type | Description
test_case_mapper | Yes | Callable[[BaseTestCase], str] | Map your test case to a string for comparison.
output_mapper | Yes | Callable[[OutputType], str] | Map your output to a string for comparison.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseIsEquals
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_output: str

    def hash(self) -> str:
        return md5(self.input)

class IsEquals(BaseIsEquals[TestCase, str]):
    id = "is-equals"

    def test_case_mapper(self, test_case: TestCase) -> str:
        return test_case.expected_output

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            expected_output="hello world",
        ),
        TestCase(
            input="hi world",
            expected_output="hello world",
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[IsEquals()],
)

Is Valid JSON

The IsValidJSON evaluator checks if the output is valid JSON.

Scores 1 if it is valid, 0 otherwise.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check is valid JSON.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseIsValidJSON
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class IsValidJSON(BaseIsValidJSON[TestCase, str]):
    id = "is-valid-json"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
        ),
        TestCase(
            input='{"hello": "world"}'
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[IsValidJSON()],
)

Has All Substrings

The HasAllSubstrings evaluator checks if the output contains all the expected substrings.

Scores 1 if all substrings are present, 0 otherwise.

Name | Required | Type | Description
test_case_mapper | Yes | Callable[[BaseTestCase], list[str]] | Map your test case to a list of strings to check for in the output.
output_mapper | Yes | Callable[[OutputType], str] | Map your output to a string for comparison.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseHasAllSubstrings
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_substrings: list[str]

    def hash(self) -> str:
        return md5(self.input)

class HasAllSubstrings(BaseHasAllSubstrings[TestCase, str]):
    id = "has-all-substrings"

    def test_case_mapper(self, test_case: TestCase) -> list[str]:
        return test_case.expected_substrings

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            expected_substrings=["hello", "world"],
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[HasAllSubstrings()],
)

Assertions (Rubric/Rules)

The Assertions evaluator enables you to define a set of assertions or rules that your output must satisfy.

Individual assertions can be marked as not required; if a non-required assertion is not met, the evaluator can still pass.

Name | Required | Type | Description
evaluate_assertions | Yes | Callable[[BaseTestCase, Any], Optional[List[Assertion]]] | Implement your logic to evaluate the assertions.
from typing import Optional
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAssertions
from autoblocks.testing.models import Assertion
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCaseCriterion:
    criterion: str
    required: bool


@dataclass
class TestCase(BaseTestCase):
    input: str
    assertions: Optional[list[TestCaseCriterion]] = None

    def hash(self) -> str:
        return md5(self.input)


class AssertionsEvaluator(BaseAssertions[TestCase, str]):
    id = "assertions"

    def evaluate_assertions(self, test_case: TestCase, output: str) -> list[Assertion]:
        if test_case.assertions is None:
            return []
        result = []
        for assertion in test_case.assertions:
            result.append(
                Assertion(
                    criterion=assertion.criterion,
                    passed=assertion.criterion in output,
                    required=assertion.required,
                )
            )
        return result


run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            assertions=[
                TestCaseCriterion(criterion="hello", required=True),
                TestCaseCriterion(criterion="world", required=True),
                TestCaseCriterion(criterion="hi", required=False),
            ],
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[AssertionsEvaluator()],
)
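
Assertions are not limited to substring checks; any rule you can express in Python can be reported as an Assertion. The sketch below is illustrative only: the OutputRules evaluator and its 280-character limit are hypothetical, but it uses the same Assertion model as the example above to encode one required and one non-required rule.

from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAssertions
from autoblocks.testing.models import Assertion
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class OutputRules(BaseAssertions[TestCase, str]):
    id = "output-rules"

    def evaluate_assertions(self, test_case: TestCase, output: str) -> list[Assertion]:
        return [
            # Required rule: the evaluator fails if the output is empty.
            Assertion(
                criterion="output is non-empty",
                passed=len(output.strip()) > 0,
                required=True,
            ),
            # Non-required rule: recorded for visibility, but the evaluator
            # can still pass if the output exceeds the (hypothetical) limit.
            Assertion(
                criterion="output is 280 characters or fewer",
                passed=len(output) <= 280,
                required=False,
            ),
        ]

Pass OutputRules() to run_test_suite the same way as AssertionsEvaluator above.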

LLM Judges

Custom LLM Judge

The CustomLLMJudge evaluator enables you to define custom evaluation criteria using an LLM judge.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to evaluate.
model | No | str | The OpenAI model to use. Defaults to "gpt-4o".
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseCustomLLMJudge
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class CustomLLMJudge(BaseCustomLLMJudge[TestCase, str]):
    id = "custom-llm-judge"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!"
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[CustomLLMJudge()],
)
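
The optional properties from the table above (model, num_overrides, and example_output_mapper) are set on the evaluator class. The sketch below is an assumption-laden variant, not part of the documented example: the "gpt-4o-mini" model name is an arbitrary stand-in for the default, and EvaluationOverride is assumed to be importable from autoblocks.testing.models like the other models used here.

from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseCustomLLMJudge
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import EvaluationOverride  # assumed import path
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class CustomLLMJudgeWithExamples(BaseCustomLLMJudge[TestCase, str]):
    id = "custom-llm-judge-with-examples"
    model = "gpt-4o-mini"  # hypothetical alternative to the default "gpt-4o"
    num_overrides = 5  # include up to 5 recent evaluation overrides as examples

    def output_mapper(self, output: str) -> str:
        return output

    def example_output_mapper(self, example: EvaluationOverride) -> str:
        # Placeholder: return the string representation of the overridden
        # output that you want the judge to see as an example.
        return str(example)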

Automatic Battle

The AutomaticBattle evaluator enables you to compare two outputs using an LLM judge.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to compare.
model | No | str | The OpenAI model to use. Defaults to "gpt-4o".
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAutomaticBattle
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class AutomaticBattle(BaseAutomaticBattle[TestCase, str]):
    id = "automatic-battle"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!"
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[AutomaticBattle()],
)

Manual Battle

The ManualBattle evaluator enables you to compare two outputs using human evaluation.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to compare.
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseManualBattle
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class ManualBattle(BaseManualBattle[TestCase, str]):
    id = "manual-battle"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!"
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[ManualBattle()],
)

Accuracy

The Accuracy evaluator checks if the output is accurate compared to an expected output.

Scores 1 if accurate, 0.5 if somewhat accurate, 0 if inaccurate.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for accuracy.
model | No | str | The OpenAI model to use. Defaults to "gpt-4o".
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseAccuracy
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_output: str

    def hash(self) -> str:
        return md5(self.input)

class Accuracy(BaseAccuracy[TestCase, str]):
    id = "accuracy"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="What is the capital of France?",
            expected_output="The capital of France is Paris.",
        ),
        TestCase(
            input="What is the capital of France?",
            expected_output="Paris is the capital of France.",
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[Accuracy()],
)

NSFW

The NSFW evaluator checks if the output is safe for work.

Scores 1 if safe, 0 otherwise.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for NSFW content.
model | No | str | The OpenAI model to use. Defaults to "gpt-4o".
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseNSFW
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class NSFW(BaseNSFW[TestCase, str]):
    id = "nsfw"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="Explicit content here"
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[NSFW()],
)

Toxicity

The Toxicity evaluator checks that the output is not toxic.

Scores 1 if it is not toxic, 0 otherwise.

Name | Required | Type | Description
output_mapper | Yes | Callable[[OutputType], str] | Map your output to the string that you want to check for toxicity.
model | No | str | The OpenAI model to use. Defaults to "gpt-4o".
num_overrides | No | int | Number of recent evaluation overrides to use as examples. Defaults to 0.
example_output_mapper | No | Callable[[EvaluationOverride], str] | Map an EvaluationOverride to a string representation of the output.
from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseToxicity
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str

    def hash(self) -> str:
        return md5(self.input)

class Toxicity(BaseToxicity[TestCase, str]):
    id = "toxicity"

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="Hello, how are you?",
        ),
        TestCase(
            input="I hate you!"
        )
    ],
    fn=lambda test_case: test_case.input,
    evaluators=[Toxicity()],
)

Ragas

Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. We have built wrappers around the metrics to make integration with Autoblocks seamless.

Available Ragas evaluators:

  • BaseRagasLLMContextPrecisionWithReference uses an LLM to measure the proportion of relevant chunks in the retrieved_contexts.
  • BaseRagasNonLLMContextPrecisionWithReference measures the proportion of relevant chunks in the retrieved_contexts without using an LLM.
  • BaseRagasLLMContextRecall evaluates the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth.
  • BaseRagasNonLLMContextRecall uses non-LLM string comparison metrics to identify whether a retrieved context is relevant.
  • BaseRagasContextEntitiesRecall measures the recall of the retrieved context based on the number of entities present in both ground_truths and contexts, relative to the number of entities in ground_truths alone.
  • BaseRagasNoiseSensitivity measures how often a system makes errors by providing incorrect responses when utilizing either relevant or irrelevant retrieved documents.
  • BaseRagasResponseRelevancy assesses how pertinent the generated answer is to the given prompt.
  • BaseRagasFaithfulness measures the factual consistency of the generated answer against the given context.
  • BaseRagasFactualCorrectness compares the factual accuracy of the generated response with the reference, measuring the extent to which the response aligns with it.
  • BaseRagasSemanticSimilarity measures the semantic resemblance between the generated answer and the ground truth.

The Ragas evaluators are only available in the Python SDK. You must install Ragas (pip install ragas) before using these evaluators. Our wrappers require at least version 0.2.* of Ragas.

Name | Required | Type | Description
id | Yes | str | The unique identifier for the evaluator.
threshold | No | Threshold | The threshold for the evaluation, used to determine pass/fail.
llm | No | Any | Custom LLM for the evaluation. Required for any Ragas evaluator that uses an LLM. Read more: https://docs.ragas.io/en/stable/howtos/customizations/customize_models/
embeddings | No | Any | Custom embeddings model for the evaluation. Required for any Ragas evaluator that uses embeddings. Read more: https://docs.ragas.io/en/stable/howtos/customizations/customize_models/
mode | No | str | Only applicable to the BaseRagasFactualCorrectness evaluator. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness
atomicity | No | str | Only applicable to the BaseRagasFactualCorrectness evaluator. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness
focus | No | str | Only applicable to the BaseRagasNoiseSensitivity and BaseRagasFaithfulness evaluators. Read more: https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/noise_sensitivity and https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness
user_input_mapper | No | Callable[[TestCaseType, OutputType], str] | Map your test case or output to the user input passed to Ragas.
response_mapper | No | Callable[[OutputType], str] | Map your output to the response passed to Ragas.
reference_mapper | No | Callable[[TestCaseType], str] | Map your test case to the reference passed to Ragas.
retrieved_contexts_mapper | No | Callable[[TestCaseType, OutputType], list[str]] | Map your test case and output to the retrieved contexts passed to Ragas.
reference_contexts_mapper | No | Callable[[TestCaseType], list[str]] | Map your test case to the reference contexts passed to Ragas.

Individual Ragas evaluators require different parameters. You can find sample implementations for each of the Ragas evaluators here.

from dataclasses import dataclass

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper  # type: ignore[import-untyped]
from ragas.llms import LangchainLLMWrapper  # type: ignore[import-untyped]

from autoblocks.testing.evaluators import BaseRagasResponseRelevancy
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import Threshold
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    question: str
    expected_answer: str

    def hash(self) -> str:
        return md5(self.question)
    
@dataclass
class Output:
    answer: str
    contexts: list[str]

# You can use any of the Ragas evaluators listed here:
# https://docs.autoblocks.ai/testing/offline-evaluations#out-of-box-evaluators-ragas
class ResponseRelevancy(BaseRagasResponseRelevancy[TestCase, Output]):
    id = "response-relevancy"
    threshold = Threshold(gte=1)
    llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
    embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    def user_input_mapper(self, test_case: TestCase, output: Output) -> str:
        return test_case.question

    def response_mapper(self, output: Output) -> str:
        return output.answer

    def retrieved_contexts_mapper(self, test_case: TestCase, output: Output) -> list[str]:
        return output.contexts

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            question="How tall is the Eiffel Tower?",
            expected_answer="300 meters"
        )
    ],
    fn=lambda test_case: Output(
        answer="300 meters",
        contexts=["The Eiffel tower stands 300 meters tall."],
    ),
    evaluators=[ResponseRelevancy()],
)
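
The wrappers that score against a reference answer use a different mapper set. Below is a minimal sketch of BaseRagasFactualCorrectness that maps the response and reference via response_mapper and reference_mapper and sets the mode and atomicity properties from the table above. The mode value "f1", the atomicity value "low", and the 0.8 threshold are assumptions for illustration; see the linked Ragas factual_correctness docs for the supported options.

from dataclasses import dataclass

from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper  # type: ignore[import-untyped]

from autoblocks.testing.evaluators import BaseRagasFactualCorrectness
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import Threshold
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    question: str
    expected_answer: str

    def hash(self) -> str:
        return md5(self.question)

@dataclass
class Output:
    answer: str
    contexts: list[str]

class FactualCorrectness(BaseRagasFactualCorrectness[TestCase, Output]):
    id = "factual-correctness"
    threshold = Threshold(gte=0.8)  # assumed threshold; tune for your suite
    llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
    mode = "f1"  # assumed Ragas scoring mode
    atomicity = "low"  # assumed atomicity setting

    def response_mapper(self, output: Output) -> str:
        return output.answer

    def reference_mapper(self, test_case: TestCase) -> str:
        return test_case.expected_answer

Pass FactualCorrectness() to run_test_suite alongside (or instead of) ResponseRelevancy above.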