Formatting your test cases and outputs

The schemas of your test cases and outputs as they exist in your codebase often contain implementation details that are not relevant to a human reviewer. Each SDK provides methods for transforming your test cases and outputs into human-readable formats; in the Python SDK, you implement serialize_for_human_review on your test case and output classes:

from dataclasses import dataclass
from uuid import UUID
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import HumanReviewField
from autoblocks.testing.models import HumanReviewFieldContentType
from autoblocks.testing.util import md5

@dataclass
class Document:
    uuid: UUID  # Not relevant for human review, so we don't include it below
    title: str
    content: str

@dataclass
class MyCustomTestCase(BaseTestCase):
    user_question: str
    documents: list[Document]

    def hash(self) -> str:
        return md5(self.user_question)

    def serialize_for_human_review(self) -> list[HumanReviewField]:
        return [
            HumanReviewField(
                name="Question",
                value=self.user_question,
                content_type=HumanReviewFieldContentType.TEXT,
            ),
        ] + [
            HumanReviewField(
                name=f"Document {i + 1}: {doc.title}",
                value=doc.content,
                content_type=HumanReviewFieldContentType.TEXT,
            )
            for i, doc in enumerate(self.documents)
        ]

@dataclass
class MyCustomOutput:
    answer: str
    reason: str

    # These fields are implementation details not needed
    # for human review, so they will be omitted below
    x: int
    y: int
    z: int

    def serialize_for_human_review(self) -> list[HumanReviewField]:
        return [
            HumanReviewField(
                name="Answer",
                value=self.answer,
                content_type=HumanReviewFieldContentType.TEXT,
            ),
            HumanReviewField(
                name="Reason",
                value=self.reason,
                content_type=HumanReviewFieldContentType.TEXT,
            ),
        ]

There are four content types you can use to control how each field is rendered in the Autoblocks UI (a short example follows the list):

  • TEXT
  • HTML
  • MARKDOWN
  • LINK
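
For example, assuming a hypothetical output that carries a Markdown-formatted summary and a source URL (summary_markdown and source_url are illustrative field names, not part of the SDK), its serializer might mix content types like this:

# A minimal sketch mixing content types on an output class;
# `summary_markdown` and `source_url` are hypothetical fields.
def serialize_for_human_review(self) -> list[HumanReviewField]:
    return [
        HumanReviewField(
            name="Summary",
            value=self.summary_markdown,  # Rendered as Markdown in the UI
            content_type=HumanReviewFieldContentType.MARKDOWN,
        ),
        HumanReviewField(
            name="Source",
            value=self.source_url,  # Rendered as a clickable link
            content_type=HumanReviewFieldContentType.LINK,
        ),
    ]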

Serializing your test cases and outputs this way is often a good starting point when setting up a test suite for the first time: developers can run the suite without any code-based evaluators and review the results manually to understand the responses the LLM is generating.
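
A minimal sketch of such an evaluator-free run, reusing MyCustomTestCase from above (call_llm is a hypothetical stand-in for your LLM call):

from autoblocks.testing.run import run_test_suite

run_test_suite(
    id="my-test-suite",
    test_cases=[...],  # Replace with your MyCustomTestCase instances
    fn=lambda test_case: call_llm(test_case.user_question),  # Hypothetical LLM call
    evaluators=[],  # No code-based evaluators; review the results by hand instead
)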

Creating a human review job programmatically

Whether you are on the free plan or a paid plan, you can create human review jobs directly in code, using either run_test_suite / runTestSuite or the RunManager.

run_test_suite / runTestSuite

from dataclasses import dataclass

from autoblocks.testing.evaluators import BaseHasAllSubstrings
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import CreateHumanReviewJob
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.util import md5

@dataclass
class TestCase(BaseTestCase):
    input: str
    expected_substrings: list[str]

    def hash(self) -> str:
        return md5(self.input) # Unique identifier for a test case

class HasAllSubstrings(BaseHasAllSubstrings[TestCase, str]):
    id = "has-all-substrings"

    def test_case_mapper(self, test_case: TestCase) -> list[str]:
        return test_case.expected_substrings

    def output_mapper(self, output: str) -> str:
        return output

run_test_suite(
    id="my-test-suite",
    test_cases=[
        TestCase(
            input="hello world",
            expected_substrings=["hello", "world"],
        )
    ], # Replace with your test cases
    fn=lambda test_case: test_case.input, # Replace with your LLM call
    evaluators=[HasAllSubstrings()], # Replace with your evaluators
    human_review_job=CreateHumanReviewJob(
        assignee_email_address="example@example.com",
        name="Review for accuracy",
    )
)

RunManager

from dataclasses import dataclass

from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import HumanReviewField
from autoblocks.testing.models import HumanReviewFieldContentType
from autoblocks.testing.run import RunManager
from autoblocks.testing.util import md5


# Update with your test case type
@dataclass
class TestCase(BaseTestCase):
    input: str

    def serialize_for_human_review(self) -> list[HumanReviewField]:
        return [
            HumanReviewField(
                name="Input",
                value=self.input,
                content_type=HumanReviewFieldContentType.TEXT,
            ),
        ]

    def hash(self) -> str:
        return md5(self.input)


# Update with your output type
@dataclass
class Output:
    output: str

    def serialize_for_human_review(self) -> list[HumanReviewField]:
        return [
            HumanReviewField(
                name="Output",
                value=self.output,
                content_type=HumanReviewFieldContentType.TEXT,
            ),
        ]


run = RunManager[TestCase, Output](
    test_id="test-id",
)

run.start()
# Add results from your test suite here
run.add_result(
    test_case=TestCase(input="Hello, world!"),
    output=Output(output="Hi, world!"),
)
run.end()

run.create_human_review_job(
    assignee_email_address="${emailAddress}",
    name="Review for accuracy",
)

Using the results

You can use the results of a human review job for a variety of purposes, such as:

  • Fine-tuning an evaluation model
  • Few-shot examples in your LLM judges (see the sketch after this list)
  • Improving your core product based on expert feedback
  • and more!
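
For instance, once a job is complete, graded examples can be folded into an LLM judge's prompt as few-shot demonstrations. A minimal sketch, assuming you have already exported the review results into simple records (the reviewed structure below is illustrative, not an Autoblocks API):

# Hypothetical few-shot prompt assembly from exported human review results.
# The `reviewed` records are illustrative; your export format will vary.
reviewed = [
    {"input": "hello world", "output": "Hi, world!", "verdict": "pass"},
    {"input": "hello world", "output": "Goodbye.", "verdict": "fail"},
]

few_shot_block = "\n\n".join(
    f"Input: {r['input']}\nOutput: {r['output']}\nVerdict: {r['verdict']}"
    for r in reviewed
)

judge_prompt = (
    "You are grading LLM outputs for accuracy.\n"
    "Here are examples graded by human reviewers:\n\n"
    f"{few_shot_block}\n\n"
    "Grade the next output the same way, answering pass or fail."
)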