from collections.abc import Iterable, Sequence
from typing import Any, Optional, TypeVar, cast
from pydantic import TypeAdapter
from intelligence_layer.connectors.base.json_serializable import (
SerializableDict,
)
from intelligence_layer.core import Input, PydanticSerializable
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import (
Dataset,
Example,
ExpectedOutput,
)
class InMemoryDatasetRepository(DatasetRepository):
    """A :class:`DatasetRepository` that keeps all datasets and examples in memory.

    Nothing is persisted: contents are lost when the process exits, so this
    repository is best suited for tests and quick experiments.
    """
def __init__(self) -> None:
self._datasets_and_examples: dict[
str,
tuple[
Dataset, Sequence[Example[PydanticSerializable, PydanticSerializable]]
],
] = {}
def create_dataset(
self,
examples: Iterable[Example[Input, ExpectedOutput]],
dataset_name: str,
id: str | None = None,
labels: set[str] | None = None,
metadata: SerializableDict | None = None,
    ) -> Dataset:
        """Creates a :class:`Dataset` from the given examples and stores it in memory.

        If ``id`` is given it becomes the dataset's ID; otherwise a random ID is
        assigned. Raises a :class:`ValueError` if the resulting ID already exists.
        """
if metadata is None:
metadata = dict()
if labels is None:
labels = set()
dataset = Dataset(name=dataset_name, labels=labels, metadata=metadata)
if id is not None:
dataset.id = id
if dataset.id in self._datasets_and_examples:
            if id is not None:
raise ValueError(
f"Cannot create dataset - dataset with given ID '{dataset.id}' already exists."
)
else:
raise ValueError(
f"Newly assigned random dataset ID {dataset.id} already exists. This should not happen."
)
        # Erase the concrete generic parameters for homogeneous storage; `examples`
        # re-validates them against caller-supplied types on read.
        examples_casted = cast(
            Sequence[Example[PydanticSerializable, PydanticSerializable]],
            list(examples),
        )
self._datasets_and_examples[dataset.id] = (dataset, examples_casted)
return dataset
    def delete_dataset(self, dataset_id: str) -> None:
        """Deletes the dataset with the given ID; unknown IDs are silently ignored."""
        self._datasets_and_examples.pop(dataset_id, None)
    def dataset(self, dataset_id: str) -> Optional[Dataset]:
        """Returns the :class:`Dataset` with the given ID, or ``None`` if it is unknown."""
        entry = self._datasets_and_examples.get(dataset_id)
        return entry[0] if entry is not None else None
    def dataset_ids(self) -> Iterable[str]:
        """Returns the stored dataset IDs in sorted order."""
        return sorted(self._datasets_and_examples.keys())
    T = TypeVar("T")

    @staticmethod
    def _convert_to_type(data: Any, desired_type: type[T]) -> T:
        # Stored values may be plain dicts or strings rather than instances of the
        # requested type; re-validate them with pydantic unless the type already
        # matches exactly.
        if type(data) is desired_type:
            return data
        return TypeAdapter(desired_type).validate_python(data)
def example(
self,
dataset_id: str,
example_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
    ) -> Optional[Example[Input, ExpectedOutput]]:
        """Returns the example with the given ID, or ``None`` if it is not in the dataset.

        Raises a :class:`ValueError` if the dataset itself does not exist.
        """
examples = self.examples(dataset_id, input_type, expected_output_type)
filtered = (e for e in examples if e.id == example_id)
return next(filtered, None)
def examples(
self,
dataset_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
examples_to_skip: Optional[frozenset[str]] = None,
    ) -> Iterable[Example[Input, ExpectedOutput]]:
        """Returns the dataset's examples, re-validated against the given types and
        sorted by example ID; IDs listed in ``examples_to_skip`` are omitted.

        Raises a :class:`ValueError` if the dataset does not exist.
        """
examples_to_skip = examples_to_skip or frozenset()
if dataset_id not in self._datasets_and_examples:
raise ValueError(
f"Repository does not contain a dataset with id: {dataset_id}"
)
examples: list[Example[Input, ExpectedOutput]] = []
for example in self._datasets_and_examples[dataset_id][1]:
if example.id in examples_to_skip:
continue
converted_input = self._convert_to_type(example.input, input_type)
converted_expected_output = self._convert_to_type(
example.expected_output, expected_output_type
)
examples.append(
                # Parameterizing Example with runtime types is too dynamic for the
                # type checker to follow, hence the ignore.
                Example[input_type, expected_output_type](  # type: ignore
id=example.id,
input=converted_input,
expected_output=converted_expected_output,
metadata=example.metadata,
)
)
return sorted(
examples,
key=lambda example: example.id,
)
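
# Usage sketch (illustrative only; the dataset name and example contents below
# are made up): create a dataset, then read its examples back with `str` as
# both the input and expected-output type.
if __name__ == "__main__":
    repository = InMemoryDatasetRepository()
    dataset = repository.create_dataset(
        examples=[
            Example(input="What is 2 + 2?", expected_output="4"),
            Example(input="Name a prime number.", expected_output="2"),
        ],
        dataset_name="demo-dataset",
        labels={"demo"},
    )
    # `examples` returns the stored examples sorted by ID.
    for example in repository.examples(dataset.id, str, str):
        print(example.id, example.input, "->", example.expected_output)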