import warnings
from collections.abc import Iterable
from typing import Optional
from intelligence_layer.connectors import (
SerializableDict,
StudioClient,
)
from intelligence_layer.connectors.studio.studio import (
StudioDataset,
StudioExample,
)
from intelligence_layer.core import Input
from intelligence_layer.evaluation import (
Dataset,
DatasetRepository,
Example,
ExpectedOutput,
)
class StudioDatasetRepository(DatasetRepository):
    """Dataset repository that submits datasets to Studio via a :class:`StudioClient`.

    Only :meth:`create_dataset` is implemented. All other repository methods
    raise :class:`NotImplementedError`.
    """

    def __init__(self, studio_client: StudioClient) -> None:
        """Initializes the StudioDatasetRepository.

        A warning is emitted on construction to signal the beta status.

        Args:
            studio_client: Client to interact with the Studio API.
        """
        self.studio_client = studio_client
        # stacklevel=2 attributes the warning to the code that constructs the
        # repository instead of to this line inside the library.
        warnings.warn(
            "The StudioDatasetRepository is currently in beta and only supports create_dataset.",
            stacklevel=2,
        )

    def create_dataset(
        self,
        examples: Iterable[Example[Input, ExpectedOutput]],
        dataset_name: str,
        id: str | None = None,
        labels: set[str] | None = None,
        metadata: SerializableDict | None = None,
    ) -> Dataset:
        """Creates a dataset from given :class:`Example`s and returns the created :class:`Dataset`.

        Args:
            examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
            dataset_name: A name for the dataset.
            id: Not supported by the StudioDatasetRepository, as the ID is generated by Studio.
                Must be left as None.
            labels: A set of labels for filtering. Defaults to None, which is treated as an empty set.
            metadata: A dict for additional information about the dataset.
                Defaults to None, which is treated as an empty dict.

        Returns:
            The created :class:`Dataset`, whose ID is the one returned by the Studio client.

        Raises:
            NotImplementedError: If `id` is not None.
        """
        if id is not None:
            raise NotImplementedError(
                "Custom dataset IDs are not supported by the StudioDatasetRepository"
            )

        created_dataset = Dataset(
            name=dataset_name,
            labels=labels or set(),
            metadata=metadata or dict(),
        )
        studio_dataset = self.map_to_studio_dataset(created_dataset)
        studio_examples = self.map_to_many_studio_example(examples)

        studio_dataset_id = self.studio_client.submit_dataset(
            dataset=studio_dataset, examples=studio_examples
        )

        # The ID assigned on submission replaces the locally created one.
        created_dataset.id = studio_dataset_id
        return created_dataset

    def delete_dataset(self, dataset_id: str) -> None:
        """Deletes a dataset identified by the given dataset ID.

        Args:
            dataset_id: Dataset ID of the dataset to delete.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def dataset(self, dataset_id: str) -> Optional[Dataset]:
        """Returns a dataset identified by the given dataset ID.

        Args:
            dataset_id: Dataset ID of the dataset to retrieve.

        Returns:
            :class:`Dataset` if it was found, `None` otherwise.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def datasets(self) -> Iterable[Dataset]:
        """Returns all :class:`Dataset`s. Sorting is not guaranteed.

        Returns:
            :class:`Iterable` of :class:`Dataset`s.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def dataset_ids(self) -> Iterable[str]:
        """Returns all sorted dataset IDs.

        Returns:
            :class:`Iterable` of dataset IDs.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def example(
        self,
        dataset_id: str,
        example_id: str,
        input_type: type[Input],
        expected_output_type: type[ExpectedOutput],
    ) -> Optional[Example[Input, ExpectedOutput]]:
        """Returns an :class:`Example` for the given dataset ID and example ID.

        Args:
            dataset_id: Dataset ID of the linked dataset.
            example_id: ID of the example to retrieve.
            input_type: Input type of the example.
            expected_output_type: Expected output type of the example.

        Returns:
            :class:`Example` if it was found, `None` otherwise.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def examples(
        self,
        dataset_id: str,
        input_type: type[Input],
        expected_output_type: type[ExpectedOutput],
        examples_to_skip: Optional[frozenset[str]] = None,
    ) -> Iterable[Example[Input, ExpectedOutput]]:
        """Returns all :class:`Example`s for the given dataset ID sorted by their ID.

        Args:
            dataset_id: Dataset ID whose examples should be retrieved.
            input_type: Input type of the example.
            expected_output_type: Expected output type of the example.
            examples_to_skip: Optional set of example IDs. Those examples will be
                excluded from the output. Defaults to None.

        Returns:
            :class:`Iterable` of :class:`Example`s.

        Raises:
            NotImplementedError: Always, as this operation is not implemented.
        """
        raise NotImplementedError()

    def map_to_studio_example(
        self, example_to_map: Example[Input, ExpectedOutput]
    ) -> StudioExample[Input, ExpectedOutput]:
        """Builds a :class:`StudioExample` from the dumped fields of an :class:`Example`.

        Args:
            example_to_map: The example to convert.

        Returns:
            A :class:`StudioExample` constructed from `example_to_map.model_dump()`.
        """
        return StudioExample(**example_to_map.model_dump())

    def map_to_many_studio_example(
        self, examples_to_map: Iterable[Example[Input, ExpectedOutput]]
    ) -> Iterable[StudioExample[Input, ExpectedOutput]]:
        """Converts several :class:`Example`s into :class:`StudioExample`s.

        Args:
            examples_to_map: The examples to convert.

        Returns:
            A generator yielding one :class:`StudioExample` per input example.
            Conversion happens lazily, as the result is iterated.
        """
        return (self.map_to_studio_example(example) for example in examples_to_map)

    def map_to_studio_dataset(self, dataset_to_map: Dataset) -> StudioDataset:
        """Builds a :class:`StudioDataset` from the dumped fields of a :class:`Dataset`.

        Args:
            dataset_to_map: The dataset to convert.

        Returns:
            A :class:`StudioDataset` constructed from `dataset_to_map.model_dump()`.
        """
        return StudioDataset(**dataset_to_map.model_dump())