Source code for intelligence_layer.connectors.retrievers.document_index_retriever

from collections.abc import Sequence

from intelligence_layer.connectors.document_index.document_index import (
    CollectionPath,
    DocumentIndexClient,
    DocumentPath,
    DocumentTextPosition,
    SearchQuery,
)
from intelligence_layer.connectors.retrievers.base_retriever import (
    BaseRetriever,
    Document,
    DocumentChunk,
    SearchResult,
)


class DocumentIndexRetriever(BaseRetriever[DocumentPath]):
    """Search through documents within collections in the `DocumentIndexClient`.

    We initialize this Retriever with a namespace and a collection name, and we can then
    find the documents in the collection that are most semantically similar to our query.

    Args:
        document_index: Client offering functionality for search.
        index_name: The name of the index to be used.
        namespace: The namespace within the `DocumentIndexClient` where all collections are stored.
        collection: The collection within the namespace that holds the desired documents.
        k: The (top) number of documents to be returned by search.
        threshold: The minimum value of cosine similarity between the query vector and the document vector.

    Example:
        >>> import os
        >>> from intelligence_layer.connectors import DocumentIndexClient, DocumentIndexRetriever
        >>> document_index = DocumentIndexClient(os.getenv("AA_TOKEN"))
        >>> retriever = DocumentIndexRetriever(document_index, "asymmetric", "aleph-alpha", "wikipedia-de", 3)
        >>> documents = retriever.get_relevant_documents_with_scores("Who invented the airplane?")
    """

    def __init__(
        self,
        document_index: DocumentIndexClient,
        index_name: str,
        namespace: str,
        collection: str,
        k: int,
        threshold: float = 0.5,
    ) -> None:
        self._document_index = document_index
        self._index_name = index_name
        self._collection_path = CollectionPath(
            namespace=namespace, collection=collection
        )
        self._k = k
        self._threshold = threshold

    def _get_absolute_position(
        self, id: DocumentPath, document_text_position: DocumentTextPosition
    ) -> dict[str, int]:
        # Positions returned by the Document Index are relative to a single content item;
        # add the combined length of all preceding items to obtain document-wide offsets.
        doc = self._document_index.document(id)
        previous_item_length = sum(
            len(text) for text in doc.contents[0 : document_text_position.item]
        )

        start = previous_item_length + document_text_position.start_position
        end = previous_item_length + document_text_position.end_position

        return {"start": start, "end": end}

    def get_relevant_documents_with_scores(
        self, query: str
    ) -> Sequence[SearchResult[DocumentPath]]:
        search_query = SearchQuery(
            query=query, max_results=self._k, min_score=self._threshold
        )
        response = self._document_index.search(
            self._collection_path, self._index_name, search_query
        )
        relevant_chunks = [
            SearchResult(
                id=result.document_path,
                score=result.score,
                document_chunk=DocumentChunk(
                    text=result.section,
                    **self._get_absolute_position(
                        id=result.document_path,
                        document_text_position=result.chunk_position,
                    ),
                ),
            )
            for result in response
        ]
        return relevant_chunks

    def get_full_document(self, id: DocumentPath) -> Document:
        contents = self._document_index.document(id)
        return Document(text="\n".join(contents.contents), metadata=contents.metadata)
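
A rough usage sketch, complementing the docstring example above: the token environment variable, namespace, collection, and index name below are illustrative and depend on your Document Index setup. It shows how the `DocumentPath` id carried by each `SearchResult` can be passed to `get_full_document` to fetch the complete document text.

import os

from intelligence_layer.connectors import DocumentIndexClient, DocumentIndexRetriever

# Illustrative configuration; replace with your own namespace, collection and index.
document_index = DocumentIndexClient(os.getenv("AA_TOKEN"))
retriever = DocumentIndexRetriever(
    document_index,
    index_name="asymmetric",
    namespace="aleph-alpha",
    collection="wikipedia-de",
    k=3,
    threshold=0.5,
)

results = retriever.get_relevant_documents_with_scores("Who invented the airplane?")
for result in results:
    # Each result carries the document path, a similarity score and the matched chunk.
    print(result.id, result.score, result.document_chunk.text[:80])

if results:
    # Fetch the full text of the best-scoring document via its DocumentPath id.
    full_document = retriever.get_full_document(results[0].id)
    print(full_document.text[:200])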