Source code for intelligence_layer.core.detect_language

from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from typing import ClassVar, Optional, TypeVar

from lingua import ConfidenceValue, IsoCode639_1, LanguageDetectorBuilder
from lingua import Language as LinguaLanguage
from pycountry import languages
from pydantic import BaseModel

from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer.tracer import TaskSpan


class LanguageNotSupportedError(ValueError):
    """Raised in case language in the input is not compatible with the languages supported in the task."""


Config = TypeVar("Config")


[docs] @dataclass(frozen=True) class Language: """A language identified by its `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_.""" iso_639_1: str def get_name(self) -> Optional[str]: language = languages.get(alpha_2=self.iso_639_1) return language.name if language else None def language_config(self, configs: Mapping["Language", Config]) -> Config: config = configs.get(self) if config is None: raise LanguageNotSupportedError( f"{self.iso_639_1} not in ({', '.join(lang.iso_639_1 for lang in configs)})" ) return config def to_lingua_language(self) -> LinguaLanguage: iso_code = getattr(IsoCode639_1, self.iso_639_1.upper()) language = LinguaLanguage.from_iso_code_639_1(iso_code) return language
[docs] class DetectLanguageInput(BaseModel): """The input for a `DetectLanguage` task. Attributes: text: The text to identify the language for. possible_languages: All languages that should be considered during detection. Languages should be provided with their ISO 639-1 codes. """ text: str possible_languages: Sequence[Language]
[docs] class DetectLanguageOutput(BaseModel): """The output of a `DetectLanguage` task. Attributes: best_fit: The prediction for the best matching language. Will be `None` if no language has a probability above the threshold. """ best_fit: Optional[Language]
class AnnotatedLanguage(BaseModel): lang: Language prob: float
[docs] class DetectLanguage(Task[DetectLanguageInput, DetectLanguageOutput]): """Task that detects the language of a text. Analyzes the likelihood that a given text is written in one of the `possible_languages`. Returns the best match or `None`. Args: threshold: Minimum probability value for a language to be considered the `best_fit`. Example: >>> from intelligence_layer.core import ( ... DetectLanguage, ... DetectLanguageInput, ... InMemoryTracer, ... Language, ... ) >>> task = DetectLanguage() >>> input = DetectLanguageInput( ... text="This is an English text.", ... possible_languages=[Language(l) for l in ("en", "fr")], ... ) >>> output = task.run(input, InMemoryTracer()) """ AVAILABLE_LANGUAGES: ClassVar[list[LinguaLanguage]] = [ LinguaLanguage.CATALAN, LinguaLanguage.ENGLISH, LinguaLanguage.FRENCH, LinguaLanguage.GERMAN, LinguaLanguage.ITALIAN, LinguaLanguage.POLISH, LinguaLanguage.SPANISH, ] def __init__(self, threshold: float = 0.5): super().__init__() self._threshold = threshold self._detector = LanguageDetectorBuilder.from_languages( *self.AVAILABLE_LANGUAGES ).build()
[docs] def do_run( self, input: DetectLanguageInput, task_span: TaskSpan ) -> DetectLanguageOutput: annotated_languages = self._detect_languages(input, task_span) best_fit = self._get_best_fit(annotated_languages, input.possible_languages) return DetectLanguageOutput(best_fit=best_fit if best_fit is not None else None)
def _detect_languages( self, input: DetectLanguageInput, task_span: TaskSpan ) -> Sequence[AnnotatedLanguage]: determined_languages = self._detector.compute_language_confidence_values( input.text ) annotated_languages = [ AnnotatedLanguage( lang=Language(iso_639_1=self._to_iso_639_1_code(lang)), prob=lang.value ) for lang in determined_languages ] task_span.log("Raw language probabilities", annotated_languages) return annotated_languages def _to_iso_639_1_code(self, lingua_with_confidence: ConfidenceValue) -> str: return str(lingua_with_confidence.language.iso_code_639_1.name).lower() def _get_best_fit( self, languages_result: Sequence[AnnotatedLanguage], possible_languages: Sequence[Language], ) -> Optional[Language]: return ( languages_result[0].lang if ( languages_result[0].prob >= self._threshold and languages_result[0].lang in possible_languages ) else None )