from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from typing import ClassVar, Optional, TypeVar
from lingua import ConfidenceValue, IsoCode639_1, LanguageDetectorBuilder
from lingua import Language as LinguaLanguage
from pycountry import languages
from pydantic import BaseModel
from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer.tracer import TaskSpan
class LanguageNotSupportedError(ValueError):
    """Signals that a requested language is not among those a task supports.

    Derives from `ValueError`, so handlers written for `ValueError` also
    catch this error.
    """
# Generic placeholder for the value type of the per-language mapping that
# `Language.language_config` looks up; it ties the mapping's value type to
# that method's return type.
Config = TypeVar("Config")
[docs]
@dataclass(frozen=True)
class Language:
    """Immutable wrapper around an `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_.

    Instances are frozen dataclasses, so they compare by code and can be
    used as dictionary keys (see `language_config`).

    Attributes:
        iso_639_1: The two-letter ISO 639-1 code of the language.
    """

    iso_639_1: str

    def get_name(self) -> Optional[str]:
        """Look up the language name for this code via `pycountry`.

        Returns:
            The name of the matching `pycountry` entry, or `None` when the
            lookup yields no (truthy) entry.
        """
        entry = languages.get(alpha_2=self.iso_639_1)
        if not entry:
            return None
        return entry.name

    def language_config(self, configs: Mapping["Language", Config]) -> Config:
        """Select the configuration registered for this language.

        Args:
            configs: Mapping from languages to their configuration.

        Returns:
            The value stored for this language.

        Raises:
            LanguageNotSupportedError: If the mapping has no entry for this
                language, or the stored entry is `None`.
        """
        selected = configs.get(self)
        if selected is not None:
            return selected
        raise LanguageNotSupportedError(
            f"{self.iso_639_1} not in ({', '.join(lang.iso_639_1 for lang in configs)})"
        )

    def to_lingua_language(self) -> LinguaLanguage:
        """Convert this language into the matching `lingua` language.

        The upper-cased code is resolved as an attribute of `IsoCode639_1`,
        so a code without such an attribute raises `AttributeError`.
        """
        return LinguaLanguage.from_iso_code_639_1(
            getattr(IsoCode639_1, self.iso_639_1.upper())
        )
[docs]
class DetectLanguageOutput(BaseModel):
    """Result produced by a `DetectLanguage` task.

    Attributes:
        best_fit: The language judged to match the text best, or `None`
            when the task did not accept any language (for example because
            none reached the probability threshold).
    """

    best_fit: Optional[Language]
class AnnotatedLanguage(BaseModel):
    """A language paired with a probability score.

    Attributes:
        lang: The language the score refers to.
        prob: The probability assigned to `lang`.
    """

    lang: Language
    prob: float
[docs]
class DetectLanguage(Task[DetectLanguageInput, DetectLanguageOutput]):
    """Task that detects the language of a text.

    Analyzes the likelihood that a given text is written in one of the
    `possible_languages`. Returns the best match or `None`.

    Only the highest-ranked candidate reported by the detector is considered:
    it becomes the `best_fit` when its probability reaches `threshold` and it
    is one of the `possible_languages`; otherwise the result is `None`.
    The detector is always built from `AVAILABLE_LANGUAGES`, independent of
    the `possible_languages` of a particular input.

    Args:
        threshold: Minimum probability value for a language to be considered
            the `best_fit`.

    Example:
        >>> from intelligence_layer.core import (
        ...     DetectLanguage,
        ...     DetectLanguageInput,
        ...     InMemoryTracer,
        ...     Language,
        ... )

        >>> task = DetectLanguage()
        >>> input = DetectLanguageInput(
        ...     text="This is an English text.",
        ...     possible_languages=[Language(l) for l in ("en", "fr")],
        ... )
        >>> output = task.run(input, InMemoryTracer())
    """

    # Languages the underlying lingua detector is built with.
    AVAILABLE_LANGUAGES: ClassVar[list[LinguaLanguage]] = [
        LinguaLanguage.CATALAN,
        LinguaLanguage.ENGLISH,
        LinguaLanguage.FRENCH,
        LinguaLanguage.GERMAN,
        LinguaLanguage.ITALIAN,
        LinguaLanguage.POLISH,
        LinguaLanguage.SPANISH,
    ]

    def __init__(self, threshold: float = 0.5):
        super().__init__()
        self._threshold = threshold
        # Build the detector once per task instance and reuse it for every run.
        self._detector = LanguageDetectorBuilder.from_languages(
            *self.AVAILABLE_LANGUAGES
        ).build()

    def do_run(
        self, input: DetectLanguageInput, task_span: TaskSpan
    ) -> DetectLanguageOutput:
        """Detect the language of `input.text`.

        Args:
            input: The text and the languages that are acceptable results.
            task_span: Span used to log the raw detector probabilities.

        Returns:
            The output holding the best fitting language, or `None` as
            `best_fit` when no candidate qualifies.
        """
        annotated_languages = self._detect_languages(input, task_span)
        best_fit = self._get_best_fit(annotated_languages, input.possible_languages)
        return DetectLanguageOutput(best_fit=best_fit)

    def _detect_languages(
        self, input: DetectLanguageInput, task_span: TaskSpan
    ) -> Sequence[AnnotatedLanguage]:
        """Run the detector on the text and log every language's probability.

        The order of the returned sequence is the order in which the detector
        reports its confidence values.
        """
        confidence_values = self._detector.compute_language_confidence_values(
            input.text
        )
        annotated_languages = [
            AnnotatedLanguage(
                lang=Language(iso_639_1=self._to_iso_639_1_code(confidence)),
                prob=confidence.value,
            )
            for confidence in confidence_values
        ]
        task_span.log("Raw language probabilities", annotated_languages)
        return annotated_languages

    def _to_iso_639_1_code(self, lingua_with_confidence: ConfidenceValue) -> str:
        """Return the lower-cased ISO 639-1 code of a lingua confidence value."""
        return str(lingua_with_confidence.language.iso_code_639_1.name).lower()

    def _get_best_fit(
        self,
        languages_result: Sequence[AnnotatedLanguage],
        possible_languages: Sequence[Language],
    ) -> Optional[Language]:
        """Pick the best fitting language from the detector results.

        Args:
            languages_result: Detected languages with their probabilities; the
                first entry is treated as the top candidate (lingua documents
                its confidence values as sorted in descending order).
            possible_languages: Languages that are acceptable as a result.

        Returns:
            The top candidate's language if its probability is at least the
            configured threshold and it is one of `possible_languages`;
            otherwise `None`. An empty `languages_result` also yields `None`.
        """
        # Guard against an empty result instead of failing with an IndexError.
        if not languages_result:
            return None

        top_candidate = languages_result[0]
        if (
            top_candidate.prob >= self._threshold
            and top_candidate.lang in possible_languages
        ):
            return top_candidate.lang
        return None