Source code for bocoel.corpora.embedders.huggingface

from collections.abc import Callable, Sequence
from typing import Any

from torch import Tensor

from bocoel.corpora.embedders.interfaces import Embedder


class HuggingfaceEmbedder(Embedder):
    """
    Huggingface embedder. Uses the transformers library.

    Not a traditional encoder: it runs a sequence classification model
    and uses the classifier logits as embeddings.
    """
    def __init__(
        self,
        path: str,
        device: str = "cpu",
        batch_size: int = 64,
        transform: Callable[[Any], Tensor] = lambda output: output.logits,
    ) -> None:
        """
        Initializes the Huggingface embedder.

        Parameters:
            path: The path to the model.
            device: The device to use.
            batch_size: The batch size for encoding.
            transform: The transformation function to use.

        Raises:
            ImportError: If transformers is not installed.
            ValueError: If the model does not have a `config.id2label` attribute.
        """

        # Optional dependency.
        from transformers import AutoModelForSequenceClassification, AutoTokenizer

        self._path = path
        self._model = AutoModelForSequenceClassification.from_pretrained(path)
        self._tokenizer = AutoTokenizer.from_pretrained(path)
        self._batch_size = batch_size

        self._device = device
        self._model = self._model.to(device)
        self._transform = transform

        try:
            self._dims = len(self._model.config.id2label)
        except AttributeError as e:
            raise ValueError(
                "The model must have a `config.id2label` attribute "
                "to determine the number of classes."
            ) from e
    def __repr__(self) -> str:
        return f"Huggingface({self._path}, {self.dims})"

    @property
    def batch(self) -> int:
        return self._batch_size

    @property
    def dims(self) -> int:
        return self._dims

    def _encode(self, texts: Sequence[str]) -> Tensor:
        # Tokenize the batch; truncation is required for `max_length` to take effect.
        encoded = self._tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self._tokenizer.model_max_length,
        ).to(self._device)

        output = self._model(**encoded)
        # By default the transform extracts `output.logits`, giving one vector per text.
        transformed = self._transform(output)
        return transformed
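
A minimal usage sketch, assuming the optional `transformers` dependency is installed. The checkpoint name below is only an illustrative sequence-classification model whose `config.id2label` is populated, and `_encode` is the internal batch method defined above; public batched encoding is expected to go through the `Embedder` base class.

from bocoel.corpora.embedders.huggingface import HuggingfaceEmbedder

# Illustrative checkpoint: any sequence classification model with
# `config.id2label` set works here.
embedder = HuggingfaceEmbedder(
    path="distilbert-base-uncased-finetuned-sst-2-english",
    device="cpu",
    batch_size=32,
)

print(embedder.dims)   # Number of labels, e.g. 2 for a binary sentiment classifier.
print(embedder.batch)  # Batch size used for encoding, here 32.

# Calling the internal `_encode` directly for illustration; it returns the
# classifier logits with shape (len(texts), embedder.dims).
vectors = embedder._encode(["a great movie", "a terrible movie"])
print(vectors.shape)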