Source code for bocoel.models.lms.huggingface.logits

from collections.abc import Sequence

import torch
from numpy.typing import NDArray

from bocoel.models.lms.interfaces import ClassifierModel

from .causal import HuggingfaceCausalLM


class HuggingfaceLogitsLM(HuggingfaceCausalLM, ClassifierModel):
    """
    Logits classification model backed by huggingface's transformers library.

    The model uses the logits of the choice tokens at the last position as the
    output. For example, if `choices = ["1", "2", "3", "4", "5"]`, the logits
    of the tokens '1' through '5' are the scores for the current batch of
    inputs.
    """
    def __init__(
        self,
        model_path: str,
        batch_size: int,
        device: str,
        choices: Sequence[str],
        add_sep_token: bool = False,
    ) -> None:
        """
        Parameters:
            model_path: The path to the model.
            batch_size: The batch size to use.
            device: The device to use.
            choices: The choices to classify.
            add_sep_token: Whether to add the sep token.
        """

        super().__init__(
            model_path=model_path,
            batch_size=batch_size,
            device=device,
            add_sep_token=add_sep_token,
        )

        self._choices = choices
        self._encoded_choices = self._encode_tokens(self._choices)
    @property
    def choices(self) -> Sequence[str]:
        return self._choices

    @torch.no_grad()
    def _classify(self, prompts: Sequence[str], /) -> NDArray:
        tokenized = self._tokenizer(prompts)
        output = self._model(**tokenized)

        # Logits has the shape [batch_size, seq_len, vocab_size].
        logits = output.logits

        # Select the logits of the encoded choice tokens at the last position.
        result = logits[:, -1, self._encoded_choices]
        return result.cpu().numpy()

    def _encode_tokens(self, tokens: Sequence[str]) -> Sequence[int]:
        result: list[int] = []
        for tok in tokens:
            # Only the first token id is kept, because classification reads the
            # logits of a single next token.
            result.append(self._tokenizer.encode(tok, add_special_tokens=False)[0])

        assert len(result) == len(tokens)

        if len(result) != len(set(result)):
            decoded = self._tokenizer.decode(self._tokenizer.encode(tokens))
            raise ValueError(
                "Each token must be converted to 1 unique id. "
                f"Got {tokens}, encoded into {decoded}."
            )

        return result
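
For illustration, a minimal usage sketch follows. The checkpoint name ("gpt2"), the prompt, and the public `classify` entry point (assumed to be provided by the `ClassifierModel` interface and to dispatch to `_classify`) are assumptions for the example, not part of this module. Note that each choice string must encode to a distinct first token id, or `_encode_tokens` raises a `ValueError`.

# Hypothetical usage sketch: "gpt2", the prompt, and the `classify` entry
# point are assumptions for illustration, not defined in this module.
from bocoel.models.lms.huggingface.logits import HuggingfaceLogitsLM

lm = HuggingfaceLogitsLM(
    model_path="gpt2",  # any causal LM checkpoint path (assumed)
    batch_size=4,
    device="cpu",
    choices=["1", "2", "3", "4", "5"],  # each must map to a unique first token id
)

prompts = ["Rate the sentiment of 'I love this movie' from 1 to 5. Answer: "]

# `classify` is assumed to come from the `ClassifierModel` interface and to
# call `_classify` internally; the result has shape
# [len(prompts), len(choices)], one logit per choice.
scores = lm.classify(prompts)
print(scores.shape)  # (1, 5)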