Source code for bocoel.models.lms.huggingface.tokenizers
# Copyright (c) RenChu Wang - All Rights Reserved

from collections.abc import Sequence
from typing import Any

from transformers import BatchEncoding


class HuggingfaceTokenizer:
"""
A tokenizer for Huggingface models.
"""

    def __init__(self, model_path: str, device: str, add_sep_token: bool) -> None:
        """
        Parameters:
            model_path: The path to the model.
            device: The device to use.
            add_sep_token: Whether to add the sep token.

        Raises:
            ImportError: If the `transformers` library is not installed.
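
        Example:
            A minimal construction sketch; `"gpt2"` is only an
            illustrative checkpoint, not a requirement:

            >>> tokenizer = HuggingfaceTokenizer(
            ...     model_path="gpt2", device="cpu", add_sep_token=False
            ... )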
"""
# Optional dependency.
from transformers import AutoTokenizer
# Initializes the tokenizer and pad to the left for sequence generation.
self._tokenizer = AutoTokenizer.from_pretrained(
model_path, padding_side="left", truncation_side="left"
)
# Always add the pad token.
if (eos := self._tokenizer.eos_token) is not None:
self._tokenizer.pad_token = eos
else:
self._tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if add_sep_token:
if self._tokenizer.sep_token is None:
self._tokenizer.add_special_tokens({"sep_token": "[SEP]"})
self._device = device

    def to(self, device: str, /) -> "HuggingfaceTokenizer":
        """
        Move the tokenizer to the given device.

        Parameters:
            device: The device to move to.

        Returns:
            The tokenizer itself, for method chaining.
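
        Example:
            Assuming `tokenizer` from the constructor example:

            >>> tokenizer = tokenizer.to("cpu")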
"""
self._device = device
return self

    def tokenize(
        self, prompts: Sequence[str], /, max_length: int | None = None
    ) -> BatchEncoding:
        """
        Tokenize, pad, truncate, cast to device, and return the encoded results.

        Parameters:
            prompts: The prompts to tokenize.
            max_length: The maximum length to truncate to.
                If `None`, truncates to the model's maximum length.

        Returns:
            (BatchEncoding): The tokenized prompts.
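
        Example:
            An illustrative sketch, assuming `tokenizer` was constructed
            with the `"gpt2"` checkpoint as in the constructor example:

            >>> encoded = tokenizer.tokenize(["Hello world"], max_length=16)
            >>> sorted(encoded.keys())
            ['attention_mask', 'input_ids']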
"""
if not isinstance(prompts, list):
prompts = list(prompts)
inputs = self._tokenizer(
prompts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=max_length,
)
return inputs.to(self.device)

    def __call__(
        self, prompts: Sequence[str], /, max_length: int | None = None
    ) -> BatchEncoding:
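        """Alias for `tokenize`."""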
        return self.tokenize(prompts, max_length=max_length)

    def encode(
        self,
        prompts: Sequence[str],
        /,
        return_tensors: str | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """
        Encode the given prompts.

        Parameters:
            prompts: The prompts to encode.
            return_tensors: The tensor format to return, e.g. `"pt"` for
                PyTorch tensors. If `None`, a plain Python list is returned.
            add_special_tokens: Whether to add special tokens.

        Returns:
            (list[int]): The encoded token ids.
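
        Example:
            An illustrative sketch; the exact ids depend on the model's
            vocabulary:

            >>> ids = tokenizer.encode("Hello world")
            >>> isinstance(ids, list)
            True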
"""
return self._tokenizer.encode(
prompts,
return_tensors=return_tensors,
add_special_tokens=add_special_tokens,
)

    def decode(self, outputs: Any, /, skip_special_tokens: bool = True) -> str:
        """
        Decode the given outputs.

        Parameters:
            outputs: The outputs to decode.
            skip_special_tokens: Whether to skip special tokens.

        Returns:
            The decoded outputs.
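
        Example:
            A round-trip sketch, assuming the `"gpt2"` tokenizer from the
            constructor example (other vocabularies may differ):

            >>> ids = tokenizer.encode("Hello world")
            >>> tokenizer.decode(ids)
            'Hello world'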
"""
return self._tokenizer.decode(outputs, skip_special_tokens=skip_special_tokens)

    def batch_decode(
        self, outputs: Any, /, skip_special_tokens: bool = True
    ) -> list[str]:
        """
        Batch decode the given outputs.

        Parameters:
            outputs: The outputs to decode.
            skip_special_tokens: Whether to skip special tokens.

        Returns:
            The batch decoded outputs.
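
        Example:
            A sketch decoding the tensors produced by `tokenize`, again
            assuming the `"gpt2"` tokenizer from the constructor example:

            >>> encoded = tokenizer.tokenize(["Hello world"])
            >>> tokenizer.batch_decode(encoded["input_ids"])
            ['Hello world']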
"""
return self._tokenizer.batch_decode(
outputs, skip_special_tokens=skip_special_tokens
)

    @property
    def device(self) -> str:
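        """The device that encoded results are moved to."""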
        return self._device

    @property
    def pad_token_id(self) -> int:
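        """The id of the padding token."""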
        return self._tokenizer.pad_token_id

    @property
    def pad_token(self) -> str:
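        """The padding token itself."""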
        return self._tokenizer.pad_token