Source code for bocoel.core.optim.sklearn.kmeans
# Copyright (c) RenChu Wang - All Rights Reserved
from typing import Literal, TypedDict
from numpy.typing import NDArray
from sklearn.cluster import KMeans
from typing_extensions import NotRequired
from bocoel.core.optim.interfaces import IndexEvaluator
from bocoel.corpora import Index
from .optim import ScikitLearnOptimizer
[docs]
class KMeansOptions(TypedDict):
n_clusters: int
init: NotRequired[Literal["k-means++", "random"]]
n_init: NotRequired[int | Literal["auto"]]
tol: NotRequired[float]
verbose: NotRequired[int]
random_state: NotRequired[int]
algorithm: NotRequired[Literal["llyod", "elkan"]]
[docs]
class KMeansOptimizer(ScikitLearnOptimizer):
    """
    Optimizer backed by a scikit-learn `KMeans` clustering model.
    """

    def __init__(
        self,
        index_eval: IndexEvaluator,
        index: Index,
        *,
        batch_size: int,
        embeddings: NDArray,
        model_kwargs: KMeansOptions,
    ) -> None:
        """
        Parameters:
            index_eval: Evaluator that is applied on the storage.
            index: Index that queries are issued against.
            batch_size: How many embeddings are evaluated at once.
            embeddings: The embeddings that get clustered.
            model_kwargs: Keyword arguments unpacked into the `KMeans` model.
        """

        # The clustering model is built directly from the caller's options
        # and handed to the generic scikit-learn optimizer base class.
        super().__init__(
            index_eval=index_eval,
            index=index,
            embeddings=embeddings,
            model=KMeans(**model_kwargs),
            batch_size=batch_size,
        )

        # Retained so `__repr__` can report the configured cluster count.
        self._model_kwargs = model_kwargs

    def __repr__(self) -> str:
        """Return `KMeans(<n_clusters>)` using the stored model options."""

        return "KMeans({})".format(self._model_kwargs["n_clusters"])