Source code for bocoel.corpora.corpora.interfaces
from typing import Protocol
from bocoel import common
from bocoel.corpora.indices import Index
from bocoel.corpora.storages import Storage
[docs]
class Corpus(Protocol):
    """
    Entry point for handling the data in this library.

    A corpus is described in terms of three main components:

    - Index: searches one particular column in the storage and provides
      fast retrieval.
    - Storage: stores the questions / answers / texts.
    - Embedder: embeds the text into vectors for faster access.

    Of these, only ``storage`` and ``index`` are declared as attributes
    on this protocol.

    An index corresponds to exactly one key. If searching over multiple
    keys is desired, create a new column or a new corpus (sharing the
    same storage).
    """

    storage: Storage
    """
    Storage for the questions / answers / etc.
    Can be viewed as a dataframe of texts.
    """

    index: Index
    """
    Index that searches one particular column in the storage, as vectors.
    """

    def __repr__(self) -> str:
        """Return ``<name>(<storage>, <index>)`` built from this corpus's parts."""
        # NOTE(review): the display name is whatever ``common.remove_base_suffix``
        # returns for this instance and the ``Corpus`` base; presumably the
        # concrete class name with the base's name trimmed off -- confirm
        # against that helper.
        display_name = common.remove_base_suffix(self, Corpus)
        storage_text = str(self.storage)
        index_text = str(self.index)
        return f"{display_name}({storage_text}, {index_text})"