# Source code for bocoel.corpora.storages.pandas
# Copyright (c) RenChu Wang - All Rights Reserved
import json
from collections.abc import Collection, Mapping, Sequence
from pathlib import Path
from typing import Any
from pandas import DataFrame
from bocoel.corpora.storages.interfaces import Storage
[docs]
class PandasStorage(Storage):
    """
    Storage for pandas DataFrame.

    Since pandas DataFrames are in-memory, this storage is fast,
    but might be memory inefficient and require a lot of RAM.
    """

    def __init__(self, df: DataFrame, /) -> None:
        """
        Wrap an existing DataFrame.

        Parameters:
            df: The DataFrame to serve rows from. It is stored by reference,
                not copied.
        """

        self._df = df

    def keys(self) -> Collection[str]:
        """
        Returns:
            The `columns` of the underlying DataFrame.
        """

        return self._df.columns

    def __len__(self) -> int:
        """
        Returns:
            The number of rows in the underlying DataFrame.
        """

        return len(self._df)

    def _getitem(self, idx: int) -> Mapping[str, Any]:
        """
        Fetch a single row by position.

        Parameters:
            idx: The positional (`iloc`) index of the row.

        Returns:
            The row converted to a dictionary via `to_dict()`.
        """

        return self._df.iloc[idx].to_dict()

    @classmethod
    def from_jsonl_file(cls, path: str | Path, /) -> "PandasStorage":
        """
        Load data from a JSONL file.

        Each non-blank line is decoded as one JSON document.
        Blank (whitespace-only) lines are skipped.

        Parameters:
            path: The path to the file.

        Returns:
            A `PandasStorage` instance.

        Raises:
            FileNotFoundError: If `path` does not exist.
            ValueError: If `path` exists but is not a regular file.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """

        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(path)

        if not path.is_file():
            raise ValueError(f"Cannot open file: {path}")

        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path, encoding="utf-8") as f:
            # Iterate the handle directly instead of materialising all lines.
            # `json.loads` tolerates the trailing newline, and blank lines
            # (e.g. an empty last line) are skipped rather than failing to parse.
            data = [json.loads(line) for line in f if line.strip()]

        return cls.from_jsonl(data)

    @classmethod
    def from_jsonl(cls, data: Sequence[Mapping[str, Any]], /) -> "PandasStorage":
        """
        Load data from a JSONL object or a list of JSON.

        Parameters:
            data: The JSONL object or list of JSON.

        Returns:
            A `PandasStorage` instance.
        """

        df = DataFrame.from_records(data)
        return cls(df)