mirror of
https://github.com/SikongJueluo/Mini-Nav.git
synced 2026-03-12 20:35:31 +08:00
refactor(benchmarks): modularize benchmark system with config-driven execution
This commit is contained in:
6
mini-nav/benchmarks/datasets/__init__.py
Normal file
6
mini-nav/benchmarks/datasets/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Dataset loaders for benchmark evaluation."""
|
||||
|
||||
from .huggingface import HuggingFaceDataset
|
||||
from .local import LocalDataset
|
||||
|
||||
__all__ = ["HuggingFaceDataset", "LocalDataset"]
|
||||
66
mini-nav/benchmarks/datasets/huggingface.py
Normal file
66
mini-nav/benchmarks/datasets/huggingface.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""HuggingFace dataset loader for benchmark evaluation."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from ..base import BaseDataset
|
||||
|
||||
|
||||
class HuggingFaceDataset(BaseDataset):
    """Dataset loader for HuggingFace datasets."""

    def __init__(
        self,
        hf_id: str,
        img_column: str = "img",
        label_column: str = "label",
    ):
        """Initialize HuggingFace dataset loader.

        Args:
            hf_id: HuggingFace dataset ID.
            img_column: Name of the image column.
            label_column: Name of the label column.
        """
        self.hf_id = hf_id
        self.img_column = img_column
        self.label_column = label_column
        # Cached splits; either may legitimately remain None when the
        # hub dataset does not provide that split.
        self._train_dataset: Any = None
        self._test_dataset: Any = None
        # Explicit load flag: checking `_train_dataset is None` alone would
        # re-download the dataset on every call when no "train" split exists.
        self._loaded: bool = False

    def _load(self) -> tuple[Any, Any]:
        """Load dataset from HuggingFace, at most once.

        Returns:
            Tuple of (train_dataset, test_dataset); either element may be
            None if the corresponding split is absent from the hub dataset.
        """
        if not self._loaded:
            dataset = load_dataset(self.hf_id)
            # Handle datasets that use a 'train' split
            if "train" in dataset:
                self._train_dataset = dataset["train"]
            # Prefer a 'test' split; fall back to 'validation' when only
            # 'train'/'validation' splits are published.
            if "test" in dataset:
                self._test_dataset = dataset["test"]
            elif "validation" in dataset:
                self._test_dataset = dataset["validation"]
            self._loaded = True
        return self._train_dataset, self._test_dataset

    def get_train_split(self) -> Any:
        """Get training split of the dataset.

        Returns:
            Training dataset (None if the hub dataset has no train split).
        """
        train, _ = self._load()
        return train

    def get_test_split(self) -> Any:
        """Get test/evaluation split of the dataset.

        Returns:
            Test dataset (None if the hub dataset has no test/validation split).
        """
        _, test = self._load()
        return test
|
||||
157
mini-nav/benchmarks/datasets/local.py
Normal file
157
mini-nav/benchmarks/datasets/local.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""Local dataset loader for benchmark evaluation."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..base import BaseDataset
|
||||
|
||||
|
||||
class LocalDataset(BaseDataset):
    """Dataset loader for local datasets (CSV file or directory tree)."""

    def __init__(
        self,
        local_path: str,
        img_column: str = "image_path",
        label_column: str = "label",
    ):
        """Initialize local dataset loader.

        Args:
            local_path: Path to local dataset directory or CSV file.
            img_column: Name of the image path column.
            label_column: Name of the label column.
        """
        self.local_path = Path(local_path)
        self.img_column = img_column
        self.label_column = label_column
        # Cached splits; either may legitimately remain None when the
        # corresponding split (train/ or test/ directory) does not exist.
        self._train_dataset: Optional[Any] = None
        self._test_dataset: Optional[Any] = None
        # Explicit load flag: checking the cached attribute against None
        # alone would re-read the CSV / re-walk the directory tree on every
        # accessor call whenever a split is missing.
        self._loaded: bool = False

    def _load_csv_dataset(self) -> tuple[Any, Any]:
        """Load dataset from CSV file.

        Expected CSV format:
            label,image_path,x1,y1,x2,y2
            "class_name","path/to/image.jpg",100,200,300,400

        Returns:
            Tuple of (train_dataset, test_dataset).
        """
        import pandas as pd

        from torch.utils.data import Dataset as TorchDataset

        # Load CSV file
        df = pd.read_csv(self.local_path)

        # Minimal map-style dataset over the dataframe rows.
        class CSVDataset(TorchDataset):
            def __init__(self, dataframe: pd.DataFrame, img_col: str, label_col: str):
                self.df = dataframe.reset_index(drop=True)
                self.img_col = img_col
                self.label_col = label_col

            def __len__(self) -> int:
                return len(self.df)

            def __getitem__(self, idx: int) -> dict[str, Any]:
                row = self.df.iloc[idx]
                return {
                    "img": row[self.img_col],
                    "label": row[self.label_col],
                }

        # Split into train/test (80/20), preserving file order.
        split_idx = int(len(df) * 0.8)
        train_df = df.iloc[:split_idx]
        test_df = df.iloc[split_idx:]

        self._train_dataset = CSVDataset(train_df, self.img_column, self.label_column)
        self._test_dataset = CSVDataset(test_df, self.img_column, self.label_column)
        self._loaded = True

        return self._train_dataset, self._test_dataset

    def _load_directory_dataset(self) -> tuple[Any, Any]:
        """Load dataset from directory structure.

        Expected structure:
            local_path/
                train/
                    class_name_1/
                        image1.jpg
                        image2.jpg
                    class_name_2/
                        image1.jpg
                test/
                    class_name_1/
                        image1.jpg

        Returns:
            Tuple of (train_dataset, test_dataset); either element may be
            None if the corresponding train/ or test/ directory is absent.
        """
        from torch.utils.data import Dataset as TorchDataset
        from PIL import Image

        class DirectoryDataset(TorchDataset):
            def __init__(self, root_dir: Path, transform=None):
                self.root_dir = root_dir
                self.transform = transform
                self.samples: list[tuple[Path, int]] = []

                # Sort class names so label indices are deterministic
                # across runs and platforms.
                classes = sorted([d.name for d in root_dir.iterdir() if d.is_dir()])
                self.label_map = {cls: idx for idx, cls in enumerate(classes)}

                # Build sample list: (image path, integer label)
                for cls_dir in root_dir.iterdir():
                    if cls_dir.is_dir():
                        label = self.label_map[cls_dir.name]
                        for img_path in cls_dir.iterdir():
                            if img_path.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
                                self.samples.append((img_path, label))

            def __len__(self) -> int:
                return len(self.samples)

            def __getitem__(self, idx: int) -> dict[str, Any]:
                img_path, label = self.samples[idx]
                image = Image.open(img_path).convert("RGB")
                # Apply the optional transform (the original stored it but
                # never used it).
                if self.transform is not None:
                    image = self.transform(image)
                return {"img": image, "label": label}

        train_dir = self.local_path / "train"
        test_dir = self.local_path / "test"

        if train_dir.exists():
            self._train_dataset = DirectoryDataset(train_dir)
        if test_dir.exists():
            self._test_dataset = DirectoryDataset(test_dir)
        self._loaded = True

        return self._train_dataset, self._test_dataset

    def _ensure_loaded(self) -> None:
        """Dispatch to the CSV or directory loader, at most once."""
        if not self._loaded:
            if self.local_path.suffix.lower() == ".csv":
                self._load_csv_dataset()
            else:
                self._load_directory_dataset()

    def get_train_split(self) -> Any:
        """Get training split of the dataset.

        Returns:
            Training dataset (None if no train split exists locally).
        """
        self._ensure_loaded()
        return self._train_dataset

    def get_test_split(self) -> Any:
        """Get test/evaluation split of the dataset.

        Returns:
            Test dataset (None if no test split exists locally).
        """
        self._ensure_loaded()
        return self._test_dataset
|
||||
Reference in New Issue
Block a user