refactor(data-loading): migrate to Hugging Face datasets and reorganize structure

2026-07-13 04:25:32 +08:00 · 2026-02-28 21:43:13 +08:00
parent 77d715a2cf
commit 88d1d0790d
7 changed files with 110 additions and 114 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -0,0 +1,5 @@
 {
  "permissions": {
    "allow": ["Bash(uv run:*)", "mcp__fetch__fetch"]
  }
 }
--- a/.gitignore
+++ b/.gitignore
@@ -207,11 +207,11 @@ __marimo__/
 # Projects
 datasets/
 !mini-nav/**/datasets/
 data/
 deps/
 outputs/
 .sisyphus
 .claude/
 # Devenv
 .devenv*
--- a/mini-nav/data_loading/init.py
+++ b/mini-nav/data_loading/init.py
@@ -0,0 +1,8 @@
 from .loader import load_synth_dataset, load_val_dataset
 from .synthesizer import ImageSynthesizer
 __all__ = [
    "ImageSynthesizer",
    "load_synth_dataset",
    "load_val_dataset",
 ]
--- a/mini-nav/data_loading/loader.py
+++ b/mini-nav/data_loading/loader.py
@@ -0,0 +1,96 @@
 """Data loaders for synthetic and validation datasets using Hugging Face datasets."""
 from pathlib import Path
 from typing import Any
 from datasets import Dataset, Image
 def load_synth_dataset(
    synth_dir: Path,
    annotations_suffix: str = ".txt",
 ) -> Dataset:
    """Build an object-detection dataset from a folder of synthesized images.
    Every ``synth_*.jpg`` file directly inside ``synth_dir`` becomes one row,
    in sorted path order. Its annotation file is the image path with the
    extension replaced by ``annotations_suffix``. A usable annotation line has
    exactly five whitespace-separated fields, ``category xmin ymin xmax ymax``,
    with integer coordinates; lines with any other field count (blank lines
    included) are ignored. An image without an annotation file gets empty
    object lists.
    Args:
        synth_dir: Directory containing synthesized images and annotations
        annotations_suffix: Suffix for annotation files
    Returns:
        Hugging Face Dataset with an ``image`` column (cast to the ``Image``
        feature) and an ``objects`` column. ``objects`` holds the parallel
        lists ``bbox`` (``[xmin, ymin, width, height]`` as floats),
        ``category``, ``area`` (width * height) and ``id``.
    Raises:
        ValueError: If a coordinate on a five-field line is not an integer.
    """
    root = Path(synth_dir)
    jpg_paths = sorted(root.glob("synth_*.jpg"))
    if len(jpg_paths) == 0:
        empty = Dataset.from_dict({"image": [], "objects": []})
        return empty.cast_column("image", Image())
    objects_per_image: list[dict[str, Any]] = []
    for jpg in jpg_paths:
        record: dict[str, Any] = {"bbox": [], "category": [], "area": [], "id": []}
        label_file = jpg.with_suffix(annotations_suffix)
        if label_file.exists():
            with open(label_file, "r") as handle:
                for line_no, raw in enumerate(handle):
                    fields = raw.strip().split()
                    # A blank line splits into zero fields, so this single
                    # check rejects both blank and malformed lines.
                    if len(fields) != 5:
                        continue
                    label, *coords = fields
                    left, top, right, bottom = (int(value) for value in coords)
                    box_w = right - left
                    box_h = bottom - top
                    record["bbox"].append(
                        [float(left), float(top), float(box_w), float(box_h)]
                    )
                    record["category"].append(label)
                    record["area"].append(float(box_w * box_h))
                    # The id is the raw line index in the file, so skipped
                    # lines leave gaps in the numbering.
                    record["id"].append(line_no)
        objects_per_image.append(record)
    table = {
        "image": [str(jpg) for jpg in jpg_paths],
        "objects": objects_per_image,
    }
    return Dataset.from_dict(table).cast_column("image", Image())
 def load_val_dataset(
    scenes_dir: Path,
    split: str = "easy",
 ) -> Dataset:
    """Collect the RGB frames of one validation split into a dataset.
    Looks one level below ``scenes_dir / split`` for files matching
    ``rgb_*.jpg`` and lists them in sorted path order. A split directory with
    no matching files yields an empty dataset with the same two columns.
    Args:
        scenes_dir: Directory containing scene subdirectories
        split: Scene split to load ('easy' or 'hard')
    Returns:
        Hugging Face Dataset with an ``image`` column (cast to the ``Image``
        feature) and an ``image_id`` column. ``image_id`` is the file stem
        only, so frames with the same file name in different scene
        subdirectories share an id.
    Raises:
        ValueError: If ``scenes_dir / split`` does not exist.
    """
    split_root = Path(scenes_dir) / split
    if not split_root.exists():
        raise ValueError(f"Scene split directory not found: {split_root}")
    columns: dict[str, list[str]] = {"image": [], "image_id": []}
    for frame in sorted(split_root.glob("*/rgb_*.jpg")):
        columns["image"].append(str(frame))
        columns["image_id"].append(frame.stem)
    # With no frames both lists stay empty, which gives the empty dataset.
    return Dataset.from_dict(columns).cast_column("image", Image())
--- a/mini-nav/data_loading/synthesizer.py
+++ b/mini-nav/data_loading/synthesizer.py
--- a/mini-nav/datasets/init.py
+++ b/mini-nav/datasets/init.py
@@ -1,8 +0,0 @@
 from .loader import SynthDataset, ValDataset
 from .synthesizer import ImageSynthesizer
 __all__ = [
    "ImageSynthesizer",
    "SynthDataset",
    "ValDataset",
 ]
--- a/mini-nav/datasets/loader.py
+++ b/mini-nav/datasets/loader.py
@@ -1,105 +0,0 @@
 """Data loaders for synthetic and validation datasets."""
 from collections.abc import Iterator
 from pathlib import Path
 from PIL import Image
 class SynthDataset:
    """Indexable, iterable reader for synthesized images and their boxes."""
    def __init__(self, synth_dir: Path, annotations_suffix: str = ".txt"):
        """
        Record the dataset location and list its images.
        Args:
            synth_dir: Directory containing synthesized images and annotations
            annotations_suffix: Suffix for annotation files
        """
        self.synth_dir = Path(synth_dir)
        self.annotations_suffix = annotations_suffix
        # Listed once, in sorted order; item indices refer to this list.
        self.image_files = sorted(self.synth_dir.glob("synth_*.jpg"))
    def __len__(self) -> int:
        """Return how many images were found when the loader was created."""
        return len(self.image_files)
    def __getitem__(self, idx: int) -> tuple[Image.Image, list[tuple[str, int, int, int, int]]]:
        """Load one image together with its annotations.
        Args:
            idx: Index of the item
        Returns:
            Tuple of (image, annotations). The image is converted to RGB and
            annotations is a list of (category, xmin, ymin, xmax, ymax); the
            list is empty when the annotation file does not exist.
        """
        image_file = self.image_files[idx]
        picture = Image.open(image_file).convert("RGB")
        boxes: list[tuple[str, int, int, int, int]] = []
        label_file = image_file.with_suffix(self.annotations_suffix)
        if not label_file.exists():
            return picture, boxes
        with open(label_file, "r") as handle:
            for raw in handle:
                fields = raw.strip().split()
                # Blank lines give zero fields; only five-field lines are used.
                if len(fields) != 5:
                    continue
                label, *coords = fields
                xmin, ymin, xmax, ymax = map(int, coords)
                boxes.append((label, xmin, ymin, xmax, ymax))
        return picture, boxes
    def __iter__(self) -> Iterator[tuple[Image.Image, list[tuple[str, int, int, int, int]]]]:
        """Yield every (image, annotations) item in index order."""
        yield from (self[position] for position in range(len(self)))
 class ValDataset:
    """Indexable, iterable reader for the RGB frames of one validation split."""
    def __init__(self, scenes_dir: Path, split: str = "easy"):
        """
        Locate the split directory and list its RGB frames.
        Args:
            scenes_dir: Directory containing scene subdirectories
            split: Scene split to load ('easy' or 'hard')
        Raises:
            ValueError: If ``scenes_dir / split`` does not exist.
        """
        self.scenes_dir = Path(scenes_dir)
        self.split = split
        self.split_dir = self.scenes_dir / split
        split_missing = not self.split_dir.exists()
        if split_missing:
            raise ValueError(f"Scene split directory not found: {self.split_dir}")
        # Frames live one directory level below the split directory.
        self.image_files = sorted(self.split_dir.glob("*/rgb_*.jpg"))
    def __len__(self) -> int:
        """Return how many frames were found when the loader was created."""
        return len(self.image_files)
    def __getitem__(self, idx: int) -> tuple[Image.Image, Path]:
        """Load one frame and report the directory it came from.
        Args:
            idx: Index of the item
        Returns:
            Tuple of (image, scene_path), where the image is converted to RGB
            and scene_path is the parent directory of the image file.
        """
        frame_path = self.image_files[idx]
        frame = Image.open(frame_path).convert("RGB")
        return frame, frame_path.parent
    def __iter__(self) -> Iterator[tuple[Image.Image, Path]]:
        """Yield every (image, scene_path) item in index order."""
        yield from (self[position] for position in range(len(self)))