refactor(data-loading): migrate to Hugging Face datasets and reorganize structure

2026-07-12 20:15:31 +08:00 · 2026-02-28 21:43:13 +08:00
parent 77d715a2cf
commit 88d1d0790d
7 changed files with 110 additions and 114 deletions
--- a/mini-nav/data_loading/init.py
+++ b/mini-nav/data_loading/init.py
@@ -0,0 +1,8 @@
+from .loader import load_synth_dataset, load_val_dataset
+from .synthesizer import ImageSynthesizer
+
+__all__ = [  # public names re-exported from .loader and .synthesizer
+    "ImageSynthesizer",
+    "load_synth_dataset",
+    "load_val_dataset",
+]
--- a/mini-nav/data_loading/loader.py
+++ b/mini-nav/data_loading/loader.py
@@ -0,0 +1,96 @@
+"""Data loaders for synthetic and validation datasets using Hugging Face datasets."""
+
+from pathlib import Path
+from typing import Any
+
+from datasets import Dataset, Image
+
+
+def load_synth_dataset(
+    synth_dir: Path,
+    annotations_suffix: str = ".txt",
+) -> Dataset:
+    """Load synthesized images and their box annotations as a Hugging Face Dataset.
+
+    Every ``synth_*.jpg`` directly inside ``synth_dir`` becomes one row. Its
+    annotation file is the same path with ``annotations_suffix``; each line that
+    splits into exactly five fields is read as ``category xmin ymin xmax ymax``
+    and stored as ``[xmin, ymin, width, height]``. Other lines are skipped, and
+    a missing annotation file yields empty lists for that image.
+
+    Args:
+        synth_dir: Directory containing synthesized images and annotations.
+        annotations_suffix: Suffix (including the dot) of annotation files.
+
+    Returns:
+        Dataset with an ``image`` column (cast to ``Image``) and an ``objects``
+        column holding ``bbox``, ``category``, ``area`` and ``id`` lists.
+
+    Raises:
+        ValueError: If a five-field line has a non-integer coordinate.
+    """
+    synth_dir = Path(synth_dir)
+    image_files = sorted(synth_dir.glob("synth_*.jpg"))
+    if not image_files:
+        return Dataset.from_dict({"image": [], "objects": []}).cast_column("image", Image())
+
+    image_paths: list[str] = []
+    all_objects: list[dict[str, Any]] = []
+    for img_path in image_files:
+        image_paths.append(str(img_path))
+        objects: dict[str, Any] = {"bbox": [], "category": [], "area": [], "id": []}
+        all_objects.append(objects)
+        anno_path = img_path.with_suffix(annotations_suffix)
+        if not anno_path.exists():
+            continue
+        # Explicit encoding keeps category names independent of the OS locale.
+        with open(anno_path, encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                parts = line.split()
+                if len(parts) != 5:  # also covers blank lines (zero fields)
+                    continue
+                try:
+                    xmin, ymin, xmax, ymax = map(int, parts[1:])
+                except ValueError as err:
+                    # Same exception type as before, now naming file and line.
+                    raise ValueError(f"{anno_path}:{idx + 1}: bad coordinates {parts[1:]}") from err
+                width, height = xmax - xmin, ymax - ymin
+                objects["bbox"].append([float(xmin), float(ymin), float(width), float(height)])
+                objects["category"].append(parts[0])
+                objects["area"].append(float(width * height))
+                objects["id"].append(idx)  # zero-based line index within the file
+
+    dataset = Dataset.from_dict({"image": image_paths, "objects": all_objects})
+    return dataset.cast_column("image", Image())
+
+
+def load_val_dataset(
+    scenes_dir: Path,
+    split: str = "easy",
+) -> Dataset:
+    """Load the validation scene images of one split as a Hugging Face Dataset.
+
+    Collects every ``rgb_*.jpg`` found exactly one directory level below
+    ``scenes_dir / split``, in sorted path order.
+
+    Args:
+        scenes_dir: Root directory that holds the split directories.
+        split: Scene split to load ('easy' or 'hard').
+
+    Returns:
+        Dataset with an ``image`` column (cast to ``Image``) and an ``image_id``
+        column holding each file's stem; empty when no image matches.
+
+    Raises:
+        ValueError: If ``scenes_dir / split`` is not an existing directory.
+    """
+    split_dir = Path(scenes_dir) / split
+    # is_dir() also rejects a path that exists but is a regular file.
+    if not split_dir.is_dir():
+        raise ValueError(f"Scene split directory not found: {split_dir}")
+
+    rgb_files = sorted(split_dir.glob("*/rgb_*.jpg"))
+    # NOTE(review): stems may repeat across scene subdirectories -- confirm uniqueness.
+    image_ids = [p.stem for p in rgb_files]
+    dataset = Dataset.from_dict({"image": [str(p) for p in rgb_files], "image_id": image_ids})
+    return dataset.cast_column("image", Image())
--- a/mini-nav/data_loading/synthesizer.py
+++ b/mini-nav/data_loading/synthesizer.py
--- a/mini-nav/datasets/init.py
+++ b/mini-nav/datasets/init.py
@@ -1,8 +0,0 @@
-from .loader import SynthDataset, ValDataset
-from .synthesizer import ImageSynthesizer
-
-__all__ = [
-    "ImageSynthesizer",
-    "SynthDataset",
-    "ValDataset",
-]
--- a/mini-nav/datasets/loader.py
+++ b/mini-nav/datasets/loader.py
@@ -1,105 +0,0 @@
-"""Data loaders for synthetic and validation datasets."""
-
-from collections.abc import Iterator
-from pathlib import Path
-
-from PIL import Image
-
-
-class SynthDataset:
-    """Dataset loader for synthesized training images."""
-
-    def __init__(self, synth_dir: Path, annotations_suffix: str = ".txt"):
-        """
-        Initialize the synthetic dataset loader.
-
-        Args:
-            synth_dir: Directory containing synthesized images and annotations
-            annotations_suffix: Suffix for annotation files
-        """
-        self.synth_dir = Path(synth_dir)
-        self.annotations_suffix = annotations_suffix
-
-        # Find all images
-        self.image_files = sorted(self.synth_dir.glob("synth_*.jpg"))
-
-    def __len__(self) -> int:
-        return len(self.image_files)
-
-    def __getitem__(self, idx: int) -> tuple[Image.Image, list[tuple[str, int, int, int, int]]]:
-        """Get a single item.
-
-        Args:
-            idx: Index of the item
-
-        Returns:
-            Tuple of (image, annotations) where annotations is a list of
-            (category, xmin, ymin, xmax, ymax)
-        """
-        img_path = self.image_files[idx]
-        image = Image.open(img_path).convert("RGB")
-
-        # Load annotations
-        anno_path = img_path.with_suffix(self.annotations_suffix)
-        annotations: list[tuple[str, int, int, int, int]] = []
-
-        if anno_path.exists():
-            with open(anno_path, "r") as f:
-                for line in f:
-                    line = line.strip()
-                    if line:
-                        parts = line.split()
-                        if len(parts) == 5:
-                            category = parts[0]
-                            xmin, ymin, xmax, ymax = map(int, parts[1:])
-                            annotations.append((category, xmin, ymin, xmax, ymax))
-
-        return image, annotations
-
-    def __iter__(self) -> Iterator[tuple[Image.Image, list[tuple[str, int, int, int, int]]]]:
-        """Iterate over the dataset."""
-        for i in range(len(self)):
-            yield self[i]
-
-
-class ValDataset:
-    """Dataset loader for validation scene images."""
-
-    def __init__(self, scenes_dir: Path, split: str = "easy"):
-        """
-        Initialize the validation dataset loader.
-
-        Args:
-            scenes_dir: Directory containing scene subdirectories
-            split: Scene split to load ('easy' or 'hard')
-        """
-        self.scenes_dir = Path(scenes_dir)
-        self.split = split
-
-        self.split_dir = self.scenes_dir / split
-        if not self.split_dir.exists():
-            raise ValueError(f"Scene split directory not found: {self.split_dir}")
-
-        # Find all RGB images
-        self.image_files = sorted(self.split_dir.glob("*/rgb_*.jpg"))
-
-    def __len__(self) -> int:
-        return len(self.image_files)
-
-    def __getitem__(self, idx: int) -> tuple[Image.Image, Path]:
-        """Get a single item.
-
-        Args:
-            idx: Index of the item
-
-        Returns:
-            Tuple of (image, scene_path)
-        """
-        img_path = self.image_files[idx]
-        image = Image.open(img_path).convert("RGB")
-        return image, img_path.parent
-
-    def __iter__(self) -> Iterator[tuple[Image.Image, Path]]:
-        """Iterate over the dataset."""
-        for i in range(len(self)):
-            yield self[i]