refactor(data-loading): migrate to Hugging Face datasets and reorganize structure

This commit is contained in:
2026-02-28 21:43:13 +08:00
parent 77d715a2cf
commit 88d1d0790d
7 changed files with 110 additions and 114 deletions

View File

@@ -0,0 +1,8 @@
from .loader import load_synth_dataset, load_val_dataset
from .synthesizer import ImageSynthesizer
# Public names re-exported by this package: the synthesizer class and the two
# function-based loaders imported above.
__all__ = [
"ImageSynthesizer",
"load_synth_dataset",
"load_val_dataset",
]

View File

@@ -0,0 +1,96 @@
"""Data loaders for synthetic and validation datasets using Hugging Face datasets."""
from pathlib import Path
from typing import Any
from datasets import Dataset, Image
def load_synth_dataset(
synth_dir: Path,
annotations_suffix: str = ".txt",
) -> Dataset:
"""Load synthesized dataset for object detection training.
Args:
synth_dir: Directory containing synthesized images and annotations
annotations_suffix: Suffix for annotation files
Returns:
Hugging Face Dataset with image and objects columns
"""
synth_dir = Path(synth_dir)
image_files = sorted(synth_dir.glob("synth_*.jpg"))
if not image_files:
return Dataset.from_dict({"image": [], "objects": []}).cast_column("image", Image())
image_paths: list[str] = []
all_objects: list[dict[str, Any]] = []
for img_path in image_files:
image_paths.append(str(img_path))
anno_path = img_path.with_suffix(annotations_suffix)
if not anno_path.exists():
all_objects.append({"bbox": [], "category": [], "area": [], "id": []})
continue
bboxes: list[list[float]] = []
categories: list[str] = []
areas: list[float] = []
ids: list[int] = []
with open(anno_path, "r") as f:
for idx, line in enumerate(f):
if not (line := line.strip()):
continue
parts = line.split()
if len(parts) != 5:
continue
xmin, ymin, xmax, ymax = map(int, parts[1:])
width, height = xmax - xmin, ymax - ymin
bboxes.append([float(xmin), float(ymin), float(width), float(height)])
categories.append(parts[0])
areas.append(float(width * height))
ids.append(idx)
all_objects.append({"bbox": bboxes, "category": categories, "area": areas, "id": ids})
dataset = Dataset.from_dict({"image": image_paths, "objects": all_objects})
return dataset.cast_column("image", Image())
def load_val_dataset(
    scenes_dir: Path,
    split: str = "easy",
) -> Dataset:
    """Load validation dataset from scene images.

    Images are the files matching ``*/rgb_*.jpg`` below ``scenes_dir / split``
    (one directory level of scene subdirectories), in sorted path order.

    Args:
        scenes_dir: Directory containing scene subdirectories
        split: Scene split to load ('easy' or 'hard')

    Returns:
        Hugging Face Dataset with image and image_id columns. ``image_id`` is
        the image file name without its extension. The dataset has zero rows
        when no image matches.

    Raises:
        ValueError: If ``scenes_dir / split`` is missing or is not a directory.
    """
    split_dir = Path(scenes_dir) / split
    # is_dir() also rejects a regular file with the split's name, which would
    # otherwise glob to nothing and silently yield an empty dataset.
    if not split_dir.is_dir():
        raise ValueError(f"Scene split directory not found: {split_dir}")

    rgb_files = sorted(split_dir.glob("*/rgb_*.jpg"))

    # NOTE(review): image_id is only the file stem, so two scene directories
    # holding identically named files produce duplicate ids -- confirm against
    # the dataset layout.
    dataset = Dataset.from_dict(
        {
            "image": [str(p) for p in rgb_files],
            "image_id": [p.stem for p in rgb_files],
        }
    )
    return dataset.cast_column("image", Image())

View File

@@ -1,8 +0,0 @@
from .loader import SynthDataset, ValDataset
from .synthesizer import ImageSynthesizer
# Public names re-exported by this package: the synthesizer class and the two
# class-based loaders imported above.
__all__ = [
"ImageSynthesizer",
"SynthDataset",
"ValDataset",
]

View File

@@ -1,105 +0,0 @@
"""Data loaders for synthetic and validation datasets."""
from collections.abc import Iterator
from pathlib import Path
from PIL import Image
class SynthDataset:
    """Dataset loader for synthesized training images.

    Items are ``(image, annotations)`` pairs. Images are the ``synth_*.jpg``
    files directly inside ``synth_dir``; the annotation file of an image is
    the image path with its suffix replaced by ``annotations_suffix``.
    """

    def __init__(self, synth_dir: Path, annotations_suffix: str = ".txt"):
        """
        Initialize the synthetic dataset loader.

        Args:
            synth_dir: Directory containing synthesized images and annotations
            annotations_suffix: Suffix for annotation files
        """
        self.synth_dir = Path(synth_dir)
        self.annotations_suffix = annotations_suffix
        # Sorted by path; this fixes the index order of the dataset.
        self.image_files = sorted(self.synth_dir.glob("synth_*.jpg"))

    def __len__(self) -> int:
        """Return the number of images found at construction time."""
        return len(self.image_files)

    def __getitem__(self, idx: int) -> tuple[Image.Image, list[tuple[str, int, int, int, int]]]:
        """Get a single item.

        Args:
            idx: Index of the item

        Returns:
            Tuple of (image, annotations) where image is converted to RGB and
            annotations is a list of (category, xmin, ymin, xmax, ymax)

        Raises:
            IndexError: If ``idx`` is out of range.
            ValueError: If an annotation line has a non-integer coordinate.
        """
        img_path = self.image_files[idx]
        # convert() returns a new image, so the file handle can be closed
        # deterministically as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        anno_path = img_path.with_suffix(self.annotations_suffix)
        return image, self._read_annotations(anno_path)

    @staticmethod
    def _read_annotations(anno_path: Path) -> list[tuple[str, int, int, int, int]]:
        """Parse ``<category> <xmin> <ymin> <xmax> <ymax>`` lines from a file.

        A missing file yields an empty list; blank lines and lines without
        exactly five whitespace-separated fields are skipped.
        """
        try:
            # Explicit encoding so category names decode the same everywhere.
            with open(anno_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except FileNotFoundError:
            return []

        annotations: list[tuple[str, int, int, int, int]] = []
        for raw_line in lines:
            parts = raw_line.split()
            if len(parts) != 5:
                continue
            xmin, ymin, xmax, ymax = map(int, parts[1:])
            annotations.append((parts[0], xmin, ymin, xmax, ymax))
        return annotations

    def __iter__(self) -> Iterator[tuple[Image.Image, list[tuple[str, int, int, int, int]]]]:
        """Iterate over the dataset in index order."""
        for idx, _ in enumerate(self.image_files):
            yield self[idx]
class ValDataset:
    """Dataset loader for validation scene images.

    Items are ``(image, scene_path)`` pairs. Images are the files matching
    ``*/rgb_*.jpg`` below ``scenes_dir / split``.
    """

    def __init__(self, scenes_dir: Path, split: str = "easy"):
        """
        Initialize the validation dataset loader.

        Args:
            scenes_dir: Directory containing scene subdirectories
            split: Scene split to load ('easy' or 'hard')

        Raises:
            ValueError: If ``scenes_dir / split`` is missing or is not a
                directory.
        """
        self.scenes_dir = Path(scenes_dir)
        self.split = split
        self.split_dir = self.scenes_dir / split

        # is_dir() also rejects a regular file with the split's name, which
        # would otherwise glob to nothing and look like an empty split.
        if not self.split_dir.is_dir():
            raise ValueError(f"Scene split directory not found: {self.split_dir}")

        # Sorted by path; this fixes the index order of the dataset.
        self.image_files = sorted(self.split_dir.glob("*/rgb_*.jpg"))

    def __len__(self) -> int:
        """Return the number of images found at construction time."""
        return len(self.image_files)

    def __getitem__(self, idx: int) -> tuple[Image.Image, Path]:
        """Get a single item.

        Args:
            idx: Index of the item

        Returns:
            Tuple of (image, scene_path) where image is converted to RGB and
            scene_path is the directory containing the image file

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        img_path = self.image_files[idx]
        # convert() returns a new image, so the file handle can be closed
        # deterministically as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        return image, img_path.parent

    def __iter__(self) -> Iterator[tuple[Image.Image, Path]]:
        """Iterate over the dataset in index order."""
        for idx, _ in enumerate(self.image_files):
            yield self[idx]