mirror of
https://github.com/SikongJueluo/Mini-Nav.git
synced 2026-03-12 20:35:31 +08:00
refactor(data-loading): migrate to Hugging Face datasets and reorganize structure
This commit is contained in:
96
mini-nav/data_loading/loader.py
Normal file
96
mini-nav/data_loading/loader.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Data loaders for synthetic and validation datasets using Hugging Face datasets."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from datasets import Dataset, Image
|
||||
|
||||
|
||||
def load_synth_dataset(
|
||||
synth_dir: Path,
|
||||
annotations_suffix: str = ".txt",
|
||||
) -> Dataset:
|
||||
"""Load synthesized dataset for object detection training.
|
||||
|
||||
Args:
|
||||
synth_dir: Directory containing synthesized images and annotations
|
||||
annotations_suffix: Suffix for annotation files
|
||||
|
||||
Returns:
|
||||
Hugging Face Dataset with image and objects columns
|
||||
"""
|
||||
synth_dir = Path(synth_dir)
|
||||
image_files = sorted(synth_dir.glob("synth_*.jpg"))
|
||||
|
||||
if not image_files:
|
||||
return Dataset.from_dict({"image": [], "objects": []}).cast_column("image", Image())
|
||||
|
||||
image_paths: list[str] = []
|
||||
all_objects: list[dict[str, Any]] = []
|
||||
|
||||
for img_path in image_files:
|
||||
image_paths.append(str(img_path))
|
||||
|
||||
anno_path = img_path.with_suffix(annotations_suffix)
|
||||
if not anno_path.exists():
|
||||
all_objects.append({"bbox": [], "category": [], "area": [], "id": []})
|
||||
continue
|
||||
|
||||
bboxes: list[list[float]] = []
|
||||
categories: list[str] = []
|
||||
areas: list[float] = []
|
||||
ids: list[int] = []
|
||||
|
||||
with open(anno_path, "r") as f:
|
||||
for idx, line in enumerate(f):
|
||||
if not (line := line.strip()):
|
||||
continue
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) != 5:
|
||||
continue
|
||||
|
||||
xmin, ymin, xmax, ymax = map(int, parts[1:])
|
||||
width, height = xmax - xmin, ymax - ymin
|
||||
|
||||
bboxes.append([float(xmin), float(ymin), float(width), float(height)])
|
||||
categories.append(parts[0])
|
||||
areas.append(float(width * height))
|
||||
ids.append(idx)
|
||||
|
||||
all_objects.append({"bbox": bboxes, "category": categories, "area": areas, "id": ids})
|
||||
|
||||
dataset = Dataset.from_dict({"image": image_paths, "objects": all_objects})
|
||||
return dataset.cast_column("image", Image())
|
||||
|
||||
|
||||
def load_val_dataset(
|
||||
scenes_dir: Path,
|
||||
split: str = "easy",
|
||||
) -> Dataset:
|
||||
"""Load validation dataset from scene images.
|
||||
|
||||
Args:
|
||||
scenes_dir: Directory containing scene subdirectories
|
||||
split: Scene split to load ('easy' or 'hard')
|
||||
|
||||
Returns:
|
||||
Hugging Face Dataset with image and image_id columns
|
||||
"""
|
||||
scenes_dir = Path(scenes_dir)
|
||||
split_dir = scenes_dir / split
|
||||
|
||||
if not split_dir.exists():
|
||||
raise ValueError(f"Scene split directory not found: {split_dir}")
|
||||
|
||||
rgb_files = sorted(split_dir.glob("*/rgb_*.jpg"))
|
||||
|
||||
if not rgb_files:
|
||||
return Dataset.from_dict({"image": [], "image_id": []}).cast_column("image", Image())
|
||||
|
||||
dataset = Dataset.from_dict({
|
||||
"image": [str(p) for p in rgb_files],
|
||||
"image_id": [p.stem for p in rgb_files],
|
||||
})
|
||||
|
||||
return dataset.cast_column("image", Image())
|
||||
Reference in New Issue
Block a user