feat(feature-compressor): add DINOv2 feature extraction and compression pipeline

This commit is contained in:
2026-01-31 10:33:37 +08:00
parent f9a359fc28
commit 1454647aa6
22 changed files with 1486 additions and 16 deletions

View File

@@ -0,0 +1,83 @@
"""Feature processing utilities."""
from pathlib import Path
from typing import Dict
import numpy as np
import torch
import yaml
def normalize_features(features: torch.Tensor) -> torch.Tensor:
"""L2-normalize features.
Args:
features: Tensor of shape [batch, dim] or [batch, seq, dim]
Returns:
L2-normalized features
"""
norm = torch.norm(features, p=2, dim=-1, keepdim=True)
return features / (norm + 1e-8)
def compute_feature_stats(features: torch.Tensor) -> Dict[str, float]:
"""Compute basic statistics for features.
Args:
features: Tensor of shape [batch, dim] or [batch, seq, dim]
Returns:
Dictionary with mean, std, min, max
"""
with torch.no_grad():
return {
"mean": float(features.mean().item()),
"std": float(features.std().item()),
"min": float(features.min().item()),
"max": float(features.max().item()),
}
def save_features_to_json(
features: torch.Tensor, path: Path, metadata: Dict = None
) -> None:
"""Save features to JSON file.
Args:
features: Tensor to save
path: Output file path
metadata: Optional metadata dictionary
"""
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
features_np = features.cpu().numpy()
data = {
"features": features_np.tolist(),
"shape": list(features.shape),
}
if metadata:
data["metadata"] = metadata
with open(path, "w") as f:
import json
json.dump(data, f, indent=2)
def save_features_to_csv(features: torch.Tensor, path: Path) -> None:
"""Save features to CSV file.
Args:
features: Tensor to save
path: Output file path
"""
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
features_np = features.cpu().numpy()
np.savetxt(path, features_np, delimiter=",", fmt="%.6f")