feat(compressors): add SAM+DINO+Hash pipeline for object feature extraction

This commit is contained in:
2026-03-02 14:22:44 +08:00
parent 370c4a6588
commit a7b01cb49e
7 changed files with 753 additions and 8 deletions

View File

@@ -1,6 +1,8 @@
from .common import BinarySign, bits_to_hash, hamming_distance, hamming_similarity, hash_to_bits
from .dino_compressor import DinoCompressor
from .hash_compressor import HashCompressor, HashLoss, VideoPositiveMask
from .pipeline import SAMHashPipeline, create_pipeline_from_config
from .segament_compressor import SegmentCompressor
from .train import train
__all__ = [
@@ -9,6 +11,9 @@ __all__ = [
"HashCompressor",
"HashLoss",
"VideoPositiveMask",
"SegmentCompressor",
"SAMHashPipeline",
"create_pipeline_from_config",
"BinarySign",
"hamming_distance",
"hamming_similarity",

View File

@@ -1,8 +1,10 @@
from typing import Optional, cast
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from transformers import AutoModel, Dinov2Model
from PIL import Image
from transformers import AutoImageProcessor, AutoModel
class DinoCompressor(nn.Module):
@@ -10,15 +12,34 @@ class DinoCompressor(nn.Module):
When compressor is None: returns normalized DINO embeddings.
When compressor is provided: returns binary hash bits for CAM storage.
Supports both PIL Image input and pre-extracted tokens.
"""
def __init__(
    self,
    model_name: str = "facebook/dinov2-large",
    compressor: Optional[nn.Module] = None,
    device: Optional[str] = None,
):
    """Initialize the DINOv2 feature extractor.

    Args:
        model_name: HuggingFace model name passed to ``from_pretrained``
            for both the image processor and the backbone.
        compressor: Optional hash compressor used to turn DINO features
            into binary codes. When ``None`` the extractor returns
            normalized DINO features.
        device: Device string to load the backbone on. When ``None``,
            "cuda" is used if available, otherwise "cpu".
    """
    super().__init__()

    # Auto-detect the device when the caller does not pin one.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device)

    self.model_name = model_name

    # Load the backbone exactly once, from the requested checkpoint. The
    # previous hard-coded "facebook/dinov2-large" load is gone so that
    # `model_name` is actually honoured and no second model is built.
    self.processor = AutoImageProcessor.from_pretrained(model_name)
    self.dino = AutoModel.from_pretrained(model_name).to(self.device)
    self.dino.eval()

    self.compressor = compressor
@@ -34,3 +55,51 @@ class DinoCompressor(nn.Module):
# HashCompressor returns (logits, hash_codes, bits)
_, _, bits = self.compressor(teacher_tokens)
return bits # [B, 512] binary bits for CAM
def extract_features(self, images: list[Image.Image]) -> torch.Tensor:
    """Compute one normalized DINO embedding per input image.

    Args:
        images: List of PIL Images (cropped objects).

    Returns:
        Tensor of shape [N, hidden_size] with L2-normalized rows. For an
        empty input list an empty [0, hidden_size] tensor is returned on
        ``self.device``.
    """
    if not images:
        # Nothing to encode: keep the feature width so callers can still
        # concatenate or inspect the shape.
        hidden = self.dino.config.hidden_size
        return torch.empty(0, hidden, device=self.device)

    # Preprocess the whole list as a single batch.
    batch = self.processor(images, return_tensors="pt").to(self.device)

    with torch.no_grad():
        token_states = self.dino(**batch).last_hidden_state

    # Average over the token axis to obtain one global vector per image.
    pooled = token_states.mean(dim=1)
    return F.normalize(pooled, dim=-1)
def encode(self, images: list[Image.Image]) -> torch.Tensor:
    """Encode images as DINO features or, if configured, hash bits.

    Args:
        images: List of PIL Images.

    Returns:
        When no compressor is attached: the output of
        ``extract_features`` ([N, feature_dim]).
        When a compressor is attached: the third element of the
        compressor's output tuple (the binary bits).
    """
    pooled = self.extract_features(images)

    compressor = self.compressor
    if compressor is None:
        return pooled

    # The compressor is fed a token sequence, so present each pooled vector
    # as a sequence of length one: [N, dim] -> [N, 1, dim].
    as_tokens = pooled.unsqueeze(1)

    # The compressor returns a 3-tuple; only the last element (bits) is used.
    compressed = compressor(as_tokens)
    _, _, bits = compressed
    return bits

View File

@@ -0,0 +1,170 @@
"""Complete pipeline for SAM + DINO + HashCompressor.
This pipeline extracts object masks from images using SAM2.1,
crops the objects, extracts features using DINOv2,
and compresses them to binary hash codes using HashCompressor.
"""
from pathlib import Path
from typing import Optional
import torch
import torch.nn as nn
from PIL import Image
from .dino_compressor import DinoCompressor
from .hash_compressor import HashCompressor
from .segament_compressor import SegmentCompressor
def create_pipeline_from_config(config) -> "SAMHashPipeline":
    """Build a SAMHashPipeline from the ``model`` section of a config object.

    Args:
        config: Configuration object exposing ``config.model`` with the
            attributes ``sam_model``, ``name``, ``compression_dim``,
            ``sam_min_mask_area``, ``sam_max_masks``, ``compressor_path``
            and ``device``.

    Returns:
        Initialized SAMHashPipeline.
    """
    pipeline_kwargs = {
        "sam_model": config.model.sam_model,
        "dino_model": config.model.name,
        "hash_bits": config.model.compression_dim,
        "sam_min_mask_area": config.model.sam_min_mask_area,
        "sam_max_masks": config.model.sam_max_masks,
        "compressor_path": config.model.compressor_path,
    }

    # The literal "auto" means "let the pipeline choose", which the pipeline
    # expresses as device=None.
    if config.model.device != "auto":
        pipeline_kwargs["device"] = config.model.device
    else:
        pipeline_kwargs["device"] = None

    return SAMHashPipeline(**pipeline_kwargs)
class SAMHashPipeline(nn.Module):
    """Complete pipeline: SAM segmentation + DINO features + hash compression.

    Pipeline flow:
        Image -> SAM (extract masks) -> Crop objects -> DINO (features)
        -> Hash (binary codes)

    Usage:
        # Initialize with config
        pipeline = SAMHashPipeline(
            sam_model="facebook/sam2.1-hiera-large",
            dino_model="facebook/dinov2-large",
            hash_bits=512,
        )

        # Process image
        image = Image.open("path/to/image.jpg")
        hash_codes = pipeline(image)  # [N, hash_bits] binary bits
    """

    def __init__(
        self,
        sam_model: str = "facebook/sam2.1-hiera-large",
        dino_model: str = "facebook/dinov2-large",
        hash_bits: int = 512,
        sam_min_mask_area: int = 100,
        sam_max_masks: int = 10,
        compressor_path: Optional[str] = None,
        device: Optional[str] = None,
    ):
        """Initialize the complete pipeline.

        Args:
            sam_model: SAM model name from HuggingFace.
            dino_model: DINOv2 model name from HuggingFace.
            hash_bits: Number of bits in the hash code.
            sam_min_mask_area: Minimum mask area threshold.
            sam_max_masks: Maximum number of masks to keep.
            compressor_path: Optional path to trained HashCompressor weights.
            device: Device to run models on ("cuda" if available, else "cpu",
                when ``None``).
        """
        super().__init__()

        # Auto-detect the device when the caller does not pin one.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.segmentor = SegmentCompressor(
            model_name=sam_model,
            min_mask_area=sam_min_mask_area,
            max_masks=sam_max_masks,
            device=device,
        )

        # Build the DINO extractor first so the compressor's input width can
        # be read from the loaded backbone instead of being guessed from the
        # model name (the old `"large" in name` heuristic was wrong for the
        # small and giant variants).
        self.dino = DinoCompressor(
            model_name=dino_model,
            compressor=None,
            device=device,
        )
        self.feature_dim = self.dino.dino.config.hidden_size

        self.hash_compressor = HashCompressor(
            input_dim=self.feature_dim, hash_bits=hash_bits
        ).to(device)

        # Load pretrained compressor weights if provided.
        if compressor_path is not None:
            # NOTE(review): torch.load unpickles the file; only point this at
            # trusted checkpoints (consider weights_only=True where the
            # installed torch supports it).
            self.hash_compressor.load_state_dict(
                torch.load(compressor_path, map_location=device)
            )
            print(f"[OK] Loaded HashCompressor from {compressor_path}")

        # Attach the (possibly pretrained) compressor to the DINO extractor.
        self.dino.compressor = self.hash_compressor

    def forward(self, image: Image.Image) -> torch.Tensor:
        """Process a single image through the complete pipeline.

        Args:
            image: Input PIL Image.

        Returns:
            Binary hash codes [N, hash_bits] where N is the number of
            detected objects. When nothing is detected, an empty
            [0, hash_bits] int32 tensor on ``self.device``.
        """
        # Step 1: SAM - extract and crop objects.
        cropped_objects = self.segmentor(image)

        if not cropped_objects:
            return torch.empty(
                0,
                self.hash_compressor.hash_bits,
                dtype=torch.int32,
                device=self.device,
            )

        # Step 2 + 3: DINO features, then hash compression.
        return self.dino.encode(cropped_objects)

    def extract_features(
        self, image: Image.Image, use_hash: bool = False
    ) -> torch.Tensor:
        """Extract per-object features with optional hash compression.

        Args:
            image: Input PIL Image.
            use_hash: If True, return binary hash codes; otherwise return
                DINO features.

        Returns:
            Features [N, dim] where dim is the DINO hidden size, or
            ``hash_bits`` when ``use_hash`` is True.
        """
        cropped_objects = self.segmentor(image)

        if not cropped_objects:
            # Match the width of the non-empty result for the chosen mode.
            dim = self.hash_compressor.hash_bits if use_hash else self.feature_dim
            return torch.empty(0, dim, device=self.device)

        if use_hash:
            return self.dino.encode(cropped_objects)
        return self.dino.extract_features(cropped_objects)

    def extract_masks(self, image: Image.Image) -> list[torch.Tensor]:
        """Extract only masks without full processing (for debugging).

        Args:
            image: Input PIL Image.

        Returns:
            List of masks as returned by the segmentor's ``extract_masks``.
        """
        return self.segmentor.extract_masks(image)

View File

@@ -0,0 +1,180 @@
"""Segment Anything 2 feature extractor with mask filtering and image cropping.
Extracts object masks from images using SAM2.1, filters them by pixel area
(keeping the largest ones), then crops the original image to obtain individual
object regions.
"""
from typing import Optional
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoModelForMaskGeneration, AutoProcessor
class SegmentCompressor(nn.Module):
    """SAM2.1-based segmenter with mask filtering and object cropping.

    Predicts masks for an image, drops masks below a minimum pixel area,
    keeps the largest ``max_masks`` of the rest, and crops the original
    image around each remaining mask.
    """

    # Pixels of context added on every side of a mask's bounding box.
    _CROP_PADDING = 5

    def __init__(
        self,
        model_name: str = "facebook/sam2.1-hiera-large",
        min_mask_area: int = 100,
        max_masks: int = 10,
        device: Optional[str] = None,
    ):
        """Initialize the SAM2.1 segmenter.

        Args:
            model_name: HuggingFace model name for SAM2.1.
            min_mask_area: Minimum mask pixel area threshold.
            max_masks: Maximum number of masks to keep.
            device: Device to load the model on (auto-detect if None).
        """
        super().__init__()

        self.model_name = model_name
        self.min_mask_area = min_mask_area
        self.max_masks = max_masks

        # Auto-detect the device when the caller does not pin one.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # Load SAM model and processor.
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForMaskGeneration.from_pretrained(model_name).to(
            self.device
        )
        self.model.eval()

    def forward(self, image: Image.Image) -> list[Image.Image]:
        """Extract object masks and crop the corresponding image regions.

        Args:
            image: Input PIL Image.

        Returns:
            List of cropped object images (one per valid mask); empty when
            no mask passes the area filter.
        """
        valid_masks = self._filter_masks(self._predict_masks(image))
        if not valid_masks:
            return []
        return self._crop_objects(image, valid_masks)

    @torch.no_grad()
    def _predict_masks(self, image: Image.Image) -> torch.Tensor:
        """Run the model on one image and return its post-processed masks."""
        inputs = self.processor(image, return_tensors="pt").to(self.device)
        outputs = self.model(**inputs)

        # Some processors also report the resized input size; forward it only
        # when present instead of failing with a KeyError.
        # NOTE(review): the SAM2 processor appears to take only
        # `original_sizes` - confirm against the installed transformers.
        size_args = [inputs["original_sizes"]]
        if "reshaped_input_sizes" in inputs:
            size_args.append(inputs["reshaped_input_sizes"])

        # post_process_masks returns one entry per image; we pass one image.
        return self.processor.post_process_masks(outputs.pred_masks, *size_args)[0]

    def _filter_masks(self, masks: torch.Tensor) -> list[dict]:
        """Drop small masks and keep the ``max_masks`` largest ones.

        Args:
            masks: Predicted masks, iterated along the first dimension
                (assumed [N, H, W] - TODO confirm against the processor).

        Returns:
            List of dicts with keys 'mask' and 'area', sorted by area in
            descending order.
        """
        candidates = []
        for mask in masks:
            area = mask.sum().item()
            if area >= self.min_mask_area:
                candidates.append({"mask": mask, "area": area})

        candidates.sort(key=lambda entry: entry["area"], reverse=True)
        return candidates[: self.max_masks]

    def _crop_objects(
        self, image: Image.Image, masks: list[dict]
    ) -> list[Image.Image]:
        """Crop a padded bounding box around each mask.

        Args:
            image: Original PIL Image.
            masks: List of mask dictionaries as produced by ``_filter_masks``.

        Returns:
            List of cropped object images; masks with no set pixel are
            skipped.
        """
        h, w = np.asarray(image).shape[:2]
        pad = self._CROP_PADDING

        cropped_objects = []
        for mask_info in masks:
            # Assumes each mask is 2-D [H, W] matching the image size.
            mask = mask_info["mask"].cpu().numpy()

            ys = np.flatnonzero(mask.any(axis=1))
            xs = np.flatnonzero(mask.any(axis=0))
            if ys.size == 0 or xs.size == 0:
                continue

            # Image.crop treats right/lower as exclusive, so step one past
            # the last mask pixel before padding; this keeps the padding
            # symmetric on all four sides.
            left = max(0, int(xs[0]) - pad)
            upper = max(0, int(ys[0]) - pad)
            right = min(w, int(xs[-1]) + 1 + pad)
            lower = min(h, int(ys[-1]) + 1 + pad)

            cropped_objects.append(image.crop((left, upper, right, lower)))

        return cropped_objects

    def extract_masks(self, image: Image.Image) -> list[torch.Tensor]:
        """Extract only the filtered masks without cropping (for debugging).

        Args:
            image: Input PIL Image.

        Returns:
            List of masks that passed the area filter, largest first.
        """
        valid_masks = self._filter_masks(self._predict_masks(image))
        return [entry["mask"] for entry in valid_masks]