Compare commits

...

4 Commits

6 changed files with 766 additions and 1277 deletions

.gitignore vendored

@@ -216,3 +216,14 @@ devenv.local.yaml
 
 # pre-commit
 .pre-commit-config.yaml
+
+# Devenv
+.devenv*
+devenv.local.nix
+devenv.local.yaml
+
+# direnv
+.direnv
+
+# pre-commit
+.pre-commit-config.yaml

devenv.lock

@@ -3,10 +3,10 @@
     "devenv": {
       "locked": {
         "dir": "src/modules",
-        "lastModified": 1764449550,
+        "lastModified": 1764927628,
         "owner": "cachix",
         "repo": "devenv",
-        "rev": "dfb58ac03bed07b93f629df55034bc50394d3971",
+        "rev": "247d7027f91368054fb0eefbd755a73d42b66fee",
         "type": "github"
       },
       "original": {
@@ -19,10 +19,10 @@
     "flake-compat": {
       "flake": false,
       "locked": {
-        "lastModified": 1761588595,
+        "lastModified": 1765121682,
         "owner": "edolstra",
         "repo": "flake-compat",
-        "rev": "f387cd2afec9419c8ee37694406ca490c3f34ee5",
+        "rev": "65f23138d8d09a92e30f1e5c87611b23ef451bf3",
         "type": "github"
       },
       "original": {
@@ -34,10 +34,10 @@
     "flake-compat_2": {
       "flake": false,
       "locked": {
-        "lastModified": 1761588595,
+        "lastModified": 1765121682,
         "owner": "edolstra",
         "repo": "flake-compat",
-        "rev": "f387cd2afec9419c8ee37694406ca490c3f34ee5",
+        "rev": "65f23138d8d09a92e30f1e5c87611b23ef451bf3",
         "type": "github"
       },
       "original": {
@@ -55,10 +55,10 @@
         ]
       },
       "locked": {
-        "lastModified": 1763988335,
+        "lastModified": 1765016596,
         "owner": "cachix",
         "repo": "git-hooks.nix",
-        "rev": "50b9238891e388c9fdc6a5c49e49c42533a1b5ce",
+        "rev": "548fc44fca28a5e81c5d6b846e555e6b9c2a5a3c",
         "type": "github"
       },
       "original": {
@@ -89,10 +89,10 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1761313199,
+        "lastModified": 1764580874,
         "owner": "cachix",
         "repo": "devenv-nixpkgs",
-        "rev": "d1c30452ebecfc55185ae6d1c983c09da0c274ff",
+        "rev": "dcf61356c3ab25f1362b4a4428a6d871e84f1d1d",
         "type": "github"
       },
       "original": {
@@ -108,10 +108,10 @@
         "nixpkgs": "nixpkgs_2"
       },
       "locked": {
-        "lastModified": 1763677049,
+        "lastModified": 1765052656,
         "owner": "cachix",
         "repo": "nixpkgs-python",
-        "rev": "159d63dc49a4b12bf85fe0e83011a8b69ba1bcb0",
+        "rev": "04b27dbad2e004cb237db202f21154eea3c4f89f",
         "type": "github"
       },
       "original": {
@@ -122,10 +122,10 @@
     },
     "nixpkgs_2": {
      "locked": {
-        "lastModified": 1764316264,
+        "lastModified": 1764939437,
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9a7b80b6f82a71ea04270d7ba11b48855681c4b0",
+        "rev": "00d2457e2f608b4be6fe8b470b0a36816324b0ae",
         "type": "github"
       },
       "original": {

devenv.nix

@@ -6,8 +6,8 @@
   ...
 }: {
   packages = with pkgs; [
-    glxinfo
-    vulkan-tools
+    mujoco
+    linuxHeaders
   ];
 
   languages.c.enable = true;
@@ -55,30 +55,33 @@
     );
   };
 
-  env = rec {
-    NIX_LD_LIBRARY_PATH = "$NIX_LD_LIBRARY_PATH:/usr/lib/wsl/lib/";
-    LD_LIBRARY_PATH = NIX_LD_LIBRARY_PATH;
-    GLVND = "${pkgs.libglvnd}";
-    MESA = "${pkgs.mesa}";
-    MESAD = MESA;
-    GALLIUM_DRIVER = "d3d12";
-    MESA_D3D12_DEFAULT_ADAPTER_NAME = "NVIDIA";
-    # LIBGL_ALWAYS_INDIRECT = 0;
-    # __GLX_VENDOR_LIBRARY_NAME = "nvidia";
-    # __NV_PRIME_RENDER_OFFLOAD = 1;
-    # __VK_LAYER_NV_optimus = "NVIDIA_only";
-    # PYOPENGL_PLATFORM = "egl";
-    # EGL_PLATFORM = "wayland";
-    # __EGL_VENDOR_LIBRARY_DIRS = "${MESA}/share/glvnd/egl_vendor.d:${GLVND}/share/glvnd/egl_vendor.d";
-    # LIBGL_DRIVERS_PATH = "${MESA}/lib/dri";
+  env = {
+    CPATH = "${pkgs.linuxHeaders}/include";
+    KERNEL_DIR = "${pkgs.linuxHeaders}/include";
   };
 
+  # Create a .venv symlink pointing at .devenv/state/venv
+  scripts.linkVenv.exec = ''
+    if [ -L ".venv" ] || [ ! -e ".venv" ]; then
+      rm -f ".venv" 2>/dev/null || true
+      ln -s ".devenv/state/venv" ".venv"
+      echo "Created symlink: .venv -> .devenv/state/venv"
+    elif [ -e ".venv" ]; then
+      echo "Warning: .venv exists and is not a symlink. Not replacing it."
+      echo "If you want to replace it with a symlink, please remove it manually first."
+    fi
+  '';
+
   # https://devenv.sh/basics/
   enterShell = ''
-    git --version # Use packages
+    export NIX_LD_LIBRARY_PATH="$NIX_LD_LIBRARY_PATH:${lib.makeLibraryPath [ pkgs.linuxHeaders ]}";
+    export LD_LIBRARY_PATH="$NIX_LD_LIBRARY_PATH";
+    echo "NIX_LD_LIBRARY_PATH set to $NIX_LD_LIBRARY_PATH"
+    echo "LD_LIBRARY_PATH set to $LD_LIBRARY_PATH"
+    echo "CPATH set to $CPATH"
+    echo "KERNEL_DIR set to $KERNEL_DIR"
+    linkVenv
   '';
 }
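
The new scripts.linkVenv guard only touches .venv when it is a symlink or missing, so a real directory a user created by hand is never clobbered. For readers tracing that logic, here is the same check sketched in Python with pathlib; link_venv is a hypothetical helper for illustration, not something this commit adds:

# Sketch of the linkVenv guard: replace .venv only when it is a symlink
# or absent, never a real file or directory someone created by hand.
from pathlib import Path

def link_venv(target: str = ".devenv/state/venv") -> None:
    venv = Path(".venv")
    if venv.is_symlink() or not venv.exists():
        venv.unlink(missing_ok=True)  # drop a stale symlink if present
        venv.symlink_to(target)
        print(f"Created symlink: .venv -> {target}")
    else:
        print("Warning: .venv exists and is not a symlink. Not replacing it.")

if __name__ == "__main__":
    link_venv()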

pyproject.toml

@@ -5,8 +5,15 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "accelerate>=1.12.0",
+    "mujoco>=3.4.0",
+    "pillow>=12.0.0",
+    "robosuite>=1.5.1",
+    "robosuite-models>=1.0.0",
+    "timm>=0.9.10,<1.0.0",
     "torch>=2.9.1",
     "torchvision>=0.24.1",
+    "transformers==4.40.1",
 ]
 
 [dependency-groups]
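
Note that transformers is pinned to exactly 4.40.1 while timm is capped below 1.0.0; since the demo loads the model with trust_remote_code, drifting away from these versions is a common source of breakage. A small startup check can catch a stale environment early; a minimal sketch using only the standard importlib.metadata API:

# Sanity-check the pinned model stack before loading OpenVLA.
from importlib.metadata import version

assert version("transformers") == "4.40.1", (
    f"transformers=={version('transformers')}, expected the 4.40.1 pin"
)
assert int(version("timm").split(".")[0]) < 1, (
    f"timm {version('timm')} is too new; pyproject caps it below 1.0.0"
)
print("Model stack matches the pyproject pins.")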


@@ -1,126 +1,196 @@
-from typing import cast
-
-import genesis as gs
-import numpy as np
-
-########################## init ##########################
-gs.init(backend=gs.gs_backend.gpu)
-
-try:
-    from genesis.engine.entities import RigidEntity
-except ImportError:
-    raise ImportError("genesis.engine.entities.RigidEntity is not available")
-
-########################## create a scene ##########################
-scene = gs.Scene(
-    viewer_options=gs.options.ViewerOptions(
-        camera_pos=(0, -3.5, 2.5),
-        camera_lookat=(0.0, 0.0, 0.5),
-        camera_fov=30,
-        res=(960, 640),
-        max_FPS=60,
-    ),
-    sim_options=gs.options.SimOptions(
-        dt=0.01,
-    ),
-    show_viewer=True,
-)
-
-########################## entities ##########################
-plane = scene.add_entity(
-    gs.morphs.Plane(),
-)
-franka = scene.add_entity(
-    gs.morphs.MJCF(
-        file="xml/franka_emika_panda/panda.xml",
-    ),
-)
-franka = cast(RigidEntity, franka)
-
-########################## build ##########################
-scene.build()
-
-jnt_names = [
-    "joint1",
-    "joint2",
-    "joint3",
-    "joint4",
-    "joint5",
-    "joint6",
-    "joint7",
-    "finger_joint1",
-    "finger_joint2",
-]
-dofs_idx = [franka.get_joint(name).dof_idx_local for name in jnt_names]
-
-############ Optional: set control gains ############
-# set positional gains
-franka.set_dofs_kp(
-    kp=np.array([4500, 4500, 3500, 3500, 2000, 2000, 2000, 100, 100]),
-    dofs_idx_local=dofs_idx,
-)
-# set velocity gains
-franka.set_dofs_kv(
-    kv=np.array([450, 450, 350, 350, 200, 200, 200, 10, 10]),
-    dofs_idx_local=dofs_idx,
-)
-# set force range for safety
-franka.set_dofs_force_range(
-    lower=np.array([-87, -87, -87, -87, -12, -12, -12, -100, -100]),
-    upper=np.array([87, 87, 87, 87, 12, 12, 12, 100, 100]),
-    dofs_idx_local=dofs_idx,
-)
-
-# Hard reset
-for i in range(150):
-    if i < 50:
-        franka.set_dofs_position(np.array([1, 1, 0, 0, 0, 0, 0, 0.04, 0.04]), dofs_idx)
-    elif i < 100:
-        franka.set_dofs_position(
-            np.array([-1, 0.8, 1, -2, 1, 0.5, -0.5, 0.04, 0.04]), dofs_idx
-        )
-    else:
-        franka.set_dofs_position(np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]), dofs_idx)
-    scene.step()
-
-# PD control
-for i in range(1250):
-    if i == 0:
-        franka.control_dofs_position(
-            np.array([1, 1, 0, 0, 0, 0, 0, 0.04, 0.04]),
-            dofs_idx,
-        )
-    elif i == 250:
-        franka.control_dofs_position(
-            np.array([-1, 0.8, 1, -2, 1, 0.5, -0.5, 0.04, 0.04]),
-            dofs_idx,
-        )
-    elif i == 500:
-        franka.control_dofs_position(
-            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]),
-            dofs_idx,
-        )
-    elif i == 750:
-        # control first dof with velocity, and the rest with position
-        franka.control_dofs_position(
-            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])[1:],
-            dofs_idx[1:],
-        )
-        franka.control_dofs_velocity(
-            np.array([1.0, 0, 0, 0, 0, 0, 0, 0, 0])[:1],
-            dofs_idx[:1],
-        )
-    elif i == 1000:
-        franka.control_dofs_force(
-            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]),
-            dofs_idx,
-        )
-    # This is the control force computed based on the given control command
-    # If using force control, it's the same as the given control command
-    print("control force:", franka.get_dofs_control_force(dofs_idx))
-    # This is the actual force experienced by the dof
-    print("internal force:", franka.get_dofs_force(dofs_idx))
-    scene.step()
+"""
+Simple OpenVLA Demo with Robosuite + MuJoCo
+Uses the official OpenVLA model from HuggingFace
+"""
+
+import numpy as np
+import robosuite as suite
+import torch
+from PIL import Image
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
+
+class OpenVLADemo:
+    """OpenVLA Demo with Robosuite using HuggingFace model"""
+
+    # OpenVLA action names (7-DoF)
+    ACTION_NAMES = [
+        "delta_x",
+        "delta_y",
+        "delta_z",
+        "delta_roll",
+        "delta_pitch",
+        "delta_yaw",
+        "gripper",
+    ]
+
+    def __init__(
+        self,
+        env_name: str = "Lift",
+        robot: str = "Panda",
+        instruction: str = "pick up the red cube",
+        model_name: str = "openvla/openvla-7b",
+    ):
+        self.instruction = instruction
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"[INFO] Using device: {self.device}")
+
+        # Load OpenVLA model from HuggingFace
+        print(f"[INFO] Loading OpenVLA model: {model_name}")
+        print("[INFO] This may take a while on first run...")
+        self.processor = AutoProcessor.from_pretrained(
+            model_name, trust_remote_code=True
+        )
+
+        # Load model with appropriate settings
+        self.model = AutoModelForVision2Seq.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        ).to(self.device)
+        self.model.eval()
+        print("[INFO] OpenVLA model loaded successfully!")
+
+        # Create robosuite environment
+        print(f"[INFO] Creating {env_name} environment with {robot} robot...")
+        self.env = suite.make(
+            env_name=env_name,
+            robots=robot,
+            has_renderer=True,
+            has_offscreen_renderer=True,
+            use_camera_obs=True,
+            camera_names="agentview",
+            camera_heights=224,
+            camera_widths=224,
+            control_freq=20,
+            render_camera="frontview",
+        )
+        print(f"[INFO] Environment created! Action dim: {self.env.action_dim}")
+
+    def get_prompt(self) -> str:
+        """Format the prompt for OpenVLA"""
+        return f"In: What action should the robot take to {self.instruction}?\nOut:"
+
+    @torch.no_grad()
+    def get_action(self, obs: dict) -> np.ndarray:
+        """Get action from OpenVLA model"""
+        # Get image from observation and convert to PIL
+        image_array = obs["agentview_image"]
+        # Robosuite returns (H, W, C) uint8 image
+        image = Image.fromarray(image_array)
+
+        # Prepare prompt
+        prompt = self.get_prompt()
+
+        # Process inputs
+        inputs = self.processor(prompt, image).to(self.device, dtype=torch.bfloat16)
+
+        # Predict action using OpenVLA
+        # Note: unnorm_key should match your robot setup
+        # For simulation, we'll use raw normalized actions
+        action = self.model.predict_action(
+            **inputs, do_sample=False, unnorm_key="roboturk"
+        )
+
+        # Action is numpy array of shape (7,)
+        return action
+
+    def print_vla_output(self, step: int, action: np.ndarray, reward: float):
+        """Print detailed VLA output to terminal"""
+        print(f"\n[Step {step:6d}] OpenVLA Output:")
+        print(f'  Instruction: "{self.instruction}"')
+        print("  Actions:")
+        for name, value in zip(self.ACTION_NAMES, action):
+            print(f"    {name:12s}: {value:+.4f}")
+        print(f"  Step Reward: {reward:.4f}")
+
+    def run(self):
+        """Run the demo forever"""
+        print("\n" + "=" * 60)
+        print("OpenVLA Demo Started (Running Forever)")
+        print(f'Instruction: "{self.instruction}"')
+        print("Close the MuJoCo viewer window to quit")
+        print("=" * 60 + "\n")
+
+        episode = 0
+        total_steps = 0
+
+        # Run forever
+        while True:
+            episode += 1
+            print(f"\n{'=' * 60}")
+            print(f"Episode {episode} Started")
+            print("=" * 60)
+
+            obs = self.env.reset()
+            episode_reward = 0.0
+            episode_steps = 0
+
+            while True:
+                # Render environment with MuJoCo viewer
+                self.env.render()
+
+                # Get action from OpenVLA model
+                action = self.get_action(obs)
+
+                # Robosuite expects action of env.action_dim size
+                # OpenVLA outputs 7-DoF, pad or truncate if needed
+                if len(action) < self.env.action_dim:
+                    action = np.concatenate(
+                        [action, np.zeros(self.env.action_dim - len(action))]
+                    )
+                elif len(action) > self.env.action_dim:
+                    action = action[: self.env.action_dim]
+
+                # Step environment
+                obs, reward, done, info = self.env.step(action)
+                episode_reward += reward
+                episode_steps += 1
+                total_steps += 1
+
+                # Print VLA output every 20 steps
+                if episode_steps % 20 == 0:
+                    self.print_vla_output(total_steps, action, reward)
+
+                if done:
+                    print(f"\n[INFO] Episode {episode} done at step {episode_steps}")
+                    print(f"[INFO] Episode Reward: {episode_reward:.3f}")
+                    break
+
+            print(f"\nEpisode {episode} Summary:")
+            print(f"  Steps: {episode_steps}")
+            print(f"  Total Reward: {episode_reward:.3f}")
+            print(f"  Total Steps (all episodes): {total_steps}")
+
+    def close(self):
+        """Clean up"""
+        self.env.close()
+
+
+def main():
+    print("=" * 60)
+    print("OpenVLA Demo with Robosuite + MuJoCo")
+    print("Using OpenVLA-7B from HuggingFace")
+    print("=" * 60)
+
+    demo = OpenVLADemo(
+        env_name="Lift",
+        robot="Panda",
+        instruction="pick up the red cube",
+    )
+
+    try:
+        demo.run()
+    except KeyboardInterrupt:
+        print("\n[INFO] Interrupted by user")
+    finally:
+        demo.close()
+        print("[INFO] Demo closed")
+
+
+if __name__ == "__main__":
+    main()
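
Before launching the full robosuite loop, it can be worth smoke-testing just the model half of the new script. The sketch below mirrors the demo's own loading and predict_action calls on a blank frame; the CUDA device, bfloat16 dtype, and "roboturk" unnorm_key are the demo's assumptions, not general recommendations:

# Single OpenVLA prediction on a dummy frame (no simulator needed).
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

MODEL = "openvla/openvla-7b"  # 7B parameters; needs a large GPU even in bfloat16
processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True
).to("cuda")
model.eval()

# A black 224x224 frame stands in for the robosuite agentview camera.
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
prompt = "In: What action should the robot take to pick up the red cube?\nOut:"

inputs = processor(prompt, image).to("cuda", dtype=torch.bfloat16)
with torch.no_grad():
    action = model.predict_action(**inputs, do_sample=False, unnorm_key="roboturk")
print("7-DoF action:", action)  # [dx, dy, dz, droll, dpitch, dyaw, gripper]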

uv.lock generated

File diff suppressed because it is too large.