Commit debd1f5: Initial commit
Parent: e2ee0f6

10 files changed: +213 -26 lines


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -3,6 +3,9 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+tests/
+data/data/
+
 # C extensions
 *.so
 
@@ -181,4 +184,4 @@ wandb/
 
 # Distributed learning
 hostfile.txt
-.deepspeed_env
+.deepspeed_env

data/hdf5_vla_dataset.py

Lines changed: 18 additions & 15 deletions
@@ -18,7 +18,7 @@ class HDF5VLADataset:
     def __init__(self) -> None:
         # [Modify] The path to the HDF5 dataset directory
         # Each HDF5 file contains one episode
-        HDF5_DIR = "data/datasets/agilex/rdt_data/"
+        HDF5_DIR = "/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/tests/cube_pick_and_place"
         self.DATASET_NAME = "agilex"
 
         self.file_paths = []
@@ -28,7 +28,7 @@ def __init__(self) -> None:
                 self.file_paths.append(file_path)
 
         # Load the config
-        with open('configs/base.yaml', 'r') as file:
+        with open('/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml', 'r') as file:
             config = yaml.safe_load(file)
         self.CHUNK_SIZE = config['common']['action_chunk_size']
         self.IMG_HISORY_SIZE = config['common']['img_history_size']
@@ -133,20 +133,22 @@ def parse_hdf5_file(self, file_path):
             step_id = np.random.randint(first_idx-1, num_steps)
 
             # Load the instruction
-            dir_path = os.path.dirname(file_path)
-            with open(os.path.join(dir_path, 'expanded_instruction_gpt-4-turbo.json'), 'r') as f_instr:
-                instruction_dict = json.load(f_instr)
-            # We have 1/3 prob to use original instruction,
-            # 1/3 to use simplified instruction,
-            # and 1/3 to use expanded instruction.
-            instruction_type = np.random.choice([
-                'instruction', 'simplified_instruction', 'expanded_instruction'])
-            instruction = instruction_dict[instruction_type]
-            if isinstance(instruction, list):
-                instruction = np.random.choice(instruction)
+            # dir_path = os.path.dirname(file_path)
+            # with open(os.path.join(dir_path, 'expanded_instruction_gpt-4-turbo.json'), 'r') as f_instr:
+            #     instruction_dict = json.load(f_instr)
+            # # We have 1/3 prob to use original instruction,
+            # # 1/3 to use simplified instruction,
+            # # and 1/3 to use expanded instruction.
+            # instruction_type = np.random.choice([
+            #     'instruction', 'simplified_instruction', 'expanded_instruction'])
+            # instruction = instruction_dict[instruction_type]
+            # if isinstance(instruction, list):
+            #     instruction = np.random.choice(instruction)
             # You can also use precomputed language embeddings (recommended)
-            # instruction = "path/to/lang_embed.pt"
-
+            import torch
+            instruction = "/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/outs/cube_pick_up_and place._embedding.pt"
+            # instruction = torch.load(instruction_path)['embeddings']
+            # instruction = os.path.join(os.path.dirname(file_path), "lang_embed.pt")
             # Assemble the meta
             meta = {
                 "dataset_name": self.DATASET_NAME,
@@ -204,6 +206,7 @@ def fill_in_state(values):
             # Parse the images
             def parse_img(key):
                 imgs = []
+                return np.array(f['observations']['images'][key]).copy()
                 for i in range(max(step_id-self.IMG_HISORY_SIZE+1, 0), step_id+1):
                     img = f['observations']['images'][key][i]
                     imgs.append(cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR))
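Note that the hard-coded path above ends in "cube_pick_up_and place._embedding.pt", which does not match the "{TASK_NAME}.pt" name that encode_lang.py (added below) writes, so the file was presumably renamed by hand. Assuming it still holds that script's dictionary format, the commented-out torch.load line would recover the tensor as in this sketch (illustrative, not code from the commit):

    import torch

    # Assumption: the .pt file holds encode_lang.py's dict, i.e.
    # {"name": ..., "instruction": ..., "embeddings": tensor of shape (1, L, D)}
    payload = torch.load("outs/cube_pick_up_and place._embedding.pt",
                         map_location="cpu")
    instruction = payload["embeddings"]  # mirrors the commented-out line above
    print(payload["name"], instruction.shape)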

encode_lang.py

Lines changed: 59 additions & 0 deletions
New file:

import os

import torch
import yaml

from models.multimodal_encoder.t5_encoder import T5Embedder


GPU = 0
MODEL_PATH = "google/t5-v1_1-xxl"
CONFIG_PATH = "configs/base.yaml"
SAVE_DIR = "outs/"

# Modify this to your task name and instruction
TASK_NAME = "cube_pick_up_and place"
INSTRUCTION = "Pick up the green cube on the left and put it into the yellow plate on the right."

# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = None  # Specify your offload directory here, ensuring the directory exists.


def main():
    with open(CONFIG_PATH, "r") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")

    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    tokens = tokenizer(
        INSTRUCTION, return_tensors="pt",
        padding="longest",
        truncation=True
    )["input_ids"].to(device)

    tokens = tokens.view(1, -1)
    with torch.no_grad():
        pred = text_encoder(tokens).last_hidden_state.detach().cpu()

    save_path = os.path.join(SAVE_DIR, f"{TASK_NAME}.pt")
    # We save the embeddings in a dictionary format
    torch.save({
            "name": TASK_NAME,
            "instruction": INSTRUCTION,
            "embeddings": pred
        }, save_path
    )

    print(f'"{INSTRUCTION}" from "{TASK_NAME}" is encoded by "{MODEL_PATH}" into shape {pred.shape} and saved to "{save_path}"')


if __name__ == "__main__":
    main()
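Because CONFIG_PATH ("configs/base.yaml") and SAVE_DIR ("outs/") are relative, the script presumably has to be run from the repository root, and outs/ must already exist or torch.save will raise FileNotFoundError. With less than 24 GB of VRAM, set OFFLOAD_DIR to an existing directory so T5Embedder can offload the T5-XXL weights.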

eval.sh

Lines changed: 59 additions & 0 deletions
New file:

export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=bond0
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0

export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
export OUTPUT_DIR="./checkpoints/rdt-finetune-1b"
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
export CUTLASS_PATH="/path/to/cutlass"

export WANDB_PROJECT="robotics_diffusion_transformer"
export CUDA_VISIBLE_DEVICES=5

if [ ! -d "$OUTPUT_DIR" ]; then
    mkdir "$OUTPUT_DIR"
    echo "Folder '$OUTPUT_DIR' created"
else
    echo "Folder '$OUTPUT_DIR' already exists"
fi

# To run on a single node/machine:
# accelerate launch main.py \
#     --deepspeed="./configs/zero2.json" \
#     ...

# --master_port=2000 fixes "RuntimeError: address already in use":
deepspeed --master_port=2000 \
    --hostfile=hostfile.txt main.py \
    --deepspeed="./configs/zero2.json" \
    --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \
    --pretrained_text_encoder_name_or_path=$TEXT_ENCODER_NAME \
    --pretrained_vision_encoder_name_or_path=$VISION_ENCODER_NAME \
    --output_dir=$OUTPUT_DIR \
    --train_batch_size=32 \
    --sample_batch_size=8 \
    --max_train_steps=10 \
    --checkpointing_period=1000 \
    --sample_period=1 \
    --checkpoints_total_limit=40 \
    --lr_scheduler="constant" \
    --learning_rate=1e-4 \
    --dataloader_num_workers=8 \
    --image_aug \
    --dataset_type="finetune" \
    --state_noise_snr=40 \
    --load_from_hdf5 \
    --report_to=wandb \
    --precomp_lang_embed

# Use this to resume training from some previous checkpoint:
# --resume_from_checkpoint="checkpoint-36000" \
# Use this to load saved language instruction embeddings
# instead of computing them during training:
# --precomp_lang_embed \
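The --master_port comment above exists because a stale launcher process can leave the default rendezvous port bound; any free port works. A hedged standard-library helper (not part of the repo) to find one before launching:

    import socket

    # Bind to port 0 so the OS assigns an unused TCP port,
    # then pass the printed number to deepspeed via --master_port.
    with socket.socket() as s:
        s.bind(("", 0))
        print(s.getsockname()[1])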

finetune.sh

File mode changed: 100644 -> 100755 (made executable)
Lines changed: 11 additions & 8 deletions
@@ -5,13 +5,14 @@ export NCCL_DEBUG=INFO
 export NCCL_NVLS_ENABLE=0
 
 export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
-export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
-export OUTPUT_DIR="./checkpoints/rdt-finetune-1b"
+export VISION_ENCODER_NAME="/home/1ms.ai/hf/hf_cache/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/"
+export OUTPUT_DIR="./checkpoints/rdt-finetune-1b_10-episode_xavier"
 export CFLAGS="-I/usr/include"
 export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
 export CUTLASS_PATH="/path/to/cutlass"
 
 export WANDB_PROJECT="robotics_diffusion_transformer"
+export CUDA_VISIBLE_DEVICES=0,1,2,3
 
 if [ ! -d "$OUTPUT_DIR" ]; then
     mkdir "$OUTPUT_DIR"
@@ -27,14 +28,14 @@ fi
 
 deepspeed --hostfile=hostfile.txt main.py \
     --deepspeed="./configs/zero2.json" \
-    --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \
+    --pretrained_model_name_or_path="/home/1ms.ai/hf/hf_cache/models--robotics-diffusion-transformer--rdt-1b/snapshots/eb09036cc64ca4945051acbd1bd581d30a1d7711/" \
     --pretrained_text_encoder_name_or_path=$TEXT_ENCODER_NAME \
     --pretrained_vision_encoder_name_or_path=$VISION_ENCODER_NAME \
     --output_dir=$OUTPUT_DIR \
-    --train_batch_size=32 \
-    --sample_batch_size=64 \
-    --max_train_steps=200000 \
-    --checkpointing_period=1000 \
+    --train_batch_size=8 \
+    --sample_batch_size=8 \
+    --max_train_steps=50 \
+    --checkpointing_period=25 \
     --sample_period=500 \
     --checkpoints_total_limit=40 \
     --lr_scheduler="constant" \
@@ -45,7 +46,9 @@ deepspeed --hostfile=hostfile.txt main.py \
     --dataset_type="finetune" \
     --state_noise_snr=40 \
     --load_from_hdf5 \
-    --report_to=wandb
+    --report_to=wandb \
+    --precomp_lang_embed
+
 
 # Use this to resume training from some previous checkpoint
 # --resume_from_checkpoint="checkpoint-36000" \
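Since the run now passes --precomp_lang_embed, training presumably expects the lang_embed.pt files that preprocessing_lang_emb.py (below) writes into each task directory. A hedged pre-flight check (the dataset path is the one hard-coded in that script; the per-directory layout is an assumption):

    import os

    dataset_dir = "data/data/dataset/agilex/rdt_data"  # as in preprocessing_lang_emb.py
    missing = [
        d for d in sorted(os.listdir(dataset_dir))
        if os.path.isdir(os.path.join(dataset_dir, d))
        and not os.path.exists(os.path.join(dataset_dir, d, "lang_embed.pt"))
    ]
    print("task dirs missing lang_embed.pt:", missing or "none")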

models/multimodal_encoder/t5_encoder.py

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def __init__(
         self.use_text_preprocessing = use_text_preprocessing
         self.hf_token = hf_token
 
-        assert from_pretrained in self.available_models
+        # assert from_pretrained in self.available_models
         self.tokenizer = AutoTokenizer.from_pretrained(
             from_pretrained,
             model_max_length=model_max_length,
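Disabling this assert presumably lets from_pretrained be a local snapshot directory (as finetune.sh now passes for SigLIP and RDT-1B) rather than one of the whitelisted hub IDs; AutoTokenizer.from_pretrained accepts a local path directly.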

models/rdt/blocks.py

Lines changed: 2 additions & 0 deletions
@@ -102,6 +102,8 @@ def __init__(
     def forward(self, x: torch.Tensor, c: torch.Tensor,
                 mask: torch.Tensor | None = None) -> torch.Tensor:
         B, N, C = x.shape
+        # print(c.shape)  # TODO: test
+        # _, L, _ = c.shape  # original
         _, L, _ = c.shape
         q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
         kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
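The debug lines guard the unpack _, L, _ = c.shape, which fails when a conditioning tensor arrives 2-D, as the bare (seq_len, hidden) tensor in lang_embed.pt does. A minimal illustration of the generic remedy (a sketch, not code from this commit):

    import torch

    c = torch.randn(32, 4096)  # stand-in for a (seq_len, hidden) lang_embed.pt tensor
    if c.dim() == 2:
        c = c.unsqueeze(0)     # add the batch dim -> (1, seq_len, hidden)
    _, L, _ = c.shape          # now unpacks cleanly; L == 32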

models/rdt/model.py

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
         # Add multimodal position embeddings
         x = x + self.x_pos_embed
         # Note the lang is of variable length
+        # assert len(lang_c.shape) == 3  # wd, test, TODO
         lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
         img_c = img_c + self.img_cond_pos_embed

preprocessing_lang_emb.py

Lines changed: 57 additions & 0 deletions
New file:

import os

import torch
import yaml

from models.multimodal_encoder.t5_encoder import T5Embedder


GPU = 4
MODEL_PATH = "google/t5-v1_1-xxl"
CONFIG_PATH = "/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml"

dataset_dir = "/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/data/data/dataset/agilex/rdt_data"

# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = None  # Specify your offload directory here, ensuring the directory exists.


def main():
    with open(CONFIG_PATH, "r") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")
    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    for task_name in os.listdir(dataset_dir):
        task_dir = os.path.join(dataset_dir, task_name)
        embedding_name = os.path.join(task_dir, 'lang_embed.pt')
        if os.path.exists(embedding_name):
            continue
        task_name = " ".join(task_name.split("_"))
        tokens = tokenizer(
            task_name, return_tensors="pt",
            padding="longest",
            truncation=True
        )["input_ids"].to(device)

        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = text_encoder(tokens).last_hidden_state.detach().cpu()[0]
        # Save the embedding tensor directly
        # (not the dictionary format used by encode_lang.py)
        torch.save(
            pred,
            embedding_name
        )

        print(f'"{task_name}" into shape {pred.shape} and saved to "{embedding_name}"')


if __name__ == "__main__":
    main()
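Note the two saving conventions in this commit: encode_lang.py stores a dict with an "embeddings" key, while this script stores the bare (seq_len, hidden) tensor. A hypothetical helper that normalizes both (the branch logic is illustrative, not from the repo):

    import torch

    def load_lang_embed(path):
        # Accept either format produced by this commit's two encoder scripts.
        obj = torch.load(path, map_location="cpu")
        if isinstance(obj, dict):        # encode_lang.py: {"embeddings": (1, L, D)}
            return obj["embeddings"][0]  # strip batch dim -> (L, D)
        return obj                       # preprocessing_lang_emb.py: already (L, D)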

scripts/encode_lang_batch.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 MODEL_PATH = "google/t5-v1_1-xxl"
 CONFIG_PATH = "configs/base.yaml"
 # Modify the TARGET_DIR to your dataset path
-TARGET_DIR = "data/datasets/agilex/tfrecords/"
+TARGET_DIR = "/home/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/data/data/dataset/agilex/rdt_data/"
 
 
 def main():
