From 16dd9bb70ea56b1402c723766ec729e7b694441f Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 20 Feb 2025 02:29:28 +0530 Subject: [PATCH 01/18] adding support for arkitscenes --- DATA.md | 34 ++ README.md | 1 + TRAIN.md | 2 +- configs/evaluation/eval_instance.yaml | 12 +- configs/evaluation/eval_scene.yaml | 12 +- configs/preprocess/process_1d.yaml | 8 + configs/preprocess/process_2d.yaml | 8 + configs/preprocess/process_3d.yaml | 8 + configs/preprocess/process_multimodal.yaml | 9 + configs/train/train_instance_baseline.yaml | 11 + configs/train/train_instance_crossover.yaml | 11 + configs/train/train_scene_crossover.yaml | 11 + data/datasets/__init__.py | 3 +- data/datasets/arkit.py | 41 +++ prepare_data/README.md | 50 ++- preprocess/feat1D/__init__.py | 3 +- preprocess/feat1D/arkit.py | 107 ++++++ preprocess/feat2D/__init__.py | 3 +- preprocess/feat2D/arkit.py | 287 ++++++++++++++++ preprocess/feat3D/__init__.py | 3 +- preprocess/feat3D/arkit.py | 98 ++++++ preprocess/multimodal_preprocess.py | 4 +- scripts/preprocess/process_arkit.sh | 9 + single_inference/datasets/__init__.py | 3 +- single_inference/datasets/arkit.py | 126 +++++++ single_inference/scene_inference.py | 2 + util/arkit.py | 347 ++++++++++++++++++++ 27 files changed, 1193 insertions(+), 20 deletions(-) create mode 100644 data/datasets/arkit.py create mode 100644 preprocess/feat1D/arkit.py create mode 100644 preprocess/feat2D/arkit.py create mode 100644 preprocess/feat3D/arkit.py create mode 100644 scripts/preprocess/process_arkit.sh create mode 100644 single_inference/datasets/arkit.py create mode 100644 util/arkit.py diff --git a/DATA.md b/DATA.md index 643b538..9377fd0 100644 --- a/DATA.md +++ b/DATA.md @@ -10,6 +10,7 @@ We list the available data used in the current version of CrossOver in the table | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -110,4 +111,37 @@ Scan3R/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... +``` + +### ARKitScenes + +#### Running preprocessing scripts +Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_arkit.sh +``` + +Our script for ARKitScenes dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
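+
+For quick inspection, the projection file can be loaded directly with PyTorch — a minimal sketch (not part of the release scripts), assuming the directory layout below and that the file stores a plain dict mapping each frame index to a per-pixel object-ID map, which is how the 2D processor in this patch writes it (the processor treats `0` as unlabelled):
+
+```python
+import torch
+
+# hypothetical scan id taken from the layout below
+proj = torch.load('ARKitScenes/scans/40753679/gt-projection-seg.pt')
+frame_idx = sorted(proj.keys())[0]   # keys are frame indices (timestamp strings)
+obj_id_map = proj[frame_idx]         # (H, W) array of per-pixel instance IDs
+print(frame_idx, obj_id_map.shape)
+```
+
+The other `data*.pt` files listed below are also written with `torch.save` and can be loaded the same way.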
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... ``` \ No newline at end of file diff --git a/README.md b/README.md index 1cb1030..b39d33a 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. diff --git a/TRAIN.md b/TRAIN.md index fd56dcd..622d5c6 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan & ARKitScenes or any combination of the same. 
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..5515123 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -43,13 +43,23 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..eab4202 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -43,13 +43,23 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..11a9df7 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -25,6 +25,14 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..d02d017 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -27,6 +27,14 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..e9bc9c6 100644 --- 
a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -24,6 +24,14 @@ data: processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..33b3def 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -28,6 +28,15 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..02e4324 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..6bfdce4 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..31ae435 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 
9a1b744..8c18552 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..919d73d 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,7 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes ## Prerequisites @@ -16,20 +17,14 @@ Before you begin, simply activate the `crossover` conda environment. #### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). - - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0). +- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). -- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. 
- -#### Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). +- **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). -- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/). -- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset). +- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. -### Prepare The Data -Exact instructions for data setup + preparation below: +### Download Referral and CAD annotations +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. #### ScanNet 1. Run the following to extract ScanNet data @@ -107,3 +102,36 @@ Scan3R/ └── sceneverse └── ssg_ref_rel2_template.json ``` + +#### ARKitScenes +1. Download `files/` under `processed_data/meta_data/ARKitScenes/` from GDrive and place under `PATH_TO_ARKITSCENES/`. +2. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract MultiScan data + + ```bash +cd ARKitScenes +mv 3dod/Training/* scans +mv 3dod/Validation/* scans +``` + +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scans/ +│ ├── 40753679/ +│ │ ├── 40753679_frames/ +│ │ │ ├── lowres_depth/ (folder containing depth images) +│ │ │ ├── lowres_wide/ (folder containing rgb images) +│ │ │ ├── lowres_wide_intrinsics/ (folder containing frame wise camera intrinsics) +│ │ │ ├── lowres_wide.traj (camera trajectory) +│ │ ├── 40753679_3dod_annotation.json +│ │ ├── 40753679_3dod_mesh.ply +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── val_scans.txt + ├── metadata.csv + ├── 3dod_train_val_splits.csv + └── sceneverse + └── ssg_ref_rel2_template.json +``` \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..0e2873d --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,107 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + 
load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import 
* \ No newline at end of file diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py new file mode 100644 index 0000000..f0d8456 --- /dev/null +++ b/preprocess/feat2D/arkit.py @@ -0,0 +1,287 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util + + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + if self.split == 'val': + self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + 
scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # Extract Scene Image Features + scene_images_pt = [] + scene_image_embeddings = [] + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + + for frame_index in frame_idxs: + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + + scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + scene_images_pt.append(image_pt) + + scene_image_embeddings = np.concatenate(scene_image_embeddings) + 
data2D = {} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs} + torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + # return pose_data, None, None, sampled_frame_idxs + + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in 
object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # load image + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..9da2d4e --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,98 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + 
"objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..34f2898 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,7 +8,7 @@ import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -33,6 +33,8 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..ecb457e --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml data.sources=['ARKitScenes'] 
hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# # Multi-modal dumping +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py index 9a1b744..8c18552 100644 --- a/single_inference/datasets/__init__.py +++ b/single_inference/datasets/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py new file mode 100644 index 0000000..6434bde --- /dev/null +++ b/single_inference/datasets/arkit.py @@ -0,0 +1,126 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d +import pandas as pd +from common import load_utils +from util import arkit +from util import image as image_util + +class ARKitScenesInferDataset(Dataset): + def __init__(self, data_dir,voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scans') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.metadata = pd.read_csv(osp.join(self.files_dir,'metadata.csv')) + + + def extract_images(self, scan_id, color_path): + pose_data = arkit.load_poses(self.scans_dir, scan_id, skip=self.frame_skip) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if 
image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id))) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..65465c2 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,8 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..c4e7593 --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,347 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = 
np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. + The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. 
+ """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, 
principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + # if len(bbox_list) == 0: + # return + + vertices_structured['objectId'] = vertex_instance + + # align_angle = calc_align_matrix(bbox_list) + + # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + # center_points = np.mean(vertices_aligned, axis=0) + # center_points[2] = np.min(vertices_aligned[:, 2]) + # vertices_aligned = vertices_aligned - center_points + + # vertices_structured['x'] = vertices_aligned[:, 0] + # vertices_structured['y'] = vertices_aligned[:, 1] + # vertices_structured['z'] = vertices_aligned[:, 2] + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, 
center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres From c9049965fa38062315d5f82dfcfc76c627d2fd19 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Fri, 28 Mar 2025 23:39:08 +0530 Subject: [PATCH 02/18] removing image rotations --- preprocess/feat2D/arkit.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index f0d8456..baec4ad 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -152,15 +152,10 @@ def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: # Extract Scene Image Features scene_images_pt = [] scene_image_embeddings = [] - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] - + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] for frame_index in frame_idxs: image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) image_pt = self.model.base_tf(image) @@ -186,7 +181,7 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr pose_data = np.array(pose_data) sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + # 
sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] # Extract Scene Image Features scene_images_pt = [] @@ -194,10 +189,6 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr frame_index = frame_idxs[idx] image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) image_pt = self.model.base_tf(image) scene_images_pt.append(image_pt) @@ -263,14 +254,6 @@ def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: object_anno_2d = object_anno_2d.transpose(1, 0) object_anno_2d = np.flip(object_anno_2d, 1) - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] - - # load image - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) - object_mask = object_anno_2d == object_id images_crops = [] From 95efd602b1a6e08e85a3e6387cbb69841f8b1666 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 3 Apr 2025 23:18:22 +0530 Subject: [PATCH 03/18] readme fix --- prepare_data/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 919d73d..279719f 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -104,18 +104,18 @@ Scan3R/ ``` #### ARKitScenes -1. Download `files/` under `processed_data/meta_data/ARKitScenes/` from GDrive and place under `PATH_TO_ARKITSCENES/`. -2. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract MultiScan data +1. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract the data ```bash cd ARKitScenes mv 3dod/Training/* scans mv 3dod/Validation/* scans ``` +2. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. Once completed, the data structure would look like the following: ``` -MultiScan/ +ARKitScenes/ ├── scans/ │ ├── 40753679/ │ │ ├── 40753679_frames/ From beb066028b6a8a5c032dee5d342bd569e7c9acc6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Sun, 6 Apr 2025 16:25:06 +0530 Subject: [PATCH 04/18] updated installation instructions --- prepare_data/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 279719f..0a46691 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -104,14 +104,22 @@ Scan3R/ ``` #### ARKitScenes -1. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract the data +1. Download ARKitScenes 3dod data using the following command: + +```bash +python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES +``` +The files mentioned in the above command - ```download_data.py``` and ```3dod_train_val_splits.csv``` can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data. + +2. Once the data is downloaded, run the following to organize it as per our requirements. ```bash cd ARKitScenes mv 3dod/Training/* scans mv 3dod/Validation/* scans ``` -2. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. 
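+A quick, optional sanity check (an illustrative sketch, not part of the repository; `PATH_TO_ARKITSCENES` is the download directory used above) that every extracted scan folder contains the `{scan_id}_3dod_mesh.ply` file the preprocessing code later reads:
+
+```python
+import os
+import os.path as osp
+
+scans_dir = osp.join("PATH_TO_ARKITSCENES", "scans")  # adjust to your download directory
+for scan_id in sorted(os.listdir(scans_dir)):
+    mesh = osp.join(scans_dir, scan_id, f"{scan_id}_3dod_mesh.ply")
+    if not osp.isfile(mesh):
+        print(f"missing 3DOD mesh for scan {scan_id}")
+```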
+ +3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. Once completed, the data structure would look like the following: ``` From 417fcc46d24b06e5d7b46c32652118c1eaf21522 Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Sun, 6 Apr 2025 19:47:52 -0700 Subject: [PATCH 05/18] Small config changes --- configs/preprocess/process_1d.yaml | 5 +---- configs/preprocess/process_2d.yaml | 4 ++-- configs/preprocess/process_3d.yaml | 2 +- configs/preprocess/process_multimodal.yaml | 2 +- configs/train/train_instance_crossover.yaml | 4 ++-- configs/train/train_scene_crossover.yaml | 4 ++-- preprocess/feat3D/arkit.py | 3 +-- scripts/preprocess/process_arkit.sh | 10 +++++----- 8 files changed, 15 insertions(+), 19 deletions(-) diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index 11a9df7..42ce6ef 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -26,15 +26,12 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor skip_frames : 1 - - Shapenet: - base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ modality_info: 1D : diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index d02d017..85e9d82 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -28,7 +28,7 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor @@ -68,4 +68,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index e9bc9c6..9971666 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -25,7 +25,7 @@ data: label_filename : labels.instances.align.annotated.v2.ply ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 33b3def..f8910c3 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -29,7 +29,7 @@ data: avail_modalities : ['point', 'rgb', 'referral'] ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index 6bfdce4..e4eed4b 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -59,8 +59,8 @@ task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 
'referral'] - train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, ARKitScenes] + val : [Scannet, Scan3R, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index 31ae435..43ef415 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -60,8 +60,8 @@ task: UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] + train : [Scannet, Scan3R, ARKitScenes] + val : [Scannet, Scan3R, ARKitScenes] object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth trainer: UnifiedTrainer diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index 9da2d4e..e265d78 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -94,5 +94,4 @@ def compute3DFeaturesEachScan(self, scan_id): load_utils.ensure_dir(scene_out_dir) torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) - + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh index ecb457e..5ff7fd5 100644 --- a/scripts/preprocess/process_arkit.sh +++ b/scripts/preprocess/process_arkit.sh @@ -1,9 +1,9 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -# # Multi-modal dumping -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# Multi-modal dumping +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null From 1ef819c044509320a09eb19cddfb1ba5a95c83a5 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Wed, 19 Feb 2025 14:12:25 +0530 Subject: [PATCH 06/18] adding support for multiscan --- DATA.md | 35 +- README.md | 4 +- TRAIN.md | 2 +- configs/evaluation/eval_instance.yaml | 14 + configs/evaluation/eval_scene.yaml | 10 + configs/preprocess/process_1d.yaml | 10 + configs/preprocess/process_2d.yaml | 6 + configs/preprocess/process_3d.yaml | 6 + configs/preprocess/process_multimodal.yaml | 9 + configs/train/train_instance_baseline.yaml | 10 + configs/train/train_instance_crossover.yaml | 10 + configs/train/train_scene_crossover.yaml | 10 + data/datasets/__init__.py | 3 +- data/datasets/multiscan.py | 42 ++ prepare_data/README.md | 44 +- .../multiscan/preprocess_2d_multiscan.py | 94 +++ preprocess/feat1D/__init__.py | 3 +- preprocess/feat1D/multiscan.py | 123 ++++ preprocess/feat2D/__init__.py | 3 +- preprocess/feat2D/multiscan.py | 240 +++++++ preprocess/feat3D/__init__.py | 3 +- preprocess/feat3D/multiscan.py | 94 +++ preprocess/multimodal_preprocess.py | 6 + scripts/preprocess/process_multiscan.sh | 9 + single_inference/datasets/__init__.py | 6 +- single_inference/datasets/multiscan.py | 120 ++++ single_inference/scene_inference.py | 2 + util/multiscan.py | 670 ++++++++++++++++++ 28 files changed, 1579 insertions(+), 9 deletions(-) create mode 100644 data/datasets/multiscan.py create mode 100644 prepare_data/multiscan/preprocess_2d_multiscan.py create mode 100644 preprocess/feat1D/multiscan.py create mode 100644 preprocess/feat2D/multiscan.py create mode 100644 preprocess/feat3D/multiscan.py create mode 100644 scripts/preprocess/process_multiscan.sh create mode 100644 single_inference/datasets/multiscan.py create mode 100644 util/multiscan.py diff --git a/DATA.md b/DATA.md index 9377fd0..92a22dd 100644 --- a/DATA.md +++ b/DATA.md @@ -11,6 +11,7 @@ We list the available data used in the current version of CrossOver in the table | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | | ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -112,6 +113,38 @@ Scan3R/ | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... ``` +### MultiScan + +#### Running preprocessing scripts +Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_multiscan.sh +``` + +Our script for MultiScan dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
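+As a rough illustration (a sketch only, assuming the `MultiScan/scans/scene_00000_00` layout shown below), each entry in `gt-projection-seg.pt` maps a frame index to an `(H, W)` array of projected instance IDs, where `0` is treated as unlabelled background by the preprocessing code:
+
+```python
+import os.path as osp
+import numpy as np
+import torch
+
+proj = torch.load(osp.join("MultiScan", "scans", "scene_00000_00", "gt-projection-seg.pt"))
+for frame_idx, obj_id_map in proj.items():
+    ids, counts = np.unique(obj_id_map, return_counts=True)
+    print(frame_idx, dict(zip(ids.tolist(), counts.tolist())))
+    break  # inspect the first frame only
+```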
+ +Post running preprocessing, the data structure should look like the following: + +``` +MultiScan/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_00/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` ### ARKitScenes @@ -144,4 +177,4 @@ ARKitScenes/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... -``` \ No newline at end of file +``` diff --git a/README.md b/README.md index b39d33a..c133ec5 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,8 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | | ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -135,7 +137,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). +- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index 622d5c6..5520b7d 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan & ARKitScenes or any combination of the same. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. 
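+For example, a minimal sketch (assuming the config is edited programmatically with `omegaconf`, which the Hydra-style configs here build on; file paths are illustrative) of narrowing the dataset combination before launching training:
+
+```python
+from omegaconf import OmegaConf
+
+cfg = OmegaConf.load("configs/train/train_scene_crossover.yaml")
+print(cfg.task.UnifiedTrain.train)  # e.g. [Scannet, Scan3R, ARKitScenes]
+
+# Train/validate only on the datasets you have preprocessed.
+cfg.task.UnifiedTrain.train = ["Scannet", "Scan3R"]
+cfg.task.UnifiedTrain.val = ["Scannet", "Scan3R"]
+OmegaConf.save(cfg, "configs/train/train_scene_crossover.yaml")  # write the adjusted list back
+```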
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index 5515123..1c8518c 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -53,13 +53,27 @@ data : max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] +<<<<<<< HEAD ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth +======= + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth +>>>>>>> f86c782 (adding support for multiscan) inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index eab4202..381153e 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -49,6 +49,12 @@ data : processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 @@ -59,7 +65,11 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] +<<<<<<< HEAD ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth +======= + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth +>>>>>>> f86c782 (adding support for multiscan) inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index 42ce6ef..baedd3a 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -32,6 +32,16 @@ data: processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Shapenet: + base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ modality_info: 1D : diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 85e9d82..1cd64dc 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -33,6 +33,12 @@ data: processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor 
skip_frames : 1 modality_info: diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 9971666..5602ed8 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -30,6 +30,12 @@ data: processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor skip_frames : 1 modality_info: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index f8910c3..54e3cd1 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -36,6 +36,15 @@ data: processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] modality_info: 1D : diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 02e4324..a97cb22 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : ObjectLevelGrounding diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index e4eed4b..365f247 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : SceneLevelGrounding diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index 43ef415..aea7152 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 
'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : UnifiedTrain diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 8c18552..9c7b829 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index 0a46691..0246b5c 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -6,6 +6,7 @@ This document provides instructions for pre-processing different datasets, inclu - ScanNet - 3RScan - ARKitScenes +- MultiScan ## Prerequisites @@ -19,12 +20,15 @@ Before you begin, simply activate the `crossover` conda environment. - **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). +- **MultiScan**: Download MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan). + - **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). - **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. ### Download Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. + #### ScanNet 1. 
Run the following to extract ScanNet data @@ -142,4 +146,42 @@ ARKitScenes/ ├── 3dod_train_val_splits.csv └── sceneverse └── ssg_ref_rel2_template.json +``` + +#### MultiScan +1. Download `files/` under `processed_data/meta_data/MultiScan/` from GDrive and place under `PATH_TO_MULTISCAN/`. +2. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data + + ```bash +cd MultiScan/scenes +unzip '*.zip' +rm -rf '*.zip' +``` +3. To generate sequence of RGB images and corresponding camera poses from the ```.mp4``` file, run the follwing +```bash +cd prepare_data/multiscan +python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval} +``` +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scenes/ +│ ├── scene_00000_00/ +│ │ ├── sequence/ (folder containing rgb images at specified frame interval) +| | ├── frame_ids.txt +│ │ ├── scene_00000_00.annotations.json +│ │ ├── scene_00000_00.jsonl +│ │ ├── scene_00000_00.confidence.zlib +│ │ ├── scene_00000_00.mp4 +│ │ ├── poses.jsonl +│ │ ├── scene_00000_00.ply +│ │ ├── scene_00000_00.align.json +│ │ ├── scene_00000_00.json +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── test_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json ``` \ No newline at end of file diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = 
os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..58b9ff9 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,123 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + def extractTextFeats(self, texts, return_text = False): + text_feats = [] + + for text in texts: + encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) + if encoded_text['input_ids'].shape[1] > 512: + continue + + with torch.no_grad(): + encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, + return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) + + text_feats.append({'text' : text, 'feat' : encoded_text}) + + if len(text_feats) == 0: + return None + + if return_text: + return text_feats + + text_feats = 
[text_feat['feat'] for text_feat in text_feats] + text_feats = np.concatenate(text_feats) + return text_feats + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..d95239e --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,240 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R + +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir 
= config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=2) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=5) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=10) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=15) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=20) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = 
self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs 
= sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..68ba025 --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,94 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, 
f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + \ No newline at end of file diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 34f2898..70adff4 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,7 +8,11 @@ import h5py from common import load_utils from common.constants import ModalityType +<<<<<<< HEAD from util import scan3r, scannet, arkit +======= +from util import scan3r, scannet, multiscan +>>>>>>> f86c782 (adding support for multiscan) from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -35,6 +39,8 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'ARKitScenes': self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..c08bf84 --- /dev/null +++ 
b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py index 8c18552..d7126ea 100644 --- a/single_inference/datasets/__init__.py +++ b/single_inference/datasets/__init__.py @@ -1,3 +1,7 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +<<<<<<< HEAD +from .arkit import * +======= +from .multiscan import * +>>>>>>> f86c782 (adding support for multiscan) diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + image = 
Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 65465c2..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -28,6 +28,8 @@ def run_inference(args, scan_id=None): dataset = datasets.Scan3RInferDataset(args.data_dir) elif args.dataset == 'ARKitScenes': dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/util/multiscan.py b/util/multiscan.py new file mode 100644 index 0000000..d570973 --- /dev/null +++ b/util/multiscan.py @@ -0,0 +1,670 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os + +MULTISCAN_SCANNET = { + "wall": "wall", + "door": "door", + "slippers": "shoe", + "mop": "broom", + "rug": "rug", + "floor": "floor", + "basin": "sink", + "basin_stand": "sink", + "bucket": "bucket", + "shower": "shower", + "water_tank": "container", + "beam": "wood beam", + "pillar": "pillar", + "ceiling": "ceiling", + "sink": "sink", + "toilet": "toilet", + "cabinet": "cabinet", + "remove": "object", + "towel": "towel", + 
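+    # Each raw MultiScan label in this mapping is translated to a ScanNet `raw_category` name,
+    # which MultiScan3DProcessor.load_objects_for_scan then converts to an NYU40 id via the
+    # scannetv2-labels.combined.tsv label map (see preprocess/feat3D/multiscan.py).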
"pillow": "pillow", + "sofa": "sofa", + "footstool": "footstool", + "picture": "picture", + "window": "window", + "heater": "heater", + "mirror": "mirror", + "pipe": "pipe", + "scarf": "cloth", + "ceiling_light": "ceiling light", + "chair": "chair", + "table": "table", + "vent": "vent", + "bag": "bag", + "wall_cabinet": "cabinet", + "range": "stove", + "ricemaker": "rice cooker", + "pan": "cooking pan", + "coffee_machine": "coffee maker", + "rice_bag": "bag", + "light": "light", + "trashbin": "trash bin", + "kettle": "kettle", + "refrigerator": "refrigerator", + "microwave": "microwave", + "light_switch": "light switch", + "rice_cooker": "rice cooker", + "box": "box", + "shoe": "shoe", + "range_hood": "range hood", + "wok": "cooking pan", + "router": "object", + "paper_towel": "paper towel roll", + "stock_pot": "pot", + "cutting_board": "cutting board", + "wall_calendar": "calendar", + "baseboard": "object", + "coke_box": "box", + "printer": "printer", + "bowl": "bowl", + "backpack": "backpack", + "baseboard_heater": "heater", + "broom": "broom", + "dust_pan": "dustpan", + "trash_bin": "trash bin", + "rigid_duct": "vent", + "electric_range": "stove", + "spatula": "object", + "faucet": "faucet", + "bottle": "bottle", + "countertop": "counter", + "railing": "railing", + "suitcase": "suitcase", + "trash": "trash can", + "pot": "pot", + "kitchen_tool": "object", + "vegetable": "object", + "board": "board", + "washing_machine": "washing machine", + "jar": "jar", + "object": "object", + "notebook": "book", + "induction_cooker": "stove", + "instant_pot_lid": "cooking pot", + "oven": "oven", + "air_fryer": "object", + "lid": "pot", + "sponge": "sponge", + "blender": "object", + "spoon": "object", + "dishwasher": "dishwasher", + "detergent": "laundry detergent", + "watermelon": "bananas", + "yard_waste_bag": "garbage bag", + "container": "container", + "newspapers": "paper", + "rag": "cloth", + "ladder": "ladder", + "gate": "door", + "napkin_box": "tissue box", + "jacket": "jacket", + "windowsill": "windowsill", + "water_faucet": "faucet", + "steel_ball": "ball", + "rice_maker": "rice cooker", + "watter_bottle": "water bottle", + "plastic_bag": "bag", + "paper_bag": "paper bag", + "cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": 
"doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": "clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + "showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", + "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + 
"safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + "water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + 
"card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + "knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id): + """ + Load PLY data and propagate object IDs from faces to vertices. 
+ """ + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) + + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + # Extract vertex properties + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + + # Extract normals if available + if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: + nx = np.array(ply_data['vertex']['nx']) + ny = np.array(ply_data['vertex']['ny']) + nz = np.array(ply_data['vertex']['nz']) + normals = np.stack([nx, ny, nz], axis=-1) + else: + normals = None + + # Initialize object IDs for vertices with a default undefined value + vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + + # Extract face data + faces = ply_data['face'].data + face_vertex_indices = [face['vertex_indices'] for face in faces] + face_object_ids = [face['objectId'] for face in faces] + + # Propagate object IDs to vertices + for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), # Colors + ('objectId', 'i4') # Propagated Object ID + ] + + if normals is not None: + vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + + if normals is not None: + vertices['nx'] = normals[:, 0].astype('f4') + vertices['ny'] = normals[:, 1].astype('f4') + vertices['nz'] = normals[:, 2].astype('f4') + + return vertices + +def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(meta_intrinsics_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + intrinsic_mat = np.array(stream.get("intrinsics")) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + resolution = stream.get("resolution") + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + return intrinsics + +def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + intrinsics_path = osp.join(scan_dir, 'poses.jsonl') + resoultion_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(resoultion_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + resolution = stream.get("resolution", None) + if resolution: + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + + 
with jsonlines.open(intrinsics_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + intrinsic_mat = np.asarray(entry.get('intrinsics')) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + break + + return intrinsics + +def load_pose(scan_dir, frame_id): + # Find alignment file + alignment_path = None + for file_name in os.listdir(scan_dir): + if file_name.endswith('.align.json'): + alignment_path = osp.join(scan_dir, file_name) + break + + if alignment_path is None: + raise FileNotFoundError(f"No alignment file found in {scan_dir}") + + with open(alignment_path, "r") as f: + alignment_data = json.load(f) + if 'coordinate_transform' not in alignment_data: + raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'") + coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F') + inv_transform = np.linalg.inv(coordinate_transform) + + pose_path = osp.join(scan_dir, 'poses.jsonl') + with jsonlines.open(pose_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + transform = np.asarray(entry.get('transform')) + pose = np.reshape(transform, (4, 4), order='F') + aligned_pose = inv_transform @ pose #align camera poses + return aligned_pose + + raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") + + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, int(frame_idx)) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is None: + frame_idxs = frame_idxs + else: + frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]] + return frame_idxs + + +def represents_int(s): + ''' if string s represents an int. 
'''
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
\ No newline at end of file

From c11fff47e0244e6b89ec4dc37930efb7085def9b Mon Sep 17 00:00:00 2001
From: Gaurav Pradeep
Date: Thu, 13 Mar 2025 02:24:18 +0530
Subject: [PATCH 07/18] config related changes for MultiScan

---
 configs/train/train_instance_crossover.yaml | 4 ++--
 scripts/preprocess/process_multiscan.sh     | 8 ++++----
 scripts/preprocess/process_scan3r.sh        | 7 +++----
 scripts/preprocess/process_scannet.sh       | 9 ++++-----
 4 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml
index 365f247..aaf472d 100644
--- a/configs/train/train_instance_crossover.yaml
+++ b/configs/train/train_instance_crossover.yaml
@@ -69,8 +69,8 @@ task:
   name : SceneLevelGrounding
   SceneLevelGrounding :
     modalities : ['rgb', 'point', 'cad', 'referral']
-    train : [Scannet, Scan3R, ARKitScenes]
-    val : [Scannet, Scan3R, ARKitScenes]
+    train : [Scannet, Scan3R, MultiScan, ARKitScenes]
+    val : [Scannet, Scan3R, MultiScan, ARKitScenes]
 
 trainer: GroundingTrainer
 
diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh
index c08bf84..a13a93c 100644
--- a/scripts/preprocess/process_multiscan.sh
+++ b/scripts/preprocess/process_multiscan.sh
@@ -1,9 +1,9 @@
 export PYTHONWARNINGS="ignore"
 
 # Preprocessing Object Level + Scene Level + Unified Data
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
 
 # Multi-modal dumping
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh
index 6d8a981..5ac2b71 100644
--- a/scripts/preprocess/process_scan3r.sh
+++ b/scripts/preprocess/process_scan3r.sh
@@ -1,9 +1,8 @@
 export PYTHONWARNINGS="ignore"
 
 # Preprocessing Object Level + Scene Level + Unified Data
-# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. 
hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..47aa945 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file From 6dcd65e46a7a39fdb758261d9df4573c783dcd3c Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 13 Mar 2025 02:51:26 +0530 Subject: [PATCH 08/18] prepare data readme fix --- prepare_data/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 0246b5c..c369156 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -149,8 +149,7 @@ ARKitScenes/ ``` #### MultiScan -1. Download `files/` under `processed_data/meta_data/MultiScan/` from GDrive and place under `PATH_TO_MULTISCAN/`. -2. 
Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data +1. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data ```bash cd MultiScan/scenes From 655041555f3b3cfb06b44e32492ddd385b338dc6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Fri, 21 Mar 2025 11:31:57 +0530 Subject: [PATCH 09/18] arkit open3d convention bug fix --- util/multiscan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/multiscan.py b/util/multiscan.py index d570973..8478a7d 100644 --- a/util/multiscan.py +++ b/util/multiscan.py @@ -634,8 +634,10 @@ def load_pose(scan_dir, frame_id): for entry in reader: if entry.get("frame_id") == frame_id: transform = np.asarray(entry.get('transform')) - pose = np.reshape(transform, (4, 4), order='F') - aligned_pose = inv_transform @ pose #align camera poses + transform = np.reshape(transform, (4, 4), order='F') + transform = np.dot(transform, np.diag([1, -1, -1, 1])) + transform = transform / transform[3][3] + aligned_pose = inv_transform @ transform #align camera poses return aligned_pose raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") From 521247aca1e32cca7566fbc66577fee2e53aad3b Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Fri, 4 Apr 2025 10:51:01 -0700 Subject: [PATCH 10/18] Typo change --- retrieval/object_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - self.logger.info('Scene Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file From 8a119eea461e8906d9d7e6f6f62a08dd6d053334 Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Fri, 18 Apr 2025 09:59:47 -0700 Subject: [PATCH 11/18] Commit issue fix + path change --- configs/evaluation/eval_instance.yaml | 9 ++------- configs/evaluation/eval_scene.yaml | 11 +++++------ configs/preprocess/process_1d.yaml | 4 ++-- configs/preprocess/process_2d.yaml | 5 +++-- configs/preprocess/process_3d.yaml | 4 ++-- configs/preprocess/process_multimodal.yaml | 4 ++-- configs/train/train_instance_baseline.yaml | 4 ++-- configs/train/train_instance_crossover.yaml | 6 +++--- configs/train/train_scene_crossover.yaml | 14 +++++++------- data/datasets/scanbase.py | 1 - preprocess/build.py | 1 + preprocess/feat1D/arkit.py | 2 +- preprocess/feat2D/arkit.py | 13 ++++++++++--- preprocess/feat3D/arkit.py | 2 +- preprocess/multimodal_preprocess.py | 6 +----- scripts/preprocess/process_arkit.sh | 4 ++-- trainer/grounding_trainer.py | 2 ++ 17 files changed, 46 insertions(+), 46 deletions(-) diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index 1c8518c..2b2310b 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -44,8 +44,8 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : 
ARKitScenes1DProcessor @@ -69,12 +69,7 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] -<<<<<<< HEAD - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth -======= ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth ->>>>>>> f86c782 (adding support for multiscan) - inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 381153e..a666183 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -44,11 +44,14 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] MultiScan: base_dir : /media/sayan/Expansion/data/datasets/MultiScan process_dir : ${data.process_dir}/MultiScan @@ -65,11 +68,7 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] -<<<<<<< HEAD - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth -======= ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth ->>>>>>> f86c782 (adding support for multiscan) inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index baedd3a..4766677 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -26,8 +26,8 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 1cd64dc..244edff 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -28,11 +28,12 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + skip_frames : 1 MultiScan: base_dir : /media/sayan/Expansion/data/datasets/MultiScan process_dir : ${data.process_dir}/MultiScan diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 5602ed8..1989286 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -25,8 +25,8 @@ data: label_filename : labels.instances.align.annotated.v2.ply ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : 
/media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 54e3cd1..fd8809b 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -29,8 +29,8 @@ data: avail_modalities : ['point', 'rgb', 'referral'] ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index a97cb22..ee70d74 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -45,8 +45,8 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index aaf472d..35a6a15 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -45,13 +45,13 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor - avail_modalities : ['point', 'rgb', 'referral'] + avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 MultiScan: diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index aea7152..9886e95 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -45,13 +45,13 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor - avail_modalities : ['point', 'rgb', 'referral'] + avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 MultiScan: @@ -70,9 +70,9 @@ task: UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, ARKitScenes] - val : [Scannet, Scan3R, ARKitScenes] - object_enc_ckpt : 
/drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer @@ -99,7 +99,7 @@ model: base_modality : 'rgb' dataloader: - batch_size : 16 + batch_size : 32 num_workers : 6 eval: diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..b531e32 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -187,7 +187,6 @@ def __getitem__(self, index: int) -> Dict[str, Any]: rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings']) rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1) - rgb_embedding = rgb_embedding[list(range(0, rgb_embedding.shape[0], 2)), :] scene_dict['rgb_embedding'] = rgb_embedding scene_dict['scene_masks']['rgb'] = torch.Tensor([1.0]) diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py index 0e2873d..efab03c 100644 --- a/preprocess/feat1D/arkit.py +++ b/preprocess/feat1D/arkit.py @@ -20,7 +20,7 @@ def __init__(self, config_data, config_1D, split) -> None: self.scan_ids = [] self.scan_ids = arkit.get_scan_ids(files_dir, split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) # Object Referrals self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index baec4ad..531b5b6 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -29,7 +29,7 @@ def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) - self.split = split self.scan_ids = arkit.get_scan_ids(files_dir, self.split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) self.orig_image_size = config_2D.image.orig_size @@ -51,10 +51,14 @@ def compute2DFeatures(self) -> None: for scan_id in tqdm(self.scan_ids): self.compute2DImagesAndSeg(scan_id) self.compute2DFeaturesEachScan(scan_id) - if self.split == 'val': - self.computeAllImageFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) def compute2DImagesAndSeg(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): + return + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") if not osp.exists(objects_path): raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") @@ -104,6 +108,9 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) + if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): + 
return + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] # Multi-view Image -- Object (Embeddings) diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index e265d78..6172204 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -21,7 +21,7 @@ def __init__(self, config_data, config_3D, split) -> None: self.scan_ids = [] self.scan_ids = arkit.get_scan_ids(files_dir, split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 70adff4..a45274b 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,11 +8,7 @@ import h5py from common import load_utils from common.constants import ModalityType -<<<<<<< HEAD -from util import scan3r, scannet, arkit -======= -from util import scan3r, scannet, multiscan ->>>>>>> f86c782 (adding support for multiscan) +from util import scan3r, scannet, arkit, multiscan from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh index 5ff7fd5..3acdb4a 100644 --- a/scripts/preprocess/process_arkit.sh +++ b/scripts/preprocess/process_arkit.sh @@ -1,9 +1,9 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null diff --git a/trainer/grounding_trainer.py b/trainer/grounding_trainer.py index e0a40b2..7ee201c 100644 --- a/trainer/grounding_trainer.py +++ b/trainer/grounding_trainer.py @@ -1,5 +1,7 @@ +import os.path as osp from tqdm import tqdm from omegaconf import DictConfig +from safetensors.torch import load_file import torch from trainer.build import TRAINER_REGISTRY From 5f3f734fc371415ee453fb221759235b1c0a6685 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:23:39 +0530 Subject: [PATCH 12/18] 1d preprocessing changes --- preprocess/feat1D/arkit.py | 55 ++++++++++++++--------- preprocess/feat1D/multiscan.py | 82 +++++++++++++++------------------- preprocess/feat1D/scan3r.py | 52 ++++++++++++--------- preprocess/feat1D/scannet.py | 56 +++++++++++++---------- 4 files changed, 133 insertions(+), 112 deletions(-) diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py index efab03c..f03571a 100644 --- a/preprocess/feat1D/arkit.py +++ b/preprocess/feat1D/arkit.py @@ -2,7 +2,7 @@ import torch import numpy as np from tqdm import tqdm - +import os from common import load_utils from util import labelmap, arkit from util.arkit import ARKITSCENE_SCANNET @@ -52,35 +52,46 @@ def load_objects_for_scan(self, scan_id): return objects - - def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = self.load_objects_for_scan(scan_id) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + + scan_objects = self.load_objects_for_scan(scan_id) - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + 
scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): object_referral_embeddings = {} diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py index 58b9ff9..d96ad4e 100644 --- a/preprocess/feat1D/multiscan.py +++ b/preprocess/feat1D/multiscan.py @@ -2,7 +2,7 @@ import torch import numpy as np from tqdm import tqdm - +import os from common import load_utils from util import labelmap, multiscan @@ -45,59 +45,49 @@ def load_objects_for_scan(self, scan_id): return objects - def extractTextFeats(self, texts, return_text = False): - text_feats = [] - - for text in texts: - encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) - if encoded_text['input_ids'].shape[1] > 512: - continue - - with torch.no_grad(): - encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, - return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) - - text_feats.append({'text' : text, 'feat' : encoded_text}) - - if len(text_feats) == 0: - return None - - if return_text: - return text_feats - - text_feats = [text_feat['feat'] for text_feat in text_feats] - text_feats = np.concatenate(text_feats) - return text_feats - def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = self.load_objects_for_scan(scan_id) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + # print(npz_data) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + # print(objectID_to_labelID_map) + scan_objects = self.load_objects_for_scan(scan_id) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + 
scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + # Combine and save as npz + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) - def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): object_referral_embeddings = {} diff --git a/preprocess/feat1D/scan3r.py b/preprocess/feat1D/scan3r.py index 65fb6e9..fdd95d6 100644 --- a/preprocess/feat1D/scan3r.py +++ b/preprocess/feat1D/scan3r.py @@ -4,7 +4,7 @@ from common import load_utils from util import scan3r from typing import Dict, List, Union - +import os from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat1D.base import Base1DProcessor @@ -32,32 +32,42 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = 
np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, scan_objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat1D/scannet.py b/preprocess/feat1D/scannet.py index e49b8e0..df4ac99 100644 --- a/preprocess/feat1D/scannet.py +++ b/preprocess/feat1D/scannet.py @@ -1,7 +1,7 @@ import os.path as osp import torch import numpy as np - +import os from common import load_utils from util import scannet from typing import Dict, List, Union @@ -34,32 +34,42 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] - - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, objects, objectID_to_labelID_map) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = 
np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} - - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: From 268d38ebf92f43e4c048318b70b97a73d648851f Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:40:02 +0530 Subject: [PATCH 13/18] 2d preprocessing changes --- preprocess/feat2D/arkit.py | 171 ++++++++++++++++++--------------- preprocess/feat2D/multiscan.py | 158 +++++++++++++++++------------- preprocess/feat2D/scan3r.py | 85 ++++++++-------- preprocess/feat2D/scannet.py | 76 ++++++++------- 4 files changed, 272 insertions(+), 218 deletions(-) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index 531b5b6..924a70a 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -12,7 +12,7 @@ from common import load_utils from util import render, arkit, visualisation from util import image as image_util - +import os from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat2D.base import Base2DProcessor @@ -56,95 +56,108 @@ def compute2DFeatures(self) -> None: def compute2DImagesAndSeg(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) - if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): - return - - objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") - if not osp.exists(objects_path): - raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") - - annotations = load_utils.load_json(objects_path) - ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) - instance_ids = ply_data['objectId'] - - mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id].keys(): 
- camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) + + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map + else: + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map - scene_folder = osp.join(self.data_dir, 'scans', scan_id) - if osp.exists(osp.join(scene_folder, 'gt-projection')): - shutil.rmtree(osp.join(scene_folder, 'gt-projection')) # save scene-level file for efficient loading - torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} + scene_folder = osp.join(self.data_dir, 'scans', scan_id) color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): - return - - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) - - # Multi-view Image -- Scene (Images + Embeddings) - frame_idxs = list(self.frame_pose_data[scan_id].keys()) - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = 
self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - for frame_idx in self.frame_pose_data[scan_id].keys(): - camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) - intrinsic_mat = camera_info['intrinsic_mat'] - break + pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + else: + # obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) - intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], - 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} - - # dummy floorplan - floorplan_dict = {'img' : None, 'embedding' : None} - data2D['scene']['floorplan'] = floorplan_dict - - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + 
np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) @@ -174,7 +187,8 @@ def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: data2D = {} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs} - torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -207,7 +221,8 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: - object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + # object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) object_image_votes = {} scan_id=scene_folder.split('/')[-1] # iterate over all frames diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py index d95239e..cb13475 100644 --- a/preprocess/feat2D/multiscan.py +++ b/preprocess/feat2D/multiscan.py @@ -5,7 +5,7 @@ from tqdm import tqdm from PIL import Image from scipy.spatial.transform import Rotation as R - +import os from common import load_utils from util import render, multiscan, visualisation from util import image as image_util @@ -67,80 +67,100 @@ def compute2DFeatures(self): def compute2DImagesAndSeg(self, scan_id): scene_folder = osp.join(self.data_dir, 'scenes', scan_id) - mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) - - ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) - instance_ids = ply_data['objectId'] + obj_id_imgs = {} - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + + else: + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = 
self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + # if osp.exists(osp.join(scene_folder, 'gt-projection')): + # shutil.rmtree(osp.join(scene_folder, 'gt-projection')) - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id]: - camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) - - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map - - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - # save scene-level file for efficient loading - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + # save scene-level file for efficient loading + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id): + data2D = {} + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) color_path = osp.join(scene_folder, 'sequence') - scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) - - # Multi-view Image -- Scene (Images + Embeddings) - frame_idxs = list(self.frame_pose_data[scan_id].keys()) - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) - intrinsic_mat = camera_info['intrinsic_mat'] - - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) - intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], - 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} - - # dummy floorplan - floorplan_dict = {'img' : None, 'embedding' : None} - data2D['scene']['floorplan'] = floorplan_dict - - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + 
pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + # print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + + else: + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh_old.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -170,7 +190,9 @@ def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): - object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + # object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} # iterate over all frames diff --git a/preprocess/feat2D/scan3r.py b/preprocess/feat2D/scan3r.py index 4927c97..1fe96d8 100644 --- a/preprocess/feat2D/scan3r.py +++ b/preprocess/feat2D/scan3r.py @@ -7,7 +7,7 @@ from scipy.spatial.transform import Rotation as R from omegaconf import DictConfig from typing import List, Dict, Tuple - +import os from common import load_utils from util import render, scan3r, visualisation from util import image as image_util @@ -55,44 +55,53 @@ def compute2DFeatures(self) -> None: self.compute2DFeaturesEachScan(scan_id) def compute2DImagesAndSeg(self, scan_id: str) -> None: - scene_folder = osp.join(self.data_dir, 'scans', scan_id) - mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) - - ply_data = 
scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) - instance_ids = ply_data['objectId'] - - camera_info = scan3r.load_intrinsics(scene_folder) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id]: - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) - - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) + obj_id_imgs = {} + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + + else: + ply_data = scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) + instance_ids = ply_data['objectId'] + + camera_info = scan3r.load_intrinsics(scene_folder) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map - - # save scene-level file for efficient loading - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + # save scene-level file for efficient loading + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) diff --git a/preprocess/feat2D/scannet.py b/preprocess/feat2D/scannet.py index 8c59354..c0bc412 100644 --- a/preprocess/feat2D/scannet.py +++ b/preprocess/feat2D/scannet.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os import imageio import skimage.transform as sktf from PIL import Image @@ -81,49 +81,57 @@ def renderShapeAndFloorplan(self, scene_folder: str, scene_out_folder: str, scan return render_img def 
compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} frame_idxs = list(self.frame_pose_data[scan_id].keys()) scene_folder = osp.join(self.data_dir, 'scans', scan_id) scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - + pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + + else: # Floor-plan rendering - render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) - floorplan_embeddings = None - - if render_img is not None: - render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) - render_img_pt = self.model.base_tf(render_img) - floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) - - floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} + render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) + floorplan_embeddings = None - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, frame_idxs) - - # Multi-view Image -- Scene (Images + Embeddings) - color_path = osp.join(scene_folder, 'data/color') - intrinsic_data = scannet.load_intrinsics(osp.join(self.data_dir, 'scans'), scan_id) - - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, scan_id + '_vh_clean_2.ply')) - intrinsics = { 'f' : intrinsic_data['intrinsic_mat'][0, 0], 'cx' : intrinsic_data['intrinsic_mat'][0, 2], 'cy' : intrinsic_data['intrinsic_mat'][1, 2], - 'w' : int(intrinsic_data['width']), 'h' : int(intrinsic_data['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + if render_img is not None: + render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + render_img_pt = self.model.base_tf(render_img) + floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) + floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK = self.computeImageFeaturesAllObjectsEachScan(scene_folder, frame_idxs) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + # Multi-view Image -- Scene (Images + Embeddings) + color_path = osp.join(scene_folder, 'data/color') + intrinsic_data = scannet.load_intrinsics(osp.join(self.data_dir, 'scans'), scan_id) - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeImageFeaturesEachScan(scan_id, color_path, frame_idxs) - data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + # Visualise + scene_mesh = 
o3d.io.read_triangle_mesh(osp.join(scene_folder, scan_id + '_vh_clean_2.ply')) + intrinsics = { 'f' : intrinsic_data['intrinsic_mat'][0, 0], 'cx' : intrinsic_data['intrinsic_mat'][0, 2], 'cy' : intrinsic_data['intrinsic_mat'][1, 2], + 'w' : int(intrinsic_data['width']), 'h' : int(intrinsic_data['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + data2D['scene']['floorplan'] = floorplan_dict + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid From ef6a5e5d5758160de028661c9aed6e3358dc41d6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:44:10 +0530 Subject: [PATCH 14/18] 3d preprocessing changes --- preprocess/feat3D/arkit.py | 15 ++++++++++++--- preprocess/feat3D/multiscan.py | 17 +++++++++++++---- preprocess/feat3D/scan3r.py | 15 +++++++++++++-- preprocess/feat3D/scannet.py | 32 ++++++++++++++++++-------------- 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index 6172204..67b79c0 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os from common import load_utils from util import point_cloud, arkit from util.arkit import ARKITSCENE_SCANNET @@ -93,5 +93,14 @@ def compute3DFeaturesEachScan(self, scan_id): scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py index 68ba025..336ea3a 100644 --- a/preprocess/feat3D/multiscan.py +++ b/preprocess/feat3D/multiscan.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os from common import load_utils from util import point_cloud, multiscan from util.multiscan import MULTISCAN_SCANNET @@ -89,6 +89,15 @@ def compute3DFeaturesEachScan(self, scan_id): scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) 
- torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) - \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + + # Save as .npz files instead of .pt files + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) diff --git a/preprocess/feat3D/scan3r.py b/preprocess/feat3D/scan3r.py index 7b949ca..315bc97 100644 --- a/preprocess/feat3D/scan3r.py +++ b/preprocess/feat3D/scan3r.py @@ -43,6 +43,9 @@ def __init__(self, config_data: DictConfig, config_3D: DictConfig, split: str) - self.feature_extractor = self.loadFeatureExtractor(config_3D, "3D") def compute3DFeaturesEachScan(self, scan_id: str) -> None: + """ + Computes 3D features for a single scan. + """ ply_data = scan3r.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, self.label_filename) mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) @@ -79,5 +82,13 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/scannet.py b/preprocess/feat3D/scannet.py index e530195..13d1e5a 100644 --- a/preprocess/feat3D/scannet.py +++ b/preprocess/feat3D/scannet.py @@ -1,5 +1,5 @@ import os.path as osp - +import os import numpy as np import torch from omegaconf import DictConfig @@ -64,6 +64,10 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: mesh_points = mesh_vertices[:, 0:3] mesh_colors = mesh_vertices[:, 3:] + center_points = np.mean(mesh_points, axis=0) + center_points[2] = np.min(mesh_points[:, 2]) + mesh_points = mesh_points - center_points + text_file = mesh_file.replace('_vh_clean_2.labels.ply' , '.txt') with open(text_file, 'r') as file: for line in file: @@ -79,10 +83,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: if len(shape_annot) > 0: shape_annot = shape_annot[0] shape_annot_to_instance_map = scannet.get_cad_model_to_instance_mapping(instance_bboxes, shape_annot, meta_file, self.shape_dir) - - render_out_dir = osp.join(scene_out_dir, 'render') - load_utils.ensure_dir(render_out_dir) - + for instance_id in unique_instance_ids: if instance_id == self.undefined: continue @@ 
-98,11 +99,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: shape_annot_instance = shape_annot_to_instance_map[instance_id] object_cad_pcl = shape_annot_instance['points'] object_cad_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_cad_pcl) - - obj_verts, obj_faces, transform_shape = shape_annot_instance['verts'], shape_annot_instance['faces'], shape_annot_instance['transform_shape'] - # load_utils.ensure_dir(osp.join(render_out_dir, f'{instance_id}')) - # render.render_multiview_images(obj_verts, obj_faces, transform_shape, osp.join(render_out_dir, f'{instance_id}')) - + data3D = {} data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} data3D['scene'] = {'pcl_coords': mesh_points[instance_ids != self.undefined], 'pcl_feats': mesh_colors[instance_ids != self.undefined], 'scene_label' : scene_label} @@ -112,7 +109,14 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) assert len(list(object_id_to_label_id.keys())) >= len(list(object_cad_embeddings.keys())), 'CAD does not match for {}'.format(scan_id) - - - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file From bbf08e655f2037c9509f93213500f676397bd26a Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:45:53 +0530 Subject: [PATCH 15/18] multimodal dumping changes --- preprocess/multimodal_preprocess.py | 34 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index a45274b..a6d9063 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -10,7 +10,7 @@ from common.constants import ModalityType from util import scan3r, scannet, arkit, multiscan from typing import Dict, Optional - +import os from preprocess.build import PROCESSOR_REGISTRY @PROCESSOR_REGISTRY.register() @@ -75,18 +75,20 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + # object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = np.load(osp.join(out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} if data3D is not None: - precomputed_feats[ModalityType.POINT] = 
data3D['objects']['pcl_embeddings'] - precomputed_feats[ModalityType.CAD] = data3D['objects']['cad_embeddings'] + precomputed_feats[ModalityType.POINT] = data3D['objects'].item()['pcl_embeddings'] + precomputed_feats[ModalityType.CAD] = data3D['objects'].item()['cad_embeddings'] if data2D is not None: - precomputed_feats[ModalityType.RGB] = data2D['objects']['image_embeddings'] + precomputed_feats[ModalityType.RGB] = data2D['objects'].item()['image_embeddings'] if data1D is not None: - precomputed_feats[ModalityType.REF] = data1D['objects']['referral_embeddings'] + precomputed_feats[ModalityType.REF] = data1D['objects'].item()['referral_embeddings'] object_ids = [] for modalityType in ModalityType.__dict__.values(): @@ -141,19 +143,27 @@ def prepareObjectWiseDataEachScan(self, 'object_id2idx' : object_id2idx, 'object_id_to_label_id_map' : object_id_to_label_id_map, 'object_ids' : object_ids, - 'topK_images_votes' : data2D['objects']['topK_images_votes'] + 'topK_images_votes' : data2D['objects'].item()['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + pt_multimodal_path = osp.join(out_dir, 'objectsDataMultimodal.pt') + if osp.exists(pt_multimodal_path): + os.remove(pt_multimodal_path) + # torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + # data1D = torch.load(osp.join(out_dir, 'data1D.pt')) + data1D = np.load(osp.join(out_dir, 'data1D.npz'),allow_pickle=True) + + # data2D = torch.load(osp.join(out_dir, 'data2D.pt')) + data2D = np.load(osp.join(out_dir, 'data2D.npz'),allow_pickle=True) + + # data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data3D = np.load(osp.join(out_dir, 'data3D.npz'),allow_pickle=True) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) From 25a3dd7cecf00464ed617dbc0d93f5cb8a34dd8c Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:50:29 +0530 Subject: [PATCH 16/18] dataset util changes for alignment --- util/arkit.py | 27 ++++++------ util/multiscan.py | 73 +++++++++++++++++++++++++------ util/scan3r.py | 106 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 175 insertions(+), 31 deletions(-) diff --git a/util/arkit.py b/util/arkit.py index c4e7593..0029b58 100644 --- a/util/arkit.py +++ b/util/arkit.py @@ -219,12 +219,15 @@ def load_ply_data(data_dir, scan_id, annotations): file = open(filename_in, 'rb') plydata = PlyData.read(file) file.close() + # plydata = trimesh.load(filename_in, process=False) vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + # vertices=plydata.vertices vertices = np.vstack(vertices).T vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] vertex_colors = np.vstack(vertex_colors).T - + # vertex_colors = plydata.visual.vertex_colors + # print("vertex_colors", vertex_colors.shape) vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('objectId', 'h')] @@ 
-260,24 +263,24 @@ def load_ply_data(data_dir, scan_id, annotations): vertices_structured['objectId'] = vertex_instance - # align_angle = calc_align_matrix(bbox_list) + align_angle = calc_align_matrix(bbox_list) - # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) if np.max(vertex_colors) <= 1: vertex_colors = vertex_colors * 255.0 - # center_points = np.mean(vertices_aligned, axis=0) - # center_points[2] = np.min(vertices_aligned[:, 2]) - # vertices_aligned = vertices_aligned - center_points + center_points = np.mean(vertices_aligned, axis=0) + center_points[2] = np.min(vertices_aligned[:, 2]) + vertices_aligned = vertices_aligned - center_points - # vertices_structured['x'] = vertices_aligned[:, 0] - # vertices_structured['y'] = vertices_aligned[:, 1] - # vertices_structured['z'] = vertices_aligned[:, 2] + vertices_structured['x'] = vertices_aligned[:, 0] + vertices_structured['y'] = vertices_aligned[:, 1] + vertices_structured['z'] = vertices_aligned[:, 2] - vertices_structured['x'] = plydata['vertex']['x'] - vertices_structured['y'] = plydata['vertex']['y'] - vertices_structured['z'] = plydata['vertex']['z'] + # vertices_structured['x'] = plydata['vertex']['x'] + # vertices_structured['y'] = plydata['vertex']['y'] + # vertices_structured['z'] = plydata['vertex']['z'] return vertices_structured diff --git a/util/multiscan.py b/util/multiscan.py index 8478a7d..9e14c03 100644 --- a/util/multiscan.py +++ b/util/multiscan.py @@ -6,6 +6,7 @@ import jsonlines import json import os +import pandas as pd MULTISCAN_SCANNET = { "wall": "wall", @@ -492,10 +493,35 @@ def get_scan_ids(dirname, split): scan_ids = np.genfromtxt(filepath, dtype = str) return scan_ids +def annotations_to_dataframe_obj(annotations): + objects = annotations['objects'] + df_list = [] + for obj in objects: + object_id = obj['objectId'] + object_label = obj['label'] + df_row = pd.DataFrame( + [[object_id, object_label]], + columns=['objectId', 'objectLabel'] + ) + df_list.append(df_row) + df = pd.concat(df_list) + return df + + def load_ply_data(data_dir, scan_id): """ Load PLY data and propagate object IDs from faces to vertices. + + Args: + data_dir (str): Directory containing the PLY file. + scan_id (str): Identifier for the scan. + + Returns: + np.ndarray: Vertex data with propagated object IDs. 
""" + with open(osp.join(data_dir, scan_id, f'{scan_id}.annotations.json'), "r", encoding='utf-8') as f: + annotations = json.load(f) + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) if not osp.exists(filename_in): @@ -511,6 +537,7 @@ def load_ply_data(data_dir, scan_id): red = np.array(ply_data['vertex']['red']) green = np.array(ply_data['vertex']['green']) blue = np.array(ply_data['vertex']['blue']) + triangles = np.vstack(ply_data['face'].data['vertex_indices']) # Extract normals if available if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: @@ -521,17 +548,36 @@ def load_ply_data(data_dir, scan_id): else: normals = None - # Initialize object IDs for vertices with a default undefined value - vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + vertex_object_ids = np.zeros((scene_vertices.shape[0])) # Extract face data - faces = ply_data['face'].data - face_vertex_indices = [face['vertex_indices'] for face in faces] - face_object_ids = [face['objectId'] for face in faces] + # faces = ply_data['face'].data + # face_vertex_indices = [face['vertex_indices'] for face in faces] + # face_object_ids = [face['objectId'] for face in faces] + + # # Propagate object IDs to vertices + # for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + # vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + object_ids = ply_data['face'].data['objectId'] + part_ids = ply_data['face'].data['partId'] + + semseg_df = pd.DataFrame({'objectId': object_ids, 'partId': part_ids}) + df = annotations_to_dataframe_obj(annotations) + for _, row in df.iterrows(): + object_id = row['objectId'] + assert object_id > 0, f"object id should be greater than 0, but got {object_id}" + + condition1 = semseg_df['objectId'] == object_id + tri_indices = semseg_df[condition1].index.values + object_vertices = np.unique(triangles[tri_indices]) + vertex_object_ids[object_vertices] = object_id + - # Propagate object IDs to vertices - for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): - vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face vertex_dtype = [ ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates @@ -543,10 +589,13 @@ def load_ply_data(data_dir, scan_id): vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals vertices = np.empty(len(x), dtype=vertex_dtype) - - vertices['x'] = x.astype('f4') - vertices['y'] = y.astype('f4') - vertices['z'] = z.astype('f4') + # Update scene vertices - assign x, y, z coordinates from scene_vertices + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') vertices['red'] = red.astype('u1') vertices['green'] = green.astype('u1') vertices['blue'] = blue.astype('u1') diff --git a/util/scan3r.py b/util/scan3r.py index 2727d5a..8fc33bb 100644 --- a/util/scan3r.py +++ b/util/scan3r.py @@ -3,15 +3,19 @@ from plyfile import PlyData from glob import glob import csv - +import json def get_scan_ids(dirname: str, split: str) -> np.ndarray: """Retrieve scan IDs for the given directory and split.""" 
filepath = osp.join(dirname, '{}_scans.txt'.format(split)) scan_ids = np.genfromtxt(filepath, dtype = str) return scan_ids -def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarray: - """Load PLY data from specified directory, scan ID, and label file.""" +def load_ply_data(data_dir, scan_id, label_file_name): + with open(osp.join(data_dir, scan_id, 'mesh.refined.0.010000.segs.v2.json'), "r", encoding='utf-8') as f: + segments = json.load(f) + with open(osp.join(data_dir, scan_id, 'semseg.v2.json'), "r", encoding='utf-8') as f: + aggregation = json.load(f) + filename_in = osp.join(data_dir, scan_id, label_file_name) file = open(filename_in, 'rb') ply_data = PlyData.read(file) @@ -31,9 +35,32 @@ def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarr vertices = np.empty(len(x), dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('objectId', 'h'), ('globalId', 'h'), ('NYU40', 'u1'), ('Eigen13', 'u1'), ('RIO27', 'u1')]) - vertices['x'] = x.astype('f4') - vertices['y'] = y.astype('f4') - vertices['z'] = z.astype('f4') + seg_group = aggregation['segGroups'] + bbox_list = [] + for i, _ in enumerate(seg_group): + rotation = np.array(seg_group[i]["obb"]["normalizedAxes"]).reshape(3, 3) + transform = np.array(seg_group[i]["obb"]["centroid"]).reshape(-1, 3) + scale = np.array(seg_group[i]["obb"]["axesLengths"]).reshape(-1, 3) + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + align_angle = calc_align_matrix(bbox_list) + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + scene_vertices = rotate_z_axis_by_degrees(np.array(scene_vertices), align_angle) + + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') vertices['red'] = red.astype('u1') vertices['green'] = green.astype('u1') vertices['blue'] = blue.astype('u1') @@ -136,4 +163,69 @@ def represents_int(s: str) -> bool: int(s) return True except ValueError: - return False \ No newline at end of file + return False + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = 
np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def compute_box_3d(size, center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) \ No newline at end of file From 58f0d8e653137df1431d3185baeea80ff7fb5c8e Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 12:00:23 +0530 Subject: [PATCH 17/18] scanbase changes to work with npz --- data/datasets/scanbase.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index b531e32..aa5d4a6 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -138,7 +138,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) # Point Cloud Data -- Scene - points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] + points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label'] feats /= 255. 
feats -= 0.5
@@ -152,9 +152,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True)
         coords, feats = points[sel], feats[sel]
 
-        # Get coords, shift to center
+        # Get coords, already zero centered during preprocessing
         coords = np.floor(coords / self.voxel_size)
-        coords-=coords.min(0)
+        # coords-=coords.min(0)
 
         # Object Data
         scene_dict = {}
@@ -185,7 +185,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
         scene_dict['scene_masks'] = {}
 
-        rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings'])
+        rgb_embedding = torch.from_numpy(scandata_2d['scene'].item()['scene_embeddings'])
         rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1)
         scene_dict['rgb_embedding'] = rgb_embedding
@@ -194,7 +194,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         scene_dict['scene_masks']['object'] = torch.Tensor([1.0])
 
         referral_mask = torch.Tensor([0.0])
-        referral_embedding = scandata_1d['scene']['referral_embedding']
+        referral_embedding = scandata_1d['scene'].item()['referral_embedding']
         if referral_embedding is not None:
             referral_embedding = torch.from_numpy(referral_embedding[0]['feat']).reshape(-1,)
@@ -202,7 +202,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         else:
             referral_embedding = torch.zeros((scene_dict['rgb_embedding'].shape[-1] // 4, ))
 
-        floorplan_embedding = scandata_2d['scene']['floorplan']['embedding']
+        floorplan_embedding = scandata_2d['scene'].item()['floorplan']['embedding']
         floorplan_mask = torch.Tensor([0.0])
         if floorplan_embedding is not None:
             floorplan_embedding = torch.from_numpy(floorplan_embedding[0, 0]).reshape(-1, )

From 736f72a63b1000de77e2690b6e19a85eac4c77e9 Mon Sep 17 00:00:00 2001
From: Gaurav Pradeep
Date: Tue, 29 Apr 2025 10:41:14 +0530
Subject: [PATCH 18/18] scanbase change to read npz instead of pt

---
 data/datasets/scanbase.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py
index aa5d4a6..2d054ec 100644
--- a/data/datasets/scanbase.py
+++ b/data/datasets/scanbase.py
@@ -131,11 +131,15 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
         scan_process_dir = osp.join(self.process_dir, 'scans', scan_id)
 
-        scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt'))
-
-        scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt'))
-        scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt'))
-        scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt'))
+        # scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt'))
+        scan_objects_data = np.load(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'), allow_pickle=True)
+
+        # scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt'))
+        scandata_1d = np.load(osp.join(scan_process_dir, 'data1D.npz'), allow_pickle=True)
+        # scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt'))
+        scandata_2d = np.load(osp.join(scan_process_dir, 'data2D.npz'), allow_pickle=True)
+        # scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt'))
+        scandata_3d = np.load(osp.join(scan_process_dir, 'data3D.npz'), allow_pickle=True)
 
         # Point Cloud Data -- Scene
         points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label']
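
For reference, the `.pt` to `.npz` convention these patches adopt stores each top-level dict (e.g. `objects`, `scene`) as a 0-d object array inside the archive, which is why the readers call `np.load(..., allow_pickle=True)` and then `.item()`. The snippet below is only an illustrative round-trip sketch of that convention; the file name, keys, and shapes are placeholders, not values taken from the repository.

```python
import numpy as np

# Nested dicts become 0-d object arrays inside the .npz archive, so loading
# needs allow_pickle=True and .item() to recover the original dict.
data2D = {
    'objects': {'image_embeddings': {1: np.zeros(3)}},   # illustrative per-object features
    'scene': {'scene_embeddings': np.zeros((5, 4))},     # illustrative per-frame features
}
np.savez_compressed('data2D.npz', **data2D)

loaded = np.load('data2D.npz', allow_pickle=True)
scene = loaded['scene'].item()            # 0-d object array -> plain dict
print(scene['scene_embeddings'].shape)    # (5, 4)
```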