From 16dd9bb70ea56b1402c723766ec729e7b694441f Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 20 Feb 2025 02:29:28 +0530 Subject: [PATCH 01/18] adding support for arkitscenes --- DATA.md | 34 ++ README.md | 1 + TRAIN.md | 2 +- configs/evaluation/eval_instance.yaml | 12 +- configs/evaluation/eval_scene.yaml | 12 +- configs/preprocess/process_1d.yaml | 8 + configs/preprocess/process_2d.yaml | 8 + configs/preprocess/process_3d.yaml | 8 + configs/preprocess/process_multimodal.yaml | 9 + configs/train/train_instance_baseline.yaml | 11 + configs/train/train_instance_crossover.yaml | 11 + configs/train/train_scene_crossover.yaml | 11 + data/datasets/__init__.py | 3 +- data/datasets/arkit.py | 41 +++ prepare_data/README.md | 50 ++- preprocess/feat1D/__init__.py | 3 +- preprocess/feat1D/arkit.py | 107 ++++++ preprocess/feat2D/__init__.py | 3 +- preprocess/feat2D/arkit.py | 287 ++++++++++++++++ preprocess/feat3D/__init__.py | 3 +- preprocess/feat3D/arkit.py | 98 ++++++ preprocess/multimodal_preprocess.py | 4 +- scripts/preprocess/process_arkit.sh | 9 + single_inference/datasets/__init__.py | 3 +- single_inference/datasets/arkit.py | 126 +++++++ single_inference/scene_inference.py | 2 + util/arkit.py | 347 ++++++++++++++++++++ 27 files changed, 1193 insertions(+), 20 deletions(-) create mode 100644 data/datasets/arkit.py create mode 100644 preprocess/feat1D/arkit.py create mode 100644 preprocess/feat2D/arkit.py create mode 100644 preprocess/feat3D/arkit.py create mode 100644 scripts/preprocess/process_arkit.sh create mode 100644 single_inference/datasets/arkit.py create mode 100644 util/arkit.py diff --git a/DATA.md b/DATA.md index 643b538..9377fd0 100644 --- a/DATA.md +++ b/DATA.md @@ -10,6 +10,7 @@ We list the available data used in the current version of CrossOver in the table | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -110,4 +111,37 @@ Scan3R/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... +``` + +### ARKitScenes + +#### Running preprocessing scripts +Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_arkit.sh +``` + +Our script for ARKitScenes dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
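+
+For quick inspection, the projection file can be loaded directly with PyTorch — a minimal sketch (not part of the release scripts), assuming the directory layout below and that the file stores a plain dict mapping each frame index to a per-pixel object-ID map, which is how the 2D processor in this patch writes it (the processor treats `0` as unlabelled):
+
+```python
+import torch
+
+# hypothetical scan id taken from the layout below
+proj = torch.load('ARKitScenes/scans/40753679/gt-projection-seg.pt')
+frame_idx = sorted(proj.keys())[0]   # keys are frame indices (timestamp strings)
+obj_id_map = proj[frame_idx]         # (H, W) array of per-pixel instance IDs
+print(frame_idx, obj_id_map.shape)
+```
+
+The other `data*.pt` files listed below are also written with `torch.save` and can be loaded the same way.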
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... ``` \ No newline at end of file diff --git a/README.md b/README.md index 1cb1030..b39d33a 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. diff --git a/TRAIN.md b/TRAIN.md index fd56dcd..622d5c6 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan & ARKitScenes or any combination of the same. 
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..5515123 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -43,13 +43,23 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..eab4202 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -43,13 +43,23 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..11a9df7 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -25,6 +25,14 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..d02d017 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -27,6 +27,14 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..e9bc9c6 100644 --- 
a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -24,6 +24,14 @@ data: processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..33b3def 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -28,6 +28,15 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..02e4324 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..6bfdce4 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..31ae435 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -44,6 +44,17 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/scans + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 
9a1b744..8c18552 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..919d73d 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,7 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes ## Prerequisites @@ -16,20 +17,14 @@ Before you begin, simply activate the `crossover` conda environment. #### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). - - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0). +- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). -- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. 
- -#### Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). +- **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). -- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/). -- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset). +- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. -### Prepare The Data -Exact instructions for data setup + preparation below: +### Download Referral and CAD annotations +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. #### ScanNet 1. Run the following to extract ScanNet data @@ -107,3 +102,36 @@ Scan3R/ └── sceneverse └── ssg_ref_rel2_template.json ``` + +#### ARKitScenes +1. Download `files/` under `processed_data/meta_data/ARKitScenes/` from GDrive and place under `PATH_TO_ARKITSCENES/`. +2. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract MultiScan data + + ```bash +cd ARKitScenes +mv 3dod/Training/* scans +mv 3dod/Validation/* scans +``` + +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scans/ +│ ├── 40753679/ +│ │ ├── 40753679_frames/ +│ │ │ ├── lowres_depth/ (folder containing depth images) +│ │ │ ├── lowres_wide/ (folder containing rgb images) +│ │ │ ├── lowres_wide_intrinsics/ (folder containing frame wise camera intrinsics) +│ │ │ ├── lowres_wide.traj (camera trajectory) +│ │ ├── 40753679_3dod_annotation.json +│ │ ├── 40753679_3dod_mesh.ply +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── val_scans.txt + ├── metadata.csv + ├── 3dod_train_val_splits.csv + └── sceneverse + └── ssg_ref_rel2_template.json +``` \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..0e2873d --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,107 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + 
load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import 
* \ No newline at end of file diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py new file mode 100644 index 0000000..f0d8456 --- /dev/null +++ b/preprocess/feat2D/arkit.py @@ -0,0 +1,287 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util + + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + if self.split == 'val': + self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + 
scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # Extract Scene Image Features + scene_images_pt = [] + scene_image_embeddings = [] + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + + for frame_index in frame_idxs: + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + + scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + scene_images_pt.append(image_pt) + + scene_image_embeddings = np.concatenate(scene_image_embeddings) + 
data2D = {} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs} + torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + # return pose_data, None, None, sampled_frame_idxs + + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in 
object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # load image + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..8c18552 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..9da2d4e --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,98 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + 
"objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..34f2898 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,7 +8,7 @@ import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -33,6 +33,8 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..ecb457e --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml data.sources=['ARKitScenes'] 
hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# # Multi-modal dumping +python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py index 9a1b744..8c18552 100644 --- a/single_inference/datasets/__init__.py +++ b/single_inference/datasets/__init__.py @@ -1,2 +1,3 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * \ No newline at end of file diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py new file mode 100644 index 0000000..6434bde --- /dev/null +++ b/single_inference/datasets/arkit.py @@ -0,0 +1,126 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d +import pandas as pd +from common import load_utils +from util import arkit +from util import image as image_util + +class ARKitScenesInferDataset(Dataset): + def __init__(self, data_dir,voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scans') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.metadata = pd.read_csv(osp.join(self.files_dir,'metadata.csv')) + + + def extract_images(self, scan_id, color_path): + pose_data = arkit.load_poses(self.scans_dir, scan_id, skip=self.frame_skip) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if 
image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id))) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..65465c2 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,8 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..c4e7593 --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,347 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = 
np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. + The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. 
+ """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, 
principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + # if len(bbox_list) == 0: + # return + + vertices_structured['objectId'] = vertex_instance + + # align_angle = calc_align_matrix(bbox_list) + + # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + # center_points = np.mean(vertices_aligned, axis=0) + # center_points[2] = np.min(vertices_aligned[:, 2]) + # vertices_aligned = vertices_aligned - center_points + + # vertices_structured['x'] = vertices_aligned[:, 0] + # vertices_structured['y'] = vertices_aligned[:, 1] + # vertices_structured['z'] = vertices_aligned[:, 2] + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, 
center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres From c9049965fa38062315d5f82dfcfc76c627d2fd19 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Fri, 28 Mar 2025 23:39:08 +0530 Subject: [PATCH 02/18] removing image rotations --- preprocess/feat2D/arkit.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index f0d8456..baec4ad 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -152,15 +152,10 @@ def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: # Extract Scene Image Features scene_images_pt = [] scene_image_embeddings = [] - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] - + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] for frame_index in frame_idxs: image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) image_pt = self.model.base_tf(image) @@ -186,7 +181,7 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr pose_data = np.array(pose_data) sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + # 
sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] # Extract Scene Image Features scene_images_pt = [] @@ -194,10 +189,6 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr frame_index = frame_idxs[idx] image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) image_pt = self.model.base_tf(image) scene_images_pt.append(image_pt) @@ -263,14 +254,6 @@ def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: object_anno_2d = object_anno_2d.transpose(1, 0) object_anno_2d = np.flip(object_anno_2d, 1) - sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] - - # load image - if sky_direction=='Left': - image = image.transpose(Image.ROTATE_270) - elif sky_direction=='Right': - image = image.transpose(Image.ROTATE_90) - object_mask = object_anno_2d == object_id images_crops = [] From 95efd602b1a6e08e85a3e6387cbb69841f8b1666 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 3 Apr 2025 23:18:22 +0530 Subject: [PATCH 03/18] readme fix --- prepare_data/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 919d73d..279719f 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -104,18 +104,18 @@ Scan3R/ ``` #### ARKitScenes -1. Download `files/` under `processed_data/meta_data/ARKitScenes/` from GDrive and place under `PATH_TO_ARKITSCENES/`. -2. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract MultiScan data +1. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract the data ```bash cd ARKitScenes mv 3dod/Training/* scans mv 3dod/Validation/* scans ``` +2. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. Once completed, the data structure would look like the following: ``` -MultiScan/ +ARKitScenes/ ├── scans/ │ ├── 40753679/ │ │ ├── 40753679_frames/ From beb066028b6a8a5c032dee5d342bd569e7c9acc6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Sun, 6 Apr 2025 16:25:06 +0530 Subject: [PATCH 04/18] updated installation instructions --- prepare_data/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 279719f..0a46691 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -104,14 +104,22 @@ Scan3R/ ``` #### ARKitScenes -1. Download ARKitScenes 3dod data into ARKitScenes/scans and run the following to extract the data +1. Download ARKitScenes 3dod data using the following command: + +```bash +python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES +``` +The files mentioned in the above command - ```download_data.py``` and ```3dod_train_val_splits.csv``` can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data. + +2. Once the data is downloaded, run the following to organize it as per our requirements. ```bash cd ARKitScenes mv 3dod/Training/* scans mv 3dod/Validation/* scans ``` -2. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. 
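+A quick, optional sanity check (an illustrative sketch, not part of the repository; `PATH_TO_ARKITSCENES` is the download directory used above) that every extracted scan folder contains the `{scan_id}_3dod_mesh.ply` file the preprocessing code later reads:
+
+```python
+import os
+import os.path as osp
+
+scans_dir = osp.join("PATH_TO_ARKITSCENES", "scans")  # adjust to your download directory
+for scan_id in sorted(os.listdir(scans_dir)):
+    mesh = osp.join(scans_dir, scan_id, f"{scan_id}_3dod_mesh.ply")
+    if not osp.isfile(mesh):
+        print(f"missing 3DOD mesh for scan {scan_id}")
+```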
+ +3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. Once completed, the data structure would look like the following: ``` From 417fcc46d24b06e5d7b46c32652118c1eaf21522 Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Sun, 6 Apr 2025 19:47:52 -0700 Subject: [PATCH 05/18] Small config changes --- configs/preprocess/process_1d.yaml | 5 +---- configs/preprocess/process_2d.yaml | 4 ++-- configs/preprocess/process_3d.yaml | 2 +- configs/preprocess/process_multimodal.yaml | 2 +- configs/train/train_instance_crossover.yaml | 4 ++-- configs/train/train_scene_crossover.yaml | 4 ++-- preprocess/feat3D/arkit.py | 3 +-- scripts/preprocess/process_arkit.sh | 10 +++++----- 8 files changed, 15 insertions(+), 19 deletions(-) diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index 11a9df7..42ce6ef 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -26,15 +26,12 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor skip_frames : 1 - - Shapenet: - base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ modality_info: 1D : diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index d02d017..85e9d82 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -28,7 +28,7 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor @@ -68,4 +68,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index e9bc9c6..9971666 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -25,7 +25,7 @@ data: label_filename : labels.instances.align.annotated.v2.ply ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 33b3def..f8910c3 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -29,7 +29,7 @@ data: avail_modalities : ['point', 'rgb', 'referral'] ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes + base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes process_dir : ${data.process_dir}/ARKitScenes/scans chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index 6bfdce4..e4eed4b 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -59,8 +59,8 @@ task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 
'referral'] - train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, ARKitScenes] + val : [Scannet, Scan3R, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index 31ae435..43ef415 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -60,8 +60,8 @@ task: UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] + train : [Scannet, Scan3R, ARKitScenes] + val : [Scannet, Scan3R, ARKitScenes] object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth trainer: UnifiedTrainer diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index 9da2d4e..e265d78 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -94,5 +94,4 @@ def compute3DFeaturesEachScan(self, scan_id): load_utils.ensure_dir(scene_out_dir) torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) - + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh index ecb457e..5ff7fd5 100644 --- a/scripts/preprocess/process_arkit.sh +++ b/scripts/preprocess/process_arkit.sh @@ -1,9 +1,9 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null -# # Multi-modal dumping -python3 preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# Multi-modal dumping +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null From 1ef819c044509320a09eb19cddfb1ba5a95c83a5 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Wed, 19 Feb 2025 14:12:25 +0530 Subject: [PATCH 06/18] adding support for multiscan --- DATA.md | 35 +- README.md | 4 +- TRAIN.md | 2 +- configs/evaluation/eval_instance.yaml | 14 + configs/evaluation/eval_scene.yaml | 10 + configs/preprocess/process_1d.yaml | 10 + configs/preprocess/process_2d.yaml | 6 + configs/preprocess/process_3d.yaml | 6 + configs/preprocess/process_multimodal.yaml | 9 + configs/train/train_instance_baseline.yaml | 10 + configs/train/train_instance_crossover.yaml | 10 + configs/train/train_scene_crossover.yaml | 10 + data/datasets/__init__.py | 3 +- data/datasets/multiscan.py | 42 ++ prepare_data/README.md | 44 +- .../multiscan/preprocess_2d_multiscan.py | 94 +++ preprocess/feat1D/__init__.py | 3 +- preprocess/feat1D/multiscan.py | 123 ++++ preprocess/feat2D/__init__.py | 3 +- preprocess/feat2D/multiscan.py | 240 +++++++ preprocess/feat3D/__init__.py | 3 +- preprocess/feat3D/multiscan.py | 94 +++ preprocess/multimodal_preprocess.py | 6 + scripts/preprocess/process_multiscan.sh | 9 + single_inference/datasets/__init__.py | 6 +- single_inference/datasets/multiscan.py | 120 ++++ single_inference/scene_inference.py | 2 + util/multiscan.py | 670 ++++++++++++++++++ 28 files changed, 1579 insertions(+), 9 deletions(-) create mode 100644 data/datasets/multiscan.py create mode 100644 prepare_data/multiscan/preprocess_2d_multiscan.py create mode 100644 preprocess/feat1D/multiscan.py create mode 100644 preprocess/feat2D/multiscan.py create mode 100644 preprocess/feat3D/multiscan.py create mode 100644 scripts/preprocess/process_multiscan.sh create mode 100644 single_inference/datasets/multiscan.py create mode 100644 util/multiscan.py diff --git a/DATA.md b/DATA.md index 9377fd0..92a22dd 100644 --- a/DATA.md +++ b/DATA.md @@ -11,6 +11,7 @@ We list the available data used in the current version of CrossOver in the table | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | | ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -112,6 +113,38 @@ Scan3R/ | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... ``` +### MultiScan + +#### Running preprocessing scripts +Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_multiscan.sh +``` + +Our script for MultiScan dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
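+As a rough illustration (a sketch only, assuming the `MultiScan/scans/scene_00000_00` layout shown below), each entry in `gt-projection-seg.pt` maps a frame index to an `(H, W)` array of projected instance IDs, where `0` is treated as unlabelled background by the preprocessing code:
+
+```python
+import os.path as osp
+import numpy as np
+import torch
+
+proj = torch.load(osp.join("MultiScan", "scans", "scene_00000_00", "gt-projection-seg.pt"))
+for frame_idx, obj_id_map in proj.items():
+    ids, counts = np.unique(obj_id_map, return_counts=True)
+    print(frame_idx, dict(zip(ids.tolist(), counts.tolist())))
+    break  # inspect the first frame only
+```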
+ +Post running preprocessing, the data structure should look like the following: + +``` +MultiScan/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_00/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` ### ARKitScenes @@ -144,4 +177,4 @@ ARKitScenes/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... -``` \ No newline at end of file +``` diff --git a/README.md b/README.md index b39d33a..c133ec5 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,8 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | | ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -135,7 +137,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). +- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index 622d5c6..5520b7d 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan & ARKitScenes or any combination of the same. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. 
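+For example, a minimal sketch (assuming the config is edited programmatically with `omegaconf`, which the Hydra-style configs here build on; file paths are illustrative) of narrowing the dataset combination before launching training:
+
+```python
+from omegaconf import OmegaConf
+
+cfg = OmegaConf.load("configs/train/train_scene_crossover.yaml")
+print(cfg.task.UnifiedTrain.train)  # e.g. [Scannet, Scan3R, ARKitScenes]
+
+# Train/validate only on the datasets you have preprocessed.
+cfg.task.UnifiedTrain.train = ["Scannet", "Scan3R"]
+cfg.task.UnifiedTrain.val = ["Scannet", "Scan3R"]
+OmegaConf.save(cfg, "configs/train/train_scene_crossover.yaml")  # write the adjusted list back
+```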
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index 5515123..1c8518c 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -53,13 +53,27 @@ data : max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] +<<<<<<< HEAD ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth +======= + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth +>>>>>>> f86c782 (adding support for multiscan) inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index eab4202..381153e 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -49,6 +49,12 @@ data : processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 @@ -59,7 +65,11 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] +<<<<<<< HEAD ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth +======= + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth +>>>>>>> f86c782 (adding support for multiscan) inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index 42ce6ef..baedd3a 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -32,6 +32,16 @@ data: processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Shapenet: + base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ modality_info: 1D : diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 85e9d82..1cd64dc 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -33,6 +33,12 @@ data: processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor 
skip_frames : 1 modality_info: diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 9971666..5602ed8 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -30,6 +30,12 @@ data: processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor skip_frames : 1 modality_info: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index f8910c3..54e3cd1 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -36,6 +36,15 @@ data: processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] modality_info: 1D : diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 02e4324..a97cb22 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : ObjectLevelGrounding diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index e4eed4b..365f247 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : SceneLevelGrounding diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index 43ef415..aea7152 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -54,6 +54,16 @@ data : avail_modalities : ['point', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 
'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 task: name : UnifiedTrain diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 8c18552..9c7b829 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index 0a46691..0246b5c 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -6,6 +6,7 @@ This document provides instructions for pre-processing different datasets, inclu - ScanNet - 3RScan - ARKitScenes +- MultiScan ## Prerequisites @@ -19,12 +20,15 @@ Before you begin, simply activate the `crossover` conda environment. - **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). +- **MultiScan**: Download MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan). + - **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). - **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. ### Download Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. + #### ScanNet 1. 
Run the following to extract ScanNet data @@ -142,4 +146,42 @@ ARKitScenes/ ├── 3dod_train_val_splits.csv └── sceneverse └── ssg_ref_rel2_template.json +``` + +#### MultiScan +1. Download `files/` under `processed_data/meta_data/MultiScan/` from GDrive and place under `PATH_TO_MULTISCAN/`. +2. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data + + ```bash +cd MultiScan/scenes +unzip '*.zip' +rm -rf '*.zip' +``` +3. To generate sequence of RGB images and corresponding camera poses from the ```.mp4``` file, run the follwing +```bash +cd prepare_data/multiscan +python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval} +``` +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scenes/ +│ ├── scene_00000_00/ +│ │ ├── sequence/ (folder containing rgb images at specified frame interval) +| | ├── frame_ids.txt +│ │ ├── scene_00000_00.annotations.json +│ │ ├── scene_00000_00.jsonl +│ │ ├── scene_00000_00.confidence.zlib +│ │ ├── scene_00000_00.mp4 +│ │ ├── poses.jsonl +│ │ ├── scene_00000_00.ply +│ │ ├── scene_00000_00.align.json +│ │ ├── scene_00000_00.json +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── test_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json ``` \ No newline at end of file diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = 
os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..58b9ff9 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,123 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + def extractTextFeats(self, texts, return_text = False): + text_feats = [] + + for text in texts: + encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) + if encoded_text['input_ids'].shape[1] > 512: + continue + + with torch.no_grad(): + encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, + return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) + + text_feats.append({'text' : text, 'feat' : encoded_text}) + + if len(text_feats) == 0: + return None + + if return_text: + return text_feats + + text_feats = 
[text_feat['feat'] for text_feat in text_feats] + text_feats = np.concatenate(text_feats) + return text_feats + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..d95239e --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,240 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R + +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir 
= config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=2) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=5) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=10) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=15) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=20) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = 
self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs 
= sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 8c18552..9c7b829 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,3 +1,4 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..68ba025 --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,94 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, 
f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + \ No newline at end of file diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 34f2898..70adff4 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,7 +8,11 @@ import h5py from common import load_utils from common.constants import ModalityType +<<<<<<< HEAD from util import scan3r, scannet, arkit +======= +from util import scan3r, scannet, multiscan +>>>>>>> f86c782 (adding support for multiscan) from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -35,6 +39,8 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'ARKitScenes': self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..c08bf84 --- /dev/null +++ 
b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py index 8c18552..d7126ea 100644 --- a/single_inference/datasets/__init__.py +++ b/single_inference/datasets/__init__.py @@ -1,3 +1,7 @@ from .scannet import * from .scan3r import * -from .arkit import * \ No newline at end of file +<<<<<<< HEAD +from .arkit import * +======= +from .multiscan import * +>>>>>>> f86c782 (adding support for multiscan) diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + image = 
Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 65465c2..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -28,6 +28,8 @@ def run_inference(args, scan_id=None): dataset = datasets.Scan3RInferDataset(args.data_dir) elif args.dataset == 'ARKitScenes': dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/util/multiscan.py b/util/multiscan.py new file mode 100644 index 0000000..d570973 --- /dev/null +++ b/util/multiscan.py @@ -0,0 +1,670 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os + +MULTISCAN_SCANNET = { + "wall": "wall", + "door": "door", + "slippers": "shoe", + "mop": "broom", + "rug": "rug", + "floor": "floor", + "basin": "sink", + "basin_stand": "sink", + "bucket": "bucket", + "shower": "shower", + "water_tank": "container", + "beam": "wood beam", + "pillar": "pillar", + "ceiling": "ceiling", + "sink": "sink", + "toilet": "toilet", + "cabinet": "cabinet", + "remove": "object", + "towel": "towel", + 
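+    # Each raw MultiScan label in this mapping is translated to a ScanNet `raw_category` name,
+    # which MultiScan3DProcessor.load_objects_for_scan then converts to an NYU40 id via the
+    # scannetv2-labels.combined.tsv label map (see preprocess/feat3D/multiscan.py).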
"pillow": "pillow", + "sofa": "sofa", + "footstool": "footstool", + "picture": "picture", + "window": "window", + "heater": "heater", + "mirror": "mirror", + "pipe": "pipe", + "scarf": "cloth", + "ceiling_light": "ceiling light", + "chair": "chair", + "table": "table", + "vent": "vent", + "bag": "bag", + "wall_cabinet": "cabinet", + "range": "stove", + "ricemaker": "rice cooker", + "pan": "cooking pan", + "coffee_machine": "coffee maker", + "rice_bag": "bag", + "light": "light", + "trashbin": "trash bin", + "kettle": "kettle", + "refrigerator": "refrigerator", + "microwave": "microwave", + "light_switch": "light switch", + "rice_cooker": "rice cooker", + "box": "box", + "shoe": "shoe", + "range_hood": "range hood", + "wok": "cooking pan", + "router": "object", + "paper_towel": "paper towel roll", + "stock_pot": "pot", + "cutting_board": "cutting board", + "wall_calendar": "calendar", + "baseboard": "object", + "coke_box": "box", + "printer": "printer", + "bowl": "bowl", + "backpack": "backpack", + "baseboard_heater": "heater", + "broom": "broom", + "dust_pan": "dustpan", + "trash_bin": "trash bin", + "rigid_duct": "vent", + "electric_range": "stove", + "spatula": "object", + "faucet": "faucet", + "bottle": "bottle", + "countertop": "counter", + "railing": "railing", + "suitcase": "suitcase", + "trash": "trash can", + "pot": "pot", + "kitchen_tool": "object", + "vegetable": "object", + "board": "board", + "washing_machine": "washing machine", + "jar": "jar", + "object": "object", + "notebook": "book", + "induction_cooker": "stove", + "instant_pot_lid": "cooking pot", + "oven": "oven", + "air_fryer": "object", + "lid": "pot", + "sponge": "sponge", + "blender": "object", + "spoon": "object", + "dishwasher": "dishwasher", + "detergent": "laundry detergent", + "watermelon": "bananas", + "yard_waste_bag": "garbage bag", + "container": "container", + "newspapers": "paper", + "rag": "cloth", + "ladder": "ladder", + "gate": "door", + "napkin_box": "tissue box", + "jacket": "jacket", + "windowsill": "windowsill", + "water_faucet": "faucet", + "steel_ball": "ball", + "rice_maker": "rice cooker", + "watter_bottle": "water bottle", + "plastic_bag": "bag", + "paper_bag": "paper bag", + "cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": 
"doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": "clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + "showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", + "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + 
"safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + "water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + 
"card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + "knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id): + """ + Load PLY data and propagate object IDs from faces to vertices. 
+ """ + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) + + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + # Extract vertex properties + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + + # Extract normals if available + if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: + nx = np.array(ply_data['vertex']['nx']) + ny = np.array(ply_data['vertex']['ny']) + nz = np.array(ply_data['vertex']['nz']) + normals = np.stack([nx, ny, nz], axis=-1) + else: + normals = None + + # Initialize object IDs for vertices with a default undefined value + vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + + # Extract face data + faces = ply_data['face'].data + face_vertex_indices = [face['vertex_indices'] for face in faces] + face_object_ids = [face['objectId'] for face in faces] + + # Propagate object IDs to vertices + for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), # Colors + ('objectId', 'i4') # Propagated Object ID + ] + + if normals is not None: + vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + + if normals is not None: + vertices['nx'] = normals[:, 0].astype('f4') + vertices['ny'] = normals[:, 1].astype('f4') + vertices['nz'] = normals[:, 2].astype('f4') + + return vertices + +def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(meta_intrinsics_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + intrinsic_mat = np.array(stream.get("intrinsics")) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + resolution = stream.get("resolution") + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + return intrinsics + +def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + intrinsics_path = osp.join(scan_dir, 'poses.jsonl') + resoultion_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(resoultion_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + resolution = stream.get("resolution", None) + if resolution: + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + + 
with jsonlines.open(intrinsics_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + intrinsic_mat = np.asarray(entry.get('intrinsics')) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + break + + return intrinsics + +def load_pose(scan_dir, frame_id): + # Find alignment file + alignment_path = None + for file_name in os.listdir(scan_dir): + if file_name.endswith('.align.json'): + alignment_path = osp.join(scan_dir, file_name) + break + + if alignment_path is None: + raise FileNotFoundError(f"No alignment file found in {scan_dir}") + + with open(alignment_path, "r") as f: + alignment_data = json.load(f) + if 'coordinate_transform' not in alignment_data: + raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'") + coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F') + inv_transform = np.linalg.inv(coordinate_transform) + + pose_path = osp.join(scan_dir, 'poses.jsonl') + with jsonlines.open(pose_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + transform = np.asarray(entry.get('transform')) + pose = np.reshape(transform, (4, 4), order='F') + aligned_pose = inv_transform @ pose #align camera poses + return aligned_pose + + raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") + + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, int(frame_idx)) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is None: + frame_idxs = frame_idxs + else: + frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]] + return frame_idxs + + +def represents_int(s): + ''' if string s represents an int. 
'''
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
\ No newline at end of file

From c11fff47e0244e6b89ec4dc37930efb7085def9b Mon Sep 17 00:00:00 2001
From: Gaurav Pradeep
Date: Thu, 13 Mar 2025 02:24:18 +0530
Subject: [PATCH 07/18] config related changes for MultiScan

---
 configs/train/train_instance_crossover.yaml | 4 ++--
 scripts/preprocess/process_multiscan.sh     | 8 ++++----
 scripts/preprocess/process_scan3r.sh        | 7 +++----
 scripts/preprocess/process_scannet.sh       | 9 ++++-----
 4 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml
index 365f247..aaf472d 100644
--- a/configs/train/train_instance_crossover.yaml
+++ b/configs/train/train_instance_crossover.yaml
@@ -69,8 +69,8 @@ task:
   name : SceneLevelGrounding
   SceneLevelGrounding :
     modalities : ['rgb', 'point', 'cad', 'referral']
-    train : [Scannet, Scan3R, ARKitScenes]
-    val : [Scannet, Scan3R, ARKitScenes]
+    train : [Scannet, Scan3R, MultiScan, ARKitScenes]
+    val : [Scannet, Scan3R, MultiScan, ARKitScenes]
 
 trainer: GroundingTrainer
 
diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh
index c08bf84..a13a93c 100644
--- a/scripts/preprocess/process_multiscan.sh
+++ b/scripts/preprocess/process_multiscan.sh
@@ -1,9 +1,9 @@
 export PYTHONWARNINGS="ignore"
 
 # Preprocessing Object Level + Scene Level + Unified Data
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
 
 # Multi-modal dumping
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh
index 6d8a981..5ac2b71 100644
--- a/scripts/preprocess/process_scan3r.sh
+++ b/scripts/preprocess/process_scan3r.sh
@@ -1,9 +1,8 @@
 export PYTHONWARNINGS="ignore"
 
 # Preprocessing Object Level + Scene Level + Unified Data
-# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. 
hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..47aa945 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file From 6dcd65e46a7a39fdb758261d9df4573c783dcd3c Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Thu, 13 Mar 2025 02:51:26 +0530 Subject: [PATCH 08/18] prepare data readme fix --- prepare_data/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prepare_data/README.md b/prepare_data/README.md index 0246b5c..c369156 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -149,8 +149,7 @@ ARKitScenes/ ``` #### MultiScan -1. Download `files/` under `processed_data/meta_data/MultiScan/` from GDrive and place under `PATH_TO_MULTISCAN/`. -2. 
Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data +1. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data ```bash cd MultiScan/scenes From 655041555f3b3cfb06b44e32492ddd385b338dc6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Fri, 21 Mar 2025 11:31:57 +0530 Subject: [PATCH 09/18] arkit open3d convention bug fix --- util/multiscan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/multiscan.py b/util/multiscan.py index d570973..8478a7d 100644 --- a/util/multiscan.py +++ b/util/multiscan.py @@ -634,8 +634,10 @@ def load_pose(scan_dir, frame_id): for entry in reader: if entry.get("frame_id") == frame_id: transform = np.asarray(entry.get('transform')) - pose = np.reshape(transform, (4, 4), order='F') - aligned_pose = inv_transform @ pose #align camera poses + transform = np.reshape(transform, (4, 4), order='F') + transform = np.dot(transform, np.diag([1, -1, -1, 1])) + transform = transform / transform[3][3] + aligned_pose = inv_transform @ transform #align camera poses return aligned_pose raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") From 521247aca1e32cca7566fbc66577fee2e53aad3b Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Fri, 4 Apr 2025 10:51:01 -0700 Subject: [PATCH 10/18] Typo change --- retrieval/object_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - self.logger.info('Scene Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file From 8a119eea461e8906d9d7e6f6f62a08dd6d053334 Mon Sep 17 00:00:00 2001 From: Sayan Deb Sarkar Date: Fri, 18 Apr 2025 09:59:47 -0700 Subject: [PATCH 11/18] Commit issue fix + path change --- configs/evaluation/eval_instance.yaml | 9 ++------- configs/evaluation/eval_scene.yaml | 11 +++++------ configs/preprocess/process_1d.yaml | 4 ++-- configs/preprocess/process_2d.yaml | 5 +++-- configs/preprocess/process_3d.yaml | 4 ++-- configs/preprocess/process_multimodal.yaml | 4 ++-- configs/train/train_instance_baseline.yaml | 4 ++-- configs/train/train_instance_crossover.yaml | 6 +++--- configs/train/train_scene_crossover.yaml | 14 +++++++------- data/datasets/scanbase.py | 1 - preprocess/build.py | 1 + preprocess/feat1D/arkit.py | 2 +- preprocess/feat2D/arkit.py | 13 ++++++++++--- preprocess/feat3D/arkit.py | 2 +- preprocess/multimodal_preprocess.py | 6 +----- scripts/preprocess/process_arkit.sh | 4 ++-- trainer/grounding_trainer.py | 2 ++ 17 files changed, 46 insertions(+), 46 deletions(-) diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index 1c8518c..2b2310b 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -44,8 +44,8 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : 
ARKitScenes1DProcessor @@ -69,12 +69,7 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] -<<<<<<< HEAD - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+arkit.pth -======= ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth ->>>>>>> f86c782 (adding support for multiscan) - inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 381153e..a666183 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -44,11 +44,14 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] MultiScan: base_dir : /media/sayan/Expansion/data/datasets/MultiScan process_dir : ${data.process_dir}/MultiScan @@ -65,11 +68,7 @@ task: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] -<<<<<<< HEAD - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+arkit.pth -======= ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth ->>>>>>> f86c782 (adding support for multiscan) inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index baedd3a..4766677 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -26,8 +26,8 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 1cd64dc..244edff 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -28,11 +28,12 @@ data: skip_frames : 1 ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor + skip_frames : 1 MultiScan: base_dir : /media/sayan/Expansion/data/datasets/MultiScan process_dir : ${data.process_dir}/MultiScan diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 5602ed8..1989286 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -25,8 +25,8 @@ data: label_filename : labels.instances.align.annotated.v2.ply ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : 
/media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 54e3cd1..fd8809b 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -29,8 +29,8 @@ data: avail_modalities : ['point', 'rgb', 'referral'] ARKitScenes: - base_dir : /media/sayan/Expansion/data/datasets/ArkitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index a97cb22..ee70d74 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -45,8 +45,8 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index aaf472d..35a6a15 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -45,13 +45,13 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor - avail_modalities : ['point', 'rgb', 'referral'] + avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 MultiScan: diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index aea7152..9886e95 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -45,13 +45,13 @@ data : voxel_size : 0.02 ARKitScenes: - base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/ARKitScenes - process_dir : ${data.process_dir}/ARKitScenes/scans + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked processor3D : ARKitScenes3DProcessor processor2D : ARKitScenes2DProcessor processor1D : ARKitScenes1DProcessor - avail_modalities : ['point', 'rgb', 'referral'] + avail_modalities : ['point', 'cad', 'rgb', 'referral'] max_object_len : 150 voxel_size : 0.02 MultiScan: @@ -70,9 +70,9 @@ task: UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, ARKitScenes] - val : [Scannet, Scan3R, ARKitScenes] - object_enc_ckpt : 
/drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer @@ -99,7 +99,7 @@ model: base_modality : 'rgb' dataloader: - batch_size : 16 + batch_size : 32 num_workers : 6 eval: diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..b531e32 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -187,7 +187,6 @@ def __getitem__(self, index: int) -> Dict[str, Any]: rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings']) rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1) - rgb_embedding = rgb_embedding[list(range(0, rgb_embedding.shape[0], 2)), :] scene_dict['rgb_embedding'] = rgb_embedding scene_dict['scene_masks']['rgb'] = torch.Tensor([1.0]) diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py index 0e2873d..efab03c 100644 --- a/preprocess/feat1D/arkit.py +++ b/preprocess/feat1D/arkit.py @@ -20,7 +20,7 @@ def __init__(self, config_data, config_1D, split) -> None: self.scan_ids = [] self.scan_ids = arkit.get_scan_ids(files_dir, split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) # Object Referrals self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index baec4ad..531b5b6 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -29,7 +29,7 @@ def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) - self.split = split self.scan_ids = arkit.get_scan_ids(files_dir, self.split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) self.orig_image_size = config_2D.image.orig_size @@ -51,10 +51,14 @@ def compute2DFeatures(self) -> None: for scan_id in tqdm(self.scan_ids): self.compute2DImagesAndSeg(scan_id) self.compute2DFeaturesEachScan(scan_id) - if self.split == 'val': - self.computeAllImageFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) def compute2DImagesAndSeg(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): + return + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") if not osp.exists(objects_path): raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") @@ -104,6 +108,9 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) + if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): + 
return + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] # Multi-view Image -- Object (Embeddings) diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index e265d78..6172204 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -21,7 +21,7 @@ def __init__(self, config_data, config_3D, split) -> None: self.scan_ids = [] self.scan_ids = arkit.get_scan_ids(files_dir, split) - self.out_dir = config_data.process_dir + self.out_dir = osp.join(config_data.process_dir, 'scans') load_utils.ensure_dir(self.out_dir) self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 70adff4..a45274b 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,11 +8,7 @@ import h5py from common import load_utils from common.constants import ModalityType -<<<<<<< HEAD -from util import scan3r, scannet, arkit -======= -from util import scan3r, scannet, multiscan ->>>>>>> f86c782 (adding support for multiscan) +from util import scan3r, scannet, arkit, multiscan from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh index 5ff7fd5..3acdb4a 100644 --- a/scripts/preprocess/process_arkit.sh +++ b/scripts/preprocess/process_arkit.sh @@ -1,9 +1,9 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null diff --git a/trainer/grounding_trainer.py b/trainer/grounding_trainer.py index e0a40b2..7ee201c 100644 --- a/trainer/grounding_trainer.py +++ b/trainer/grounding_trainer.py @@ -1,5 +1,7 @@ +import os.path as osp from tqdm import tqdm from omegaconf import DictConfig +from safetensors.torch import load_file import torch from trainer.build import TRAINER_REGISTRY From 5f3f734fc371415ee453fb221759235b1c0a6685 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:23:39 +0530 Subject: [PATCH 12/18] 1d preprocessing changes --- preprocess/feat1D/arkit.py | 55 ++++++++++++++--------- preprocess/feat1D/multiscan.py | 82 +++++++++++++++------------------- preprocess/feat1D/scan3r.py | 52 ++++++++++++--------- preprocess/feat1D/scannet.py | 56 +++++++++++++---------- 4 files changed, 133 insertions(+), 112 deletions(-) diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py index efab03c..f03571a 100644 --- a/preprocess/feat1D/arkit.py +++ b/preprocess/feat1D/arkit.py @@ -2,7 +2,7 @@ import torch import numpy as np from tqdm import tqdm - +import os from common import load_utils from util import labelmap, arkit from util.arkit import ARKITSCENE_SCANNET @@ -52,35 +52,46 @@ def load_objects_for_scan(self, scan_id): return objects - - def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = self.load_objects_for_scan(scan_id) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + + scan_objects = self.load_objects_for_scan(scan_id) - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + 
scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): object_referral_embeddings = {} diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py index 58b9ff9..d96ad4e 100644 --- a/preprocess/feat1D/multiscan.py +++ b/preprocess/feat1D/multiscan.py @@ -2,7 +2,7 @@ import torch import numpy as np from tqdm import tqdm - +import os from common import load_utils from util import labelmap, multiscan @@ -45,59 +45,49 @@ def load_objects_for_scan(self, scan_id): return objects - def extractTextFeats(self, texts, return_text = False): - text_feats = [] - - for text in texts: - encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) - if encoded_text['input_ids'].shape[1] > 512: - continue - - with torch.no_grad(): - encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, - return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) - - text_feats.append({'text' : text, 'feat' : encoded_text}) - - if len(text_feats) == 0: - return None - - if return_text: - return text_feats - - text_feats = [text_feat['feat'] for text_feat in text_feats] - text_feats = np.concatenate(text_feats) - return text_feats - def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = self.load_objects_for_scan(scan_id) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + # print(npz_data) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + # print(objectID_to_labelID_map) + scan_objects = self.load_objects_for_scan(scan_id) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + 
scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + # Combine and save as npz + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) - def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): object_referral_embeddings = {} diff --git a/preprocess/feat1D/scan3r.py b/preprocess/feat1D/scan3r.py index 65fb6e9..fdd95d6 100644 --- a/preprocess/feat1D/scan3r.py +++ b/preprocess/feat1D/scan3r.py @@ -4,7 +4,7 @@ from common import load_utils from util import scan3r from typing import Dict, List, Union - +import os from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat1D.base import Base1DProcessor @@ -32,32 +32,42 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(scan_objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = 
np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, scan_objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat1D/scannet.py b/preprocess/feat1D/scannet.py index e49b8e0..df4ac99 100644 --- a/preprocess/feat1D/scannet.py +++ b/preprocess/feat1D/scannet.py @@ -1,7 +1,7 @@ import os.path as osp import torch import numpy as np - +import os from common import load_utils from util import scannet from typing import Dict, List, Union @@ -34,32 +34,42 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] - - object_referral_embeddings, scene_referral_embeddings = {}, None - if len(objects) != 0: - object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, objects, objectID_to_labelID_map) + pt_1d_path = osp.join(scene_out_dir, "data1D.pt") + if osp.exists(pt_1d_path): + pt_data=torch.load(pt_1d_path) + data1D['objects'] = pt_data['objects'] + data1D['scene'] = pt_data['scene'] + os.remove(pt_1d_path) + + else: + # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item() + objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, objects, objectID_to_labelID_map) - scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - - if len(scene_referrals) != 0: - if len(scene_referrals) > 10: - scene_referrals = 
np.random.choice(scene_referrals, size=10, replace=False) + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] - scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] - scene_referrals = ' '.join(scene_referrals) - scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) - assert scene_referral_embeddings is not None - - data1D = {} - data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} - data1D['scene'] = {'referral_embedding': scene_referral_embeddings} - - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: From 268d38ebf92f43e4c048318b70b97a73d648851f Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:40:02 +0530 Subject: [PATCH 13/18] 2d preprocessing changes --- preprocess/feat2D/arkit.py | 171 ++++++++++++++++++--------------- preprocess/feat2D/multiscan.py | 158 +++++++++++++++++------------- preprocess/feat2D/scan3r.py | 85 ++++++++-------- preprocess/feat2D/scannet.py | 76 ++++++++------- 4 files changed, 272 insertions(+), 218 deletions(-) diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py index 531b5b6..924a70a 100644 --- a/preprocess/feat2D/arkit.py +++ b/preprocess/feat2D/arkit.py @@ -12,7 +12,7 @@ from common import load_utils from util import render, arkit, visualisation from util import image as image_util - +import os from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat2D.base import Base2DProcessor @@ -56,95 +56,108 @@ def compute2DFeatures(self) -> None: def compute2DImagesAndSeg(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) - if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): - return - - objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") - if not osp.exists(objects_path): - raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") - - annotations = load_utils.load_json(objects_path) - ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) - instance_ids = ply_data['objectId'] - - mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id].keys(): 
- camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) + + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map + else: + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map - scene_folder = osp.join(self.data_dir, 'scans', scan_id) - if osp.exists(osp.join(scene_folder, 'gt-projection')): - shutil.rmtree(osp.join(scene_folder, 'gt-projection')) # save scene-level file for efficient loading - torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} + scene_folder = osp.join(self.data_dir, 'scans', scan_id) color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): - return - - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) - - # Multi-view Image -- Scene (Images + Embeddings) - frame_idxs = list(self.frame_pose_data[scan_id].keys()) - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = 
self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - for frame_idx in self.frame_pose_data[scan_id].keys(): - camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) - intrinsic_mat = camera_info['intrinsic_mat'] - break + pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + else: + # obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) - intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], - 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} - - # dummy floorplan - floorplan_dict = {'img' : None, 'embedding' : None} - data2D['scene']['floorplan'] = floorplan_dict - - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + 
np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) @@ -174,7 +187,8 @@ def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: data2D = {} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs} - torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -207,7 +221,8 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: - object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + # object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) object_image_votes = {} scan_id=scene_folder.split('/')[-1] # iterate over all frames diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py index d95239e..cb13475 100644 --- a/preprocess/feat2D/multiscan.py +++ b/preprocess/feat2D/multiscan.py @@ -5,7 +5,7 @@ from tqdm import tqdm from PIL import Image from scipy.spatial.transform import Rotation as R - +import os from common import load_utils from util import render, multiscan, visualisation from util import image as image_util @@ -67,80 +67,100 @@ def compute2DFeatures(self): def compute2DImagesAndSeg(self, scan_id): scene_folder = osp.join(self.data_dir, 'scenes', scan_id) - mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) - - ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) - instance_ids = ply_data['objectId'] + obj_id_imgs = {} - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + + else: + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = 
self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + # if osp.exists(osp.join(scene_folder, 'gt-projection')): + # shutil.rmtree(osp.join(scene_folder, 'gt-projection')) - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id]: - camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) - - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map - - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - # save scene-level file for efficient loading - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + # save scene-level file for efficient loading + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id): + data2D = {} + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) color_path = osp.join(scene_folder, 'sequence') - scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] - - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) - - # Multi-view Image -- Scene (Images + Embeddings) - frame_idxs = list(self.frame_pose_data[scan_id].keys()) - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) - intrinsic_mat = camera_info['intrinsic_mat'] - - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) - intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], - 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} - - # dummy floorplan - floorplan_dict = {'img' : None, 'embedding' : None} - data2D['scene']['floorplan'] = floorplan_dict - - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + 
pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + # print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + + else: + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh_old.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -170,7 +190,9 @@ def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): - object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + # object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} # iterate over all frames diff --git a/preprocess/feat2D/scan3r.py b/preprocess/feat2D/scan3r.py index 4927c97..1fe96d8 100644 --- a/preprocess/feat2D/scan3r.py +++ b/preprocess/feat2D/scan3r.py @@ -7,7 +7,7 @@ from scipy.spatial.transform import Rotation as R from omegaconf import DictConfig from typing import List, Dict, Tuple - +import os from common import load_utils from util import render, scan3r, visualisation from util import image as image_util @@ -55,44 +55,53 @@ def compute2DFeatures(self) -> None: self.compute2DFeaturesEachScan(scan_id) def compute2DImagesAndSeg(self, scan_id: str) -> None: - scene_folder = osp.join(self.data_dir, 'scans', scan_id) - mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) - - ply_data = 
scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) - instance_ids = ply_data['objectId'] - - camera_info = scan3r.load_intrinsics(scene_folder) - intrinsics = camera_info['intrinsic_mat'] - img_width = int(camera_info['width']) - img_height = int(camera_info['height']) - - mesh = o3d.io.read_triangle_mesh(mesh_file) - mesh_triangles = np.asarray(mesh.triangles) - colors = np.asarray(mesh.vertex_colors)*255.0 - colors = colors.round() - num_triangles = mesh_triangles.shape[0] - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - - # project 3D model - obj_id_imgs = {} - for frame_idx in self.frame_pose_data[scan_id]: - img_pose = self.frame_pose_data[scan_id][frame_idx] - img_pose_inv = np.linalg.inv(img_pose) - - obj_id_map = render.project_mesh3DTo2D_with_objectseg( - scene, intrinsics, img_pose_inv, img_width, img_height, - mesh_triangles, num_triangles, instance_ids - ) - obj_id_imgs[frame_idx] = obj_id_map + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) + obj_id_imgs = {} + gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt') + if osp.exists(gt_pt_path): + # print("using gt pt") + old_gt = torch.load(gt_pt_path) + for frame_idx in self.frame_pose_data[scan_id]: + obj_id_imgs[frame_idx] = old_gt[frame_idx] + os.remove(gt_pt_path) + + else: + ply_data = scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) + instance_ids = ply_data['objectId'] + + camera_info = scan3r.load_intrinsics(scene_folder) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map - - # save scene-level file for efficient loading - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + # save scene-level file for efficient loading + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) diff --git a/preprocess/feat2D/scannet.py b/preprocess/feat2D/scannet.py index 8c59354..c0bc412 100644 --- a/preprocess/feat2D/scannet.py +++ b/preprocess/feat2D/scannet.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os import imageio import skimage.transform as sktf from PIL import Image @@ -81,49 +81,57 @@ def renderShapeAndFloorplan(self, scene_folder: str, scene_out_folder: str, scan return render_img def 
compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} frame_idxs = list(self.frame_pose_data[scan_id].keys()) scene_folder = osp.join(self.data_dir, 'scans', scan_id) scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - + pt_2d_path = osp.join(scene_out_dir, 'data2D.pt') + if osp.exists(pt_2d_path): + print("using 2d pt") + pt_data=torch.load(pt_2d_path) + data2D['objects']=pt_data['objects'] + data2D['scene']=pt_data['scene'] + os.remove(pt_2d_path) + + else: # Floor-plan rendering - render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) - floorplan_embeddings = None - - if render_img is not None: - render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) - render_img_pt = self.model.base_tf(render_img) - floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) - - floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} + render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) + floorplan_embeddings = None - # Multi-view Image -- Object (Embeddings) - object_image_embeddings, object_image_votes_topK = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, frame_idxs) - - # Multi-view Image -- Scene (Images + Embeddings) - color_path = osp.join(scene_folder, 'data/color') - intrinsic_data = scannet.load_intrinsics(osp.join(self.data_dir, 'scans'), scan_id) - - pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeImageFeaturesEachScan(scan_id, color_path, frame_idxs) - - # Visualise - scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, scan_id + '_vh_clean_2.ply')) - intrinsics = { 'f' : intrinsic_data['intrinsic_mat'][0, 0], 'cx' : intrinsic_data['intrinsic_mat'][0, 2], 'cy' : intrinsic_data['intrinsic_mat'][1, 2], - 'w' : int(intrinsic_data['width']), 'h' : int(intrinsic_data['height'])} - - cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + if render_img is not None: + render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + render_img_pt = self.model.base_tf(render_img) + floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) + floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK = self.computeImageFeaturesAllObjectsEachScan(scene_folder, frame_idxs) - image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') - Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + # Multi-view Image -- Scene (Images + Embeddings) + color_path = osp.join(scene_folder, 'data/color') + intrinsic_data = scannet.load_intrinsics(osp.join(self.data_dir, 'scans'), scan_id) - data2D = {} - data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} - data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, - 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeImageFeaturesEachScan(scan_id, color_path, frame_idxs) - data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + # Visualise + scene_mesh = 
o3d.io.read_triangle_mesh(osp.join(scene_folder, scan_id + '_vh_clean_2.ply')) + intrinsics = { 'f' : intrinsic_data['intrinsic_mat'][0, 0], 'cx' : intrinsic_data['intrinsic_mat'][0, 2], 'cy' : intrinsic_data['intrinsic_mat'][1, 2], + 'w' : int(intrinsic_data['width']), 'h' : int(intrinsic_data['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + data2D['scene']['floorplan'] = floorplan_dict + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid From ef6a5e5d5758160de028661c9aed6e3358dc41d6 Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:44:10 +0530 Subject: [PATCH 14/18] 3d preprocessing changes --- preprocess/feat3D/arkit.py | 15 ++++++++++++--- preprocess/feat3D/multiscan.py | 17 +++++++++++++---- preprocess/feat3D/scan3r.py | 15 +++++++++++++-- preprocess/feat3D/scannet.py | 32 ++++++++++++++++++-------------- 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py index 6172204..67b79c0 100644 --- a/preprocess/feat3D/arkit.py +++ b/preprocess/feat3D/arkit.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os from common import load_utils from util import point_cloud, arkit from util.arkit import ARKITSCENE_SCANNET @@ -93,5 +93,14 @@ def compute3DFeaturesEachScan(self, scan_id): scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py index 68ba025..336ea3a 100644 --- a/preprocess/feat3D/multiscan.py +++ b/preprocess/feat3D/multiscan.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os from common import load_utils from util import point_cloud, multiscan from util.multiscan import MULTISCAN_SCANNET @@ -89,6 +89,15 @@ def compute3DFeaturesEachScan(self, scan_id): scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) 
- torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) - \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + + # Save as .npz files instead of .pt files + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) diff --git a/preprocess/feat3D/scan3r.py b/preprocess/feat3D/scan3r.py index 7b949ca..315bc97 100644 --- a/preprocess/feat3D/scan3r.py +++ b/preprocess/feat3D/scan3r.py @@ -43,6 +43,9 @@ def __init__(self, config_data: DictConfig, config_3D: DictConfig, split: str) - self.feature_extractor = self.loadFeatureExtractor(config_3D, "3D") def compute3DFeaturesEachScan(self, scan_id: str) -> None: + """ + Computes 3D features for a single scan. + """ ply_data = scan3r.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, self.label_filename) mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) @@ -79,5 +82,13 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/scannet.py b/preprocess/feat3D/scannet.py index e530195..13d1e5a 100644 --- a/preprocess/feat3D/scannet.py +++ b/preprocess/feat3D/scannet.py @@ -1,5 +1,5 @@ import os.path as osp - +import os import numpy as np import torch from omegaconf import DictConfig @@ -64,6 +64,10 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: mesh_points = mesh_vertices[:, 0:3] mesh_colors = mesh_vertices[:, 3:] + center_points = np.mean(mesh_points, axis=0) + center_points[2] = np.min(mesh_points[:, 2]) + mesh_points = mesh_points - center_points + text_file = mesh_file.replace('_vh_clean_2.labels.ply' , '.txt') with open(text_file, 'r') as file: for line in file: @@ -79,10 +83,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: if len(shape_annot) > 0: shape_annot = shape_annot[0] shape_annot_to_instance_map = scannet.get_cad_model_to_instance_mapping(instance_bboxes, shape_annot, meta_file, self.shape_dir) - - render_out_dir = osp.join(scene_out_dir, 'render') - load_utils.ensure_dir(render_out_dir) - + for instance_id in unique_instance_ids: if instance_id == self.undefined: continue @@ 
-98,11 +99,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: shape_annot_instance = shape_annot_to_instance_map[instance_id] object_cad_pcl = shape_annot_instance['points'] object_cad_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_cad_pcl) - - obj_verts, obj_faces, transform_shape = shape_annot_instance['verts'], shape_annot_instance['faces'], shape_annot_instance['transform_shape'] - # load_utils.ensure_dir(osp.join(render_out_dir, f'{instance_id}')) - # render.render_multiview_images(obj_verts, obj_faces, transform_shape, osp.join(render_out_dir, f'{instance_id}')) - + data3D = {} data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} data3D['scene'] = {'pcl_coords': mesh_points[instance_ids != self.undefined], 'pcl_feats': mesh_colors[instance_ids != self.undefined], 'scene_label' : scene_label} @@ -112,7 +109,14 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) assert len(list(object_id_to_label_id.keys())) >= len(list(object_cad_embeddings.keys())), 'CAD does not match for {}'.format(scan_id) - - - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + pt_data3d_path = osp.join(scene_out_dir, 'data3D.pt') + pt_map_path = osp.join(scene_out_dir, 'object_id_to_label_id_map.pt') + if osp.exists(pt_data3d_path): + os.remove(pt_data3d_path) + if osp.exists(pt_map_path): + os.remove(pt_map_path) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file From bbf08e655f2037c9509f93213500f676397bd26a Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:45:53 +0530 Subject: [PATCH 15/18] multimodal dumping changes --- preprocess/multimodal_preprocess.py | 34 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index a45274b..a6d9063 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -10,7 +10,7 @@ from common.constants import ModalityType from util import scan3r, scannet, arkit, multiscan from typing import Dict, Optional - +import os from preprocess.build import PROCESSOR_REGISTRY @PROCESSOR_REGISTRY.register() @@ -75,18 +75,20 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + # object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = np.load(osp.join(out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} if data3D is not None: - precomputed_feats[ModalityType.POINT] = 
data3D['objects']['pcl_embeddings'] - precomputed_feats[ModalityType.CAD] = data3D['objects']['cad_embeddings'] + precomputed_feats[ModalityType.POINT] = data3D['objects'].item()['pcl_embeddings'] + precomputed_feats[ModalityType.CAD] = data3D['objects'].item()['cad_embeddings'] if data2D is not None: - precomputed_feats[ModalityType.RGB] = data2D['objects']['image_embeddings'] + precomputed_feats[ModalityType.RGB] = data2D['objects'].item()['image_embeddings'] if data1D is not None: - precomputed_feats[ModalityType.REF] = data1D['objects']['referral_embeddings'] + precomputed_feats[ModalityType.REF] = data1D['objects'].item()['referral_embeddings'] object_ids = [] for modalityType in ModalityType.__dict__.values(): @@ -141,19 +143,27 @@ def prepareObjectWiseDataEachScan(self, 'object_id2idx' : object_id2idx, 'object_id_to_label_id_map' : object_id_to_label_id_map, 'object_ids' : object_ids, - 'topK_images_votes' : data2D['objects']['topK_images_votes'] + 'topK_images_votes' : data2D['objects'].item()['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + pt_multimodal_path = osp.join(out_dir, 'objectsDataMultimodal.pt') + if osp.exists(pt_multimodal_path): + os.remove(pt_multimodal_path) + # torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + # data1D = torch.load(osp.join(out_dir, 'data1D.pt')) + data1D = np.load(osp.join(out_dir, 'data1D.npz'),allow_pickle=True) + + # data2D = torch.load(osp.join(out_dir, 'data2D.pt')) + data2D = np.load(osp.join(out_dir, 'data2D.npz'),allow_pickle=True) + + # data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data3D = np.load(osp.join(out_dir, 'data3D.npz'),allow_pickle=True) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) From 25a3dd7cecf00464ed617dbc0d93f5cb8a34dd8c Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 11:50:29 +0530 Subject: [PATCH 16/18] dataset util changes for alignment --- util/arkit.py | 27 ++++++------ util/multiscan.py | 73 +++++++++++++++++++++++++------ util/scan3r.py | 106 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 175 insertions(+), 31 deletions(-) diff --git a/util/arkit.py b/util/arkit.py index c4e7593..0029b58 100644 --- a/util/arkit.py +++ b/util/arkit.py @@ -219,12 +219,15 @@ def load_ply_data(data_dir, scan_id, annotations): file = open(filename_in, 'rb') plydata = PlyData.read(file) file.close() + # plydata = trimesh.load(filename_in, process=False) vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + # vertices=plydata.vertices vertices = np.vstack(vertices).T vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] vertex_colors = np.vstack(vertex_colors).T - + # vertex_colors = plydata.visual.vertex_colors + # print("vertex_colors", vertex_colors.shape) vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('objectId', 'h')] @@ 
-260,24 +263,24 @@ def load_ply_data(data_dir, scan_id, annotations): vertices_structured['objectId'] = vertex_instance - # align_angle = calc_align_matrix(bbox_list) + align_angle = calc_align_matrix(bbox_list) - # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) if np.max(vertex_colors) <= 1: vertex_colors = vertex_colors * 255.0 - # center_points = np.mean(vertices_aligned, axis=0) - # center_points[2] = np.min(vertices_aligned[:, 2]) - # vertices_aligned = vertices_aligned - center_points + center_points = np.mean(vertices_aligned, axis=0) + center_points[2] = np.min(vertices_aligned[:, 2]) + vertices_aligned = vertices_aligned - center_points - # vertices_structured['x'] = vertices_aligned[:, 0] - # vertices_structured['y'] = vertices_aligned[:, 1] - # vertices_structured['z'] = vertices_aligned[:, 2] + vertices_structured['x'] = vertices_aligned[:, 0] + vertices_structured['y'] = vertices_aligned[:, 1] + vertices_structured['z'] = vertices_aligned[:, 2] - vertices_structured['x'] = plydata['vertex']['x'] - vertices_structured['y'] = plydata['vertex']['y'] - vertices_structured['z'] = plydata['vertex']['z'] + # vertices_structured['x'] = plydata['vertex']['x'] + # vertices_structured['y'] = plydata['vertex']['y'] + # vertices_structured['z'] = plydata['vertex']['z'] return vertices_structured diff --git a/util/multiscan.py b/util/multiscan.py index 8478a7d..9e14c03 100644 --- a/util/multiscan.py +++ b/util/multiscan.py @@ -6,6 +6,7 @@ import jsonlines import json import os +import pandas as pd MULTISCAN_SCANNET = { "wall": "wall", @@ -492,10 +493,35 @@ def get_scan_ids(dirname, split): scan_ids = np.genfromtxt(filepath, dtype = str) return scan_ids +def annotations_to_dataframe_obj(annotations): + objects = annotations['objects'] + df_list = [] + for obj in objects: + object_id = obj['objectId'] + object_label = obj['label'] + df_row = pd.DataFrame( + [[object_id, object_label]], + columns=['objectId', 'objectLabel'] + ) + df_list.append(df_row) + df = pd.concat(df_list) + return df + + def load_ply_data(data_dir, scan_id): """ Load PLY data and propagate object IDs from faces to vertices. + + Args: + data_dir (str): Directory containing the PLY file. + scan_id (str): Identifier for the scan. + + Returns: + np.ndarray: Vertex data with propagated object IDs. 
""" + with open(osp.join(data_dir, scan_id, f'{scan_id}.annotations.json'), "r", encoding='utf-8') as f: + annotations = json.load(f) + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) if not osp.exists(filename_in): @@ -511,6 +537,7 @@ def load_ply_data(data_dir, scan_id): red = np.array(ply_data['vertex']['red']) green = np.array(ply_data['vertex']['green']) blue = np.array(ply_data['vertex']['blue']) + triangles = np.vstack(ply_data['face'].data['vertex_indices']) # Extract normals if available if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: @@ -521,17 +548,36 @@ def load_ply_data(data_dir, scan_id): else: normals = None - # Initialize object IDs for vertices with a default undefined value - vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + vertex_object_ids = np.zeros((scene_vertices.shape[0])) # Extract face data - faces = ply_data['face'].data - face_vertex_indices = [face['vertex_indices'] for face in faces] - face_object_ids = [face['objectId'] for face in faces] + # faces = ply_data['face'].data + # face_vertex_indices = [face['vertex_indices'] for face in faces] + # face_object_ids = [face['objectId'] for face in faces] + + # # Propagate object IDs to vertices + # for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + # vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + object_ids = ply_data['face'].data['objectId'] + part_ids = ply_data['face'].data['partId'] + + semseg_df = pd.DataFrame({'objectId': object_ids, 'partId': part_ids}) + df = annotations_to_dataframe_obj(annotations) + for _, row in df.iterrows(): + object_id = row['objectId'] + assert object_id > 0, f"object id should be greater than 0, but got {object_id}" + + condition1 = semseg_df['objectId'] == object_id + tri_indices = semseg_df[condition1].index.values + object_vertices = np.unique(triangles[tri_indices]) + vertex_object_ids[object_vertices] = object_id + - # Propagate object IDs to vertices - for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): - vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face vertex_dtype = [ ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates @@ -543,10 +589,13 @@ def load_ply_data(data_dir, scan_id): vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals vertices = np.empty(len(x), dtype=vertex_dtype) - - vertices['x'] = x.astype('f4') - vertices['y'] = y.astype('f4') - vertices['z'] = z.astype('f4') + # Update scene vertices - assign x, y, z coordinates from scene_vertices + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') vertices['red'] = red.astype('u1') vertices['green'] = green.astype('u1') vertices['blue'] = blue.astype('u1') diff --git a/util/scan3r.py b/util/scan3r.py index 2727d5a..8fc33bb 100644 --- a/util/scan3r.py +++ b/util/scan3r.py @@ -3,15 +3,19 @@ from plyfile import PlyData from glob import glob import csv - +import json def get_scan_ids(dirname: str, split: str) -> np.ndarray: """Retrieve scan IDs for the given directory and split.""" 
filepath = osp.join(dirname, '{}_scans.txt'.format(split)) scan_ids = np.genfromtxt(filepath, dtype = str) return scan_ids -def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarray: - """Load PLY data from specified directory, scan ID, and label file.""" +def load_ply_data(data_dir, scan_id, label_file_name): + with open(osp.join(data_dir, scan_id, 'mesh.refined.0.010000.segs.v2.json'), "r", encoding='utf-8') as f: + segments = json.load(f) + with open(osp.join(data_dir, scan_id, 'semseg.v2.json'), "r", encoding='utf-8') as f: + aggregation = json.load(f) + filename_in = osp.join(data_dir, scan_id, label_file_name) file = open(filename_in, 'rb') ply_data = PlyData.read(file) @@ -31,9 +35,32 @@ def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarr vertices = np.empty(len(x), dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('objectId', 'h'), ('globalId', 'h'), ('NYU40', 'u1'), ('Eigen13', 'u1'), ('RIO27', 'u1')]) - vertices['x'] = x.astype('f4') - vertices['y'] = y.astype('f4') - vertices['z'] = z.astype('f4') + seg_group = aggregation['segGroups'] + bbox_list = [] + for i, _ in enumerate(seg_group): + rotation = np.array(seg_group[i]["obb"]["normalizedAxes"]).reshape(3, 3) + transform = np.array(seg_group[i]["obb"]["centroid"]).reshape(-1, 3) + scale = np.array(seg_group[i]["obb"]["axesLengths"]).reshape(-1, 3) + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + align_angle = calc_align_matrix(bbox_list) + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + scene_vertices = rotate_z_axis_by_degrees(np.array(scene_vertices), align_angle) + + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') vertices['red'] = red.astype('u1') vertices['green'] = green.astype('u1') vertices['blue'] = blue.astype('u1') @@ -136,4 +163,69 @@ def represents_int(s: str) -> bool: int(s) return True except ValueError: - return False \ No newline at end of file + return False + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = 
np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def compute_box_3d(size, center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) \ No newline at end of file From 58f0d8e653137df1431d3185baeea80ff7fb5c8e Mon Sep 17 00:00:00 2001 From: Gaurav Pradeep Date: Tue, 22 Apr 2025 12:00:23 +0530 Subject: [PATCH 17/18] scanbase changes to work with npz --- data/datasets/scanbase.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index b531e32..aa5d4a6 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -138,7 +138,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) # Point Cloud Data -- Scene - points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] + points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label'] feats /= 255. 
feats -= 0.5
@@ -152,9 +152,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True)
         coords, feats = points[sel], feats[sel]
 
-        # Get coords, shift to center
+        # Get coords, already zero centered during preprocessing
         coords = np.floor(coords / self.voxel_size)
-        coords-=coords.min(0)
+        # coords-=coords.min(0)
 
         # Object Data
         scene_dict = {}
@@ -185,7 +185,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
         scene_dict['scene_masks'] = {}
 
-        rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings'])
+        rgb_embedding = torch.from_numpy(scandata_2d['scene'].item()['scene_embeddings'])
         rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1)
         scene_dict['rgb_embedding'] = rgb_embedding
@@ -194,7 +194,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         scene_dict['scene_masks']['object'] = torch.Tensor([1.0])
 
         referral_mask = torch.Tensor([0.0])
-        referral_embedding = scandata_1d['scene']['referral_embedding']
+        referral_embedding = scandata_1d['scene'].item()['referral_embedding']
         if referral_embedding is not None:
             referral_embedding = torch.from_numpy(referral_embedding[0]['feat']).reshape(-1,)
@@ -202,7 +202,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         else:
             referral_embedding = torch.zeros((scene_dict['rgb_embedding'].shape[-1] // 4, ))
 
-        floorplan_embedding = scandata_2d['scene']['floorplan']['embedding']
+        floorplan_embedding = scandata_2d['scene'].item()['floorplan']['embedding']
         floorplan_mask = torch.Tensor([0.0])
         if floorplan_embedding is not None:
             floorplan_embedding = torch.from_numpy(floorplan_embedding[0, 0]).reshape(-1, )

From 736f72a63b1000de77e2690b6e19a85eac4c77e9 Mon Sep 17 00:00:00 2001
From: Gaurav Pradeep
Date: Tue, 29 Apr 2025 10:41:14 +0530
Subject: [PATCH 18/18] scanbase change to read npz instead of pt

---
 data/datasets/scanbase.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py
index aa5d4a6..2d054ec 100644
--- a/data/datasets/scanbase.py
+++ b/data/datasets/scanbase.py
@@ -131,11 +131,15 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
         scan_process_dir = osp.join(self.process_dir, 'scans', scan_id)
 
-        scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt'))
-
-        scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt'))
-        scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt'))
-        scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt'))
+        # scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt'))
+        scan_objects_data = np.load(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'), allow_pickle=True)
+
+        # scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt'))
+        scandata_1d = np.load(osp.join(scan_process_dir, 'data1D.npz'), allow_pickle=True)
+        # scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt'))
+        scandata_2d = np.load(osp.join(scan_process_dir, 'data2D.npz'), allow_pickle=True)
+        # scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt'))
+        scandata_3d = np.load(osp.join(scan_process_dir, 'data3D.npz'), allow_pickle=True)
 
         # Point Cloud Data -- Scene
         points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label']
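
For reference, the `.pt` to `.npz` convention these patches adopt stores each top-level dict (e.g. `objects`, `scene`) as a 0-d object array inside the archive, which is why the readers call `np.load(..., allow_pickle=True)` and then `.item()`. The snippet below is only an illustrative round-trip sketch of that convention; the file name, keys, and shapes are placeholders, not values taken from the repository.

```python
import numpy as np

# Nested dicts become 0-d object arrays inside the .npz archive, so loading
# needs allow_pickle=True and .item() to recover the original dict.
data2D = {
    'objects': {'image_embeddings': {1: np.zeros(3)}},   # illustrative per-object features
    'scene': {'scene_embeddings': np.zeros((5, 4))},     # illustrative per-frame features
}
np.savez_compressed('data2D.npz', **data2D)

loaded = np.load('data2D.npz', allow_pickle=True)
scene = loaded['scene'].item()            # 0-d object array -> plain dict
print(scene['scene_embeddings'].shape)    # (5, 4)
```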