utiasASRL · shemshesh · Aug 14, 2024 · Aug 14, 2024 · Aug 14, 2024 · Sep 23, 2024
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.pt filter=lfs diff=lfs merge=lfs -text
diff --git a/config/nerian_learned_features_extras.yaml b/config/nerian_learned_features_extras.yaml
@@ -2,6 +2,7 @@
   ros__parameters:
     log_enabled:
      - stereo.learned_features
+    #  - tactic.module
 
     ############ tactic configuration ############
     tactic:
@@ -32,8 +33,7 @@
           learned:
             # we're providing the surf settings (don't change this param, use a different file)
             type: "LEARNED_FEATURE"
-            modelPath: "/home/alec/ASRL/vtr3/models/unsup_unet_3.pt"
-
+            modelPath: "${VTRROOT}/models/weights.pt"
 
             stereoDisparityMinimum: 0.1
             stereoDisparityMaximum: 100.0

diff --git a/main/src/vtr_vision/CMakeLists.txt b/main/src/vtr_vision/CMakeLists.txt
@@ -5,6 +5,8 @@ project(vtr_vision)
 #   add_compile_options(-Wall -Wextra -Wpedantic)
 # endif()
 
+# Debug symbols are off by default; uncomment the line below to build with -g -Og.
+# add_compile_options(-g -Og)
 
 
 # Common setup for vtr packages

diff --git a/main/src/vtr_vision/src/features/extractor/base_feature_extractor.cpp b/main/src/vtr_vision/src/features/extractor/base_feature_extractor.cpp
@@ -170,9 +170,10 @@ ChannelFeatures BFE::extractChannelFeatures(const ChannelImages &channel,
 ChannelFeatures BFE::extractChannelFeaturesDisp(
     const ChannelImages &channel, const ChannelImages &channel_disp, 
     bool fully_matched = false) {
-  if (fully_matched && channel.cameras.size() == 2)
-    // return extractStereoFeaturesDisp(channel, channel_disp);
+  if (fully_matched && channel.cameras.size() == 2){
+    // CLOG(DEBUG, "stereo.learned_features") << "made it here";
     return extractStereoFeaturesDisp(channel, channel_disp);
+  }
 
   ChannelFeatures features;
   features.name = channel.name;

diff --git a/main/src/vtr_vision/src/features/extractor/learned_feature_extractor.cpp b/main/src/vtr_vision/src/features/extractor/learned_feature_extractor.cpp
@@ -42,14 +42,13 @@ torch::Tensor getKeypointDisparities(torch::Tensor disparity,
 
   namespace F = torch::nn::functional;
   auto options = F::GridSampleFuncOptions().mode(
-                     torch::kBilinear).padding_mode(torch::kBorder).align_corners(false);
+                     torch::kNearest).padding_mode(torch::kBorder).align_corners(false);
 
 
-  CLOG(INFO, "stereo.learned_features") << "disparity:" << disparity.sizes();
-  CLOG(INFO, "stereo.learned_features") << "kp_norm:" << keypoints_norm.sizes();
-
+  auto output = F::grid_sample(disparity.contiguous(), keypoints_norm.contiguous(), options).reshape({-1});
 
-  return F::grid_sample(disparity, keypoints_norm, options).reshape({-1});
+  return output;
+  // return F::grid_sample(disparity, keypoints_norm, options).reshape({-1});
 
 }
 
@@ -232,23 +231,20 @@ torch::Tensor LFE::getDisparity(const cv::Mat& left, const cv::Mat& right,
 ////////////////////////////////////////////////////////////////////////////////
 torch::Tensor LFE::getDisparityTensor(const cv::Mat& disp) {
 
-  float disparity_multiplier = 1.0f;
-  if (disp.type() == CV_16S) {
-    disparity_multiplier = 16.0f;
-  }
-  cv::Mat floatDisp;
-  disp.convertTo(floatDisp, CV_32F, 1.0f / disparity_multiplier);
+  // CLOG(DEBUG, "stereo.learned_features") << "disp_type " << disp.type();
+  // float disparity_multiplier = 16.0f;
+
+  // cv::Mat floatDisp;
+  // disp.convertTo(floatDisp, CV_32F, 1.0f / disparity_multiplier);
 
   //Crop the image
-  cv::Mat disp_cropped;
-  floatDisp.copyTo(disp_cropped);
   //floatDisp(cv::Rect(48, 0, 464, 384)).copyTo(disp_cropped);
 
   // Convert the cv image to a tensor
-  torch::Tensor disp_tensor = torch::from_blob(disp_cropped.data, 
-                                              {disp_cropped.rows, 
-                                               disp_cropped.cols, 1},  
-                                               torch::kFloat); 
+  torch::Tensor disp_tensor = torch::from_blob(disp.data, 
+                                              {disp.rows, 
+                                               disp.cols, 1},  
+                                               torch::kInt16).toType(torch::kFloat) / 16.0f; 
 
   // torch::Tensor disp_tensor = torch::from_blob(floatDisp.data, 
   //                                             {floatDisp.rows, 
@@ -258,7 +254,7 @@ torch::Tensor LFE::getDisparityTensor(const cv::Mat& disp) {
   disp_tensor = disp_tensor.permute({(2), (0), (1)});
   disp_tensor.unsqueeze_(0);
 
-  return disp_tensor;
+  return disp_tensor.contiguous();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -337,7 +333,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
 
   // we're about to use the gpu, lock
   std::unique_lock<std::mutex> lock(gpu_mutex_);
-
+  torch::NoGradGuard no_grad;
+
   // Convert the cv image to a tensor
   torch::Tensor image_tensor = torch::from_blob(image_cropped.data, 
                                                {image_cropped.rows, 
@@ -545,7 +542,7 @@ ChannelFeatures LFE::learnedFeaturesToStereoKeypoints(
   auto point_desc_tensor_ptr = point_desc_valid.contiguous().data_ptr<float>();
 
   left_feat.descriptors = cv::Mat(num_valid, descriptor_size, CV_32F, 
-                                  point_desc_tensor_ptr);
+                                  point_desc_tensor_ptr).clone();
 
   return channel;
 }
@@ -639,6 +636,10 @@ ChannelFeatures LFE::extractStereoFeaturesDisp(const cv::Mat &left_img,
 
   // Get disparity for each keypoint
   torch::Tensor disparity = getDisparityTensor(disp);
+
+  // // torch::Tensor disparity_testing = torch::zeros({1,1,377,512});
+  // torch::Tensor disparity_testing = disparity.clone();
+
   torch::Tensor point_disparities = getKeypointDisparities(disparity, 
                                                            keypoints);
   // return channel;

diff --git a/main/src/vtr_vision/src/modules/preprocessing/image_triangulation_module.cpp b/main/src/vtr_vision/src/modules/preprocessing/image_triangulation_module.cpp
@@ -110,7 +110,7 @@ void ImageTriangulationModule::run_(tactic::QueryCache &qdata0, tactic::OutputCa
       auto num_keypoints = channel.cameras[0].keypoints.size();
 
       // copy the descriptor info from the feature.
-      landmarks.appearance.descriptors = channel.cameras[0].descriptors.clone();
+      landmarks.appearance.descriptors = channel.cameras[0].descriptors;
       landmarks.appearance.feat_infos = channel.cameras[0].feat_infos;
       landmarks.appearance.feat_type = channel.cameras[0].feat_type;
       landmarks.appearance.name = channel.cameras[0].name;

diff --git a/models/learned_visual_features.pt b/models/learned_visual_features.pt