[Tensornet backends] Implement contraction path reuse for expectation…

… value calculation (NVIDIA#2397) * Implement the contraction path reuse for tensornet expectation calculation For a multi-term spin_op, we set up a placeholder for the operator to be used for expectation contraction <psi|op|psi>. A single prepare (path finding) is needed. Then, we just replace the placeholder op with the actual Pauli term to compute the expectation value. Bonus: as we do term-per-term contraction, the per-term expectation value is now available for tensornet backends. Thus, populate per-term data in the resulting observe_result. This makes tensornet backends fully-equivalent to others in terms of observe handling. Re-enable a ObserveResult test, which check for fine-grained result access, for tensornet backends. This wasskipped previously as we only have the final expectation value. Signed-off-by: Thien Nguyen <[email protected]> * Code tidy up: remove temporary variable use Signed-off-by: Thien Nguyen <[email protected]> * Remove unnecessary casts Signed-off-by: Thien Nguyen <[email protected]> * Update runtime/nvqir/cutensornet/simulator_cutensornet.cpp Co-authored-by: Eric Schweitz <[email protected]> Signed-off-by: Thien Nguyen <[email protected]> * Code review: simplify the code to load pauli matrix to the placeholder array Signed-off-by: Thien Nguyen <[email protected]> --------- Signed-off-by: Thien Nguyen <[email protected]> Signed-off-by: Thien Nguyen <[email protected]> Co-authored-by: Eric Schweitz <[email protected]>
bmhowe23 · Nov 22, 2024 · 79e6999 · 79e6999
1 parent 4f2104b
commit 79e6999
Show file tree

Hide file tree

Showing 6 changed files with 175 additions and 134 deletions.
diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt
@@ -88,16 +88,6 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
   target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only)
   # Note: only tensornet backend supports MPI at cutensornet level (distributed tensor computation)
   target_link_libraries(nvqir-tensornet PRIVATE tensornet-mpi-util)
-
-  # Check if the cutensornet plugin lib for expectation value is provided
-  if(NOT CUDAQ_CUTENSORNET_PLUGIN_LIB AND EXISTS "$ENV{CUDAQ_CUTENSORNET_PLUGIN_LIB}")
-    set(CUDAQ_CUTENSORNET_PLUGIN_LIB "$ENV{CUDAQ_CUTENSORNET_PLUGIN_LIB}")
-  endif()
-  if (EXISTS "${CUDAQ_CUTENSORNET_PLUGIN_LIB}")
-    message(STATUS "CUDA Quantum tensornet backends: link additional external plugin ${CUDAQ_CUTENSORNET_PLUGIN_LIB}")
-    target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
-    target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
-  endif()
 else()
   message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.")
 endif()
diff --git a/runtime/nvqir/cutensornet/external_plugin.h b/runtime/nvqir/cutensornet/external_plugin.h
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -9,7 +9,6 @@
 #include "simulator_cutensornet.h"
 #include "cudaq.h"
 #include "cutensornet.h"
-#include "external_plugin.h"
 #include "tensornet_spin_op.h"
 
 namespace nvqir {
@@ -224,49 +223,68 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
   return counts;
 }
 
-bool SimulatorTensorNetBase::canHandleObserve() { return true; }
+bool SimulatorTensorNetBase::canHandleObserve() {
+  // Do not compute <H> from matrix if shots based sampling requested
+  // i.e., a valid shots count value was set.
+  // Note: -1 is also used to denote non-sampling execution. Hence, we need to
+  // check for this particular -1 value as being casted to an unsigned type.
+  if (executionContext && executionContext->shots > 0 &&
+      executionContext->shots != ~0ull) {
+    // This 'shots' mode is very slow for tensor network.
+    // However, we need to respect the shots option.
+    cudaq::info("[SimulatorTensorNetBase] Shots mode expectation calculation "
+                "is requested with {} shots.",
+                executionContext->shots);
 
-static nvqir::CutensornetExecutor *getPluginInstance() {
-  using GetPluginFunction = nvqir::CutensornetExecutor *(*)();
-  auto handle = dlopen(NULL, RTLD_LAZY);
-  GetPluginFunction fcn =
-      (GetPluginFunction)(intptr_t)dlsym(handle, "getCutnExecutor");
-  if (!fcn) {
-    cudaq::info("Externally provided cutensornet plugin not found.");
-    return nullptr;
+    return false;
   }
-
-  cudaq::info("Successfully loaded the cutensornet plugin.");
-  return fcn();
+  // Otherwise, perform exact expectation value calculation/contraction.
+  return true;
 }
 
 /// @brief Evaluate the expectation value of a given observable
 cudaq::observe_result
 SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
   LOG_API_TIME();
   prepareQubitTensorState();
-  auto *cutnExtension = getPluginInstance();
-  if (cutnExtension) {
-    const auto [terms, coeffs] = ham.get_raw_data();
-    const auto termExpVals =
-        cutnExtension->computeExpVals(m_cutnHandle, m_state->getInternalState(),
-                                      m_state->getNumQubits(), terms);
-    std::complex<double> expVal = 0.0;
-    for (std::size_t i = 0; i < terms.size(); ++i) {
-      expVal += (coeffs[i] * termExpVals[i]);
-    }
-    return cudaq::observe_result(expVal.real(), ham,
-                                 cudaq::sample_result(cudaq::ExecutionResult(
-                                     {}, ham.to_string(false), expVal.real())));
-  } else {
-    TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
-    std::complex<double> expVal =
-        m_state->computeExpVal(spinOp.getNetworkOperator());
-    expVal += spinOp.getIdentityTermOffset();
-    return cudaq::observe_result(expVal.real(), ham,
-                                 cudaq::sample_result(cudaq::ExecutionResult(
-                                     {}, ham.to_string(false), expVal.real())));
+
+  std::vector<std::string> termStrs;
+  std::vector<cudaq::spin_op::spin_op_term> terms;
+  std::vector<std::complex<double>> coeffs;
+  termStrs.reserve(ham.num_terms());
+  terms.reserve(ham.num_terms());
+  coeffs.reserve(ham.num_terms());
+
+  // Note: as the spin_op terms are stored as an unordered map, we need to
+  // iterate in one loop to collect all the data (string, symplectic data, and
+  // coefficient).
+  ham.for_each_term([&](cudaq::spin_op &term) {
+    termStrs.emplace_back(term.to_string(false));
+    auto [symplecticRep, coeff] = term.get_raw_data();
+    if (symplecticRep.size() != 1 || coeff.size() != 1)
+      throw std::runtime_error(fmt::format(
+          "Unexpected data encountered when iterating spin operator terms: "
+          "expecting a single term, got {} terms.",
+          symplecticRep.size()));
+    terms.emplace_back(symplecticRep[0]);
+    coeffs.emplace_back(coeff[0]);
+  });
+
+  // Compute the expectation value for all terms
+  const auto termExpVals = m_state->computeExpVals(terms);
+  std::complex<double> expVal = 0.0;
+  // Construct per-term data in the final observe_result
+  std::vector<cudaq::ExecutionResult> results;
+  results.reserve(terms.size());
+
+  for (std::size_t i = 0; i < terms.size(); ++i) {
+    expVal += (coeffs[i] * termExpVals[i]);
+    results.emplace_back(
+        cudaq::ExecutionResult({}, termStrs[i], termExpVals[i].real()));
   }
+
+  cudaq::sample_result perTermData(expVal.real(), results);
+  return cudaq::observe_result(expVal.real(), ham, perTermData);
 }
 
 nvqir::CircuitSimulator *SimulatorTensorNetBase::clone() { return nullptr; }

diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -525,22 +525,79 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff,
   return mpsTensors;
 }
 
-std::complex<double> TensorNetState::computeExpVal(
-    cutensornetNetworkOperator_t tensorNetworkOperator) {
-  cutensornetStateExpectation_t tensorNetworkExpectation;
+std::vector<std::complex<double>> TensorNetState::computeExpVals(
+    const std::vector<std::vector<bool>> &symplecticRepr) {
   LOG_API_TIME();
+  if (symplecticRepr.empty())
+    return {};
+
+  const std::size_t numQubits = getNumQubits();
+  const auto numSpinOps = symplecticRepr[0].size() / 2;
+  std::vector<std::complex<double>> allExpVals;
+  allExpVals.reserve(symplecticRepr.size());
+
+  constexpr int ALIGNMENT_BYTES = 256;
+  const int placeHolderArraySize = ALIGNMENT_BYTES * numQubits;
+
+  void *pauliMats_h = malloc(placeHolderArraySize);
+  void *pauliMats_d{nullptr};
+  HANDLE_CUDA_ERROR(cudaMalloc(&pauliMats_d, placeHolderArraySize));
+  std::vector<const void *> pauliTensorData;
+  std::vector<std::vector<int32_t>> stateModes;
+
+  for (std::size_t i = 0; i < numQubits; ++i) {
+    pauliTensorData.emplace_back(static_cast<char *>(pauliMats_d) +
+                                 ALIGNMENT_BYTES * i);
+    stateModes.emplace_back(std::vector<int32_t>{static_cast<int32_t>(i)});
+  }
+
+  const std::vector<int64_t> qubitDims(numQubits, 2);
+
+  // Initialize device mem for Pauli matrices
+  constexpr std::complex<double> PauliI_h[4] = {
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}};
+
+  constexpr std::complex<double> PauliX_h[4]{
+      {0.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
+
+  constexpr std::complex<double> PauliY_h[4]{
+      {0.0, 0.0}, {0.0, -1.0}, {0.0, 1.0}, {0.0, 0.0}};
+
+  constexpr std::complex<double> PauliZ_h[4]{
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}};
+
+  cutensornetNetworkOperator_t cutnNetworkOperator;
+
+  HANDLE_CUTN_ERROR(cutensornetCreateNetworkOperator(
+      m_cutnHandle, numQubits, qubitDims.data(), CUDA_C_64F,
+      &cutnNetworkOperator));
+
+  const std::vector<int32_t> numModes(pauliTensorData.size(), 1);
+  int64_t id;
+  std::vector<const int32_t *> dataStateModes;
+  for (const auto &stateMode : stateModes) {
+    dataStateModes.emplace_back(stateMode.data());
+  }
+  const cuDoubleComplex termCoeff{1.0, 0.0};
+  HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(
+      m_cutnHandle, cutnNetworkOperator, termCoeff, pauliTensorData.size(),
+      numModes.data(), dataStateModes.data(),
+      /*tensorModeStrides*/ nullptr, pauliTensorData.data(), &id));
 
   // Step 1: create
+  cutensornetStateExpectation_t tensorNetworkExpectation;
   {
     ScopedTraceWithContext("cutensornetCreateExpectation");
     HANDLE_CUTN_ERROR(cutensornetCreateExpectation(m_cutnHandle, m_quantumState,
-                                                   tensorNetworkOperator,
+                                                   cutnNetworkOperator,
                                                    &tensorNetworkExpectation));
   }
   // Step 2: configure
+  // Note: as we reuse this path across many contractions, use a higher number
+  // of hyper samples.
   const int32_t numHyperSamples =
-      8; // desired number of hyper samples used in the tensor network
-         // contraction path finder
+      512; // desired number of hyper samples used in the tensor network
+           // contraction path finder
   {
     ScopedTraceWithContext("cutensornetExpectationConfigure");
     HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(
@@ -555,9 +612,10 @@ std::complex<double> TensorNetState::computeExpVal(
       cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc));
   {
     ScopedTraceWithContext("cutensornetExpectationPrepare");
-    HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(
-        m_cutnHandle, tensorNetworkExpectation, scratchPad.scratchSize,
-        workDesc, /*cudaStream*/ 0));
+    HANDLE_CUTN_ERROR(
+        cutensornetExpectationPrepare(m_cutnHandle, tensorNetworkExpectation,
+                                      scratchPad.scratchSize, workDesc,
+                                      /*cudaStream*/ 0));
   }
 
   // Attach the workspace buffer
@@ -574,19 +632,57 @@ std::complex<double> TensorNetState::computeExpVal(
   }
 
   // Step 4: Compute
-  std::complex<double> expVal;
-  std::complex<double> stateNorm{0.0, 0.0};
-  {
-    ScopedTraceWithContext("cutensornetExpectationCompute");
-    HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
-        m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
-        static_cast<void *>(&stateNorm),
-        /*cudaStream*/ 0));
-  }
-  // Step 5: clean up
-  HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(tensorNetworkExpectation));
-  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
-  return expVal / std::abs(stateNorm);
+  for (const auto &term : symplecticRepr) {
+    bool allIdOps = true;
+    for (std::size_t i = 0; i < numQubits; ++i) {
+      // Memory address of this Pauli term in the placeholder array.
+      auto *address = static_cast<char *>(pauliMats_h) + i * ALIGNMENT_BYTES;
+      constexpr int PAULI_ARRAY_SIZE_BYTES = 4 * sizeof(std::complex<double>);
+      // The Pauli matrix data that we want to load to this slot.
+      // Default is the Identity matrix.
+      const std::complex<double> *pauliMatrixPtr = PauliI_h;
+      if (i < numSpinOps) {
+        if (term[i] && term[i + numSpinOps]) {
+          // Y
+          allIdOps = false;
+          pauliMatrixPtr = PauliY_h;
+        } else if (term[i]) {
+          // X
+          allIdOps = false;
+          pauliMatrixPtr = PauliX_h;
+        } else if (term[i + numSpinOps]) {
+          // Z
+          allIdOps = false;
+          pauliMatrixPtr = PauliZ_h;
+        }
+      }
+      // Copy the Pauli matrix data to the placeholder array at the appropriate
+      // slot.
+      std::memcpy(address, pauliMatrixPtr, PAULI_ARRAY_SIZE_BYTES);
+    }
+    if (allIdOps) {
+      allExpVals.emplace_back(1.0);
+    } else {
+      HANDLE_CUDA_ERROR(cudaMemcpy(pauliMats_d, pauliMats_h,
+                                   placeHolderArraySize,
+                                   cudaMemcpyHostToDevice));
+      std::complex<double> expVal;
+      std::complex<double> stateNorm{0.0, 0.0};
+      {
+        ScopedTraceWithContext("cutensornetExpectationCompute");
+        HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
+            m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
+            static_cast<void *>(&stateNorm),
+            /*cudaStream*/ 0));
+      }
+      allExpVals.emplace_back(expVal / std::abs(stateNorm));
+    }
+  }
+
+  free(pauliMats_h);
+  HANDLE_CUDA_ERROR(cudaFree(pauliMats_d));
+
+  return allExpVals;
 }
 
 std::unique_ptr<TensorNetState> TensorNetState::createFromMpsTensors(

diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -138,14 +138,11 @@ class TensorNetState {
                                       double relCutoff,
                                       cutensornetTensorSVDAlgo_t algo);
 
-  /// @brief  Compute the expectation value w.r.t. a
-  /// `cutensornetNetworkOperator_t`
-  ///
-  /// The `cutensornetNetworkOperator_t` can be constructed from
-  /// `cudaq::spin_op`, i.e., representing a sum of Pauli products with
-  /// different coefficients.
-  std::complex<double>
-  computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator);
+  /// @brief Compute the expectation value of an observable
+  /// @param symplecticRepr The symplectic representation of the observable
+  /// @return
+  std::vector<std::complex<double>>
+  computeExpVals(const std::vector<std::vector<bool>> &symplecticRepr);
 
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }

diff --git a/unittests/integration/observe_result_tester.cpp b/unittests/integration/observe_result_tester.cpp
@@ -26,7 +26,6 @@ struct deuteron_n3_ansatz {
   }
 };
 
-#ifndef CUDAQ_BACKEND_TENSORNET
 CUDAQ_TEST(ObserveResult, checkSimple) {
 
   using namespace cudaq::spin;
@@ -113,4 +112,3 @@ CUDAQ_TEST(ObserveResult, checkExpValBug) {
   EXPECT_NEAR(exp, .79, 1e-1);
 }
 #endif
-#endif