Skip to content

Commit

Permalink
[Tensornet backends] Implement contraction path reuse for expectation…
Browse files Browse the repository at this point in the history
… value calculation (NVIDIA#2397)

* Implement the contraction path reuse for tensornet expectation
calculation

For a multi-term spin_op,  we set up a placeholder for the operator to
be used for expectation contraction <psi|op|psi>.

A single prepare (path finding) is needed. Then, we just replace the
placeholder op with the actual Pauli term to compute the expectation
value.

Bonus: as we do term-per-term contraction, the per-term expectation
value is now available for tensornet backends. Thus, populate per-term
data in the resulting observe_result. This makes tensornet backends
fully-equivalent to others in terms of observe handling. Re-enable a ObserveResult test, which check for fine-grained result access, for tensornet backends. This wasskipped previously as we only have the final expectation value.

Signed-off-by: Thien Nguyen <[email protected]>

* Code tidy up: remove temporary variable use

Signed-off-by: Thien Nguyen <[email protected]>

* Remove unnecessary casts

Signed-off-by: Thien Nguyen <[email protected]>

* Update runtime/nvqir/cutensornet/simulator_cutensornet.cpp

Co-authored-by: Eric Schweitz <[email protected]>
Signed-off-by: Thien Nguyen <[email protected]>

* Code review: simplify the code to load pauli matrix to the placeholder array

Signed-off-by: Thien Nguyen <[email protected]>

---------

Signed-off-by: Thien Nguyen <[email protected]>
Signed-off-by: Thien Nguyen <[email protected]>
Co-authored-by: Eric Schweitz <[email protected]>
  • Loading branch information
1tnguyen and schweitzpgi authored Nov 22, 2024
1 parent 4f2104b commit 79e6999
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 134 deletions.
10 changes: 0 additions & 10 deletions runtime/nvqir/cutensornet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,6 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only)
# Note: only tensornet backend supports MPI at cutensornet level (distributed tensor computation)
target_link_libraries(nvqir-tensornet PRIVATE tensornet-mpi-util)

# Check if the cutensornet plugin lib for expectation value is provided
if(NOT CUDAQ_CUTENSORNET_PLUGIN_LIB AND EXISTS "$ENV{CUDAQ_CUTENSORNET_PLUGIN_LIB}")
set(CUDAQ_CUTENSORNET_PLUGIN_LIB "$ENV{CUDAQ_CUTENSORNET_PLUGIN_LIB}")
endif()
if (EXISTS "${CUDAQ_CUTENSORNET_PLUGIN_LIB}")
message(STATUS "CUDA Quantum tensornet backends: link additional external plugin ${CUDAQ_CUTENSORNET_PLUGIN_LIB}")
target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
endif()
else()
message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.")
endif()
58 changes: 0 additions & 58 deletions runtime/nvqir/cutensornet/external_plugin.h

This file was deleted.

86 changes: 52 additions & 34 deletions runtime/nvqir/cutensornet/simulator_cutensornet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "simulator_cutensornet.h"
#include "cudaq.h"
#include "cutensornet.h"
#include "external_plugin.h"
#include "tensornet_spin_op.h"

namespace nvqir {
Expand Down Expand Up @@ -224,49 +223,68 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
return counts;
}

bool SimulatorTensorNetBase::canHandleObserve() { return true; }
bool SimulatorTensorNetBase::canHandleObserve() {
// Do not compute <H> from matrix if shots based sampling requested
// i.e., a valid shots count value was set.
// Note: -1 is also used to denote non-sampling execution. Hence, we need to
// check for this particular -1 value as being casted to an unsigned type.
if (executionContext && executionContext->shots > 0 &&
executionContext->shots != ~0ull) {
// This 'shots' mode is very slow for tensor network.
// However, we need to respect the shots option.
cudaq::info("[SimulatorTensorNetBase] Shots mode expectation calculation "
"is requested with {} shots.",
executionContext->shots);

static nvqir::CutensornetExecutor *getPluginInstance() {
using GetPluginFunction = nvqir::CutensornetExecutor *(*)();
auto handle = dlopen(NULL, RTLD_LAZY);
GetPluginFunction fcn =
(GetPluginFunction)(intptr_t)dlsym(handle, "getCutnExecutor");
if (!fcn) {
cudaq::info("Externally provided cutensornet plugin not found.");
return nullptr;
return false;
}

cudaq::info("Successfully loaded the cutensornet plugin.");
return fcn();
// Otherwise, perform exact expectation value calculation/contraction.
return true;
}

/// @brief Evaluate the expectation value of a given observable
cudaq::observe_result
SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
LOG_API_TIME();
prepareQubitTensorState();
auto *cutnExtension = getPluginInstance();
if (cutnExtension) {
const auto [terms, coeffs] = ham.get_raw_data();
const auto termExpVals =
cutnExtension->computeExpVals(m_cutnHandle, m_state->getInternalState(),
m_state->getNumQubits(), terms);
std::complex<double> expVal = 0.0;
for (std::size_t i = 0; i < terms.size(); ++i) {
expVal += (coeffs[i] * termExpVals[i]);
}
return cudaq::observe_result(expVal.real(), ham,
cudaq::sample_result(cudaq::ExecutionResult(
{}, ham.to_string(false), expVal.real())));
} else {
TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
std::complex<double> expVal =
m_state->computeExpVal(spinOp.getNetworkOperator());
expVal += spinOp.getIdentityTermOffset();
return cudaq::observe_result(expVal.real(), ham,
cudaq::sample_result(cudaq::ExecutionResult(
{}, ham.to_string(false), expVal.real())));

std::vector<std::string> termStrs;
std::vector<cudaq::spin_op::spin_op_term> terms;
std::vector<std::complex<double>> coeffs;
termStrs.reserve(ham.num_terms());
terms.reserve(ham.num_terms());
coeffs.reserve(ham.num_terms());

// Note: as the spin_op terms are stored as an unordered map, we need to
// iterate in one loop to collect all the data (string, symplectic data, and
// coefficient).
ham.for_each_term([&](cudaq::spin_op &term) {
termStrs.emplace_back(term.to_string(false));
auto [symplecticRep, coeff] = term.get_raw_data();
if (symplecticRep.size() != 1 || coeff.size() != 1)
throw std::runtime_error(fmt::format(
"Unexpected data encountered when iterating spin operator terms: "
"expecting a single term, got {} terms.",
symplecticRep.size()));
terms.emplace_back(symplecticRep[0]);
coeffs.emplace_back(coeff[0]);
});

// Compute the expectation value for all terms
const auto termExpVals = m_state->computeExpVals(terms);
std::complex<double> expVal = 0.0;
// Construct per-term data in the final observe_result
std::vector<cudaq::ExecutionResult> results;
results.reserve(terms.size());

for (std::size_t i = 0; i < terms.size(); ++i) {
expVal += (coeffs[i] * termExpVals[i]);
results.emplace_back(
cudaq::ExecutionResult({}, termStrs[i], termExpVals[i].real()));
}

cudaq::sample_result perTermData(expVal.real(), results);
return cudaq::observe_result(expVal.real(), ham, perTermData);
}

nvqir::CircuitSimulator *SimulatorTensorNetBase::clone() { return nullptr; }
Expand Down
140 changes: 118 additions & 22 deletions runtime/nvqir/cutensornet/tensornet_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,22 +525,79 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff,
return mpsTensors;
}

std::complex<double> TensorNetState::computeExpVal(
cutensornetNetworkOperator_t tensorNetworkOperator) {
cutensornetStateExpectation_t tensorNetworkExpectation;
std::vector<std::complex<double>> TensorNetState::computeExpVals(
const std::vector<std::vector<bool>> &symplecticRepr) {
LOG_API_TIME();
if (symplecticRepr.empty())
return {};

const std::size_t numQubits = getNumQubits();
const auto numSpinOps = symplecticRepr[0].size() / 2;
std::vector<std::complex<double>> allExpVals;
allExpVals.reserve(symplecticRepr.size());

constexpr int ALIGNMENT_BYTES = 256;
const int placeHolderArraySize = ALIGNMENT_BYTES * numQubits;

void *pauliMats_h = malloc(placeHolderArraySize);
void *pauliMats_d{nullptr};
HANDLE_CUDA_ERROR(cudaMalloc(&pauliMats_d, placeHolderArraySize));
std::vector<const void *> pauliTensorData;
std::vector<std::vector<int32_t>> stateModes;

for (std::size_t i = 0; i < numQubits; ++i) {
pauliTensorData.emplace_back(static_cast<char *>(pauliMats_d) +
ALIGNMENT_BYTES * i);
stateModes.emplace_back(std::vector<int32_t>{static_cast<int32_t>(i)});
}

const std::vector<int64_t> qubitDims(numQubits, 2);

// Initialize device mem for Pauli matrices
constexpr std::complex<double> PauliI_h[4] = {
{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}};

constexpr std::complex<double> PauliX_h[4]{
{0.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};

constexpr std::complex<double> PauliY_h[4]{
{0.0, 0.0}, {0.0, -1.0}, {0.0, 1.0}, {0.0, 0.0}};

constexpr std::complex<double> PauliZ_h[4]{
{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}};

cutensornetNetworkOperator_t cutnNetworkOperator;

HANDLE_CUTN_ERROR(cutensornetCreateNetworkOperator(
m_cutnHandle, numQubits, qubitDims.data(), CUDA_C_64F,
&cutnNetworkOperator));

const std::vector<int32_t> numModes(pauliTensorData.size(), 1);
int64_t id;
std::vector<const int32_t *> dataStateModes;
for (const auto &stateMode : stateModes) {
dataStateModes.emplace_back(stateMode.data());
}
const cuDoubleComplex termCoeff{1.0, 0.0};
HANDLE_CUTN_ERROR(cutensornetNetworkOperatorAppendProduct(
m_cutnHandle, cutnNetworkOperator, termCoeff, pauliTensorData.size(),
numModes.data(), dataStateModes.data(),
/*tensorModeStrides*/ nullptr, pauliTensorData.data(), &id));

// Step 1: create
cutensornetStateExpectation_t tensorNetworkExpectation;
{
ScopedTraceWithContext("cutensornetCreateExpectation");
HANDLE_CUTN_ERROR(cutensornetCreateExpectation(m_cutnHandle, m_quantumState,
tensorNetworkOperator,
cutnNetworkOperator,
&tensorNetworkExpectation));
}
// Step 2: configure
// Note: as we reuse this path across many contractions, use a higher number
// of hyper samples.
const int32_t numHyperSamples =
8; // desired number of hyper samples used in the tensor network
// contraction path finder
512; // desired number of hyper samples used in the tensor network
// contraction path finder
{
ScopedTraceWithContext("cutensornetExpectationConfigure");
HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(
Expand All @@ -555,9 +612,10 @@ std::complex<double> TensorNetState::computeExpVal(
cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc));
{
ScopedTraceWithContext("cutensornetExpectationPrepare");
HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(
m_cutnHandle, tensorNetworkExpectation, scratchPad.scratchSize,
workDesc, /*cudaStream*/ 0));
HANDLE_CUTN_ERROR(
cutensornetExpectationPrepare(m_cutnHandle, tensorNetworkExpectation,
scratchPad.scratchSize, workDesc,
/*cudaStream*/ 0));
}

// Attach the workspace buffer
Expand All @@ -574,19 +632,57 @@ std::complex<double> TensorNetState::computeExpVal(
}

// Step 4: Compute
std::complex<double> expVal;
std::complex<double> stateNorm{0.0, 0.0};
{
ScopedTraceWithContext("cutensornetExpectationCompute");
HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
static_cast<void *>(&stateNorm),
/*cudaStream*/ 0));
}
// Step 5: clean up
HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(tensorNetworkExpectation));
HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
return expVal / std::abs(stateNorm);
for (const auto &term : symplecticRepr) {
bool allIdOps = true;
for (std::size_t i = 0; i < numQubits; ++i) {
// Memory address of this Pauli term in the placeholder array.
auto *address = static_cast<char *>(pauliMats_h) + i * ALIGNMENT_BYTES;
constexpr int PAULI_ARRAY_SIZE_BYTES = 4 * sizeof(std::complex<double>);
// The Pauli matrix data that we want to load to this slot.
// Default is the Identity matrix.
const std::complex<double> *pauliMatrixPtr = PauliI_h;
if (i < numSpinOps) {
if (term[i] && term[i + numSpinOps]) {
// Y
allIdOps = false;
pauliMatrixPtr = PauliY_h;
} else if (term[i]) {
// X
allIdOps = false;
pauliMatrixPtr = PauliX_h;
} else if (term[i + numSpinOps]) {
// Z
allIdOps = false;
pauliMatrixPtr = PauliZ_h;
}
}
// Copy the Pauli matrix data to the placeholder array at the appropriate
// slot.
std::memcpy(address, pauliMatrixPtr, PAULI_ARRAY_SIZE_BYTES);
}
if (allIdOps) {
allExpVals.emplace_back(1.0);
} else {
HANDLE_CUDA_ERROR(cudaMemcpy(pauliMats_d, pauliMats_h,
placeHolderArraySize,
cudaMemcpyHostToDevice));
std::complex<double> expVal;
std::complex<double> stateNorm{0.0, 0.0};
{
ScopedTraceWithContext("cutensornetExpectationCompute");
HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
static_cast<void *>(&stateNorm),
/*cudaStream*/ 0));
}
allExpVals.emplace_back(expVal / std::abs(stateNorm));
}
}

free(pauliMats_h);
HANDLE_CUDA_ERROR(cudaFree(pauliMats_d));

return allExpVals;
}

std::unique_ptr<TensorNetState> TensorNetState::createFromMpsTensors(
Expand Down
13 changes: 5 additions & 8 deletions runtime/nvqir/cutensornet/tensornet_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,11 @@ class TensorNetState {
double relCutoff,
cutensornetTensorSVDAlgo_t algo);

/// @brief Compute the expectation value w.r.t. a
/// `cutensornetNetworkOperator_t`
///
/// The `cutensornetNetworkOperator_t` can be constructed from
/// `cudaq::spin_op`, i.e., representing a sum of Pauli products with
/// different coefficients.
std::complex<double>
computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator);
/// @brief Compute the expectation value of an observable
/// @param symplecticRepr The symplectic representation of the observable
/// @return
std::vector<std::complex<double>>
computeExpVals(const std::vector<std::vector<bool>> &symplecticRepr);

/// @brief Number of qubits that this state represents.
std::size_t getNumQubits() const { return m_numQubits; }
Expand Down
2 changes: 0 additions & 2 deletions unittests/integration/observe_result_tester.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ struct deuteron_n3_ansatz {
}
};

#ifndef CUDAQ_BACKEND_TENSORNET
CUDAQ_TEST(ObserveResult, checkSimple) {

using namespace cudaq::spin;
Expand Down Expand Up @@ -113,4 +112,3 @@ CUDAQ_TEST(ObserveResult, checkExpValBug) {
EXPECT_NEAR(exp, .79, 1e-1);
}
#endif
#endif

0 comments on commit 79e6999

Please sign in to comment.