Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hotfix(message/validation): optimize signer state memory usage #1874

Open
wants to merge 23 commits into
base: stage
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions message/validation/committee_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package validation

import (
"github.com/attestantio/go-eth2-client/spec/phase0"
spectypes "github.com/ssvlabs/ssv-spec/types"
)

// CommitteeInfo carries the immutable identity and membership of a committee,
// plus precomputed lookups used on the hot message-validation path.
type CommitteeInfo struct {
	committeeID      spectypes.CommitteeID        // unique identifier of the committee
	committee        []spectypes.OperatorID       // operators in committee order; index = signer index
	signerIndices    map[spectypes.OperatorID]int // reverse lookup: operator -> position in committee
	validatorIndices []phase0.ValidatorIndex      // validators managed by this committee
}

// newCommitteeInfo builds a CommitteeInfo for the given committee, precomputing
// the operator -> committee-index map so signer lookups are O(1) during validation.
// The operators and validatorIndices slices are stored as-is (not copied).
func newCommitteeInfo(
	committeeID spectypes.CommitteeID,
	operators []spectypes.OperatorID,
	validatorIndices []phase0.ValidatorIndex,
) CommitteeInfo {
	// Pre-size the map to the known committee size to avoid rehashing;
	// this PR is memory-focused and these are allocated per committee.
	signerIndices := make(map[spectypes.OperatorID]int, len(operators))
	for i, operator := range operators {
		signerIndices[operator] = i
	}

	return CommitteeInfo{
		committeeID:      committeeID,
		committee:        operators,
		signerIndices:    signerIndices,
		validatorIndices: validatorIndices,
	}
}

// signerIndex returns the position of signer within the committee slice.
// NOTE: a signer absent from the map yields the zero value 0, which aliases
// the first operator — callers must have already rejected unknown signers
// (see ErrSignerNotInCommittee). The method is kept for readability.
func (ci *CommitteeInfo) signerIndex(signer spectypes.OperatorID) int {
	return ci.signerIndices[signer] // existence must be checked by ErrSignerNotInCommittee
}
2 changes: 1 addition & 1 deletion message/validation/common_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (mv *messageValidator) messageLateness(slot phase0.Slot, role spectypes.Run
case spectypes.RoleProposer, spectypes.RoleSyncCommitteeContribution:
ttl = 1 + lateSlotAllowance
case spectypes.RoleCommittee, spectypes.RoleAggregator:
ttl = phase0.Slot(mv.netCfg.Beacon.SlotsPerEpoch()) + lateSlotAllowance
ttl = MaxStoredSlots(mv.netCfg)
case spectypes.RoleValidatorRegistration, spectypes.RoleVoluntaryExit:
return 0
}
Expand Down
43 changes: 18 additions & 25 deletions message/validation/consensus_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,29 @@ import (
"sync"

"github.com/attestantio/go-eth2-client/spec/phase0"
spectypes "github.com/ssvlabs/ssv-spec/types"
)

// consensusID uniquely identifies a public key and role pair to keep track of state.
type consensusID struct {
DutyExecutorID string
Role spectypes.RunnerRole
}

// consensusState keeps track of the signers for a given public key and role.
type consensusState struct {
state map[spectypes.OperatorID]*OperatorState
// ValidatorState keeps track of the signers for a given public key and role.
type ValidatorState struct {
operators []*OperatorState
storedSlotCount phase0.Slot
mu sync.Mutex
}

func (cs *consensusState) GetOrCreate(signer spectypes.OperatorID) *OperatorState {
func (cs *ValidatorState) Signer(idx int) *OperatorState {
cs.mu.Lock()
defer cs.mu.Unlock()

if _, ok := cs.state[signer]; !ok {
cs.state[signer] = newOperatorState(cs.storedSlotCount)
if cs.operators[idx] == nil {
cs.operators[idx] = newOperatorState(cs.storedSlotCount)
}

return cs.state[signer]
return cs.operators[idx]
}

type OperatorState struct {
mu sync.RWMutex
state []*SignerState // the slice index is slot % storedSlotCount
mu sync.Mutex
signers []*SignerState // the slice index is slot % storedSlotCount
maxSlot phase0.Slot
maxEpoch phase0.Epoch
lastEpochDuties uint64
Expand All @@ -42,15 +35,15 @@ type OperatorState struct {

// newOperatorState allocates an OperatorState whose signer ring buffer holds
// `size` slots (entries are addressed by slot % size).
func newOperatorState(size phase0.Slot) *OperatorState {
	ring := make([]*SignerState, size)
	return &OperatorState{signers: ring}
}

func (os *OperatorState) Get(slot phase0.Slot) *SignerState {
os.mu.RLock()
defer os.mu.RUnlock()
os.mu.Lock()
defer os.mu.Unlock()

s := os.state[(uint64(slot) % uint64(len(os.state)))]
s := os.signers[(uint64(slot) % uint64(len(os.signers)))]
if s == nil || s.Slot != slot {
return nil
}
Expand All @@ -62,7 +55,7 @@ func (os *OperatorState) Set(slot phase0.Slot, epoch phase0.Epoch, state *Signer
os.mu.Lock()
defer os.mu.Unlock()

os.state[uint64(slot)%uint64(len(os.state))] = state
os.signers[uint64(slot)%uint64(len(os.signers))] = state
if slot > os.maxSlot {
os.maxSlot = slot
}
Expand All @@ -76,15 +69,15 @@ func (os *OperatorState) Set(slot phase0.Slot, epoch phase0.Epoch, state *Signer
}

func (os *OperatorState) MaxSlot() phase0.Slot {
os.mu.RLock()
defer os.mu.RUnlock()
os.mu.Lock()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there a specific reason for replacing the RWMutex with a write-only Mutex?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oleg-ssvlabs RWMutex consumes more memory, and the OperatorState memory consumption seems to be a bottleneck in the exporter. I'd use RWMutex here only if we benchmark it and see a significant improvement

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, interesting. Do you happen to have any numbers for comparison? It would be really compelling to see the difference (specifically between mutex and rwMutex)

Copy link
Contributor Author

@nkryuchkov nkryuchkov Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's 64 vs 80 bytes for each OperatorState IIRC, not much of a difference, but I just wanted to squeeze everything out of the state structure because we allocate a lot of them: for each validator for each role for each operator. So I guess on mainnet the total difference would be a few tens of megabytes (~60K validators * 4 roles * ~5-6 avg committee size * 64 vs 80), which is not very much.

I agree that RWMutex would reduce mutex block time, but I think the difference wouldn't be very big. Generally it looks to me like a trade-off, and since we're currently fighting exporter memory issues, I tend to prefer reducing memory use.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oleg-ssvlabs actually, perhaps we could remove this mutex as @moshe-blox suggested in #2034 (comment). We have validation lock by msg ID, and the message validation doesn't have concurrent checks, so we shouldn't have any data race in OperatorState and ValidatorState

defer os.mu.Unlock()

return os.maxSlot
}

func (os *OperatorState) DutyCount(epoch phase0.Epoch) uint64 {
os.mu.RLock()
defer os.mu.RUnlock()
os.mu.Lock()
defer os.mu.Unlock()

if epoch == os.maxEpoch {
return os.lastEpochDuties
Expand Down
10 changes: 5 additions & 5 deletions message/validation/consensus_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func TestOperatorState(t *testing.T) {
size := phase0.Slot(10)
os := newOperatorState(size)
require.NotNil(t, os)
require.Equal(t, len(os.state), int(size))
require.Equal(t, len(os.signers), int(size))
})

t.Run("TestGetAndSet", func(t *testing.T) {
Expand Down Expand Up @@ -58,9 +58,9 @@ func TestOperatorState(t *testing.T) {

slot := phase0.Slot(5)
epoch := phase0.Epoch(1)
signerState := &SignerState{Slot: slot}
signerState1 := &SignerState{Slot: slot}

os.Set(slot, epoch, signerState)
os.Set(slot, epoch, signerState1)

require.Equal(t, os.DutyCount(epoch), uint64(1))
require.Equal(t, os.DutyCount(epoch-1), uint64(0))
Expand All @@ -82,9 +82,9 @@ func TestOperatorState(t *testing.T) {

slot := phase0.Slot(5)
epoch := phase0.Epoch(1)
signerState := &SignerState{Slot: slot}
signerState1 := &SignerState{Slot: slot}

os.Set(slot, epoch, signerState)
os.Set(slot, epoch, signerState1)
require.Equal(t, os.DutyCount(epoch), uint64(1))

slot2 := phase0.Slot(6)
Expand Down
Loading