Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindings/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module morph-l2/bindings

go 1.24.0

replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a
replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577

replace github.com/morph-l2/go-ethereum => github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc

Expand Down
2 changes: 1 addition & 1 deletion common/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module morph-l2/common

go 1.24.0

replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a
replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577

replace github.com/morph-l2/go-ethereum => github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc

Expand Down
2 changes: 1 addition & 1 deletion contracts/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module morph-l2/contract

go 1.24.0

replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a
replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577

replace github.com/morph-l2/go-ethereum => github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc

Expand Down
18 changes: 9 additions & 9 deletions node/cmd/node/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func L2NodeMain(ctx *cli.Context) error {
if haService != nil {
ha = haService
}
tmNode, err = sequencer.SetupNode(tmCfg, tmVal, executor, nodeConfig.Logger, verifier, signer, ha)
tmNode, err = sequencer.SetupNode(tmCfg, tmVal, executor, nodeConfig.Logger, verifier, tracker, signer, ha)
if err != nil {
return fmt.Errorf("failed to setup consensus node: %v", err)
}
Expand Down Expand Up @@ -314,14 +314,7 @@ func initL1SequencerComponents(
seqPrivKeyHex := ctx.GlobalString(flags.SequencerPrivateKey.Name)
enclaveSignerAddr := ctx.GlobalString(flags.SequencerEnclaveSignerAddr.Name)

// Initialize L1 Tracker
tracker := l1sequencer.NewL1Tracker(context.Background(), l1Client, lagThreshold, logger)
if err := tracker.Start(); err != nil {
return nil, nil, nil, fmt.Errorf("failed to start L1 tracker: %w", err)
}
logger.Info("L1 Tracker started", "lagThreshold", lagThreshold)

// Initialize Sequencer Verifier
// Initialize Sequencer Verifier (built before the tracker, which refreshes it on L1 recovery).
var verifier *l1sequencer.SequencerVerifier
if contractAddr != (common.Address{}) {
caller, err := bindings.NewL1SequencerCaller(contractAddr, l1Client)
Expand All @@ -337,6 +330,13 @@ func initL1SequencerComponents(
return nil, nil, nil, fmt.Errorf("L1 Sequencer contract address is required, check l1.sequencerContract configuration")
}

// Initialize L1 Tracker (health gate: halts production/sync when L1 is stale).
tracker := l1sequencer.NewL1Tracker(context.Background(), l1Client, verifier, lagThreshold, logger)
if err := tracker.Start(); err != nil {
return nil, nil, nil, fmt.Errorf("failed to start L1 tracker: %w", err)
}
logger.Info("L1 Tracker started", "warnLag", lagThreshold)

// Initialize Signer (optional). Three mutually exclusive modes:
// 1) sequencer.privateKey set → LocalSigner (plaintext key in node memory)
// 2) sequencer.enclaveSignerAddr → EnclaveSigner (vsock to Nitro Enclave; key never in node)
Expand Down
2 changes: 1 addition & 1 deletion node/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module morph-l2/node

go 1.24.0

replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a
replace github.com/tendermint/tendermint => github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577

replace github.com/morph-l2/go-ethereum => github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc

Expand Down
4 changes: 2 additions & 2 deletions node/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lN
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc h1:2Umr8WRDBKwCgGrQQ8yCdhn71bCuMJuecId2ClK80DU=
github.com/morph-l2/go-ethereum v0.0.0-20260608072528-fe02cc1f10bc/go.mod h1:nkVzHjQWCOjvukQW8ittlwX+Xz9gmVHrP7mUi7zoHTs=
github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a h1:TeuQHBpIpH2/Z8jX9sZLtB0+4mwLBfKfII7BD/J5XME=
github.com/morph-l2/tendermint v0.0.0-20260602085346-ee68e1bcf49a/go.mod h1:qpiwqfcCB89dBYfqVJOc/HjGxDp3OdDlthgttJJYyRs=
github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577 h1:u1F8xG9X23TIE/zYeVMOY1BiHPGT9pcfpIoMz2kyrJY=
github.com/morph-l2/tendermint v0.3.8-0.20260612101929-7222401d6577/go.mod h1:qpiwqfcCB89dBYfqVJOc/HjGxDp3OdDlthgttJJYyRs=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
Expand Down
32 changes: 28 additions & 4 deletions node/hakeeper/ha_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ const (
raftInfiniteTimeout = 0 // wait forever
raftMaxConnPool = 10
raftSnapshots = 1 // snapshot data is trivial (8 bytes); keep 1 for log compaction

// advertisedAddr may be a hostname whose DNS record is not yet propagated the
// instant the pod starts (it resolves a few seconds later). Retry the initial
// resolve instead of failing fast on that startup race.
advAddrResolveRetries = 5 // retries after the first attempt (6 resolves total)
advAddrResolveInterval = 3 * time.Second // 5 retries × 3s ≈ 15s max wait for DNS to propagate
)

// HAService implements the SequencerHA interface from tendermint/sequencer.
Expand Down Expand Up @@ -259,7 +265,12 @@ func (h *HAService) ClusterMembership() (*hakeeperrpc.ClusterMembership, error)
Suffrage: hakeeperrpc.ServerSuffrage(srv.Suffrage),
})
}
return &hakeeperrpc.ClusterMembership{Servers: servers, Version: future.Index()}, nil
_, leaderID := h.r.LeaderWithID()
return &hakeeperrpc.ClusterMembership{
Servers: servers,
LeaderID: string(leaderID),
Version: future.Index(),
}, nil
}

func (h *HAService) ServerID() string { return h.cfg.ServerID }
Expand Down Expand Up @@ -336,9 +347,22 @@ func (h *HAService) initRaft() (retErr error) {
// Note: the resolved IP is only used by the transport's LocalAddr(). The ServerAddress
// stored in Raft cluster config (BootstrapCluster/AddServerAsVoter) uses the raw
// h.advertisedAddr which may be a hostname — Raft's Dial() re-resolves DNS each time.
tcpAdvAddr, err := net.ResolveTCPAddr("tcp", h.advertisedAddr)
if err != nil {
return fmt.Errorf("resolve advertised addr %q: %w", h.advertisedAddr, err)
var tcpAdvAddr *net.TCPAddr
for attempt := 0; ; attempt++ {
tcpAdvAddr, err = net.ResolveTCPAddr("tcp", h.advertisedAddr)
if err == nil {
break
}
if attempt >= advAddrResolveRetries {
// DNS still hasn't propagated after the full retry budget. Some consensus-switch
// paths only log StartSequencerRoutines errors and keep running, which would leave
// this node alive but with Raft never started. Panic instead so the process exits
// non-zero and the orchestrator (k8s) restarts the pod for another DNS attempt.
panic(fmt.Errorf("hakeeper: resolve advertised addr %q after %d retries: %w", h.advertisedAddr, advAddrResolveRetries, err))
}
h.logger.Info("hakeeper: advertised addr not resolvable yet, retrying",
"addr", h.advertisedAddr, "retry", attempt+1, "max", advAddrResolveRetries, "err", err)
time.Sleep(advAddrResolveInterval)
}

bindAddr := fmt.Sprintf("%s:%d", h.cfg.Consensus.ListenAddr, h.cfg.Consensus.ListenPort)
Expand Down
20 changes: 15 additions & 5 deletions node/hakeeper/rpc/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@ package rpc
// ServerSuffrage determines whether a Server in a Configuration gets a vote.
type ServerSuffrage int

// These values must match hashicorp/raft's ServerSuffrage exactly: cluster
// membership is read from raft and cast by raw integer value (see
// HAService.ClusterMembership), so any divergence silently mislabels roles.
const (
// Nonvoter receives log entries but is not considered for elections.
// Zero value — safer default (no voting rights).
Nonvoter ServerSuffrage = iota
// Voter is a server whose vote is counted in elections.
Voter
Voter ServerSuffrage = iota
// Nonvoter receives log entries but is not considered for elections.
Nonvoter
// Staging is a server that acts like a Nonvoter while it catches up, then is
// promoted to Voter. Present for parity with raft's enum.
Staging
)

func (s ServerSuffrage) String() string {
Expand All @@ -17,14 +22,19 @@ func (s ServerSuffrage) String() string {
return "Voter"
case Nonvoter:
return "Nonvoter"
case Staging:
return "Staging"
}
return "ServerSuffrage"
}

// ClusterMembership is a versioned list of servers in the Raft cluster.
type ClusterMembership struct {
Servers []ServerInfo `json:"servers"`
Version uint64 `json:"version"`
// LeaderID is the ID of the current leader, matching one of Servers[i].ID.
// Empty when no leader is currently known (e.g. during an election).
LeaderID string `json:"leaderId"`
Version uint64 `json:"version"`
}

// ServerInfo describes a single Raft cluster member.
Expand Down
107 changes: 87 additions & 20 deletions node/l1sequencer/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,83 @@ package l1sequencer

import (
"context"
"sync/atomic"
"time"

"github.com/morph-l2/go-ethereum/ethclient"
tmlog "github.com/tendermint/tendermint/libs/log"
)

// L1Tracker monitors L1 RPC sync status and logs warnings if behind.
// It runs as an independent service.
// checkInterval is how often the tracker polls L1. Smaller than the halt
// threshold so the gate trips/recovers promptly without burdening the RPC.
const checkInterval = 15 * time.Second

// defaultHaltLag is the threshold at which the node halts: once our reference
// timestamp (the L1 head time on success, or the first-failure time during an
// RPC outage) is more than this behind wall-clock, the sequencer stops producing
// and fullnodes stop syncing. Hardcoded for now; promote to a flag if needed.
const defaultHaltLag = 30 * time.Minute

// verifierRefresher re-syncs the L1 sequencer set on demand. The tracker forces
// one refresh when L1 recovers, before reopening the gate, so block acceptance
// resumes against the freshest sequencer set. Implemented by *SequencerVerifier.
type verifierRefresher interface {
refresh() error
}

// L1Tracker polls the L1 RPC and reports a single halt signal via IsHalt: when
// our most recent reference timestamp is older than haltLag, the sequencer must
// stop producing and fullnodes must stop syncing, to avoid acting on a stale
// view of L1 sequencer changes. It implements sequencer.L1Tracker.
type L1Tracker struct {
ctx context.Context
cancel context.CancelFunc
l1Client *ethclient.Client
lagThreshold time.Duration
verifier verifierRefresher
lagThreshold time.Duration // warn threshold (log only)
haltLag time.Duration // halt threshold
logger tmlog.Logger
stop chan struct{}

// State below is only mutated from the single loop goroutine (and tests).
healthy atomic.Bool // read concurrently by gate consumers
lastSeen time.Time // L1 head time on success, or first-failure time during an outage
inErr bool // in an RPC-failure run; keeps lastSeen anchored at the first failure
}

// NewL1Tracker creates a new L1Tracker
// NewL1Tracker creates a new L1Tracker. verifier must not be nil.
func NewL1Tracker(
ctx context.Context,
l1Client *ethclient.Client,
lagThreshold time.Duration,
verifier verifierRefresher,
warnLag time.Duration,
logger tmlog.Logger,
) *L1Tracker {
ctx, cancel := context.WithCancel(ctx)
return &L1Tracker{
t := &L1Tracker{
ctx: ctx,
cancel: cancel,
l1Client: l1Client,
lagThreshold: lagThreshold,
verifier: verifier,
lagThreshold: warnLag,
haltLag: defaultHaltLag,
logger: logger.With("module", "l1tracker"),
stop: make(chan struct{}),
}
t.healthy.Store(true) // start allowed
t.lastSeen = time.Now() // grace: tolerate initial RPC failures for haltLag
return t
}

// Start starts the L1Tracker
// IsHalt implements sequencer.L1Tracker.
func (t *L1Tracker) IsHalt() bool { return !t.healthy.Load() }

func (t *L1Tracker) Start() error {
t.logger.Info("Starting L1Tracker", "lagThreshold", t.lagThreshold)
t.logger.Info("Starting L1Tracker", "warnLag", t.lagThreshold, "haltLag", t.haltLag, "tick", checkInterval)
go t.loop()
return nil
}

// Stop stops the L1Tracker
func (t *L1Tracker) Stop() {
t.logger.Info("Stopping L1Tracker")
t.cancel()
Expand All @@ -53,34 +87,67 @@ func (t *L1Tracker) Stop() {

func (t *L1Tracker) loop() {
defer close(t.stop)

ticker := time.NewTicker(1 * time.Minute)
ticker := time.NewTicker(checkInterval)
defer ticker.Stop()

for {
select {
case <-t.ctx.Done():
return
case <-ticker.C:
t.checkL1SyncLag()
t.check()
}
}
}

func (t *L1Tracker) checkL1SyncLag() {
// check polls the L1 head, emits the warn-level log, and folds the result into
// the health state.
func (t *L1Tracker) check() {
header, err := t.l1Client.HeaderByNumber(t.ctx, nil)
if err != nil {
t.logger.Error("Failed to get L1 header", "error", err)
t.update(time.Time{}, false, time.Now())
return
}

blockTime := time.Unix(int64(header.Time), 0)
lag := time.Since(blockTime)
if lag > t.lagThreshold {
headTime := time.Unix(int64(header.Time), 0)
if lag := time.Since(headTime); lag > t.lagThreshold {
t.logger.Error("L1 RPC is behind",
"latestBlock", header.Number.Uint64(),
"blockTime", blockTime.Format(time.RFC3339),
"blockTime", headTime.Format(time.RFC3339),
"lag", lag.Round(time.Second),
)
}
t.update(headTime, true, time.Now())
}

// update folds one poll into the health state. On success it anchors lastSeen at
// the L1 head time; on the first failure of an outage it anchors at now (the
// inErr flag stops later failures from re-anchoring). It then halts when lastSeen
// is older than haltLag, and on recovery refreshes the verifier before reopening.
// now is injected for testability.
func (t *L1Tracker) update(headTime time.Time, ok bool, now time.Time) {
if ok {
t.inErr = false
t.lastSeen = headTime
} else if !t.inErr {
t.inErr = true
t.lastSeen = now
}

if now.Sub(t.lastSeen) > t.haltLag {
if t.healthy.CompareAndSwap(true, false) {
t.logger.Error("L1 health gate TRIPPED: L1 too stale, halting block production and sync", "haltLag", t.haltLag)
}
return
}

// L1 is fresh again. On recovery, force a verifier resync before reopening;
// if it fails, stay halted and retry next tick.
if !t.healthy.Load() {
if err := t.verifier.refresh(); err != nil {
t.logger.Error("verifier refresh on L1 recovery failed; staying halted", "err", err)
return
}
t.healthy.Store(true)
t.logger.Info("L1 health gate RECOVERED: resuming block production and sync")
}
}
Loading
Loading