From c52d9f25633d9d0fcc854927a79b5ee23c4b1879 Mon Sep 17 00:00:00 2001
From: Samuel Laferriere
Date: Wed, 26 Feb 2025 12:51:21 -0500
Subject: [PATCH] test(kt-devnet): add batcher failover test

---
 go.mod                                        |   1 +
 go.sum                                        |   2 +
 kurtosis-devnet/eigenda-memstore.yaml         |   7 +-
 kurtosis-devnet/justfile                      |   5 +-
 .../tests/eigenda/failover_test.go            | 398 ++++++++++++++++++
 5 files changed, 410 insertions(+), 3 deletions(-)
 create mode 100644 kurtosis-devnet/tests/eigenda/failover_test.go

diff --git a/go.mod b/go.mod
index 13e6e7421b88..9916c743c21f 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,7 @@ toolchain go1.22.7
 
 require (
 	github.com/BurntSushi/toml v1.4.0
+	github.com/Layr-Labs/eigenda-proxy/clients v1.0.1
 	github.com/andybalholm/brotli v1.1.0
 	github.com/bmatcuk/doublestar/v4 v4.8.1
 	github.com/btcsuite/btcd v0.24.2
diff --git a/go.sum b/go.sum
index 7c7222f7ec3b..b5509fc58372 100644
--- a/go.sum
+++ b/go.sum
@@ -16,6 +16,8 @@ github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3
 github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
 github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e h1:ZIWapoIRN1VqT8GR8jAwb1Ie9GyehWjVcGh32Y2MznE=
 github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
+github.com/Layr-Labs/eigenda-proxy/clients v1.0.1 h1:62NFB1fUauwQPGvTiOXhz1HKaL0fRhGy34tI9EpKz6I=
+github.com/Layr-Labs/eigenda-proxy/clients v1.0.1/go.mod h1:JbDNvSritUGHErvzwB5Tb1IrVk7kea9DSBLKEOkBebE=
 github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc=
 github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs=
 github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
diff --git a/kurtosis-devnet/eigenda-memstore.yaml b/kurtosis-devnet/eigenda-memstore.yaml
index 3d8cd1a643d5..675d248aa4df 100644
--- a/kurtosis-devnet/eigenda-memstore.yaml
+++ b/kurtosis-devnet/eigenda-memstore.yaml
@@ -51,7 +51,7 @@ optimism_package:
         image: {{ localDockerImage "op-batcher" }}
         extra_params:
           - --altda.max-concurrent-da-requests=1
-          - --max-channel-duration=25
+          - --max-channel-duration=2
           - --target-num-frames=1
           - --max-l1-tx-size-bytes=1000
           - --batch-type=1
@@ -68,7 +68,8 @@ optimism_package:
       cannon_prestates_url: "http://fileserver/proofs/op-program/cannon"
       extra_params: []
   da_server_params:
-    image: ghcr.io/layr-labs/eigenda-proxy:v1.6.4
+    # TODO: release 1.6.5 which has the memstore API routes
+    image: ghcr.io/layr-labs/eigenda-proxy:dev
     cmd:
       - --addr
      - 0.0.0.0
@@ -86,6 +87,8 @@ optimism_package:
 ethereum_package:
   participants:
     - el_type: geth
+      el_extra_params:
+        - --graphql # needed to query batcher-inbox txs, to test that failover works correctly
       cl_type: teku
   network_params:
     preset: minimal
diff --git a/kurtosis-devnet/justfile b/kurtosis-devnet/justfile
index 863516c7c51f..8a7bcdd53ecd 100644
--- a/kurtosis-devnet/justfile
+++ b/kurtosis-devnet/justfile
@@ -155,10 +155,13 @@ eigenda-memstore-devnet-restart-batcher:
     --altda.da-server=http://da-server-op-kurtosis:3100 \
     --altda.da-service \
     --altda.max-concurrent-da-requests=1 \
-    --max-channel-duration=25 \
+    --max-channel-duration=2 \
    --target-num-frames=1 \
    --max-l1-tx-size-bytes=1000 \
    --batch-type=1
+[group('eigenda')]
+eigenda-memstore-devnet-test:
+    go test ./tests/eigenda/...
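+# NOTE: the test above assumes the eigenda-memstore-devnet enclave is already running;
+# it connects to the local kurtosis engine rather than spinning up its own enclave.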
 
 # Simple devnet
 simple-devnet: (devnet "simple.yaml")
diff --git a/kurtosis-devnet/tests/eigenda/failover_test.go b/kurtosis-devnet/tests/eigenda/failover_test.go
new file mode 100644
index 000000000000..0b2e738b9199
--- /dev/null
+++ b/kurtosis-devnet/tests/eigenda/failover_test.go
@@ -0,0 +1,398 @@
+package eigenda_test
+
+import (
+    "context"
+    "encoding/json"
+    "fmt"
+    "math/big"
+    "net/http"
+    "reflect"
+    "strconv"
+    "strings"
+    "testing"
+    "time"
+
+    "github.com/Layr-Labs/eigenda-proxy/clients/memconfig_client"
+    "github.com/ethereum-optimism/optimism/op-e2e/e2eutils/geth"
+    "github.com/ethereum/go-ethereum/common"
+    "github.com/ethereum/go-ethereum/ethclient"
+    "github.com/ethereum/go-ethereum/rpc"
+    "github.com/kurtosis-tech/kurtosis/api/golang/core/lib/enclaves"
+    "github.com/kurtosis-tech/kurtosis/api/golang/engine/lib/kurtosis_context"
+    "github.com/stretchr/testify/require"
+)
+
+// All tests are run in the context of the eigenda-memstore-devnet enclave.
+// We assume that this enclave is already running.
+const enclaveName = "eigenda-memstore-devnet"
+
+func TestFailover(t *testing.T) {
+    deadline, ok := t.Deadline()
+    if !ok {
+        deadline = time.Now().Add(1 * time.Minute)
+    }
+    ctxWithDeadline, cancel := context.WithDeadline(context.Background(), deadline)
+    defer cancel()
+
+    harness := newHarness(t)
+    t.Cleanup(func() {
+        // Switch the proxy back to normal mode, in case the test gets cancelled midway.
+        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+        defer cancel()
+        err := harness.clients.proxyMemconfigClient.Failback(ctx)
+        if err != nil {
+            t.Logf("Error failing back... you might need to reset the proxy to normal mode manually: %v", err)
+        }
+    })
+
+    // Assume kurtosis is running and is at least at block 10 (just deploying the contracts takes more than 10 blocks).
+    require.GreaterOrEqual(t, harness.testStartL1BlockNum, uint64(10), "Test started too early in the chain")
+    sinceBlock := harness.testStartL1BlockNum - 10
+
+    // 1. Check that the original commitments are EigenDA
+    harness.requireBatcherTxsToBeFromLayer(t, sinceBlock, DALayerEigenDA)
+
+    // 2. Failover and check that the commitments are now EthDA
+    err := harness.clients.proxyMemconfigClient.Failover(ctxWithDeadline)
+    require.NoError(t, err)
+
+    afterFailoverL1BlockNum, err := harness.clients.gethL1Client.BlockNumber(ctxWithDeadline)
+    require.NoError(t, err)
+    // Wait for 10 L1 blocks. With --max-channel-duration=2, the batcher posts a commitment
+    // roughly every 2 L1 blocks, and the failover itself takes some time to kick in,
+    // so 10 blocks is enough to be sure that new commitments have started getting posted.
+    // TODO: read max-channel-duration from the batcher's config instead of assuming 2 blocks
+    _, err = geth.WaitForBlock(big.NewInt(int64(afterFailoverL1BlockNum)+10), harness.clients.gethL1Client)
+    require.NoError(t, err)
+
+    harness.requireBatcherTxsToBeFromLayer(t, afterFailoverL1BlockNum, DALayerEth)
+
+    // 3. Failback and check that the commitments are EigenDA again
+    err = harness.clients.proxyMemconfigClient.Failback(ctxWithDeadline)
+    require.NoError(t, err)
+
+    afterFailbackL1BlockNum, err := harness.clients.gethL1Client.BlockNumber(ctxWithDeadline)
+    require.NoError(t, err)
+    _, err = geth.WaitForBlock(big.NewInt(int64(afterFailbackL1BlockNum)+10), harness.clients.gethL1Client)
+    require.NoError(t, err)
+
+    harness.requireBatcherTxsToBeFromLayer(t, afterFailbackL1BlockNum, DALayerEigenDA)
+}
+
+// harness contains all the state needed to run the tests.
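+// That state includes the kurtosis enclave context, the service endpoints and clients,
+// the batch inbox address, and the L1 block number at which the test started.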
+// harness also defines some higher-level "require" methods that are used in the tests.
+type harness struct {
+    enclaveCtx          *enclaves.EnclaveContext
+    endpoints           *EnclaveServiceEndpoints
+    clients             *EnclaveServiceClients
+    batchInboxAddr      common.Address
+    testStartL1BlockNum uint64
+}
+
+func newHarness(t *testing.T) *harness {
+    // We allow 20 seconds to build the entire harness.
+    ctxWithTimeout, cancel := context.WithTimeout(context.Background(), 20*time.Second)
+    defer cancel()
+
+    // Create a Kurtosis context
+    kurtosisCtx, err := kurtosis_context.NewKurtosisContextFromLocalEngine()
+    require.NoError(t, err)
+
+    // Get the eigenda-memstore-devnet enclave (assuming it's already running)
+    enclaveCtx, err := kurtosisCtx.GetEnclaveContext(ctxWithTimeout, enclaveName)
+    require.NoError(t, err, "Error getting enclave context: is enclave %v running?", enclaveName)
+
+    endpoints, err := getEndpointsFromKurtosis(enclaveCtx)
+    require.NoError(t, err)
+    t.Logf("Endpoints: %+v", endpoints)
+
+    clients, err := getClientsFromEndpoints(endpoints)
+    require.NoError(t, err)
+
+    // Get the batch inbox address from the rollup config
+    var rollupConfig struct {
+        BatchInboxAddress string `json:"batch_inbox_address"`
+    }
+    err = clients.opNodeClient.CallContext(ctxWithTimeout, &rollupConfig, "optimism_rollupConfig")
+    require.NoError(t, err)
+
+    // Get the current L1 block number
+    testStartL1BlockNum, err := clients.gethL1Client.BlockNumber(ctxWithTimeout)
+    require.NoError(t, err)
+
+    return &harness{
+        enclaveCtx:          enclaveCtx,
+        endpoints:           endpoints,
+        clients:             clients,
+        batchInboxAddr:      common.HexToAddress(rollupConfig.BatchInboxAddress),
+        testStartL1BlockNum: testStartL1BlockNum,
+    }
+}
+
+// requireBatcherTxsToBeFromLayer checks that the batcher transactions since startingFromBlockNum are all from the expectedLayer.
+// It allows up to 3 initial commitments to be of the wrong type, as the failover/failback might not have taken effect yet.
+// It requires that at least 2 commitments of the expected type are present after the failover/failback.
+func (h *harness) requireBatcherTxsToBeFromLayer(t *testing.T, startingFromBlockNum uint64, expectedLayer DALayer) {
+    batcherTxs, err := fetchBatcherTxsSinceBlock(h.endpoints.GethL1Endpoint, h.batchInboxAddr.String(), startingFromBlockNum)
+    require.NoError(t, err)
+
+    // We allow the first 3 commitments to be of the wrong DA layer, since the failover/failback might not have taken effect yet.
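+    // (e.g. right after a failover, one or two in-flight EigenDA certs may still
+    // land in the inbox before the first EthDA calldata tx shows up)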
+    wrongLayerCommitmentsToDiscard := 0
+    for _, batcherTx := range batcherTxs {
+        if batcherTx.daLayer != expectedLayer {
+            wrongLayerCommitmentsToDiscard++
+        }
+        // As soon as we have seen 3 wrong-layer commitments, or a commitment of the expected layer,
+        // we stop: every commitment from here on is required to be of the expected layer.
+        if wrongLayerCommitmentsToDiscard > 2 || batcherTx.daLayer == expectedLayer {
+            break
+        }
+    }
+    batcherTxs = batcherTxs[wrongLayerCommitmentsToDiscard:]
+
+    // After potentially discarding up to 3 commitments, we expect all remaining commitments (at least 2) to be of the expectedLayer
+    require.GreaterOrEqual(t, len(batcherTxs), 2, "Expected at least 2 %v commitments after failover/failback", expectedLayer)
+    for _, batcherTx := range batcherTxs {
+        require.Equal(t, expectedLayer, batcherTx.daLayer,
+            "Invalid commitment in block %d: expected %v, received commitment %s", batcherTx.block, expectedLayer, batcherTx.commitment)
+    }
+}
+
+// See https://specs.optimism.io/experimental/alt-da.html#example-commitments
+// EthDA batcher txs contain plain frames, which start with the derivation version byte 0x00.
+// EigenDA batcher txs contain altda commitments: 0x01 (altda tx data version)
+// ++ 0x01 (generic commitment type) ++ 0x00 (EigenDA's da_layer byte).
+const ethDACommitmentPrefix = "0x00"
+const eigenDACommitmentPrefix = "0x010100"
+
+type DALayer string
+
+const (
+    DALayerEth     DALayer = "ethda"
+    DALayerEigenDA DALayer = "eigenda"
+)
+
+type BatcherTx struct {
+    commitment string
+    daLayer    DALayer // commitment starts with the respective prefix
+    block      uint64
+}
+
+// HexUint64 is a custom type that can unmarshal from a hex string
+type HexUint64 uint64
+
+// UnmarshalJSON implements the json.Unmarshaler interface
+func (h *HexUint64) UnmarshalJSON(data []byte) error {
+    // Remove quotes from the JSON string
+    hexStr := string(data)
+    hexStr = strings.Trim(hexStr, "\"")
+
+    // Check that it's a hex string
+    if !strings.HasPrefix(hexStr, "0x") {
+        return fmt.Errorf("not a hex string: %s", hexStr)
+    }
+
+    // Parse the hex string (without the 0x prefix)
+    val, err := strconv.ParseUint(hexStr[2:], 16, 64)
+    if err != nil {
+        return err
+    }
+
+    *h = HexUint64(val)
+    return nil
+}
+
+// fetchBatcherTxsSinceBlock fetches all the commitments posted to the batch inbox from blockNum (inclusive) to the current block.
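+// Note: this relies on the L1 geth node exposing its GraphQL API, which is why
+// eigenda-memstore.yaml passes the --graphql flag to geth via el_extra_params.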
+func fetchBatcherTxsSinceBlock(gethL1Endpoint string, batchInbox string, blockNum uint64) ([]BatcherTx, error) {
+    // We use plain HTTP for the GraphQL query, since GraphQL is not supported by the rpc package
+    query := fmt.Sprintf(`
+    {
+        "query": "query txInfo { blocks(from:%v) { transactions { to { address } inputData block { number } } } }"
+    }`, blockNum)
+
+    // Make the GraphQL request
+    req, err := http.NewRequest("POST", gethL1Endpoint+"/graphql", strings.NewReader(query))
+    if err != nil {
+        return nil, err
+    }
+    req.Header.Set("Content-Type", "application/json")
+    httpClient := &http.Client{}
+    resp, err := httpClient.Do(req)
+    if err != nil {
+        return nil, err
+    }
+    defer resp.Body.Close()
+
+    // Parse the response
+    type GraphQLResponse struct {
+        Data struct {
+            Blocks []struct {
+                Transactions []struct {
+                    To struct {
+                        Address string `json:"address"`
+                    } `json:"to"`
+                    InputData string `json:"inputData"`
+                    Block     struct {
+                        // We use HexUint64 to properly parse the hex string block numbers that geth returns
+                        Number HexUint64 `json:"number"`
+                    } `json:"block"`
+                } `json:"transactions"`
+            } `json:"blocks"`
+        } `json:"data"`
+    }
+    var graphQLResp GraphQLResponse
+    if err := json.NewDecoder(resp.Body).Decode(&graphQLResp); err != nil {
+        return nil, err
+    }
+    if len(graphQLResp.Data.Blocks) == 0 {
+        // Assume that this is a GraphQL query error, which would have returned something like
+        // "errors": [
+        //   {
+        //     "message": "syntax error: unexpected \"\", expecting Ident",
+        //   }
+        // ]
+        // TODO: we should probably switch to a proper graphql client that can handle these errors
+        return nil, fmt.Errorf("no blocks returned in GraphQL response")
+    }
+
+    // Filter for transactions sent to the batch inbox address
+    var batcherTxs []BatcherTx
+    for _, block := range graphQLResp.Data.Blocks {
+        for _, tx := range block.Transactions {
+            if strings.EqualFold(tx.To.Address, batchInbox) {
+                var daLayer DALayer
+                if strings.HasPrefix(tx.InputData, eigenDACommitmentPrefix) {
+                    daLayer = DALayerEigenDA
+                } else if strings.HasPrefix(tx.InputData, ethDACommitmentPrefix) {
+                    daLayer = DALayerEth
+                } else {
+                    return nil, fmt.Errorf("unknown commitment prefix: %s", tx.InputData)
+                }
+                batcherTxs = append(batcherTxs, BatcherTx{
+                    commitment: tx.InputData,
+                    daLayer:    daLayer,
+                    block:      uint64(tx.Block.Number),
+                })
+            }
+        }
+    }
+
+    return batcherTxs, nil
+}
+
+// Localhost endpoints for the different services in the enclave
+// that we need to interact with.
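+// The `kurtosis:"service-name,port-name"` tags below are read via reflection by
+// getEndpointsFromKurtosis (further down) to resolve each service's publicly mapped port.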
+type EnclaveServiceEndpoints struct {
+    OpNodeEndpoint       string `kurtosis:"op-cl-1-op-node-op-geth-op-kurtosis,http"`
+    GethL1Endpoint       string `kurtosis:"el-1-geth-teku,rpc"`
+    EigendaProxyEndpoint string `kurtosis:"da-server-op-kurtosis,http"`
+    // Adding new endpoints is as simple as adding a new field with a kurtosis tag:
+    // NewServiceEndpoint string `kurtosis:"new-service-name,port-name"`
+}
+
+func getEndpointsFromKurtosis(enclaveCtx *enclaves.EnclaveContext) (*EnclaveServiceEndpoints, error) {
+    endpoints := &EnclaveServiceEndpoints{}
+
+    // Get the type of the struct to iterate over its fields
+    t := reflect.TypeOf(endpoints).Elem()
+    v := reflect.ValueOf(endpoints).Elem()
+
+    // Iterate over all fields in the struct
+    for i := 0; i < t.NumField(); i++ {
+        field := t.Field(i)
+
+        // Get the kurtosis tag
+        tag := field.Tag.Get("kurtosis")
+        if tag == "" {
+            continue // Skip fields without tags
+        }
+
+        // Parse the tag to get the service name and port name
+        parts := strings.Split(tag, ",")
+        if len(parts) != 2 {
+            return nil, fmt.Errorf("invalid kurtosis tag format for field %s: %s", field.Name, tag)
+        }
+        serviceName := parts[0]
+        portName := parts[1]
+
+        // Get the service context
+        serviceCtx, err := enclaveCtx.GetServiceContext(serviceName)
+        if err != nil {
+            return nil, fmt.Errorf("GetServiceContext for %s: %w", serviceName, err)
+        }
+
+        // Get the port
+        port, ok := serviceCtx.GetPublicPorts()[portName]
+        if !ok {
+            return nil, fmt.Errorf("service %s doesn't expose port %s", serviceName, portName)
+        }
+
+        // Set the endpoint URL in the struct field
+        endpoint := fmt.Sprintf("http://localhost:%d", port.GetNumber())
+        v.Field(i).SetString(endpoint)
+    }
+
+    return endpoints, nil
+}
+
+type EnclaveServiceClients struct {
+    opNodeClient         *rpc.Client
+    gethL1Client         *ethclient.Client
+    proxyMemconfigClient *ProxyMemconfigClient
+}
+
+func getClientsFromEndpoints(endpoints *EnclaveServiceEndpoints) (*EnclaveServiceClients, error) {
+    opNodeClient, err := rpc.Dial(endpoints.OpNodeEndpoint)
+    if err != nil {
+        return nil, fmt.Errorf("rpc.Dial: %w", err)
+    }
+
+    gethL1Client, err := ethclient.Dial(endpoints.GethL1Endpoint)
+    if err != nil {
+        return nil, fmt.Errorf("ethclient.Dial: %w", err)
+    }
+
+    proxyMemconfigClient := &ProxyMemconfigClient{
+        Client: memconfig_client.New(&memconfig_client.Config{URL: endpoints.EigendaProxyEndpoint}),
+    }
+
+    return &EnclaveServiceClients{
+        opNodeClient:         opNodeClient,
+        gethL1Client:         gethL1Client,
+        proxyMemconfigClient: proxyMemconfigClient,
+    }, nil
+}
+
+// ProxyMemconfigClient is a wrapper around the memconfig client that adds Failover/Failback methods.
+// TODO: we should upstream this to the eigenda-proxy repo
+type ProxyMemconfigClient struct {
+    *memconfig_client.Client
+}
+
+// Failover updates the proxy's memstore config to start returning 503 errors on PUT requests,
+// which causes the batcher to fail over to posting EthDA calldata.
+// Note: we have to GetConfig, update the config, and then UpdateConfig, because the
+// client doesn't implement a "patch" method, even though the API does support it.
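+// Beware that this read-modify-write is not atomic: a config change made between
+// GetConfig and UpdateConfig would be silently overwritten. That is fine here,
+// since this test is the only writer to the proxy's memconfig.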
+func (c *ProxyMemconfigClient) Failover(ctx context.Context) error {
+    memConfig, err := c.GetConfig(ctx)
+    if err != nil {
+        return fmt.Errorf("GetConfig: %w", err)
+    }
+    memConfig.PutReturnsFailoverError = true
+    _, err = c.UpdateConfig(ctx, memConfig)
+    if err != nil {
+        return fmt.Errorf("UpdateConfig: %w", err)
+    }
+    return nil
+}
+
+// Failback reverts the proxy's memstore config to normal mode, so that PUT requests succeed again.
+func (c *ProxyMemconfigClient) Failback(ctx context.Context) error {
+    memConfig, err := c.GetConfig(ctx)
+    if err != nil {
+        return fmt.Errorf("GetConfig: %w", err)
+    }
+    memConfig.PutReturnsFailoverError = false
+    _, err = c.UpdateConfig(ctx, memConfig)
+    if err != nil {
+        return fmt.Errorf("UpdateConfig: %w", err)
+    }
+    return nil
+}