From a83ee49eacb43e2e41b59a66a79f7dbf4f5f9519 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Fri, 8 Aug 2025 18:06:06 +0800
Subject: [PATCH 01/19] feat: wrap blockchain test for benchmark

---
 src/ethereum_test_specs/__init__.py  |   4 +
 src/ethereum_test_specs/benchmark.py | 164 +++++++++++++++++++++++++++
 src/ethereum_test_tools/__init__.py  |   4 +
 tests/benchmark/test_worst_blocks.py |   9 +-
 4 files changed, 178 insertions(+), 3 deletions(-)
 create mode 100644 src/ethereum_test_specs/benchmark.py

diff --git a/src/ethereum_test_specs/__init__.py b/src/ethereum_test_specs/__init__.py
index 790e2b4351f..e0baf8c5188 100644
--- a/src/ethereum_test_specs/__init__.py
+++ b/src/ethereum_test_specs/__init__.py
@@ -2,6 +2,7 @@
 
 from .base import BaseTest, TestSpec
 from .base_static import BaseStaticTest
+from .benchmark import BenchmarkTest, BenchmarkTestFiller, BenchmarkTestSpec
 from .blobs import BlobsTest, BlobsTestFiller, BlobsTestSpec
 from .blockchain import (
     BlockchainTest,
@@ -23,6 +24,9 @@
 __all__ = (
     "BaseStaticTest",
     "BaseTest",
+    "BenchmarkTest",
+    "BenchmarkTestFiller",
+    "BenchmarkTestSpec",
     "BlobsTest",
     "BlobsTestFiller",
     "BlobsTestSpec",
diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
new file mode 100644
index 00000000000..fb0eb8b68f1
--- /dev/null
+++ b/src/ethereum_test_specs/benchmark.py
@@ -0,0 +1,164 @@
+"""Ethereum benchmark test spec definition and filler."""
+
+from typing import Callable, ClassVar, Dict, Generator, List, Optional, Sequence, Type
+
+import pytest
+from pydantic import Field
+
+from ethereum_clis import TransitionTool
+from ethereum_test_base_types import HexNumber
+from ethereum_test_exceptions import BlockException, TransactionException
+from ethereum_test_execution import (
+    BaseExecute,
+    ExecuteFormat,
+    LabeledExecuteFormat,
+    TransactionPost,
+)
+from ethereum_test_fixtures import (
+    BaseFixture,
+    BlockchainEngineFixture,
+    BlockchainEngineXFixture,
+    BlockchainFixture,
+    FixtureFormat,
+    LabeledFixtureFormat,
+)
+from ethereum_test_forks import Fork
+from ethereum_test_types import Alloc, Environment, Transaction
+
+from .base import BaseTest
+from .blockchain import Block, BlockchainTest
+
+
+class BenchmarkTest(BaseTest):
+    """Test type designed specifically for benchmark test cases."""
+
+    pre: Alloc
+    post: Alloc
+    tx: Optional[Transaction] = None
+    blocks: Optional[List[Block]] = None
+    block_exception: (
+        List[TransactionException | BlockException] | TransactionException | BlockException | None
+    ) = None
+    env: Environment = Field(default_factory=Environment)
+    expected_benchmark_gas_used: int | None = None
+
+    supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
+        BlockchainFixture,
+        BlockchainEngineFixture,
+        BlockchainEngineXFixture,
+    ]
+
+    supported_execute_formats: ClassVar[Sequence[LabeledExecuteFormat]] = [
+        LabeledExecuteFormat(
+            TransactionPost,
+            "benchmark_test",
+            "An execute test derived from a benchmark test",
+        ),
+    ]
+
+    supported_markers: ClassVar[Dict[str, str]] = {
+        "blockchain_test_engine_only": "Only generate a blockchain test engine fixture",
+        "blockchain_test_only": "Only generate a blockchain test fixture",
+    }
+
+    @classmethod
+    def pytest_parameter_name(cls) -> str:
+        """Return the parameter name used in pytest to select this spec type."""
+        return "benchmark_test"
+
+    @classmethod
+    def discard_fixture_format_by_marks(
+        cls,
+        fixture_format: FixtureFormat,
+        fork: Fork,
+        markers: List[pytest.Mark],
+    ) -> bool:
+        """Discard a fixture format from filling if the appropriate marker is used."""
+        if "blockchain_test_only" in [m.name for m in markers]:
+            return fixture_format != BlockchainFixture
+        if "blockchain_test_engine_only" in [m.name for m in markers]:
+            return fixture_format != BlockchainEngineFixture
+        return False
+
+    def get_genesis_environment(self, fork: Fork) -> Environment:
+        """Get the genesis environment for this benchmark test."""
+        return self.env
+
+    def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[Transaction]:
+        """Split a transaction that exceeds the gas limit cap into multiple transactions."""
+        if (gas_limit_cap is None) or (tx.gas_limit <= gas_limit_cap):
+            return [tx]
+
+        total_gas = int(self.expected_benchmark_gas_used or self.env.gas_limit)
+        print(f"total_gas: {total_gas}")
+        num_splits = total_gas // gas_limit_cap
+
+        split_transactions = []
+        for i in range(num_splits):
+            split_tx = tx.model_copy()
+            total_gas -= gas_limit_cap
+            split_tx.gas_limit = HexNumber(total_gas if i == num_splits - 1 else gas_limit_cap)
+            split_tx.nonce = HexNumber(tx.nonce + i)
+            split_transactions.append(split_tx)
+
+        return split_transactions
+
+    def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
+        """Create a BlockchainTest from this BenchmarkTest."""
+        if self.blocks is not None:
+            return BlockchainTest.from_test(
+                base_test=self,
+                genesis_environment=self.env,
+                pre=self.pre,
+                post=self.post,
+                blocks=self.blocks,
+            )
+        elif self.tx is not None:
+            gas_limit_cap = fork.transaction_gas_limit_cap()
+
+            transactions = self.split_transaction(self.tx, gas_limit_cap)
+
+            blocks = [Block(txs=transactions)]
+
+            return BlockchainTest.from_test(
+                base_test=self,
+                pre=self.pre,
+                post=self.post,
+                blocks=blocks,
+                genesis_environment=self.env,
+            )
+        else:
+            raise ValueError("Cannot create BlockchainTest without transactions or blocks")
+
+    def generate(
+        self,
+        t8n: TransitionTool,
+        fork: Fork,
+        fixture_format: FixtureFormat,
+    ) -> BaseFixture:
+        """Generate the blockchain test fixture."""
+        self.check_exception_test(exception=self.tx.error is not None if self.tx else False)
+        if fixture_format in BlockchainTest.supported_fixture_formats:
+            return self.generate_blockchain_test(fork=fork).generate(
+                t8n=t8n, fork=fork, fixture_format=fixture_format
+            )
+        else:
+            raise Exception(f"Unsupported fixture format: {fixture_format}")
+
+    def execute(
+        self,
+        *,
+        fork: Fork,
+        execute_format: ExecuteFormat,
+    ) -> BaseExecute:
+        """Execute the benchmark test by sending it to the live network."""
+        if execute_format == TransactionPost:
+            return TransactionPost(
+                blocks=[[self.tx]],
+                post=self.post,
+            )
+        raise Exception(f"Unsupported execute format: {execute_format}")
+
+
+BenchmarkTestSpec = Callable[[str], Generator[BenchmarkTest, None, None]]
+BenchmarkTestFiller = Type[BenchmarkTest]
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index bb0b026ef9e..6a822305f94 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -25,6 +25,8 @@
 from ethereum_test_fixtures import BaseFixture, FixtureCollector
 from ethereum_test_specs import (
     BaseTest,
+    BenchmarkTest,
+    BenchmarkTestFiller,
     BlobsTest,
     BlobsTestFiller,
     BlockchainTest,
@@ -112,6 +114,8 @@
     "BalStorageSlot",
     "BaseFixture",
     "BaseTest",
+    "BenchmarkTest",
+    "BenchmarkTestFiller",
     "Blob",
     "BlockAccessList",
     "BlobsTest",
diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index 38e6d5f71e6..df007629349 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -15,8 +15,9 @@
     Account,
     Address,
     Alloc,
+    BenchmarkTestFiller,
     Block,
-    BlockchainTestFiller,
+    Environment,
     Hash,
     StateTestFiller,
     Transaction,
@@ -110,8 +111,9 @@ def ether_transfer_case(
     ["a_to_a", "a_to_b", "diff_acc_to_b", "a_to_diff_acc", "diff_acc_to_diff_acc"],
 )
 def test_block_full_of_ether_transfers(
-    blockchain_test: BlockchainTestFiller,
+    benchmark_test: BenchmarkTestFiller,
     pre: Alloc,
+    env: Environment,
     case_id: str,
     ether_transfer_case,
     iteration_count: int,
@@ -152,7 +154,8 @@ def test_block_full_of_ether_transfers(
         else {receiver: Account(balance=balance) for receiver, balance in balances.items()}
     )
 
-    blockchain_test(
+    benchmark_test(
+        genesis_environment=env,
         pre=pre,
         post=post_state,
         blocks=[Block(txs=txs)],

From 09c09cbb9715d8b15535eb72553b1f3a2fa195b4 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Fri, 8 Aug 2025 18:06:30 +0800
Subject: [PATCH 02/19] feat: wrap state test for benchmark

---
 src/ethereum_test_specs/__init__.py        |   4 +
 src/ethereum_test_specs/benchmark_state.py | 229 +++++++++++++++++++++
 src/ethereum_test_tools/__init__.py        |   4 +
 tests/benchmark/test_worst_compute.py      |   8 +-
 4 files changed, 243 insertions(+), 2 deletions(-)
 create mode 100644 src/ethereum_test_specs/benchmark_state.py

diff --git a/src/ethereum_test_specs/__init__.py b/src/ethereum_test_specs/__init__.py
index e0baf8c5188..9a714640746 100644
--- a/src/ethereum_test_specs/__init__.py
+++ b/src/ethereum_test_specs/__init__.py
@@ -3,6 +3,7 @@
 from .base import BaseTest, TestSpec
 from .base_static import BaseStaticTest
 from .benchmark import BenchmarkTest, BenchmarkTestFiller, BenchmarkTestSpec
+from .benchmark_state import BenchmarkStateTest, BenchmarkStateTestFiller, BenchmarkStateTestSpec
 from .blobs import BlobsTest, BlobsTestFiller, BlobsTestSpec
 from .blockchain import (
     BlockchainTest,
@@ -27,6 +28,9 @@
     "BenchmarkTest",
     "BenchmarkTestFiller",
     "BenchmarkTestSpec",
+    "BenchmarkStateTest",
+    "BenchmarkStateTestFiller",
+    "BenchmarkStateTestSpec",
     "BlobsTest",
     "BlobsTestFiller",
     "BlobsTestSpec",
diff --git a/src/ethereum_test_specs/benchmark_state.py b/src/ethereum_test_specs/benchmark_state.py
new file mode 100644
index 00000000000..e9e959f0615
--- /dev/null
+++ b/src/ethereum_test_specs/benchmark_state.py
@@ -0,0 +1,229 @@
+"""Ethereum benchmark state test spec definition and filler."""
+
+import math
+from pprint import pprint
+from typing import Callable, ClassVar, Generator, List, Sequence, Type
+
+from pydantic import ConfigDict
+
+from ethereum_clis import TransitionTool
+from ethereum_test_base_types import HexNumber
+from ethereum_test_execution import (
+    BaseExecute,
+    ExecuteFormat,
+    LabeledExecuteFormat,
+    TransactionPost,
+)
+from ethereum_test_fixtures import (
+    BaseFixture,
+    FixtureFormat,
+    LabeledFixtureFormat,
+    StateFixture,
+)
+from ethereum_test_fixtures.common import FixtureBlobSchedule
+from ethereum_test_fixtures.state import (
+    FixtureConfig,
+    FixtureEnvironment,
+    FixtureForkPost,
+    FixtureTransaction,
+)
+from ethereum_test_forks import Fork
+from ethereum_test_types import Alloc, Environment, Transaction
+from ethereum_test_vm import Bytecode
+
+from .base import BaseTest, OpMode
+from .blockchain import Block, BlockchainTest
+from .debugging import print_traces
+from .helpers import verify_transactions
+
+
+class BenchmarkStateTest(BaseTest):
+    """Test type designed specifically for benchmark state test cases with full verification."""
+
+    pre: Alloc
+    post: Alloc
+    tx: Transaction
+    gas_benchmark_value: int
+    setup_bytecode: Bytecode | None = None
+    attack_bytecode: Bytecode | None = None
+    env: Environment
+    chain_id: int = 1
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
+        StateFixture,
+    ] + [
+        LabeledFixtureFormat(
+            fixture_format,
+            f"{fixture_format.format_name}_from_benchmark_state_test",
+            f"A {fixture_format.format_name} generated from a benchmark_state_test",
+        )
+        for fixture_format in BlockchainTest.supported_fixture_formats
+    ]
+
+    supported_execute_formats: ClassVar[Sequence[LabeledExecuteFormat]] = [
+        LabeledExecuteFormat(
+            TransactionPost,
+            "benchmark_state_test_with_verification",
+            "An execute test derived from a benchmark state test with verification",
+        ),
+    ]
+
+    def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[Transaction]:
+        """Split a transaction that exceeds the gas limit cap into multiple transactions."""
+        if (gas_limit_cap is None) or (tx.gas_limit <= gas_limit_cap):
+            return [tx]
+
+        total_gas = int(tx.gas_limit)
+        num_splits = math.ceil(total_gas / gas_limit_cap)
+
+        split_transactions = []
+        remaining_gas = total_gas
+        for i in range(num_splits):
+            split_tx = tx.model_copy()
+            split_tx.gas_limit = HexNumber(min(gas_limit_cap, remaining_gas))
+            split_tx.nonce = HexNumber(tx.nonce + i)
+            split_transactions.append(split_tx)
+            remaining_gas -= gas_limit_cap
+
+        return split_transactions
+
+    def make_benchmark_state_test_fixture(
+        self,
+        t8n: TransitionTool,
+        fork: Fork,
+    ) -> StateFixture:
+        """Create a fixture from the benchmark state test definition with full verification."""
+        # We can't generate a state test fixture that names a transition fork,
+        # so we get the fork at the block number and timestamp of the state test
+        fork = fork.fork_at(self.env.number, self.env.timestamp)
+
+        env = self.env.set_fork_requirements(fork)
+        tx = self.tx.with_signature_and_sender(keep_secret_key=True)
+        pre_alloc = Alloc.merge(
+            Alloc.model_validate(fork.pre_allocation()),
+            self.pre,
+        )
+
+        # Verification 1: Check for empty accounts
+        if empty_accounts := pre_alloc.empty_accounts():
+            raise Exception(f"Empty accounts in pre state: {empty_accounts}")
+
+        transition_tool_output = t8n.evaluate(
+            transition_tool_data=TransitionTool.TransitionToolData(
+                alloc=pre_alloc,
+                txs=[tx],
+                env=env,
+                fork=fork,
+                chain_id=self.chain_id,
+                reward=0,  # Reward on state tests is always zero
+                blob_schedule=fork.blob_schedule(),
+                state_test=True,
+            ),
+            debug_output_path=self.get_next_transition_tool_output_path(),
+            slow_request=self.is_tx_gas_heavy_test(),
+        )
+
+        # Verification 2: Post-allocation verification
+        try:
+            self.post.verify_post_alloc(transition_tool_output.alloc)
+        except Exception as e:
+            print_traces(t8n.get_traces())
+            raise e
+
+        # Verification 3: Transaction verification
+        try:
+            verify_transactions(
+                txs=[tx],
+                result=transition_tool_output.result,
+                transition_tool_exceptions_reliable=t8n.exception_mapper.reliable,
+            )
+        except Exception as e:
+            print_traces(t8n.get_traces())
+            pprint(transition_tool_output.result)
+            pprint(transition_tool_output.alloc)
+            raise e
+
+        # Verification 4: Benchmark gas validation
+        if self._operation_mode == OpMode.BENCHMARKING:
+            expected_benchmark_gas_used = self.gas_benchmark_value
+            gas_used = int(transition_tool_output.result.gas_used)
+            assert expected_benchmark_gas_used is not None, "gas_benchmark_value is not set"
+            assert gas_used == expected_benchmark_gas_used, (
+                f"gas_used ({gas_used}) does not match gas_benchmark_value "
+                f"({expected_benchmark_gas_used})"
+                f", difference: {gas_used - expected_benchmark_gas_used}"
+            )
+
+        return StateFixture(
+            env=FixtureEnvironment(**env.model_dump(exclude_none=True)),
+            pre=pre_alloc,
+            post={
+                fork: [
+                    FixtureForkPost(
+                        state_root=transition_tool_output.result.state_root,
+                        logs_hash=transition_tool_output.result.logs_hash,
+                        tx_bytes=tx.rlp(),
+                        expect_exception=tx.error,
+                        state=transition_tool_output.alloc,
+                    )
+                ]
+            },
+            transaction=FixtureTransaction.from_transaction(tx),
+            config=FixtureConfig(
+                blob_schedule=FixtureBlobSchedule.from_blob_schedule(fork.blob_schedule()),
+                chain_id=self.chain_id,
+            ),
+        )
+
+    def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
+        """Create a BlockchainTest from this BenchmarkStateTest."""
+        gas_limit_cap = fork.transaction_gas_limit_cap()
+
+        transactions = self.split_transaction(self.tx, gas_limit_cap)
+
+        blocks = [Block(txs=transactions)]
+
+        return BlockchainTest.from_test(
+            base_test=self,
+            pre=self.pre,
+            post=self.post,
+            blocks=blocks,
+            genesis_environment=self.env,
+        )
+
+    def generate(
+        self,
+        t8n: TransitionTool,
+        fork: Fork,
+        fixture_format: FixtureFormat,
+    ) -> BaseFixture:
+        """Generate the test fixture."""
+        self.check_exception_test(exception=self.tx.error is not None)
+        if fixture_format in BlockchainTest.supported_fixture_formats:
+            return self.generate_blockchain_test(fork=fork).generate(
+                t8n=t8n, fork=fork, fixture_format=fixture_format
+            )
+        elif fixture_format == StateFixture:
+            return self.make_benchmark_state_test_fixture(t8n, fork)
+
+        raise Exception(f"Unknown fixture format: {fixture_format}")
+
+    def execute(
+        self,
+        *,
+        fork: Fork,
+        execute_format: ExecuteFormat,
+    ) -> BaseExecute:
+        """Execute the benchmark state test by sending it to the live network."""
+        if execute_format == TransactionPost:
+            return TransactionPost(
+                blocks=[[self.tx]],
+                post=self.post,
+            )
+        raise Exception(f"Unsupported execute format: {execute_format}")
+
+
+BenchmarkStateTestFiller = Type[BenchmarkStateTest]
+BenchmarkStateTestSpec = Callable[[str], Generator[BenchmarkStateTest, None, None]]
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index 6a822305f94..3dd5e0439ba 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -25,6 +25,8 @@
 from ethereum_test_fixtures import BaseFixture, FixtureCollector
 from ethereum_test_specs import (
     BaseTest,
+    BenchmarkStateTest,
+    BenchmarkStateTestFiller,
     BenchmarkTest,
     BenchmarkTestFiller,
     BlobsTest,
@@ -116,6 +118,8 @@
     "BaseTest",
     "BenchmarkTest",
     "BenchmarkTestFiller",
+    "BenchmarkStateTest",
+    "BenchmarkStateTestFiller",
     "Blob",
     "BlockAccessList",
     "BlobsTest",
diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index 9bfdee16482..410e08820c5 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -19,6 +19,7 @@
 from ethereum_test_tools import (
     Address,
     Alloc,
+    BenchmarkStateTestFiller,
     Block,
     BlockchainTestFiller,
     Bytecode,
@@ -2764,8 +2765,9 @@ def test_worst_calldataload(
     ],
 )
 def test_worst_swap(
-    state_test: StateTestFiller,
+    benchmark_state_test: BenchmarkStateTestFiller,
     pre: Alloc,
+    env: Environment,
     fork: Fork,
     opcode: Opcode,
     gas_benchmark_value: int,
@@ -2785,8 +2787,10 @@ def test_worst_swap(
         sender=pre.fund_eoa(),
     )
 
-    state_test(
+    benchmark_state_test(
+        env=env,
         pre=pre,
+        gas_benchmark_value=gas_benchmark_value,
         post={},
         tx=tx,
     )

From 87bd45d34d2b4ad91e9bb77c406beccba79f7689 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 14 Aug 2025 20:48:59 +0800
Subject: [PATCH 03/19] feat(benchmark): add code generator to generate
 transaction

---
 src/ethereum_test_specs/benchmark.py          |  4 +-
 src/ethereum_test_specs/benchmark_state.py    |  3 -
 src/ethereum_test_tools/__init__.py           |  8 ++
 .../benchmark_code_generator.py               | 96 +++++++++++++++++++
 tests/benchmark/test_worst_compute.py         | 24 ++---
 5 files changed, 115 insertions(+), 20 deletions(-)
 create mode 100644 src/ethereum_test_tools/benchmark_code_generator.py

diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index fb0eb8b68f1..d1ffdb306db 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -128,7 +128,9 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
                 genesis_environment=self.env,
             )
         else:
-            raise ValueError("Cannot create BlockchainTest without transactions or blocks")
+            raise ValueError(
+                "Cannot create BlockchainTest without transactions, blocks, or code_generator"
+            )
 
     def generate(
         self,
diff --git a/src/ethereum_test_specs/benchmark_state.py b/src/ethereum_test_specs/benchmark_state.py
index e9e959f0615..454af1a3844 100644
--- a/src/ethereum_test_specs/benchmark_state.py
+++ b/src/ethereum_test_specs/benchmark_state.py
@@ -29,7 +29,6 @@
 )
 from ethereum_test_forks import Fork
 from ethereum_test_types import Alloc, Environment, Transaction
-from ethereum_test_vm import Bytecode
 
 from .base import BaseTest, OpMode
 from .blockchain import Block, BlockchainTest
@@ -44,8 +43,6 @@ class BenchmarkStateTest(BaseTest):
     post: Alloc
     tx: Transaction
     gas_benchmark_value: int
-    setup_bytecode: Bytecode | None = None
-    attack_bytecode: Bytecode | None = None
     env: Environment
     chain_id: int = 1
 
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index 3dd5e0439ba..4c6d7980166 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -86,6 +86,11 @@
     call_return_code,
 )
 
+from .benchmark_code_generator import (
+    BenchmarkCodeGenerator,
+    ExtCallGenerator,
+    JumpLoopGenerator,
+)
 from .tools_code import (
     CalldataCase,
     Case,
@@ -116,6 +121,7 @@
     "BalStorageSlot",
     "BaseFixture",
     "BaseTest",
+    "BenchmarkCodeGenerator",
     "BenchmarkTest",
     "BenchmarkTestFiller",
     "BenchmarkStateTest",
@@ -136,6 +142,7 @@
     "CodeGasMeasure",
     "Conditional",
     "ConsolidationRequest",
+    "ExtCallGenerator",
     "DeploymentTestType",
     "DepositRequest",
     "EngineAPIError",
@@ -151,6 +158,7 @@
     "Hash",
     "Header",
     "Initcode",
+    "JumpLoopGenerator",
     "Macro",
     "Macros",
     "NetworkWrappedTransaction",
diff --git a/src/ethereum_test_tools/benchmark_code_generator.py b/src/ethereum_test_tools/benchmark_code_generator.py
new file mode 100644
index 00000000000..57e7b0e1e4c
--- /dev/null
+++ b/src/ethereum_test_tools/benchmark_code_generator.py
@@ -0,0 +1,96 @@
+"""Benchmark code generator classes for creating optimized bytecode patterns."""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from ethereum_test_forks import Fork
+from ethereum_test_tools import Alloc, Bytecode, Transaction
+from ethereum_test_tools.vm.opcode import Opcodes as Op
+
+
+class BenchmarkCodeGenerator(ABC):
+    """Abstract base class for generating benchmark bytecode."""
+
+    def __init__(
+        self,
+        fork: Fork,
+        attack_block: Bytecode,
+        setup: Optional[Bytecode] = None,
+    ):
+        """Initialize with fork, attack block, and optional setup bytecode."""
+        self.fork = fork
+        self.setup = setup or Bytecode()
+        self.attack_block = attack_block
+
+    @abstractmethod
+    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+        """Generate a transaction with the specified gas limit."""
+        pass
+
+    def generate_repeated_code(self, repeated_code: Bytecode, setup: Bytecode) -> Bytecode:
+        """Calculate the maximum number of iterations that can fit in the code size limit."""
+        max_code_size = self.fork.max_code_size()
+
+        overhead = len(Op.JUMPDEST) + len(Op.JUMP(len(setup)))
+        available_space = max_code_size - overhead
+        max_iterations = available_space // len(repeated_code) if len(repeated_code) > 0 else 0
+
+        code = setup + Op.JUMPDEST + repeated_code * max_iterations + Op.JUMP(len(setup))
+
+        self._validate_code_size(code)
+
+        return code
+
+    def _validate_code_size(self, code: Bytecode) -> None:
+        """Validate that the generated code fits within size limits."""
+        if len(code) > self.fork.max_code_size():
+            raise ValueError(
+                f"Generated code size {len(code)} exceeds maximum allowed size "
+                f"{self.fork.max_code_size()}"
+            )
+
+
+class JumpLoopGenerator(BenchmarkCodeGenerator):
+    """Generates bytecode that loops execution using JUMP operations."""
+
+    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+        """Generate transaction with looping bytecode pattern."""
+        # Benchmark Test Structure:
+        # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
+
+        code = self.generate_repeated_code(self.attack_block, self.setup)
+
+        return Transaction(
+            to=pre.deploy_contract(code=code),
+            gas_limit=self.fork.transaction_gas_limit_cap() or 30_000_000,
+            sender=pre.fund_eoa(),
+        )
+
+
+class ExtCallGenerator(BenchmarkCodeGenerator):
+    """Generates bytecode that fills the contract to maximum allowed code size."""
+
+    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+        """Generate transaction with maximal code size coverage."""
+        # Benchmark Test Structure:
+        # There are two contracts:
+        # 1. The target contract that executes certain operation but not loop (e.g. PUSH)
+        # 2. The loop contract that calls the target contract in a loop
+        #
+        # attack = POP(STATICCALL(GAS, target_contract_address, 0, 0, 0, 0))
+        # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
+        # This could optimize the gas consumption and increase the cycle count.
+
+        max_stack_height = self.fork.max_stack_height()
+
+        target_contract_address = pre.deploy_contract(code=self.attack_block * max_stack_height)
+
+        code_sequence = Op.POP(Op.STATICCALL(Op.GAS, target_contract_address, 0, 0, 0, 0))
+
+        code = self.generate_repeated_code(code_sequence, Bytecode())
+
+        return Transaction(
+            to=pre.deploy_contract(code=code),
+            gas_limit=self.fork.transaction_gas_limit_cap() or 30_000_000,
+            sender=pre.fund_eoa(),
+        )
diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index 410e08820c5..fa6e8f63432 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -28,6 +28,7 @@
     Transaction,
     add_kzg_version,
 )
+from ethereum_test_tools.benchmark_code_generator import JumpLoopGenerator
 from ethereum_test_types import TransactionType
 from ethereum_test_vm import Opcode
 from ethereum_test_vm import Opcodes as Op
@@ -1843,27 +1844,19 @@ def test_worst_jumpis(
 
 @pytest.mark.slow
 def test_worst_jumpdests(
-    state_test: StateTestFiller,
+    benchmark_state_test: BenchmarkStateTestFiller,
     pre: Alloc,
+    env: Environment,
     fork: Fork,
     gas_benchmark_value: int,
 ):
     """Test running a JUMPDEST-intensive contract."""
-    max_code_size = fork.max_code_size()
+    generator = JumpLoopGenerator(fork, Op.JUMPDEST)
+    tx = generator.generate_transaction(pre, gas_benchmark_value)
 
-    # Create and deploy a contract with many JUMPDESTs
-    code_suffix = Op.JUMP(Op.PUSH0)
-    code_body = Op.JUMPDEST * (max_code_size - len(code_suffix))
-    code = code_body + code_suffix
-    jumpdests_address = pre.deploy_contract(code=code)
-
-    tx = Transaction(
-        to=jumpdests_address,
-        gas_limit=gas_benchmark_value,
-        sender=pre.fund_eoa(),
-    )
-
-    state_test(
+    benchmark_state_test(
+        env=env,
+        gas_benchmark_value=gas_benchmark_value,
         pre=pre,
         post={},
         tx=tx,
@@ -2783,7 +2776,6 @@ def test_worst_swap(
 
     tx = Transaction(
         to=pre.deploy_contract(code=code),
-        gas_limit=gas_benchmark_value,
         sender=pre.fund_eoa(),
     )
 

From 8ca027fa10ac8db37c2e93bac39f93798b594d1f Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Tue, 9 Sep 2025 21:31:31 +0800
Subject: [PATCH 04/19] fix: resolve typing issue

---
 tests/benchmark/test_worst_blocks.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index df007629349..d50ad80b91c 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -155,11 +155,10 @@ def test_block_full_of_ether_transfers(
     )
 
     benchmark_test(
-        genesis_environment=env,
+        env=env,
         pre=pre,
         post=post_state,
         blocks=[Block(txs=txs)],
-        exclude_full_post_state_in_output=True,
         expected_benchmark_gas_used=iteration_count * intrinsic_cost,
     )
 

From d76104fe037d99378a6afd3ca801835b1e463c63 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 11 Sep 2025 20:46:56 +0800
Subject: [PATCH 05/19] refactor: update benchmark code generator and test
 wrapper

---
 src/ethereum_test_benchmark/__init__.py       |  13 ++
 .../benchmark_code_generator.py               |  79 +++++----
 src/ethereum_test_specs/benchmark.py          | 155 ++++++++++++++++--
 src/ethereum_test_tools/__init__.py           |   5 +
 tests/benchmark/conftest.py                   |  15 ++
 tests/benchmark/test_worst_blocks.py          |   2 +
 tests/benchmark/test_worst_compute.py         |  36 ++--
 .../benchmark/test_worst_stateful_opcodes.py  |  49 ++++--
 8 files changed, 266 insertions(+), 88 deletions(-)
 create mode 100644 src/ethereum_test_benchmark/__init__.py
 rename src/{ethereum_test_tools => ethereum_test_benchmark}/benchmark_code_generator.py (54%)

diff --git a/src/ethereum_test_benchmark/__init__.py b/src/ethereum_test_benchmark/__init__.py
new file mode 100644
index 00000000000..60f0e66a5fb
--- /dev/null
+++ b/src/ethereum_test_benchmark/__init__.py
@@ -0,0 +1,13 @@
+"""Benchmark code generator classes for creating optimized bytecode patterns."""
+
+from .benchmark_code_generator import (
+    BenchmarkCodeGenerator,
+    ExtCallGenerator,
+    JumpLoopGenerator,
+)
+
+__all__ = (
+    "BenchmarkCodeGenerator",
+    "ExtCallGenerator",
+    "JumpLoopGenerator",
+)
diff --git a/src/ethereum_test_tools/benchmark_code_generator.py b/src/ethereum_test_benchmark/benchmark_code_generator.py
similarity index 54%
rename from src/ethereum_test_tools/benchmark_code_generator.py
rename to src/ethereum_test_benchmark/benchmark_code_generator.py
index 57e7b0e1e4c..32b73637c5f 100644
--- a/src/ethereum_test_tools/benchmark_code_generator.py
+++ b/src/ethereum_test_benchmark/benchmark_code_generator.py
@@ -1,26 +1,26 @@
 """Benchmark code generator classes for creating optimized bytecode patterns."""
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from dataclasses import dataclass, field
 
 from ethereum_test_forks import Fork
-from ethereum_test_tools import Alloc, Bytecode, Transaction
-from ethereum_test_tools.vm.opcode import Opcodes as Op
+from ethereum_test_types import Alloc, Transaction
+from ethereum_test_vm import Bytecode
+from ethereum_test_vm.opcode import Opcodes as Op
 
 
+@dataclass
 class BenchmarkCodeGenerator(ABC):
     """Abstract base class for generating benchmark bytecode."""
 
-    def __init__(
-        self,
-        fork: Fork,
-        attack_block: Bytecode,
-        setup: Optional[Bytecode] = None,
-    ):
-        """Initialize with fork, attack block, and optional setup bytecode."""
-        self.fork = fork
-        self.setup = setup or Bytecode()
-        self.attack_block = attack_block
+    fork: Fork
+    attack_block: Bytecode
+    setup: Bytecode = field(default_factory=Bytecode)
+
+    @abstractmethod
+    def deploy_contracts(self, pre: Alloc) -> None:
+        """Deploy any contracts needed for the benchmark."""
+        pass
 
     @abstractmethod
     def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
@@ -29,14 +29,14 @@ def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
 
     def generate_repeated_code(self, repeated_code: Bytecode, setup: Bytecode) -> Bytecode:
         """Calculate the maximum number of iterations that can fit in the code size limit."""
+        assert len(repeated_code) > 0, "repeated_code cannot be empty"
         max_code_size = self.fork.max_code_size()
 
-        overhead = len(Op.JUMPDEST) + len(Op.JUMP(len(setup)))
+        overhead = len(setup) + len(Op.JUMPDEST) + len(Op.JUMP(len(setup)))
         available_space = max_code_size - overhead
-        max_iterations = available_space // len(repeated_code) if len(repeated_code) > 0 else 0
+        max_iterations = available_space // len(repeated_code)
 
         code = setup + Op.JUMPDEST + repeated_code * max_iterations + Op.JUMP(len(setup))
-
         self._validate_code_size(code)
 
         return code
@@ -50,47 +50,62 @@ def _validate_code_size(self, code: Bytecode) -> None:
             )
 
 
+@dataclass
 class JumpLoopGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that loops execution using JUMP operations."""
 
-    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
-        """Generate transaction with looping bytecode pattern."""
+    def deploy_contracts(self, pre: Alloc) -> None:
+        """Deploy the looping contract."""
         # Benchmark Test Structure:
         # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
-
         code = self.generate_repeated_code(self.attack_block, self.setup)
+        self._contract_address = pre.deploy_contract(code=code)
+
+    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+        """Generate transaction that executes the looping contract."""
+        if not hasattr(self, "_contract_address"):
+            raise ValueError("deploy_contracts must be called before generate_transaction")
 
         return Transaction(
-            to=pre.deploy_contract(code=code),
-            gas_limit=self.fork.transaction_gas_limit_cap() or 30_000_000,
+            to=self._contract_address,
+            gas_limit=gas_limit,
             sender=pre.fund_eoa(),
         )
 
 
+@dataclass
 class ExtCallGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that fills the contract to maximum allowed code size."""
 
-    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
-        """Generate transaction with maximal code size coverage."""
+    def deploy_contracts(self, pre: Alloc) -> None:
+        """Deploy both target and caller contracts."""
         # Benchmark Test Structure:
         # There are two contracts:
         # 1. The target contract that executes certain operation but not loop (e.g. PUSH)
         # 2. The loop contract that calls the target contract in a loop
-        #
-        # attack = POP(STATICCALL(GAS, target_contract_address, 0, 0, 0, 0))
-        # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_lengt)
-        # This could optimize the gas consumption and increase the cycle count.
 
         max_stack_height = self.fork.max_stack_height()
 
-        target_contract_address = pre.deploy_contract(code=self.attack_block * max_stack_height)
+        # Deploy target contract that contains the actual attack block
+        self._target_contract_address = pre.deploy_contract(
+            code=self.attack_block * max_stack_height
+        )
 
-        code_sequence = Op.POP(Op.STATICCALL(Op.GAS, target_contract_address, 0, 0, 0, 0))
+        # Create caller contract that repeatedly calls the target contract
+        # attack = POP(STATICCALL(GAS, target_contract_address, 0, 0, 0, 0))
+        # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
+        code_sequence = Op.POP(Op.STATICCALL(Op.GAS, self._target_contract_address, 0, 0, 0, 0))
+
+        caller_code = self.generate_repeated_code(code_sequence, Bytecode())
+        self._contract_address = pre.deploy_contract(code=caller_code)
 
-        code = self.generate_repeated_code(code_sequence, Bytecode())
+    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+        """Generate transaction that executes the caller contract."""
+        if not hasattr(self, "_contract_address"):
+            raise ValueError("deploy_contracts must be called before generate_transaction")
 
         return Transaction(
-            to=pre.deploy_contract(code=code),
-            gas_limit=self.fork.transaction_gas_limit_cap() or 30_000_000,
+            to=self._contract_address,
+            gas_limit=gas_limit,
             sender=pre.fund_eoa(),
         )
diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index d1ffdb306db..5bf670e0cce 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -1,9 +1,12 @@
 """Ethereum benchmark test spec definition and filler."""
 
-from typing import Callable, ClassVar, Dict, Generator, List, Optional, Sequence, Type
+from contextlib import contextmanager
+from contextvars import ContextVar
+from enum import Enum
+from typing import Any, Callable, ClassVar, Dict, Generator, List, Optional, Sequence, Type
 
 import pytest
-from pydantic import Field
+from pydantic import ConfigDict, Field
 
 from ethereum_clis import TransitionTool
 from ethereum_test_base_types import HexNumber
@@ -29,9 +32,74 @@
 from .blockchain import Block, BlockchainTest
 
 
+class BenchmarkPhase(Enum):
+    """Phases of a benchmark test."""
+
+    SETUP = "setup"
+    EXECUTION = "execution"
+
+
+_current_phase: ContextVar[Optional[BenchmarkPhase]] = ContextVar("benchmark_phase", default=None)
+
+
+class BenchmarkManager:
+    """Context manager for managing benchmark test phases."""
+
+    def __init__(self):
+        """Initialize the BenchmarkManager with empty transaction and block lists."""
+        self.setup_transactions: List[Transaction] = []
+        self.setup_blocks: List[Block] = []
+        self.execution_transactions: List[Transaction] = []
+        self.execution_blocks: List[Block] = []
+
+    @contextmanager
+    def setup(self):
+        """Context manager for the setup phase of a benchmark test."""
+        token = _current_phase.set(BenchmarkPhase.SETUP)
+        try:
+            yield self
+        finally:
+            _current_phase.reset(token)
+
+    @contextmanager
+    def execution(self):
+        """Context manager for the execution phase of a benchmark test."""
+        token = _current_phase.set(BenchmarkPhase.EXECUTION)
+        try:
+            yield self
+        finally:
+            _current_phase.reset(token)
+
+    def add_transaction(self, tx: Transaction):
+        """Add a transaction to the current phase."""
+        current_phase = _current_phase.get()
+        if current_phase == BenchmarkPhase.SETUP:
+            self.setup_transactions.append(tx)
+        elif current_phase == BenchmarkPhase.EXECUTION:
+            self.execution_transactions.append(tx)
+        else:
+            self.setup_transactions.append(tx)
+
+    def add_block(self, block: Block):
+        """Add a block to the current phase."""
+        current_phase = _current_phase.get()
+        if current_phase == BenchmarkPhase.SETUP:
+            self.setup_blocks.append(block)
+        elif current_phase == BenchmarkPhase.EXECUTION:
+            self.execution_blocks.append(block)
+        else:
+            self.setup_blocks.append(block)
+
+    def get_current_phase(self) -> Optional[BenchmarkPhase]:
+        """Get the current benchmark phase."""
+        return _current_phase.get()
+
+
 class BenchmarkTest(BaseTest):
     """Test type designed specifically for benchmark test cases."""
 
+    model_config = ConfigDict(extra="forbid")
+
     pre: Alloc
     post: Alloc
     tx: Optional[Transaction] = None
@@ -41,6 +109,9 @@ class BenchmarkTest(BaseTest):
     ) = None
     env: Environment = Field(default_factory=Environment)
     expected_benchmark_gas_used: int | None = None
+    gas_benchmark_value: int
+    benchmark_manager: Optional[Any] = Field(default=None, exclude=True)
+    code_generator: Optional[Any] = Field(default=None, exclude=True)
 
     supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
         BlockchainFixture,
@@ -86,26 +157,81 @@ def get_genesis_environment(self, fork: Fork) -> Environment:
 
     def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[Transaction]:
         """Split a transaction that exceeds the gas limit cap into multiple transactions."""
-        if (gas_limit_cap is None) or (tx.gas_limit <= gas_limit_cap):
+        if gas_limit_cap is None:
+            tx.gas_limit = HexNumber(self.gas_benchmark_value)
+            return [tx]
+
+        if gas_limit_cap >= self.gas_benchmark_value:
+            tx.gas_limit = HexNumber(min(tx.gas_limit, self.gas_benchmark_value))
             return [tx]
 
-        total_gas = int(self.expected_benchmark_gas_used or self.env.gas_limit)
-        print(f"total_gas: {total_gas}")
-        num_splits = total_gas // gas_limit_cap
+        remaining_gas = self.gas_benchmark_value
+        num_splits = -(-remaining_gas // gas_limit_cap)  # ceiling division
 
         split_transactions = []
         for i in range(num_splits):
             split_tx = tx.model_copy()
-            total_gas -= gas_limit_cap
-            split_tx.gas_limit = HexNumber(total_gas if i == num_splits - 1 else gas_limit_cap)
+            split_tx.gas_limit = HexNumber(remaining_gas if i == num_splits - 1 else gas_limit_cap)
+            remaining_gas -= gas_limit_cap
             split_tx.nonce = HexNumber(tx.nonce + i)
             split_transactions.append(split_tx)
 
         return split_transactions
 
+    def generate_blocks_from_code_generator(self, fork: Fork) -> List[Block]:
+        """Generate blocks using the code generator."""
+        if self.code_generator is None:
+            return []
+
+        self.code_generator.deploy_contracts(self.pre)
+        gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
+        benchmark_tx = self.code_generator.generate_transaction(self.pre, gas_limit)
+
+        execution_txs = self.split_transaction(benchmark_tx, gas_limit)
+        execution_block = Block(txs=execution_txs)
+
+        return [execution_block]
+
     def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
         """Create a BlockchainTest from this BenchmarkTest."""
-        if self.blocks is not None:
+        if self.code_generator is not None:
+            generated_blocks = self.generate_blocks_from_code_generator(fork)
+            return BlockchainTest.from_test(
+                base_test=self,
+                genesis_environment=self.env,
+                pre=self.pre,
+                post=self.post,
+                blocks=generated_blocks,
+            )
+
+        elif self.benchmark_manager is not None:
+            all_blocks = []
+            gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
+
+            if self.benchmark_manager.setup_blocks:
+                all_blocks.extend(self.benchmark_manager.setup_blocks)
+            elif self.benchmark_manager.setup_transactions:
+                setup_txs = []
+                for tx in self.benchmark_manager.setup_transactions:
+                    setup_txs.extend(self.split_transaction(tx, gas_limit))
+                all_blocks.append(Block(txs=setup_txs))
+
+            if self.benchmark_manager.execution_blocks:
+                all_blocks.extend(self.benchmark_manager.execution_blocks)
+            elif self.benchmark_manager.execution_transactions:
+                execution_txs = []
+                for tx in self.benchmark_manager.execution_transactions:
+                    execution_txs.extend(self.split_transaction(tx, gas_limit))
+                all_blocks.append(Block(txs=execution_txs))
+
+            return BlockchainTest.from_test(
+                base_test=self,
+                genesis_environment=self.env,
+                pre=self.pre,
+                post=self.post,
+                blocks=all_blocks,
+            )
+        elif self.blocks is not None:
             return BlockchainTest.from_test(
                 base_test=self,
                 genesis_environment=self.env,
@@ -114,9 +240,9 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
                 blocks=self.blocks,
             )
         elif self.tx is not None:
-            gas_limit_cap = fork.transaction_gas_limit_cap()
+            gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
 
-            transactions = self.split_transaction(self.tx, gas_limit_cap)
+            transactions = self.split_transaction(self.tx, gas_limit)
 
             blocks = [Block(txs=transactions)]
 
@@ -129,7 +255,7 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
             )
         else:
             raise ValueError(
-                "Cannot create BlockchainTest without transactions, blocks, or code_generator"
+                "Cannot create BlockchainTest without transactions, blocks, or benchmark_manager"
             )
 
     def generate(
@@ -162,5 +288,10 @@ def execute(
         raise Exception(f"Unsupported execute format: {execute_format}")
 
 
+def create_benchmark_manager() -> BenchmarkManager:
+    """Create a new BenchmarkManager instance for phase-aware benchmark testing."""
+    return BenchmarkManager()
+
+
 BenchmarkTestSpec = Callable[[str], Generator[BenchmarkTest, None, None]]
 BenchmarkTestFiller = Type[BenchmarkTest]
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index 4c6d7980166..04b1770ac61 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -16,6 +16,11 @@
     TestPrivateKey2,
 )
 from ethereum_test_base_types.reference_spec import ReferenceSpec, ReferenceSpecTypes
+from ethereum_test_benchmark import (
+    BenchmarkCodeGenerator,
+    ExtCallGenerator,
+    JumpLoopGenerator,
+)
 from ethereum_test_exceptions import (
     BlockException,
     EngineAPIError,
diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py
index 3af1bf9ade7..3f0a67ab556 100644
--- a/tests/benchmark/conftest.py
+++ b/tests/benchmark/conftest.py
@@ -4,6 +4,9 @@
 
 import pytest
 
+from ethereum_test_forks import Fork
+from ethereum_test_specs.benchmark import BenchmarkManager, create_benchmark_manager
+
 DEFAULT_BENCHMARK_FORK = "Prague"
 
 
@@ -59,3 +62,15 @@ def pytest_collection_modifyitems(config, items):
 
     for i in reversed(items_for_removal):
         items.pop(i)
+
+
+@pytest.fixture
+def tx_gas_limit_cap(fork: Fork, gas_benchmark_value: int) -> int:
+    """Return the transaction gas limit cap."""
+    return fork.transaction_gas_limit_cap() or gas_benchmark_value
+
+
+@pytest.fixture
+def benchmark_manager() -> BenchmarkManager:
+    """Return a benchmark manager."""
+    return create_benchmark_manager()
diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index d50ad80b91c..d19f34e8b35 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -119,6 +119,7 @@ def test_block_full_of_ether_transfers(
     iteration_count: int,
     transfer_amount: int,
     intrinsic_cost: int,
+    gas_benchmark_value: int,
 ):
     """
     Single test for ether transfer scenarios.
@@ -159,6 +160,7 @@ def test_block_full_of_ether_transfers(
         pre=pre,
         post=post_state,
         blocks=[Block(txs=txs)],
+        gas_benchmark_value=gas_benchmark_value,
         expected_benchmark_gas_used=iteration_count * intrinsic_cost,
     )
 
diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index fa6e8f63432..efa854625fe 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -15,11 +15,12 @@
 from py_ecc.bn128 import G1, G2, multiply
 
 from ethereum_test_base_types.base_types import Bytes
+from ethereum_test_benchmark import JumpLoopGenerator
 from ethereum_test_forks import Fork
 from ethereum_test_tools import (
     Address,
     Alloc,
-    BenchmarkStateTestFiller,
+    BenchmarkTestFiller,
     Block,
     BlockchainTestFiller,
     Bytecode,
@@ -1844,22 +1845,19 @@ def test_worst_jumpis(
 
 @pytest.mark.slow
 def test_worst_jumpdests(
-    benchmark_state_test: BenchmarkStateTestFiller,
+    benchmark_test: BenchmarkTestFiller,
     pre: Alloc,
     env: Environment,
     fork: Fork,
     gas_benchmark_value: int,
 ):
     """Test running a JUMPDEST-intensive contract."""
-    generator = JumpLoopGenerator(fork, Op.JUMPDEST)
-    tx = generator.generate_transaction(pre, gas_benchmark_value)
-
-    benchmark_state_test(
+    benchmark_test(
         env=env,
-        gas_benchmark_value=gas_benchmark_value,
         pre=pre,
         post={},
-        tx=tx,
+        code_generator=JumpLoopGenerator(fork, Op.JUMPDEST),
+        gas_benchmark_value=gas_benchmark_value,
     )
 
 
@@ -2758,7 +2756,7 @@ def test_worst_calldataload(
     ],
 )
 def test_worst_swap(
-    benchmark_state_test: BenchmarkStateTestFiller,
+    benchmark_test: BenchmarkTestFiller,
     pre: Alloc,
     env: Environment,
     fork: Fork,
@@ -2766,25 +2764,11 @@ def test_worst_swap(
     gas_benchmark_value: int,
 ):
     """Test running a block with as many SWAP as possible."""
-    max_code_size = fork.max_code_size()
-
-    code_prefix = Op.JUMPDEST + Op.PUSH0 * opcode.min_stack_height
-    code_suffix = Op.PUSH0 + Op.JUMP
-    opcode_sequence = opcode * (max_code_size - len(code_prefix) - len(code_suffix))
-    code = code_prefix + opcode_sequence + code_suffix
-    assert len(code) <= max_code_size
-
-    tx = Transaction(
-        to=pre.deploy_contract(code=code),
-        sender=pre.fund_eoa(),
-    )
-
-    benchmark_state_test(
-        env=env,
+    benchmark_test(
         pre=pre,
-        gas_benchmark_value=gas_benchmark_value,
         post={},
-        tx=tx,
+        code_generator=JumpLoopGenerator(fork, opcode, setup=Op.PUSH0 * opcode.min_stack_height),
+        gas_benchmark_value=gas_benchmark_value,
     )
 
 
diff --git a/tests/benchmark/test_worst_stateful_opcodes.py b/tests/benchmark/test_worst_stateful_opcodes.py
index f68783e61c3..01d74b55518 100644
--- a/tests/benchmark/test_worst_stateful_opcodes.py
+++ b/tests/benchmark/test_worst_stateful_opcodes.py
@@ -10,6 +10,8 @@
 import pytest
 
 from ethereum_test_forks import Fork
+from ethereum_test_specs import BenchmarkTestFiller
+from ethereum_test_specs.benchmark import BenchmarkManager
 from ethereum_test_tools import (
     Account,
     Address,
@@ -47,7 +49,8 @@
     ],
 )
 def test_worst_address_state_cold(
-    blockchain_test: BlockchainTestFiller,
+    benchmark_test: BenchmarkTestFiller,
+    benchmark_manager: BenchmarkManager,
     pre: Alloc,
     fork: Fork,
     opcode: Op,
@@ -67,7 +70,6 @@ def test_worst_address_state_cold(
         attack_gas_limit - intrinsic_gas_cost_calc()
     ) // gas_costs.G_COLD_ACCOUNT_ACCESS
 
-    blocks = []
     post = {}
 
     # Setup
@@ -76,42 +78,53 @@ def test_worst_address_state_cold(
     # collisions with the addresses indirectly created by the testing framework.
     addr_offset = int.from_bytes(pre.fund_eoa(amount=0))
 
+    # Create sender accounts upfront so we can include them in post-state
+    execution_sender = pre.fund_eoa()
+
     if not absent_accounts:
+        setup_sender = pre.fund_eoa()
         factory_code = Op.PUSH4(num_target_accounts) + While(
             body=Op.POP(Op.CALL(address=Op.ADD(addr_offset, Op.DUP6), value=10)),
             condition=Op.PUSH1(1) + Op.SWAP1 + Op.SUB + Op.DUP1 + Op.ISZERO + Op.ISZERO,
         )
         factory_address = pre.deploy_contract(code=factory_code, balance=10**18)
 
-        setup_tx = Transaction(
-            to=factory_address,
-            gas_limit=env.gas_limit,
-            sender=pre.fund_eoa(),
-        )
-        blocks.append(Block(txs=[setup_tx]))
+        with benchmark_manager.setup():
+            setup_tx = Transaction(
+                to=factory_address,
+                gas_limit=env.gas_limit,
+                sender=setup_sender,
+            )
+            benchmark_manager.add_transaction(setup_tx)
 
         for i in range(num_target_accounts):
             addr = Address(i + addr_offset + 1)
             post[addr] = Account(balance=10)
 
-    # Execution
+        # Include setup sender in post-state
+        post[setup_sender] = Account()
+
+    # Execution phase
     op_code = Op.PUSH4(num_target_accounts) + While(
         body=Op.POP(opcode(Op.ADD(addr_offset, Op.DUP1))),
         condition=Op.PUSH1(1) + Op.SWAP1 + Op.SUB + Op.DUP1 + Op.ISZERO + Op.ISZERO,
     )
     op_address = pre.deploy_contract(code=op_code)
-    op_tx = Transaction(
-        to=op_address,
-        gas_limit=attack_gas_limit,
-        sender=pre.fund_eoa(),
-    )
-    blocks.append(Block(txs=[op_tx]))
 
-    blockchain_test(
+    with benchmark_manager.execution():
+        benchmark_manager.add_transaction(
+            Transaction(
+                to=op_address,
+                gas_limit=attack_gas_limit,
+                sender=execution_sender,
+            )
+        )
+
+    benchmark_test(
         pre=pre,
         post=post,
-        blocks=blocks,
-        exclude_full_post_state_in_output=True,
+        benchmark_manager=benchmark_manager,
+        gas_benchmark_value=gas_benchmark_value,
     )
 
 

From 51d6817c2d8108f7d804c5036ed5fb4cc8fdc602 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 11 Sep 2025 22:41:21 +0800
Subject: [PATCH 06/19] fix: update example changes

---
 tests/benchmark/test_worst_blocks.py          | 23 +++---
 .../benchmark/test_worst_stateful_opcodes.py  | 73 +++++++++----------
 2 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index d19f34e8b35..e8d557c1b8e 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -10,6 +10,7 @@
 import pytest
 
 from ethereum_test_forks import Fork
+from ethereum_test_specs.benchmark import BenchmarkManager
 from ethereum_test_tools import (
     AccessList,
     Account,
@@ -112,6 +113,7 @@ def ether_transfer_case(
 )
 def test_block_full_of_ether_transfers(
     benchmark_test: BenchmarkTestFiller,
+    benchmark_manager: BenchmarkManager,
     pre: Alloc,
     env: Environment,
     case_id: str,
@@ -136,17 +138,18 @@ def test_block_full_of_ether_transfers(
     # Create a single block with all transactions
     txs = []
     balances: dict[Address, int] = {}
-    for _ in range(iteration_count):
-        receiver = next(receivers)
-        balances[receiver] = balances.get(receiver, 0) + transfer_amount
-        txs.append(
-            Transaction(
-                to=receiver,
-                value=transfer_amount,
-                gas_limit=intrinsic_cost,
-                sender=next(senders),
+    with benchmark_manager.execution():
+        for _ in range(iteration_count):
+            receiver = next(receivers)
+            balances[receiver] = balances.get(receiver, 0) + transfer_amount
+            txs.append(
+                Transaction(
+                    to=receiver,
+                    value=transfer_amount,
+                    gas_limit=intrinsic_cost,
+                    sender=next(senders),
+                )
             )
-        )
 
     # Only include post state for non a_to_a cases
     post_state = (
diff --git a/tests/benchmark/test_worst_stateful_opcodes.py b/tests/benchmark/test_worst_stateful_opcodes.py
index 01d74b55518..5f92c7e6a2c 100644
--- a/tests/benchmark/test_worst_stateful_opcodes.py
+++ b/tests/benchmark/test_worst_stateful_opcodes.py
@@ -49,8 +49,7 @@
     ],
 )
 def test_worst_address_state_cold(
-    benchmark_test: BenchmarkTestFiller,
-    benchmark_manager: BenchmarkManager,
+    blockchain_test: BlockchainTestFiller,
     pre: Alloc,
     fork: Fork,
     opcode: Op,
@@ -70,6 +69,7 @@ def test_worst_address_state_cold(
         attack_gas_limit - intrinsic_gas_cost_calc()
     ) // gas_costs.G_COLD_ACCOUNT_ACCESS
 
+    blocks = []
     post = {}
 
     # Setup
@@ -78,53 +78,42 @@ def test_worst_address_state_cold(
     # collisions with the addresses indirectly created by the testing framework.
     addr_offset = int.from_bytes(pre.fund_eoa(amount=0))
 
-    # Create sender accounts upfront so we can include them in post-state
-    execution_sender = pre.fund_eoa()
-
     if not absent_accounts:
-        setup_sender = pre.fund_eoa()
         factory_code = Op.PUSH4(num_target_accounts) + While(
             body=Op.POP(Op.CALL(address=Op.ADD(addr_offset, Op.DUP6), value=10)),
             condition=Op.PUSH1(1) + Op.SWAP1 + Op.SUB + Op.DUP1 + Op.ISZERO + Op.ISZERO,
         )
         factory_address = pre.deploy_contract(code=factory_code, balance=10**18)
 
-        with benchmark_manager.setup():
-            setup_tx = Transaction(
-                to=factory_address,
-                gas_limit=env.gas_limit,
-                sender=setup_sender,
-            )
-            benchmark_manager.add_transaction(setup_tx)
+        setup_tx = Transaction(
+            to=factory_address,
+            gas_limit=env.gas_limit,
+            sender=pre.fund_eoa(),
+        )
+        blocks.append(Block(txs=[setup_tx]))
 
         for i in range(num_target_accounts):
             addr = Address(i + addr_offset + 1)
             post[addr] = Account(balance=10)
 
-        # Include setup sender in post-state
-        post[setup_sender] = Account()
-
-    # Execution phase
+    # Execution
     op_code = Op.PUSH4(num_target_accounts) + While(
         body=Op.POP(opcode(Op.ADD(addr_offset, Op.DUP1))),
         condition=Op.PUSH1(1) + Op.SWAP1 + Op.SUB + Op.DUP1 + Op.ISZERO + Op.ISZERO,
     )
     op_address = pre.deploy_contract(code=op_code)
+    op_tx = Transaction(
+        to=op_address,
+        gas_limit=attack_gas_limit,
+        sender=pre.fund_eoa(),
+    )
+    blocks.append(Block(txs=[op_tx]))
 
-    with benchmark_manager.execution():
-        benchmark_manager.add_transaction(
-            Transaction(
-                to=op_address,
-                gas_limit=attack_gas_limit,
-                sender=execution_sender,
-            )
-        )
-
-    benchmark_test(
+    blockchain_test(
         pre=pre,
         post=post,
-        benchmark_manager=benchmark_manager,
-        gas_benchmark_value=gas_benchmark_value,
+        blocks=blocks,
+        exclude_full_post_state_in_output=True,
     )
 
 
@@ -464,30 +453,36 @@ def test_worst_storage_access_warm(
 
 
 def test_worst_blockhash(
-    blockchain_test: BlockchainTestFiller,
+    benchmark_test: BenchmarkTestFiller,
+    benchmark_manager: BenchmarkManager,
     pre: Alloc,
     gas_benchmark_value: int,
 ):
     """Test running a block with as many blockhash accessing oldest allowed block as possible."""
     # Create 256 dummy blocks to fill the blockhash window.
-    blocks = [Block()] * 256
+    with benchmark_manager.setup():
+        for _ in range(256):
+            benchmark_manager.add_block(Block())
 
     # Always ask for the oldest allowed BLOCKHASH block.
     execution_code = Op.PUSH1(1) + While(
         body=Op.POP(Op.BLOCKHASH(Op.DUP1)),
     )
     execution_code_address = pre.deploy_contract(code=execution_code)
-    op_tx = Transaction(
-        to=execution_code_address,
-        gas_limit=gas_benchmark_value,
-        sender=pre.fund_eoa(),
-    )
-    blocks.append(Block(txs=[op_tx]))
+    with benchmark_manager.execution():
+        benchmark_manager.add_transaction(
+            Transaction(
+                to=execution_code_address,
+                gas_limit=gas_benchmark_value,
+                sender=pre.fund_eoa(),
+            )
+        )
 
-    blockchain_test(
+    benchmark_test(
         pre=pre,
         post={},
-        blocks=blocks,
+        benchmark_manager=benchmark_manager,
+        gas_benchmark_value=gas_benchmark_value,
     )
 
 

From 99f22d73d53f92559d2c1c3089348b70a85a20df Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Fri, 12 Sep 2025 13:18:37 +0800
Subject: [PATCH 07/19] refactor: resolve typing and update func interface

---
 .../benchmark_code_generator.py               | 59 +++-----------
 src/ethereum_test_specs/benchmark.py          | 79 ++++++++++++++++---
 src/ethereum_test_vm/bytecode.py              | 19 ++++-
 src/pytest_plugins/shared/execute_fill.py     |  1 +
 tests/benchmark/test_worst_compute.py         | 10 +--
 5 files changed, 102 insertions(+), 66 deletions(-)

diff --git a/src/ethereum_test_benchmark/benchmark_code_generator.py b/src/ethereum_test_benchmark/benchmark_code_generator.py
index 32b73637c5f..dce3ef7392e 100644
--- a/src/ethereum_test_benchmark/benchmark_code_generator.py
+++ b/src/ethereum_test_benchmark/benchmark_code_generator.py
@@ -1,67 +1,26 @@
 """Benchmark code generator classes for creating optimized bytecode patterns."""
 
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 
 from ethereum_test_forks import Fork
+from ethereum_test_specs.benchmark import BenchmarkCodeGenerator
 from ethereum_test_types import Alloc, Transaction
 from ethereum_test_vm import Bytecode
 from ethereum_test_vm.opcode import Opcodes as Op
 
 
-@dataclass
-class BenchmarkCodeGenerator(ABC):
-    """Abstract base class for generating benchmark bytecode."""
-
-    fork: Fork
-    attack_block: Bytecode
-    setup: Bytecode = field(default_factory=Bytecode)
-
-    @abstractmethod
-    def deploy_contracts(self, pre: Alloc) -> None:
-        """Deploy any contracts needed for the benchmark."""
-        pass
-
-    @abstractmethod
-    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
-        """Generate a transaction with the specified gas limit."""
-        pass
-
-    def generate_repeated_code(self, repeated_code: Bytecode, setup: Bytecode) -> Bytecode:
-        """Calculate the maximum number of iterations that can fit in the code size limit."""
-        assert len(repeated_code) > 0, "repeated_code cannot be empty"
-        max_code_size = self.fork.max_code_size()
-
-        overhead = len(setup) + len(Op.JUMPDEST) + len(Op.JUMP(len(setup)))
-        available_space = max_code_size - overhead
-        max_iterations = available_space // len(repeated_code)
-
-        code = setup + Op.JUMPDEST + repeated_code * max_iterations + Op.JUMP(len(setup))
-        self._validate_code_size(code)
-
-        return code
-
-    def _validate_code_size(self, code: Bytecode) -> None:
-        """Validate that the generated code fits within size limits."""
-        if len(code) > self.fork.max_code_size():
-            raise ValueError(
-                f"Generated code size {len(code)} exceeds maximum allowed size "
-                f"{self.fork.max_code_size()}"
-            )
-
-
 @dataclass
 class JumpLoopGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that loops execution using JUMP operations."""
 
-    def deploy_contracts(self, pre: Alloc) -> None:
+    def deploy_contracts(self, pre: Alloc, fork: Fork) -> None:
         """Deploy the looping contract."""
         # Benchmark Test Structure:
         # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
-        code = self.generate_repeated_code(self.attack_block, self.setup)
+        code = self.generate_repeated_code(self.attack_block, self.setup, fork)
         self._contract_address = pre.deploy_contract(code=code)
 
-    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+    def generate_transaction(self, pre: Alloc, gas_limit: int, fork: Fork) -> Transaction:
         """Generate transaction that executes the looping contract."""
         if not hasattr(self, "_contract_address"):
             raise ValueError("deploy_contracts must be called before generate_transaction")
@@ -77,14 +36,14 @@ def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
 class ExtCallGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that fills the contract to maximum allowed code size."""
 
-    def deploy_contracts(self, pre: Alloc) -> None:
+    def deploy_contracts(self, pre: Alloc, fork: Fork) -> None:
         """Deploy both target and caller contracts."""
         # Benchmark Test Structure:
         # There are two contracts:
         # 1. The target contract that executes certain operation but not loop (e.g. PUSH)
         # 2. The loop contract that calls the target contract in a loop
 
-        max_stack_height = self.fork.max_stack_height()
+        max_stack_height = fork.max_stack_height()
 
         # Deploy target contract that contains the actual attack block
         self._target_contract_address = pre.deploy_contract(
@@ -96,10 +55,10 @@ def deploy_contracts(self, pre: Alloc) -> None:
         # setup + JUMPDEST + attack + attack + ... + attack + JUMP(setup_length)
         code_sequence = Op.POP(Op.STATICCALL(Op.GAS, self._target_contract_address, 0, 0, 0, 0))
 
-        caller_code = self.generate_repeated_code(code_sequence, Bytecode())
+        caller_code = self.generate_repeated_code(code_sequence, Bytecode(), fork)
         self._contract_address = pre.deploy_contract(code=caller_code)
 
-    def generate_transaction(self, pre: Alloc, gas_limit: int) -> Transaction:
+    def generate_transaction(self, pre: Alloc, gas_limit: int, fork: Fork) -> Transaction:
         """Generate transaction that executes the caller contract."""
         if not hasattr(self, "_contract_address"):
             raise ValueError("deploy_contracts must be called before generate_transaction")
diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 5bf670e0cce..57c241f7478 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -1,12 +1,19 @@
 """Ethereum benchmark test spec definition and filler."""
 
+from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from contextvars import ContextVar
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Callable, ClassVar, Dict, Generator, List, Optional, Sequence, Type
 
 import pytest
-from pydantic import ConfigDict, Field
+from pydantic import ConfigDict, Field, GetCoreSchemaHandler
+from pydantic_core.core_schema import (
+    PlainValidatorFunctionSchema,
+    no_info_plain_validator_function,
+    to_string_ser_schema,
+)
 
 from ethereum_clis import TransitionTool
 from ethereum_test_base_types import HexNumber
@@ -27,11 +34,55 @@
 )
 from ethereum_test_forks import Fork
 from ethereum_test_types import Alloc, Environment, Transaction
+from ethereum_test_vm import Bytecode
+from ethereum_test_vm.opcode import Opcodes as Op
 
 from .base import BaseTest
 from .blockchain import Block, BlockchainTest
 
 
+@dataclass(kw_only=True)
+class BenchmarkCodeGenerator(ABC):
+    """Abstract base class for generating benchmark bytecode."""
+
+    attack_block: Bytecode
+    setup: Bytecode = field(default_factory=Bytecode)
+
+    @abstractmethod
+    def deploy_contracts(self, pre: Alloc, fork: Fork) -> None:
+        """Deploy any contracts needed for the benchmark."""
+        pass
+
+    @abstractmethod
+    def generate_transaction(self, pre: Alloc, gas_limit: int, fork: Fork) -> Transaction:
+        """Generate a transaction with the specified gas limit."""
+        pass
+
+    def generate_repeated_code(
+        self, repeated_code: Bytecode, setup: Bytecode, fork: Fork
+    ) -> Bytecode:
+        """Calculate the maximum number of iterations that can fit in the code size limit."""
+        assert len(repeated_code) > 0, "repeated_code cannot be empty"
+        max_code_size = fork.max_code_size()
+
+        overhead = len(setup) + len(Op.JUMPDEST) + len(Op.JUMP(len(setup)))
+        available_space = max_code_size - overhead
+        max_iterations = available_space // len(repeated_code)
+
+        code = setup + Op.JUMPDEST + repeated_code * max_iterations + Op.JUMP(len(setup))
+        self._validate_code_size(code, fork)
+
+        return code
+
+    def _validate_code_size(self, code: Bytecode, fork: Fork) -> None:
+        """Validate that the generated code fits within size limits."""
+        if len(code) > fork.max_code_size():
+            raise ValueError(
+                f"Generated code size {len(code)} exceeds maximum allowed size "
+                f"{fork.max_code_size()}"
+            )
+
+
 class BenchmarkPhase(Enum):
     """Phases of a benchmark test."""
 
@@ -94,6 +145,16 @@ def get_current_phase(self) -> Optional[BenchmarkPhase]:
         """Get the current benchmark phase."""
         return _current_phase.get()
 
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls, source_type: Any, handler: GetCoreSchemaHandler
+    ) -> PlainValidatorFunctionSchema:
+        """Provide Pydantic core schema for BenchmarkManager serialization and validation."""
+        return no_info_plain_validator_function(
+            cls,
+            serialization=to_string_ser_schema(),
+        )
+
 
 class BenchmarkTest(BaseTest):
     """Test type designed specifically for benchmark test cases."""
@@ -101,17 +162,17 @@ class BenchmarkTest(BaseTest):
     model_config = ConfigDict(extra="forbid")
 
     pre: Alloc
-    post: Alloc
-    tx: Optional[Transaction] = None
-    blocks: Optional[List[Block]] = None
+    post: Alloc = Field(default_factory=Alloc)
+    tx: Transaction | None = None
+    blocks: List[Block] | None = None
     block_exception: (
         List[TransactionException | BlockException] | TransactionException | BlockException | None
     ) = None
     env: Environment = Field(default_factory=Environment)
     expected_benchmark_gas_used: int | None = None
-    gas_benchmark_value: int
-    benchmark_manager: Optional[Any] = Field(default=None, exclude=True)
-    code_generator: Optional[Any] = Field(default=None, exclude=True)
+    gas_benchmark_value: int = Field(default_factory=lambda: int(Environment().gas_limit))
+    benchmark_manager: BenchmarkManager | None = None
+    code_generator: BenchmarkCodeGenerator | None = None
 
     supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
         BlockchainFixture,
@@ -183,9 +244,9 @@ def generate_blocks_from_code_generator(self, fork: Fork) -> List[Block]:
         if self.code_generator is None:
             return []
 
-        self.code_generator.deploy_contracts(self.pre)
+        self.code_generator.deploy_contracts(self.pre, fork)
         gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
-        benchmark_tx = self.code_generator.generate_transaction(self.pre, gas_limit)
+        benchmark_tx = self.code_generator.generate_transaction(self.pre, gas_limit, fork)
 
         execution_txs = self.split_transaction(benchmark_tx, gas_limit)
         execution_block = Block(txs=execution_txs)
diff --git a/src/ethereum_test_vm/bytecode.py b/src/ethereum_test_vm/bytecode.py
index e07ab2cad0e..12f07e528d3 100644
--- a/src/ethereum_test_vm/bytecode.py
+++ b/src/ethereum_test_vm/bytecode.py
@@ -1,6 +1,13 @@
 """Ethereum Virtual Machine bytecode primitives and utilities."""
 
-from typing import SupportsBytes
+from typing import Any, SupportsBytes
+
+from pydantic import GetCoreSchemaHandler
+from pydantic_core.core_schema import (
+    PlainValidatorFunctionSchema,
+    no_info_plain_validator_function,
+    to_string_ser_schema,
+)
 
 from ethereum_test_base_types import Bytes, Hash
 
@@ -217,3 +224,13 @@ def hex(self) -> str:
     def keccak256(self) -> Hash:
         """Return the keccak256 hash of the opcode byte representation."""
         return Bytes(self._bytes_).keccak256()
+
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls, source_type: Any, handler: GetCoreSchemaHandler
+    ) -> PlainValidatorFunctionSchema:
+        """Provide Pydantic core schema for Bytecode serialization and validation."""
+        return no_info_plain_validator_function(
+            cls,
+            serialization=to_string_ser_schema(),
+        )
diff --git a/src/pytest_plugins/shared/execute_fill.py b/src/pytest_plugins/shared/execute_fill.py
index 98fc765db07..21b9b7ea5ba 100644
--- a/src/pytest_plugins/shared/execute_fill.py
+++ b/src/pytest_plugins/shared/execute_fill.py
@@ -13,6 +13,7 @@
 from ..spec_version_checker.spec_version_checker import EIPSpecTestItem
 
 ALL_FIXTURE_PARAMETERS = {
+    "gas_benchmark_value",
     "genesis_environment",
     "env",
 }
diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index efa854625fe..e4338de3b7d 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -1849,15 +1849,13 @@ def test_worst_jumpdests(
     pre: Alloc,
     env: Environment,
     fork: Fork,
-    gas_benchmark_value: int,
 ):
     """Test running a JUMPDEST-intensive contract."""
     benchmark_test(
         env=env,
         pre=pre,
         post={},
-        code_generator=JumpLoopGenerator(fork, Op.JUMPDEST),
-        gas_benchmark_value=gas_benchmark_value,
+        code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST),
     )
 
 
@@ -2761,14 +2759,14 @@ def test_worst_swap(
     env: Environment,
     fork: Fork,
     opcode: Opcode,
-    gas_benchmark_value: int,
 ):
     """Test running a block with as many SWAP as possible."""
     benchmark_test(
         pre=pre,
         post={},
-        code_generator=JumpLoopGenerator(fork, opcode, setup=Op.PUSH0 * opcode.min_stack_height),
-        gas_benchmark_value=gas_benchmark_value,
+        code_generator=JumpLoopGenerator(
+            attack_block=opcode, setup=Op.PUSH0 * opcode.min_stack_height
+        ),
     )
 
 

From 67a07d767ca351749f4118ea1762e24ab65106eb Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Fri, 12 Sep 2025 13:23:48 +0800
Subject: [PATCH 08/19] refactor: remove benchmark state test wrapper

---
 src/ethereum_test_specs/__init__.py        |   4 -
 src/ethereum_test_specs/benchmark_state.py | 226 ---------------------
 src/ethereum_test_tools/__init__.py        |   4 -
 3 files changed, 234 deletions(-)
 delete mode 100644 src/ethereum_test_specs/benchmark_state.py

diff --git a/src/ethereum_test_specs/__init__.py b/src/ethereum_test_specs/__init__.py
index 9a714640746..e0baf8c5188 100644
--- a/src/ethereum_test_specs/__init__.py
+++ b/src/ethereum_test_specs/__init__.py
@@ -3,7 +3,6 @@
 from .base import BaseTest, TestSpec
 from .base_static import BaseStaticTest
 from .benchmark import BenchmarkTest, BenchmarkTestFiller, BenchmarkTestSpec
-from .benchmark_state import BenchmarkStateTest, BenchmarkStateTestFiller, BenchmarkStateTestSpec
 from .blobs import BlobsTest, BlobsTestFiller, BlobsTestSpec
 from .blockchain import (
     BlockchainTest,
@@ -28,9 +27,6 @@
     "BenchmarkTest",
     "BenchmarkTestFiller",
     "BenchmarkTestSpec",
-    "BenchmarkStateTest",
-    "BenchmarkStateTestFiller",
-    "BenchmarkStateTestSpec",
     "BlobsTest",
     "BlobsTestFiller",
     "BlobsTestSpec",
diff --git a/src/ethereum_test_specs/benchmark_state.py b/src/ethereum_test_specs/benchmark_state.py
deleted file mode 100644
index 454af1a3844..00000000000
--- a/src/ethereum_test_specs/benchmark_state.py
+++ /dev/null
@@ -1,226 +0,0 @@
-"""Ethereum benchmark state test spec definition and filler."""
-
-import math
-from pprint import pprint
-from typing import Callable, ClassVar, Generator, List, Sequence, Type
-
-from pydantic import ConfigDict
-
-from ethereum_clis import TransitionTool
-from ethereum_test_base_types import HexNumber
-from ethereum_test_execution import (
-    BaseExecute,
-    ExecuteFormat,
-    LabeledExecuteFormat,
-    TransactionPost,
-)
-from ethereum_test_fixtures import (
-    BaseFixture,
-    FixtureFormat,
-    LabeledFixtureFormat,
-    StateFixture,
-)
-from ethereum_test_fixtures.common import FixtureBlobSchedule
-from ethereum_test_fixtures.state import (
-    FixtureConfig,
-    FixtureEnvironment,
-    FixtureForkPost,
-    FixtureTransaction,
-)
-from ethereum_test_forks import Fork
-from ethereum_test_types import Alloc, Environment, Transaction
-
-from .base import BaseTest, OpMode
-from .blockchain import Block, BlockchainTest
-from .debugging import print_traces
-from .helpers import verify_transactions
-
-
-class BenchmarkStateTest(BaseTest):
-    """Test type designed specifically for benchmark state test cases with full verification."""
-
-    pre: Alloc
-    post: Alloc
-    tx: Transaction
-    gas_benchmark_value: int
-    env: Environment
-    chain_id: int = 1
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
-        StateFixture,
-    ] + [
-        LabeledFixtureFormat(
-            fixture_format,
-            f"{fixture_format.format_name}_from_benchmark_state_test",
-            f"A {fixture_format.format_name} generated from a benchmark_state_test",
-        )
-        for fixture_format in BlockchainTest.supported_fixture_formats
-    ]
-
-    supported_execute_formats: ClassVar[Sequence[LabeledExecuteFormat]] = [
-        LabeledExecuteFormat(
-            TransactionPost,
-            "benchmark_state_test_with_verification",
-            "An execute test derived from a benchmark state test with verification",
-        ),
-    ]
-
-    def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[Transaction]:
-        """Split a transaction that exceeds the gas limit cap into multiple transactions."""
-        if (gas_limit_cap is None) or (tx.gas_limit <= gas_limit_cap):
-            return [tx]
-
-        total_gas = int(tx.gas_limit)
-        num_splits = math.ceil(total_gas / gas_limit_cap)
-
-        split_transactions = []
-        remaining_gas = total_gas
-        for i in range(num_splits):
-            split_tx = tx.model_copy()
-            split_tx.gas_limit = HexNumber(min(gas_limit_cap, remaining_gas))
-            split_tx.nonce = HexNumber(tx.nonce + i)
-            split_transactions.append(split_tx)
-            remaining_gas -= gas_limit_cap
-
-        return split_transactions
-
-    def make_benchmark_state_test_fixture(
-        self,
-        t8n: TransitionTool,
-        fork: Fork,
-    ) -> StateFixture:
-        """Create a fixture from the benchmark state test definition with full verification."""
-        # We can't generate a state test fixture that names a transition fork,
-        # so we get the fork at the block number and timestamp of the state test
-        fork = fork.fork_at(self.env.number, self.env.timestamp)
-
-        env = self.env.set_fork_requirements(fork)
-        tx = self.tx.with_signature_and_sender(keep_secret_key=True)
-        pre_alloc = Alloc.merge(
-            Alloc.model_validate(fork.pre_allocation()),
-            self.pre,
-        )
-
-        # Verification 1: Check for empty accounts
-        if empty_accounts := pre_alloc.empty_accounts():
-            raise Exception(f"Empty accounts in pre state: {empty_accounts}")
-
-        transition_tool_output = t8n.evaluate(
-            transition_tool_data=TransitionTool.TransitionToolData(
-                alloc=pre_alloc,
-                txs=[tx],
-                env=env,
-                fork=fork,
-                chain_id=self.chain_id,
-                reward=0,  # Reward on state tests is always zero
-                blob_schedule=fork.blob_schedule(),
-                state_test=True,
-            ),
-            debug_output_path=self.get_next_transition_tool_output_path(),
-            slow_request=self.is_tx_gas_heavy_test(),
-        )
-
-        # Verification 2: Post-allocation verification
-        try:
-            self.post.verify_post_alloc(transition_tool_output.alloc)
-        except Exception as e:
-            print_traces(t8n.get_traces())
-            raise e
-
-        # Verification 3: Transaction verification
-        try:
-            verify_transactions(
-                txs=[tx],
-                result=transition_tool_output.result,
-                transition_tool_exceptions_reliable=t8n.exception_mapper.reliable,
-            )
-        except Exception as e:
-            print_traces(t8n.get_traces())
-            pprint(transition_tool_output.result)
-            pprint(transition_tool_output.alloc)
-            raise e
-
-        # Verification 4: Benchmark gas validation
-        if self._operation_mode == OpMode.BENCHMARKING:
-            expected_benchmark_gas_used = self.gas_benchmark_value
-            gas_used = int(transition_tool_output.result.gas_used)
-            assert expected_benchmark_gas_used is not None, "gas_benchmark_value is not set"
-            assert gas_used == expected_benchmark_gas_used, (
-                f"gas_used ({gas_used}) does not match gas_benchmark_value "
-                f"({expected_benchmark_gas_used})"
-                f", difference: {gas_used - expected_benchmark_gas_used}"
-            )
-
-        return StateFixture(
-            env=FixtureEnvironment(**env.model_dump(exclude_none=True)),
-            pre=pre_alloc,
-            post={
-                fork: [
-                    FixtureForkPost(
-                        state_root=transition_tool_output.result.state_root,
-                        logs_hash=transition_tool_output.result.logs_hash,
-                        tx_bytes=tx.rlp(),
-                        expect_exception=tx.error,
-                        state=transition_tool_output.alloc,
-                    )
-                ]
-            },
-            transaction=FixtureTransaction.from_transaction(tx),
-            config=FixtureConfig(
-                blob_schedule=FixtureBlobSchedule.from_blob_schedule(fork.blob_schedule()),
-                chain_id=self.chain_id,
-            ),
-        )
-
-    def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
-        """Create a BlockchainTest from this BenchmarkStateTestWithVerification."""
-        gas_limit_cap = fork.transaction_gas_limit_cap()
-
-        transactions = self.split_transaction(self.tx, gas_limit_cap)
-
-        blocks = [Block(txs=transactions)]
-
-        return BlockchainTest.from_test(
-            base_test=self,
-            pre=self.pre,
-            post=self.post,
-            blocks=blocks,
-            genesis_environment=self.env,
-        )
-
-    def generate(
-        self,
-        t8n: TransitionTool,
-        fork: Fork,
-        fixture_format: FixtureFormat,
-    ) -> BaseFixture:
-        """Generate the test fixture."""
-        self.check_exception_test(exception=self.tx.error is not None)
-        if fixture_format in BlockchainTest.supported_fixture_formats:
-            return self.generate_blockchain_test(fork=fork).generate(
-                t8n=t8n, fork=fork, fixture_format=fixture_format
-            )
-        elif fixture_format == StateFixture:
-            return self.make_benchmark_state_test_fixture(t8n, fork)
-
-        raise Exception(f"Unknown fixture format: {fixture_format}")
-
-    def execute(
-        self,
-        *,
-        fork: Fork,
-        execute_format: ExecuteFormat,
-    ) -> BaseExecute:
-        """Execute the benchmark state test by sending it to the live network."""
-        if execute_format == TransactionPost:
-            return TransactionPost(
-                blocks=[[self.tx]],
-                post=self.post,
-            )
-        raise Exception(f"Unsupported execute format: {execute_format}")
-
-
-BenchmarkStateTestFiller = Type[BenchmarkStateTest]
-BenchmarkStateTestSpec = Callable[[str], Generator[BenchmarkStateTest, None, None]]
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index 04b1770ac61..3f99b1b9772 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -30,8 +30,6 @@
 from ethereum_test_fixtures import BaseFixture, FixtureCollector
 from ethereum_test_specs import (
     BaseTest,
-    BenchmarkStateTest,
-    BenchmarkStateTestFiller,
     BenchmarkTest,
     BenchmarkTestFiller,
     BlobsTest,
@@ -129,8 +127,6 @@
     "BenchmarkCodeGenerator",
     "BenchmarkTest",
     "BenchmarkTestFiller",
-    "BenchmarkStateTest",
-    "BenchmarkStateTestFiller",
     "Blob",
     "BlockAccessList",
     "BlobsTest",

From 2e34a6a2313ae42f682b6f94cea77adb444532a9 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Fri, 12 Sep 2025 15:08:28 +0800
Subject: [PATCH 09/19] fix: pydantic model validation for benchmark manager

---
 src/ethereum_test_specs/benchmark.py           | 11 ++++++++++-
 tests/benchmark/test_worst_stateful_opcodes.py |  1 -
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 57c241f7478..67b1152f646 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -150,8 +150,17 @@ def __get_pydantic_core_schema__(
         cls, source_type: Any, handler: GetCoreSchemaHandler
     ) -> PlainValidatorFunctionSchema:
         """Provide Pydantic core schema for BenchmarkManager serialization and validation."""
+
+        def validate_benchmark_manager(value):
+            if isinstance(value, cls):
+                return value
+            if value is None:
+                return None
+            # If value is passed as arguments, create new instance with no args
+            return cls()
+
         return no_info_plain_validator_function(
-            cls,
+            validate_benchmark_manager,
             serialization=to_string_ser_schema(),
         )
 
diff --git a/tests/benchmark/test_worst_stateful_opcodes.py b/tests/benchmark/test_worst_stateful_opcodes.py
index 5f92c7e6a2c..ae7ac31ba22 100644
--- a/tests/benchmark/test_worst_stateful_opcodes.py
+++ b/tests/benchmark/test_worst_stateful_opcodes.py
@@ -482,7 +482,6 @@ def test_worst_blockhash(
         pre=pre,
         post={},
         benchmark_manager=benchmark_manager,
-        gas_benchmark_value=gas_benchmark_value,
     )
 
 

From 6470b46675214aad477b78487c2ecfdcef63bfdb Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Wed, 17 Sep 2025 12:16:03 +0800
Subject: [PATCH 10/19] refactor syntax and parameter

---
 src/ethereum_test_specs/benchmark.py | 6 +++---
 tests/benchmark/test_worst_blocks.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 67b1152f646..667dda9184d 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -51,12 +51,12 @@ class BenchmarkCodeGenerator(ABC):
     @abstractmethod
     def deploy_contracts(self, pre: Alloc, fork: Fork) -> None:
         """Deploy any contracts needed for the benchmark."""
-        pass
+        ...
 
     @abstractmethod
     def generate_transaction(self, pre: Alloc, gas_limit: int, fork: Fork) -> Transaction:
         """Generate a transaction with the specified gas limit."""
-        pass
+        ...
 
     def generate_repeated_code(
         self, repeated_code: Bytecode, setup: Bytecode, fork: Fork
@@ -251,7 +251,7 @@ def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[
     def generate_blocks_from_code_generator(self, fork: Fork) -> List[Block]:
         """Generate blocks using the code generator."""
         if self.code_generator is None:
-            return []
+            raise Exception("Code generator is not set")
 
         self.code_generator.deploy_contracts(self.pre, fork)
         gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index e8d557c1b8e..4f1039a04ca 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -163,7 +163,6 @@ def test_block_full_of_ether_transfers(
         pre=pre,
         post=post_state,
         blocks=[Block(txs=txs)],
-        gas_benchmark_value=gas_benchmark_value,
         expected_benchmark_gas_used=iteration_count * intrinsic_cost,
     )
 

From 56e3b280ac795352edbd8319fbba85c257809e5e Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Wed, 17 Sep 2025 15:11:07 +0800
Subject: [PATCH 11/19] refactor: remove benchmark manager feature

---
 src/ethereum_test_specs/benchmark.py          | 131 +-----------------
 tests/benchmark/conftest.py                   |   7 -
 tests/benchmark/test_worst_blocks.py          |  33 ++---
 .../benchmark/test_worst_stateful_opcodes.py  |  27 ++--
 4 files changed, 28 insertions(+), 170 deletions(-)

diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 667dda9184d..5f875ccc45d 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -1,19 +1,11 @@
 """Ethereum benchmark test spec definition and filler."""
 
 from abc import ABC, abstractmethod
-from contextlib import contextmanager
-from contextvars import ContextVar
 from dataclasses import dataclass, field
-from enum import Enum
-from typing import Any, Callable, ClassVar, Dict, Generator, List, Optional, Sequence, Type
+from typing import Callable, ClassVar, Dict, Generator, List, Sequence, Type
 
 import pytest
-from pydantic import ConfigDict, Field, GetCoreSchemaHandler
-from pydantic_core.core_schema import (
-    PlainValidatorFunctionSchema,
-    no_info_plain_validator_function,
-    to_string_ser_schema,
-)
+from pydantic import ConfigDict, Field
 
 from ethereum_clis import TransitionTool
 from ethereum_test_base_types import HexNumber
@@ -83,88 +75,6 @@ def _validate_code_size(self, code: Bytecode, fork: Fork) -> None:
             )
 
 
-class BenchmarkPhase(Enum):
-    """Phases of a benchmark test."""
-
-    SETUP = "setup"
-    EXECUTION = "execution"
-
-
-_current_phase: ContextVar[Optional[BenchmarkPhase]] = ContextVar("benchmark_phase", default=None)
-
-
-class BenchmarkManager:
-    """Context manager for managing benchmark test phases."""
-
-    def __init__(self):
-        """Initialize the BenchmarkManager with empty transaction and block lists."""
-        self.setup_transactions: List[Transaction] = []
-        self.setup_blocks: List[Block] = []
-        self.execution_transactions: List[Transaction] = []
-        self.execution_blocks: List[Block] = []
-
-    @contextmanager
-    def setup(self):
-        """Context manager for the setup phase of a benchmark test."""
-        token = _current_phase.set(BenchmarkPhase.SETUP)
-        try:
-            yield self
-        finally:
-            _current_phase.reset(token)
-
-    @contextmanager
-    def execution(self):
-        """Context manager for the execution phase of a benchmark test."""
-        token = _current_phase.set(BenchmarkPhase.EXECUTION)
-        try:
-            yield self
-        finally:
-            _current_phase.reset(token)
-
-    def add_transaction(self, tx: Transaction):
-        """Add a transaction to the current phase."""
-        current_phase = _current_phase.get()
-        if current_phase == BenchmarkPhase.SETUP:
-            self.setup_transactions.append(tx)
-        elif current_phase == BenchmarkPhase.EXECUTION:
-            self.execution_transactions.append(tx)
-        else:
-            self.setup_transactions.append(tx)
-
-    def add_block(self, block: Block):
-        """Add a block to the current phase."""
-        current_phase = _current_phase.get()
-        if current_phase == BenchmarkPhase.SETUP:
-            self.setup_blocks.append(block)
-        elif current_phase == BenchmarkPhase.EXECUTION:
-            self.execution_blocks.append(block)
-        else:
-            self.setup_blocks.append(block)
-
-    def get_current_phase(self) -> Optional[BenchmarkPhase]:
-        """Get the current benchmark phase."""
-        return _current_phase.get()
-
-    @classmethod
-    def __get_pydantic_core_schema__(
-        cls, source_type: Any, handler: GetCoreSchemaHandler
-    ) -> PlainValidatorFunctionSchema:
-        """Provide Pydantic core schema for BenchmarkManager serialization and validation."""
-
-        def validate_benchmark_manager(value):
-            if isinstance(value, cls):
-                return value
-            if value is None:
-                return None
-            # If value is passed as arguments, create new instance with no args
-            return cls()
-
-        return no_info_plain_validator_function(
-            validate_benchmark_manager,
-            serialization=to_string_ser_schema(),
-        )
-
-
 class BenchmarkTest(BaseTest):
     """Test type designed specifically for benchmark test cases."""
 
@@ -180,7 +90,6 @@ class BenchmarkTest(BaseTest):
     env: Environment = Field(default_factory=Environment)
     expected_benchmark_gas_used: int | None = None
     gas_benchmark_value: int = Field(default_factory=lambda: int(Environment().gas_limit))
-    benchmark_manager: BenchmarkManager | None = None
     code_generator: BenchmarkCodeGenerator | None = None
 
     supported_fixture_formats: ClassVar[Sequence[FixtureFormat | LabeledFixtureFormat]] = [
@@ -274,33 +183,6 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
                 blocks=generated_blocks,
             )
 
-        elif self.benchmark_manager is not None:
-            all_blocks = []
-            gas_limit = fork.transaction_gas_limit_cap() or self.gas_benchmark_value
-
-            if self.benchmark_manager.setup_blocks:
-                all_blocks.extend(self.benchmark_manager.setup_blocks)
-            elif self.benchmark_manager.setup_transactions:
-                setup_txs = []
-                for tx in self.benchmark_manager.setup_transactions:
-                    setup_txs.extend(self.split_transaction(tx, gas_limit))
-                all_blocks.append(Block(txs=setup_txs))
-
-            if self.benchmark_manager.execution_blocks:
-                all_blocks.extend(self.benchmark_manager.execution_blocks)
-            elif self.benchmark_manager.execution_transactions:
-                execution_txs = []
-                for tx in self.benchmark_manager.execution_transactions:
-                    execution_txs.extend(self.split_transaction(tx, gas_limit))
-                all_blocks.append(Block(txs=execution_txs))
-
-            return BlockchainTest.from_test(
-                base_test=self,
-                genesis_environment=self.env,
-                pre=self.pre,
-                post=self.post,
-                blocks=all_blocks,
-            )
         elif self.blocks is not None:
             return BlockchainTest.from_test(
                 base_test=self,
@@ -324,9 +206,7 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
                 genesis_environment=self.env,
             )
         else:
-            raise ValueError(
-                "Cannot create BlockchainTest without transactions, blocks, or benchmark_manager"
-            )
+            raise ValueError("Cannot create BlockchainTest without transactions or blocks")
 
     def generate(
         self,
@@ -358,10 +238,5 @@ def execute(
         raise Exception(f"Unsupported execute format: {execute_format}")
 
 
-def create_benchmark_manager() -> BenchmarkManager:
-    """Create a new BenchmarkManager instance for phase-aware benchmark testing."""
-    return BenchmarkManager()
-
-
 BenchmarkTestSpec = Callable[[str], Generator[BenchmarkTest, None, None]]
 BenchmarkTestFiller = Type[BenchmarkTest]
diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py
index 3f0a67ab556..1e2e7813817 100644
--- a/tests/benchmark/conftest.py
+++ b/tests/benchmark/conftest.py
@@ -5,7 +5,6 @@
 import pytest
 
 from ethereum_test_forks import Fork
-from ethereum_test_specs.benchmark import BenchmarkManager, create_benchmark_manager
 
 DEFAULT_BENCHMARK_FORK = "Prague"
 
@@ -68,9 +67,3 @@ def pytest_collection_modifyitems(config, items):
 def tx_gas_limit_cap(fork: Fork, gas_benchmark_value: int) -> int:
     """Return the transaction gas limit cap."""
     return fork.transaction_gas_limit_cap() or gas_benchmark_value
-
-
-@pytest.fixture
-def benchmark_manager() -> BenchmarkManager:
-    """Return a benchmark manager."""
-    return create_benchmark_manager()
diff --git a/tests/benchmark/test_worst_blocks.py b/tests/benchmark/test_worst_blocks.py
index 4f1039a04ca..b71f3035067 100644
--- a/tests/benchmark/test_worst_blocks.py
+++ b/tests/benchmark/test_worst_blocks.py
@@ -9,15 +9,14 @@
 
 import pytest
 
+from ethereum_test_base_types import Account
 from ethereum_test_forks import Fork
-from ethereum_test_specs.benchmark import BenchmarkManager
 from ethereum_test_tools import (
     AccessList,
-    Account,
     Address,
     Alloc,
-    BenchmarkTestFiller,
     Block,
+    BlockchainTestFiller,
     Environment,
     Hash,
     StateTestFiller,
@@ -112,8 +111,7 @@ def ether_transfer_case(
     ["a_to_a", "a_to_b", "diff_acc_to_b", "a_to_diff_acc", "diff_acc_to_diff_acc"],
 )
 def test_block_full_of_ether_transfers(
-    benchmark_test: BenchmarkTestFiller,
-    benchmark_manager: BenchmarkManager,
+    blockchain_test: BlockchainTestFiller,
     pre: Alloc,
     env: Environment,
     case_id: str,
@@ -138,18 +136,17 @@ def test_block_full_of_ether_transfers(
     # Create a single block with all transactions
     txs = []
     balances: dict[Address, int] = {}
-    with benchmark_manager.execution():
-        for _ in range(iteration_count):
-            receiver = next(receivers)
-            balances[receiver] = balances.get(receiver, 0) + transfer_amount
-            txs.append(
-                Transaction(
-                    to=receiver,
-                    value=transfer_amount,
-                    gas_limit=intrinsic_cost,
-                    sender=next(senders),
-                )
+    for _ in range(iteration_count):
+        receiver = next(receivers)
+        balances[receiver] = balances.get(receiver, 0) + transfer_amount
+        txs.append(
+            Transaction(
+                to=receiver,
+                value=transfer_amount,
+                gas_limit=intrinsic_cost,
+                sender=next(senders),
             )
+        )
 
     # Only include post state for non a_to_a cases
     post_state = (
@@ -158,8 +155,8 @@ def test_block_full_of_ether_transfers(
         else {receiver: Account(balance=balance) for receiver, balance in balances.items()}
     )
 
-    benchmark_test(
-        env=env,
+    blockchain_test(
+        genesis_environment=env,
         pre=pre,
         post=post_state,
         blocks=[Block(txs=txs)],
diff --git a/tests/benchmark/test_worst_stateful_opcodes.py b/tests/benchmark/test_worst_stateful_opcodes.py
index ae7ac31ba22..f68783e61c3 100644
--- a/tests/benchmark/test_worst_stateful_opcodes.py
+++ b/tests/benchmark/test_worst_stateful_opcodes.py
@@ -10,8 +10,6 @@
 import pytest
 
 from ethereum_test_forks import Fork
-from ethereum_test_specs import BenchmarkTestFiller
-from ethereum_test_specs.benchmark import BenchmarkManager
 from ethereum_test_tools import (
     Account,
     Address,
@@ -453,35 +451,30 @@ def test_worst_storage_access_warm(
 
 
 def test_worst_blockhash(
-    benchmark_test: BenchmarkTestFiller,
-    benchmark_manager: BenchmarkManager,
+    blockchain_test: BlockchainTestFiller,
     pre: Alloc,
     gas_benchmark_value: int,
 ):
     """Test running a block with as many blockhash accessing oldest allowed block as possible."""
     # Create 256 dummy blocks to fill the blockhash window.
-    with benchmark_manager.setup():
-        for _ in range(256):
-            benchmark_manager.add_block(Block())
+    blocks = [Block()] * 256
 
     # Always ask for the oldest allowed BLOCKHASH block.
     execution_code = Op.PUSH1(1) + While(
         body=Op.POP(Op.BLOCKHASH(Op.DUP1)),
     )
     execution_code_address = pre.deploy_contract(code=execution_code)
-    with benchmark_manager.execution():
-        benchmark_manager.add_transaction(
-            Transaction(
-                to=execution_code_address,
-                gas_limit=gas_benchmark_value,
-                sender=pre.fund_eoa(),
-            )
-        )
+    op_tx = Transaction(
+        to=execution_code_address,
+        gas_limit=gas_benchmark_value,
+        sender=pre.fund_eoa(),
+    )
+    blocks.append(Block(txs=[op_tx]))
 
-    benchmark_test(
+    blockchain_test(
         pre=pre,
         post={},
-        benchmark_manager=benchmark_manager,
+        blocks=blocks,
     )
 
 

From d88f680f12b862e5a1e6ae2c848a198506a3b3fe Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Wed, 17 Sep 2025 16:02:09 +0800
Subject: [PATCH 12/19] refactor: update logic and add benchmark tests

---
 .../benchmark_code_generator.py               |  10 +-
 src/ethereum_test_specs/benchmark.py          |   5 +-
 .../tests/test_benchmark.py                   | 105 ++++++++++++++++++
 3 files changed, 112 insertions(+), 8 deletions(-)
 create mode 100644 src/ethereum_test_specs/tests/test_benchmark.py

diff --git a/src/ethereum_test_benchmark/benchmark_code_generator.py b/src/ethereum_test_benchmark/benchmark_code_generator.py
index dce3ef7392e..bae71051e16 100644
--- a/src/ethereum_test_benchmark/benchmark_code_generator.py
+++ b/src/ethereum_test_benchmark/benchmark_code_generator.py
@@ -1,7 +1,5 @@
 """Benchmark code generator classes for creating optimized bytecode patterns."""
 
-from dataclasses import dataclass
-
 from ethereum_test_forks import Fork
 from ethereum_test_specs.benchmark import BenchmarkCodeGenerator
 from ethereum_test_types import Alloc, Transaction
@@ -9,7 +7,6 @@
 from ethereum_test_vm.opcode import Opcodes as Op
 
 
-@dataclass
 class JumpLoopGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that loops execution using JUMP operations."""
 
@@ -32,7 +29,6 @@ def generate_transaction(self, pre: Alloc, gas_limit: int, fork: Fork) -> Transa
         )
 
 
-@dataclass
 class ExtCallGenerator(BenchmarkCodeGenerator):
     """Generates bytecode that fills the contract to maximum allowed code size."""
 
@@ -43,11 +39,13 @@ def deploy_contracts(self, pre: Alloc, fork: Fork) -> None:
         # 1. The target contract that executes certain operation but not loop (e.g. PUSH)
         # 2. The loop contract that calls the target contract in a loop
 
-        max_stack_height = fork.max_stack_height()
+        max_iterations = min(
+            fork.max_stack_height(), fork.max_code_size() // len(self.attack_block)
+        )
 
         # Deploy target contract that contains the actual attack block
         self._target_contract_address = pre.deploy_contract(
-            code=self.attack_block * max_stack_height
+            code=self.attack_block * max_iterations
         )
 
         # Create caller contract that repeatedly calls the target contract
diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 5f875ccc45d..90beec4dd55 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -1,5 +1,6 @@
 """Ethereum benchmark test spec definition and filler."""
 
+import math
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import Callable, ClassVar, Dict, Generator, List, Sequence, Type
@@ -141,11 +142,11 @@ def split_transaction(self, tx: Transaction, gas_limit_cap: int | None) -> List[
             return [tx]
 
         if gas_limit_cap >= self.gas_benchmark_value:
-            tx.gas_limit = HexNumber(min(tx.gas_limit, self.gas_benchmark_value))
+            tx.gas_limit = HexNumber(self.gas_benchmark_value)
             return [tx]
 
+        num_splits = math.ceil(self.gas_benchmark_value / gas_limit_cap)
         remaining_gas = self.gas_benchmark_value
-        num_splits = remaining_gas // gas_limit_cap + int(remaining_gas % gas_limit_cap)
 
         split_transactions = []
         for i in range(num_splits):
diff --git a/src/ethereum_test_specs/tests/test_benchmark.py b/src/ethereum_test_specs/tests/test_benchmark.py
new file mode 100644
index 00000000000..bd4a699720b
--- /dev/null
+++ b/src/ethereum_test_specs/tests/test_benchmark.py
@@ -0,0 +1,105 @@
+"""Tests for the BenchmarkTest class and its transaction splitting functionality."""
+
+import pytest
+
+from ethereum_test_base_types import HexNumber
+from ethereum_test_specs.benchmark import BenchmarkTest
+from ethereum_test_types import Alloc, Environment, Transaction
+
+
+@pytest.mark.parametrize(
+    "gas_benchmark_value_millions,expected_splits",
+    [
+        (1, 1),  # ceil(1M / 16M) = 1 transaction (benchmark value below the cap)
+        (10, 1),  # ceil(10M / 16M) = 1 transaction (benchmark value below the cap)
+        (30, 2),  # ceil(30M / 16M) = 2 transactions (16M + 14M)
+        (45, 3),  # ceil(45M / 16M) = 3 transactions (16M + 16M + 13M)
+        (60, 4),  # ceil(60M / 16M) = 4 transactions (16M + 16M + 16M + 12M)
+        (100, 7),  # ceil(100M / 16M) = 7 transactions (6x16M + 4M)
+        (150, 10),  # ceil(150M / 16M) = 10 transactions (9x16M + 6M)
+    ],
+)
+def test_split_transaction(gas_benchmark_value_millions: int, expected_splits: int):
+    """Test that transaction splitting works correctly for Osaka fork gas cap."""
+    gas_benchmark_value = gas_benchmark_value_millions * 1_000_000
+    gas_limit_cap = 16_000_000  # Osaka's transaction gas limit cap
+
+    # Create a minimal BenchmarkTest instance (the tx is given no explicit gas_limit)
+    benchmark_test = BenchmarkTest(
+        pre=Alloc(),
+        post=Alloc(),
+        tx=Transaction(sender=HexNumber(0), to=HexNumber(0), nonce=0),
+        env=Environment(),
+        gas_benchmark_value=gas_benchmark_value,
+    )
+
+    # Split the single benchmark transaction against the 16M per-transaction cap
+    assert benchmark_test.tx is not None, "Transaction should not be None"
+    split_txs = benchmark_test.split_transaction(benchmark_test.tx, gas_limit_cap)
+
+    # Verify the number of transactions matches ceil(benchmark value / cap)
+    assert len(split_txs) == expected_splits, (
+        f"Expected {expected_splits} transactions for {gas_benchmark_value_millions}M gas, "
+        f"got {len(split_txs)}"
+    )
+
+    # Verify the gas limits add up to exactly the benchmark value (nothing lost or added)
+    total_gas = sum(tx.gas_limit for tx in split_txs)
+    assert total_gas == gas_benchmark_value, (
+        f"Total gas {total_gas} doesn't match benchmark value {gas_benchmark_value}"
+    )
+
+    # Verify no transaction exceeds the cap
+    for i, tx in enumerate(split_txs):
+        assert tx.gas_limit <= gas_limit_cap, (
+            f"Transaction {i} gas limit {tx.gas_limit} exceeds cap {gas_limit_cap}"
+        )
+
+    # Verify nonces are sequential, starting at the original transaction's nonce (0)
+    for i, tx in enumerate(split_txs):
+        assert tx.nonce == i, f"Transaction {i} has incorrect nonce {tx.nonce}"
+
+    # Verify gas distribution: every transaction before the last is filled up to the cap
+    for i, tx in enumerate(split_txs[:-1]):  # All but last should be at cap
+        assert tx.gas_limit == gas_limit_cap, (
+            f"Transaction {i} should have gas limit {gas_limit_cap}, got {tx.gas_limit}"
+        )
+
+    # Last transaction should have the remainder (only meaningful when a split occurred)
+    if expected_splits > 1:
+        expected_last_gas = gas_benchmark_value - (gas_limit_cap * (expected_splits - 1))
+        assert split_txs[-1].gas_limit == expected_last_gas, (
+            f"Last transaction should have {expected_last_gas} gas, got {split_txs[-1].gas_limit}"
+        )
+
+
+@pytest.mark.parametrize(
+    "gas_benchmark_value,gas_limit_cap",
+    [
+        (50_000_000, None),  # No cap - should return single transaction
+        (50_000_000, 100_000_000),  # Cap higher than benchmark value
+    ],
+)
+def test_split_transaction_edge_cases(gas_benchmark_value: int, gas_limit_cap: int | None):
+    """Test that no split happens when the cap is absent or not below the benchmark value."""
+    benchmark_test = BenchmarkTest(
+        pre=Alloc(),
+        post=Alloc(),
+        tx=Transaction(sender=HexNumber(0), to=HexNumber(0), nonce=0, gas_limit=1_000_000_000),
+        env=Environment(),
+        gas_benchmark_value=gas_benchmark_value,
+    )
+
+    assert benchmark_test.tx is not None, "Transaction should not be None"
+    split_txs = benchmark_test.split_transaction(benchmark_test.tx, gas_limit_cap)
+
+    # Should return single transaction in both cases
+    assert len(split_txs) == 1, f"Expected 1 transaction, got {len(split_txs)}"
+
+    # When the cap is not below the benchmark value, split_transaction assigns
+    # tx.gas_limit in place and returns [tx], so comparing against benchmark_test.tx
+    # would compare the result with itself. Pin the expected value directly: in both
+    # cases the oversized 1_000_000_000 limit must become exactly the benchmark value.
+    assert split_txs[0].gas_limit == gas_benchmark_value, (
+        f"Expected gas limit {gas_benchmark_value}, got {split_txs[0].gas_limit}"
+    )

From 80281e9cc3669206396b836e4b7835c541e7444d Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 18 Sep 2025 15:59:25 +0800
Subject: [PATCH 13/19] refactor: enforce single property requirement in
 blockchain test generation

---
 src/ethereum_test_specs/benchmark.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 90beec4dd55..6c3b915891d 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -174,6 +174,21 @@ def generate_blocks_from_code_generator(self, fork: Fork) -> List[Block]:
 
     def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
         """Create a BlockchainTest from this BenchmarkTest."""
+        set_props = [
+            name
+            for name, val in [
+                ("code_generator", self.code_generator),
+                ("blocks", self.blocks),
+                ("tx", self.tx),
+            ]
+            if val is not None
+        ]
+
+        if len(set_props) != 1:
+            raise ValueError(
+                f"Exactly one must be set, but got {len(set_props)}: {', '.join(set_props)}"
+            )
+
         if self.code_generator is not None:
             generated_blocks = self.generate_blocks_from_code_generator(fork)
             return BlockchainTest.from_test(
@@ -183,7 +198,6 @@ def generate_blockchain_test(self, fork: Fork) -> BlockchainTest:
                 post=self.post,
                 blocks=generated_blocks,
             )
-
         elif self.blocks is not None:
             return BlockchainTest.from_test(
                 base_test=self,

From 0a0c149ccf6be1ce76edd2ec32b7282e13bde9f7 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 18 Sep 2025 16:02:39 +0800
Subject: [PATCH 14/19] refactor: update Bytecode serialization schema to use
 format_ser_schema

---
 src/ethereum_test_vm/bytecode.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ethereum_test_vm/bytecode.py b/src/ethereum_test_vm/bytecode.py
index 12f07e528d3..d93188ee2ca 100644
--- a/src/ethereum_test_vm/bytecode.py
+++ b/src/ethereum_test_vm/bytecode.py
@@ -5,8 +5,8 @@
 from pydantic import GetCoreSchemaHandler
 from pydantic_core.core_schema import (
     PlainValidatorFunctionSchema,
+    format_ser_schema,
     no_info_plain_validator_function,
-    to_string_ser_schema,
 )
 
 from ethereum_test_base_types import Bytes, Hash
@@ -232,5 +232,5 @@ def __get_pydantic_core_schema__(
         """Provide Pydantic core schema for Bytecode serialization and validation."""
         return no_info_plain_validator_function(
             cls,
-            serialization=to_string_ser_schema(),
+            serialization=format_ser_schema("0x{}"),
         )

From f5ca3e5c036f17d65d07ee8449b85095f606fcb0 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Mon, 22 Sep 2025 11:39:26 +0800
Subject: [PATCH 15/19] refactor: update import paths

---
 src/ethereum_test_benchmark/benchmark_code_generator.py | 2 +-
 src/ethereum_test_specs/benchmark.py                    | 2 +-
 src/ethereum_test_tools/__init__.py                     | 5 -----
 tests/benchmark/test_worst_compute.py                   | 3 +--
 4 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/ethereum_test_benchmark/benchmark_code_generator.py b/src/ethereum_test_benchmark/benchmark_code_generator.py
index bae71051e16..9c2c9b7814a 100644
--- a/src/ethereum_test_benchmark/benchmark_code_generator.py
+++ b/src/ethereum_test_benchmark/benchmark_code_generator.py
@@ -4,7 +4,7 @@
 from ethereum_test_specs.benchmark import BenchmarkCodeGenerator
 from ethereum_test_types import Alloc, Transaction
 from ethereum_test_vm import Bytecode
-from ethereum_test_vm.opcode import Opcodes as Op
+from ethereum_test_vm.opcodes import Opcodes as Op
 
 
 class JumpLoopGenerator(BenchmarkCodeGenerator):
diff --git a/src/ethereum_test_specs/benchmark.py b/src/ethereum_test_specs/benchmark.py
index 6c3b915891d..440faa8b844 100644
--- a/src/ethereum_test_specs/benchmark.py
+++ b/src/ethereum_test_specs/benchmark.py
@@ -28,7 +28,7 @@
 from ethereum_test_forks import Fork
 from ethereum_test_types import Alloc, Environment, Transaction
 from ethereum_test_vm import Bytecode
-from ethereum_test_vm.opcode import Opcodes as Op
+from ethereum_test_vm.opcodes import Opcodes as Op
 
 from .base import BaseTest
 from .blockchain import Block, BlockchainTest
diff --git a/src/ethereum_test_tools/__init__.py b/src/ethereum_test_tools/__init__.py
index 3f99b1b9772..fc8057c0017 100644
--- a/src/ethereum_test_tools/__init__.py
+++ b/src/ethereum_test_tools/__init__.py
@@ -89,11 +89,6 @@
     call_return_code,
 )
 
-from .benchmark_code_generator import (
-    BenchmarkCodeGenerator,
-    ExtCallGenerator,
-    JumpLoopGenerator,
-)
 from .tools_code import (
     CalldataCase,
     Case,
diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index e4338de3b7d..4c1f74b5137 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -15,7 +15,7 @@
 from py_ecc.bn128 import G1, G2, multiply
 
 from ethereum_test_base_types.base_types import Bytes
-from ethereum_test_benchmark import JumpLoopGenerator
+from ethereum_test_benchmark.benchmark_code_generator import JumpLoopGenerator
 from ethereum_test_forks import Fork
 from ethereum_test_tools import (
     Address,
@@ -29,7 +29,6 @@
     Transaction,
     add_kzg_version,
 )
-from ethereum_test_tools.benchmark_code_generator import JumpLoopGenerator
 from ethereum_test_types import TransactionType
 from ethereum_test_vm import Opcode
 from ethereum_test_vm import Opcodes as Op

From c4e8fbd337319a00620dd5b8b04902c1aec4b8ba Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Mon, 22 Sep 2025 12:03:24 +0800
Subject: [PATCH 16/19] refactor: update serialization schema

---
 src/ethereum_test_vm/bytecode.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ethereum_test_vm/bytecode.py b/src/ethereum_test_vm/bytecode.py
index d93188ee2ca..5eea5b0cce3 100644
--- a/src/ethereum_test_vm/bytecode.py
+++ b/src/ethereum_test_vm/bytecode.py
@@ -5,8 +5,8 @@
 from pydantic import GetCoreSchemaHandler
 from pydantic_core.core_schema import (
     PlainValidatorFunctionSchema,
-    format_ser_schema,
     no_info_plain_validator_function,
+    plain_serializer_function_ser_schema,
 )
 
 from ethereum_test_base_types import Bytes, Hash
@@ -232,5 +232,8 @@ def __get_pydantic_core_schema__(
         """Provide Pydantic core schema for Bytecode serialization and validation."""
         return no_info_plain_validator_function(
             cls,
-            serialization=format_ser_schema("0x{}"),
+            serialization=plain_serializer_function_ser_schema(
+                lambda bytecode: "0x" + bytecode.hex(),
+                info_arg=False,
+            ),
         )

From 1df840bac7fcf6f9d42257caa2b5db3ba2f8bcd7 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Mon, 22 Sep 2025 12:10:14 +0800
Subject: [PATCH 17/19] refactor: remove unused parameters

---
 tests/benchmark/test_worst_compute.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/benchmark/test_worst_compute.py b/tests/benchmark/test_worst_compute.py
index 4c1f74b5137..d7437a77e4c 100644
--- a/tests/benchmark/test_worst_compute.py
+++ b/tests/benchmark/test_worst_compute.py
@@ -1846,12 +1846,9 @@ def test_worst_jumpis(
 def test_worst_jumpdests(
     benchmark_test: BenchmarkTestFiller,
     pre: Alloc,
-    env: Environment,
-    fork: Fork,
 ):
     """Test running a JUMPDEST-intensive contract."""
     benchmark_test(
-        env=env,
         pre=pre,
         post={},
         code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST),
@@ -2755,8 +2752,6 @@ def test_worst_calldataload(
 def test_worst_swap(
     benchmark_test: BenchmarkTestFiller,
     pre: Alloc,
-    env: Environment,
-    fork: Fork,
     opcode: Opcode,
 ):
     """Test running a block with as many SWAP as possible."""

From e2f462b03406ec7b6b7d9dbe227c84f3758aefe9 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 25 Sep 2025 01:01:53 +0800
Subject: [PATCH 18/19] doc: add changelog entry

---
 docs/CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index c2bc908362e..c075e66cb9d 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -10,6 +10,8 @@ Test fixtures for use by clients are available for each release on the [Github r
 
 ### 🛠️ Framework
 
+- ✨ Add benchmark specify test wrapper (`benchmark_test`) that supports **EIP-7825** and create a benchmark code generator for common test pattern ([#1945](https://github.com/ethereum/execution-spec-tests/pull/1945)).
+
 #### `fill`
 
 - Move pytest marker registration for `fill` and `execute-*` from their respective ini files to the shared `pytest_plugins.shared.execute_fill` pytest plugin ([#2110](https://github.com/ethereum/execution-spec-tests/pull/2110)).

From 0e597d5a0e032f889a767d97019fcce767196ac4 Mon Sep 17 00:00:00 2001
From: LouisTsai <q1030176@gmail.com>
Date: Thu, 25 Sep 2025 01:17:15 +0800
Subject: [PATCH 19/19] fix typo

---
 docs/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index c075e66cb9d..f24e5611029 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -10,7 +10,7 @@ Test fixtures for use by clients are available for each release on the [Github r
 
 ### 🛠️ Framework
 
-- ✨ Add benchmark specify test wrapper (`benchmark_test`) that supports **EIP-7825** and create a benchmark code generator for common test pattern ([#1945](https://github.com/ethereum/execution-spec-tests/pull/1945)).
+- ✨ Add benchmark-specific test wrapper (`benchmark_test`) that supports **EIP-7825** and create a benchmark code generator for common test pattern ([#1945](https://github.com/ethereum/execution-spec-tests/pull/1945)).
 
 #### `fill`