Improvements and fixes to gradient accumulation #993

Open · wants to merge 2 commits into main
39 changes: 24 additions & 15 deletions axlearn/common/gradient_accumulation.py
@@ -13,7 +13,7 @@
from axlearn.common.config import ConfigOr, maybe_instantiate
from axlearn.common.metrics import MetricAccumulator
from axlearn.common.update_transformation import ForwardFn, ForwardOutputs
-from axlearn.common.utils import Nested, Tensor, input_partition_spec, with_sharding_constraint
+from axlearn.common.utils import Nested, Tensor


def _compute_minibatch_size(input_batch: Nested[Tensor], *, steps: int) -> int:
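The body of `_compute_minibatch_size` is collapsed in this view. As a rough sketch only (not the actual axlearn implementation), a helper with this signature typically derives the per-step minibatch size from the leading batch axis and validates divisibility:

```python
import jax


def compute_minibatch_size_sketch(input_batch, *, steps: int) -> int:
    """Illustrative sketch: per-step minibatch size from the leading (batch) axis."""
    leaves = jax.tree_util.tree_leaves(input_batch)
    if not leaves:
        raise ValueError("input_batch must contain at least one array.")
    batch_size = leaves[0].shape[0]
    if batch_size % steps != 0:
        raise ValueError(f"Global batch size {batch_size} must be divisible by steps={steps}.")
    return batch_size // steps
```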
@@ -64,9 +64,6 @@ def _make_scan_minibatch_inputs(
within a scan function body and is meant to slice the inputs
into `minibatch_size` sized slices to run the ForwardFn on.

-Note that this only preserves the input sharding if the `input_partition_spec`
-returns the correct partition spec to shard the input slices with.

Args:
inputs: Same pytree as ForwardFn inputs.
forward_key: The `forward_key` from the ForwardFn inputs
@@ -78,18 +75,16 @@
A tuple of minibatch inputs of the same structure as `inputs`
and new (carry) forward_key and param_noise_key.
"""
-minibatch_input = with_sharding_constraint(
-    jax.tree.map(
-        lambda x: jax.lax.dynamic_slice_in_dim(
-            x,
-            start_index=minibatch_index * minibatch_size,
-            slice_size=minibatch_size,
-            axis=0,
-        ),
-        inputs["input_batch"],
-    ),
+minibatch_input = jax.tree.map(
+    lambda x: jax.lax.dynamic_slice_in_dim(
+        x,
+        start_index=minibatch_index * minibatch_size,
+        slice_size=minibatch_size,
+        axis=0,
+    ),
-    input_partition_spec(),
Contributor: To me, this seems more like a hack than a proper solution; that is, if we want a different `input_partition_spec()` than the default one, then we need this?
Contributor Author: Sorry, I missed the default case; added it.

I think the partition spec below is a good default, but the ability to change the PartitionSpec might be good to have. What do you think?

    (None, 1): PartitionSpec(("data", "expert", "fsdp")),
    (None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"),

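For what it's worth, one way to keep this configurable is to thread an optional spec through the slicing helper. This is only a sketch, not part of this PR or of axlearn: the names `DEFAULT_SPECS` and `constrain_minibatch` are hypothetical, and it assumes a mesh whose axes include "data", "expert", "fsdp", and "seq".

```python
from typing import Optional

import jax
from jax.sharding import PartitionSpec

# Hypothetical defaults mirroring the specs quoted above: shard the batch axis
# across ("data", "expert", "fsdp"); for rank-2 inputs, also shard axis 1 across "seq".
DEFAULT_SPECS = {
    1: PartitionSpec(("data", "expert", "fsdp")),
    2: PartitionSpec(("data", "expert", "fsdp"), "seq"),
}


def constrain_minibatch(minibatch, specs: Optional[dict] = None):
    """Constrain each sliced leaf with a rank-dependent partition spec; replicate otherwise."""
    specs = DEFAULT_SPECS if specs is None else specs
    return jax.tree.map(
        lambda x: jax.lax.with_sharding_constraint(x, specs.get(x.ndim, PartitionSpec())),
        minibatch,
    )
```

This would have to run inside `jit` with the relevant mesh in scope for the constraint to take effect.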
+    inputs["input_batch"],
Contributor (@apghml, Feb 27, 2025): Suppose we have a global input batch of size 100 running on 10 chips (so a per-chip size of 10), and we want to switch to 10 grad accumulation steps, each with a global batch size of 10 (1 per chip per accumulation step).

Suppose the input is originally sharded evenly across the chips (first 10 on the first chip, second 10 on the second chip, etc.). When we take the first slice of 10 for the first grad accumulation step, won't all of those examples be on the same chip? Will that cause a problem? (E.g., might XLA not automatically reshard the examples across chips?)

Maybe we should reshard the batch axis only?

Contributor: +1 on the potential design problem here. Can you double-check and confirm that axis=0 is the batch dimension?

)
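A minimal sketch of the "reshard the batch axis only" idea from the thread above, reusing the `inputs`, `minibatch_index`, and `minibatch_size` names from `_make_scan_minibatch_inputs` and assuming mesh axes named "data", "expert", and "fsdp" with axis 0 as the batch dimension (this is not what the PR currently does):

```python
import jax
from jax.sharding import PartitionSpec

# Constrain only axis 0 of every sliced leaf so each minibatch is re-spread
# across the batch-sharding mesh axes instead of potentially sitting on one chip.
batch_only_spec = PartitionSpec(("data", "expert", "fsdp"))

minibatch_input = jax.tree.map(
    lambda x: jax.lax.with_sharding_constraint(
        jax.lax.dynamic_slice_in_dim(
            x,
            start_index=minibatch_index * minibatch_size,
            slice_size=minibatch_size,
            axis=0,
        ),
        batch_only_spec,
    ),
    inputs["input_batch"],
)
```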

next_forward_key, forward_key = jax.random.split(forward_key)
next_param_noise_key, param_noise_key = jax.random.split(param_noise_key)

@@ -172,12 +167,26 @@ def fwd_helper(
otherwise None.
"""
minibatch_size = _compute_minibatch_size(inputs["input_batch"], steps=steps)

# Create a sample minibatch for the carry buffer creation below
Contributor: Could you explain in more detail why this is needed?

Contributor: +1
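For context on the question above (my reading, not an authoritative answer): `jax.eval_shape` is used below to size the `lax.scan` carry, and each scan step produces minibatch-shaped outputs, so tracing the shape with the full-batch `inputs` would make any batch-dependent output the wrong size. A toy illustration of the mismatch, with made-up shapes:

```python
import jax
import jax.numpy as jnp


def forward(params, batch):
    # Toy forward fn whose auxiliary output keeps the batch dimension.
    logits = batch["x"] @ params
    return logits.mean(), logits


params = jnp.ones((4, 2))
full_batch = {"x": jnp.ones((8, 4))}   # global batch of 8
minibatch = {"x": jnp.ones((2, 4))}    # 4 accumulation steps -> minibatch of 2

_, full_aux = jax.eval_shape(forward, params, full_batch)
_, mini_aux = jax.eval_shape(forward, params, minibatch)
print(full_aux.shape)  # (8, 2) -- too large for a per-step scan carry
print(mini_aux.shape)  # (2, 2) -- matches what each scan iteration produces
```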

(
sample_minibatch_inputs,
_,
_,
) = _make_scan_minibatch_inputs(
inputs,
forward_key=inputs["forward_key"],
param_noise_key=inputs["param_noise_key"],
minibatch_size=minibatch_size,
minibatch_index=0,
)

# Carry initialization for the lax.scan procedure. Since we are passing a
# `MetricAccumulator` into carry, and carry input/output shapes must match,
# we need to initialize the `MetricAccumulator` summary with the right PyTree
# structure.
_, primal_output_shape = jax.eval_shape(
-    original_func_positional_args, model_params, inputs
+    original_func_positional_args, model_params, sample_minibatch_inputs
)
init_primal_out = jax.tree.map(jnp.zeros_like, primal_output_shape)
init_accumulator = maybe_instantiate(metric_accumulator)
1 change: 1 addition & 0 deletions axlearn/common/gradient_accumulation_test.py
@@ -1,5 +1,6 @@
# Copyright © 2024 Apple Inc.
"""Test module for gradient_accumulation.py"""

import chex
import jax
import jax.numpy as jnp
3 changes: 3 additions & 0 deletions axlearn/experiments/text/gpt/fuji.py
@@ -352,6 +352,9 @@ def get_trainer_kwargs(
),
*trn2_config.module_modifications,
*trn2_config.partition_spec_modifications,
GradientAccumulationModifier.default_config().set(
grad_acc_steps=4,
),
],
),
),
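For readers unfamiliar with the modifier: `grad_acc_steps=4` splits each global batch into 4 sequential minibatches and averages their gradients before the optimizer update. The axlearn implementation uses the `lax.scan`-based machinery changed above; the loop below is only a conceptual sketch with made-up shapes:

```python
import jax
import jax.numpy as jnp


def accumulated_grads(loss_fn, params, minibatches):
    """Average gradients over sequential minibatches (what grad_acc_steps does conceptually)."""
    grads = None
    for mb in minibatches:
        g = jax.grad(loss_fn)(params, mb)
        grads = g if grads is None else jax.tree.map(jnp.add, grads, g)
    return jax.tree.map(lambda x: x / len(minibatches), grads)


# Example: a global batch of 8 split into grad_acc_steps=4 minibatches of 2.
loss_fn = lambda w, x: jnp.mean((x @ w) ** 2)
w = jnp.ones((4, 2))
minibatches = jnp.split(jnp.ones((8, 4)), 4)
print(jax.tree.map(lambda g: g.shape, accumulated_grads(loss_fn, w, minibatches)))  # (4, 2)
```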