Support for unsharded parameters in state_dict APIs #2023

Merged · 4 commits · Nov 19, 2024
Changes from 2 commits
18 changes: 15 additions & 3 deletions torchtune/training/_distributed.py
@@ -207,6 +207,9 @@ def load_from_full_model_state_dict(
                 requires_grad=sharded_meta_param.requires_grad,
             )

+        elif not hasattr(sharded_meta_param, "device_mesh"):
+            # In cases where parts of the model aren't sharded, some parameters will be plain tensors
+            sharded_tensor = full_tensor
         else:
             sharded_tensor = distribute_tensor(
                 full_tensor,
@@ -242,7 +245,9 @@ def gather_cpu_state_dict(
         if sharded_param.is_cpu:
             # Move back to device if offloaded to CPU
             sharded_param = sharded_param.to(device)
-        if isinstance(sharded_param._local_tensor, NF4Tensor):
+        if hasattr(sharded_param, "_local_tensor") and isinstance(
Contributor

If the tensor isn't sharded but is still an NF4Tensor, we still need to upcast the dtype, as is done on line 271.

Contributor Author

So maybe add another elif for the case where the tensor is unsharded but still an NF4Tensor, and just apply the upcast from line 271?
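For reference, a minimal sketch of that branch (names follow the surrounding diff; the updated hunk below ends up adding essentially this):

elif isinstance(sharded_param, NF4Tensor):
    # Unsharded NF4 parameter: upcast back to the original dtype directly,
    # mirroring the upcast applied after the manual all_gather path above
    full_param = sharded_param.to(sharded_param.dtype)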

Contributor

I think this would be much cleaner if at the top of the loop we do:

if hasattr(...):
    # get full tensor (NF4 or .full_tensor)
if full_param is NF4:
    # upcast
if is_rank_zero:
    # the rest stays the same

This code still seems to assume that the unsharded param is on rank 0, which isn't guaranteed.
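One possible expansion of that structure (a hypothetical sketch only: _all_gather_nf4 and sharded_sd are stand-in names, not part of the diff; the other names follow the surrounding code):

for param_name, sharded_param in sharded_sd.items():
    if hasattr(sharded_param, "_local_tensor") and isinstance(
        sharded_param._local_tensor, NF4Tensor
    ):
        # Sharded NF4: DTensor can't all_gather NF4, so gather manually.
        # Placeholder for the existing manual all_gather branch, assumed here
        # to return the reconstructed full NF4Tensor.
        full_param = _all_gather_nf4(sharded_param)
    elif hasattr(sharded_param, "full_tensor"):
        # Regular DTensor: let DTensor perform the gather
        full_param = sharded_param.full_tensor()
    else:
        # Unsharded parameter: already a plain (possibly NF4) tensor
        full_param = sharded_param

    if isinstance(full_param, NF4Tensor):
        # Single upcast path for both the sharded and unsharded NF4 cases
        full_param = full_param.to(full_param.dtype)

    if is_rank_zero:
        cpu_state_dict[param_name] = full_param.cpu()

This keeps one NF4 upcast path and one rank-zero branch, regardless of whether the parameter was sharded.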

+            sharded_param._local_tensor, NF4Tensor
+        ):
             # NF4Tensor does not support all_gather from DTensor
             # so we need to manually all_gather
             mesh = sharded_param.device_mesh
@@ -264,9 +269,16 @@
             )
             # upcasting NF4 to original dtype
             full_param = full_param.to(full_param.dtype)
+        elif isinstance(sharded_param, NF4Tensor):
+            # upcasting NF4 to original dtype
+            full_param = sharded_param.to(sharded_param.dtype)
         else:
-            # Gather DTensor
-            full_param = sharded_param.full_tensor()
+            if hasattr(sharded_param, "full_tensor"):
+                # Gather DTensor
+                full_param = sharded_param.full_tensor()
+            else:
+                # In cases where parts of the model aren't sharded, some parameters will be plain tensors
+                full_param = sharded_param
         if is_rank_zero:
             cpu_state_dict[param_name] = full_param.cpu()
         else: