Commit 45583a5

Andrew Gusvekars authored and committed
[FSDP2] Move to public torch.distributed.fsdp (pytorch#141868)
**Overview**

This PR moves `torch/distributed/_composable/fsdp` to
`torch/distributed/fsdp/_fully_shard` and makes the public APIs available from
`torch.distributed.fsdp`, e.g.:

```
from torch.distributed.fsdp import fully_shard
```

This is targeting the 2.6 release. I rewrote some of the documentation with
(hopefully) improved phrasing.

**Follow-Ups**

- [x] Add some explanation in the docs about FSDP1 vs. FSDP2
- [ ] Move unit tests from `test/distributed/_composable/fsdp` to
  `test/distributed/fsdp/fully_shard/`

Pull Request resolved: pytorch#141868
Approved by: https://github.com/kwen2501, https://github.com/wconstab, https://github.com/weifengpy
Co-authored-by: Svetlana Karslioglu <[email protected]>
1 parent f9af86d commit 45583a5

44 files changed (+363, -174 lines)
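As described in the commit message, the FSDP2 entry points are now importable directly from `torch.distributed.fsdp`. The following is a minimal sketch of that public import path, adapted from the `TestFullyShardOldImport` test added in this commit; it is illustrative rather than part of the diff, and assumes an already-initialized distributed process group and an available CUDA device.

```python
# Sketch only: mirrors the usage in TestFullyShardOldImport, but via the new
# public import path. Assumes torch.distributed is already initialized and a
# CUDA device is available.
import torch
import torch.nn as nn
from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16))
mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16)
for layer in model:
    fully_shard(layer, mp_policy=mp_policy)  # shard each submodule first
fully_shard(model, mp_policy=mp_policy)      # then shard the root module

inp = torch.randn((8, 16), device="cuda")
model(inp).sum().backward()  # parameters are all-gathered per layer as needed
```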
docs/source/distributed.fsdp.fully_shard.rst

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+torch.distributed.fsdp.fully_shard
+==================================
+
+PyTorch FSDP2 (``fully_shard``)
+-------------------------------
+
+PyTorch FSDP2 provides a fully sharded data parallelism (FSDP) implementation
+targeting performant eager-mode while using per-parameter sharding for improved
+usability.
+
+- If you are new to FSDP, we recommend that you start with FSDP2 due to improved
+  usability.
+- If you are currently using FSDP1, consider evaluating the following
+  differences to see if you should switch to FSDP2:
+
+Compared to PyTorch FSDP1 (``FullyShardedDataParallel``):
+
+- FSDP2 uses ``DTensor``-based dim-0 per-parameter sharding for a simpler
+  sharding representation compared to FSDP1's flat-parameter sharding, while
+  preserving similar throughput performance. More specifically, FSDP2 chunks
+  each parameter on dim-0 across the data parallel workers (using
+  ``torch.chunk(dim=0)``), whereas FSDP1 flattens, concatenates, and chunks a
+  group of tensors together, making reasoning about what data is present on
+  each worker and resharding to different parallelisms complex. Per-parameter
+  sharding provides a more intuitive user experience, relaxes constraints
+  around frozen parameters, and allows for communication-free (sharded) state
+  dicts, which otherwise require all-gathers in FSDP1.
+- FSDP2 implements a different memory management approach to handle the
+  multi-stream usages that avoids ``torch.Tensor.record_stream``. This ensures
+  deterministic and expected memory usage and does not require blocking the CPU
+  like in FSDP1's ``limit_all_gathers=True``.
+- FSDP2 exposes APIs for manual control over prefetching and collective
+  scheduling, allowing power users more customization. See the methods on
+  ``FSDPModule`` below for details.
+- FSDP2 simplifies some of the API surface: e.g. FSDP2 does not directly
+  support full state dicts. Instead, users can reshard the sharded state dicts
+  containing ``DTensor`` s to full state dicts themselves using ``DTensor``
+  APIs like ``DTensor.full_tensor()`` or by using higher-level APIs like
+  `PyTorch Distributed Checkpoint <https://pytorch.org/docs/stable/distributed.checkpoint.html>`_ 's
+  distributed state dict APIs. Also, some other args have been removed; see
+  `here <https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md>`_ for
+  details.
+
+If you are onboarding FSDP for the first time or if any of the above appeals to
+your use case, we recommend that you consider using FSDP2.
+
+See `this RFC <https://github.com/pytorch/pytorch/issues/114299>`_ for details
+on system design and implementation.
+
+.. note::
+   ``torch.distributed.fsdp.fully_shard`` is currently in prototype state and
+   under development. The core API will likely not change, but we may make some
+   API changes if necessary.
+
+.. currentmodule:: torch.distributed.fsdp
+
+The frontend API is ``fully_shard`` that can be called on a ``module``:
+
+.. autofunction:: fully_shard
+
+Calling ``fully_shard(module)`` dynamically constructs a new class that
+subclasses ``type(module)`` and an FSDP class ``FSDPModule``. For example, if
+we call ``fully_shard(linear)`` on a module ``linear: nn.Linear``, then FSDP
+constructs a new class ``FSDPLinear`` and changes ``linear`` 's type to this.
+Otherwise, ``fully_shard`` does not change the module structure and parameter
+fully-qualified names. The class ``FSDPModule`` allows providing some
+FSDP-specific methods on the module.
+
+.. autoclass:: FSDPModule
+    :members:
+    :member-order: bysource
+
+.. autoclass:: UnshardHandle
+    :members:
+
+.. autofunction:: register_fsdp_forward_method
+
+.. autoclass:: MixedPrecisionPolicy
+    :members:
+
+.. autoclass:: OffloadPolicy
+    :members:
+
+.. autoclass:: CPUOffloadPolicy
+    :members:
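To make the class-swizzling behavior described in the new docs concrete, here is a hedged sketch (not part of this PR): after `fully_shard(linear)`, the module is an instance of both its original class and `FSDPModule`, so FSDP-specific methods such as `unshard()` and `reshard()` become available on it. The `linear` module and the initialized process group are assumptions carried over from the earlier example.

```python
# Illustrative only; assumes `linear = nn.Linear(16, 16)` has already been
# passed to fully_shard(linear) in an initialized distributed setting.
import torch.nn as nn
from torch.distributed.fsdp import FSDPModule

assert isinstance(linear, nn.Linear) and isinstance(linear, FSDPModule)
print(type(linear).__name__)  # dynamically created subclass, e.g. "FSDPLinear"

linear.unshard()  # manually all-gather the sharded parameters
linear.reshard()  # free the unsharded parameters again
```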
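The FSDP1 vs. FSDP2 comparison above also notes that FSDP2 produces sharded, `DTensor`-valued state dicts instead of full state dicts. Below is a hedged sketch of resharding such a state dict to plain full tensors with `DTensor.full_tensor()`; `model` is assumed to be a module already wrapped with `fully_shard`, and since `full_tensor()` is a collective, every rank must run it.

```python
# Sketch under the assumptions above: materialize a full (unsharded) state
# dict from FSDP2's sharded one. DTensor.full_tensor() all-gathers the shards,
# so this must run on every rank.
from torch.distributed.tensor import DTensor

sharded_sd = model.state_dict()  # communication-free; values are DTensors
full_sd = {
    key: value.full_tensor() if isinstance(value, DTensor) else value
    for key, value in sharded_sd.items()
}
```

For large models, the docs instead point to PyTorch Distributed Checkpoint's distributed state dict APIs, which avoid materializing every full tensor on every rank.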

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ Features described in this documentation are classified by release status:
    torch.distributed.algorithms.join <distributed.algorithms.join>
    torch.distributed.elastic <distributed.elastic>
    torch.distributed.fsdp <fsdp>
+   torch.distributed.fsdp.fully_shard <distributed.fsdp.fully_shard>
    torch.distributed.tensor.parallel <distributed.tensor.parallel>
    torch.distributed.optim <distributed.optim>
    torch.distributed.pipelining <distributed.pipelining>

test/distributed/_composable/fsdp/test_fully_shard_autograd.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._composable.fsdp import fully_shard
+from torch.distributed.fsdp import fully_shard
 from torch.nn.parallel.scatter_gather import _is_namedtuple
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu

test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py

Lines changed: 1 addition & 1 deletion
@@ -7,8 +7,8 @@
 import torch
 import torch.nn as nn
 from torch.distributed._composable import replicate
-from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.fsdp import fully_shard
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest, MLPStack

test/distributed/_composable/fsdp/test_fully_shard_comm.py

Lines changed: 9 additions & 9 deletions
@@ -11,30 +11,30 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributed._composable import checkpoint, replicate
-from torch.distributed._composable.fsdp import (
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.fsdp import (
     FSDPModule,
     fully_shard,
     MixedPrecisionPolicy,
     OffloadPolicy,
 )
-from torch.distributed._composable.fsdp._fsdp_collectives import (
+from torch.distributed.fsdp._fully_shard._fsdp_collectives import (
     _div_if_needed,
     _get_gradient_divide_factors,
     foreach_all_gather,
     foreach_all_gather_copy_out,
     foreach_reduce,
 )
-from torch.distributed._composable.fsdp._fsdp_common import FSDPMeshInfo, TrainingState
-from torch.distributed._composable.fsdp._fsdp_init import (
+from torch.distributed.fsdp._fully_shard._fsdp_common import FSDPMeshInfo, TrainingState
+from torch.distributed.fsdp._fully_shard._fsdp_init import (
     _get_post_forward_mesh_info,
     _init_default_fully_shard_mesh,
 )
-from torch.distributed._composable.fsdp._fsdp_param import ShardedState
-from torch.distributed._composable.fsdp._fsdp_param_group import FSDPParamGroup
-from torch.distributed._tensor import DTensor
-from torch.distributed._tensor.experimental import implicit_replication
-from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.fsdp._fully_shard._fsdp_param import ShardedState
+from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup
+from torch.distributed.tensor import DTensor
 from torch.distributed.tensor.debug import CommDebugMode
+from torch.distributed.tensor.experimental import implicit_replication
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (

test/distributed/_composable/fsdp/test_fully_shard_compile.py

Lines changed: 8 additions & 6 deletions
@@ -12,17 +12,19 @@

 import torch
 import torch._dynamo.testing
-import torch.distributed._composable.fsdp._fsdp_param
 import torch.nn.functional as F
 from torch import nn
 from torch._dynamo.utils import counters
 from torch._inductor import comms
 from torch._inductor.utils import is_fallback_op, run_and_get_code
-from torch.distributed._composable.fsdp import fully_shard
-from torch.distributed._composable.fsdp._fsdp_common import TrainingState
-from torch.distributed._composable.fsdp._fsdp_param_group import FSDPParamGroup
 from torch.distributed._tensor import init_device_mesh
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
+from torch.distributed.fsdp import (
+    fully_shard,
+    FullyShardedDataParallel as FSDP,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp._fully_shard._fsdp_common import TrainingState
+from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup
 from torch.testing import FileCheck
 from torch.testing._internal.common_distributed import (
     at_least_x_gpu,
@@ -83,7 +85,7 @@ def _test_disable_compiling_hooks(
     ):
         torch._dynamo.reset()
         trace_rules_check_count = 0
-        HOOKS_FILE_NAME = "torch/distributed/_composable/fsdp/_fsdp_state.py"
+        HOOKS_FILE_NAME = "torch/distributed/fsdp/_fully_shard/_fsdp_state.py"
         HOOK_WRAPPER_NAME = "fsdp_hook_wrapper"

         def patched_trace_rules_check(*args, **kwargs):

test/distributed/_composable/fsdp/test_fully_shard_extensions.py

Lines changed: 1 addition & 1 deletion
@@ -13,8 +13,8 @@
 import torch.nn as nn
 import torch.utils._pytree as pytree
 from torch.autograd.grad_mode import _unsafe_preserve_version_counter
-from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (

test/distributed/_composable/fsdp/test_fully_shard_frozen.py

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributed._composable import checkpoint, replicate
-from torch.distributed._composable.fsdp import fully_shard
-from torch.distributed._composable.fsdp._fsdp_param_group import (
+from torch.distributed.fsdp import fully_shard
+from torch.distributed.fsdp._fully_shard._fsdp_param_group import (
     RegisterPostBackwardFunction,
 )
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu

test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py

Lines changed: 1 addition & 1 deletion
@@ -4,8 +4,8 @@
 import torch
 import torch.nn as nn
 from torch.amp.grad_scaler import GradScaler, OptState
-from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed._tensor import init_device_mesh
+from torch.distributed.fsdp import fully_shard
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     parallelize_module,

test/distributed/_composable/fsdp/test_fully_shard_init.py

Lines changed: 30 additions & 7 deletions
@@ -9,13 +9,6 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._composable import replicate
-from torch.distributed._composable.fsdp import fully_shard
-from torch.distributed._composable.fsdp._fsdp_init import (
-    _get_managed_modules,
-    _get_managed_states,
-)
-from torch.distributed._composable.fsdp._fsdp_param import ParamModuleInfo
-from torch.distributed._composable.fsdp._fsdp_param_group import _get_param_module_infos
 from torch.distributed._tensor import (
     DeviceMesh,
     distribute_tensor,
@@ -24,6 +17,15 @@
     Shard,
 )
 from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import fully_shard
+from torch.distributed.fsdp._fully_shard._fsdp_init import (
+    _get_managed_modules,
+    _get_managed_states,
+)
+from torch.distributed.fsdp._fully_shard._fsdp_param import ParamModuleInfo
+from torch.distributed.fsdp._fully_shard._fsdp_param_group import (
+    _get_param_module_infos,
+)
 from torch.distributed.fsdp._init_utils import (
     _init_inter_node_process_group,
     _init_intra_node_process_group,
@@ -1156,5 +1158,26 @@ def shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
         fully_shard(model, shard_placement_fn=shard_placement_fn)


+# TODO: Remove this test class once we remove the old import path:
+# torch/distributed/_composable/fsdp
+class TestFullyShardOldImport(FSDPTestMultiThread):
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    def test_old_import_training(self):
+        from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy
+
+        model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16))
+        mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16)
+        fully_shard(model[0], mp_policy=mp_policy)
+        fully_shard(model[1], mp_policy=mp_policy)
+        fully_shard(model, mp_policy=mp_policy)
+
+        inp = torch.randn((8, 16), device="cuda")
+        model(inp).sum().backward()
+
+
 if __name__ == "__main__":
     run_tests()

0 commit comments
