
Commit 9f5e604

Merge branch 'master' into feat/dynamo_export_onnx
2 parents 019125d + fafc239 commit 9f5e604

46 files changed: +353, -139 lines

.github/ISSUE_TEMPLATE/1_bug_report.yaml

Lines changed: 12 additions & 2 deletions
@@ -46,12 +46,22 @@ body:
     attributes:
       value: "**Note: The rest of this form is optional, but filling it out may help us to provide better support.**"

+  - type: input
+    attributes:
+      label: Reproduced in studio
+      description: >
+        Create a new Lightning Studio with code that reproduces the issue and share the link.
+        Also include all the relevant files and data required to reproduce shared issue.
+        In case the code does not crash, please add assert statements to show what is the real and expected output.
+        A simple guide on how to create such a studio can be found [here](https://www.youtube.com/watch?v=YcW-2Zt_bFg&ab_channel=LightningAI).
+      placeholder: https://lightning.ai/live-session/...
+    validations:
+      required: false
   - type: textarea
     attributes:
       label: How to reproduce the bug
       description: >
-        Provide steps and example code here.
-        You can also paste a link to Google Colab (see our [Colab bug report template](https://colab.research.google.com/github/Lightning-AI/lightning/blob/master/examples/pytorch/bug_report/bug_report_model.ipynb)) or adapt this minimal [snippet](https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/bug_report/bug_report_model.py).
+        In the special case when the issue can't be reproduced in a studio, provide steps and example code here.
       placeholder: |
         ```python
         # Sample code to reproduce the problem

.github/dependabot.yml

Lines changed: 0 additions & 4 deletions
@@ -19,8 +19,6 @@ updates:
      separator: "-"
    # Allow up to 5 open pull requests for pip dependencies
    open-pull-requests-limit: 10
-    reviewers:
-      - "Lightning-AI/teams/core-lightning"

  # Enable version updates for GitHub Actions
  - package-ecosystem: "github-actions"
@@ -37,5 +35,3 @@ updates:
      separator: "-"
    # Allow up to 5 open pull requests for GitHub Actions
    open-pull-requests-limit: 10
-    reviewers:
-      - "Lightning-AI/teams/core-lightning"

.github/workflows/_legacy-checkpoints.yml

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ jobs:

      - run: pip install -r requirements/ci.txt
      - name: Upload checkpoints to S3
-        if: secrets.AWS_REGION != ''
+        if: ${{ secrets[AWS_REGION] != '' }}
        working-directory: ${{ env.LEGACY_FOLDER }}
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY }}

.github/workflows/docker-build.yml

Lines changed: 5 additions & 0 deletions
@@ -94,6 +94,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
+        # adding dome more images as Thunder mainly using python 3.10,
+        # and we need to support integrations as for example LitGPT
+        python_version: ["3.10"]
+        pytorch_version: ["2.6.0", "2.7.0"]
+        cuda_version: ["12.6.3"]
        include:
          # These are the base images for PL release docker images.
          # Make sure the matrix here matches the one above.

.github/workflows/release-pkg.yml

Lines changed: 10 additions & 10 deletions
@@ -13,7 +13,7 @@ on:
      - "requirements/ci.txt"
      - ".github/actions/pkg-check/*"
      - ".github/actions/pkg-publish/*"
-      - ".github/workflows/_legacy-checkpoints.yml.yml"
+      - ".github/workflows/_legacy-checkpoints.yml"
      - ".github/workflows/_build-packages.yml"
      - ".github/workflows/release-pypi.yml"

@@ -179,12 +179,12 @@ jobs:
    with:
      pkg-folder: dist/${{ steps.folder.outputs.pkg }}
      pypi-token: ${{ secrets[format('PYPI_TOKEN_{0}', matrix.name)] }}
-
-  legacy-checkpoints:
-    needs: [build-packages]
-    uses: ./.github/workflows/_legacy-checkpoints.yml
-    with:
-      push_to_s3: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-      upload_local: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-      create_pr: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-    secrets: inherit
+  # FIXME: this is not working suddenly, Unrecognized named-value: 'secrets'
+  # legacy-checkpoints:
+  #   needs: [build-packages]
+  #   uses: ./.github/workflows/_legacy-checkpoints.yml
+  #   with:
+  #     push_to_s3: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #     upload_local: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #     create_pr: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #   secrets: inherit

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ node_modules/
 **/events.out.tfevents.*
 examples/**/*.png

-# instalation artifacts
+# installation artifacts
 requirements/base.txt

 # CI

_notebooks

docs/source-fabric/advanced/model_parallel/tp_fsdp.rst

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ Next steps

 .. displayitem::
    :header: Pipeline Parallelism
-   :description: Coming sooon
+   :description: Coming soon
    :col_css: col-md-4
    :height: 160
    :tag: advanced

docs/source-pytorch/advanced/compile.rst

Lines changed: 2 additions & 2 deletions
@@ -262,7 +262,7 @@ Avoid graph breaks
 When ``torch.compile`` looks at the code in your model's ``forward()`` or ``*_step()`` method, it will try to compile as much of the code as possible.
 If there are regions in the code that it doesn't understand, it will introduce a so-called "graph break" that essentially splits the code in optimized and unoptimized parts.
 Graph breaks aren't a deal breaker, since the optimized parts should still run faster.
-But if you want to get the most out of ``torch.compile``, you might want to invest rewriting the problematic section of the code that produce the breaks.
+But if you want to get the most out of ``torch.compile``, you might want to invest rewriting the problematic section of the code that produces the breaks.

 You can check whether your model produces graph breaks by calling ``torch.compile`` with ``fullgraph=True``:

@@ -332,7 +332,7 @@ Enabling CUDA Graphs often results in a significant speedup, but sometimes also

 **Shape padding:** The specific shape/size of the tensors involved in the computation of your model (input, activations, weights, gradients, etc.) can have an impact on the performance.
 With shape padding enabled, ``torch.compile`` can extend the tensors by padding to a size that gives a better memory alignment.
-Naturally, the tradoff here is that it will consume a bit more memory.
+Naturally, the tradeoff here is that it will consume a bit more memory.

 .. code-block:: python

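For reference, a minimal sketch of the ``fullgraph=True`` check mentioned in the context above; the ``ToyModel`` module here is illustrative and not part of the commit:

```python
import torch


class ToyModel(torch.nn.Module):
    """Illustrative stand-in for a user model."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)


# With fullgraph=True, torch.compile errors out at the first graph break
# instead of silently splitting the program into optimized and unoptimized parts.
compiled = torch.compile(ToyModel(), fullgraph=True)
out = compiled(torch.randn(4, 32))  # compilation is triggered by the first call
```
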
docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst

Lines changed: 1 addition & 1 deletion
@@ -282,7 +282,7 @@ Next steps

 .. displayitem::
    :header: Pipeline Parallelism
-   :description: Coming sooon
+   :description: Coming soon
    :col_css: col-md-4
    :height: 160
    :tag: advanced

docs/source-pytorch/advanced/post_training_quantization.rst

Lines changed: 2 additions & 2 deletions
@@ -106,7 +106,7 @@ The "approach" parameter in PostTrainingQuantConfig is defined by the user to ma
 Quantize the model
 ==================

-The model can be qutized by Intel® Neural Compressor with:
+The model can be quantized by Intel® Neural Compressor with:

 .. code-block:: python

@@ -126,7 +126,7 @@ At last, the quantized model can be saved by:
 Hands-on Examples
 *****************

-Based on the `given example code <https://lightning.ai/docs/pytorch/2.1.0/notebooks/lightning_examples/text-transformers.html>`_, we show how Intel Neural Compressor conduct model quantization on PyTorch Lightning. We first define the basic config of the quantization process.
+Based on the `given example code <https://lightning.ai/docs/pytorch/2.1.0/notebooks/lightning_examples/text-transformers.html>`_, we show how Intel Neural Compressor conducts model quantization on PyTorch Lightning. We first define the basic config of the quantization process.

 .. code-block:: python

docs/source-pytorch/advanced/pruning_quantization.rst

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ You can also perform iterative pruning, apply the `lottery ticket hypothesis <ht
 .. code-block:: python

     def compute_amount(epoch):
-        # the sum of all returned values need to be smaller than 1
+        # the sum of all returned values needs to be smaller than 1
         if epoch == 10:
             return 0.5

@@ -43,7 +43,7 @@ You can also perform iterative pruning, apply the `lottery ticket hypothesis <ht
             return 0.01


-    # the amount can be also be a callable
+    # the amount can also be a callable
     trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)])

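Putting the two hunks above together, a self-contained sketch of the callable-``amount`` pattern (imports assume the ``lightning.pytorch`` namespace; only the branches visible in this diff are reproduced):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelPruning


def compute_amount(epoch):
    # the sum of all returned values needs to be smaller than 1
    if epoch == 10:
        return 0.5
    return 0.01


# the amount can also be a callable
trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)])
```
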
docs/source-pytorch/advanced/training_tricks.rst

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ If the Trainer's ``gradient_clip_algorithm`` is set to ``'value'`` (``'norm'`` b
     # clip gradients' maximum magnitude to <=0.5
     trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")

-Read more about :ref:`Configuring Gradient Clipping <configure_gradient_clipping>` for advanced use-cases.
+Read more about :ref:`Configuring Gradient Clipping <configure_gradient_clipping>` for advanced use cases.

----------
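The advanced use cases referenced above go through the ``configure_gradient_clipping`` hook; a hedged sketch of such an override (the epoch-based threshold is purely illustrative):

```python
from lightning.pytorch import LightningModule


class LitModel(LightningModule):
    def configure_gradient_clipping(self, optimizer, gradient_clip_val=None, gradient_clip_algorithm=None):
        # illustrative policy: clip harder during the first few epochs
        if self.current_epoch < 5:
            gradient_clip_val = 0.1
        self.clip_gradients(
            optimizer,
            gradient_clip_val=gradient_clip_val,
            gradient_clip_algorithm=gradient_clip_algorithm,
        )
```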

requirements/ci.txt

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
-setuptools <70.1.1
+setuptools <80.9.1
 wheel <0.46.0
 awscli >=1.30.0, <1.41.0
-twine ==6.0.1
+twine ==6.1.0
 importlib-metadata <9.0.0
 wget
 pkginfo ==1.12.1.2

requirements/docs.txt

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 sphinx >5.0, <6.0
-myst-parser >=0.18.1, <3.0.0
-nbsphinx >=0.8.5, <=0.9.2
-nbconvert <7.14 # temporary fix for https://github.com/jupyter/nbconvert/issues/2092
+myst-parser >=0.18.1, <4.0.0
+nbsphinx >=0.8.5, <=0.9.7
+nbconvert >7.14, <7.17
 pandoc >=1.0, <=2.3
 docutils >=0.16, <0.22
 sphinxcontrib-fulltoc >=1.0, <=1.2.0
@@ -12,9 +12,9 @@ sphinx-paramlinks >=0.5.1, <=0.6.0
 sphinx-togglebutton >=0.2, <=0.3.2
 sphinx-copybutton >=0.3, <=0.5.2
 sphinx-multiproject
-sphinx-toolbox ==3.5.0
+sphinx-toolbox ==4.0.0
 sphinx-rtd-dark-mode
-sphinxcontrib-video ==0.2.0
+sphinxcontrib-video ==0.4.1
 jinja2 <3.2.0

 lightning-utilities >=0.11.1, <0.15.0

requirements/fabric/base.txt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 torch >=2.1.0, <2.8.0
-fsspec[http] >=2022.5.0, <2025.4.0
+fsspec[http] >=2022.5.0, <2025.6.0
 packaging >=20.0, <=25.0
 typing-extensions >=4.4.0, <4.14.0
 lightning-utilities >=0.10.0, <0.15.0

requirements/fabric/test.txt

Lines changed: 5 additions & 5 deletions
@@ -1,9 +1,9 @@
-coverage ==7.8.0
+coverage ==7.8.2
 numpy >=1.17.2, <1.27.0
 pytest ==8.3.5
 pytest-cov ==6.1.1
-pytest-timeout ==2.3.1
-pytest-rerunfailures ==12.0
-pytest-random-order ==1.1.0
-click ==8.1.7
+pytest-timeout ==2.4.0
+pytest-rerunfailures ==15.1
+pytest-random-order ==1.1.1
+click ==8.1.8
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute

requirements/pytorch/base.txt

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 torch >=2.1.0, <2.8.0
-tqdm >=4.57.0, <4.67.0
+tqdm >=4.57.0, <4.68.0
 PyYAML >=5.4, <6.1.0
-fsspec[http] >=2022.5.0, <2025.4.0
+fsspec[http] >=2022.5.0, <2025.6.0
 torchmetrics >=0.7.0, <1.8.0
 packaging >=20.0, <=25.0
 typing-extensions >=4.4.0, <4.14.0

requirements/pytorch/docs.txt

Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,6 @@

 nbformat # used for generate empty notebook
 ipython[notebook] <8.19.0
-setuptools<58.0 # workaround for `error in ipython setup command: use_2to3 is invalid.`
+setuptools<81.0 # workaround for `error in ipython setup command: use_2to3 is invalid.`

 #-r ../../_notebooks/.actions/requires.txt

requirements/pytorch/extra.txt

Lines changed: 3 additions & 3 deletions
@@ -2,10 +2,10 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 # extended list of package dependencies to reach full functionality
-matplotlib>3.1, <3.9.0
+matplotlib>3.1, <3.10.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures] >=4.27.7, <=4.35.0
-rich >=12.3.0, <13.6.0
+jsonargparse[signatures] >=4.28.0, <=4.40.0
+rich >=12.3.0, <14.1.0
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin"

requirements/pytorch/test.txt

Lines changed: 5 additions & 5 deletions
@@ -1,15 +1,15 @@
-coverage ==7.8.0
+coverage ==7.8.2
 pytest ==8.3.5
 pytest-cov ==6.1.1
-pytest-timeout ==2.3.1
-pytest-rerunfailures ==12.0
-pytest-random-order ==1.1.0
+pytest-timeout ==2.4.0
+pytest-rerunfailures ==15.1
+pytest-random-order ==1.1.1

 # needed in tests
 cloudpickle >=1.3, <3.2.0
 scikit-learn >0.22.1, <1.7.0
 numpy >=1.17.2, <1.27.0
-onnx >=1.12.0, <1.18.0
+onnx >=1.12.0, <1.19.0
 onnxruntime >=1.12.0, <1.21.0
 onnxscript >= 0.2.2, <0.2.6
 psutil <7.0.1 # for `DeviceStatsMonitor`

src/lightning/fabric/connector.py

Lines changed: 1 addition & 1 deletion
@@ -239,7 +239,7 @@ def _check_config_and_set_final_flags(
            else:
                raise TypeError(
                    f"Found invalid type for plugin {plugin}. Expected one of: Precision, "
-                    "CheckpointIO, ClusterEnviroment."
+                    "CheckpointIO, ClusterEnvironment."
                )

        duplicated_plugin_key = [k for k, v in plugins_flags_types.items() if v > 1]
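For context, a hedged sketch of where this check is hit: ``plugins`` accepts instances of the listed types, and anything else reaches the ``TypeError`` above (``SLURMEnvironment`` is just one possible ``ClusterEnvironment``):

```python
from lightning.fabric import Fabric
from lightning.fabric.plugins.environments import SLURMEnvironment

# A ClusterEnvironment instance is one of the accepted plugin types.
fabric = Fabric(accelerator="cpu", plugins=[SLURMEnvironment()])

# Passing anything else, e.g. Fabric(plugins=["not-a-plugin"]), raises:
# TypeError: Found invalid type for plugin ... Expected one of: Precision, CheckpointIO, ClusterEnvironment.
```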

src/lightning/fabric/plugins/precision/bitsandbytes.py

Lines changed: 1 addition & 1 deletion
@@ -403,7 +403,7 @@ class _NF4DQLinear(_Linear4bit):
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            super().__init__(*args, quant_type="nf4", compress_statistics=True, **kwargs)

-    # these classes are defined programatically like this to avoid importing bitsandbytes in environments that have
+    # these classes are defined programmatically like this to avoid importing bitsandbytes in environments that have
    # it available but will not use it
    classes = {
        "_Linear8bitLt": _Linear8bitLt,

src/lightning/fabric/plugins/precision/transformer_engine.py

Lines changed: 3 additions & 1 deletion
@@ -171,7 +171,9 @@ def _convert_layers(module: torch.nn.Module) -> None:
        elif isinstance(child, torch.nn.LayerNorm):
            replacement = te.LayerNorm(child.normalized_shape[0], eps=child.eps)
            replacement.weight.data = child.weight.data.clone()
-            replacement.bias.data = child.bias.data.clone()
+            # Check if bias exists before attempting to clone its data
+            if child.bias is not None and replacement.bias is not None:
+                replacement.bias.data = child.bias.data.clone()
            log.debug(f"Replacing layer {name!r} with Transformer Engine equivalent")
            module.__setattr__(name, replacement)
        else:
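A hedged illustration of the case the new guard covers: ``torch.nn.LayerNorm`` can be constructed without a bias (the ``bias`` argument assumes a recent PyTorch), in which case ``child.bias`` is ``None`` and the old unconditional clone would fail:

```python
import torch

with_bias = torch.nn.LayerNorm(128)                 # .bias is a Parameter
without_bias = torch.nn.LayerNorm(128, bias=False)  # .bias is None

for child in (with_bias, without_bias):
    # mirrors the guarded copy above: only clone when a bias actually exists
    if child.bias is not None:
        cloned_bias = child.bias.data.clone()
```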

src/lightning/fabric/utilities/cloud_io.py

Lines changed: 11 additions & 4 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Utilities related to data saving/loading."""

+import errno
 import io
 import logging
 from pathlib import Path
@@ -84,10 +85,16 @@ def _atomic_save(checkpoint: dict[str, Any], filepath: Union[str, Path]) -> None
     log.debug(f"Saving checkpoint: {filepath}")
     torch.save(checkpoint, bytesbuffer)

-    # We use a transaction here to avoid file corruption if the save gets interrupted
-    fs, urlpath = fsspec.core.url_to_fs(str(filepath))
-    with fs.transaction, fs.open(urlpath, "wb") as f:
-        f.write(bytesbuffer.getvalue())
+    try:
+        # We use a transaction here to avoid file corruption if the save gets interrupted
+        fs, urlpath = fsspec.core.url_to_fs(str(filepath))
+        with fs.transaction, fs.open(urlpath, "wb") as f:
+            f.write(bytesbuffer.getvalue())
+    except PermissionError as e:
+        if isinstance(e.__context__, OSError) and getattr(e.__context__, "errno", None) == errno.EXDEV:
+            raise RuntimeError(
+                'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"',
+            ) from e


 def _is_object_storage(fs: AbstractFileSystem) -> bool:
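For reference, a small sketch of the exception-chain test the new ``except`` block performs; the helper name is illustrative and not part of the module:

```python
import errno


def _looks_like_cross_device_error(exc: BaseException) -> bool:
    # True when a PermissionError was implicitly chained from an OSError
    # carrying EXDEV, i.e. a rename attempted across filesystems/devices.
    ctx = exc.__context__
    return isinstance(ctx, OSError) and getattr(ctx, "errno", None) == errno.EXDEV


try:
    try:
        raise OSError(errno.EXDEV, "Invalid cross-device link")
    except OSError:
        raise PermissionError("simulated fsspec failure")  # chains the OSError as __context__
except PermissionError as err:
    assert _looks_like_cross_device_error(err)
```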

src/lightning/fabric/utilities/imports.py

Lines changed: 0 additions & 2 deletions
@@ -37,5 +37,3 @@
 _TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")

 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
-
-_UTILITIES_GREATER_EQUAL_0_10 = compare_version("lightning_utilities", operator.ge, "0.10.0")
