
Commit 9f5e604

Merge branch 'master' into feat/dynamo_export_onnx
2 parents 019125d + fafc239 commit 9f5e604

46 files changed: +353, -139 lines

.github/ISSUE_TEMPLATE/1_bug_report.yaml

Lines changed: 12 additions & 2 deletions
@@ -46,12 +46,22 @@ body:
     attributes:
       value: "**Note: The rest of this form is optional, but filling it out may help us to provide better support.**"

+  - type: input
+    attributes:
+      label: Reproduced in studio
+      description: >
+        Create a new Lightning Studio with code that reproduces the issue and share the link.
+        Also include all the relevant files and data required to reproduce shared issue.
+        In case the code does not crash, please add assert statements to show what is the real and expected output.
+        A simple guide on how to create such a studio can be found [here](https://www.youtube.com/watch?v=YcW-2Zt_bFg&ab_channel=LightningAI).
+      placeholder: https://lightning.ai/live-session/...
+    validations:
+      required: false
   - type: textarea
     attributes:
       label: How to reproduce the bug
       description: >
-        Provide steps and example code here.
-        You can also paste a link to Google Colab (see our [Colab bug report template](https://colab.research.google.com/github/Lightning-AI/lightning/blob/master/examples/pytorch/bug_report/bug_report_model.ipynb)) or adapt this minimal [snippet](https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/bug_report/bug_report_model.py).
+        In the special case when the issue can't be reproduced in a studio, provide steps and example code here.
       placeholder: |
         ```python
         # Sample code to reproduce the problem

.github/dependabot.yml

Lines changed: 0 additions & 4 deletions
@@ -19,8 +19,6 @@ updates:
      separator: "-"
    # Allow up to 5 open pull requests for pip dependencies
    open-pull-requests-limit: 10
-    reviewers:
-      - "Lightning-AI/teams/core-lightning"

  # Enable version updates for GitHub Actions
  - package-ecosystem: "github-actions"
@@ -37,5 +35,3 @@ updates:
      separator: "-"
    # Allow up to 5 open pull requests for GitHub Actions
    open-pull-requests-limit: 10
-    reviewers:
-      - "Lightning-AI/teams/core-lightning"

.github/workflows/_legacy-checkpoints.yml

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ jobs:

      - run: pip install -r requirements/ci.txt
      - name: Upload checkpoints to S3
-        if: secrets.AWS_REGION != ''
+        if: ${{ secrets[AWS_REGION] != '' }}
        working-directory: ${{ env.LEGACY_FOLDER }}
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY }}

.github/workflows/docker-build.yml

Lines changed: 5 additions & 0 deletions
@@ -94,6 +94,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
+        # adding dome more images as Thunder mainly using python 3.10,
+        # and we need to support integrations as for example LitGPT
+        python_version: ["3.10"]
+        pytorch_version: ["2.6.0", "2.7.0"]
+        cuda_version: ["12.6.3"]
        include:
          # These are the base images for PL release docker images.
          # Make sure the matrix here matches the one above.

.github/workflows/release-pkg.yml

Lines changed: 10 additions & 10 deletions
@@ -13,7 +13,7 @@ on:
      - "requirements/ci.txt"
      - ".github/actions/pkg-check/*"
      - ".github/actions/pkg-publish/*"
-      - ".github/workflows/_legacy-checkpoints.yml.yml"
+      - ".github/workflows/_legacy-checkpoints.yml"
      - ".github/workflows/_build-packages.yml"
      - ".github/workflows/release-pypi.yml"

@@ -179,12 +179,12 @@ jobs:
    with:
      pkg-folder: dist/${{ steps.folder.outputs.pkg }}
      pypi-token: ${{ secrets[format('PYPI_TOKEN_{0}', matrix.name)] }}
-
-  legacy-checkpoints:
-    needs: [build-packages]
-    uses: ./.github/workflows/_legacy-checkpoints.yml
-    with:
-      push_to_s3: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-      upload_local: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-      create_pr: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
-    secrets: inherit
+  # FIXME: this is not working suddenly, Unrecognized named-value: 'secrets'
+  # legacy-checkpoints:
+  #   needs: [build-packages]
+  #   uses: ./.github/workflows/_legacy-checkpoints.yml
+  #   with:
+  #     push_to_s3: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #     upload_local: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #     create_pr: ${{ startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' }}
+  #   secrets: inherit

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ node_modules/
 **/events.out.tfevents.*
 examples/**/*.png

-# instalation artifacts
+# installation artifacts
 requirements/base.txt

 # CI

_notebooks

docs/source-fabric/advanced/model_parallel/tp_fsdp.rst

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ Next steps

 .. displayitem::
    :header: Pipeline Parallelism
-   :description: Coming sooon
+   :description: Coming soon
    :col_css: col-md-4
    :height: 160
    :tag: advanced

docs/source-pytorch/advanced/compile.rst

Lines changed: 2 additions & 2 deletions
@@ -262,7 +262,7 @@ Avoid graph breaks
 When ``torch.compile`` looks at the code in your model's ``forward()`` or ``*_step()`` method, it will try to compile as much of the code as possible.
 If there are regions in the code that it doesn't understand, it will introduce a so-called "graph break" that essentially splits the code in optimized and unoptimized parts.
 Graph breaks aren't a deal breaker, since the optimized parts should still run faster.
-But if you want to get the most out of ``torch.compile``, you might want to invest rewriting the problematic section of the code that produce the breaks.
+But if you want to get the most out of ``torch.compile``, you might want to invest rewriting the problematic section of the code that produces the breaks.

 You can check whether your model produces graph breaks by calling ``torch.compile`` with ``fullgraph=True``:

@@ -332,7 +332,7 @@ Enabling CUDA Graphs often results in a significant speedup, but sometimes also

 **Shape padding:** The specific shape/size of the tensors involved in the computation of your model (input, activations, weights, gradients, etc.) can have an impact on the performance.
 With shape padding enabled, ``torch.compile`` can extend the tensors by padding to a size that gives a better memory alignment.
-Naturally, the tradoff here is that it will consume a bit more memory.
+Naturally, the tradeoff here is that it will consume a bit more memory.

 .. code-block:: python

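For reference, a minimal sketch of the ``fullgraph=True`` check mentioned in the context above; the ``ToyModel`` module here is illustrative and not part of the commit:

```python
import torch


class ToyModel(torch.nn.Module):
    """Illustrative stand-in for a user model."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)


# With fullgraph=True, torch.compile errors out at the first graph break
# instead of silently splitting the program into optimized and unoptimized parts.
compiled = torch.compile(ToyModel(), fullgraph=True)
out = compiled(torch.randn(4, 32))  # compilation is triggered by the first call
```
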
docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst

Lines changed: 1 addition & 1 deletion
@@ -282,7 +282,7 @@ Next steps

 .. displayitem::
    :header: Pipeline Parallelism
-   :description: Coming sooon
+   :description: Coming soon
    :col_css: col-md-4
    :height: 160
    :tag: advanced

docs/source-pytorch/advanced/post_training_quantization.rst

Lines changed: 2 additions & 2 deletions
@@ -106,7 +106,7 @@ The "approach" parameter in PostTrainingQuantConfig is defined by the user to ma
 Quantize the model
 ==================

-The model can be qutized by Intel® Neural Compressor with:
+The model can be quantized by Intel® Neural Compressor with:

 .. code-block:: python

@@ -126,7 +126,7 @@ At last, the quantized model can be saved by:
 Hands-on Examples
 *****************

-Based on the `given example code <https://lightning.ai/docs/pytorch/2.1.0/notebooks/lightning_examples/text-transformers.html>`_, we show how Intel Neural Compressor conduct model quantization on PyTorch Lightning. We first define the basic config of the quantization process.
+Based on the `given example code <https://lightning.ai/docs/pytorch/2.1.0/notebooks/lightning_examples/text-transformers.html>`_, we show how Intel Neural Compressor conducts model quantization on PyTorch Lightning. We first define the basic config of the quantization process.

 .. code-block:: python

docs/source-pytorch/advanced/pruning_quantization.rst

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ You can also perform iterative pruning, apply the `lottery ticket hypothesis <ht
 .. code-block:: python

     def compute_amount(epoch):
-        # the sum of all returned values need to be smaller than 1
+        # the sum of all returned values needs to be smaller than 1
         if epoch == 10:
             return 0.5

@@ -43,7 +43,7 @@ You can also perform iterative pruning, apply the `lottery ticket hypothesis <ht
             return 0.01


-    # the amount can be also be a callable
+    # the amount can also be a callable
     trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)])

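Putting the two hunks above together, a self-contained sketch of the callable-``amount`` pattern (imports assume the ``lightning.pytorch`` namespace; only the branches visible in this diff are reproduced):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelPruning


def compute_amount(epoch):
    # the sum of all returned values needs to be smaller than 1
    if epoch == 10:
        return 0.5
    return 0.01


# the amount can also be a callable
trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)])
```
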
docs/source-pytorch/advanced/training_tricks.rst

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ If the Trainer's ``gradient_clip_algorithm`` is set to ``'value'`` (``'norm'`` b
     # clip gradients' maximum magnitude to <=0.5
     trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")

-Read more about :ref:`Configuring Gradient Clipping <configure_gradient_clipping>` for advanced use-cases.
+Read more about :ref:`Configuring Gradient Clipping <configure_gradient_clipping>` for advanced use cases.

----------
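The advanced use cases referenced above go through the ``configure_gradient_clipping`` hook; a hedged sketch of such an override (the epoch-based threshold is purely illustrative):

```python
from lightning.pytorch import LightningModule


class LitModel(LightningModule):
    def configure_gradient_clipping(self, optimizer, gradient_clip_val=None, gradient_clip_algorithm=None):
        # illustrative policy: clip harder during the first few epochs
        if self.current_epoch < 5:
            gradient_clip_val = 0.1
        self.clip_gradients(
            optimizer,
            gradient_clip_val=gradient_clip_val,
            gradient_clip_algorithm=gradient_clip_algorithm,
        )
```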

requirements/ci.txt

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
-setuptools <70.1.1
+setuptools <80.9.1
 wheel <0.46.0
 awscli >=1.30.0, <1.41.0
-twine ==6.0.1
+twine ==6.1.0
 importlib-metadata <9.0.0
 wget
 pkginfo ==1.12.1.2

requirements/docs.txt

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 sphinx >5.0, <6.0
-myst-parser >=0.18.1, <3.0.0
-nbsphinx >=0.8.5, <=0.9.2
-nbconvert <7.14 # temporary fix for https://github.com/jupyter/nbconvert/issues/2092
+myst-parser >=0.18.1, <4.0.0
+nbsphinx >=0.8.5, <=0.9.7
+nbconvert >7.14, <7.17
 pandoc >=1.0, <=2.3
 docutils >=0.16, <0.22
 sphinxcontrib-fulltoc >=1.0, <=1.2.0
@@ -12,9 +12,9 @@ sphinx-paramlinks >=0.5.1, <=0.6.0
 sphinx-togglebutton >=0.2, <=0.3.2
 sphinx-copybutton >=0.3, <=0.5.2
 sphinx-multiproject
-sphinx-toolbox ==3.5.0
+sphinx-toolbox ==4.0.0
 sphinx-rtd-dark-mode
-sphinxcontrib-video ==0.2.0
+sphinxcontrib-video ==0.4.1
 jinja2 <3.2.0

 lightning-utilities >=0.11.1, <0.15.0

requirements/fabric/base.txt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 torch >=2.1.0, <2.8.0
-fsspec[http] >=2022.5.0, <2025.4.0
+fsspec[http] >=2022.5.0, <2025.6.0
 packaging >=20.0, <=25.0
 typing-extensions >=4.4.0, <4.14.0
 lightning-utilities >=0.10.0, <0.15.0

requirements/fabric/test.txt

Lines changed: 5 additions & 5 deletions
@@ -1,9 +1,9 @@
-coverage ==7.8.0
+coverage ==7.8.2
 numpy >=1.17.2, <1.27.0
 pytest ==8.3.5
 pytest-cov ==6.1.1
-pytest-timeout ==2.3.1
-pytest-rerunfailures ==12.0
-pytest-random-order ==1.1.0
-click ==8.1.7
+pytest-timeout ==2.4.0
+pytest-rerunfailures ==15.1
+pytest-random-order ==1.1.1
+click ==8.1.8
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute

requirements/pytorch/base.txt

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 torch >=2.1.0, <2.8.0
-tqdm >=4.57.0, <4.67.0
+tqdm >=4.57.0, <4.68.0
 PyYAML >=5.4, <6.1.0
-fsspec[http] >=2022.5.0, <2025.4.0
+fsspec[http] >=2022.5.0, <2025.6.0
 torchmetrics >=0.7.0, <1.8.0
 packaging >=20.0, <=25.0
 typing-extensions >=4.4.0, <4.14.0

requirements/pytorch/docs.txt

Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,6 @@

 nbformat # used for generate empty notebook
 ipython[notebook] <8.19.0
-setuptools<58.0 # workaround for `error in ipython setup command: use_2to3 is invalid.`
+setuptools<81.0 # workaround for `error in ipython setup command: use_2to3 is invalid.`

 #-r ../../_notebooks/.actions/requires.txt

requirements/pytorch/extra.txt

Lines changed: 3 additions & 3 deletions
@@ -2,10 +2,10 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 # extended list of package dependencies to reach full functionality
-matplotlib>3.1, <3.9.0
+matplotlib>3.1, <3.10.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures] >=4.27.7, <=4.35.0
-rich >=12.3.0, <13.6.0
+jsonargparse[signatures] >=4.28.0, <=4.40.0
+rich >=12.3.0, <14.1.0
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin"

requirements/pytorch/test.txt

Lines changed: 5 additions & 5 deletions
@@ -1,15 +1,15 @@
-coverage ==7.8.0
+coverage ==7.8.2
 pytest ==8.3.5
 pytest-cov ==6.1.1
-pytest-timeout ==2.3.1
-pytest-rerunfailures ==12.0
-pytest-random-order ==1.1.0
+pytest-timeout ==2.4.0
+pytest-rerunfailures ==15.1
+pytest-random-order ==1.1.1

 # needed in tests
 cloudpickle >=1.3, <3.2.0
 scikit-learn >0.22.1, <1.7.0
 numpy >=1.17.2, <1.27.0
-onnx >=1.12.0, <1.18.0
+onnx >=1.12.0, <1.19.0
 onnxruntime >=1.12.0, <1.21.0
 onnxscript >= 0.2.2, <0.2.6
 psutil <7.0.1 # for `DeviceStatsMonitor`

src/lightning/fabric/connector.py

Lines changed: 1 addition & 1 deletion
@@ -239,7 +239,7 @@ def _check_config_and_set_final_flags(
            else:
                raise TypeError(
                    f"Found invalid type for plugin {plugin}. Expected one of: Precision, "
-                    "CheckpointIO, ClusterEnviroment."
+                    "CheckpointIO, ClusterEnvironment."
                )

        duplicated_plugin_key = [k for k, v in plugins_flags_types.items() if v > 1]
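For context, a hedged sketch of where this check is hit: ``plugins`` accepts instances of the listed types, and anything else reaches the ``TypeError`` above (``SLURMEnvironment`` is just one possible ``ClusterEnvironment``):

```python
from lightning.fabric import Fabric
from lightning.fabric.plugins.environments import SLURMEnvironment

# A ClusterEnvironment instance is one of the accepted plugin types.
fabric = Fabric(accelerator="cpu", plugins=[SLURMEnvironment()])

# Passing anything else, e.g. Fabric(plugins=["not-a-plugin"]), raises:
# TypeError: Found invalid type for plugin ... Expected one of: Precision, CheckpointIO, ClusterEnvironment.
```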

src/lightning/fabric/plugins/precision/bitsandbytes.py

Lines changed: 1 addition & 1 deletion
@@ -403,7 +403,7 @@ class _NF4DQLinear(_Linear4bit):
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            super().__init__(*args, quant_type="nf4", compress_statistics=True, **kwargs)

-    # these classes are defined programatically like this to avoid importing bitsandbytes in environments that have
+    # these classes are defined programmatically like this to avoid importing bitsandbytes in environments that have
    # it available but will not use it
    classes = {
        "_Linear8bitLt": _Linear8bitLt,

src/lightning/fabric/plugins/precision/transformer_engine.py

Lines changed: 3 additions & 1 deletion
@@ -171,7 +171,9 @@ def _convert_layers(module: torch.nn.Module) -> None:
        elif isinstance(child, torch.nn.LayerNorm):
            replacement = te.LayerNorm(child.normalized_shape[0], eps=child.eps)
            replacement.weight.data = child.weight.data.clone()
-            replacement.bias.data = child.bias.data.clone()
+            # Check if bias exists before attempting to clone its data
+            if child.bias is not None and replacement.bias is not None:
+                replacement.bias.data = child.bias.data.clone()
            log.debug(f"Replacing layer {name!r} with Transformer Engine equivalent")
            module.__setattr__(name, replacement)
        else:
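A hedged illustration of the case the new guard covers: ``torch.nn.LayerNorm`` can be constructed without a bias (the ``bias`` argument assumes a recent PyTorch), in which case ``child.bias`` is ``None`` and the old unconditional clone would fail:

```python
import torch

with_bias = torch.nn.LayerNorm(128)                 # .bias is a Parameter
without_bias = torch.nn.LayerNorm(128, bias=False)  # .bias is None

for child in (with_bias, without_bias):
    # mirrors the guarded copy above: only clone when a bias actually exists
    if child.bias is not None:
        cloned_bias = child.bias.data.clone()
```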

src/lightning/fabric/utilities/cloud_io.py

Lines changed: 11 additions & 4 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Utilities related to data saving/loading."""

+import errno
 import io
 import logging
 from pathlib import Path
@@ -84,10 +85,16 @@ def _atomic_save(checkpoint: dict[str, Any], filepath: Union[str, Path]) -> None
     log.debug(f"Saving checkpoint: {filepath}")
     torch.save(checkpoint, bytesbuffer)

-    # We use a transaction here to avoid file corruption if the save gets interrupted
-    fs, urlpath = fsspec.core.url_to_fs(str(filepath))
-    with fs.transaction, fs.open(urlpath, "wb") as f:
-        f.write(bytesbuffer.getvalue())
+    try:
+        # We use a transaction here to avoid file corruption if the save gets interrupted
+        fs, urlpath = fsspec.core.url_to_fs(str(filepath))
+        with fs.transaction, fs.open(urlpath, "wb") as f:
+            f.write(bytesbuffer.getvalue())
+    except PermissionError as e:
+        if isinstance(e.__context__, OSError) and getattr(e.__context__, "errno", None) == errno.EXDEV:
+            raise RuntimeError(
+                'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"',
+            ) from e


 def _is_object_storage(fs: AbstractFileSystem) -> bool:
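For reference, a small sketch of the exception-chain test the new ``except`` block performs; the helper name is illustrative and not part of the module:

```python
import errno


def _looks_like_cross_device_error(exc: BaseException) -> bool:
    # True when a PermissionError was implicitly chained from an OSError
    # carrying EXDEV, i.e. a rename attempted across filesystems/devices.
    ctx = exc.__context__
    return isinstance(ctx, OSError) and getattr(ctx, "errno", None) == errno.EXDEV


try:
    try:
        raise OSError(errno.EXDEV, "Invalid cross-device link")
    except OSError:
        raise PermissionError("simulated fsspec failure")  # chains the OSError as __context__
except PermissionError as err:
    assert _looks_like_cross_device_error(err)
```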

src/lightning/fabric/utilities/imports.py

Lines changed: 0 additions & 2 deletions
@@ -37,5 +37,3 @@
 _TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")

 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
-
-_UTILITIES_GREATER_EQUAL_0_10 = compare_version("lightning_utilities", operator.ge, "0.10.0")
