Commit 3086e26

Speed up imports and add a CI (#2845)
* Working test
* Timing cleanup
* Add CI
* Fix nits
* Mixup imports
* Clean
* tuna -> tuna-interpreter
* Refactor pippy imports
* Accelerator
* Fin
* Fin
* Keep specific ones for docs
1 parent 5d5d07a commit 3086e26

8 files changed: +203 −34 lines changed
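The recurring technique in this diff is moving imports of heavy, optional dependencies (transformer_engine, pippy, Megatron-LM, torch.distributed.checkpoint) from module scope into the functions that actually need them, so that a plain `import accelerate` stays fast. A toy sketch of the pattern, with the standard-library `argparse` standing in for a heavy dependency:

    # Before: the import cost is paid by everyone who imports this module.
    # import argparse

    def build_parser():
        # After: the cost is paid only on the first call, then cached in sys.modules.
        import argparse

        return argparse.ArgumentParser(description="toy example of a deferred import")

    if __name__ == "__main__":
        print(build_parser().description)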

.github/workflows/test_imports.yml

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+name: Run Import Tests
+
+on:
+  pull_request:
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "examples/**"
+      - "setup.py"
+    types: [opened, synchronize, reopened]
+
+env:
+  HF_HOME: ~/hf_cache
+  TESTING_MOCKED_DATALOADERS: "1"
+  IS_GITHUB_CI: "1"
+
+jobs:
+  run-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        pytorch-version: [
+          latest,
+          minimum,
+        ]
+    steps:
+      - uses: actions/[email protected]
+      - name: Set up python 3.8
+        uses: actions/setup-python@v3
+        with:
+          python-version: 3.8
+
+      - name: Install the library
+        run: |
+          pip install -e .
+          pip install pytest-reportlog tabulate setuptools git+https://github.com/muellerzr/import-timer
+
+      - name: Show installed libraries
+        run: |
+          pip freeze
+
+      - name: Run Import Tests
+        env:
+          PYTORCH_VERSION: ${{ matrix.pytorch-version }}
+        run: |
+          pytest -sv tests/test_imports.py
+
+      - name: Generate Report
+        if: always()
+        run: |
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

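The final "Generate Report" step redirects the stdout of utils/log_reports.py into `$GITHUB_STEP_SUMMARY`, the file GitHub Actions renders as Markdown on the run page. The same effect can be had from Python by writing to that file directly; a toy stand-in (not the actual log_reports.py):

    import os

    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")  # only set inside GitHub Actions
    if summary_path is not None:
        with open(summary_path, "a") as f:
            # Anything appended here is rendered as Markdown in the job summary.
            f.write("## Import timing results\n\nAll import tests passed.\n")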
src/accelerate/accelerator.py

Lines changed: 4 additions & 6 deletions
@@ -81,7 +81,6 @@
     has_transformer_engine_layers,
     is_bf16_available,
     is_deepspeed_available,
-    is_fp8_available,
     is_ipex_available,
     is_lomo_available,
     is_megatron_lm_available,
@@ -117,11 +116,6 @@
     DummyScheduler,
 )
 
-if is_fp8_available():
-    import transformer_engine.common.recipe as te_recipe
-    from transformer_engine.pytorch import fp8_autocast
-
-
 if is_megatron_lm_available():
     from .utils import (
         MegatronEngine,
@@ -1384,6 +1378,10 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
 
         # We prepare fp8 after, allowing for bf16 autocast to happen first
        if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
+            # Import here to keep base imports fast
+            import transformer_engine.common.recipe as te_recipe
+            from transformer_engine.pytorch import fp8_autocast
+
             if not has_transformer_engine_layers(model):
                 with torch.no_grad():
                     convert_model(model)

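This change removes the last module-level transformer_engine import, so having TE installed no longer slows down every `import accelerate`; the cost is only paid when `prepare_model()` takes the TE fp8 branch. A quick, hedged way to check locally (the exact module list depends on what else is installed in the environment):

    import sys

    import accelerate  # noqa: F401

    # With the deferred import, transformer_engine should only show up in sys.modules
    # once prepare_model() actually runs with a TE fp8 recipe.
    print("transformer_engine" in sys.modules)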
src/accelerate/inference.py

Lines changed: 4 additions & 5 deletions
@@ -28,11 +28,6 @@
 )
 
 
-if is_pippy_available():
-    from pippy.IR import Pipe, PipeSplitWrapper, annotate_split_points
-    from pippy.PipelineStage import PipelineStage
-
-
 def generate_device_map(model, num_processes: int = 1, no_split_module_classes=None, max_memory: dict = None):
     """
     Calculates the device map for `model` with an offset for PiPPy
@@ -83,6 +78,10 @@ def build_pipeline(model, split_points, args, kwargs, num_chunks):
     Users can pass in custom `num_chunks` as an optional hyper-parameter. By default will use
     `AcceleratorState.num_processes`
     """
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    from pippy.IR import Pipe, PipeSplitWrapper, annotate_split_points
+    from pippy.PipelineStage import PipelineStage
+
     # We need to annotate the split points in the model for PiPPy
     state = PartialState()
     annotate_split_points(model, {split_point: PipeSplitWrapper.SplitPoint.BEGINNING for split_point in split_points})

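One practical note on the function-local import used in `build_pipeline`: it only costs something on the first call, because Python caches the loaded module in `sys.modules`. A minimal, runnable illustration with a standard-library module standing in for pippy:

    import time

    def first_use():
        import decimal  # resolved from disk on the first call
        return decimal

    def later_use():
        import decimal  # already in sys.modules, so this is essentially a dict lookup
        return decimal

    t0 = time.perf_counter()
    first_use()
    t1 = time.perf_counter()
    later_use()
    t2 = time.perf_counter()
    print(f"first call: {(t1 - t0) * 1e6:.0f} us, cached call: {(t2 - t1) * 1e6:.0f} us")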
src/accelerate/test_utils/testing.py

Lines changed: 9 additions & 0 deletions
@@ -40,6 +40,7 @@
     is_datasets_available,
     is_deepspeed_available,
     is_dvclive_available,
+    is_import_timer_available,
     is_mlu_available,
     is_mps_available,
     is_npu_available,
@@ -377,6 +378,14 @@ def require_pippy(test_case):
     return unittest.skipUnless(is_pippy_available(), "test requires pippy")(test_case)
 
 
+def require_import_timer(test_case):
+    """
+    Decorator marking a test that requires tuna interpreter installed. These tests are skipped when tuna isn't
+    installed
+    """
+    return unittest.skipUnless(is_import_timer_available(), "test requires tuna interpreter")(test_case)
+
+
 _atleast_one_tracker_available = (
     any([is_wandb_available(), is_tensorboard_available()]) and not is_comet_ml_available()
 )

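`require_import_timer` follows the same `unittest.skipUnless` convention as the other `require_*` helpers in this file. A minimal usage sketch (the test class here is hypothetical, not part of the diff):

    import unittest

    from accelerate.test_utils.testing import require_import_timer

    @require_import_timer
    class HypotheticalTimingTest(unittest.TestCase):
        def test_something(self):
            # Runs only when the import_timer package is installed; skipped otherwise.
            self.assertTrue(True)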
src/accelerate/utils/__init__.py

Lines changed: 17 additions & 9 deletions
@@ -86,6 +86,7 @@
     is_deepspeed_available,
     is_dvclive_available,
     is_fp8_available,
+    is_import_timer_available,
     is_ipex_available,
     is_lomo_available,
     is_megatron_lm_available,
@@ -195,24 +196,31 @@
     prepare_simple_launcher_cmd_env,
     prepare_tpu,
 )
+
+# For docs
 from .megatron_lm import (
     AbstractTrainStep,
     BertTrainStep,
     GPTTrainStep,
-    MegatronEngine,
     MegatronLMDummyDataLoader,
     MegatronLMDummyScheduler,
-    MegatronLMOptimizerWrapper,
-    MegatronLMSchedulerWrapper,
     T5TrainStep,
     avg_losses_across_data_parallel_group,
-    gather_across_data_parallel_groups,
 )
-from .megatron_lm import initialize as megatron_lm_initialize
-from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
-from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
-from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
-from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
+
+
+if is_megatron_lm_available():
+    from .megatron_lm import (
+        MegatronEngine,
+        MegatronLMOptimizerWrapper,
+        MegatronLMSchedulerWrapper,
+        gather_across_data_parallel_groups,
+    )
+    from .megatron_lm import initialize as megatron_lm_initialize
+    from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
+    from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
+    from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
+    from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
 from .memory import find_executable_batch_size, release_memory
 from .other import (
     check_os_kernel,

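Because the Megatron-LM wrappers are now only re-exported when `is_megatron_lm_available()` is true, downstream code should guard its own imports rather than assuming the names exist in `accelerate.utils`. A minimal sketch:

    from accelerate.utils import is_megatron_lm_available

    if is_megatron_lm_available():
        # These names are only present in accelerate.utils when Megatron-LM is installed.
        from accelerate.utils import MegatronEngine, megatron_lm_initialize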
src/accelerate/utils/fsdp_utils.py

Lines changed: 29 additions & 14 deletions
@@ -18,24 +18,12 @@
 import torch
 
 from ..logging import get_logger
-from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_NAME
-from .imports import is_torch_distributed_available
+from .constants import FSDP_MODEL_NAME, OPTIMIZER_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_NAME
 from .modeling import is_peft_model
 from .other import save
 from .versions import is_torch_version
 
 
-if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available():
-    import torch.distributed.checkpoint as dist_cp
-    from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner, DefaultSavePlanner
-    from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
-    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
-    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
-    # `dist_cp_format_utils is only available from pt>=2.3.0
-    if is_torch_version(">=", "2.3.0") and is_torch_distributed_available():
-        import torch.distributed.checkpoint.format_utils as dist_cp_format_utils
-
-
 logger = get_logger(__name__)
 
 
@@ -58,8 +46,13 @@ def _set_model_state_dict(model, state_dict, adapter_only=False):
 
 
 def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
-    os.makedirs(output_dir, exist_ok=True)
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    import torch.distributed.checkpoint as dist_cp
+    from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
+    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
 
+    os.makedirs(output_dir, exist_ok=True)
     if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
         # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
         # so, only enable it when num_processes>1
@@ -103,6 +96,12 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
 
 
 def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    import torch.distributed.checkpoint as dist_cp
+    from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
+    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+
     accelerator.wait_for_everyone()
     if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
         # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
@@ -156,6 +155,12 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
 
 
 def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    import torch.distributed.checkpoint as dist_cp
+    from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
+    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+
     os.makedirs(output_dir, exist_ok=True)
     with FSDP.state_dict_type(
         model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
@@ -183,6 +188,12 @@ def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir,
 
 
 def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    import torch.distributed.checkpoint as dist_cp
+    from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
+    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+
     accelerator.wait_for_everyone()
     with FSDP.state_dict_type(
         model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
@@ -221,6 +232,10 @@ def _distributed_checkpoint_to_merged_weights(checkpoint_dir: str, save_path: st
 
     Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
     """
+    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
+    import torch.distributed.checkpoint as dist_cp
+    import torch.distributed.checkpoint.format_utils as dist_cp_format_utils
+
     state_dict = {}
     save_path = Path(save_path)
     save_path.mkdir(exist_ok=True)

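With these changes, `torch.distributed.checkpoint` and the FSDP wrappers are only imported when one of the save/load helpers is actually called. A quick, hedged spot check (the exact set of preloaded torch submodules varies by torch version and environment):

    import sys

    import accelerate  # noqa: F401

    # Expected to print an empty list after this commit, unless something else in the
    # environment pulls the checkpointing stack in at import time.
    print([name for name in sys.modules if name.startswith("torch.distributed.checkpoint")])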
src/accelerate/utils/imports.py

Lines changed: 4 additions & 0 deletions
@@ -81,6 +81,10 @@ def get_ccl_version():
     return importlib.metadata.version("oneccl_bind_pt")
 
 
+def is_import_timer_available():
+    return _is_package_available("import_timer")
+
+
 def is_pynvml_available():
     return _is_package_available("pynvml")

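`_is_package_available` is defined earlier in imports.py and is not part of this diff; a rough, assumed equivalent based on `importlib.util.find_spec` is shown here only for context (the real helper may also check distribution metadata):

    import importlib.util

    def _is_package_available_sketch(pkg_name: str) -> bool:
        # True if the package can be located on the current environment's path.
        return importlib.util.find_spec(pkg_name) is not None

    print(_is_package_available_sketch("import_timer"))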
tests/test_imports.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+
+from accelerate.test_utils.testing import TempDirTestCase, require_import_timer
+from accelerate.utils import is_import_timer_available
+
+
+if is_import_timer_available():
+    from import_timer import calculate_total_time, read_import_profile
+    from import_timer.core import get_paths_above_threshold, sort_nodes_by_total_time
+
+
+def convert_list_to_string(data):
+    end_result = ""
+    arrow_right = "->"
+    for path in data:
+        end_result += f"{arrow_right.join(path[0])} {path[1]:.3f}s\n"
+    return end_result
+
+
+def run_import_time(command: str):
+    output = subprocess.run(["python3", "-X", "importtime", "-c", command], capture_output=True, text=True)
+    return output.stderr
+
+
+@require_import_timer
+class ImportSpeedTester(TempDirTestCase):
+    """
+    Test suite which checks if imports have seen slowdowns
+    based on a particular baseline.
+
+    If the error messages are not clear enough to get a
+    full view of what is slowing things down (or to
+    figure out how deep the initial depth should be),
+    please view the profile with the `tuna` framework:
+    `tuna import.log`.
+    """
+
+    clear_on_setup = False
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        output = run_import_time("import torch")
+        data = read_import_profile(output)
+        total_time = calculate_total_time(data)
+        cls.pytorch_time = total_time
+
+    def test_base_import(self):
+        output = run_import_time("import accelerate")
+        data = read_import_profile(output)
+        total_time = calculate_total_time(data)
+        pct_more = total_time / self.pytorch_time
+        # Base import should never be more than 20% slower than raw torch import
+        err_msg = f"Base import is more than 20% slower than raw torch import ({pct_more * 100:.2f}%), please check the attached `tuna` profile:\n"
+        sorted_data = sort_nodes_by_total_time(data)
+        paths_above_threshold = get_paths_above_threshold(sorted_data, 0.1, max_depth=7)
+        err_msg += f"\n{convert_list_to_string(paths_above_threshold)}"
+        self.assertLess(pct_more, 1.2, err_msg)
+
+    def test_cli_import(self):
+        output = run_import_time("from accelerate.commands.launch import launch_command_parser")
+        data = read_import_profile(output)
+        total_time = calculate_total_time(data)
+        pct_more = total_time / self.pytorch_time
+        # CLI import should never be more than 20% slower than raw torch import
+        err_msg = f"CLI import is more than 20% slower than raw torch import ({pct_more * 100:.2f}%), please check the attached `tuna` profile:\n"
+        sorted_data = sort_nodes_by_total_time(data)
+        paths_above_threshold = get_paths_above_threshold(sorted_data, 0.1, max_depth=7)
+        err_msg += f"\n{convert_list_to_string(paths_above_threshold)}"
+        self.assertLess(pct_more, 1.2, err_msg)

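If one of these tests fails, the class docstring points at `tuna` for an interactive view of the profile. The profile it expects can be produced the same way `run_import_time` does it; a small helper (the file name `import.log` is just the convention from the docstring):

    import subprocess

    profile = subprocess.run(
        ["python3", "-X", "importtime", "-c", "import accelerate"],
        capture_output=True,
        text=True,
    ).stderr  # -X importtime writes its report to stderr

    with open("import.log", "w") as f:
        f.write(profile)

    # Then inspect interactively with: tuna import.log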