feat: _TrainingScriptPythonPackager to support folders (#812)

ivanmkc · web-flow · commit 3aec6a7b8f26 · 2022-01-13T12:40:50.000-05:00
* _TrainingScriptPythonPackager to support folders

Allow _TrainingScriptPythonPackager to support a folder in addition to a single script.

* Update source_utils.py

* Fixed tests

* More fixes to tests

* Added missing imports

* Removed unused import
diff --git a/google/cloud/aiplatform/utils/source_utils.py b/google/cloud/aiplatform/utils/source_utils.py
@@ -16,6 +16,7 @@
 
 
 import functools
+import os
 import pathlib
 import shutil
 import subprocess
@@ -62,7 +63,7 @@ class _TrainingScriptPythonPackager:
             Constant command to generate the source distribution package.
 
     Attributes:
-        script_path: local path of script to package
+        script_path: local path of script or folder to package
         requirements: list of Python dependencies to add to package
 
     Usage:
@@ -79,7 +80,6 @@ class _TrainingScriptPythonPackager:
 
     _TRAINER_FOLDER = "trainer"
     _ROOT_MODULE = "aiplatform_custom_trainer_script"
-    _TASK_MODULE_NAME = "task"
     _SETUP_PY_VERSION = "0.1"
 
     _SETUP_PY_TEMPLATE = """from setuptools import find_packages
@@ -96,10 +96,12 @@ class _TrainingScriptPythonPackager:
 
     _SETUP_PY_SOURCE_DISTRIBUTION_CMD = "setup.py sdist --formats=gztar"
 
-    # Module name that can be executed during training. ie. python -m
-    module_name = f"{_ROOT_MODULE}.{_TASK_MODULE_NAME}"
-
-    def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = None):
+    def __init__(
+        self,
+        script_path: str,
+        task_module_name: str = "task",
+        requirements: Optional[Sequence[str]] = None,
+    ):
         """Initializes packager.
 
         Args:
@@ -109,8 +111,14 @@ def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = Non
         """
 
         self.script_path = script_path
+        self.task_module_name = task_module_name
         self.requirements = requirements or []
 
+    @property
+    def module_name(self) -> str:
+        # Module name that can be executed during training, i.e. via `python -m`.
+        return f"{self._ROOT_MODULE}.{self.task_module_name}"
+
     def make_package(self, package_directory: str) -> str:
         """Converts script into a Python package suitable for python module
         execution.
@@ -134,9 +142,6 @@ def make_package(self, package_directory: str) -> str:
         # __init__.py path in root module
         init_path = trainer_path / "__init__.py"
 
-        # The module that will contain the script
-        script_out_path = trainer_path / f"{self._TASK_MODULE_NAME}.py"
-
         # The path to setup.py in the package.
         setup_py_path = trainer_root_path / "setup.py"
 
@@ -165,8 +170,14 @@ def make_package(self, package_directory: str) -> str:
         with setup_py_path.open("w") as fp:
             fp.write(setup_py_output)
 
-        # Copy script as module of python package.
-        shutil.copy(self.script_path, script_out_path)
+        if os.path.isdir(self.script_path):
+            shutil.copytree(self.script_path, trainer_path, dirs_exist_ok=True)
+        else:
+            # The module that will contain the script
+            script_out_path = trainer_path / f"{self.task_module_name}.py"
+
+            # Copy script as module of python package.
+            shutil.copy(self.script_path, script_out_path)
 
         # Run setup.py to create the source distribution.
         setup_cmd = [
diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py
@@ -24,7 +24,6 @@
 from google.cloud.aiplatform import initializer
 from google.cloud.aiplatform import models
 from google.cloud.aiplatform import schema
-from google.cloud.aiplatform.utils import source_utils
 
 from google.cloud.aiplatform_v1.types import (
     dataset as gca_dataset,
@@ -224,7 +223,7 @@ def test_dataset_create_to_model_predict(
             },
             "python_package_spec": {
                 "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": test_training_jobs._TEST_MODULE_NAME,
                 "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
             },
@@ -411,7 +410,7 @@ def test_dataset_create_to_model_predict_with_pipeline_fail(
             },
             "python_package_spec": {
                 "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": test_training_jobs._TEST_MODULE_NAME,
                 "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
             },
diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py
@@ -31,6 +31,8 @@
 from unittest import mock
 from unittest.mock import patch
 
+import test_training_jobs
+
 from google.auth import credentials as auth_credentials
 
 from google.cloud import aiplatform
@@ -89,6 +91,7 @@
 _TEST_SERVING_CONTAINER_IMAGE = "gcr.io/test-serving/container:image"
 _TEST_SERVING_CONTAINER_PREDICTION_ROUTE = "predict"
 _TEST_SERVING_CONTAINER_HEALTH_ROUTE = "metadata"
+_TEST_MODULE_NAME = f"{source_utils._TrainingScriptPythonPackager._ROOT_MODULE}.task"
 
 _TEST_METADATA_SCHEMA_URI_NONTABULAR = schema.dataset.metadata.image
 _TEST_ANNOTATION_SCHEMA_URI = schema.dataset.annotation.image.classification
@@ -827,7 +830,7 @@ def test_run_call_pipeline_service_create_with_tabular_dataset(
             },
             "python_package_spec": {
                 "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": _TEST_MODULE_NAME,
                 "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
                 "env": true_env,
@@ -995,7 +998,7 @@ def test_run_call_pipeline_service_create_with_bigquery_destination(
             },
             "python_package_spec": {
                 "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": test_training_jobs._TEST_MODULE_NAME,
                 "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
                 "env": true_env,
@@ -1303,7 +1306,7 @@ def test_run_call_pipeline_service_create_with_no_dataset(
             },
             "python_package_spec": {
                 "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": test_training_jobs._TEST_MODULE_NAME,
                 "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
                 "env": true_env,
@@ -1606,7 +1609,7 @@ def test_run_call_pipeline_service_create_distributed_training(
                 },
                 "python_package_spec": {
                     "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                    "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                    "python_module": test_training_jobs._TEST_MODULE_NAME,
                     "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                     "args": true_args,
                     "env": true_env,
@@ -1625,7 +1628,7 @@ def test_run_call_pipeline_service_create_distributed_training(
                 },
                 "python_package_spec": {
                     "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                    "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                    "python_module": _TEST_MODULE_NAME,
                     "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                     "args": true_args,
                     "env": true_env,
@@ -1756,7 +1759,7 @@ def test_run_call_pipeline_service_create_distributed_training_with_reduction_se
                 },
                 "python_package_spec": {
                     "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                    "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                    "python_module": _TEST_MODULE_NAME,
                     "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                     "args": true_args,
                     "env": true_env,
@@ -1775,7 +1778,7 @@ def test_run_call_pipeline_service_create_distributed_training_with_reduction_se
                 },
                 "python_package_spec": {
                     "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                    "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                    "python_module": test_training_jobs._TEST_MODULE_NAME,
                     "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                     "args": true_args,
                     "env": true_env,
@@ -2013,7 +2016,7 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_without_model_
             },
             "python_package_spec": {
                 "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
-                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
+                "python_module": test_training_jobs._TEST_MODULE_NAME,
                 "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                 "args": true_args,
             },