NVIDIA · gerashegalov · Apr 4, 2025 · Apr 1, 2025 · Apr 2, 2025 · Apr 2, 2025
diff --git a/integration_tests/src/main/java/org/apache/spark/rapids/tests/TimeoutSparkListener.java b/integration_tests/src/main/java/org/apache/spark/rapids/tests/TimeoutSparkListener.java
@@ -51,72 +51,86 @@
  */
 public class TimeoutSparkListener extends SparkListener {
   private static final Logger LOG = LoggerFactory.getLogger(TimeoutSparkListener.class);
-  private final JavaSparkContext sparkContext;
-  private final int timeoutSeconds;
-  private final boolean shouldDumpThreads;
-  private final ScheduledExecutorService runner = Executors.newScheduledThreadPool(1,
+  private static final ScheduledExecutorService runner = Executors.newScheduledThreadPool(1,
     runnable -> {
       final Thread t = new Thread(runnable);
       t.setDaemon(true);
       t.setName("spark-job-timeout-thread-" + t.hashCode());
       return t;
     }
   );
-  private final Map<Integer,ScheduledFuture<?>> cancelJobMap = new ConcurrentHashMap<>();
 
-  boolean registered;
+  private static final Map<Integer,ScheduledFuture<?>> cancelJobMap = new ConcurrentHashMap<>();
+  private static int timeoutSeconds;
+  private static boolean shouldDumpThreads;
+  private static JavaSparkContext sparkContext;
+  private static final TimeoutSparkListener SINGLETON = new TimeoutSparkListener();
 
-  public TimeoutSparkListener(JavaSparkContext sparkContext,
-    int timeoutSeconds,
-    boolean shouldDumpThreads) {
+
+  public TimeoutSparkListener() {
     super();
-    this.sparkContext = sparkContext;
-    this.timeoutSeconds = timeoutSeconds;
-    this.shouldDumpThreads = shouldDumpThreads;
   }
 
-  public synchronized void register() {
-    if (!registered) {
-      LOG.debug("Adding TimeoutSparkListener to kill hung jobs");
-      sparkContext.sc().addSparkListener(this);
-      registered = true;
+  public static synchronized void init(JavaSparkContext sc) {
+    if (sparkContext == null) {
+      sparkContext = sc;
+      sparkContext.sc().addSparkListener(SINGLETON);
     }
   }
 
-  public synchronized void unregister() {
-    if (registered) {
-      sparkContext.sc().removeSparkListener(this);
-      registered = false;
+  private static synchronized void unregister() {
+    if (sparkContext != null) {
+      sparkContext.sc().removeSparkListener(SINGLETON);
+      sparkContext = null;
+    }
+  }
+
+  private static synchronized void cancelJob(int jobId, String message) {
+    if (sparkContext != null) {
+      sparkContext.sc().cancelJob(jobId, message);
     }
   }
 
   public void onJobStart(SparkListenerJobStart jobStart) {
     final int jobId = jobStart.jobId();
     LOG.debug("JobStart: registering timeout for Job {}", jobId);
+    // snapshot the config now: the static fields may be reassigned before the timeout task runs
+    final boolean taskShouldDumpThreads = shouldDumpThreads;
+    final int taskTimeout = timeoutSeconds;
     final ScheduledFuture<?> scheduledFuture = runner.schedule(() -> {
       final String message = "RAPIDS Integration Test Job " + jobId + " exceeded the timeout of " +
-        timeoutSeconds + " seconds, cancelling. " +
+        taskTimeout + " seconds, cancelling. " +
         "Look into fixing the test or reducing its execution time. " +
         "If necessary, adjust the timeout using the marker " +
         "pytest.mark.spark_job_timeout(seconds,dump_threads)";
-      if (shouldDumpThreads) {
+      if (taskShouldDumpThreads) {
         LOG.error(message + " Driver thread dump follows");
         dumpThreads();
       }
-      sparkContext.sc().cancelJob(jobId, message);
-    }, timeoutSeconds, TimeUnit.SECONDS);
+      cancelJob(jobId, message);
+    }, taskTimeout, TimeUnit.SECONDS);
     cancelJobMap.put(jobId, scheduledFuture);
   }
 
   public void onJobEnd(SparkListenerJobEnd jobEnd) {
     final int jobId = jobEnd.jobId();
     LOG.debug("JobEnd: cancelling timeout for Job {}", jobId);
     final ScheduledFuture<?> cancelFuture = cancelJobMap.remove(jobId);
-    cancelFuture.cancel(false);
+    if (cancelFuture != null) {
+      cancelFuture.cancel(false);
+    }
+  }
+
+  public static void setSparkJobTimeout(int ts, boolean dumpThreads) {
+    timeoutSeconds = ts; // delay, in seconds, before a started job is cancelled
+    shouldDumpThreads = dumpThreads; // whether a timeout also logs a driver thread dump
   }
 
   public void onApplicationEnd(SparkListenerApplicationEnd applicationEnd) {
+    unregister();
+    // stop the timer thread and drop any timeout tasks that have not fired yet
     runner.shutdownNow();
+    cancelJobMap.clear();
   }
 
   private static void dumpThreads() {

diff --git a/integration_tests/src/main/python/spark_init_internal.py b/integration_tests/src/main/python/spark_init_internal.py
@@ -149,6 +149,13 @@ def pytest_sessionstart(session):
     # make it a better error message
     _s.sparkContext.setLogLevel("WARN")
     java_import(_s._jvm, 'org.apache.spark.rapids.tests.TimeoutSparkListener')
+    # TODO lower the default timeout once all long-running tests are identified
+    # and marked with their own pytest.mark.spark_job_timeout overrides
+    global default_timeout_seconds
+    global default_dump_threads
+    default_timeout_seconds = 60 * 60
+    default_dump_threads = True
+    _s._jvm.org.apache.spark.rapids.tests.TimeoutSparkListener.init(_s._jsc)
     global _spark
     _spark = _s
 
@@ -180,12 +187,12 @@ def _get_driver_opts_for_worker_logs(_sb, wid):
         ' -Dlogfile={}'.format(log_file)
 
     # Set up Logging to the WORKERID_worker_logs
-    # Note: This logger is only used for logging the test name in method `log_test_name`. 
+    # Note: This logger is only used for logging the test name in method `log_test_name`.
     global logger
     logger.setLevel(logging.INFO)
     # Create file handler to output logs into corresponding worker log file
-    # This file_handler is modifying the worker_log file that the plugin will also write to 
-    # The reason for doing this is to get all test logs in one place from where we can do other analysis 
+    # This file_handler is modifying the worker_log file that the plugin will also write to
+    # The reason for doing this is to get all test logs in one place from where we can do other analysis
     # that might be needed in future to look at the execs that were used in our integration tests
     file_handler = logging.FileHandler(log_file)
     # Set the formatter for the file handler, we match the formatter from the basicConfig for consistency in logs
@@ -259,28 +266,21 @@ def log_test_name(request):
 
 @pytest.fixture(scope="function", autouse=True)
 def set_spark_job_timeout(request):
-    # TODO dial down after identifying all long tests
-    # and set exceptions there
-    default_timeout_seconds = 60 * 60
     logger.debug("set_spark_job_timeout: BEFORE TEST\n")
     tm = request.node.get_closest_marker("spark_job_timeout")
     if tm:
         spark_timeout = tm.kwargs.get('seconds', default_timeout_seconds)
-        dump_threads = tm.kwargs.get('dump_threads', True)
+        dump_threads = tm.kwargs.get('dump_threads', default_dump_threads)
     else:
         spark_timeout = default_timeout_seconds
-        dump_threads = True
+        dump_threads = default_dump_threads
     # before the test
-    hung_job_listener = (
-      _spark._jvm.org.apache.spark.rapids.tests.TimeoutSparkListener(
-          _spark._jsc, 
-          spark_timeout, 
-          dump_threads)
-    ) 
-    hung_job_listener.register()
+    _spark._jvm.org.apache.spark.rapids.tests.TimeoutSparkListener.setSparkJobTimeout(
+        spark_timeout,
+        dump_threads
+    )
     # yield for test
-    yield 
+    yield
     # after the test
-    logger.debug("set_spark_job_timeout: AFTER TEST\n")
-    hung_job_listener.unregister()
+