
Commit f9bf654

Reworked anchor sequence timeout mechanism to avoid unix signals, so it's compatible with the multi-threaded execution from karaoke-gen
1 parent: ee74a3a
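
Background for the change: CPython only allows signal handlers to be installed from the main thread of the main interpreter, and SIGALRM is Unix-only, so an alarm-based timeout cannot be armed from the worker threads karaoke-gen uses. A minimal standalone sketch of the failure mode (not code from this repo):

import signal
import threading

def arm_alarm():
    try:
        # CPython raises ValueError if signal.signal() is called anywhere
        # other than the main thread of the main interpreter.
        signal.signal(signal.SIGALRM, lambda signum, frame: None)
        signal.alarm(5)
    except ValueError as e:
        print(f"signal-based timeout unavailable in this thread: {e}")

t = threading.Thread(target=arm_alarm)  # works on a Unix main thread, fails here
t.start()
t.join()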

File tree

3 files changed: +65, -66 lines


lyrics_transcriber/correction/anchor_sequence.py

Lines changed: 47 additions & 53 deletions
@@ -1,15 +1,15 @@
+import threading
+import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 import logging
 from tqdm import tqdm
 from multiprocessing import Pool, cpu_count
 from functools import partial
-import time
 from pathlib import Path
 import json
 import hashlib
-import signal
 
-from lyrics_transcriber.types import LyricsData, PhraseScore, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
+from lyrics_transcriber.types import LyricsData, PhraseScore, PhraseType, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
 from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
 from lyrics_transcriber.correction.text_utils import clean_text
 from lyrics_transcriber.utils.word_utils import WordUtils
@@ -45,7 +45,14 @@ def __init__(
         # Initialize cache directory
         self.cache_dir = Path(cache_dir)
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        self.logger.debug(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}, timeout: {timeout_seconds}s")
+        self.logger.info(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}, timeout: {timeout_seconds}s")
+
+    def _check_timeout(self, start_time: float, operation_name: str = "operation"):
+        """Check if timeout has occurred and raise exception if so."""
+        if self.timeout_seconds > 0:
+            elapsed_time = time.time() - start_time
+            if elapsed_time > self.timeout_seconds:
+                raise AnchorSequenceTimeoutError(f"{operation_name} exceeded {self.timeout_seconds} seconds (elapsed: {elapsed_time:.1f}s)")
 
     def _clean_text(self, text: str) -> str:
         """Clean text by removing punctuation and normalizing whitespace."""
@@ -177,10 +184,6 @@ def _load_from_cache(self, cache_path: Path) -> Optional[List[ScoredAnchor]]:
             self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
             return None
 
-    def _timeout_handler(self, signum, frame):
-        """Handle timeout signal by raising AnchorSequenceTimeoutError."""
-        raise AnchorSequenceTimeoutError(f"Anchor sequence computation exceeded {self.timeout_seconds} seconds")
-
     def _process_ngram_length(
         self,
         n: int,
@@ -286,11 +289,6 @@ def find_anchors(
         """Find anchor sequences that appear in both transcription and references with timeout protection."""
         start_time = time.time()
 
-        # Set up timeout signal handler
-        if self.timeout_seconds > 0:
-            old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
-            signal.alarm(self.timeout_seconds)
-
         try:
             cache_key = self._get_cache_key(transcribed, references, transcription_result)
             cache_path = self.cache_dir / f"anchors_{cache_key}.json"
@@ -316,6 +314,9 @@ def find_anchors(
             self.logger.info(f"Cache miss for key {cache_key} - computing anchors with timeout {self.timeout_seconds}s")
             self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
 
+            # Check timeout before starting computation
+            self._check_timeout(start_time, "anchor computation initialization")
+
             # Get all words from transcription
             all_words = []
             for segment in transcription_result.result.segments:
@@ -329,6 +330,9 @@ def find_anchors(
             }
             ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
 
+            # Check timeout after preprocessing
+            self._check_timeout(start_time, "anchor computation preprocessing")
+
             # Filter out very short reference sources for n-gram length calculation
             valid_ref_lengths = [
                 len(words) for words in ref_texts_clean.values()
@@ -355,7 +359,10 @@ def find_anchors(
 
             # Process n-gram lengths in parallel with timeout
             candidate_anchors = []
-            pool_timeout = max(60, self.timeout_seconds // 2)  # Use half the total timeout for pool operations
+            pool_timeout = max(60, self.timeout_seconds // 2) if self.timeout_seconds > 0 else 300  # Use half the total timeout for pool operations
+
+            # Check timeout before parallel processing
+            self._check_timeout(start_time, "parallel processing start")
 
             try:
                 with Pool(processes=max(cpu_count() - 1, 1)) as pool:
@@ -368,65 +375,55 @@ def find_anchors(
                     # Collect results with individual timeouts
                     for i, async_result in enumerate(async_results):
                         try:
-                            # Check remaining time
+                            # Check timeout before each result collection
+                            self._check_timeout(start_time, f"collecting n-gram {n_gram_lengths[i]} results")
+
+                            # Check remaining time for pool timeout
                             elapsed_time = time.time() - start_time
-                            remaining_time = max(10, self.timeout_seconds - elapsed_time)
+                            remaining_time = max(10, self.timeout_seconds - elapsed_time) if self.timeout_seconds > 0 else pool_timeout
 
                             result = async_result.get(timeout=min(pool_timeout, remaining_time))
                             results.append(result)
 
                             self.logger.debug(f"Completed n-gram length {n_gram_lengths[i]} ({i+1}/{len(n_gram_lengths)})")
 
+                        except AnchorSequenceTimeoutError:
+                            # Re-raise timeout errors
+                            raise
                         except Exception as e:
                             self.logger.warning(f"n-gram length {n_gram_lengths[i]} failed or timed out: {str(e)}")
                             results.append([])  # Add empty result to maintain order
 
                 for anchors in results:
                     candidate_anchors.extend(anchors)
 
+            except AnchorSequenceTimeoutError:
+                # Re-raise timeout errors
+                raise
             except Exception as e:
                 self.logger.error(f"Parallel processing failed: {str(e)}")
-                # Fall back to sequential processing with strict timeout
+                # Fall back to sequential processing with timeout checks
                 self.logger.info("Falling back to sequential processing")
                 for n in n_gram_lengths:
-                    elapsed_time = time.time() - start_time
-                    if elapsed_time >= self.timeout_seconds * 0.8:  # Use 80% of timeout
-                        self.logger.warning(f"Stopping sequential processing due to timeout after {elapsed_time:.1f}s")
-                        break
-
                     try:
+                        # Check timeout before each n-gram length
+                        self._check_timeout(start_time, f"sequential processing n-gram {n}")
+
                         anchors = self._process_ngram_length(
                             n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
                         )
                         candidate_anchors.extend(anchors)
+                    except AnchorSequenceTimeoutError:
+                        # Re-raise timeout errors
+                        raise
                     except Exception as e:
                         self.logger.warning(f"Sequential processing failed for n-gram length {n}: {str(e)}")
                         continue
 
             self.logger.info(f"Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
 
             # Check timeout before expensive filtering operation
-            elapsed_time = time.time() - start_time
-            if elapsed_time >= self.timeout_seconds * 0.9:  # Use 90% of timeout
-                self.logger.warning(f"Skipping overlap filtering due to timeout ({elapsed_time:.1f}s elapsed)")
-                # Return basic scored anchors without filtering
-                basic_scored = []
-                for anchor in candidate_anchors[:100]:  # Limit to first 100 anchors
-                    try:
-                        phrase_score = PhraseScore(
-                            total_score=1.0,
-                            natural_break_score=1.0,
-                            phrase_type=PhraseType.COMPLETE
-                        )
-                        basic_scored.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
-                    except:
-                        continue
-
-                # Save basic results to cache
-                if basic_scored:
-                    self._save_to_cache(cache_path, basic_scored)
-
-                return basic_scored
+            self._check_timeout(start_time, "overlap filtering start")
 
             filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
 
@@ -445,10 +442,8 @@ def find_anchors(
             self.logger.error(f"Anchor sequence computation failed: {str(e)}")
             raise
         finally:
-            # Clean up timeout signal
-            if self.timeout_seconds > 0:
-                signal.alarm(0)
-                signal.signal(signal.SIGALRM, old_handler)
+            # No cleanup needed for time-based timeout checks
+            pass
 
     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
         """Score a sequence based on its phrase quality"""
@@ -625,15 +620,14 @@ def _remove_overlapping_sequences(
 
         self.logger.info(f"Filtering {len(scored_anchors)} overlapping sequences")
         filtered_scored = []
-        max_filter_time = 60  # Maximum 1 minute for filtering
-        filter_start = time.time()
 
        for i, scored_anchor in enumerate(scored_anchors):
-            # Check timeout every 100 anchors
+            # Check timeout every 100 anchors using our timeout mechanism
            if i % 100 == 0:
-                elapsed = time.time() - filter_start
-                if elapsed > max_filter_time:
-                    self.logger.warning(f"Filtering timed out after {elapsed:.1f}s, returning {len(filtered_scored)} anchors")
+                try:
+                    self._check_timeout(start_time, f"filtering anchors (processed {i}/{len(scored_anchors)})")
+                except AnchorSequenceTimeoutError:
+                    self.logger.warning(f"Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
                     break
 
            overlaps = False
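
The new mechanism is a cooperative deadline: find_anchors records start_time once, and _check_timeout() is called between phases, raising AnchorSequenceTimeoutError once the budget is spent. Because it is just arithmetic on time.time(), it behaves the same from any thread. A simplified, standalone sketch of the pattern (class and names here are illustrative, not this library's API):

import time

class DeadlineExceeded(Exception):
    """Illustrative stand-in for AnchorSequenceTimeoutError."""

class DeadlineChecker:
    def __init__(self, timeout_seconds: float):
        self.timeout_seconds = timeout_seconds
        self.start_time = time.time()

    def check(self, operation_name: str = "operation") -> None:
        # timeout_seconds <= 0 disables checking, mirroring the diff above
        if self.timeout_seconds > 0:
            elapsed = time.time() - self.start_time
            if elapsed > self.timeout_seconds:
                raise DeadlineExceeded(f"{operation_name} exceeded {self.timeout_seconds}s (elapsed: {elapsed:.1f}s)")

checker = DeadlineChecker(timeout_seconds=5.0)
for phase in ("preprocess", "ngram search", "overlap filtering"):
    checker.check(phase)  # cheap, thread-safe, no signal state to manage
    ...  # do the work for this phase

The trade-off is that a cooperative check only fires when control reaches it, so a single long-running call between checks can overrun the budget; that is why the parallel path above still passes a hard timeout= to async_result.get() as a backstop.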

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.64.0"
+version = "0.65.0"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <[email protected]>"]
 license = "MIT"

tests/unit/correction/test_anchor_sequence_2.py

Lines changed: 17 additions & 12 deletions
@@ -194,15 +194,20 @@ def test_basic_scoring_fallback(setup_teardown):
     transcription_result = create_test_transcription_result_from_text(transcribed)
     lyrics_data_references = convert_references_to_lyrics_data(references)
 
-    # Should complete with basic scoring if needed
-    anchors = finder.find_anchors(transcribed, lyrics_data_references, transcription_result)
-
-    assert isinstance(anchors, list)
-    # Should have basic scores even if timeout occurred
-    for anchor in anchors:
-        assert isinstance(anchor, ScoredAnchor)
-        assert isinstance(anchor.phrase_score, PhraseScore)
-        assert anchor.phrase_score.total_score > 0
+    # Should either complete with basic scoring or timeout gracefully
+    try:
+        anchors = finder.find_anchors(transcribed, lyrics_data_references, transcription_result)
+
+        assert isinstance(anchors, list)
+        # Should have basic scores even if timeout occurred
+        for anchor in anchors:
+            assert isinstance(anchor, ScoredAnchor)
+            assert isinstance(anchor.phrase_score, PhraseScore)
+            assert anchor.phrase_score.total_score > 0
+    except AnchorSequenceTimeoutError as e:
+        # Timeout is acceptable behavior for this test with very short timeout
+        assert "exceeded 3 seconds" in str(e)
+        print(f"Expected timeout occurred: {e}")
 
 
 def test_timeout_disabled_functionality(setup_teardown):
@@ -293,9 +298,9 @@ def test_multiprocessing_timeout_handling(setup_teardown):
     assert elapsed_time <= 15  # Should timeout within reasonable time of setting
 
 
-def test_concurrent_signal_handling(setup_teardown):
-    """Test that signal handling works correctly with concurrent operations."""
-    # This test ensures the timeout signal handling doesn't interfere with normal operations
+def test_concurrent_timeout_handling(setup_teardown):
+    """Test that threading-based timeout handling works correctly with concurrent operations."""
+    # This test ensures the timeout mechanism doesn't interfere with normal operations
     finder = AnchorSequenceFinder(
         min_sequence_length=2,
         min_sources=1,
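
For callers, the practical difference is that a timeout now surfaces as an ordinary exception that can be caught from any thread, with no signal handler to restore afterwards. A usage sketch, assuming AnchorSequenceTimeoutError is importable from the same module and that the constructor arguments mirror the tests above (input objects are placeholders):

from lyrics_transcriber.correction.anchor_sequence import (
    AnchorSequenceFinder,
    AnchorSequenceTimeoutError,
)

finder = AnchorSequenceFinder(
    min_sequence_length=2,
    min_sources=1,
    timeout_seconds=30,  # 0 disables the cooperative timeout checks
    cache_dir="/tmp/anchor_cache",  # placeholder path
)

try:
    # transcribed, references, and transcription_result prepared elsewhere
    anchors = finder.find_anchors(transcribed, references, transcription_result)
except AnchorSequenceTimeoutError:
    anchors = []  # degrade gracefully; safe to catch in a worker thread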
