Skip to content

Commit 012d430

Browse files
committed
Implemented RapidAPI alternative to LyricsGenius for lyrics fetch
1 parent 966cb1d commit 012d430

File tree

10 files changed

+874
-8
lines changed

10 files changed

+874
-8
lines changed

lyrics_transcriber/cli/cli_main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def get_config_from_env() -> Dict[str, str]:
114114
return {
115115
"audioshake_api_token": os.getenv("AUDIOSHAKE_API_TOKEN"),
116116
"genius_api_token": os.getenv("GENIUS_API_TOKEN"),
117+
"rapidapi_key": os.getenv("RAPIDAPI_KEY"),
117118
"spotify_cookie": os.getenv("SPOTIFY_COOKIE_SP_DC"),
118119
"runpod_api_key": os.getenv("RUNPOD_API_KEY"),
119120
"whisper_runpod_id": os.getenv("WHISPER_RUNPOD_ID"),
@@ -145,6 +146,7 @@ def create_configs(args: argparse.Namespace, env_config: Dict[str, str]) -> tupl
145146

146147
lyrics_config = LyricsConfig(
147148
genius_api_token=args.genius_api_token or env_config.get("genius_api_token"),
149+
rapidapi_key=env_config.get("rapidapi_key"),
148150
spotify_cookie=args.spotify_cookie or env_config.get("spotify_cookie"),
149151
lyrics_file=args.lyrics_file,
150152
)

lyrics_transcriber/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class LyricsConfig:
1717
"""Configuration for lyrics services."""
1818

1919
genius_api_token: Optional[str] = None
20+
rapidapi_key: Optional[str] = None
2021
spotify_cookie: Optional[str] = None
2122
lyrics_file: Optional[str] = None
2223

lyrics_transcriber/core/controller.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def _initialize_lyrics_providers(self) -> Dict[str, BaseLyricsProvider]:
191191
# Create provider config with all necessary parameters
192192
provider_config = LyricsProviderConfig(
193193
genius_api_token=self.lyrics_config.genius_api_token,
194+
rapidapi_key=self.lyrics_config.rapidapi_key,
194195
spotify_cookie=self.lyrics_config.spotify_cookie,
195196
lyrics_file=self.lyrics_config.lyrics_file,
196197
cache_dir=self.output_config.cache_dir,

lyrics_transcriber/lyrics/base_lyrics_provider.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class LyricsProviderConfig:
1616
"""Configuration for lyrics providers."""
1717

1818
genius_api_token: Optional[str] = None
19+
rapidapi_key: Optional[str] = None
1920
spotify_cookie: Optional[str] = None
2021
lyrics_file: Optional[str] = None
2122
cache_dir: Optional[str] = None

lyrics_transcriber/lyrics/genius.py

Lines changed: 255 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
import re
33
from typing import Optional, Dict, Any
4+
import requests
45
import lyricsgenius
56
from lyrics_transcriber.types import LyricsData, LyricsMetadata
67
from lyrics_transcriber.lyrics.base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig
@@ -12,6 +13,7 @@ class GeniusProvider(BaseLyricsProvider):
1213
def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
1314
super().__init__(config, logger)
1415
self.api_token = config.genius_api_token
16+
self.rapidapi_key = config.rapidapi_key
1517
self.client = None
1618
if self.api_token:
1719
self.client = lyricsgenius.Genius(
@@ -25,9 +27,17 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
2527
)
2628

2729
def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
28-
"""Fetch raw song data from Genius API."""
30+
"""Fetch raw song data from Genius API or RapidAPI."""
31+
# Try RapidAPI first if available
32+
if self.rapidapi_key:
33+
self.logger.info(f"Trying RapidAPI for {artist} - {title}")
34+
result = self._fetch_from_rapidapi(artist, title)
35+
if result:
36+
return result
37+
38+
# Fall back to direct Genius API
2939
if not self.client:
30-
self.logger.warning("No Genius API token provided")
40+
self.logger.warning("No Genius API token provided and RapidAPI failed")
3141
return None
3242

3343
self.logger.info(f"Searching Genius for {artist} - {title}")
@@ -40,8 +50,186 @@ def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str,
4050
self.logger.error(f"Error fetching from Genius: {str(e)}")
4151
return None
4252

53+
def _fetch_from_rapidapi(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
    """Fetch song data using RapidAPI.

    Performs two GET requests against the genius-song-lyrics1 RapidAPI host:
    a search for "<artist> <title>", then a lyrics lookup for the first
    search hit that carries a song ID.

    Args:
        artist: Artist name used in the search query.
        title: Song title used in the search query.

    Returns:
        A dict with the keys "title", "primary_artist", "lyrics", "id",
        "url", "release_date_for_display" and "_rapidapi_source" (always
        True), or None when the search has no usable hit, no lyrics can be
        extracted, or a request/parsing error occurs. Errors are logged and
        never raised to the caller.
    """
    api_host = "genius-song-lyrics1.p.rapidapi.com"
    headers = {
        "x-rapidapi-key": self.rapidapi_key,
        "x-rapidapi-host": api_host,
    }

    try:
        # Step 1: Search for the song
        search_url = f"https://{api_host}/search/"
        search_params = {
            "q": f"{artist} {title}",
            "per_page": "10",
            "page": "1",
        }

        self.logger.debug(f"Making RapidAPI search request for '{artist} {title}'")
        search_response = requests.get(search_url, headers=headers, params=search_params, timeout=10)
        search_response.raise_for_status()

        search_data = search_response.json()

        hits = search_data.get("hits") if isinstance(search_data, dict) else None
        if not hits:
            self.logger.warning("No search results from RapidAPI")
            return None

        # Take the first hit that has a song ID. Malformed hits (missing or
        # null "result") are skipped instead of aborting the whole lookup.
        # NOTE(review): hits are not checked against artist/title; this
        # relies on the API's own ranking.
        best_match = None
        for hit in hits:
            result = hit.get("result") if isinstance(hit, dict) else None
            if isinstance(result, dict) and result.get("id"):
                best_match = result
                break

        if not best_match:
            self.logger.warning("No valid song ID found in RapidAPI search results")
            return None

        song_id = best_match["id"]
        self.logger.debug(f"Found song ID: {song_id}")

        # Step 2: Fetch lyrics using the song ID
        lyrics_url = f"https://{api_host}/song/lyrics/"
        lyrics_params = {"id": str(song_id)}

        self.logger.debug(f"Making RapidAPI lyrics request for song ID {song_id}")
        lyrics_response = requests.get(lyrics_url, headers=headers, params=lyrics_params, timeout=10)
        lyrics_response.raise_for_status()

        lyrics_data = lyrics_response.json()

        # Extract lyrics from the nested response structure
        lyrics_text = self._extract_lyrics_from_rapidapi_response(lyrics_data)
        if not lyrics_text:
            self.logger.warning("No lyrics found in RapidAPI response")
            return None

        # Build a clean RapidAPI-only structure rather than forwarding the
        # raw search metadata. "primary_artist" is normalised to a dict so
        # the converter can call .get("name") on it even when the API
        # returned null.
        rapidapi_response = {
            "title": best_match.get("title", ""),
            "primary_artist": best_match.get("primary_artist") or {},
            "lyrics": lyrics_text,
            "id": song_id,
            "url": best_match.get("url", ""),
            "release_date_for_display": best_match.get("release_date_for_display", ""),
            # Mark this as RapidAPI source
            "_rapidapi_source": True,
        }

        self.logger.info("Successfully fetched lyrics from RapidAPI")
        return rapidapi_response

    except requests.exceptions.RequestException as e:
        # Network, timeout and HTTP status errors.
        self.logger.error(f"RapidAPI request failed: {str(e)}")
        return None
    except Exception as e:
        # Best-effort boundary: invalid JSON or unexpected payload shapes
        # must not break the fallback to the direct Genius API.
        self.logger.error(f"Error fetching from RapidAPI: {str(e)}")
        return None
132+
133+
def _extract_lyrics_from_rapidapi_response(self, lyrics_data: Dict[str, Any]) -> Optional[str]:
134+
"""Extract lyrics text from RapidAPI response structure."""
135+
try:
136+
# Log the actual response structure for debugging
137+
self.logger.debug(f"RapidAPI response structure: {lyrics_data}")
138+
139+
# Try different possible response structures
140+
141+
# Structure 1: lyrics.lyrics.body.html (the actual RapidAPI structure)
142+
nested_lyrics = lyrics_data.get("lyrics", {}).get("lyrics", {})
143+
if isinstance(nested_lyrics, dict):
144+
html_content = nested_lyrics.get("body", {}).get("html")
145+
if html_content:
146+
return self._clean_html_lyrics(html_content)
147+
148+
# Structure 2: lyrics.lyrics (simple string)
149+
if isinstance(lyrics_data.get("lyrics", {}).get("lyrics"), str):
150+
return lyrics_data["lyrics"]["lyrics"]
151+
152+
# Structure 3: lyrics.body.html (HTML content)
153+
html_content = lyrics_data.get("lyrics", {}).get("body", {}).get("html")
154+
if html_content:
155+
return self._clean_html_lyrics(html_content)
156+
157+
# Structure 4: Direct lyrics field
158+
if isinstance(lyrics_data.get("lyrics"), str):
159+
return lyrics_data["lyrics"]
160+
161+
# Structure 5: body.html at top level
162+
if lyrics_data.get("body", {}).get("html"):
163+
return self._clean_html_lyrics(lyrics_data["body"]["html"])
164+
165+
# Structure 6: Check if lyrics is a dict with other possible keys
166+
lyrics_obj = lyrics_data.get("lyrics", {})
167+
if isinstance(lyrics_obj, dict):
168+
# Try common alternative keys
169+
for key in ["text", "content", "plain", "body"]:
170+
if key in lyrics_obj:
171+
content = lyrics_obj[key]
172+
if isinstance(content, str):
173+
return content
174+
elif isinstance(content, dict) and "html" in content:
175+
return self._clean_html_lyrics(content["html"])
176+
elif isinstance(content, dict) and "text" in content:
177+
return content["text"]
178+
179+
self.logger.warning(f"Unknown RapidAPI response structure: {list(lyrics_data.keys())}")
180+
if "lyrics" in lyrics_data:
181+
self.logger.warning(f"Lyrics object structure: {lyrics_data['lyrics']}")
182+
return None
183+
184+
except Exception as e:
185+
self.logger.error(f"Error extracting lyrics from RapidAPI response: {str(e)}")
186+
return None
187+
188+
def _clean_html_lyrics(self, html_content: str) -> str:
189+
"""Clean HTML content to extract plain text lyrics."""
190+
import re
191+
192+
if not html_content:
193+
return ""
194+
195+
# Remove HTML tags while preserving line breaks
196+
text = re.sub(r'<br\s*/?>', '\n', html_content) # Convert <br> to newlines
197+
text = re.sub(r'<[^>]+>', '', text) # Remove all other HTML tags
198+
199+
# Decode HTML entities
200+
text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
201+
text = text.replace('&quot;', '"').replace('&#x27;', "'").replace('&nbsp;', ' ')
202+
203+
# Remove section markers but keep the lyrics content
204+
# Instead of removing entire lines, just remove the square bracket markers
205+
text = re.sub(r'\[Verse \d+\]', '', text)
206+
text = re.sub(r'\[Pre-Chorus\]', '', text)
207+
text = re.sub(r'\[Chorus\]', '', text)
208+
text = re.sub(r'\[Refrain\]', '', text)
209+
text = re.sub(r'\[Outro\]', '', text)
210+
text = re.sub(r'\[Bridge\]', '', text)
211+
text = re.sub(r'\[Intro\]', '', text)
212+
213+
# Clean up multiple consecutive newlines
214+
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
215+
216+
# Clean up leading/trailing whitespace
217+
text = text.strip()
218+
219+
return text
220+
43221
def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
    """Normalize a raw provider payload into the standardized LyricsData.

    Dispatches on the explicit ``"_rapidapi_source"`` marker in
    ``raw_data``: a truthy marker routes to ``_convert_rapidapi_format``,
    anything else (including a missing marker) routes to
    ``_convert_lyricsgenius_format``.
    """
    converter = (
        self._convert_rapidapi_format
        if raw_data.get("_rapidapi_source", False)
        else self._convert_lyricsgenius_format
    )
    return converter(raw_data)
230+
231+
def _convert_lyricsgenius_format(self, raw_data: Dict[str, Any]) -> LyricsData:
232+
"""Convert lyricsgenius format to standardized format."""
45233
# Clean the lyrics before processing
46234
lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
47235

@@ -74,6 +262,46 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
74262
"verified_annotations": len(raw_data.get("verified_annotations_by", [])),
75263
"verified_contributors": len(raw_data.get("verified_contributors", [])),
76264
"external_urls": {"genius": raw_data.get("url")},
265+
"api_source": "lyricsgenius",
266+
},
267+
)
268+
269+
# Create segments with words from cleaned lyrics
270+
segments = self._create_segments_with_words(lyrics, is_synced=False)
271+
272+
# Create result object with segments
273+
return LyricsData(source="genius", segments=segments, metadata=metadata)
274+
275+
def _convert_rapidapi_format(self, raw_data: Dict[str, Any]) -> LyricsData:
276+
"""Convert RapidAPI format to standardized format."""
277+
# Clean the lyrics before processing
278+
lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
279+
280+
# Extract artist name from primary_artist
281+
primary_artist = raw_data.get("primary_artist", {})
282+
artist_name = primary_artist.get("name", "")
283+
284+
# Extract release date from release_date_for_display
285+
release_date = raw_data.get("release_date_for_display")
286+
287+
# Create metadata object
288+
metadata = LyricsMetadata(
289+
source="genius",
290+
track_name=raw_data.get("title", ""),
291+
artist_names=artist_name,
292+
album_name=raw_data.get("album", {}).get("name") if raw_data.get("album") else None,
293+
lyrics_provider="genius",
294+
lyrics_provider_id=str(raw_data.get("id")),
295+
is_synced=False, # Genius doesn't provide synced lyrics
296+
provider_metadata={
297+
"genius_id": raw_data.get("id"),
298+
"release_date": release_date,
299+
"page_url": raw_data.get("url"),
300+
"annotation_count": raw_data.get("annotation_count"),
301+
"lyrics_state": raw_data.get("lyrics_state"),
302+
"pyongs_count": raw_data.get("pyongs_count"),
303+
"external_urls": {"genius": raw_data.get("url")},
304+
"api_source": "rapidapi",
77305
},
78306
)
79307

@@ -86,6 +314,19 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
86314
def _clean_lyrics(self, lyrics: str) -> str:
87315
"""Clean and process lyrics from Genius to remove unwanted content."""
88316
self.logger.debug("Starting lyrics cleaning process")
317+
318+
# Handle unexpected input types
319+
if not isinstance(lyrics, str):
320+
self.logger.warning(f"Expected string for lyrics, got {type(lyrics)}: {repr(lyrics)}")
321+
if lyrics is None:
322+
return ""
323+
# Try to convert to string
324+
try:
325+
lyrics = str(lyrics)
326+
except Exception as e:
327+
self.logger.error(f"Failed to convert lyrics to string: {e}")
328+
return ""
329+
89330
original = lyrics
90331

91332
lyrics = lyrics.replace("\\n", "\n")
@@ -123,10 +364,20 @@ def _clean_lyrics(self, lyrics: str) -> str:
123364
if original != lyrics:
124365
self.logger.debug("Removed standalone 'Embed' text")
125366

367+
# Remove section markers but keep the lyrics content (for non-HTML lyrics)
368+
# Instead of removing entire lines, just remove the square bracket markers
126369
original = lyrics
127-
lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)
370+
lyrics = re.sub(r'\[Verse \d+\]', '', lyrics)
371+
lyrics = re.sub(r'\[Pre-Chorus\]', '', lyrics)
372+
lyrics = re.sub(r'\[Chorus\]', '', lyrics)
373+
lyrics = re.sub(r'\[Refrain\]', '', lyrics)
374+
lyrics = re.sub(r'\[Outro\]', '', lyrics)
375+
lyrics = re.sub(r'\[Bridge\]', '', lyrics)
376+
lyrics = re.sub(r'\[Intro\]', '', lyrics)
128377
if original != lyrics:
129-
self.logger.debug("Removed lines containing square brackets")
378+
self.logger.debug("Removed section markers while preserving lyrics content")
379+
380+
# Remove common LyricsGenius page elements
130381

131382
self.logger.debug("Completed lyrics cleaning process")
132383
return lyrics

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "lyrics-transcriber"
3-
version = "0.58.0"
3+
version = "0.59.0"
44
description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
55
authors = ["Andrew Beveridge <[email protected]>"]
66
license = "MIT"

tests/manual/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""
2+
Manual tests that require real API keys and network access.
3+
4+
These tests are not part of the regular test suite and must be run manually
5+
with appropriate environment variables set.
6+
"""

0 commit comments

Comments
 (0)