1
1
import logging
2
2
import re
3
3
from typing import Optional , Dict , Any
4
+ import requests
4
5
import lyricsgenius
5
6
from lyrics_transcriber .types import LyricsData , LyricsMetadata
6
7
from lyrics_transcriber .lyrics .base_lyrics_provider import BaseLyricsProvider , LyricsProviderConfig
@@ -12,6 +13,7 @@ class GeniusProvider(BaseLyricsProvider):
12
13
def __init__ (self , config : LyricsProviderConfig , logger : Optional [logging .Logger ] = None ):
13
14
super ().__init__ (config , logger )
14
15
self .api_token = config .genius_api_token
16
+ self .rapidapi_key = config .rapidapi_key
15
17
self .client = None
16
18
if self .api_token :
17
19
self .client = lyricsgenius .Genius (
@@ -25,9 +27,17 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
25
27
)
26
28
27
29
def _fetch_data_from_source (self , artist : str , title : str ) -> Optional [Dict [str , Any ]]:
28
- """Fetch raw song data from Genius API."""
30
+ """Fetch raw song data from Genius API or RapidAPI."""
31
+ # Try RapidAPI first if available
32
+ if self .rapidapi_key :
33
+ self .logger .info (f"Trying RapidAPI for { artist } - { title } " )
34
+ result = self ._fetch_from_rapidapi (artist , title )
35
+ if result :
36
+ return result
37
+
38
+ # Fall back to direct Genius API
29
39
if not self .client :
30
- self .logger .warning ("No Genius API token provided" )
40
+ self .logger .warning ("No Genius API token provided and RapidAPI failed " )
31
41
return None
32
42
33
43
self .logger .info (f"Searching Genius for { artist } - { title } " )
@@ -40,8 +50,186 @@ def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str,
40
50
self .logger .error (f"Error fetching from Genius: { str (e )} " )
41
51
return None
42
52
53
def _fetch_from_rapidapi(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
    """Fetch song metadata and lyrics through the Genius endpoints on RapidAPI.

    Two HTTP GET requests are made, each with a 10 second timeout:
    a search for ``"<artist> <title>"`` and, for the first hit that carries
    a song id, a lyrics lookup by that id.

    Args:
        artist: Artist name used in the search query.
        title: Song title used in the search query.

    Returns:
        A dict with the keys ``title``, ``primary_artist``, ``lyrics``,
        ``id``, ``url``, ``release_date_for_display`` and the marker
        ``_rapidapi_source`` set to ``True``; ``None`` when the search has
        no usable hit, no lyrics can be extracted, or any request fails.
        Errors are logged, never raised.
    """
    try:
        headers = {
            "x-rapidapi-key": self.rapidapi_key,
            "x-rapidapi-host": "genius-song-lyrics1.p.rapidapi.com",
        }

        # Step 1: Search for the song
        search_url = "https://genius-song-lyrics1.p.rapidapi.com/search/"
        search_params = {
            "q": f"{artist} {title}",
            "per_page": "10",
            "page": "1",
        }

        self.logger.debug(f"Making RapidAPI search request for '{artist} {title}'")
        search_response = requests.get(search_url, headers=headers, params=search_params, timeout=10)
        search_response.raise_for_status()
        search_data = search_response.json()

        # A non-dict payload (e.g. a bare list or null) has no "hits" to offer.
        hits = search_data.get("hits") if isinstance(search_data, dict) else None
        if not hits:
            self.logger.warning("No search results from RapidAPI")
            return None

        # Take the first hit that carries a song id. Malformed hits (non-dict
        # entries, "result": null) are skipped instead of aborting the search.
        best_match = None
        for hit in hits:
            result = hit.get("result") if isinstance(hit, dict) else None
            if isinstance(result, dict) and result.get("id"):
                best_match = result
                break

        if not best_match:
            self.logger.warning("No valid song ID found in RapidAPI search results")
            return None

        song_id = best_match["id"]
        self.logger.debug(f"Found song ID: {song_id}")

        # Step 2: Fetch lyrics using the song ID
        lyrics_url = "https://genius-song-lyrics1.p.rapidapi.com/song/lyrics/"
        lyrics_params = {"id": str(song_id)}

        self.logger.debug(f"Making RapidAPI lyrics request for song ID {song_id}")
        lyrics_response = requests.get(lyrics_url, headers=headers, params=lyrics_params, timeout=10)
        lyrics_response.raise_for_status()
        lyrics_data = lyrics_response.json()

        # Extract lyrics from the nested response structure
        lyrics_text = self._extract_lyrics_from_rapidapi_response(lyrics_data)
        if not lyrics_text:
            self.logger.warning("No lyrics found in RapidAPI response")
            return None

        # Build a small, explicit structure rather than passing the whole
        # search hit through; only the fields listed here are forwarded.
        rapidapi_response = {
            "title": best_match.get("title", ""),
            # Normalise a null artist to {} so consumers can call .get() on it.
            "primary_artist": best_match.get("primary_artist") or {},
            "lyrics": lyrics_text,
            "id": song_id,
            "url": best_match.get("url", ""),
            "release_date_for_display": best_match.get("release_date_for_display", ""),
            # Mark this as RapidAPI source
            "_rapidapi_source": True,
        }

        self.logger.info("Successfully fetched lyrics from RapidAPI")
        return rapidapi_response

    except requests.exceptions.RequestException as e:
        self.logger.error(f"RapidAPI request failed: {str(e)}")
        return None
    except Exception as e:
        # Best-effort provider: any unexpected failure is logged and reported
        # to the caller as "no result".
        self.logger.error(f"Error fetching from RapidAPI: {str(e)}")
        return None
132
+
133
+ def _extract_lyrics_from_rapidapi_response (self , lyrics_data : Dict [str , Any ]) -> Optional [str ]:
134
+ """Extract lyrics text from RapidAPI response structure."""
135
+ try :
136
+ # Log the actual response structure for debugging
137
+ self .logger .debug (f"RapidAPI response structure: { lyrics_data } " )
138
+
139
+ # Try different possible response structures
140
+
141
+ # Structure 1: lyrics.lyrics.body.html (the actual RapidAPI structure)
142
+ nested_lyrics = lyrics_data .get ("lyrics" , {}).get ("lyrics" , {})
143
+ if isinstance (nested_lyrics , dict ):
144
+ html_content = nested_lyrics .get ("body" , {}).get ("html" )
145
+ if html_content :
146
+ return self ._clean_html_lyrics (html_content )
147
+
148
+ # Structure 2: lyrics.lyrics (simple string)
149
+ if isinstance (lyrics_data .get ("lyrics" , {}).get ("lyrics" ), str ):
150
+ return lyrics_data ["lyrics" ]["lyrics" ]
151
+
152
+ # Structure 3: lyrics.body.html (HTML content)
153
+ html_content = lyrics_data .get ("lyrics" , {}).get ("body" , {}).get ("html" )
154
+ if html_content :
155
+ return self ._clean_html_lyrics (html_content )
156
+
157
+ # Structure 4: Direct lyrics field
158
+ if isinstance (lyrics_data .get ("lyrics" ), str ):
159
+ return lyrics_data ["lyrics" ]
160
+
161
+ # Structure 5: body.html at top level
162
+ if lyrics_data .get ("body" , {}).get ("html" ):
163
+ return self ._clean_html_lyrics (lyrics_data ["body" ]["html" ])
164
+
165
+ # Structure 6: Check if lyrics is a dict with other possible keys
166
+ lyrics_obj = lyrics_data .get ("lyrics" , {})
167
+ if isinstance (lyrics_obj , dict ):
168
+ # Try common alternative keys
169
+ for key in ["text" , "content" , "plain" , "body" ]:
170
+ if key in lyrics_obj :
171
+ content = lyrics_obj [key ]
172
+ if isinstance (content , str ):
173
+ return content
174
+ elif isinstance (content , dict ) and "html" in content :
175
+ return self ._clean_html_lyrics (content ["html" ])
176
+ elif isinstance (content , dict ) and "text" in content :
177
+ return content ["text" ]
178
+
179
+ self .logger .warning (f"Unknown RapidAPI response structure: { list (lyrics_data .keys ())} " )
180
+ if "lyrics" in lyrics_data :
181
+ self .logger .warning (f"Lyrics object structure: { lyrics_data ['lyrics' ]} " )
182
+ return None
183
+
184
+ except Exception as e :
185
+ self .logger .error (f"Error extracting lyrics from RapidAPI response: { str (e )} " )
186
+ return None
187
+
188
+ def _clean_html_lyrics (self , html_content : str ) -> str :
189
+ """Clean HTML content to extract plain text lyrics."""
190
+ import re
191
+
192
+ if not html_content :
193
+ return ""
194
+
195
+ # Remove HTML tags while preserving line breaks
196
+ text = re .sub (r'<br\s*/?>' , '\n ' , html_content ) # Convert <br> to newlines
197
+ text = re .sub (r'<[^>]+>' , '' , text ) # Remove all other HTML tags
198
+
199
+ # Decode HTML entities
200
+ text = text .replace ('<' , '<' ).replace ('>' , '>' ).replace ('&' , '&' )
201
+ text = text .replace ('"' , '"' ).replace (''' , "'" ).replace (' ' , ' ' )
202
+
203
+ # Remove section markers but keep the lyrics content
204
+ # Instead of removing entire lines, just remove the square bracket markers
205
+ text = re .sub (r'\[Verse \d+\]' , '' , text )
206
+ text = re .sub (r'\[Pre-Chorus\]' , '' , text )
207
+ text = re .sub (r'\[Chorus\]' , '' , text )
208
+ text = re .sub (r'\[Refrain\]' , '' , text )
209
+ text = re .sub (r'\[Outro\]' , '' , text )
210
+ text = re .sub (r'\[Bridge\]' , '' , text )
211
+ text = re .sub (r'\[Intro\]' , '' , text )
212
+
213
+ # Clean up multiple consecutive newlines
214
+ text = re .sub (r'\n\s*\n\s*\n+' , '\n \n ' , text )
215
+
216
+ # Clean up leading/trailing whitespace
217
+ text = text .strip ()
218
+
219
+ return text
220
+
43
221
def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
    """Convert a raw song payload to the standardized format.

    Dispatches on the explicit ``_rapidapi_source`` marker in ``raw_data``:
    a truthy marker selects ``_convert_rapidapi_format``; a missing or
    falsy marker selects ``_convert_lyricsgenius_format``.
    """
    # Only the chosen converter is looked up, exactly one is ever called.
    converter = (
        self._convert_rapidapi_format
        if raw_data.get("_rapidapi_source", False)
        else self._convert_lyricsgenius_format
    )
    return converter(raw_data)
230
+
231
+ def _convert_lyricsgenius_format (self , raw_data : Dict [str , Any ]) -> LyricsData :
232
+ """Convert lyricsgenius format to standardized format."""
45
233
# Clean the lyrics before processing
46
234
lyrics = self ._clean_lyrics (raw_data .get ("lyrics" , "" ))
47
235
@@ -74,6 +262,46 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
74
262
"verified_annotations" : len (raw_data .get ("verified_annotations_by" , [])),
75
263
"verified_contributors" : len (raw_data .get ("verified_contributors" , [])),
76
264
"external_urls" : {"genius" : raw_data .get ("url" )},
265
+ "api_source" : "lyricsgenius" ,
266
+ },
267
+ )
268
+
269
+ # Create segments with words from cleaned lyrics
270
+ segments = self ._create_segments_with_words (lyrics , is_synced = False )
271
+
272
+ # Create result object with segments
273
+ return LyricsData (source = "genius" , segments = segments , metadata = metadata )
274
+
275
+ def _convert_rapidapi_format (self , raw_data : Dict [str , Any ]) -> LyricsData :
276
+ """Convert RapidAPI format to standardized format."""
277
+ # Clean the lyrics before processing
278
+ lyrics = self ._clean_lyrics (raw_data .get ("lyrics" , "" ))
279
+
280
+ # Extract artist name from primary_artist
281
+ primary_artist = raw_data .get ("primary_artist" , {})
282
+ artist_name = primary_artist .get ("name" , "" )
283
+
284
+ # Extract release date from release_date_for_display
285
+ release_date = raw_data .get ("release_date_for_display" )
286
+
287
+ # Create metadata object
288
+ metadata = LyricsMetadata (
289
+ source = "genius" ,
290
+ track_name = raw_data .get ("title" , "" ),
291
+ artist_names = artist_name ,
292
+ album_name = raw_data .get ("album" , {}).get ("name" ) if raw_data .get ("album" ) else None ,
293
+ lyrics_provider = "genius" ,
294
+ lyrics_provider_id = str (raw_data .get ("id" )),
295
+ is_synced = False , # Genius doesn't provide synced lyrics
296
+ provider_metadata = {
297
+ "genius_id" : raw_data .get ("id" ),
298
+ "release_date" : release_date ,
299
+ "page_url" : raw_data .get ("url" ),
300
+ "annotation_count" : raw_data .get ("annotation_count" ),
301
+ "lyrics_state" : raw_data .get ("lyrics_state" ),
302
+ "pyongs_count" : raw_data .get ("pyongs_count" ),
303
+ "external_urls" : {"genius" : raw_data .get ("url" )},
304
+ "api_source" : "rapidapi" ,
77
305
},
78
306
)
79
307
@@ -86,6 +314,19 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
86
314
def _clean_lyrics (self , lyrics : str ) -> str :
87
315
"""Clean and process lyrics from Genius to remove unwanted content."""
88
316
self .logger .debug ("Starting lyrics cleaning process" )
317
+
318
+ # Handle unexpected input types
319
+ if not isinstance (lyrics , str ):
320
+ self .logger .warning (f"Expected string for lyrics, got { type (lyrics )} : { repr (lyrics )} " )
321
+ if lyrics is None :
322
+ return ""
323
+ # Try to convert to string
324
+ try :
325
+ lyrics = str (lyrics )
326
+ except Exception as e :
327
+ self .logger .error (f"Failed to convert lyrics to string: { e } " )
328
+ return ""
329
+
89
330
original = lyrics
90
331
91
332
lyrics = lyrics .replace ("\\ n" , "\n " )
@@ -123,10 +364,20 @@ def _clean_lyrics(self, lyrics: str) -> str:
123
364
if original != lyrics :
124
365
self .logger .debug ("Removed standalone 'Embed' text" )
125
366
367
+ # Remove section markers but keep the lyrics content (for non-HTML lyrics)
368
+ # Instead of removing entire lines, just remove the square bracket markers
126
369
original = lyrics
127
- lyrics = re .sub (r".*?\[.*?\].*?" , "" , lyrics )
370
+ lyrics = re .sub (r'\[Verse \d+\]' , '' , lyrics )
371
+ lyrics = re .sub (r'\[Pre-Chorus\]' , '' , lyrics )
372
+ lyrics = re .sub (r'\[Chorus\]' , '' , lyrics )
373
+ lyrics = re .sub (r'\[Refrain\]' , '' , lyrics )
374
+ lyrics = re .sub (r'\[Outro\]' , '' , lyrics )
375
+ lyrics = re .sub (r'\[Bridge\]' , '' , lyrics )
376
+ lyrics = re .sub (r'\[Intro\]' , '' , lyrics )
128
377
if original != lyrics :
129
- self .logger .debug ("Removed lines containing square brackets" )
378
+ self .logger .debug ("Removed section markers while preserving lyrics content" )
379
+
380
+ # Remove common LyricsGenius page elements
130
381
131
382
self .logger .debug ("Completed lyrics cleaning process" )
132
383
return lyrics
0 commit comments