Skip to content

Commit aa58485

Browse files
committed
ID3v2: Support single-BOM COMM/USLT frames
UTF-16 encoded COMM and USLT frames can sometimes have a single BOM. Why do some encoders choose to do that? Who knows.
1 parent 0668917 commit aa58485

File tree

4 files changed

+95
-49
lines changed

4 files changed

+95
-49
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1818
- **Vorbis Comments**: Check `ALBUM ARTIST` for `ItemKey::AlbumArtist` conversions
1919
* **ItemKey**: `ItemKey` is now `Copy` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526))
2020

21+
### Fixed
22+
- **ID3v2**: Support parsing UTF-16 `COMM`/`USLT` frames with a single BOM ([issue](https://github.com/Serial-ATA/lofty-rs/issues/532)) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/535))
23+
- Some encoders will only write a BOM to the frame's description, rather than to every string in the frame.
24+
This was previously only supported in `SYLT` frames, and has been extended to `COMM` and `USLT`.
25+
2126
### Removed
2227

2328
* **ItemKey**: `ItemKey::Unknown` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526))

lofty/src/id3/v2/items/language_frame.rs

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ use crate::id3::v2::frame::content::verify_encoding;
33
use crate::id3::v2::header::Id3v2Version;
44
use crate::id3::v2::{FrameFlags, FrameHeader, FrameId};
55
use crate::tag::items::Lang;
6-
use crate::util::text::{TextDecodeOptions, TextEncoding, decode_text, encode_text};
6+
use crate::util::text::{
7+
DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text,
8+
utf16_decode_terminated_maybe_bom,
9+
};
710

811
use std::borrow::Cow;
912
use std::hash::{Hash, Hasher};
@@ -35,12 +38,34 @@ impl LanguageFrame {
3538
let mut language = [0; 3];
3639
reader.read_exact(&mut language)?;
3740

38-
let description = decode_text(
41+
let DecodeTextResult {
42+
content: description,
43+
bom,
44+
..
45+
} = decode_text(
3946
reader,
4047
TextDecodeOptions::new().encoding(encoding).terminated(true),
41-
)?
42-
.content;
43-
let content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content;
48+
)?;
49+
50+
let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes;
51+
52+
// It's possible for the description to be the only string with a BOM
53+
// To be safe, we change the encoding to the concrete variant determined from the description
54+
if encoding == TextEncoding::UTF16 {
55+
endianness = match bom {
56+
[0xFF, 0xFE] => u16::from_le_bytes,
57+
[0xFE, 0xFF] => u16::from_be_bytes,
58+
// The BOM has to be valid for `decode_text` to succeed
59+
_ => unreachable!("Bad BOM {bom:?}"),
60+
};
61+
}
62+
63+
let content;
64+
if encoding == TextEncoding::UTF16 {
65+
(content, _) = utf16_decode_terminated_maybe_bom(reader, endianness)?;
66+
} else {
67+
content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content;
68+
}
4469

4570
Ok(Some(Self {
4671
encoding,

lofty/src/id3/v2/items/sync_text.rs

Lines changed: 25 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ use crate::error::{ErrorKind, Id3v2Error, Id3v2ErrorKind, LoftyError, Result};
22
use crate::id3::v2::{FrameFlags, FrameHeader, FrameId};
33
use crate::macros::err;
44
use crate::util::text::{
5-
TextDecodeOptions, TextEncoding, decode_text, encode_text, read_to_terminator,
6-
utf16_decode_bytes,
5+
DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text,
6+
utf16_decode_terminated_maybe_bom,
77
};
88

99
use std::borrow::Cow;
10-
use std::io::{Cursor, Read, Seek, SeekFrom, Write};
10+
use std::io::{Cursor, Seek, Write};
1111

1212
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
1313

@@ -148,23 +148,26 @@ impl SynchronizedTextFrame<'_> {
148148
.ok_or_else(|| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
149149

150150
let mut cursor = Cursor::new(&data[6..]);
151-
let description = crate::util::text::decode_text(
151+
let DecodeTextResult {
152+
content: description,
153+
bom,
154+
..
155+
} = decode_text(
152156
&mut cursor,
153157
TextDecodeOptions::new().encoding(encoding).terminated(true),
154158
)
155-
.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?
156-
.text_or_none();
159+
.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
157160

158161
let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes;
159162

160163
// It's possible for the description to be the only string with a BOM
161164
// To be safe, we change the encoding to the concrete variant determined from the description
162165
if encoding == TextEncoding::UTF16 {
163-
endianness = match cursor.get_ref()[..=1] {
166+
endianness = match bom {
164167
[0xFF, 0xFE] => u16::from_le_bytes,
165168
[0xFE, 0xFF] => u16::from_be_bytes,
166169
// Since the description was already read, we can assume the BOM was valid
167-
_ => unreachable!(),
170+
_ => unreachable!("Bad BOM {bom:?}"),
168171
};
169172
}
170173

@@ -174,33 +177,23 @@ impl SynchronizedTextFrame<'_> {
174177
let mut content = Vec::new();
175178

176179
while pos < total {
177-
let text = (|| -> Result<String> {
178-
if encoding == TextEncoding::UTF16 {
179-
// Check for a BOM
180-
let mut bom = [0; 2];
181-
cursor
182-
.read_exact(&mut bom)
180+
let text;
181+
if encoding == TextEncoding::UTF16 {
182+
let (decoded, bytes_read) =
183+
utf16_decode_terminated_maybe_bom(&mut cursor, endianness)
183184
.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
184-
185-
cursor.seek(SeekFrom::Current(-2))?;
186-
187-
// Encountered text that doesn't include a BOM
188-
if bom != [0xFF, 0xFE] && bom != [0xFE, 0xFF] {
189-
let (raw_text, _) = read_to_terminator(&mut cursor, TextEncoding::UTF16);
190-
return utf16_decode_bytes(&raw_text, endianness)
191-
.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText).into());
192-
}
193-
}
194-
185+
pos += bytes_read as u64;
186+
text = decoded;
187+
} else {
195188
let decoded_text = decode_text(
196189
&mut cursor,
197190
TextDecodeOptions::new().encoding(encoding).terminated(true),
198191
)
199192
.map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?;
200193
pos += decoded_text.bytes_read as u64;
201194

202-
Ok(decoded_text.content)
203-
})()?;
195+
text = decoded_text.content;
196+
}
204197

205198
let time = cursor
206199
.read_u32::<BigEndian>()
@@ -217,7 +210,11 @@ impl SynchronizedTextFrame<'_> {
217210
language,
218211
timestamp_format,
219212
content_type,
220-
description,
213+
description: if description.is_empty() {
214+
None
215+
} else {
216+
Some(description)
217+
},
221218
content,
222219
})
223220
}

lofty/src/util/text.rs

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -160,22 +160,15 @@ where
160160
err!(TextDecode("UTF-16 string has an odd length"));
161161
}
162162

163-
let bom_to_check;
164163
if options.bom == [0, 0] {
165-
bom_to_check = [raw_bytes[0], raw_bytes[1]];
164+
bom = [raw_bytes[0], raw_bytes[1]];
166165
} else {
167-
bom_to_check = options.bom;
166+
bom = options.bom;
168167
}
169168

170-
match bom_to_check {
171-
[0xFE, 0xFF] => {
172-
bom = [0xFE, 0xFF];
173-
utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?
174-
},
175-
[0xFF, 0xFE] => {
176-
bom = [0xFF, 0xFE];
177-
utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?
178-
},
169+
match bom {
170+
[0xFE, 0xFF] => utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?,
171+
[0xFF, 0xFE] => utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?,
179172
_ => err!(TextDecode("UTF-16 string has an invalid byte order mark")),
180173
}
181174
},
@@ -184,10 +177,6 @@ where
184177
.map_err(|_| LoftyError::new(ErrorKind::TextDecode("Expected a UTF-8 string")))?,
185178
};
186179

187-
if read_string.is_empty() {
188-
return Ok(EMPTY_DECODED_TEXT);
189-
}
190-
191180
Ok(DecodeTextResult {
192181
content: read_string,
193182
bytes_read,
@@ -278,6 +267,36 @@ pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) -
278267
utf16_decode(&unverified)
279268
}
280269

270+
// TODO: Can probably just be merged into an option on `TextDecodeOptions`
271+
/// Read a null-terminated UTF-16 string that may or may not have a BOM
272+
///
273+
/// This is needed for ID3v2, as some encoders will encode *only* the first string in a frame with a BOM,
274+
/// and the others are assumed to have the same byte order.
275+
///
276+
/// This is seen in frames like SYLT, COMM, and USLT, where the description will be the only string
277+
/// with a BOM.
278+
///
279+
/// If no BOM is present, the string will be decoded using `endianness`.
280+
pub(crate) fn utf16_decode_terminated_maybe_bom<R>(
281+
reader: &mut R,
282+
endianness: fn([u8; 2]) -> u16,
283+
) -> Result<(String, usize)>
284+
where
285+
R: Read,
286+
{
287+
let (raw_text, terminator_len) = read_to_terminator(reader, TextEncoding::UTF16);
288+
289+
let bytes_read = raw_text.len() + terminator_len;
290+
let decoded;
291+
match &*raw_text {
292+
[0xFF, 0xFE, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_le_bytes),
293+
[0xFE, 0xFF, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_be_bytes),
294+
_ => decoded = utf16_decode_bytes(&raw_text, endianness),
295+
}
296+
297+
decoded.map(|d| (d, bytes_read))
298+
}
299+
281300
pub(crate) fn encode_text(text: &str, text_encoding: TextEncoding, terminated: bool) -> Vec<u8> {
282301
match text_encoding {
283302
TextEncoding::Latin1 => {

0 commit comments

Comments
 (0)