Fix word break when the first character of a token is multi-byte (#753)

and3md · mikke89 · web-flow · commit ba9775e37fb9 · 2025-04-16T16:44:42.000+02:00
Co-authored-by: Michael Ragazzon <michael.ragazzon@gmail.com>
diff --git a/Source/Core/ElementText.cpp b/Source/Core/ElementText.cpp
@@ -250,32 +250,33 @@ bool ElementText::GenerateLine(String& line, int& line_length, float& line_width
 					// Try to break up the word
 					max_token_width = int(maximum_line_width - line_width);
 					const int token_max_size = int(next_token_begin - token_begin);
-					bool force_loop_break_after_next = false;
+					const char* partial_string_end = token_begin + token_max_size;
 
 					// @performance: Can be made much faster. Use string width heuristics and logarithmic search.
-					for (int i = token_max_size - 1; i > 0; --i)
+					while (true)
 					{
+						partial_string_end = StringUtilities::SeekBackwardUTF8(partial_string_end - 1, token_begin);
+
+						bool force_loop_break_at_end = false;
+						if (partial_string_end == token_begin)
+						{
+							// Not even the first character of the token fits. Let it overflow onto the next line if we can.
+							if (allow_empty || !line.empty())
+								return false;
+
+							// Continue by forcing the first character to be consumed, even though it will overflow.
+							partial_string_end = StringUtilities::SeekForwardUTF8(token_begin + 1, token_begin + token_max_size);
+							force_loop_break_at_end = true;
+						}
+
 						token.clear();
 						next_token_begin = token_begin;
-						const char* partial_string_end = StringUtilities::SeekBackwardUTF8(token_begin + i, token_begin);
 						BuildToken(token, next_token_begin, partial_string_end, line.empty() && trim_whitespace_prefix, collapse_white_space,
 							break_at_endline, text_transform_property, decode_escape_characters);
 						token_width = font_engine_interface->GetStringWidth(font_face_handle, token, text_shaping_context, previous_codepoint);
 
-						if (force_loop_break_after_next || token_width <= max_token_width)
-						{
+						if (force_loop_break_at_end || token_width <= max_token_width)
 							break;
-						}
-						else if (next_token_begin == token_begin)
-						{
-							// This means the first character of the token doesn't fit. Let it overflow into the next line if we can.
-							if (allow_empty || !line.empty())
-								return false;
-
-							// Not even the first character of the line fits. Go back to consume the first character even though it will overflow.
-							i += 2;
-							force_loop_break_after_next = true;
-						}
 					}
 
 					break_line = true;
diff --git a/Tests/Data/VisualTests/word_break.rml b/Tests/Data/VisualTests/word_break.rml
@@ -44,10 +44,10 @@
 	<hr/>
 	<h1>Zero-width box</h1>
 	<p>word-break: normal</p>
-	<div class="box zero">A WORD</div>
+	<div class="box zero">€ WORD</div>
 	<p>word-break: break-all</p>
-	<div class="box zero break-all">A WORD</div>
+	<div class="box zero break-all">€ WORD</div>
 	<p>word-break: break-word</p>
-	<div class="box zero break-word">A WORD</div>
+	<div class="box zero break-word">€ WORD</div>
 </body>
 </rml>