python · StanFromIreland · Feb 28, 2025 · Feb 28, 2025 · Feb 28, 2025 · Mar 1, 2025
diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot
@@ -33,65 +33,77 @@ msgid ""
 "    multiline!\n"
 msgstr ""
 
-#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94
-#: messages.py:99 messages.py:100 messages.py:101
+#: messages.py:32
+msgid ""
+"this is a very very very very very very very very very very very very very "
+"long string!"
+msgstr ""
+
+#: messages.py:33
+msgid ""
+"this  is  a  very  very    very very  very    very  very   very       very  "
+"very    very  very    very   long string with weird spaces!"
+msgstr ""
+
+#: messages.py:50 messages.py:93 messages.py:94 messages.py:97 messages.py:98
+#: messages.py:103 messages.py:104 messages.py:105
 msgid "foo"
 msgid_plural "foos"
 msgstr[0] ""
 msgstr[1] ""
 
-#: messages.py:47
+#: messages.py:51
 msgid "something"
 msgstr ""
 
-#: messages.py:50
+#: messages.py:54
 msgid "Hello, {}!"
 msgstr ""
 
-#: messages.py:54
+#: messages.py:58
 msgid "1"
 msgstr ""
 
-#: messages.py:54
+#: messages.py:58
 msgid "2"
 msgstr ""
 
-#: messages.py:55 messages.py:56
+#: messages.py:59 messages.py:60
 msgid "A"
 msgstr ""
 
-#: messages.py:55 messages.py:56
+#: messages.py:59 messages.py:60
 msgid "B"
 msgstr ""
 
-#: messages.py:57
+#: messages.py:61
 msgid "set"
 msgstr ""
 
-#: messages.py:62 messages.py:63
+#: messages.py:66 messages.py:67
 msgid "nested string"
 msgstr ""
 
-#: messages.py:68
+#: messages.py:72
 msgid "baz"
 msgstr ""
 
-#: messages.py:71 messages.py:75
+#: messages.py:75 messages.py:79
 msgid "default value"
 msgstr ""
 
-#: messages.py:91 messages.py:92 messages.py:95 messages.py:96
+#: messages.py:95 messages.py:96 messages.py:99 messages.py:100
 msgctxt "context"
 msgid "foo"
 msgid_plural "foos"
 msgstr[0] ""
 msgstr[1] ""
 
-#: messages.py:102
+#: messages.py:106
 msgid "domain foo"
 msgstr ""
 
-#: messages.py:118 messages.py:119
+#: messages.py:122 messages.py:123
 msgid "world"
 msgid_plural "worlds"
 msgstr[0] ""

diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py
@@ -28,6 +28,10 @@
     multiline!
 """)
 
+# very long strings that should be wrapped by normalize
+_("this is a very very very very very very very very very very very very very long string!")
+_("this  is  a  very  very    very very  very    very  very   very       very  very    very  very    very   long string with weird spaces!")
+
 # Invalid arguments
 _()
 _(None)

diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py
@@ -5,6 +5,7 @@
 import sys
 import unittest
 from textwrap import dedent
+from types import SimpleNamespace
 from pathlib import Path
 
 from test.support.script_helper import assert_python_ok
@@ -18,7 +19,7 @@
 
 
 with imports_under_tool("i18n"):
-    from pygettext import parse_spec
+    from pygettext import parse_spec, make_escapes, normalize
 
 
 def normalize_POT_file(pot):
@@ -516,6 +517,46 @@ def test_parse_keyword_spec(self):
                     parse_spec(spec)
                 self.assertEqual(str(cm.exception), message)
 
+    def setUp(self):
+        # required to set up normalize
+        make_escapes(True)
+
+    def test_normalize_multiline(self):
+        s = 'multi-line\n translation'
+        s_expected = '""\n"multi-line\\n"\n" translation"'
+
+        data = normalize(s, 'UTF-8', 'msgid', 78)
+        self.assertEqual(s_expected, data)
+
+    def test_normalize_wrap(self):
+        cases = (
+            ('multi-line\n translation', '""\n"multi-line\\n"\n" translation"'),
+            ('fee fi fo fum fee fi ', '"fee fi fo fum fee fi "'),         # len = 29
+            ('fee fi fo fum fee fi f',  '"fee fi fo fum fee fi f"'),      # len = 30
+            ('fee fi fo fum fee fi fo', '""\n"fee fi fo fum fee fi fo"' ),# len = 31
+        )
+        for raw, expected in cases:
+            with self.subTest(raw):
+                data = normalize(raw, 'UTF-8', 'msgid', 30)
+                self.assertEqual(expected, data)
+
+    def test_normalize_nostr(self):
+        data = normalize('', 'UTF-8', 'msgid', 30)
+        self.assertEqual('""', data)
+
+    def test_normalize_single_word(self):
+        for s in ("fee", "fi", "fo", "fums"):
+            data = normalize(s, 'UTF-8', 'msgid', 3)
+            self.assertNotIn('""', data) # did not wrap
+
+    def test_normalize_split_on_whitespace(self):
+        for space in (' ', ' ', ' ', '\t', '\r'):
+            s = f'longlonglong{space}word'
+            space = {'\t': '\\t', '\r': '\\r'}.get(space, space)
+            s_expected = f'""\n"longlonglong{space}"\n"word"'
+            data = normalize(s, 'UTF-8', 'msgid', 10)
+            self.assertEqual(s_expected, data)
+
 
 def extract_from_snapshots():
     snapshots = {

diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-02-28-19-30-00.gh-issue-130703.ajhd21.rst b/Misc/NEWS.d/next/Tools-Demos/2025-02-28-19-30-00.gh-issue-130703.ajhd21.rst
@@ -0,0 +1 @@
+Wrap msgids to specified ``width`` in :program:`pygettext`.
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
@@ -145,6 +145,7 @@
 import importlib.machinery
 import importlib.util
 import os
+import re
 import sys
 import time
 import tokenize
@@ -154,6 +155,7 @@
 
 __version__ = '1.5'
 
+from test.test_doctest.test_doctest import wrapped
 
 # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
 # there.
@@ -213,21 +215,40 @@ def escape_nonascii(s, encoding):
     return ''.join(escapes[b] for b in s.encode(encoding))
 
 
-def normalize(s, encoding):
+_space_splitter = re.compile(r'\s+|\S+\s*')
+
+def normalize(s, encoding, prefix, width):
     # This converts the various Python string types into a format that is
-    # appropriate for .po files, namely much closer to C style.
-    lines = s.split('\n')
-    if len(lines) == 1:
-        s = '"' + escape(s, encoding) + '"'
-    else:
-        if not lines[-1]:
-            del lines[-1]
-            lines[-1] = lines[-1] + '\n'
-        for i in range(len(lines)):
-            lines[i] = escape(lines[i], encoding)
-        lineterm = '\\n"\n"'
-        s = '""\n"' + lineterm.join(lines) + '"'
-    return s
+    # appropriate for .po files, namely much closer to C style,
+    # while wrapping to options.width.
+    lines = []
+    wrap = False
+    for line in s.splitlines(True):
+        escaped_line = escape(line, encoding)
+        if len(escaped_line) + len(prefix) + 3 > width:
+            wrap = True
+            words = _space_splitter.findall(line)
+            words.reverse()
+            buf = []
+            size = 2
+            while words:
+                word = words.pop()
+                escaped_word = escape(word, encoding)
+                escaped_word_len = len(escaped_word)
+                new_size = size + escaped_word_len
+                if new_size <= width or not buf:
+                    buf.append(escaped_word)
+                    size = new_size
+                else:
+                    lines.append(''.join(buf))
+                    buf = [escaped_word]
+                    size = escaped_word_len
+            lines.append(''.join(buf))
+        else:
+            lines.append(escaped_line)
+    if len(lines) <= 1 and (not wrap or len(_space_splitter.findall(lines[0])) == 1):
+        return f'"{escape(s, encoding)}"'
+    return '""\n' + '\n'.join(f'"{line}"' for line in lines)
 
 
 def containsAny(str, set):
@@ -618,10 +639,10 @@ def write_pot_file(messages, options, fp):
             # to skip translating some unimportant docstrings.
             print('#, docstring', file=fp)
         if msg.msgctxt is not None:
-            print('msgctxt', normalize(msg.msgctxt, encoding), file=fp)
-        print('msgid', normalize(msg.msgid, encoding), file=fp)
+            print('msgctxt', normalize(msg.msgctxt, encoding, 'msgctxt', options.width), file=fp)
+        print('msgid', normalize(msg.msgid, encoding, 'msgid', options.width), file=fp)
         if msg.msgid_plural is not None:
-            print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp)
+            print('msgid_plural', normalize(msg.msgid_plural, encoding, 'msgid_plural', options.width), file=fp)
             print('msgstr[0] ""', file=fp)
             print('msgstr[1] ""\n', file=fp)
         else:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Wrap msgids to specified ``width`` in :program:`pygettext`.