Skip to content

gh-130703: Implement wrapping to width for msgids #130705

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b3ccc45
Add logic to wrap and test
StanFromIreland Feb 28, 2025
33149ed
Fix NEWS name -- We don't want milliseconds
StanFromIreland Feb 28, 2025
0e35e36
Change extract func in test
StanFromIreland Feb 28, 2025
92f227f
Use a modified version of pybabel's code in normalize
StanFromIreland Mar 1, 2025
f0ee9c4
Minor tweak
StanFromIreland Mar 1, 2025
843e3fa
Update argparse snapshot
StanFromIreland Mar 1, 2025
7fc34ca
Bénédikt's suggestions
picnixz Mar 1, 2025
8d319b4
Preserve spaces and remove unnecessary checks
StanFromIreland Mar 1, 2025
9197688
Improve comment
StanFromIreland Mar 1, 2025
7c8637e
Add test and sort imports
StanFromIreland Mar 1, 2025
66d8eac
Benedikt's suggestion
StanFromIreland Mar 1, 2025
430c051
Add tests and simplify normalize
StanFromIreland Mar 2, 2025
abb90c2
tomasr8 suggestion
StanFromIreland Mar 2, 2025
7f947db
Fix typo in test str
StanFromIreland Mar 2, 2025
ea5fa91
Benedikt's suggestions
StanFromIreland Mar 2, 2025
4b02678
More of Benedikt's suggestions
StanFromIreland Mar 2, 2025
8d03cbf
Don't wrap for single words
StanFromIreland Mar 2, 2025
fbe5b93
Address Serhiy's suggestions
StanFromIreland Mar 2, 2025
8d5f84f
Use more complex pattern
StanFromIreland Mar 2, 2025
ae53774
Serhiy's suggestions
StanFromIreland Mar 2, 2025
794fc8b
Serhiy's suggestions
StanFromIreland Mar 3, 2025
47bfa29
Clean up
StanFromIreland Mar 3, 2025
b6f128f
Apply suggestions from Tomas
StanFromIreland Mar 3, 2025
a4823a7
Apply suggestions from Serhiy
StanFromIreland Mar 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 27 additions & 15 deletions Lib/test/test_tools/i18n_data/messages.pot
Original file line number Diff line number Diff line change
Expand Up @@ -33,65 +33,77 @@ msgid ""
" multiline!\n"
msgstr ""

#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94
#: messages.py:99 messages.py:100 messages.py:101
#: messages.py:32
msgid ""
"this is a very very very very very very very very very very very very very "
"long string!"
msgstr ""

#: messages.py:33
msgid ""
"this is a very very very very very very very very very "
"very very very very long string with weird spaces!"
msgstr ""

#: messages.py:50 messages.py:93 messages.py:94 messages.py:97 messages.py:98
#: messages.py:103 messages.py:104 messages.py:105
msgid "foo"
msgid_plural "foos"
msgstr[0] ""
msgstr[1] ""

#: messages.py:47
#: messages.py:51
msgid "something"
msgstr ""

#: messages.py:50
#: messages.py:54
msgid "Hello, {}!"
msgstr ""

#: messages.py:54
#: messages.py:58
msgid "1"
msgstr ""

#: messages.py:54
#: messages.py:58
msgid "2"
msgstr ""

#: messages.py:55 messages.py:56
#: messages.py:59 messages.py:60
msgid "A"
msgstr ""

#: messages.py:55 messages.py:56
#: messages.py:59 messages.py:60
msgid "B"
msgstr ""

#: messages.py:57
#: messages.py:61
msgid "set"
msgstr ""

#: messages.py:62 messages.py:63
#: messages.py:66 messages.py:67
msgid "nested string"
msgstr ""

#: messages.py:68
#: messages.py:72
msgid "baz"
msgstr ""

#: messages.py:71 messages.py:75
#: messages.py:75 messages.py:79
msgid "default value"
msgstr ""

#: messages.py:91 messages.py:92 messages.py:95 messages.py:96
#: messages.py:95 messages.py:96 messages.py:99 messages.py:100
msgctxt "context"
msgid "foo"
msgid_plural "foos"
msgstr[0] ""
msgstr[1] ""

#: messages.py:102
#: messages.py:106
msgid "domain foo"
msgstr ""

#: messages.py:118 messages.py:119
#: messages.py:122 messages.py:123
msgid "world"
msgid_plural "worlds"
msgstr[0] ""
Expand Down
4 changes: 4 additions & 0 deletions Lib/test/test_tools/i18n_data/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
multiline!
""")

# very long strings that should be wrapped by normalize
_("this is a very very very very very very very very very very very very very long string!")
_("this is a very very very very very very very very very very very very very long string with weird spaces!")

# Invalid arguments
_()
_(None)
Expand Down
43 changes: 42 additions & 1 deletion Lib/test/test_tools/test_i18n.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
import unittest
from textwrap import dedent
from types import SimpleNamespace
from pathlib import Path

from test.support.script_helper import assert_python_ok
Expand All @@ -18,7 +19,7 @@


with imports_under_tool("i18n"):
from pygettext import parse_spec
from pygettext import parse_spec, make_escapes, normalize


def normalize_POT_file(pot):
Expand Down Expand Up @@ -516,6 +517,46 @@ def test_parse_keyword_spec(self):
parse_spec(spec)
self.assertEqual(str(cm.exception), message)

def setUp(self):
    # normalize() depends on state prepared by make_escapes(), so run it
    # before each test in this class.
    make_escapes(True)

def test_normalize_multiline(self):
    # A string containing a newline is rendered in the multi-line form:
    # an empty "" first, then one quoted chunk per source line.
    raw = 'multi-line\n translation'
    expected = '""\n"multi-line\\n"\n" translation"'

    self.assertEqual(expected, normalize(raw, 'UTF-8', 'msgid', 78))

def test_normalize_wrap(self):
    # Exercise the wrapping threshold at a width of 30 columns.  The
    # "len" notes give the length of the rendered `msgid "..."` line:
    # the raw text plus 5 for the keyword plus 3 for the space and the
    # two quotes.
    expected_by_raw = {
        'multi-line\n translation': '""\n"multi-line\\n"\n" translation"',
        'fee fi fo fum fee fi ': '"fee fi fo fum fee fi "',  # len = 29
        'fee fi fo fum fee fi f': '"fee fi fo fum fee fi f"',  # len = 30
        'fee fi fo fum fee fi fo': '""\n"fee fi fo fum fee fi fo"',  # len = 31
    }
    for raw, expected in expected_by_raw.items():
        with self.subTest(raw):
            self.assertEqual(expected, normalize(raw, 'UTF-8', 'msgid', 30))

def test_normalize_nostr(self):
    # The empty string is rendered as a bare pair of quotes.
    self.assertEqual('""', normalize('', 'UTF-8', 'msgid', 30))

def test_normalize_single_word(self):
    # With a width of 3 every one of these words is too long to fit, yet a
    # lone word must still be emitted on the keyword line.
    single_words = ("fee", "fi", "fo", "fums")
    for word in single_words:
        rendered = normalize(word, 'UTF-8', 'msgid', 3)
        # An empty '""' chunk is the marker of wrapped (multi-line) output.
        self.assertNotIn('""', rendered)  # did not wrap

def test_normalize_split_on_whitespace(self):
    # Each whitespace separator must act as a break point, and the
    # separator itself must stay attached to the end of the first chunk.
    # Separators that normalize() escapes appear in their escaped form.
    escaped_forms = {'\t': '\\t', '\r': '\\r'}
    # NOTE(review): the first three entries are identical as shown here --
    # presumably they were meant to be distinct whitespace characters;
    # confirm against the real file.
    for space in (' ', ' ', ' ', '\t', '\r'):
        # One subTest per separator so that a failure names the offending
        # character and the remaining separators are still checked.
        with self.subTest(space=space):
            s = f'longlonglong{space}word'
            shown = escaped_forms.get(space, space)
            s_expected = f'""\n"longlonglong{shown}"\n"word"'
            data = normalize(s, 'UTF-8', 'msgid', 10)
            self.assertEqual(s_expected, data)


def extract_from_snapshots():
snapshots = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Wrap ``msgid``, ``msgid_plural`` and ``msgctxt`` strings to the specified ``width`` in :program:`pygettext`.
55 changes: 38 additions & 17 deletions Tools/i18n/pygettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
import importlib.machinery
import importlib.util
import os
import re
import sys
import time
import tokenize
Expand All @@ -154,6 +155,7 @@

__version__ = '1.5'

# NOTE(review): this looks like an accidental IDE auto-import -- nothing in
# the visible changes uses `wrapped`, and it makes this tool import the test
# suite at start-up.  Confirm it is unused and drop it.
from test.test_doctest.test_doctest import wrapped

# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
Expand Down Expand Up @@ -213,21 +215,40 @@ def escape_nonascii(s, encoding):
return ''.join(escapes[b] for b in s.encode(encoding))


# Splits a line into chunks that are either a run of whitespace or a run of
# non-whitespace followed by its trailing whitespace.  Every character lands
# in exactly one chunk, so joining the chunks reproduces the line.
_space_splitter = re.compile(r'\s+|\S+\s*')


def normalize(s, encoding, prefix, width):
    """Return *s* as a quoted, escaped .po string wrapped to *width* columns.

    This converts the various Python string types into a format that is
    appropriate for .po files, namely much closer to C style.

    *s* is the raw message text and *encoding* is passed through to
    escape().  *prefix* is the keyword the caller prints in front of the
    result ('msgid', 'msgctxt' or 'msgid_plural'); only its length is used,
    to decide whether a line still fits next to the keyword.  *width* is
    the maximum length of an output line, quotes included.

    The result is either a single '"..."' chunk, or the multi-line form: an
    empty '""' followed by one quoted chunk per output line.  A chunk that
    is longer than *width* on its own is never broken up, and a string made
    of a single such chunk stays in the single-line form.
    """
    # NOTE(review): str.splitlines() also breaks on '\r', '\v', '\f' and a
    # few other separators, whereas the previous implementation only split
    # on '\n' -- confirm that is intended.
    lines = []
    wrap = False
    for line in s.splitlines(True):
        escaped_line = escape(line, encoding)
        # +3 covers the space after the keyword and the two quotes.
        if len(escaped_line) + len(prefix) + 3 <= width:
            lines.append(escaped_line)
            continue

        wrap = True
        buf = []
        size = 2  # the two quotes surrounding every output chunk
        for word in _space_splitter.findall(line):
            escaped_word = escape(word, encoding)
            new_size = size + len(escaped_word)
            # `not buf` lets an over-long word through on a line of its own
            # instead of emitting an empty chunk.
            if new_size <= width or not buf:
                buf.append(escaped_word)
                size = new_size
            else:
                lines.append(''.join(buf))
                buf = [escaped_word]
                # Count the quotes here too; otherwise continuation lines
                # could grow two columns past `width`.
                size = 2 + len(escaped_word)
        lines.append(''.join(buf))

    # Keep the compact single-line form when nothing was actually split:
    # either no line needed wrapping, or the only line is one unbreakable
    # chunk.
    if len(lines) <= 1 and (not wrap or len(_space_splitter.findall(lines[0])) == 1):
        return f'"{escape(s, encoding)}"'
    return '""\n' + '\n'.join(f'"{line}"' for line in lines)


def containsAny(str, set):
Expand Down Expand Up @@ -618,10 +639,10 @@ def write_pot_file(messages, options, fp):
# to skip translating some unimportant docstrings.
print('#, docstring', file=fp)
if msg.msgctxt is not None:
print('msgctxt', normalize(msg.msgctxt, encoding), file=fp)
print('msgid', normalize(msg.msgid, encoding), file=fp)
print('msgctxt', normalize(msg.msgctxt, encoding, 'msgctxt', options.width), file=fp)
print('msgid', normalize(msg.msgid, encoding, 'msgid', options.width), file=fp)
if msg.msgid_plural is not None:
print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp)
print('msgid_plural', normalize(msg.msgid_plural, encoding, 'msgid_plural', options.width), file=fp)
print('msgstr[0] ""', file=fp)
print('msgstr[1] ""\n', file=fp)
else:
Expand Down
Loading