Skip to content

Commit 0f4c884

Browse files
committed
Add non_capturing_groups argument and extra input validation to pyparsing.util.make_compressed_re
1 parent 67d3078 commit 0f4c884

File tree

6 files changed

+106
-13
lines changed

6 files changed

+106
-13
lines changed

CHANGES

+5
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ Version 3.2.2 - under development
6363
infinite parsing loop when parsing `rest_of_line` at the end of the input string.
6464
Reported by user Kylotan, thanks! (Issue #593)
6565

66+
- Enhancements and extra input validation for `pyparsing.util.make_compressed_re` - see
67+
usage in `examples/complex_chemical_formulas.py` and result in the generated railroad
68+
diagram `examples/complex_chemical_formulas_diagram.html`. Properly escapes characters
69+
like "." and "*" that have special meaning in regular expressions.
70+
6671
- Better exception message for `MatchFirst` and `Or` expressions, showing all alternatives
6772
rather than just the first one. Fixes Issue #592, reported by Focke, thanks!
6873

examples/complex_chemical_formulas.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
# - subscript_int - an integer made up of subscript digits
4747
# (a normal integer definition uses the one defined in pyparsing.common)
4848
#
49-
element = pp.one_of(table_of_elements).set_name("element")
49+
# element = pp.one_of(table_of_elements).set_name("element")
50+
element = pp.Regex(pp.util.make_compressed_re(table_of_elements)).set_name("element")
5051
element.add_parse_action(lambda t: Counter([t[0]]))
5152

5253
subscript_digits = "₀₁₂₃₄₅₆₇₈₉"

examples/complex_chemical_formulas_diagram.html

+2-2
Original file line numberDiff line numberDiff line change
@@ -404,11 +404,11 @@ <h1 class="railroad-heading" id="subscript-0006">subscript</h1>
404404
<h1 class="railroad-heading" id="element-0003">element</h1>
405405
<div class="railroad-description"></div>
406406
<div class="railroad-svg">
407-
<svg class="railroad-diagram" height="62" viewBox="0 0 3001.5 62" width="3001.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
407+
<svg class="railroad-diagram" height="62" viewBox="0 0 1803.0 62" width="1803.0" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
408408
<g transform="translate(.5 .5)">
409409
<g>
410410
<path d="M20 21v20m10 -20v20m-10 -10h20" /></g><path d="M40 31h10" /><g class="terminal ">
411-
<path d="M50 31h0.0" /><path d="M2951.5 31h0.0" /><rect height="22" rx="10" ry="10" width="2901.5" x="50" y="20"></rect><text x="1500.75" y="35">He|Ho|Hf|Hg|Hs|H|Li|Be|Br|Ba|Bi|Bk|Bh|B|Cl|Ca|Cr|Co|Cu|Cd|Cs|Ce|Cm|Cf|Cn|C|Ne|Na|Ni|Nb|Nd|Np|No|Nh|N|Os|Og|O|Fe|Fr|Fm|Fl|F|Mg|Al|Si|Pd|Pr|Pm|Pt|Pb|Po|Pa|Pu|P|Sc|Se|Sr|Sn|Sb|Sm|Sg|S|Ar|Kr|K|Ti|V|Mn|Zn|Ga|Ge|As|Rb|Yb|Y|Zr|Mo|Tc|Ru|Rh|Ag|In|Te|Ir|I|Xe|La|Eu|Gd|Tb|Dy|Er|Tm|Lu|Ta|W|Re|Au|Tl|At|Rn|Ra|Ac|Th|U|Am|Es|Md|Lr|Rf|Db|Mt|Ds|Rg|Mc|Lv|Ts</text></g><path d="M2951.5 31h10" /><path d="M 2961.5 31 h 20 m -10 -10 v 20 m 10 -20 v 20"></path></g><style>/* <![CDATA[ */
411+
<path d="M50 31h0.0" /><path d="M1753.0 31h0.0" /><rect height="22" rx="10" ry="10" width="1703" x="50" y="20"></rect><text x="901.5" y="35">A[cglmrstu]|B[aehikr]?|C[adeflmnorsu]?|D[bsy]|E[rsu]|F[elmr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airuv]|M[cdgnot]|N[abdehiop]?|O[gs]?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilms]|U|V|W|Xe|Yb?|Z[nr]</text></g><path d="M1753.0 31h10" /><path d="M 1763.0 31 h 20 m -10 -10 v 20 m 10 -20 v 20"></path></g><style>/* <![CDATA[ */
412412
svg.railroad-diagram {
413413
background-color:hsl(30,20%,95%);
414414
}

pyparsing/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def __repr__(self):
121121

122122

123123
__version_info__ = version_info(3, 2, 2, "final", 1)
124-
__version_time__ = "18 Mar 2025 23:03 UTC"
124+
__version_time__ = "21 Mar 2025 05:28 UTC"
125125
__version__ = __version_info__.__version__
126126
__versionTime__ = __version_time__
127127
__author__ = "Paul McGuire <[email protected]>"

pyparsing/util.py

+49-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# util.py
22
import contextlib
3+
import re
34
from functools import lru_cache, wraps
45
import inspect
56
import itertools
@@ -303,7 +304,11 @@ def _flatten(ll: Iterable) -> list:
303304

304305

305306
def make_compressed_re(
306-
word_list: Iterable[str], max_level: int = 2, _level: int = 1
307+
word_list: Iterable[str],
308+
max_level: int = 2,
309+
*,
310+
non_capturing_groups: bool = True,
311+
_level: int = 1,
307312
) -> str:
308313
"""
309314
Create a regular expression string from a list of words, collapsing by common
@@ -320,37 +325,72 @@ def get_suffixes_from_common_prefixes(namelist: list[str]):
320325
else:
321326
yield namelist[0][0], [namelist[0][1:]]
322327

328+
if _level == 1:
329+
if not word_list:
330+
raise ValueError("no words given to make_compressed_re()")
331+
332+
if "" in word_list:
333+
raise ValueError("word list cannot contain empty string")
334+
else:
335+
# internal recursive call, just return empty string if no words
336+
if not word_list:
337+
return ""
338+
339+
# dedupe the word list
340+
word_list = list({}.fromkeys(word_list))
341+
323342
if max_level == 0:
324-
return "|".join(sorted(word_list, key=len, reverse=True))
343+
if any(len(wd) > 1 for wd in word_list):
344+
return "|".join(
345+
sorted([re.escape(wd) for wd in word_list], key=len, reverse=True)
346+
)
347+
else:
348+
return f"[{''.join(_escape_regex_range_chars(wd) for wd in word_list)}]"
325349

326350
ret = []
327351
sep = ""
352+
ncgroup = "?:" if non_capturing_groups else ""
353+
328354
for initial, suffixes in get_suffixes_from_common_prefixes(sorted(word_list)):
329355
ret.append(sep)
330356
sep = "|"
331357

358+
initial = re.escape(initial)
359+
332360
trailing = ""
333361
if "" in suffixes:
334362
trailing = "?"
335363
suffixes.remove("")
336364

337365
if len(suffixes) > 1:
338366
if all(len(s) == 1 for s in suffixes):
339-
ret.append(f"{initial}[{''.join(suffixes)}]{trailing}")
367+
ret.append(
368+
f"{initial}[{''.join(_escape_regex_range_chars(s) for s in suffixes)}]{trailing}"
369+
)
340370
else:
341371
if _level < max_level:
342372
suffix_re = make_compressed_re(
343-
sorted(suffixes), max_level, _level + 1
373+
sorted(suffixes),
374+
max_level,
375+
non_capturing_groups=non_capturing_groups,
376+
_level=_level + 1,
344377
)
345-
ret.append(f"{initial}({suffix_re}){trailing}")
378+
ret.append(f"{initial}({ncgroup}{suffix_re}){trailing}")
346379
else:
347-
suffixes.sort(key=len, reverse=True)
348-
ret.append(f"{initial}({'|'.join(suffixes)}){trailing}")
380+
if all(len(s) == 1 for s in suffixes):
381+
ret.append(
382+
f"{initial}[{''.join(_escape_regex_range_chars(s) for s in suffixes)}]{trailing}"
383+
)
384+
else:
385+
suffixes.sort(key=len, reverse=True)
386+
ret.append(
387+
f"{initial}({ncgroup}{'|'.join(re.escape(s) for s in suffixes)}){trailing}"
388+
)
349389
else:
350390
if suffixes:
351-
suffix = suffixes[0]
391+
suffix = re.escape(suffixes[0])
352392
if len(suffix) > 1 and trailing:
353-
ret.append(f"{initial}({suffix}){trailing}")
393+
ret.append(f"{initial}({ncgroup}{suffix}){trailing}")
354394
else:
355395
ret.append(f"{initial}{suffix}{trailing}")
356396
else:

tests/test_util.py

+47
Original file line numberDiff line numberDiff line change
@@ -195,3 +195,50 @@ def test_make_compressed_re() -> None:
195195
print(i, make_compressed_re(words, max_level=i))
196196
regex = re.compile(make_compressed_re(words, max_level=i) + "$")
197197
assert all(regex.match(wd) for wd in words)
198+
199+
def test_make_compressed_re_bad_input():
200+
from pyparsing.util import make_compressed_re
201+
202+
with pytest.raises(ValueError):
203+
make_compressed_re([])
204+
205+
with pytest.raises(ValueError):
206+
make_compressed_re(["a", "", "b", "c"])
207+
208+
# handle duplicate input strings
209+
assert make_compressed_re(["a", "b", "c"]) == make_compressed_re(["a", "b", "c", "a"])
210+
211+
212+
def test_make_compressed_re_random():
213+
import itertools
214+
import re
215+
from pyparsing.util import make_compressed_re
216+
217+
def generate_random_word(max_length: int) -> str:
218+
import random
219+
import string
220+
length = random.randint(1, max_length)
221+
return ''.join(random.choice(string.ascii_lowercase + ".*? ") for _ in range(length))
222+
223+
def generate_word_lists(num_lists: int, num_words: int, word_length: int) -> Iterable[list[str]]:
224+
yield from (
225+
[generate_random_word(word_length) for _ in range(num_words)]
226+
for _ in range(num_lists)
227+
)
228+
229+
for word_length, list_length in itertools.product(range(3, 9), range(1, 32)):
230+
for word_list in generate_word_lists(100, list_length, word_length):
231+
regex_pattern = make_compressed_re(word_list)
232+
try:
233+
regex = re.compile(f"^({regex_pattern})$")
234+
except Exception as e:
235+
assert False, f"Failed to compile {word_list} to regex pattern {regex_pattern!r}: {e}"
236+
237+
for word in word_list:
238+
assert regex.match(word), f"Regex {regex_pattern!r} did not match word: {word}"
239+
240+
# Check that the regex does not match a random word not in the list
241+
random_word = generate_random_word(word_length)
242+
while random_word in word_list:
243+
random_word = generate_random_word(word_length)
244+
assert regex.match(random_word) is None, f"Regex {regex_pattern!r} incorrectly matched word: {random_word!r}"

0 commit comments

Comments
 (0)