Add non_capturing_groups argument and extra input validation to pyparsing.util.make_compressed_re

ptmcg · ptmcg · commit 0f4c884e7c64 · 2025-03-21T00:30:52.000-05:00
diff --git a/CHANGES b/CHANGES
@@ -63,6 +63,11 @@ Version 3.2.2 - under development
   infinite parsing loop when parsing `rest_of_line` at the end of the input string.
   Reported by user Kylotan, thanks! (Issue #593)
 
+- Enhancements and extra input validation for `pyparsing.util.make_compressed_re` - see
+  its usage in `examples/complex_chemical_formulas.py` and the resulting regex in the railroad
+  diagram `examples/complex_chemical_formulas_diagram.html`. Now properly escapes characters
+  like "." and "*" that have special meaning in regular expressions.
+
 - Better exception message for `MatchFirst` and `Or` expressions, showing all alternatives
   rather than just the first one. Fixes Issue #592, reported by Focke, thanks!
 
diff --git a/examples/complex_chemical_formulas.py b/examples/complex_chemical_formulas.py
@@ -46,7 +46,8 @@
 #  - subscript_int - an integer made up of subscript digits
 #  (a normal integer definition uses the one defined in pyparsing.common)
 #
-element = pp.one_of(table_of_elements).set_name("element")
+# element = pp.one_of(table_of_elements).set_name("element")
+element = pp.Regex(pp.util.make_compressed_re(table_of_elements)).set_name("element")
 element.add_parse_action(lambda t: Counter([t[0]]))
 
 subscript_digits = "₀₁₂₃₄₅₆₇₈₉"
diff --git a/examples/complex_chemical_formulas_diagram.html b/examples/complex_chemical_formulas_diagram.html
@@ -404,11 +404,11 @@ <h1 class="railroad-heading" id="subscript-0006">subscript</h1>
         <h1 class="railroad-heading" id="element-0003">element</h1>
         <div class="railroad-description"></div>
         <div class="railroad-svg">
-            <svg class="railroad-diagram" height="62" viewBox="0 0 3001.5 62" width="3001.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+            <svg class="railroad-diagram" height="62" viewBox="0 0 1803.0 62" width="1803.0" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 <g transform="translate(.5 .5)">
 <g>
 <path d="M20 21v20m10 -20v20m-10 -10h20" /></g><path d="M40 31h10" /><g class="terminal ">
-<path d="M50 31h0.0" /><path d="M2951.5 31h0.0" /><rect height="22" rx="10" ry="10" width="2901.5" x="50" y="20"></rect><text x="1500.75" y="35">He|Ho|Hf|Hg|Hs|H|Li|Be|Br|Ba|Bi|Bk|Bh|B|Cl|Ca|Cr|Co|Cu|Cd|Cs|Ce|Cm|Cf|Cn|C|Ne|Na|Ni|Nb|Nd|Np|No|Nh|N|Os|Og|O|Fe|Fr|Fm|Fl|F|Mg|Al|Si|Pd|Pr|Pm|Pt|Pb|Po|Pa|Pu|P|Sc|Se|Sr|Sn|Sb|Sm|Sg|S|Ar|Kr|K|Ti|V|Mn|Zn|Ga|Ge|As|Rb|Yb|Y|Zr|Mo|Tc|Ru|Rh|Ag|In|Te|Ir|I|Xe|La|Eu|Gd|Tb|Dy|Er|Tm|Lu|Ta|W|Re|Au|Tl|At|Rn|Ra|Ac|Th|U|Am|Es|Md|Lr|Rf|Db|Mt|Ds|Rg|Mc|Lv|Ts</text></g><path d="M2951.5 31h10" /><path d="M 2961.5 31 h 20 m -10 -10 v 20 m 10 -20 v 20"></path></g><style>/* <![CDATA[ */
+<path d="M50 31h0.0" /><path d="M1753.0 31h0.0" /><rect height="22" rx="10" ry="10" width="1703" x="50" y="20"></rect><text x="901.5" y="35">A[cglmrstu]|B[aehikr]?|C[adeflmnorsu]?|D[bsy]|E[rsu]|F[elmr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airuv]|M[cdgnot]|N[abdehiop]?|O[gs]?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilms]|U|V|W|Xe|Yb?|Z[nr]</text></g><path d="M1753.0 31h10" /><path d="M 1763.0 31 h 20 m -10 -10 v 20 m 10 -20 v 20"></path></g><style>/* <![CDATA[ */
 	svg.railroad-diagram {
 		background-color:hsl(30,20%,95%);
 	}
diff --git a/pyparsing/__init__.py b/pyparsing/__init__.py
@@ -121,7 +121,7 @@ def __repr__(self):
 
 
 __version_info__ = version_info(3, 2, 2, "final", 1)
-__version_time__ = "18 Mar 2025 23:03 UTC"
+__version_time__ = "21 Mar 2025 05:28 UTC"
 __version__ = __version_info__.__version__
 __versionTime__ = __version_time__
 __author__ = "Paul McGuire <ptmcg.gm+pyparsing@gmail.com>"
diff --git a/pyparsing/util.py b/pyparsing/util.py
@@ -1,5 +1,6 @@
 # util.py
 import contextlib
+import re
 from functools import lru_cache, wraps
 import inspect
 import itertools
@@ -303,7 +304,11 @@ def _flatten(ll: Iterable) -> list:
 
 
 def make_compressed_re(
-    word_list: Iterable[str], max_level: int = 2, _level: int = 1
+    word_list: Iterable[str],
+    max_level: int = 2,
+    *,
+    non_capturing_groups: bool = True,
+    _level: int = 1,
 ) -> str:
     """
     Create a regular expression string from a list of words, collapsing by common
@@ -320,37 +325,72 @@ def get_suffixes_from_common_prefixes(namelist: list[str]):
         else:
             yield namelist[0][0], [namelist[0][1:]]
 
+    if _level == 1:
+        if not word_list:
+            raise ValueError("no words given to make_compressed_re()")
+
+        if "" in word_list:
+            raise ValueError("word list cannot contain empty string")
+    else:
+        # internal recursive call, just return empty string if no words
+        if not word_list:
+            return ""
+
+    # dedupe the word list
+    word_list = list({}.fromkeys(word_list))
+
     if max_level == 0:
-        return "|".join(sorted(word_list, key=len, reverse=True))
+        if any(len(wd) > 1 for wd in word_list):
+            return "|".join(
+                sorted([re.escape(wd) for wd in word_list], key=len, reverse=True)
+            )
+        else:
+            return f"[{''.join(_escape_regex_range_chars(wd) for wd in word_list)}]"
 
     ret = []
     sep = ""
+    ncgroup = "?:" if non_capturing_groups else ""
+
     for initial, suffixes in get_suffixes_from_common_prefixes(sorted(word_list)):
         ret.append(sep)
         sep = "|"
 
+        initial = re.escape(initial)
+
         trailing = ""
         if "" in suffixes:
             trailing = "?"
             suffixes.remove("")
 
         if len(suffixes) > 1:
             if all(len(s) == 1 for s in suffixes):
-                ret.append(f"{initial}[{''.join(suffixes)}]{trailing}")
+                ret.append(
+                    f"{initial}[{''.join(_escape_regex_range_chars(s) for s in suffixes)}]{trailing}"
+                )
             else:
                 if _level < max_level:
                     suffix_re = make_compressed_re(
-                        sorted(suffixes), max_level, _level + 1
+                        sorted(suffixes),
+                        max_level,
+                        non_capturing_groups=non_capturing_groups,
+                        _level=_level + 1,
                     )
-                    ret.append(f"{initial}({suffix_re}){trailing}")
+                    ret.append(f"{initial}({ncgroup}{suffix_re}){trailing}")
                 else:
-                    suffixes.sort(key=len, reverse=True)
-                    ret.append(f"{initial}({'|'.join(suffixes)}){trailing}")
+                    if all(len(s) == 1 for s in suffixes):
+                        ret.append(
+                            f"{initial}[{''.join(_escape_regex_range_chars(s) for s in suffixes)}]{trailing}"
+                        )
+                    else:
+                        suffixes.sort(key=len, reverse=True)
+                        ret.append(
+                            f"{initial}({ncgroup}{'|'.join(re.escape(s) for s in suffixes)}){trailing}"
+                        )
         else:
             if suffixes:
-                suffix = suffixes[0]
+                suffix = re.escape(suffixes[0])
                 if len(suffix) > 1 and trailing:
-                    ret.append(f"{initial}({suffix}){trailing}")
+                    ret.append(f"{initial}({ncgroup}{suffix}){trailing}")
                 else:
                     ret.append(f"{initial}{suffix}{trailing}")
             else:
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -195,3 +195,50 @@ def test_make_compressed_re() -> None:
         print(i, make_compressed_re(words, max_level=i))
         regex = re.compile(make_compressed_re(words, max_level=i) + "$")
         assert all(regex.match(wd) for wd in words)
+
+def test_make_compressed_re_bad_input():
+    from pyparsing.util import make_compressed_re
+
+    with pytest.raises(ValueError):
+        make_compressed_re([])
+
+    with pytest.raises(ValueError):
+        make_compressed_re(["a", "", "b", "c"])
+
+    # handle duplicate input strings
+    assert make_compressed_re(["a", "b", "c"]) == make_compressed_re(["a", "b", "c", "a"])
+
+
+def test_make_compressed_re_random():
+    import itertools
+    import re
+    from pyparsing.util import make_compressed_re
+
+    def generate_random_word(max_length: int) -> str:
+        import random
+        import string
+        length = random.randint(1, max_length)
+        return ''.join(random.choice(string.ascii_lowercase + ".*? ") for _ in range(length))
+
+    def generate_word_lists(num_lists: int, num_words: int, word_length: int) -> Iterable[list[str]]:
+        yield from (
+            [generate_random_word(word_length) for _ in range(num_words)]
+            for _ in range(num_lists)
+        )
+
+    for word_length, list_length in itertools.product(range(3, 9), range(1, 32)):
+        for word_list in generate_word_lists(100, list_length, word_length):
+            regex_pattern = make_compressed_re(word_list)
+            try:
+                regex = re.compile(f"^({regex_pattern})$")
+            except Exception as e:
+                assert False, f"Failed to compile {word_list} to regex pattern {regex_pattern!r}: {e}"
+
+            for word in word_list:
+                assert regex.match(word), f"Regex {regex_pattern!r} did not match word: {word}"
+
+            # Check that the regex does not match a random word not in the list
+            random_word = generate_random_word(word_length)
+            while random_word in word_list:
+                random_word = generate_random_word(word_length)
+            assert regex.match(random_word) is None, f"Regex {regex_pattern!r} incorrectly matched word: {random_word!r}"

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,8 @@`
`46`	`46`	`# - subscript_int - an integer made up of subscript digits`
`47`	`47`	`# (a normal integer definition uses the one defined in pyparsing.common)`
`48`	`48`	`#`
`49`		`-element = pp.one_of(table_of_elements).set_name("element")`
	`49`	`+# element = pp.one_of(table_of_elements).set_name("element")`
	`50`	`+element = pp.Regex(pp.util.make_compressed_re(table_of_elements)).set_name("element")`
`50`	`51`	`element.add_parse_action(lambda t: Counter([t[0]]))`
`51`	`52`
`52`	`53`	`subscript_digits = "₀₁₂₃₄₅₆₇₈₉"`