CLDR-18712 Don't include grammar for new units (#4784)

macchiati · web-flow · commit 5025948e356b · 2025-06-04T15:30:30.000-07:00
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/GrammarInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/GrammarInfo.java
@@ -758,8 +758,29 @@ public static Set<String> getGrammarLocales() {
                     "knot", // US/UK specific
                     "astronomical-unit", // specialized
                     "dalton", // specialized
-                    "electronvolt" // specialized
-                    );
+                    "electronvolt", // specialized
+
+                    // specialized
+                    "g-force",
+                    "steradian",
+                    "katal",
+                    "ofglucose",
+                    "part",
+                    "coulomb",
+                    "farad",
+                    "henry",
+                    "siemens",
+                    "becquerel",
+                    "calorie-it",
+                    "gray",
+                    "sievert",
+                    "kilogram-force",
+                    "em",
+                    "tesla",
+                    "weber",
+                    "ofhg",
+                    "light-speed",
+                    "fluid-ounce-metric");
 
     public static Set<String> getSpecialsToTranslate() {
         return INCLUDE_OTHER;
@@ -770,11 +791,12 @@ public static Set<String> getSpecialsToTranslate() {
     /** Internal class for thread-safety */
     static class UnitsToAddGrammar {
         static final Set<String> data;
+        static final Set<String> skipped;
 
         static {
             final CLDRConfig config = CLDRConfig.getInstance();
             final UnitConverter converter = config.getSupplementalDataInfo().getUnitConverter();
-            Set<String> missing = new TreeSet<>();
+            Set<String> _skipped = new TreeSet<>();
             Set<String> _data = new TreeSet<>();
             for (String path :
                     With.in(
@@ -784,22 +806,31 @@ static class UnitsToAddGrammar {
                 String unit = parts.getAttributeValue(3, "type");
                 // Add simple units
                 String shortUnit = converter.getShortId(unit);
+
                 if (INCLUDE_OTHER.contains(shortUnit)) {
                     _data.add(unit);
                     continue;
                 }
-                if (!EXCLUDE_GRAMMAR.contains(shortUnit)) {
-                    Set<UnitSystem> systems = converter.getSystemsEnum(shortUnit);
-                    // we now add all SI and metric and si_acceptable and metric_adjacent
-                    if (!Collections.disjoint(systems, UnitSystem.SiOrMetric)) {
-                        _data.add(unit);
-                        continue;
-                    }
+
+                if (EXCLUDE_GRAMMAR.contains(shortUnit)) {
+                    _skipped.add(unit);
+                    continue;
                 }
-                missing.add(unit);
+
+                // we now add all SI and metric and si_acceptable and metric_adjacent
+
+                Set<UnitSystem> systems = converter.getSystemsEnum(shortUnit);
+                if (!Collections.disjoint(systems, UnitSystem.SiOrMetric)) {
+                    _data.add(unit);
+                    continue;
+                }
+
+                // and skip the rest
+
+                _skipped.add(unit);
             }
             if (DEBUG)
-                for (String unit : missing) {
+                for (String unit : _skipped) {
                     String shortUnit = converter.getShortId(unit);
                     System.out.println(
                             "*Skipping\t"
@@ -812,11 +843,17 @@ static class UnitsToAddGrammar {
                                     + (converter.isSimple(shortUnit) ? "SIMPLE" : ""));
                 }
             data = ImmutableSet.copyOf(_data);
+            skipped = ImmutableSet.copyOf(_skipped);
         }
     }
 
     /** Return the units that we should get grammar information for. */
     public static Set<String> getUnitsToAddGrammar() {
         return UnitsToAddGrammar.data;
     }
+
+    /** Return the units that we should skip when gathering grammar information. */
+    public static Set<String> getUnitsToSkipGrammar() {
+        return UnitsToAddGrammar.skipped;
+    }
 }
diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java
@@ -4758,4 +4758,113 @@ private Multimap<Level, String> getCoverage(String locale, String xpathPrefix) {
         }
         return result;
     }
+
+    // for ALL units, should have paths for each unit X
+    // ldml/units/unitLength[@type="long"]/unit[@type="X"]/gender
+    // ldml/units/unitLength[@type="long"]/unit[@type="X"]/displayName
+    // ldml/units/unitLength[@type="short"]/unit[@type="X"]/displayName
+    // ldml/units/unitLength[@type="narrow"]/unit[@type="X"]/displayName
+    // ldml/units/unitLength[@type="long"]/unit[@type="X"]/unitPattern[@count="other"]
+
+    // will also have prefix units (long, narrow, short) (all prefixes)
+    // ldml/units/unitLength[@type="long"]/compoundUnit[@type="10p-1"]/unitPrefixPattern
+
+    // will also have per & times units (long, narrow, short)
+    // ldml/units/unitLength[@type="long"]/compoundUnit[@type="per"]/compoundUnitPattern
+    // ldml/units/unitLength[@type="long"]/compoundUnit[@type="times"]/compoundUnitPattern
+
+    // will also have long power2/power3 in all available plurals/case
+    // ldml/units/unitLength[@type="long"]/compoundUnit[@type="power2"]/compoundUnitPattern1[@count="one"][@gender="feminine"]
+    // ldml/units/unitLength[@type="long"]/compoundUnit[@type="power3"]/compoundUnitPattern1[@count="one"][@gender="feminine"]
+
+    // and short/narrow power2/3, but only for plurals, no case
+    // ldml/units/unitLength[@type="short"]/compoundUnit[@type="power2"]/compoundUnitPattern1[@count="one"]
+    // ldml/units/unitLength[@type="narrow"]/compoundUnit[@type="power2"]/compoundUnitPattern1[@count="one"]
+
+    // For ALL units, if there are plurals for the locale, will have other counts
+    // ldml/units/unitLength[@type="long"]/unit[@type="X"]/unitPattern[@count="Y"]
+
+    // For CORE units, if there is grammar for the locale, will have other cases, eg
+    // ldml/units/unitLength[@type="long"]/unit[@type="volume-cubic-meter"]/unitPattern[@count="one"][@case="dative"]
+
+    // For non-CORE units, even if there is grammar for the locale, we won't have paths
+    // ldml/units/unitLength[@type="long"]/unit[@type="volume-fluid-ounce-metric"]/unitPattern[@count="one"][@case="dative"]
+
+    private enum GrammarStatus {
+        always, // units that should always get grammar
+        never // units that should never get grammar
+    }
+
+    public void testSkippedUnitsForGrammer() {
+        // Verifies GrammarInfo.getUnitsToAddGrammar() against hand-picked long unit IDs.
+        // Note: the list for 'never' also includes the enOrJaOnly units (see above in this file).
+
+        final Multimap<GrammarStatus, String> statusToLongUnit =
+                ImmutableMultimap.<GrammarStatus, String>builder()
+                        .putAll(
+                                GrammarStatus.never,
+                                "angle-steradian",
+                                "area-bu-jp",
+                                "area-cho",
+                                "area-se-jp",
+                                "concentr-katal",
+                                "concentr-ofglucose",
+                                "concentr-part",
+                                "concentr-portion",
+                                "duration-fortnight",
+                                "electric-coulomb",
+                                "electric-farad",
+                                "electric-henry",
+                                "electric-siemens",
+                                "energy-becquerel",
+                                "energy-british-thermal-unit-it",
+                                "energy-calorie-it",
+                                "energy-gray",
+                                "energy-sievert",
+                                "force-kilogram-force",
+                                "length-chain",
+                                "length-jo-jp",
+                                "length-ken",
+                                "length-ri-jp",
+                                "length-rin",
+                                "length-rod",
+                                "length-shaku-cloth",
+                                "length-shaku-length",
+                                "length-sun",
+                                "magnetic-tesla",
+                                "magnetic-weber",
+                                "mass-fun",
+                                "mass-slug",
+                                "pressure-ofhg",
+                                "speed-light-speed",
+                                "temperature-rankine",
+                                "volume-cup-imperial",
+                                "volume-cup-jp",
+                                "volume-fluid-ounce-metric",
+                                "volume-koku",
+                                "volume-kosaji",
+                                "volume-osaji",
+                                "volume-pint-imperial",
+                                "volume-sai",
+                                "volume-shaku",
+                                "volume-to-jp")
+                        .putAll(GrammarStatus.always, "length-meter")
+                        .build();
+
+        Set<String> unitsToAdd = GrammarInfo.getUnitsToAddGrammar();
+        // 'never' units must not be offered for grammar, so the overlap has to be empty.
+        assertEquals(
+                "Should be missing",
+                Set.of(),
+                Sets.intersection(
+                        new TreeSet<String>(statusToLongUnit.get(GrammarStatus.never)),
+                        unitsToAdd));
+        // 'always' units must all be offered, so none may be left outside unitsToAdd.
+        assertEquals(
+                "Should be present always",
+                Set.of(),
+                Sets.difference(
+                        new TreeSet<String>(statusToLongUnit.get(GrammarStatus.always)),
+                        unitsToAdd));
+    }
 }