Skip to content

Commit 65e8c4f

Browse files
committed
Unified the style of regular expression constants to PCRE. Fixes jpmml/sklearn2pmml#228
1 parent 86be03d commit 65e8c4f

File tree

7 files changed

+266
-8
lines changed

7 files changed

+266
-8
lines changed

pmml-sklearn/src/main/java/sklearn2pmml/preprocessing/MatchesTransformer.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535

3636
public class MatchesTransformer extends RegExTransformer {
3737

38+
/**
 * Creates a transformer by delegating to the two-argument constructor with
 * the module {@code "sklearn2pmml.preprocessing"} and the name {@code "MatchesTransformer"}.
 */
public MatchesTransformer(){
	this("sklearn2pmml.preprocessing", "MatchesTransformer");
}
41+
3842
/**
 * Creates a transformer for the given module and class name.
 * Both arguments are handed to the superclass constructor unchanged.
 *
 * @param module The module name.
 * @param name The class name.
 */
public MatchesTransformer(String module, String name){
	super(module, name);
}
@@ -66,4 +70,14 @@ public List<Feature> encodeFeatures(List<Feature> features, SkLearnEncoder encod
6670

6771
return Collections.singletonList(new BooleanFeature(encoder, derivedField));
6872
}
73+
74+
@Override
75+
MatchesTransformer setPattern(String pattern){
76+
return (MatchesTransformer)super.setPattern(pattern);
77+
}
78+
79+
@Override
80+
MatchesTransformer setReFlavour(String reFlavour){
81+
return (MatchesTransformer)super.setReFlavour(reFlavour);
82+
}
6983
}

pmml-sklearn/src/main/java/sklearn2pmml/preprocessing/RegExTransformer.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,22 @@ public String getPattern(){
4545
return getString("pattern");
4646
}
4747

48+
RegExTransformer setPattern(String pattern){
49+
setattr("pattern", pattern);
50+
51+
return this;
52+
}
53+
4854
public String getReFlavour(){
4955
return getOptionalEnum("re_flavour", this::getOptionalString, Arrays.asList(RegExTransformer.RE_FLAVOUR_PCRE, RegExTransformer.RE_FLAVOUR_PCRE2, RegExTransformer.RE_FLAVOUR_RE));
5056
}
5157

58+
RegExTransformer setReFlavour(String reFlavour){
59+
setattr("re_flavour", reFlavour);
60+
61+
return this;
62+
}
63+
5264
static
5365
public String translatePattern(String pattern, String reFlavour){
5466

pmml-sklearn/src/main/java/sklearn2pmml/preprocessing/ReplaceTransformer.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535

3636
public class ReplaceTransformer extends RegExTransformer {
3737

38+
/**
 * Creates a transformer by delegating to the two-argument constructor with
 * the module {@code "sklearn2pmml.preprocessing"} and the name {@code "ReplaceTransformer"}.
 */
public ReplaceTransformer(){
	this("sklearn2pmml.preprocessing", "ReplaceTransformer");
}
41+
3842
/**
 * Creates a transformer for the given module and class name.
 * Both arguments are handed to the superclass constructor unchanged.
 *
 * @param module The module name.
 * @param name The class name.
 */
public ReplaceTransformer(String module, String name){
	super(module, name);
}
@@ -43,8 +47,8 @@ public ReplaceTransformer(String module, String name){
4347
public List<Feature> encodeFeatures(List<Feature> features, SkLearnEncoder encoder){
4448
String pattern = getPattern();
4549
String replacement = getReplacement();
46-
4750
String reFlavour = getReFlavour();
51+
4852
if(reFlavour != null){
4953
pattern = translatePattern(pattern, reFlavour);
5054
replacement = translateReplacement(replacement, reFlavour);
@@ -69,18 +73,37 @@ public List<Feature> encodeFeatures(List<Feature> features, SkLearnEncoder encod
6973
return Collections.singletonList(new StringFeature(encoder, derivedField));
7074
}
7175

76+
@Override
77+
ReplaceTransformer setPattern(String pattern){
78+
return (ReplaceTransformer)super.setPattern(pattern);
79+
}
80+
81+
@Override
82+
ReplaceTransformer setReFlavour(String reFlavour){
83+
return (ReplaceTransformer)super.setReFlavour(reFlavour);
84+
}
85+
7286
public String getReplacement(){
7387
return getString("replacement");
7488
}
7589

90+
ReplaceTransformer setReplacement(String replacement){
91+
setattr("replacement", replacement);
92+
93+
return this;
94+
}
95+
7696
static
7797
public String translateReplacement(String replacement, String reFlavour){
7898

7999
switch(reFlavour){
80100
case RegExTransformer.RE_FLAVOUR_PCRE:
81101
case RegExTransformer.RE_FLAVOUR_PCRE2:
82-
case RegExTransformer.RE_FLAVOUR_RE:
83102
return replacement;
103+
case RegExTransformer.RE_FLAVOUR_RE:
104+
return replacement
105+
.replaceAll("\\$", "\\$\\$")
106+
.replaceAll("\\\\(\\d)", "\\$$1");
84107
default:
85108
throw new IllegalArgumentException(reFlavour);
86109
}

pmml-sklearn/src/test/java/sklearn2pmml/preprocessing/ExpressionTransformerTest.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,6 @@ public void encode(){
100100

101101
static
102102
private Expression encode(String expr, Object mapMissingTo, Object defaultValue, String invalidValueTreatment){
103-
SkLearnEncoder encoder = new SkLearnEncoder();
104-
105-
DataField dataField = encoder.createDataField("x", OpType.CONTINUOUS, DataType.DOUBLE);
106-
107-
Feature inputFeature = new WildcardFeature(encoder, dataField);
108-
109103
ExpressionTransformer expressionTransformer = new ExpressionTransformer();
110104

111105
expressionTransformer
@@ -114,6 +108,17 @@ private Expression encode(String expr, Object mapMissingTo, Object defaultValue,
114108
.setDefaultValue(defaultValue)
115109
.setInvalidValueTreatment(invalidValueTreatment);
116110

111+
return encode(expressionTransformer);
112+
}
113+
114+
static
115+
private Expression encode(ExpressionTransformer expressionTransformer){
116+
SkLearnEncoder encoder = new SkLearnEncoder();
117+
118+
DataField dataField = encoder.createDataField("x", OpType.CONTINUOUS, DataType.DOUBLE);
119+
120+
Feature inputFeature = new WildcardFeature(encoder, dataField);
121+
117122
List<Feature> outputFeatures = expressionTransformer.encode(Collections.singletonList(inputFeature), encoder);
118123

119124
Feature outputFeature = Iterables.getOnlyElement(outputFeatures);
pmml-sklearn/src/test/java/sklearn2pmml/preprocessing/MatchesTransformerTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Copyright (c) 2024 Villu Ruusmann
3+
*
4+
* This file is part of JPMML-SkLearn
5+
*
6+
* JPMML-SkLearn is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU Affero General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* JPMML-SkLearn is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU Affero General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Affero General Public License
17+
* along with JPMML-SkLearn. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
package sklearn2pmml.preprocessing;
20+
21+
import org.dmg.pmml.Apply;
22+
import org.junit.Test;
23+
24+
import static org.junit.Assert.assertFalse;
25+
import static org.junit.Assert.assertTrue;
26+
27+
public class MatchesTransformerTest extends RegExTransformerTest {
28+
29+
@Test
30+
public void matches(){
31+
MatchesTransformer matchesTransformer = new MatchesTransformer()
32+
.setPattern("ar?y")
33+
.setReFlavour(RegExTransformer.RE_FLAVOUR_RE);
34+
35+
Apply apply = encode(matchesTransformer);
36+
37+
assertTrue((Boolean)evaluate(apply, "January"));
38+
assertFalse((Boolean)evaluate(apply, "March"));
39+
assertTrue((Boolean)evaluate(apply, "May"));
40+
41+
matchesTransformer = matchesTransformer
42+
.setPattern("r$");
43+
44+
apply = encode(matchesTransformer);
45+
46+
assertFalse((Boolean)evaluate(apply, "March"));
47+
assertTrue((Boolean)evaluate(apply, "October"));
48+
}
49+
}
pmml-sklearn/src/test/java/sklearn2pmml/preprocessing/RegExTransformerTest.java

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Copyright (c) 2024 Villu Ruusmann
3+
*
4+
* This file is part of JPMML-SkLearn
5+
*
6+
* JPMML-SkLearn is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU Affero General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* JPMML-SkLearn is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU Affero General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Affero General Public License
17+
* along with JPMML-SkLearn. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
package sklearn2pmml.preprocessing;
20+
21+
import java.util.Collections;
22+
import java.util.List;
23+
import java.util.Objects;
24+
25+
import com.google.common.collect.Iterables;
26+
import org.dmg.pmml.Apply;
27+
import org.dmg.pmml.Constant;
28+
import org.dmg.pmml.DataField;
29+
import org.dmg.pmml.DataType;
30+
import org.dmg.pmml.DerivedField;
31+
import org.dmg.pmml.Expression;
32+
import org.dmg.pmml.OpType;
33+
import org.dmg.pmml.PMMLFunctions;
34+
import org.jpmml.converter.Feature;
35+
import org.jpmml.converter.WildcardFeature;
36+
import org.jpmml.evaluator.EvaluationContext;
37+
import org.jpmml.evaluator.ExpressionUtil;
38+
import org.jpmml.evaluator.FieldValue;
39+
import org.jpmml.evaluator.FieldValueUtil;
40+
import org.jpmml.evaluator.VirtualEvaluationContext;
41+
import org.jpmml.sklearn.SkLearnEncoder;
42+
43+
abstract
44+
public class RegExTransformerTest {
45+
46+
static
47+
Object evaluate(Expression expression, String string){
48+
Apply apply = (Apply)expression;
49+
50+
// XXX
51+
if(Objects.equals(PMMLFunctions.REPLACE, apply.requireFunction())){
52+
List<Expression> expressions = apply.getExpressions();
53+
54+
Constant replacementConstant = (Constant)expressions.get(2);
55+
56+
String replacement = (String)replacementConstant.getValue();
57+
58+
// Replace PCRE-style dollar literal with Java-style dollar literal
59+
replacement = replacement.replace("$$", "\\$");
60+
61+
replacementConstant.setValue(replacement);
62+
}
63+
64+
EvaluationContext content = new VirtualEvaluationContext();
65+
content.declare("x", string);
66+
67+
FieldValue value = ExpressionUtil.evaluate(expression, content);
68+
69+
return FieldValueUtil.getValue(value);
70+
}
71+
72+
static
73+
Apply encode(RegExTransformer regExTransformer){
74+
SkLearnEncoder encoder = new SkLearnEncoder();
75+
76+
DataField dataField = encoder.createDataField("x", OpType.CATEGORICAL, DataType.STRING);
77+
78+
Feature inputFeature = new WildcardFeature(encoder, dataField);
79+
80+
List<Feature> outputFeatures = regExTransformer.encode(Collections.singletonList(inputFeature), encoder);
81+
82+
Feature outputFeature = Iterables.getOnlyElement(outputFeatures);
83+
84+
DerivedField derivedField = (DerivedField)outputFeature.getField();
85+
86+
return (Apply)derivedField.getExpression();
87+
}
88+
}
pmml-sklearn/src/test/java/sklearn2pmml/preprocessing/ReplaceTransformerTest.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (c) 2024 Villu Ruusmann
3+
*
4+
* This file is part of JPMML-SkLearn
5+
*
6+
* JPMML-SkLearn is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU Affero General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* JPMML-SkLearn is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU Affero General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Affero General Public License
17+
* along with JPMML-SkLearn. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
package sklearn2pmml.preprocessing;
20+
21+
import org.dmg.pmml.Apply;
22+
import org.junit.Test;
23+
24+
import static org.junit.Assert.assertEquals;
25+
26+
public class ReplaceTransformerTest extends RegExTransformerTest {
27+
28+
@Test
29+
public void replace(){
30+
ReplaceTransformer replaceTransformer = new ReplaceTransformer()
31+
.setPattern("(\\w)")
32+
.setReplacement("$1 ")
33+
.setReFlavour(RegExTransformer.RE_FLAVOUR_RE);
34+
35+
Apply apply = encode(replaceTransformer);
36+
37+
assertEquals("$1 $1 $1 $1 $1", ((String)evaluate(apply, "Puppy")).trim());
38+
39+
replaceTransformer = replaceTransformer
40+
.setReplacement("\\1 ");
41+
42+
apply = encode(replaceTransformer);
43+
44+
assertEquals("P u p p y", ((String)evaluate(apply, "Puppy")).trim());
45+
46+
replaceTransformer = replaceTransformer
47+
.setReplacement("$");
48+
49+
apply = encode(replaceTransformer);
50+
51+
assertEquals("$$$$$", evaluate(apply, "Puppy"));
52+
}
53+
54+
@Test
55+
public void translateReplacement(){
56+
assertEquals("$$", ReplaceTransformer.translateReplacement("$", RegExTransformer.RE_FLAVOUR_RE));
57+
58+
assertEquals("$$", ReplaceTransformer.translateReplacement("$$", RegExTransformer.RE_FLAVOUR_PCRE));
59+
assertEquals("$$$$", ReplaceTransformer.translateReplacement("$$", RegExTransformer.RE_FLAVOUR_RE));
60+
61+
assertEquals("$1", ReplaceTransformer.translateReplacement("$1", RegExTransformer.RE_FLAVOUR_PCRE));
62+
assertEquals("$$1", ReplaceTransformer.translateReplacement("$1", RegExTransformer.RE_FLAVOUR_RE));
63+
64+
assertEquals("\\1", ReplaceTransformer.translateReplacement("\\1", RegExTransformer.RE_FLAVOUR_PCRE));
65+
assertEquals("$1", ReplaceTransformer.translateReplacement("\\1", RegExTransformer.RE_FLAVOUR_RE));
66+
}
67+
}

0 commit comments

Comments
 (0)