Skip to content

Commit fce377e

Browse files
add basic regex functions
Signed-off-by: Nikolaj Bjorner <[email protected]>
1 parent b143a95 commit fce377e

File tree

2 files changed

+167
-26
lines changed

2 files changed

+167
-26
lines changed

src/ast/sls/sls_seq_plugin.cpp

+155-25
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,16 @@ Revert bias on long strings:
7474
- bake in bias for shorter strings into equation solving?
7575
7676
Equality solving using stochastic Nelson.
77+
- When solving for an equality w = v, first convert them into two vectors by removing concatenations.
78+
The updates are then performed on the arguments to concatenations and not the concatenations themselves.
79+
This saves some amount of spurious work when pushing assignments down over concatenations, which is
80+
what the current first version of the solver does.
7781
- Given equality where current assignment does not satisfy it:
7882
- Xw = v:
7983
- let X' range over prefixes of X that match v.
8084
- non-deterministic set X <- strval0(X')
81-
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a' matches prefix of strval0(v), and X' is longest prefix of X that matches v.
85+
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a'
86+
matches prefix of strval0(v), and X' is longest prefix of X that matches v.
8287
- If X fully matches a prefix of v, then, in addition to the rules above:
8388
- consume constant character from strval0(X)w = v
8489
- reveal the next variable to solve for.
@@ -90,6 +95,8 @@ Equality solving using stochastic Nelson.
9095
#include "ast/sls/sls_seq_plugin.h"
9196
#include "ast/sls/sls_context.h"
9297
#include "ast/ast_pp.h"
98+
#include "ast/rewriter/seq_rewriter.h"
99+
#include "ast/rewriter/th_rewriter.h"
93100

94101

95102
namespace sls {
@@ -258,7 +265,6 @@ namespace sls {
258265
VERIFY(seq.str.is_contains(e, a, b));
259266
if (seq.is_string(a->get_sort()))
260267
return strval0(a).contains(strval0(b));
261-
262268
NOT_IMPLEMENTED_YET();
263269
break;
264270
case OP_SEQ_PREFIX:
@@ -274,6 +280,11 @@ namespace sls {
274280
NOT_IMPLEMENTED_YET();
275281
break;
276282
case OP_SEQ_IN_RE:
283+
VERIFY(seq.str.is_in_re(e, a, b));
284+
if (seq.is_string(a->get_sort()))
285+
return is_in_re(strval0(a), b);
286+
NOT_IMPLEMENTED_YET();
287+
break;
277288
case OP_SEQ_NTH:
278289
case OP_SEQ_NTH_I:
279290
case OP_SEQ_NTH_U:
@@ -420,35 +431,24 @@ namespace sls {
420431
}
421432

422433
void seq_plugin::repair_up(app* e) {
423-
424434
if (m.is_bool(e))
425435
return;
426-
427-
if (seq.str.is_itos(e)) {
428-
repair_up_str_itos(e);
436+
if (is_value(e))
429437
return;
430-
}
431-
if (seq.str.is_stoi(e)) {
438+
if (seq.str.is_itos(e))
439+
repair_up_str_itos(e);
440+
else if (seq.str.is_stoi(e))
432441
repair_up_str_stoi(e);
433-
return;
434-
}
435-
if (seq.str.is_length(e)) {
442+
else if (seq.str.is_length(e))
436443
repair_up_str_length(e);
437-
return;
438-
}
439-
if (seq.str.is_index(e)) {
444+
else if (seq.str.is_index(e))
440445
repair_up_str_indexof(e);
441-
return;
442-
}
443-
if (seq.is_string(e->get_sort())) {
444-
if (is_value(e))
445-
return;
446+
else if (seq.is_string(e->get_sort())) {
446447
strval0(e) = strval1(e);
447448
ctx.new_value_eh(e);
448-
return;
449449
}
450-
451-
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
450+
else
451+
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
452452
}
453453

454454
bool seq_plugin::repair_down(app* e) {
@@ -461,6 +461,7 @@ namespace sls {
461461
if (m.is_eq(e))
462462
return repair_down_eq(e);
463463

464+
464465
NOT_IMPLEMENTED_YET();
465466
return false;
466467
}
@@ -621,6 +622,10 @@ namespace sls {
621622
return repair_down_str_itos(e);
622623
case OP_STRING_STOI:
623624
return repair_down_str_stoi(e);
625+
case OP_SEQ_IN_RE:
626+
if (seq.is_string(to_app(e)->get_arg(0)->get_sort()))
627+
return repair_down_in_re(e);
628+
break;
624629
case OP_STRING_UBVTOS:
625630
case OP_STRING_SBVTOS:
626631
case OP_STRING_TO_CODE:
@@ -639,8 +644,6 @@ namespace sls {
639644
case OP_SEQ_FOLDLI:
640645

641646
case OP_SEQ_TO_RE:
642-
case OP_SEQ_IN_RE:
643-
644647
case OP_RE_PLUS:
645648
case OP_RE_STAR:
646649
case OP_RE_OPTION:
@@ -679,7 +682,6 @@ namespace sls {
679682
m_int_updates.push_back({ x, r, 1 });
680683
else
681684
m_int_updates.push_back({ x, rational(-1 - ctx.rand(10)), 1 });
682-
683685
return apply_update();
684686
}
685687

@@ -1137,4 +1139,132 @@ namespace sls {
11371139
return get_eval(e).is_value;
11381140
return m.is_value(e);
11391141
}
1142+
1143+
// Regular expressions
1144+
1145+
// Decide whether the concrete string s belongs to the regular expression r.
//
// The ground membership term in_re("s", r) is built and handed to the theory
// rewriter; the result is true only when the rewriter reduces the term to the
// Boolean constant true. If it reduces to anything else, false is returned.
// The assertion documents the expectation that, unless the resource limit was
// canceled, rewriting a ground membership yields either true or false.
bool seq_plugin::is_in_re(zstring const& s, expr* r) {
    expr_ref str_const(seq.str.mk_string(s), m);
    expr_ref membership(seq.re.mk_in_re(str_const, r), m);
    th_rewriter simplifier(m);
    simplifier(membership);
    SASSERT(m.limit().is_canceled() || m.is_true(membership) || m.is_false(membership));
    return m.is_true(membership);
}
1153+
1154+
// Repair the membership constraint e := in_re(x, y) by proposing new values for x.
//
// Returns true when the current value of x already agrees with the truth value
// that the context assigns to e, or when apply_update() accepts one of the
// proposed updates. Returns false when the regex info reports y as not
// interpreted, or when x is a value and therefore cannot be changed.
//
// Candidate strings are collected by walking the current value s of x character
// by character, taking the derivative of y with respect to each character, and
// asking choose() for short continuations (depth 2) of every prefix whose
// derivative is not the empty language.
bool seq_plugin::repair_down_in_re(app* e) {
    expr* x = nullptr, * y = nullptr;
    VERIFY(seq.str.is_in_re(e, x, y));
    auto info = seq.re.get_info(y);
    if (!info.interpreted)
        return false;

    // Local copy: choose() takes its prefix argument by non-const reference.
    zstring s = strval0(x);
    bool const want_member = ctx.is_true(e);

    // Reuse the shared membership test instead of duplicating the rewriter call.
    if (is_in_re(s, y) == want_member)
        return true;

    if (is_value(x))
        return false;

    vector<zstring> conts;
    expr_ref d_r(y, m);
    seq_rewriter seqrw(m);
    for (unsigned i = 0; i < s.length(); ++i) {
        zstring prefix = s.extract(0, i);
        IF_VERBOSE(3, verbose_stream() << "Derivative " << prefix << ": " << d_r << "\n");
        // Once the derivative is empty no extension of this prefix can match.
        if (seq.re.is_empty(d_r))
            break;
        choose(d_r, 2, prefix, conts);
        expr_ref ch(seq.str.mk_char(s[i]), m);
        d_r = seqrw.mk_derivative(ch, d_r);
    }
    // The whole of s is still a viable prefix: also collect continuations of s itself.
    if (!seq.re.is_empty(d_r))
        choose(d_r, 2, s, conts);

    IF_VERBOSE(3,
        verbose_stream() << "repair in_re " << mk_pp(e, m) << " " << s << "\n";
        for (auto const& str : conts)
            verbose_stream() << "prefix " << str << "\n";);

    // TODO: do some length analysis to prune out short candidates when there are longer ones.
    // TODO: when matching .*"bcd" with string ab, the extension abc is more interesting than aba.

    // When e should be false, each candidate is perturbed by appending a random
    // known character. m_chars may be empty; indexing it with rand(0) is invalid,
    // so in that case the unperturbed candidates are proposed instead.
    bool const can_extend = !m_chars.empty();
    for (auto const& str : conts) {
        if (want_member || !can_extend)
            m_str_updates.push_back({ x, str, 1 });
        else
            m_str_updates.push_back({ x, str + m_chars[ctx.rand(m_chars.size())], 1 });
    }
    return apply_update();
}
1203+
1204+
// Append to chars a set of characters that may begin a word of the regular
// expression r. The result may contain duplicates.
//
// Handled shapes: concatenation, to_re of a string literal, union, character
// range, star/plus, the empty language, and the two wildcards (full sequence
// and full character). For wildcards a single character is sampled at random
// from m_chars, if any are known. Any other regex shape is reported on the
// verbose stream and hits NOT_IMPLEMENTED_YET.
void seq_plugin::next_char(expr* r, unsigned_vector& chars) {
    SASSERT(seq.is_re(r));
    expr* lhs, * rhs;

    if (seq.re.is_concat(r, lhs, rhs)) {
        auto head_info = seq.re.get_info(lhs);
        next_char(lhs, chars);
        // The tail contributes first characters only when the head is known to accept the empty word.
        if (head_info.nullable == l_true)
            next_char(rhs, chars);
        return;
    }

    if (seq.re.is_to_re(r, lhs)) {
        zstring literal;
        if (seq.str.is_string(lhs, literal) && !literal.empty())
            chars.push_back(literal[0]);
        return;
    }

    if (seq.re.is_union(r, lhs, rhs)) {
        next_char(lhs, chars);
        next_char(rhs, chars);
        return;
    }

    if (seq.re.is_range(r, lhs, rhs)) {
        zstring lo, hi;
        seq.str.is_string(lhs, lo);
        seq.str.is_string(rhs, hi);
        bool const well_formed = lo.length() == 1 && hi.length() == 1 && lo[0] <= hi[0];
        if (well_formed) {
            // One random character from the range, followed by both end points.
            chars.push_back(lo[0] + ctx.rand(hi[0] - lo[0] + 1));
            chars.push_back(lo[0]);
            chars.push_back(hi[0]);
        }
        return;
    }

    if (seq.re.is_star(r, lhs) || seq.re.is_plus(r, lhs)) {
        next_char(lhs, chars);
        return;
    }

    // The empty language has no first character.
    if (seq.re.is_empty(r))
        return;

    if (seq.re.is_full_seq(r) || seq.re.is_full_char(r)) {
        if (!m_chars.empty())
            chars.push_back(m_chars[ctx.rand(m_chars.size())]);
        return;
    }

    verbose_stream() << "regex nyi " << mk_bounded_pp(r, m) << "\n";
    NOT_IMPLEMENTED_YET();
}
1251+
1252+
// Enumerate strings that extend prefix by at most k characters, following the
// possible first characters of the regular expression r.
//
// prefix itself is always appended to result. If k > 0, the candidate next
// characters of r are collected with next_char(), de-duplicated, and for each
// one the function recurses on the derivative of r with respect to that
// character, with the extended prefix and depth k - 1.
void seq_plugin::choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result) {
    result.push_back(prefix);
    if (k == 0)
        return;

    unsigned_vector chars;
    next_char(r, chars);
    // next_char may report the same character several times; keep one of each.
    std::stable_sort(chars.begin(), chars.end());
    auto it = std::unique(chars.begin(), chars.end());
    chars.shrink(static_cast<unsigned>(it - chars.begin()));

    // The rewriter does not depend on the character, so build it once per call
    // instead of once per loop iteration.
    seq_rewriter rw(m);
    for (auto ch : chars) {
        expr_ref c(seq.str.mk_char(ch), m);
        expr_ref r2 = rw.mk_derivative(c, r);
        zstring prefix2 = prefix + zstring(ch);
        choose(r2, k - 1, prefix2, result);
    }
}
11401270
}

src/ast/sls/sls_seq_plugin.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ namespace sls {
4444
arith_util a;
4545
scoped_ptr_vector<eval> m_values;
4646
indexed_uint_set m_chars;
47-
bool m_initialized = false;
47+
bool m_initialized = false;
4848

4949
struct str_update {
5050
expr* e;
@@ -81,13 +81,24 @@ namespace sls {
8181
bool repair_down_str_suffixof(app* e);
8282
bool repair_down_str_itos(app* e);
8383
bool repair_down_str_stoi(app* e);
84+
bool repair_down_in_re(app* e);
8485

8586
void repair_up_str_length(app* e);
8687
void repair_up_str_indexof(app* e);
8788
void repair_up_str_itos(app* e);
8889
void repair_up_str_stoi(app* e);
8990

91+
// regex functionality
92+
93+
// enumerate set of strings that can match a prefix of regex r.
94+
void choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result);
95+
96+
// enumerate set of possible next chars, including possibly sampling from m_chars for wild-cards.
97+
void next_char(expr* r, unsigned_vector& chars);
98+
99+
bool is_in_re(zstring const& s, expr* r);
90100

101+
// access evaluation
91102
bool is_seq_predicate(expr* e);
92103

93104
eval& get_eval(expr* e);

0 commit comments

Comments
 (0)