Skip to content

Commit b4c303b

Browse files
authored
Merge pull request #1327 from ampli/count-ovfl
A fix for undetected count overflows
2 parents 06f660d + 44e987e commit b4c303b

File tree

1 file changed

+76
-12
lines changed

1 file changed

+76
-12
lines changed

link-grammar/parse/count.c

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,52 @@ static Count_bin do_count(int lineno, count_context_t *ctxt,
930930
return r;
931931
}
932932

933+
/*
934+
* See do_parse() for the purpose of this function.
935+
*
936+
* The returned number of parses (called here "count") is a 32-bit
937+
* integer. However, this count may sometimes be very big - much more than
938+
* can be represented in 32-bits. In such a case it is just enough to know
939+
* that such an "overflow" occurred. Internally, big counts are clamped to
940+
* INT_MAX (2^31-1) - see parse_count_clamp() (we refer below to such
941+
* values as "clamped"). If the top-level do_count() (the one that is
942+
* called from do_parse()) returns this value, it means such an overflow
943+
* has occurred.
944+
*
945+
* The function uses a 64-bit signed integer as a count accumulator - named
946+
* "total". The maximum value it can hold is 2^63-1. If it becomes greater
947+
* than INT_MAX, it is considered as a count overflow. A care should be
948+
* taken that this total itself would not overflow, else this detection
949+
* mechanism would be rendered useless. To that end, each value from which
950+
* this total is computed should be small enough so it would not overflow.
951+
*
952+
* The function has 4 code sections to calculate the count. Each of them,
953+
* when entered, returns a value which is clamped (or doesn't need to be
954+
* clamped). The are marked in the code with "Path 1a", "Path 1b",
955+
* "Path 2", and "Path 3".
956+
*
957+
* Path 1a, Path 1b: If there is a possible linkage between the given
958+
* words, return 1, else return 0. Here a count overflow cannot occur.
959+
*
960+
* Path 2: The total accumulate the result of the do_count() invocations
961+
* that are done in a loop. The upper bound on the number of iterations is
962+
* twice (out loop) the maximum number of word disjuncts )inner loop).
963+
* Assuming no more than 2^31 disjuncts per word, and considering that
964+
* each value is a result of do_count() which is clamped, the total is
965+
* less than (2*2^31)*(2^31`-1), which is less than 2^63-1, and hence just
966+
* needs to be clamped before returning.
967+
*
968+
* Path 3: The total is calculated as a sum of series of multiplications.
969+
* To prevent its overflow, we ensure that each term (including the total
970+
* itself) would not be greater than INT_MAX (2^31-1), so the result will
971+
* not be more than (2^31-1)+((2^31-1)*(2^31-1)) which is less than
972+
* 2^63-1. In this path, each multiplication term that may be greater then
973+
* INT_MAX (leftcount and rightcount) is clamped before the
974+
* multiplication, and the total is clamped after the multiplication.
975+
* Multiplication terms that result from caching (or directly from
976+
* do_count()) are already clamped.
977+
*/
978+
933979
#define do_count do_count1
934980
#else
935981
#define TRACE_LABEL(l, do_count) (do_count)
@@ -968,6 +1014,8 @@ static Count_bin do_count(
9681014

9691015
unsigned int unparseable_len = rw-lw-1;
9701016

1017+
/* Path 1a. */
1018+
9711019
#if 1
9721020
/* This check is not necessary for correctness, as it is handled in
9731021
* the general case below. It looks like it should be slightly faster. */
@@ -982,12 +1030,15 @@ static Count_bin do_count(
9821030
}
9831031
#endif
9841032

1033+
9851034
/* The left and right connectors are null, but the two words are
9861035
* NOT next to each-other. */
9871036
if ((le == NULL) && (re == NULL))
9881037
{
9891038
int nopt_words = num_optional_words(ctxt, lw, rw);
9901039

1040+
/* Path 1b. */
1041+
9911042
if ((null_count == 0) ||
9921043
(!ctxt->islands_ok && (lw != -1) && (ctxt->sent->word[lw].d != NULL)))
9931044
{
@@ -1004,6 +1055,8 @@ static Count_bin do_count(
10041055
return table_store(ctxt, lw, rw, le, re, null_count, h, hist_zero());
10051056
}
10061057

1058+
/* Path 2. */
1059+
10071060
/* Here null_count != 0 and we allow islands (a set of words
10081061
* linked together but separate from the rest of the sentence).
10091062
* Because we don't know here if an optional word is just
@@ -1012,6 +1065,12 @@ static Count_bin do_count(
10121065
* rest of the sentence must contain one less null-word. Else
10131066
* the rest of the sentence still contains the required number
10141067
* of null words. */
1068+
1069+
/* total (w_Count_bin which is int64_t) cannot overflow in this
1070+
* loop since the number of disjuncts in the inner loop is
1071+
* surely < 2^31, the outer loop can be iterated at most twice,
1072+
* and do_count() may return at most 2^31-1. However, it may
1073+
* become > 2^31-1 and hence needs to be clamped after the loop. */
10151074
w = lw + 1;
10161075
for (int opt = 0; opt <= (int)ctxt->sent->word[w].optional; opt++)
10171076
{
@@ -1024,26 +1083,23 @@ static Count_bin do_count(
10241083
hist_accumv(&total, d->cost,
10251084
do_count(ctxt, w, rw, d->right, NULL, try_null_count-1));
10261085
}
1027-
if (parse_count_clamp(&total))
1028-
{
1029-
#if 0
1030-
printf("OVERFLOW 1\n");
1031-
#endif
1032-
}
10331086
}
10341087

10351088
hist_accumv(&total, 0.0,
10361089
do_count(ctxt, w, rw, NULL, NULL, try_null_count-1));
1037-
if (parse_count_clamp(&total))
1038-
{
1090+
}
1091+
1092+
if (parse_count_clamp(&total))
1093+
{
10391094
#if 0
1040-
printf("OVERFLOW 2\n");
1095+
printf("OVERFLOW 1\n");
10411096
#endif
1042-
}
10431097
}
10441098
return table_store(ctxt, lw, rw, le, re, null_count, h, total);
10451099
}
10461100

1101+
/* Path 3. */
1102+
10471103
/* The word range (lw, rw) gets split in all tentatively possible ways
10481104
* to LHS term and RHS term.
10491105
* There can be a total count > 0 only if one of the following
@@ -1130,7 +1186,6 @@ static Count_bin do_count(
11301186
Count_bin *l_cache = NULL;
11311187
Count_bin *r_cache = NULL;
11321188
unsigned int lcount_index = 0; /* Cached left count index */
1133-
#define S(c) (!c?"(nil)":connector_string(c))
11341189

11351190
if (ctxt->is_short)
11361191
{
@@ -1355,14 +1410,21 @@ static Count_bin do_count(
13551410

13561411
#define CACHE_COUNT(c, how_to_count, do_count) \
13571412
{ \
1358-
w_Count_bin count = (hist_total(&c) == NO_COUNT) ? \
1413+
Count_bin count = (hist_total(&c) == NO_COUNT) ? \
13591414
TRACE_LABEL(c, do_count) : c; \
13601415
how_to_count; \
13611416
}
13621417
/* If the pseudocounting above indicates one of the terms
13631418
* in the count multiplication is zero,
13641419
* we know that the true total is zero. So we don't
13651420
* bother counting the other term at all, in that case. */
1421+
1422+
/* To enable 31-bit overflow detection, total, leftcount and
1423+
* rightcount are signed 64-bit, and are , a clamped cached
1424+
* value, or are clamped below before they are used. total is
1425+
* initially 0 and is clamped at the end of each iteration.
1426+
* So the result will not be more than (2^31-1)+((2^31-1)*(2^31-1))
1427+
* which is less than 2^63-1. */
13661428
if (leftpcount &&
13671429
(!lcnt_optimize || rightpcount || (0 != hist_total(&l_bnr))))
13681430
{
@@ -1383,6 +1445,7 @@ static Count_bin do_count(
13831445

13841446
if (0 < hist_total(&leftcount))
13851447
{
1448+
parse_count_clamp(&leftcount); /* May be up to 4*2^31. */
13861449
lrcnt_found = true;
13871450
d->match_left = true;
13881451

@@ -1412,6 +1475,7 @@ static Count_bin do_count(
14121475

14131476
if (0 < hist_total(&rightcount))
14141477
{
1478+
parse_count_clamp(&rightcount); /* May be up to 4*INT_MAX. */
14151479
if (le == NULL)
14161480
{
14171481
lrcnt_found = true;

0 commit comments

Comments
 (0)