@@ -930,6 +930,52 @@ static Count_bin do_count(int lineno, count_context_t *ctxt,
930
930
return r ;
931
931
}
932
932
933
+ /*
934
+ * See do_parse() for the purpose of this function.
935
+ *
936
+ * The returned number of parses (called here "count") is a 32-bit
937
+ * integer. However, this count may sometimes be very big - much more than
938
+ * can be represented in 32-bits. In such a case it is just enough to know
939
+ * that such an "overflow" occurred. Internally, big counts are clamped to
940
+ * INT_MAX (2^31-1) - see parse_count_clamp() (we refer below to such
941
+ * values as "clamped"). If the top-level do_count() (the one that is
942
+ * called from do_parse()) returns this value, it means such an overflow
943
+ * has occurred.
944
+ *
945
+ * The function uses a 64-bit signed integer as a count accumulator - named
946
+ * "total". The maximum value it can hold is 2^63-1. If it becomes greater
947
+ * than INT_MAX, it is considered as a count overflow. Care should be
948
+ * taken that this total itself would not overflow, else this detection
949
+ * mechanism would be rendered useless. To that end, each value from which
950
+ * this total is computed should be small enough so it would not overflow.
951
+ *
952
+ * The function has 4 code sections to calculate the count. Each of them,
953
+ * when entered, returns a value which is clamped (or doesn't need to be
954
+ * clamped). They are marked in the code with "Path 1a", "Path 1b",
955
+ * "Path 2", and "Path 3".
956
+ *
957
+ * Path 1a, Path 1b: If there is a possible linkage between the given
958
+ * words, return 1, else return 0. Here a count overflow cannot occur.
959
+ *
960
+ * Path 2: The total accumulates the results of the do_count() invocations
961
+ * that are done in a loop. The upper bound on the number of iterations is
962
+ * twice (outer loop) the maximum number of word disjuncts (inner loop).
963
+ * Assuming no more than 2^31 disjuncts per word, and considering that
964
+ * each value is a result of do_count() which is clamped, the total is
965
+ * less than (2*2^31)*(2^31-1), which is less than 2^63-1, and hence just
966
+ * needs to be clamped before returning.
967
+ *
968
+ * Path 3: The total is calculated as a sum of series of multiplications.
969
+ * To prevent its overflow, we ensure that each term (including the total
970
+ * itself) would not be greater than INT_MAX (2^31-1), so the result will
971
+ * not be more than (2^31-1)+((2^31-1)*(2^31-1)) which is less than
972
+ * 2^63-1. In this path, each multiplication term that may be greater than
973
+ * INT_MAX (leftcount and rightcount) is clamped before the
974
+ * multiplication, and the total is clamped after the multiplication.
975
+ * Multiplication terms that result from caching (or directly from
976
+ * do_count()) are already clamped.
977
+ */
978
+
933
979
#define do_count do_count1
934
980
#else
935
981
#define TRACE_LABEL (l , do_count ) (do_count)
@@ -968,6 +1014,8 @@ static Count_bin do_count(
968
1014
969
1015
unsigned int unparseable_len = rw - lw - 1 ;
970
1016
1017
+ /* Path 1a. */
1018
+
971
1019
#if 1
972
1020
/* This check is not necessary for correctness, as it is handled in
973
1021
* the general case below. It looks like it should be slightly faster. */
@@ -982,12 +1030,15 @@ static Count_bin do_count(
982
1030
}
983
1031
#endif
984
1032
1033
+
985
1034
/* The left and right connectors are null, but the two words are
986
1035
* NOT next to each-other. */
987
1036
if ((le == NULL ) && (re == NULL ))
988
1037
{
989
1038
int nopt_words = num_optional_words (ctxt , lw , rw );
990
1039
1040
+ /* Path 1b. */
1041
+
991
1042
if ((null_count == 0 ) ||
992
1043
(!ctxt -> islands_ok && (lw != -1 ) && (ctxt -> sent -> word [lw ].d != NULL )))
993
1044
{
@@ -1004,6 +1055,8 @@ static Count_bin do_count(
1004
1055
return table_store (ctxt , lw , rw , le , re , null_count , h , hist_zero ());
1005
1056
}
1006
1057
1058
+ /* Path 2. */
1059
+
1007
1060
/* Here null_count != 0 and we allow islands (a set of words
1008
1061
* linked together but separate from the rest of the sentence).
1009
1062
* Because we don't know here if an optional word is just
@@ -1012,6 +1065,12 @@ static Count_bin do_count(
1012
1065
* rest of the sentence must contain one less null-word. Else
1013
1066
* the rest of the sentence still contains the required number
1014
1067
* of null words. */
1068
+
1069
+ /* total (w_Count_bin which is int64_t) cannot overflow in this
1070
+ * loop since the number of disjuncts in the inner loop is
1071
+ * surely < 2^31, the outer loop can be iterated at most twice,
1072
+ * and do_count() may return at most 2^31-1. However, it may
1073
+ * become > 2^31-1 and hence needs to be clamped after the loop. */
1015
1074
w = lw + 1 ;
1016
1075
for (int opt = 0 ; opt <= (int )ctxt -> sent -> word [w ].optional ; opt ++ )
1017
1076
{
@@ -1024,26 +1083,23 @@ static Count_bin do_count(
1024
1083
hist_accumv (& total , d -> cost ,
1025
1084
do_count (ctxt , w , rw , d -> right , NULL , try_null_count - 1 ));
1026
1085
}
1027
- if (parse_count_clamp (& total ))
1028
- {
1029
- #if 0
1030
- printf ("OVERFLOW 1\n" );
1031
- #endif
1032
- }
1033
1086
}
1034
1087
1035
1088
hist_accumv (& total , 0.0 ,
1036
1089
do_count (ctxt , w , rw , NULL , NULL , try_null_count - 1 ));
1037
- if (parse_count_clamp (& total ))
1038
- {
1090
+ }
1091
+
1092
+ if (parse_count_clamp (& total ))
1093
+ {
1039
1094
#if 0
1040
- printf ("OVERFLOW 2 \n" );
1095
+ printf ("OVERFLOW 1 \n" );
1041
1096
#endif
1042
- }
1043
1097
}
1044
1098
return table_store (ctxt , lw , rw , le , re , null_count , h , total );
1045
1099
}
1046
1100
1101
+ /* Path 3. */
1102
+
1047
1103
/* The word range (lw, rw) gets split in all tentatively possible ways
1048
1104
* to LHS term and RHS term.
1049
1105
* There can be a total count > 0 only if one of the following
@@ -1130,7 +1186,6 @@ static Count_bin do_count(
1130
1186
Count_bin * l_cache = NULL ;
1131
1187
Count_bin * r_cache = NULL ;
1132
1188
unsigned int lcount_index = 0 ; /* Cached left count index */
1133
- #define S (c ) (!c?"(nil)":connector_string(c))
1134
1189
1135
1190
if (ctxt -> is_short )
1136
1191
{
@@ -1355,14 +1410,21 @@ static Count_bin do_count(
1355
1410
1356
1411
#define CACHE_COUNT (c , how_to_count , do_count ) \
1357
1412
{ \
1358
- w_Count_bin count = (hist_total(&c) == NO_COUNT) ? \
1413
+ Count_bin count = (hist_total(&c) == NO_COUNT) ? \
1359
1414
TRACE_LABEL(c, do_count) : c; \
1360
1415
how_to_count; \
1361
1416
}
1362
1417
/* If the pseudocounting above indicates one of the terms
1363
1418
* in the count multiplication is zero,
1364
1419
* we know that the true total is zero. So we don't
1365
1420
* bother counting the other term at all, in that case. */
1421
+
1422
+ /* To enable 31-bit overflow detection, total, leftcount and
1423
+ * rightcount are signed 64-bit, and are either a clamped cached
1424
+ * value, or are clamped below before they are used. total is
1425
+ * initially 0 and is clamped at the end of each iteration.
1426
+ * So the result will not be more than (2^31-1)+((2^31-1)*(2^31-1))
1427
+ * which is less than 2^63-1. */
1366
1428
if (leftpcount &&
1367
1429
(!lcnt_optimize || rightpcount || (0 != hist_total (& l_bnr ))))
1368
1430
{
@@ -1383,6 +1445,7 @@ static Count_bin do_count(
1383
1445
1384
1446
if (0 < hist_total (& leftcount ))
1385
1447
{
1448
+ parse_count_clamp (& leftcount ); /* May be up to 4*2^31. */
1386
1449
lrcnt_found = true;
1387
1450
d -> match_left = true;
1388
1451
@@ -1412,6 +1475,7 @@ static Count_bin do_count(
1412
1475
1413
1476
if (0 < hist_total (& rightcount ))
1414
1477
{
1478
+ parse_count_clamp (& rightcount ); /* May be up to 4*INT_MAX. */
1415
1479
if (le == NULL )
1416
1480
{
1417
1481
lrcnt_found = true;
0 commit comments