Skip to content

Commit 318a081

Browse files
committed
Don't capture hard errors for old cagg format
If an error occurs inside the deserialization function for continuous aggregates, we are using the old numeric format and this was captured inside `inner_agg_deserialize` and the format was "repaired". However, this captured hard errors as well, which can cause cascading errors if execution continues after the error. This commit fixes that by capturing data exception errors, internal error (which is the default for elog()) and protocol error (c.f. `pq_getmsgint64` used by `numeric_deserialize`), but re-throwing all other errors. It also adds tests to make sure that the repair is done and not generates an error.
1 parent 64d10ef commit 318a081

File tree

6 files changed

+106
-13
lines changed

6 files changed

+106
-13
lines changed

.unreleased/pr_7903

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fixes: #7903 Don't capture hard errors for old cagg format

tsl/src/hypercore/attr_capture.c

+1-4
Original file line numberDiff line numberDiff line change
@@ -240,10 +240,7 @@ capture_ExecutorStart(QueryDesc *queryDesc, int eflags)
240240
foreach (cell, queryDesc->plannedstmt->rtable)
241241
{
242242
RangeTblEntry *rte = lfirst(cell);
243-
TS_DEBUG_LOG("rtable #%d: %s (relid: %d)",
244-
foreach_current_index(cell),
245-
get_rel_name(rte->relid),
246-
rte->relid);
243+
TS_DEBUG_LOG("rtable #%d: %s", foreach_current_index(cell), get_rel_name(rte->relid));
247244
}
248245
#endif
249246

tsl/src/partialize_finalize.c

+44-8
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
#include <utils/builtins.h>
1717
#include <utils/datum.h>
1818
#include <utils/fmgroids.h>
19+
#include <utils/resowner.h>
1920
#include <utils/syscache.h>
2021

2122
#include "partialize_finalize.h"
23+
#include "utils.h"
2224

2325
/*
2426
* This file implements functions to split the calculation of partial and final
@@ -223,12 +225,15 @@ static bytea *
223225
sanitize_serialized_partial(Oid deserialfnoid, bytea *serialized_partial)
224226
{
225227
if ((deserialfnoid == F_NUMERIC_DESERIALIZE) || (deserialfnoid == F_NUMERIC_AVG_DESERIALIZE))
228+
{
226229
/*
227230
* Always add NUMERIC_PARTIAL_MISSING_LENGTH extra bytes because the length is not fixed.
228231
* This is only safe to do when the partial state is known to be short, otherwise an
229232
* exception is thrown if the serialized_partial is not fully consumed by deserialfn().
230233
*/
234+
TS_DEBUG_LOG("zero-filling serialized numeric type");
231235
return zero_fill_bytearray(serialized_partial, NUMERIC_PARTIAL_MISSING_LENGTH);
236+
}
232237

233238
return serialized_partial;
234239
}
@@ -260,25 +265,56 @@ inner_agg_deserialize(FACombineFnMeta *combine_meta, bytea *volatile serialized_
260265
deser_fcinfo->isnull = false;
261266

262267
/*
263-
* When an exception is thrown and longjmp() is called, CurrentMemoryContext is potentially
264-
* different than what it was inside the PG_TRY() block below.
268+
* When an exception is thrown and longjmp() is called,
269+
* CurrentMemoryContext is potentially different than what it was
270+
* inside the PG_TRY() block below.
271+
*
272+
* Restore it to the old value so that the code in the subsequent
273+
* PG_CATCH() block does not corrupt the memory.
265274
*
266-
* Restore it to the old value so that the code in the subsequent PG_CATCH() block does not
267-
* corrupt the memory.
275+
* No need for volatile variables since we don't modify any of this
276+
* function's stack frame inside the PG_TRY() block.
268277
*
269-
* No need for volatile variables since we don't modify any of this function's stack frame
270-
* inside the PG_TRY() block.
278+
* We create a subtransaction to run the functions below, in case they
279+
* abort.
271280
*/
272281
MemoryContext oldcontext = CurrentMemoryContext;
282+
ResourceOwner oldowner = CurrentResourceOwner;
283+
BeginInternalSubTransaction(NULL);
273284
PG_TRY();
274285
{
275286
deserialized = FunctionCallInvoke(deser_fcinfo);
287+
ReleaseCurrentSubTransaction();
288+
MemoryContextSwitchTo(oldcontext);
289+
CurrentResourceOwner = oldowner;
276290
}
277291
PG_CATCH();
278292
{
279-
CurrentMemoryContext = oldcontext;
293+
const int sqlerrcode = geterrcode();
294+
295+
/*
296+
* If we get a data exception category error or a protocol
297+
* violation or an internal error (which is generated by default
298+
* when using elog()), the format is wrong and we can attempt to
299+
* repair it by switching to a different function for
300+
* deserializing.
301+
*
302+
* If the error is an insufficient resources category we should
303+
* *not* continue executing.
304+
*
305+
* Errors in other categories are not expected, but we cannot do
306+
* anything with them anyway, so just re-throw them too.
307+
*/
308+
if (ERRCODE_TO_CATEGORY(sqlerrcode) != ERRCODE_DATA_EXCEPTION &&
309+
sqlerrcode != ERRCODE_PROTOCOL_VIOLATION && sqlerrcode != ERRCODE_INTERNAL_ERROR)
310+
{
311+
PG_RE_THROW();
312+
}
280313
FlushErrorState();
281-
/* attempt to repair the serialized partial */
314+
RollbackAndReleaseCurrentSubTransaction();
315+
MemoryContextSwitchTo(oldcontext);
316+
CurrentResourceOwner = oldowner;
317+
TS_DEBUG_LOG("attempting repair of serialized partial");
282318
serialized_partial =
283319
sanitize_serialized_partial(combine_meta->deserialfnoid, serialized_partial);
284320
FC_ARG(deser_fcinfo, 0) = PointerGetDatum(serialized_partial);

tsl/test/expected/partialize_finalize.out

+40
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ as
166166
, _timescaledb_functions.partialize_agg(max(d))
167167
, _timescaledb_functions.partialize_agg(stddev(b))
168168
, _timescaledb_functions.partialize_agg(stddev(e)) from foo group by a, b ;
169+
create table saved_partials as select * from v1;
169170
create table t1 as select * from v1;
170171
--sum 2114, collid 0, min(text) 2145, collid 100, max(ts) 2127
171172
insert into foo values(12, 10, 'hello', '2010-01-02 09:00:00-05', 10);
@@ -1102,3 +1103,42 @@ FROM issue4922_partials_parallel;
11021103
5001129 | 50.0112900000000000 | 0 | 100 | 100000
11031104
(1 row)
11041105

1106+
-- Test that a "soft" error causes an attempt to repair the row
1107+
-- We don't care about the result here (but it is the sum of the b
1108+
-- column where a = 1 in table "foo" above), just that it should
1109+
-- attempt a repair, do the repair, and then not fail.
1110+
--
1111+
-- Do this by reading the partial numeric format and drop bytes from
1112+
-- the end of it (16 bytes, meaning 32 characters). This is what we
1113+
-- know how to repair. Next, attempt to decode the partial and check
1114+
-- that it attempt a repair and actually does the repair (zero-filling
1115+
-- the end).
1116+
set client_min_messages to debug2;
1117+
WITH
1118+
sample AS (SELECT encode(partialb, 'hex') part FROM saved_partials WHERE a = 1),
1119+
broken AS (SELECT decode(left(part, -32), 'hex') partialb FROM sample)
1120+
SELECT _timescaledb_functions.finalize_agg('sum(numeric)', null, null, null, broken.partialb, null::numeric) sumb
1121+
FROM broken;
1122+
LOG: statement: WITH
1123+
sample AS (SELECT encode(partialb, 'hex') part FROM saved_partials WHERE a = 1),
1124+
broken AS (SELECT decode(left(part, -32), 'hex') partialb FROM sample)
1125+
SELECT _timescaledb_functions.finalize_agg('sum(numeric)', null, null, null, broken.partialb, null::numeric) sumb
1126+
FROM broken;
1127+
DEBUG: capture_ExecutorStart - rtable #0: (null)
1128+
DEBUG: capture_ExecutorStart - rtable #1: (null)
1129+
DEBUG: capture_ExecutorStart - rtable #2: saved_partials
1130+
DEBUG: inner_agg_deserialize - attempting repair of serialized partial
1131+
DEBUG: sanitize_serialized_partial - zero-filling serialized numeric type
1132+
DEBUG: inner_agg_deserialize - attempting repair of serialized partial
1133+
DEBUG: sanitize_serialized_partial - zero-filling serialized numeric type
1134+
DEBUG: inner_agg_deserialize - attempting repair of serialized partial
1135+
DEBUG: sanitize_serialized_partial - zero-filling serialized numeric type
1136+
DEBUG: inner_agg_deserialize - attempting repair of serialized partial
1137+
DEBUG: sanitize_serialized_partial - zero-filling serialized numeric type
1138+
DEBUG: inner_agg_deserialize - attempting repair of serialized partial
1139+
DEBUG: sanitize_serialized_partial - zero-filling serialized numeric type
1140+
sumb
1141+
------
1142+
150
1143+
(1 row)
1144+

tsl/test/sql/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ set(TEST_FILES
6767
merge_compress.sql
6868
modify_exclusion.sql
6969
move.sql
70-
partialize_finalize.sql
7170
policy_generalization.sql
7271
reorder.sql
7372
size_utils_tsl.sql
@@ -149,6 +148,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug)
149148
tsl_tables.sql
150149
license_tsl.sql
151150
fixed_schedules.sql
151+
partialize_finalize.sql
152152
recompress_chunk_segmentwise.sql
153153
feature_flags.sql
154154
vector_agg_default.sql

tsl/test/sql/partialize_finalize.sql

+19
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ as
137137
, _timescaledb_functions.partialize_agg(stddev(b))
138138
, _timescaledb_functions.partialize_agg(stddev(e)) from foo group by a, b ;
139139

140+
create table saved_partials as select * from v1;
140141
create table t1 as select * from v1;
141142

142143
--sum 2114, collid 0, min(text) 2145, collid 100, max(ts) 2127
@@ -370,3 +371,21 @@ SELECT
370371
_timescaledb_functions.finalize_agg('pg_catalog.max(integer)'::text, NULL::name, NULL::name, '{{pg_catalog,int4}}'::name[], partial_max, NULL::integer) AS max,
371372
_timescaledb_functions.finalize_agg('pg_catalog.count()'::text, NULL::name, NULL::name, '{}'::name[], partial_count, NULL::bigint) AS count
372373
FROM issue4922_partials_parallel;
374+
375+
-- Test that a "soft" error causes an attempt to repair the row
376+
377+
-- We don't care about the result here (but it is the sum of the b
378+
-- column where a = 1 in table "foo" above), just that it should
379+
-- attempt a repair, do the repair, and then not fail.
380+
--
381+
-- Do this by reading the partial numeric format and drop bytes from
382+
-- the end of it (16 bytes, meaning 32 characters). This is what we
383+
-- know how to repair. Next, attempt to decode the partial and check
384+
-- that it attempt a repair and actually does the repair (zero-filling
385+
-- the end).
386+
set client_min_messages to debug2;
387+
WITH
388+
sample AS (SELECT encode(partialb, 'hex') part FROM saved_partials WHERE a = 1),
389+
broken AS (SELECT decode(left(part, -32), 'hex') partialb FROM sample)
390+
SELECT _timescaledb_functions.finalize_agg('sum(numeric)', null, null, null, broken.partialb, null::numeric) sumb
391+
FROM broken;

0 commit comments

Comments
 (0)