Skip to content

Commit 8412696

Browse files
authored
2 unreleased bug fixes for #18115 from main (#19992)
1 parent 2f3e3ad commit 8412696

File tree

2 files changed

+84
-24
lines changed

2 files changed

+84
-24
lines changed

server/datastore/mysql/hosts.go

+81-24
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
// Since many hosts may have issues, we need to batch the inserts of host issues.
2828
// This is a variable, so it can be adjusted during unit testing.
2929
var hostIssuesInsertBatchSize = 10000
30+
var hostIssuesUpdateFailingPoliciesBatchSize = 10000
3031

3132
// A large number of hosts could be changing teams at once, so we need to batch this operation to prevent excessive locks
3233
var addHostsToTeamBatchSize = 10000
@@ -5269,37 +5270,71 @@ func updateHostIssuesFailingPolicies(ctx context.Context, tx sqlx.ExecerContext,
52695270
return nil
52705271
}
52715272

5272-
masterStmt := `
5273+
// For 1 host, we use a single statement to update the host_issues entry.
5274+
if len(hostIDs) == 1 {
5275+
stmt := `
5276+
INSERT INTO host_issues (host_id, failing_policies_count, total_issues_count)
5277+
SELECT host_id.id, COALESCE(SUM(!pm.passes), 0), COALESCE(SUM(!pm.passes), 0)
5278+
FROM policy_membership pm
5279+
RIGHT JOIN (SELECT ? as id) as host_id
5280+
ON pm.host_id = host_id.id
5281+
GROUP BY host_id.id
5282+
ON DUPLICATE KEY UPDATE
5283+
failing_policies_count = VALUES(failing_policies_count),
5284+
total_issues_count = VALUES(failing_policies_count) + critical_vulnerabilities_count`
5285+
if _, err := tx.ExecContext(ctx, stmt, hostIDs[0]); err != nil {
5286+
return ctxerr.Wrap(ctx, err, "updating failing policies in host issues for one host")
5287+
}
5288+
return nil
5289+
}
5290+
5291+
// Clear host_issues entries for hosts that are not in policy_membership
5292+
clearStmt := `
5293+
UPDATE host_issues
5294+
SET failing_policies_count = 0, total_issues_count = critical_vulnerabilities_count
5295+
WHERE host_id NOT IN (
5296+
SELECT host_id
5297+
FROM policy_membership
5298+
WHERE host_id IN (?)
5299+
) AND host_id IN (?)`
5300+
5301+
// Insert/update host_issues entries for hosts that are in policy_membership.
5302+
// Initially, these two statements were combined into one statement using `SELECT ? AS id UNION ALL` approach to include the host IDs that
5303+
// were not in policy_membership (similar how the above query for 1 host works). However, in load testing we saw an error: Thread stack overrun: 242191 bytes used of a 262144 byte stack
5304+
insertStmt := `
52735305
INSERT INTO host_issues (host_id, failing_policies_count, total_issues_count)
5274-
SELECT host_ids.id, COALESCE(SUM(!pm.passes), 0), COALESCE(SUM(!pm.passes), 0)
5306+
SELECT pm.host_id, COALESCE(SUM(!pm.passes), 0), COALESCE(SUM(!pm.passes), 0)
52755307
FROM policy_membership pm
5276-
RIGHT JOIN (%s) as host_ids
5277-
ON pm.host_id = host_ids.id
5278-
GROUP BY host_ids.id
5308+
WHERE pm.host_id IN (?)
5309+
GROUP BY pm.host_id
52795310
ON DUPLICATE KEY UPDATE
52805311
failing_policies_count = VALUES(failing_policies_count),
52815312
total_issues_count = VALUES(failing_policies_count) + critical_vulnerabilities_count`
52825313

52835314
// Large number of hosts could be impacted, so we update their host issues entries in batches to reduce lock time.
5284-
for i := 0; i < len(hostIDs); i += hostIssuesInsertBatchSize {
5315+
for i := 0; i < len(hostIDs); i += hostIssuesUpdateFailingPoliciesBatchSize {
52855316
start := i
5286-
end := i + hostIssuesInsertBatchSize
5317+
end := i + hostIssuesUpdateFailingPoliciesBatchSize
52875318
if end > len(hostIDs) {
52885319
end = len(hostIDs)
52895320
}
5290-
totalToProcess := end - start
52915321
hostIDsBatch := hostIDs[start:end]
52925322

5293-
inlineTable := strings.TrimSuffix(
5294-
strings.Repeat("SELECT ? AS id UNION ALL ", totalToProcess), " UNION ALL ",
5295-
)
5296-
5297-
args := make([]interface{}, totalToProcess)
5298-
for j := range hostIDsBatch {
5299-
args[j] = hostIDsBatch[j]
5323+
// Zero out failing policies count for hosts that are not in policy_membership
5324+
stmt, args, err := sqlx.In(clearStmt, hostIDsBatch, hostIDsBatch)
5325+
if err != nil {
5326+
return ctxerr.Wrap(ctx, err, "building IN statement for clearing host failing policy issues")
5327+
}
5328+
if _, err := tx.ExecContext(ctx, stmt, args...); err != nil {
5329+
return ctxerr.Wrap(ctx, err, "clearing failing policies in host issues")
5330+
}
5331+
// Update failing policies count for hosts that are in policy_membership
5332+
stmt, args, err = sqlx.In(insertStmt, hostIDsBatch)
5333+
if err != nil {
5334+
return ctxerr.Wrap(ctx, err, "building IN statement for updating host failing policy issues")
53005335
}
5301-
if _, err := tx.ExecContext(ctx, fmt.Sprintf(masterStmt, inlineTable), args...); err != nil {
5302-
return ctxerr.Wrap(ctx, err, "update failing policies in host issues")
5336+
if _, err := tx.ExecContext(ctx, stmt, args...); err != nil {
5337+
return ctxerr.Wrap(ctx, err, "updating failing policies in host issues")
53035338
}
53045339
}
53055340
return nil
@@ -5322,28 +5357,50 @@ func (ds *Datastore) UpdateHostIssuesVulnerabilities(ctx context.Context) error
53225357
return clearAllFn()
53235358
}
53245359

5360+
var allHostIDs []uint
5361+
if err := sqlx.SelectContext(ctx, ds.reader(ctx), &allHostIDs, `SELECT id FROM hosts ORDER BY id`); err != nil {
5362+
return ctxerr.Wrap(ctx, err, "get all host IDs")
5363+
}
5364+
53255365
type issuesCount struct {
53265366
HostID uint64 `db:"host_id"`
53275367
Count uint64 `db:"count"`
53285368
}
5329-
53305369
var criticalCounts []issuesCount
5331-
criticalVulnerabilitiesCountStmt := `
5370+
// We must batch the query extracting the critical vulnerabilities count because the query is too complex for MySQL to handle in one go.
5371+
// We saw MySQL error 1114 (HY000), where the temporary table reached its max capacity.
5372+
for i := 0; i < len(allHostIDs); i += hostIssuesInsertBatchSize {
5373+
start := i
5374+
end := i + hostIssuesInsertBatchSize
5375+
if end > len(allHostIDs) {
5376+
end = len(allHostIDs)
5377+
}
5378+
criticalVulnerabilitiesCountStmt := `
53325379
SELECT combined.host_id, COUNT(*) as count
53335380
FROM (SELECT host_id, cve
53345381
FROM host_software hs
53355382
INNER JOIN software_cve sc ON sc.software_id = hs.software_id
5383+
WHERE host_id IN (?)
53365384
UNION
53375385
SELECT host_id, cve
53385386
FROM host_operating_system hos
5339-
INNER JOIN operating_system_vulnerabilities osv ON osv.operating_system_id = hos.os_id) combined
5387+
INNER JOIN operating_system_vulnerabilities osv ON osv.operating_system_id = hos.os_id
5388+
WHERE host_id IN (?)
5389+
) combined
53405390
INNER JOIN cve_meta cm ON cm.cve = combined.cve
53415391
WHERE cm.cvss_score > ?
53425392
GROUP BY combined.host_id
53435393
ORDER BY combined.host_id`
5344-
err := sqlx.SelectContext(ctx, ds.reader(ctx), &criticalCounts, criticalVulnerabilitiesCountStmt, criticalCVSSScoreCutoff)
5345-
if err != nil {
5346-
return ctxerr.Wrap(ctx, err, "get critical vulnerabilities count")
5394+
stmt, args, err := sqlx.In(criticalVulnerabilitiesCountStmt, allHostIDs[start:end], allHostIDs[start:end], criticalCVSSScoreCutoff)
5395+
if err != nil {
5396+
return ctxerr.Wrap(ctx, err, "building IN statement for getting critical vulnerabilities count")
5397+
}
5398+
var batchCriticalCounts []issuesCount
5399+
err = sqlx.SelectContext(ctx, ds.reader(ctx), &batchCriticalCounts, stmt, args...)
5400+
if err != nil {
5401+
return ctxerr.Wrap(ctx, err, "get critical vulnerabilities count")
5402+
}
5403+
criticalCounts = append(criticalCounts, batchCriticalCounts...)
53475404
}
53485405

53495406
// Update the host_issues table, including deleting items with no issues
@@ -5382,7 +5439,7 @@ func (ds *Datastore) UpdateHostIssuesVulnerabilities(ctx context.Context) error
53825439
for j := start; j < end; j++ {
53835440
hostIDs = append(hostIDs, criticalCounts[j].HostID)
53845441
}
5385-
stmt, args, err = sqlx.In(
5442+
stmt, args, err := sqlx.In(
53865443
"UPDATE host_issues SET critical_vulnerabilities_count = 0, total_issues_count = failing_policies_count WHERE host_id NOT IN (?)",
53875444
hostIDs,
53885445
)

server/datastore/mysql/hosts_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -9381,12 +9381,15 @@ func testUpdateHostIssues(t *testing.T, ds *Datastore) {
93819381
// Test with small batch size and premium license
93829382
ctx = license.NewContext(ctx, &fleet.LicenseInfo{Tier: fleet.TierPremium})
93839383
insertBatchSizeOrig := hostIssuesInsertBatchSize
9384+
updateBatchSizeOrig := hostIssuesUpdateFailingPoliciesBatchSize
93849385
t.Cleanup(
93859386
func() {
93869387
hostIssuesInsertBatchSize = insertBatchSizeOrig
9388+
hostIssuesUpdateFailingPoliciesBatchSize = updateBatchSizeOrig
93879389
},
93889390
)
93899391
hostIssuesInsertBatchSize = 2
9392+
hostIssuesUpdateFailingPoliciesBatchSize = 2
93909393

93919394
assert.NoError(t, ds.UpdateHostIssuesFailingPolicies(ctx, hostIDs))
93929395
assert.NoError(t, ds.UpdateHostIssuesVulnerabilities(ctx))

0 commit comments

Comments
 (0)