fix: improve handling of triage data (intel#4160)

r-vdp · terriko · web-flow · commit f1d3c750ca05 · 2024-07-29T10:53:18.000-07:00
* Fix handling of triage data

Before this change, there were two issues when using an SBOM file
together with a VEX file for triage:
1. new CVEs for a product that already had CVEs in the triage file
   were not added to the triage file
2. triage info recorded in the triage file was overwritten when
   cve-bin-tool was executed with both the `--triage-file` and `vex`
   options.

This commit fixes both issues by:
1. still scanning for CVEs even if the product is already present in
   the triage file. Previously, we skipped the CVE scan as soon as we
   found that the product was already present, and by doing so we could
   miss new CVEs published since the last scan.
2. merging recorded triage info into the CVEs that we found for the SBOM
   components.

In order to properly identify products, I implemented the `__hash__` and
`__eq__` methods on the `ProductInfo` type so that they ignore the
`location` field, as this field does not seem to be populated in a
consistent manner.

* fixup! Fix handling of triage data

* chore: nitpick for black

---------

Co-authored-by: Terri Oda <terri.oda@intel.com>
diff --git a/cve_bin_tool/cve_scanner.py b/cve_bin_tool/cve_scanner.py
@@ -16,7 +16,7 @@
 from cve_bin_tool.input_engine import TriageData
 from cve_bin_tool.log import LOGGER
 from cve_bin_tool.theme import cve_theme
-from cve_bin_tool.util import CVE, CVEData, ProductInfo, VersionInfo
+from cve_bin_tool.util import CVE, CVEData, ProductInfo, Remarks, VersionInfo
 from cve_bin_tool.version_compare import Version
 
 
@@ -180,8 +180,13 @@ def get_cves(self, product_info: ProductInfo, triage_data: TriageData):
                     end_excluding=version_end_excluding,
                 )
 
-        # Go through and get all the severities
+        product_info_data: CVEData | None = self.all_cve_data.get(product_info)
+        prev_cves: List[CVE] = (
+            product_info_data.get("cves", []) if product_info_data is not None else []  # type: ignore
+        )
         cves: List[CVE] = []
+
+        # Go through and get all the severities
         if cve_list:
             finished = False
             max_cves = 500
@@ -223,15 +228,26 @@ def get_cves(self, product_info: ProductInfo, triage_data: TriageData):
                     if duplicate_found:
                         continue
 
+                    # Check if we already found this CVE with a previous scan.
+                    # In that case we need to check where to get our triage info
+                    # from.
+                    # TODO: turn the list of CVEs into a set to avoid needing
+                    # the linear-time lookup.
+                    prev_cve = next(
+                        (
+                            cve
+                            for cve in prev_cves
+                            if cve.cve_number == row["cve_number"]
+                        ),
+                        None,
+                    )
+
                     triage = triage_data.get(row["cve_number"]) or triage_data.get(
                         "default"
                     )
-                    # Only scan cves if triage is not None.
-                    # Triage will only be None if triage_data don't have default attribute.
-                    # NOTE: Triage can be empty dictionary so checking `if triage:` won't suffice.
-                    if triage is not None:
+                    if prev_cve is None:
                         row_dict = dict(row)
-                        row_dict.update(triage)
+
                         # print(row_dict)
                         row_dict["severity"] = row_dict["severity"] or row["severity"]
                         # Checking for exploits
@@ -274,7 +290,39 @@ def get_cves(self, product_info: ProductInfo, triage_data: TriageData):
                                 f'metrics found in CVE {row_dict["cve_number"]}  is {row_dict["metric"]}'
                             )
                         cve = CVE(**row_dict)
-                        cves.append(cve)
+                    else:
+                        cve = prev_cve
+
+                    # We assume that only one source has the triage info.
+                    # We try to figure out here which one.
+                    # If we have useful info in the triage data we received,
+                    # then we use it.
+                    if triage is not None and (
+                        # Either the new cve does not have triage data,
+                        # or it is trivial (newly found cve)
+                        not cve.remarks
+                        or cve.remarks == Remarks.NewFound
+                    ):
+                        for key in [
+                            "remarks",
+                            "comments",
+                            "response",
+                            "justification",
+                            "severity",
+                        ]:
+                            data = triage.get(key)
+                            if data:
+                                if (
+                                    key == "severity"
+                                    and self.check_exploits
+                                    and row_dict["cve_number"] in self.exploits_list
+                                ):
+                                    data += "-EXPLOIT"
+
+                                self.logger.debug(f"Setting field {key} to: {data}")
+                                cve = cve._replace(**{key: data})
+
+                    cves.append(cve)
 
             if cves:
                 self.products_with_cve += 1
diff --git a/cve_bin_tool/util.py b/cve_bin_tool/util.py
@@ -1,7 +1,8 @@
 # Copyright (C) 2021 Intel Corporation
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-""" Utility classes for the CVE Binary Tool """
+"""Utility classes for the CVE Binary Tool"""
+
 from __future__ import annotations
 
 import fnmatch
@@ -165,6 +166,24 @@ class ProductInfo(NamedTuple):
     location: str
     purl: str | None = None
 
+    def __identity_members(self):
+        """The members that will be used for eq and hash implementations.
+        We do not include location here since it can take on different values
+        depending on where the product info is coming from and we want to be
+        able to properly identify products that are actually the same.
+        """
+        # TODO: what is the meaning of the location field exactly?
+        return (self.vendor, self.product, self.version)
+
+    def __eq__(self, other):
+        if type(other) is type(self):
+            return self.__identity_members() == other.__identity_members()
+        else:
+            return False
+
+    def __hash__(self):
+        return hash(self.__identity_members())
+
 
 class ScanInfo(NamedTuple):
     """