
Commit 3d533b9

Convert Cuboid2D to/from KITTI 3D data (#1639)
### Summary

CVS-151427

#### New features

- New `Cuboid2D` methods:
  - `Cuboid2D.from_3d(dim, location, rotation_y, P, Tr_velo_to_cam)`: creates a `Cuboid2D` object from KITTI 3D bbox annotation data. Matrix `P` (`P2` in the KITTI format context) is the 3x4 projection matrix of the left color camera. Matrix `Tr_velo_to_cam` is the 3x4 transformation matrix between the LiDAR and camera coordinate systems.
  - `cuboid_2d.to_3d(P_inv)`: reconstructs approximate KITTI 3D bbox annotation data (`dimensions`, `location` and `rotation_y`) from the 2D projection coordinates. `P_inv` is the [pseudo-inverse](https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse) of the camera-to-image projection matrix.

### How to test

See unit test changes.

### Checklist

- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Ilya Trushkin <[email protected]>
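A minimal usage sketch (not part of this commit) of how the new API is meant to be called; the calibration matrices and box parameters below mirror the fixtures added in `tests/unit/test_annotation.py`:

```python
import numpy as np

from datumaro.components.annotation import Cuboid2D

# KITTI calibration: left color camera projection matrix (P2) and the
# LiDAR-to-camera transform (values taken from the new unit-test fixtures)
P2 = np.array(
    [
        [721.5377, 0.0, 609.5593, 44.85728],
        [0.0, 721.5377, 172.854, 0.2163791],
        [0.0, 0.0, 1.0, 0.002745884],
    ]
)
Tr_velo_to_cam = np.array(
    [
        [7.533745e-03, -9.999714e-01, -6.166020e-04, -4.069766e-03],
        [1.480249e-02, 7.280733e-04, -9.998902e-01, -7.631618e-02],
        [9.998621e-01, 7.523790e-03, 1.480755e-02, -2.717806e-01],
    ]
)

# KITTI 3D bbox parameters: dimensions (h, w, l), location (x, y, z), rotation_y
cuboid = Cuboid2D.from_3d(
    dim=np.array([1.49, 1.56, 4.34]),
    location=np.array([2.51, 1.49, 14.75]),
    rotation_y=-1.59,
    P=P2,
    Tr_velo_to_cam=Tr_velo_to_cam,
)
print(cuboid.points)  # eight projected (x, y) corners

# Approximate round-trip back to KITTI-style 3D parameters
dimensions, location, rotation_y = cuboid.to_3d(np.linalg.pinv(P2))
```

The round trip is approximate by design (the new unit test compares with `atol=2`), since a single 2D projection constrains the box only up to the stored `y_3d` scale.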
1 parent eb19963 commit 3d533b9

File tree

4 files changed: +259, -9 lines

CHANGELOG.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1619>)
 - Add PseudoLabeling transform for unlabeled dataset
   (<https://github.com/openvinotoolkit/datumaro/pull/1594>)
+- Convert Cuboid2D annotation to/from 3D data
+  (<https://github.com/openvinotoolkit/datumaro/pull/1639>)
 
 ### Enhancements
 - Enhance 'id_from_image_name' transform to ensure each identifier is unique
```

src/datumaro/components/annotation.py

Lines changed: 191 additions & 7 deletions

```diff
@@ -24,6 +24,7 @@
 )
 
 import attr
+import cv2
 import numpy as np
 import shapely.geometry as sg
 from attr import asdict, attrs, field
@@ -1372,19 +1373,19 @@ class Cuboid2D(Annotation):
     [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x5, y5), (x6, y6), (x7, y7), (x8, y8)].
 
 
-          6---7
+          2---3
          /|  /|
-        5-+-8 |
-        | 2 + 3
+        1-+-4 |
+        | 5 + 6
         |/  |/
-        1---4
+        8---7
 
     Attributes:
-        _type (AnnotationType): The type of annotation, set to `AnnotationType.bbox`.
+        _type (AnnotationType): The type of annotation, set to `AnnotationType.cuboid_2d`.
 
     Methods:
         __init__: Initializes the Cuboid2D with its coordinates.
-        wrap: Creates a new Bbox instance with updated attributes.
+        wrap: Creates a new Cuboid2D instance with updated attributes.
     """
 
     _type = AnnotationType.cuboid_2d
@@ -1393,11 +1394,194 @@
         converter=attr.converters.optional(int), default=None, kw_only=True
     )
     z_order: int = field(default=0, validator=default_if_none(int), kw_only=True)
+    y_3d: float = field(default=None, validator=default_if_none(float), kw_only=True)
 
-    def __init__(self, _points: Iterable[Tuple[float, float]], *args, **kwargs):
+    def __init__(
+        self,
+        _points: Iterable[Tuple[float, float]],
+        *args,
+        **kwargs,
+    ):
         kwargs.pop("points", None)  # comes from wrap()
         self.__attrs_init__(points=_points, *args, **kwargs)
 
+    @staticmethod
+    def _get_plane_equation(points):
+        """Calculates coefficients of the plane equation from three points."""
+        x1, y1, z1 = points[0, 0], points[0, 1], points[0, 2]
+        x2, y2, z2 = points[1, 0], points[1, 1], points[1, 2]
+        x3, y3, z3 = points[2, 0], points[2, 1], points[2, 2]
+        a1 = x2 - x1
+        b1 = y2 - y1
+        c1 = z2 - z1
+        a2 = x3 - x1
+        b2 = y3 - y1
+        c2 = z3 - z1
+        a = b1 * c2 - b2 * c1
+        b = a2 * c1 - a1 * c2
+        c = a1 * b2 - b1 * a2
+        d = -a * x1 - b * y1 - c * z1
+        return np.array([a, b, c, d])
+
+    @staticmethod
+    def _get_denorm(Tr_velo_to_cam_homo):
+        """Calculates the denormalized vector perpendicular to the image plane.
+        Args:
+            Tr_velo_to_cam_homo (np.ndarray): Homogeneous (4x4) LiDAR-to-camera transformation matrix
+        Returns:
+            np.ndarray: vector"""
+        ground_points_lidar = np.array([[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 0.0]])
+        ground_points_lidar = np.concatenate(
+            (ground_points_lidar, np.ones((ground_points_lidar.shape[0], 1))), axis=1
+        )
+        ground_points_cam = np.matmul(Tr_velo_to_cam_homo, ground_points_lidar.T).T
+        denorm = -1 * Cuboid2D._get_plane_equation(ground_points_cam)
+        return denorm
+
+    @staticmethod
+    def _get_3d_points(dim, location, rotation_y, denorm):
+        """Get corner points according to the 3D bounding box parameters.
+
+        Args:
+            dim (List[float]): The dimensions of the 3D bounding box as [h, w, l].
+            location (List[float]): The location of the 3D bounding box as [x, y, z].
+            rotation_y (float): The rotation angle around the y-axis.
+
+        Returns:
+            np.ndarray: The corner points of the 3D bounding box.
+        """
+
+        c, s = np.cos(rotation_y), np.sin(rotation_y)
+        R = np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]], dtype=np.float32)
+        l, w, h = dim[2], dim[1], dim[0]
+        x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2]
+        y_corners = [0, 0, 0, 0, -h, -h, -h, -h]
+        z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2]
+
+        corners = np.array([x_corners, y_corners, z_corners], dtype=np.float32)
+        corners_3d = np.dot(R, corners)
+
+        denorm = denorm[:3]
+        denorm_norm = denorm / np.sqrt(denorm[0] ** 2 + denorm[1] ** 2 + denorm[2] ** 2)
+        ori_denorm = np.array([0.0, -1.0, 0.0])
+        theta = -1 * math.acos(np.dot(denorm_norm, ori_denorm))
+        n_vector = np.cross(denorm, ori_denorm)
+        n_vector_norm = n_vector / np.sqrt(n_vector[0] ** 2 + n_vector[1] ** 2 + n_vector[2] ** 2)
+        rotation_matrix, j = cv2.Rodrigues(theta * n_vector_norm)
+        corners_3d = np.dot(rotation_matrix, corners_3d)
+        corners_3d = corners_3d + np.array(location, dtype=np.float32).reshape(3, 1)
+        return corners_3d.transpose(1, 0)
+
+    @staticmethod
+    def _project_to_2d(pts_3d, P):
+        """Project 3D points to 2D image plane.
+
+        Args:
+            pts_3d (np.ndarray): The 3D points to project.
+            P (np.ndarray): The projection matrix.
+
+        Returns:
+            np.ndarray: The 2D points projected to the image plane.
+        """
+        # Convert to homogeneous coordinates
+        pts_3d = pts_3d.T
+        pts_3d_homo = np.vstack((pts_3d, np.ones(pts_3d.shape[1])))
+        pts_2d = P @ pts_3d_homo
+        pts_2d[0, :] = np.divide(pts_2d[0, :], pts_2d[2, :])
+        pts_2d[1, :] = np.divide(pts_2d[1, :], pts_2d[2, :])
+        pts_2d = pts_2d[:2, :].T
+
+        return pts_2d
+
+    @classmethod
+    def from_3d(
+        cls,
+        dim: np.ndarray,
+        location: np.ndarray,
+        rotation_y: float,
+        P: np.ndarray,
+        Tr_velo_to_cam: np.ndarray,
+    ) -> Cuboid2D:
+        """Creates an instance of Cuboid2D class from 3D bounding box parameters.
+
+        Args:
+            dim (np.ndarray): 3 scalars describing length, width and height of a 3D bounding box
+            location (np.ndarray): (x, y, z) coordinates of the middle of the top face.
+            rotation_y (float): rotation along the Y-axis (from -pi to pi)
+            P (np.ndarray): Camera-to-Image transformation matrix (3x4)
+            Tr_velo_to_cam (np.ndarray): LiDAR-to-Camera transformation matrix (3x4)
+
+        Returns:
+            Cuboid2D: Projection points for the given bounding box
+        """
+        Tr_velo_to_cam_homo = np.eye(4)
+        Tr_velo_to_cam_homo[:3, :4] = Tr_velo_to_cam
+        denorm = cls._get_denorm(Tr_velo_to_cam_homo)
+        pts_3d = cls._get_3d_points(dim, location, rotation_y, denorm)
+        y_3d = np.mean(pts_3d[:4, 1])
+        pts_2d = cls._project_to_2d(pts_3d, P)
+
+        return cls(list(map(tuple, pts_2d)), y_3d=y_3d)
+
+    def to_3d(self, P_inv: np.ndarray) -> tuple[np.ndarray, np.ndarray, float]:
+        """Reconstructs 3D object Velodyne coordinates (dimensions, location and rotation along the Y-axis)
+        from the given Cuboid2D instance.
+
+        Args:
+            P_inv (np.ndarray): Pseudo-inverse of Camera-to-Image projection matrix
+        Returns:
+            tuple: dimensions, location and rotation along the Y-axis
+        """
+        recon_3d = []
+        for idx, coord_2d in enumerate(self.points):
+            coord_2d = np.append(coord_2d, 1)
+            coord_3d = P_inv @ coord_2d
+            if idx < 4:
+                coord_3d = coord_3d * self.y_3d / coord_3d[1]
+            else:
+                coord_3d = coord_3d * recon_3d[idx - 4][0] / coord_3d[0]
+            recon_3d.append(coord_3d[:3])
+        recon_3d = np.array(recon_3d)
+
+        x = np.mean(recon_3d[:, 0])
+        z = np.mean(recon_3d[:, 2])
+
+        yaws = []
+        pairs = [(0, 1), (3, 2), (4, 5), (7, 6)]
+        for p in pairs:
+            delta_x = recon_3d[p[0]][0] - recon_3d[p[1]][0]
+            delta_z = recon_3d[p[0]][2] - recon_3d[p[1]][2]
+            yaws.append(np.arctan2(delta_x, delta_z))
+        yaw = np.mean(yaws)
+
+        widths = []
+        pairs = [(0, 1), (2, 3), (4, 5), (6, 7)]
+        for p in pairs:
+            delta_x = np.sqrt(
+                (recon_3d[p[0]][0] - recon_3d[p[1]][0]) ** 2
+                + (recon_3d[p[0]][2] - recon_3d[p[1]][2]) ** 2
+            )
+            widths.append(delta_x)
+        w = np.mean(widths)
+
+        lengths = []
+        pairs = [(1, 2), (0, 3), (5, 6), (4, 7)]
+        for p in pairs:
+            delta_z = np.sqrt(
+                (recon_3d[p[0]][0] - recon_3d[p[1]][0]) ** 2
+                + (recon_3d[p[0]][2] - recon_3d[p[1]][2]) ** 2
+            )
+            lengths.append(delta_z)
+        l = np.mean(lengths)
+
+        heights = []
+        pairs = [(0, 4), (1, 5), (2, 6), (3, 7)]
+        for p in pairs:
+            delta_y = np.abs(recon_3d[p[0]][1] - recon_3d[p[1]][1])
+            heights.append(delta_y)
+        h = np.mean(heights)
+        return np.array([h, w, l]), np.array([x, self.y_3d, z]), yaw
+
 
 @attrs(slots=True, order=False)
 class PointsCategories(Categories):
```

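For context (not part of the commit): `_project_to_2d` applies the standard pinhole projection, and `to_3d` inverts it with the pseudo-inverse `P_inv`. Back-projecting a pixel only defines a ray, i.e. each corner is recovered up to an unknown scale, which is why `from_3d` stores `y_3d` (the mean camera-frame height of the first four corners) and `to_3d` uses it to fix that scale:

$$
\tilde{p} = P\,\tilde{X}, \qquad
(u, v) = \left(\frac{\tilde{p}_1}{\tilde{p}_3},\ \frac{\tilde{p}_2}{\tilde{p}_3}\right), \qquad
X \approx \lambda\,\bigl[P^{+}(u, v, 1)^{\mathsf{T}}\bigr]_{1:3}
$$

where the scale $\lambda$ is chosen so that the first four corners land at $y = y_{3d}$, and each remaining corner matches the $x$ coordinate of its corresponding corner from the first four, exactly as done in `to_3d`.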
src/datumaro/components/visualizer.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -679,8 +679,8 @@ def _draw_cuboid_2d(
         # Define the faces based on vertex indices
 
         faces = [
-            [points[i] for i in [0, 1, 2, 3]],  # Bottom face
-            [points[i] for i in [4, 5, 6, 7]],  # Top face
+            [points[i] for i in [0, 1, 2, 3]],  # Top face
+            [points[i] for i in [4, 5, 6, 7]],  # Bottom face
             [points[i] for i in [0, 1, 5, 4]],  # Front face
             [points[i] for i in [1, 2, 6, 5]],  # Right face
             [points[i] for i in [2, 3, 7, 6]],  # Back face
```
tests/unit/test_annotation.py

Lines changed: 64 additions & 0 deletions

```diff
@@ -13,6 +13,7 @@
 
 from datumaro.components.annotation import (
     Annotations,
+    Cuboid2D,
     Ellipse,
     ExtractedMask,
     HashKey,
@@ -142,3 +143,66 @@ def test_get_semantic_seg_mask_binary_mask(self, fxt_index_mask, dtype):
         semantic_seg_mask = annotations.get_semantic_seg_mask(ignore_index=255, dtype=dtype)
 
         assert np.allclose(semantic_seg_mask, fxt_index_mask)
+
+
+class Cuboid2DTest:
+    @pytest.fixture
+    def fxt_cuboid_2d(self):
+        return Cuboid2D(
+            [
+                (684.172, 237.810),
+                (750.486, 237.673),
+                (803.791, 256.313),
+                (714.712, 256.542),
+                (684.035, 174.227),
+                (750.263, 174.203),
+                (803.399, 170.990),
+                (714.476, 171.015),
+            ],
+            y_3d=1.49,
+        )
+
+    @pytest.fixture
+    def fxt_kitti_data(self):
+        dimensions = np.array([1.49, 1.56, 4.34])
+        location = np.array([2.51, 1.49, 14.75])
+        rot_y = -1.59
+
+        return dimensions, location, rot_y
+
+    @pytest.fixture
+    def fxt_P2(self):
+        return np.array(
+            [
+                [7.215377000000e02, 0.000000000000e00, 6.095593000000e02, 4.485728000000e01],
+                [0.000000000000e00, 7.215377000000e02, 1.728540000000e02, 2.163791000000e-01],
+                [0.000000000000e00, 0.000000000000e00, 1.000000000000e00, 2.745884000000e-03],
+            ]
+        )
+
+    @pytest.fixture
+    def fxt_velo_to_cam(self):
+        return np.array(
+            [
+                [7.533745000000e-03, -9.999714000000e-01, -6.166020000000e-04, -4.069766000000e-03],
+                [1.480249000000e-02, 7.280733000000e-04, -9.998902000000e-01, -7.631618000000e-02],
+                [9.998621000000e-01, 7.523790000000e-03, 1.480755000000e-02, -2.717806000000e-01],
+            ]
+        )
+
+    def test_create_from_3d(self, fxt_kitti_data, fxt_cuboid_2d, fxt_P2, fxt_velo_to_cam):
+        actual = Cuboid2D.from_3d(
+            dim=fxt_kitti_data[0],
+            location=fxt_kitti_data[1],
+            rotation_y=fxt_kitti_data[2],
+            P=fxt_P2,
+            Tr_velo_to_cam=fxt_velo_to_cam,
+        )
+
+        assert np.allclose(actual.points, fxt_cuboid_2d.points, atol=1e-3)
+
+    def test_to_3d(self, fxt_kitti_data, fxt_cuboid_2d, fxt_P2):
+        P_inv = np.linalg.pinv(fxt_P2)
+        actual = fxt_cuboid_2d.to_3d(P_inv)
+        for act, exp in zip(actual, fxt_kitti_data):
+            assert np.allclose(act, exp, atol=2)
```