Merged commit includes the following changes:
441839576  by Waymo Research:

    Add SemSeg to submission_manager

--
441818945  by Waymo Research:

    Update readme

--
441802666  by Waymo Research:

    Add instruction on creating 3d semseg submission files.

--
441799995  by Waymo Research:

    Internal changes.

--
441648297  by Waymo Research:

    Add a few new utils for Segmentation metric calculation

--
441621310  by Waymo Research:

    Update readme for the April release

--
441620538  by Waymo Research:

    Internal change.

--
441582306  by Waymo Research:

    Add z-axis speed and acceleration in lidar label metadata.

--
441576134  by Waymo Research:

    Support rolling shutter projection on moving points.

--
441545916  by Waymo Research:

    Add Camera-Only 3D Detection task to submission proto.

--
441386180  by Waymo Research:

    Add camera_synced_box to label proto.

--
441274777  by Waymo Research:

    Fix documentation: order was not correct for height/width on build_camera_depth_image

--
441223622  by Waymo Research:

    Add Camera breakdown in metrics.

--
440180344  by Waymo Research:

    Add optional most visible camera name to lidar labels.

--
439742974  by Waymo Research:

    Fix a bug in segmentation metrics tool which will cause OOM.

--
439414596  by Waymo Research:

    Minor fix on the comments.

--

PiperOrigin-RevId: 441839576
Waymo Research authored and Alexander Gorban committed Apr 15, 2022
1 parent 7fd7da0 commit 2a7359b
Showing 22 changed files with 3,775 additions and 48 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -4,21 +4,23 @@

The Waymo Open Dataset is composed of two datasets - the Perception dataset with high resolution sensor data and labels for 1,950 scenes, and the Motion dataset with object trajectories and corresponding 3D maps for 103,354 scenes.

## April 2022 Update
We released v1.3.1 of the Perception dataset to support the 2022 Challenges and have updated this repository accordingly.
- Added metrics (LET-3D-APL and LET-3D-AP) for the 3D Camera-Only Detection Challenge.
- Added 80 segments of 20-second camera imagery, as a test set for the 3D Camera-Only Detection Challenge.
- Added z-axis speed and acceleration in [lidar label metadata](waymo_open_dataset/label.proto#L53-L60).
- Updated the default configuration for the Occupancy and Flow Challenge, switching from aggregate waypoints to [subsampled waypoints](waymo_open_dataset/protos/occupancy_flow_metrics.proto#L38-L55).
- Updated the [tutorial](waymo-open-dataset/tutorial/tutorial_3d_semseg.ipynb) for the 3D Semantic Segmentation Challenge with more detailed instructions.

## March 2022 Update

We released v1.3 of the Perception dataset and the 2022 challenges. We have updated this repository to add support for the new labels and the challenges.
We released v1.3.0 of the Perception dataset and the 2022 challenges. We have updated this repository to add support for the new labels and the challenges.
- Added 3D semantic segmentation labels, tutorial, and metrics.
- Added 2D and 3D keypoint labels, tutorial, and metrics.
- Added correspondence between 2D and 3D labels.
- Added tutorial and utilities for Occupancy Flow Prediction Challenge.
- Added the soft mAP metric for Motion Prediction Challenge.

Coming soon:
- Add scenes with camera imagery, as a test set for the Camera-Only 3D Detection Challenge.
- Add a new metric for the Camera-Only 3D Detection Challenge.


## September 2021 Update

We released v1.1 of the Motion dataset to include lane connectivity information. To read more on the technical details, please read [lane_neighbors_and_boundaries.md](docs/lane_neighbors_and_boundaries.md).
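For reference, the z-axis fields called out in the April update can be read straight from a lidar label's metadata. A minimal sketch, assuming the v1.3.1 label.proto mirrors the existing `speed_x`/`speed_y`/`accel_x`/`accel_y` naming with `speed_z`/`accel_z`:

```cpp
#include <iostream>

#include "waymo_open_dataset/label.pb.h"

// Prints per-object velocity and acceleration from a lidar label.
// speed_z/accel_z are assumed field names (new in the April 2022 release).
void PrintLabelDynamics(const waymo::open_dataset::Label& label) {
  if (!label.has_metadata()) return;
  const auto& m = label.metadata();
  std::cout << "speed (m/s):   " << m.speed_x() << ", " << m.speed_y() << ", "
            << m.speed_z() << "\n"
            << "accel (m/s^2): " << m.accel_x() << ", " << m.accel_y() << ", "
            << m.accel_z() << "\n";
}
```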
58 changes: 45 additions & 13 deletions third_party/camera/camera_model.cc
@@ -197,6 +197,8 @@ struct CameraModel::GlobalShutterState {
Eigen::Isometry3d n_tfm_cam0;
// Transformation from ENU to camera at pose timestamp.
Eigen::Isometry3d cam_tfm_n;
// Transformation from camera to vehicle at pose timestamp.
Eigen::Isometry3d vehicle_tfm_cam;
};

CameraModel::CameraModel(const CameraCalibration& calibration)
@@ -216,6 +218,7 @@ void CameraModel::PrepareProjection(const CameraImage& camera_image) {
if (global_shutter_state_ == nullptr) {
global_shutter_state_ = absl::make_unique<GlobalShutterState>();
}
global_shutter_state_->vehicle_tfm_cam = vehicle_tfm_cam;
global_shutter_state_->n_tfm_cam0 = n_tfm_vehicle0 * vehicle_tfm_cam;
global_shutter_state_->cam_tfm_n =
global_shutter_state_->n_tfm_cam0.inverse();
@@ -318,7 +321,7 @@ void CameraModel::PrepareProjection(const CameraImage& camera_image) {
// In this function, we are solving a scalar nonlinear optimization problem:
// Min || t_h - IndexToTimeFromNormalizedCoord(Cam_p_f(t)) + t_offset ||
// over t_h where t_h is explained below.
// where Cam_p_f(t) = projection(n_p_f, n_tfm_cam(t))
// where Cam_p_f(t) = projection(n_p_f + t * n_v_f, n_tfm_cam(t))
// The timestamps involved in the optimization problem:
// t_capture: The timestamp the rolling shutter camera can actually catch the
// point landmark. (this defines which scanline the point landmark falls in the
@@ -339,9 +342,10 @@
// coordinate space instead of going to the distortion space. The testing
// results show that we get sufficiently good results already in normalized
// coordinate space.
bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
bool CameraModel::WorldToImageMovingPointWithDepth(double x, double y, double z,
double v_x, double v_y, double v_z,
bool check_image_bounds, double* u_d,
double* v_d, double* depth) const {
double* v_d, double* depth) const {
if (calibration_.rolling_shutter_direction() ==
CameraCalibration::GLOBAL_SHUTTER) {
return WorldToImageWithDepthGlobalShutter(x, y, z, check_image_bounds, u_d,
@@ -351,6 +355,7 @@ bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
// The initial guess is the center of the image.
double t_h = 0.;
const Eigen::Vector3d n_pos_f{x, y, z};
const Eigen::Vector3d n_vel_f{v_x, v_y, v_z};
size_t iter_num = 0;

// This threshold roughly corresponds to sub-pixel error for our camera
@@ -365,9 +370,10 @@ bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
double point_depth = -1;

while (std::fabs(residual) > kThreshold && iter_num < kMaxIterNum) {
if (!ComputeDepthResidualAndJacobian(n_pos_f, t_h, &normalized_coord,
&point_depth, &residual, &jacobian)) {
// The point is behind camera or the radial distortion is too large.
if (!ComputeDepthResidualAndJacobian(n_pos_f, n_vel_f, t_h,
&normalized_coord, &point_depth,
&residual, &jacobian)) {
// Return false if the point is behind the camera.
*u_d = -1;
*v_d = -1;
if (depth)
@@ -382,10 +388,10 @@ bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
}

// Get normalized coordinate.
if (!ComputeDepthResidualAndJacobian(n_pos_f, t_h, &normalized_coord,
if (!ComputeDepthResidualAndJacobian(n_pos_f, n_vel_f, t_h, &normalized_coord,
&point_depth, &residual,
/*jacobian=*/nullptr)) {
// The point is behind camera or the radial distortion is too large.
// Return false if the point is behind the camera.
*u_d = -1;
*v_d = -1;
if (depth)
@@ -396,6 +402,7 @@ bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
if (depth)
*depth = point_depth;

// Return false if the radial distortion is too large.
if (!DirectionToImage(normalized_coord(0), normalized_coord(1), u_d, v_d)) {
return false;
}
@@ -408,6 +415,13 @@ bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
return true;
}

bool CameraModel::WorldToImageWithDepth(double x, double y, double z,
bool check_image_bounds, double* u_d,
double* v_d, double* depth) const {
return WorldToImageMovingPointWithDepth(x, y, z, 0.0, 0.0, 0.0,
check_image_bounds, u_d, v_d, depth);
}

bool CameraModel::WorldToImage(double x, double y, double z,
bool check_image_bounds, double* u_d,
double* v_d) const {
@@ -463,6 +477,22 @@ void CameraModel::ImageToWorldGlobalShutter(double u_d, double v_d,
*z = wp(2);
}

void CameraModel::ImageToVehicleGlobalShutter(double u_d, double v_d,
double depth, double* x,
double* y, double* z) const {
CHECK(x);
CHECK(y);
CHECK(z);
CHECK(global_shutter_state_) << "Please call PrepareProjection() first.";
double u_n = 0.0, v_n = 0.0;
ImageToDirection(u_d, v_d, &u_n, &v_n);
const Eigen::Vector3d wp = global_shutter_state_->vehicle_tfm_cam *
Eigen::Vector3d(depth, -u_n * depth, -v_n * depth);
*x = wp(0);
*y = wp(1);
*z = wp(2);
}

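For orientation, the new ImageToVehicleGlobalShutter above back-projects a pixel with a known depth using the same camera-frame convention as ImageToWorldGlobalShutter, only stopping at the vehicle frame. In the notation of the surrounding code (a sketch; the camera frame is assumed to have +x forward, +y left, +z up, which is what the negated normalized coordinates encode):

    cam_p_f     = depth * (1, -u_n, -v_n)
    vehicle_p_f = vehicle_tfm_cam * cam_p_f

where (u_n, v_n) is the undistorted direction from ImageToDirection and vehicle_tfm_cam is the camera extrinsic cached by PrepareProjection.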
bool CameraModel::CameraToImageWithDepth(double x, double y, double z,
bool check_image_bounds, double* u_d,
double* v_d, double* depth) const {
@@ -581,7 +611,7 @@ bool CameraModel::DirectionToImage(double u_n, double v_n, double* u_d,
}

bool CameraModel::ComputeDepthResidualAndJacobian(
const Eigen::Vector3d& n_pos_f, double t_h,
const Eigen::Vector3d& n_pos_f, const Eigen::Vector3d& n_vel_f, double t_h,
Eigen::Vector2d* normalized_coord, double* depth, double* residual,
double* jacobian) const {
// The jacobian is allowed to be a nullptr.
@@ -596,7 +626,8 @@ bool CameraModel::ComputeDepthResidualAndJacobian(
const Eigen::Vector3d n_pos_cam =
rolling_shutter_state.n_tfm_cam0.translation() +
t_h * rolling_shutter_state.n_vel_cam0;
const Eigen::Vector3d cam_pos_f = cam_dcm_n * (n_pos_f - n_pos_cam);
const Eigen::Vector3d cam_pos_f =
cam_dcm_n * (n_pos_f + t_h * n_vel_f - n_pos_cam);

if (cam_pos_f(0) <= 0) {
// The point is behind camera.
@@ -620,7 +651,7 @@ bool CameraModel::ComputeDepthResidualAndJacobian(
// The following is based on a reduced form of the derivative. The details
// of the way to derive that derivative are skipped here.
const Eigen::Vector3d jacobian_landmark_to_index =
-cam_dcm_n * rolling_shutter_state.n_vel_cam0 -
cam_dcm_n * (n_vel_f - rolling_shutter_state.n_vel_cam0) -
rolling_shutter_state.skew_omega * cam_pos_f;

const double jacobian_combined =
@@ -638,12 +669,13 @@ bool CameraModel::ComputeDepthResidualAndJacobian(
}

bool CameraModel::ComputeResidualAndJacobian(const Eigen::Vector3d& n_pos_f,
const Eigen::Vector3d& n_vel_f,
double t_h,
Eigen::Vector2d* normalized_coord,
double* residual,
double* jacobian) const {
return ComputeDepthResidualAndJacobian(n_pos_f, t_h, normalized_coord,
nullptr, residual, jacobian);
return ComputeDepthResidualAndJacobian(
n_pos_f, n_vel_f, t_h, normalized_coord, nullptr, residual, jacobian);
}

} // namespace open_dataset
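A short recap of the solve that the moving-point change extends: WorldToImageMovingPointWithDepth still runs the scalar Newton iteration described in the comment block above, but the landmark now travels with constant velocity, so both the residual and its Jacobian pick up an n_vel_f term. Schematically (a sketch of the update implied by the code, in the comment's own notation):

    residual(t_h) = t_h - IndexToTimeFromNormalizedCoord(Cam_p_f(t_h)) + t_offset
    Cam_p_f(t_h)  = projection(n_p_f + t_h * n_v_f, n_tfm_cam(t_h))
    t_h          <- t_h - residual(t_h) / jacobian(t_h)

With zero velocity this reduces to the original static solve, which is why WorldToImageWithDepth now simply delegates to WorldToImageMovingPointWithDepth with v_x = v_y = v_z = 0.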
29 changes: 24 additions & 5 deletions third_party/camera/camera_model.h
@@ -76,18 +76,34 @@ class CameraModel {

// Projects a 3D point in global coordinates into the lens distorted image
// coordinates (u_d, v_d, depth). These projections are in original image
// frame (x: image width, y: image height).
// frame (x: image width, y: image height). Outputs a depth of -1 if the given
// point is behind the camera. Note that `depth` may be a nullptr.
bool WorldToImageWithDepth(double x, double y, double z,
bool check_image_bounds, double* u_d, double* v_d,
double* depth) const;

// Projects a moving point (x, y, z) with constant velocity (v_x, v_y, v_z) in
// the global coordinate system into the lens distorted image coordinate
// system (u_d, v_d, depth). It is assumed that the point is at position
// (x, y, z) at the timestamp that is associated with the camera image pose
// used to prepare the projection.
bool WorldToImageMovingPointWithDepth(double x, double y, double z,
double v_x, double v_y, double v_z,
bool check_image_bounds, double* u_d,
double* v_d, double* depth) const;

// Converts a point in the image with a known depth into world coordinates.
// Similar to `WorldToImage`. This method also compensates for the rolling shutter
// effect if applicable.
// Requires: `PrepareProjection` is called.
void ImageToWorld(double u_d, double v_d, double depth, double* x, double* y,
double* z) const;

// Similar to `ImageToWorldGlobalShutter` but converts to the vehicle frame.
// Requires: `PrepareProjection` is called.
void ImageToVehicleGlobalShutter(double u_d, double v_d, double depth,
double* x, double* y, double* z) const;

// True if the given image coordinates are within the image.
bool InImage(double u, double v) const;

@@ -144,15 +160,18 @@

// This is a helper function for rolling shutter projection.
// It takes the rolling shutter state variable, position of landmark in ENU
// frame, estimated time t_h, and computes projected feature in normalized
// coordinate frame, the corresponding depth, the residual and the jacobian.
// If the jacobian is given as nullptr we will skip its computation.
bool ComputeResidualAndJacobian(const Eigen::Vector3d& n_pos_f, double t_h,
// frame, velocity of the landmark in ENU frame, estimated time t_h, and
// computes projected feature in normalized coordinate frame, the
// corresponding depth, the residual and the jacobian. If the jacobian is
// given as nullptr we will skip its computation.
bool ComputeResidualAndJacobian(const Eigen::Vector3d& n_pos_f,
const Eigen::Vector3d& n_vel_f, double t_h,
Eigen::Vector2d* normalized_coord,
double* residual, double* jacobian) const;

// Similar to `ComputeResidualAndJacobian` but also returns depth.
bool ComputeDepthResidualAndJacobian(const Eigen::Vector3d& n_pos_f,
const Eigen::Vector3d& n_vel_f,
double t_h,
Eigen::Vector2d* normalized_coord,
double* depth, double* residual,
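A minimal usage sketch of the two new public entry points (hypothetical setup: the calibration and image protos come from a dataset frame, and the `waymo::open_dataset` namespace is assumed from the closing comment in camera_model.cc):

```cpp
#include "third_party/camera/camera_model.h"

namespace wod = waymo::open_dataset;

void Demo(const wod::CameraCalibration& calibration,
          const wod::CameraImage& image) {
  wod::CameraModel model(calibration);
  model.PrepareProjection(image);  // required before any projection call

  // Project a point moving at 1 m/s along +x of the global (ENU) frame.
  double u_d, v_d, depth;
  if (model.WorldToImageMovingPointWithDepth(
          /*x=*/-4091.88, /*y=*/11527.42, /*z=*/84.47,
          /*v_x=*/1.0, /*v_y=*/0.0, /*v_z=*/0.0,
          /*check_image_bounds=*/true, &u_d, &v_d, &depth)) {
    // Back-project the pixel at the returned depth into the vehicle frame
    // (global shutter assumed, as the method name indicates).
    double x_v, y_v, z_v;
    model.ImageToVehicleGlobalShutter(u_d, v_d, depth, &x_v, &y_v, &z_v);
  }
}
```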
52 changes: 52 additions & 0 deletions third_party/camera/camera_model_test.cc
@@ -154,6 +154,11 @@ TEST_F(CameraModelTest, GlobalShutter) {
EXPECT_NEAR(x, -4091.97180, 0.1);
EXPECT_NEAR(y, 11527.48092, 0.1);
EXPECT_NEAR(z, 84.46586, 0.1);

camera_model.ImageToVehicleGlobalShutter(100, 1000, 20, &x, &y, &z);
EXPECT_NEAR(x, 21.44064, 0.1);
EXPECT_NEAR(y, 8.32240, 0.1);
EXPECT_NEAR(z, -1.62938, 0.1);
}

TEST_F(CameraModelTest, WorldToImageBehindCameraRollingShutter) {
@@ -213,6 +218,53 @@ TEST_F(CameraModelTest, RollingShutterWithDepth) {
EXPECT_NEAR(z, 84.46667, 0.1);
}

TEST_F(CameraModelTest, RollingShutterMovingPointWithDepth) {
CameraModel camera_model(calibration_);
camera_model.PrepareProjection(camera_image_);

double x, y, z;
camera_model.ImageToWorld(100, 1000, 20, &x, &y, &z);
EXPECT_NEAR(x, -4091.88016, 0.1);
EXPECT_NEAR(y, 11527.42299, 0.1);
EXPECT_NEAR(z, 84.46667, 0.1);

double v_x, v_y, v_z;
double u_d, v_d, depth;

v_x = 0;
v_y = 0;
v_z = 0;
EXPECT_TRUE(camera_model.WorldToImageMovingPointWithDepth(
x, y, z, v_x, v_y, v_z, /*check_image_bounds=*/true, &u_d, &v_d, &depth));
EXPECT_NEAR(u_d, 100, 0.1);
EXPECT_NEAR(v_d, 1000, 0.1);
EXPECT_NEAR(depth, 20, 1e-3);

// Check that WorldToImage gives the same results given zero velocity.
EXPECT_TRUE(camera_model.WorldToImage(x, y, z, /*check_image_bounds=*/true,
&u_d, &v_d));
EXPECT_NEAR(u_d, 100, 0.1);
EXPECT_NEAR(v_d, 1000, 0.1);
EXPECT_NEAR(depth, 20, 1e-3);

v_x = 1.0;
v_y = 2.0;
v_z = 0.1;
EXPECT_TRUE(camera_model.WorldToImageMovingPointWithDepth(
x, y, z, v_x, v_y, v_z, /*check_image_bounds=*/true, &u_d, &v_d, &depth));
EXPECT_NEAR(u_d, 94.114, 0.1);
EXPECT_NEAR(v_d, 1000, 0.1);
EXPECT_NEAR(depth, 20.00349, 1e-3);

constexpr double t_h = -0.0272851;
EXPECT_TRUE(camera_model.WorldToImageMovingPointWithDepth(
x + t_h * v_x, y + t_h * v_y, z + t_h * v_z, 0.0, 0.0, 0.0,
/*check_image_bounds=*/true, &u_d, &v_d, &depth));
EXPECT_NEAR(u_d, 94.114, 0.1);
EXPECT_NEAR(v_d, 1000, 0.1);
EXPECT_NEAR(depth, 20.00349, 1e-3);
}

TEST_F(CameraModelTest, GlobalShutterWithDepth) {
calibration_.set_rolling_shutter_direction(CameraCalibration::GLOBAL_SHUTTER);
CameraModel camera_model(calibration_);